1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=CHECK --check-prefix=SSE --check-prefix=SSE-RECIP
3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX-RECIP
4 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fma | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=FMA-RECIP
5 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=bdver2 | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=BDVER2
6 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=btver2 | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=BTVER2
7 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=sandybridge | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=SANDY
8 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=haswell | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=HASWELL
9 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=haswell -mattr=-fma | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=HASWELL-NO-FMA
10 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX512 --check-prefix=KNL
11 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=skx | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX512 --check-prefix=SKX
13 ; If the target's divss/divps instructions are substantially
14 ; slower than rcpss/rcpps with a Newton-Raphson refinement,
15 ; we should generate the estimate sequence.
17 ; See PR21385 ( https://llvm.org/PR21385 )
18 ; for details about the accuracy, speed, and implementation
19 ; differences of x86 reciprocal estimates.
; Scalar 1.0/x under attribute set #0 (attribute body not visible in this
; chunk; presumably reciprocal estimates disabled — confirm against the
; attributes at the end of the file). Expect a real scalar divide
; (divss / vdivss), not an rcpss estimate sequence.
21 define float @f32_no_estimate(float %x) #0 {
22 ; SSE-LABEL: f32_no_estimate:
24 ; SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
25 ; SSE-NEXT: divss %xmm0, %xmm1
26 ; SSE-NEXT: movaps %xmm1, %xmm0
29 ; AVX-LABEL: f32_no_estimate:
31 ; AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
32 ; AVX-NEXT: vdivss %xmm0, %xmm1, %xmm0
34 %div = fdiv fast float 1.0, %x
; Scalar 1.0/x under attribute set #1: expect an rcpss/vrcpss estimate plus
; ONE Newton-Raphson refinement step (mul/sub/mul/add), fused into
; vfmadd/vfnmadd forms on FMA-capable targets.
38 define float @f32_one_step(float %x) #1 {
39 ; SSE-LABEL: f32_one_step:
41 ; SSE-NEXT: rcpss %xmm0, %xmm2
42 ; SSE-NEXT: mulss %xmm2, %xmm0
43 ; SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
44 ; SSE-NEXT: subss %xmm0, %xmm1
45 ; SSE-NEXT: mulss %xmm2, %xmm1
46 ; SSE-NEXT: addss %xmm2, %xmm1
47 ; SSE-NEXT: movaps %xmm1, %xmm0
50 ; AVX-RECIP-LABEL: f32_one_step:
52 ; AVX-RECIP-NEXT: vrcpss %xmm0, %xmm0, %xmm1
53 ; AVX-RECIP-NEXT: vmulss %xmm1, %xmm0, %xmm0
54 ; AVX-RECIP-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
55 ; AVX-RECIP-NEXT: vsubss %xmm0, %xmm2, %xmm0
56 ; AVX-RECIP-NEXT: vmulss %xmm0, %xmm1, %xmm0
57 ; AVX-RECIP-NEXT: vaddss %xmm0, %xmm1, %xmm0
58 ; AVX-RECIP-NEXT: retq
60 ; FMA-RECIP-LABEL: f32_one_step:
62 ; FMA-RECIP-NEXT: vrcpss %xmm0, %xmm0, %xmm1
63 ; FMA-RECIP-NEXT: vfmadd213ss {{.*#+}} xmm0 = (xmm1 * xmm0) + mem
64 ; FMA-RECIP-NEXT: vfnmadd132ss {{.*#+}} xmm0 = -(xmm0 * xmm1) + xmm1
65 ; FMA-RECIP-NEXT: retq
67 ; BDVER2-LABEL: f32_one_step:
69 ; BDVER2-NEXT: vrcpss %xmm0, %xmm0, %xmm1
70 ; BDVER2-NEXT: vfmaddss {{.*}}(%rip), %xmm1, %xmm0, %xmm0
71 ; BDVER2-NEXT: vfnmaddss %xmm1, %xmm0, %xmm1, %xmm0
74 ; BTVER2-LABEL: f32_one_step:
76 ; BTVER2-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
77 ; BTVER2-NEXT: vrcpss %xmm0, %xmm0, %xmm1
78 ; BTVER2-NEXT: vmulss %xmm1, %xmm0, %xmm0
79 ; BTVER2-NEXT: vsubss %xmm0, %xmm2, %xmm0
80 ; BTVER2-NEXT: vmulss %xmm0, %xmm1, %xmm0
81 ; BTVER2-NEXT: vaddss %xmm0, %xmm1, %xmm0
84 ; SANDY-LABEL: f32_one_step:
86 ; SANDY-NEXT: vrcpss %xmm0, %xmm0, %xmm1
87 ; SANDY-NEXT: vmulss %xmm1, %xmm0, %xmm0
88 ; SANDY-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
89 ; SANDY-NEXT: vsubss %xmm0, %xmm2, %xmm0
90 ; SANDY-NEXT: vmulss %xmm0, %xmm1, %xmm0
91 ; SANDY-NEXT: vaddss %xmm0, %xmm1, %xmm0
94 ; HASWELL-LABEL: f32_one_step:
96 ; HASWELL-NEXT: vrcpss %xmm0, %xmm0, %xmm1
97 ; HASWELL-NEXT: vfmadd213ss {{.*#+}} xmm0 = (xmm1 * xmm0) + mem
98 ; HASWELL-NEXT: vfnmadd132ss {{.*#+}} xmm0 = -(xmm0 * xmm1) + xmm1
101 ; HASWELL-NO-FMA-LABEL: f32_one_step:
102 ; HASWELL-NO-FMA: # %bb.0:
103 ; HASWELL-NO-FMA-NEXT: vrcpss %xmm0, %xmm0, %xmm1
104 ; HASWELL-NO-FMA-NEXT: vmulss %xmm1, %xmm0, %xmm0
105 ; HASWELL-NO-FMA-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
106 ; HASWELL-NO-FMA-NEXT: vsubss %xmm0, %xmm2, %xmm0
107 ; HASWELL-NO-FMA-NEXT: vmulss %xmm0, %xmm1, %xmm0
108 ; HASWELL-NO-FMA-NEXT: vaddss %xmm0, %xmm1, %xmm0
109 ; HASWELL-NO-FMA-NEXT: retq
111 ; AVX512-LABEL: f32_one_step:
113 ; AVX512-NEXT: vrcpss %xmm0, %xmm0, %xmm1
114 ; AVX512-NEXT: vfmadd213ss {{.*#+}} xmm0 = (xmm1 * xmm0) + mem
115 ; AVX512-NEXT: vfnmadd132ss {{.*#+}} xmm0 = -(xmm0 * xmm1) + xmm1
117 %div = fdiv fast float 1.0, %x
; Scalar x/y with both operands variable, one refinement step: the numerator
; is folded into the iteration (x*est computed up front, then corrected)
; instead of a final multiply by x.
121 define float @f32_one_step_variables(float %x, float %y) #1 {
122 ; SSE-LABEL: f32_one_step_variables:
124 ; SSE-NEXT: rcpss %xmm1, %xmm2
125 ; SSE-NEXT: movaps %xmm0, %xmm3
126 ; SSE-NEXT: mulss %xmm2, %xmm3
127 ; SSE-NEXT: mulss %xmm3, %xmm1
128 ; SSE-NEXT: subss %xmm1, %xmm0
129 ; SSE-NEXT: mulss %xmm2, %xmm0
130 ; SSE-NEXT: addss %xmm3, %xmm0
133 ; AVX-RECIP-LABEL: f32_one_step_variables:
134 ; AVX-RECIP: # %bb.0:
135 ; AVX-RECIP-NEXT: vrcpss %xmm1, %xmm1, %xmm2
136 ; AVX-RECIP-NEXT: vmulss %xmm2, %xmm0, %xmm3
137 ; AVX-RECIP-NEXT: vmulss %xmm3, %xmm1, %xmm1
138 ; AVX-RECIP-NEXT: vsubss %xmm1, %xmm0, %xmm0
139 ; AVX-RECIP-NEXT: vmulss %xmm0, %xmm2, %xmm0
140 ; AVX-RECIP-NEXT: vaddss %xmm0, %xmm3, %xmm0
141 ; AVX-RECIP-NEXT: retq
143 ; FMA-RECIP-LABEL: f32_one_step_variables:
144 ; FMA-RECIP: # %bb.0:
145 ; FMA-RECIP-NEXT: vrcpss %xmm1, %xmm1, %xmm2
146 ; FMA-RECIP-NEXT: vmulss %xmm2, %xmm0, %xmm3
147 ; FMA-RECIP-NEXT: vfnmadd213ss {{.*#+}} xmm1 = -(xmm3 * xmm1) + xmm0
148 ; FMA-RECIP-NEXT: vfmadd213ss {{.*#+}} xmm2 = (xmm1 * xmm2) + xmm3
149 ; FMA-RECIP-NEXT: vmovaps %xmm2, %xmm0
150 ; FMA-RECIP-NEXT: retq
152 ; BDVER2-LABEL: f32_one_step_variables:
154 ; BDVER2-NEXT: vrcpss %xmm1, %xmm1, %xmm2
155 ; BDVER2-NEXT: vmulss %xmm2, %xmm0, %xmm3
156 ; BDVER2-NEXT: vfnmaddss %xmm0, %xmm3, %xmm1, %xmm0
157 ; BDVER2-NEXT: vfmaddss %xmm3, %xmm0, %xmm2, %xmm0
160 ; BTVER2-LABEL: f32_one_step_variables:
162 ; BTVER2-NEXT: vrcpss %xmm1, %xmm1, %xmm2
163 ; BTVER2-NEXT: vmulss %xmm2, %xmm0, %xmm3
164 ; BTVER2-NEXT: vmulss %xmm3, %xmm1, %xmm1
165 ; BTVER2-NEXT: vsubss %xmm1, %xmm0, %xmm0
166 ; BTVER2-NEXT: vmulss %xmm0, %xmm2, %xmm0
167 ; BTVER2-NEXT: vaddss %xmm0, %xmm3, %xmm0
170 ; SANDY-LABEL: f32_one_step_variables:
172 ; SANDY-NEXT: vrcpss %xmm1, %xmm1, %xmm2
173 ; SANDY-NEXT: vmulss %xmm2, %xmm0, %xmm3
174 ; SANDY-NEXT: vmulss %xmm3, %xmm1, %xmm1
175 ; SANDY-NEXT: vsubss %xmm1, %xmm0, %xmm0
176 ; SANDY-NEXT: vmulss %xmm0, %xmm2, %xmm0
177 ; SANDY-NEXT: vaddss %xmm0, %xmm3, %xmm0
180 ; HASWELL-LABEL: f32_one_step_variables:
182 ; HASWELL-NEXT: vrcpss %xmm1, %xmm1, %xmm2
183 ; HASWELL-NEXT: vmulss %xmm2, %xmm0, %xmm3
184 ; HASWELL-NEXT: vfnmadd213ss {{.*#+}} xmm1 = -(xmm3 * xmm1) + xmm0
185 ; HASWELL-NEXT: vfmadd213ss {{.*#+}} xmm2 = (xmm1 * xmm2) + xmm3
186 ; HASWELL-NEXT: vmovaps %xmm2, %xmm0
189 ; HASWELL-NO-FMA-LABEL: f32_one_step_variables:
190 ; HASWELL-NO-FMA: # %bb.0:
191 ; HASWELL-NO-FMA-NEXT: vrcpss %xmm1, %xmm1, %xmm2
192 ; HASWELL-NO-FMA-NEXT: vmulss %xmm2, %xmm0, %xmm3
193 ; HASWELL-NO-FMA-NEXT: vmulss %xmm3, %xmm1, %xmm1
194 ; HASWELL-NO-FMA-NEXT: vsubss %xmm1, %xmm0, %xmm0
195 ; HASWELL-NO-FMA-NEXT: vmulss %xmm0, %xmm2, %xmm0
196 ; HASWELL-NO-FMA-NEXT: vaddss %xmm0, %xmm3, %xmm0
197 ; HASWELL-NO-FMA-NEXT: retq
199 ; AVX512-LABEL: f32_one_step_variables:
201 ; AVX512-NEXT: vrcpss %xmm1, %xmm1, %xmm2
202 ; AVX512-NEXT: vmulss %xmm2, %xmm0, %xmm3
203 ; AVX512-NEXT: vfnmadd213ss {{.*#+}} xmm1 = -(xmm3 * xmm1) + xmm0
204 ; AVX512-NEXT: vfmadd213ss {{.*#+}} xmm2 = (xmm1 * xmm2) + xmm3
205 ; AVX512-NEXT: vmovaps %xmm2, %xmm0
207 %div = fdiv fast float %x, %y
; Scalar 1.0/x under attribute set #2: expect TWO Newton-Raphson refinement
; iterations after the initial rcpss estimate (each iteration reuses the
; same 1.0 constant for the residual).
211 define float @f32_two_step(float %x) #2 {
212 ; SSE-LABEL: f32_two_step:
214 ; SSE-NEXT: rcpss %xmm0, %xmm2
215 ; SSE-NEXT: movaps %xmm0, %xmm3
216 ; SSE-NEXT: mulss %xmm2, %xmm3
217 ; SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
218 ; SSE-NEXT: movaps %xmm1, %xmm4
219 ; SSE-NEXT: subss %xmm3, %xmm4
220 ; SSE-NEXT: mulss %xmm2, %xmm4
221 ; SSE-NEXT: addss %xmm2, %xmm4
222 ; SSE-NEXT: mulss %xmm4, %xmm0
223 ; SSE-NEXT: subss %xmm0, %xmm1
224 ; SSE-NEXT: mulss %xmm4, %xmm1
225 ; SSE-NEXT: addss %xmm4, %xmm1
226 ; SSE-NEXT: movaps %xmm1, %xmm0
229 ; AVX-RECIP-LABEL: f32_two_step:
230 ; AVX-RECIP: # %bb.0:
231 ; AVX-RECIP-NEXT: vrcpss %xmm0, %xmm0, %xmm1
232 ; AVX-RECIP-NEXT: vmulss %xmm1, %xmm0, %xmm2
233 ; AVX-RECIP-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
234 ; AVX-RECIP-NEXT: vsubss %xmm2, %xmm3, %xmm2
235 ; AVX-RECIP-NEXT: vmulss %xmm2, %xmm1, %xmm2
236 ; AVX-RECIP-NEXT: vaddss %xmm2, %xmm1, %xmm1
237 ; AVX-RECIP-NEXT: vmulss %xmm1, %xmm0, %xmm0
238 ; AVX-RECIP-NEXT: vsubss %xmm0, %xmm3, %xmm0
239 ; AVX-RECIP-NEXT: vmulss %xmm0, %xmm1, %xmm0
240 ; AVX-RECIP-NEXT: vaddss %xmm0, %xmm1, %xmm0
241 ; AVX-RECIP-NEXT: retq
243 ; FMA-RECIP-LABEL: f32_two_step:
244 ; FMA-RECIP: # %bb.0:
245 ; FMA-RECIP-NEXT: vrcpss %xmm0, %xmm0, %xmm1
246 ; FMA-RECIP-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
247 ; FMA-RECIP-NEXT: vmovaps %xmm1, %xmm3
248 ; FMA-RECIP-NEXT: vfnmadd213ss {{.*#+}} xmm3 = -(xmm0 * xmm3) + xmm2
249 ; FMA-RECIP-NEXT: vfmadd132ss {{.*#+}} xmm3 = (xmm3 * xmm1) + xmm1
250 ; FMA-RECIP-NEXT: vfnmadd213ss {{.*#+}} xmm0 = -(xmm3 * xmm0) + xmm2
251 ; FMA-RECIP-NEXT: vfmadd132ss {{.*#+}} xmm0 = (xmm0 * xmm3) + xmm3
252 ; FMA-RECIP-NEXT: retq
254 ; BDVER2-LABEL: f32_two_step:
256 ; BDVER2-NEXT: vrcpss %xmm0, %xmm0, %xmm1
257 ; BDVER2-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
258 ; BDVER2-NEXT: vfnmaddss %xmm2, %xmm1, %xmm0, %xmm3
259 ; BDVER2-NEXT: vfmaddss %xmm1, %xmm3, %xmm1, %xmm1
260 ; BDVER2-NEXT: vfnmaddss %xmm2, %xmm1, %xmm0, %xmm0
261 ; BDVER2-NEXT: vfmaddss %xmm1, %xmm0, %xmm1, %xmm0
264 ; BTVER2-LABEL: f32_two_step:
266 ; BTVER2-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
267 ; BTVER2-NEXT: vrcpss %xmm0, %xmm0, %xmm1
268 ; BTVER2-NEXT: vmulss %xmm1, %xmm0, %xmm2
269 ; BTVER2-NEXT: vsubss %xmm2, %xmm3, %xmm2
270 ; BTVER2-NEXT: vmulss %xmm2, %xmm1, %xmm2
271 ; BTVER2-NEXT: vaddss %xmm2, %xmm1, %xmm1
272 ; BTVER2-NEXT: vmulss %xmm1, %xmm0, %xmm0
273 ; BTVER2-NEXT: vsubss %xmm0, %xmm3, %xmm0
274 ; BTVER2-NEXT: vmulss %xmm0, %xmm1, %xmm0
275 ; BTVER2-NEXT: vaddss %xmm0, %xmm1, %xmm0
278 ; SANDY-LABEL: f32_two_step:
280 ; SANDY-NEXT: vrcpss %xmm0, %xmm0, %xmm1
281 ; SANDY-NEXT: vmulss %xmm1, %xmm0, %xmm2
282 ; SANDY-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
283 ; SANDY-NEXT: vsubss %xmm2, %xmm3, %xmm2
284 ; SANDY-NEXT: vmulss %xmm2, %xmm1, %xmm2
285 ; SANDY-NEXT: vaddss %xmm2, %xmm1, %xmm1
286 ; SANDY-NEXT: vmulss %xmm1, %xmm0, %xmm0
287 ; SANDY-NEXT: vsubss %xmm0, %xmm3, %xmm0
288 ; SANDY-NEXT: vmulss %xmm0, %xmm1, %xmm0
289 ; SANDY-NEXT: vaddss %xmm0, %xmm1, %xmm0
292 ; HASWELL-LABEL: f32_two_step:
294 ; HASWELL-NEXT: vrcpss %xmm0, %xmm0, %xmm1
295 ; HASWELL-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
296 ; HASWELL-NEXT: vmovaps %xmm1, %xmm3
297 ; HASWELL-NEXT: vfnmadd213ss {{.*#+}} xmm3 = -(xmm0 * xmm3) + xmm2
298 ; HASWELL-NEXT: vfmadd132ss {{.*#+}} xmm3 = (xmm3 * xmm1) + xmm1
299 ; HASWELL-NEXT: vfnmadd213ss {{.*#+}} xmm0 = -(xmm3 * xmm0) + xmm2
300 ; HASWELL-NEXT: vfmadd132ss {{.*#+}} xmm0 = (xmm0 * xmm3) + xmm3
303 ; HASWELL-NO-FMA-LABEL: f32_two_step:
304 ; HASWELL-NO-FMA: # %bb.0:
305 ; HASWELL-NO-FMA-NEXT: vrcpss %xmm0, %xmm0, %xmm1
306 ; HASWELL-NO-FMA-NEXT: vmulss %xmm1, %xmm0, %xmm2
307 ; HASWELL-NO-FMA-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
308 ; HASWELL-NO-FMA-NEXT: vsubss %xmm2, %xmm3, %xmm2
309 ; HASWELL-NO-FMA-NEXT: vmulss %xmm2, %xmm1, %xmm2
310 ; HASWELL-NO-FMA-NEXT: vaddss %xmm2, %xmm1, %xmm1
311 ; HASWELL-NO-FMA-NEXT: vmulss %xmm1, %xmm0, %xmm0
312 ; HASWELL-NO-FMA-NEXT: vsubss %xmm0, %xmm3, %xmm0
313 ; HASWELL-NO-FMA-NEXT: vmulss %xmm0, %xmm1, %xmm0
314 ; HASWELL-NO-FMA-NEXT: vaddss %xmm0, %xmm1, %xmm0
315 ; HASWELL-NO-FMA-NEXT: retq
317 ; AVX512-LABEL: f32_two_step:
319 ; AVX512-NEXT: vrcpss %xmm0, %xmm0, %xmm1
320 ; AVX512-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
321 ; AVX512-NEXT: vmovaps %xmm1, %xmm3
322 ; AVX512-NEXT: vfnmadd213ss {{.*#+}} xmm3 = -(xmm0 * xmm3) + xmm2
323 ; AVX512-NEXT: vfmadd132ss {{.*#+}} xmm3 = (xmm3 * xmm1) + xmm1
324 ; AVX512-NEXT: vfnmadd213ss {{.*#+}} xmm0 = -(xmm3 * xmm0) + xmm2
325 ; AVX512-NEXT: vfmadd132ss {{.*#+}} xmm0 = (xmm0 * xmm3) + xmm3
327 %div = fdiv fast float 1.0, %x
; 4 x float splat(1.0)/x with estimates disabled: expect a real packed divide
; (divps / vdivps). Haswell-class and AVX512 CPUs materialize the all-ones
; splat with vbroadcastss instead of a full vmovaps constant load.
331 define <4 x float> @v4f32_no_estimate(<4 x float> %x) #0 {
332 ; SSE-LABEL: v4f32_no_estimate:
334 ; SSE-NEXT: movaps {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
335 ; SSE-NEXT: divps %xmm0, %xmm1
336 ; SSE-NEXT: movaps %xmm1, %xmm0
339 ; AVX-RECIP-LABEL: v4f32_no_estimate:
340 ; AVX-RECIP: # %bb.0:
341 ; AVX-RECIP-NEXT: vmovaps {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
342 ; AVX-RECIP-NEXT: vdivps %xmm0, %xmm1, %xmm0
343 ; AVX-RECIP-NEXT: retq
345 ; FMA-RECIP-LABEL: v4f32_no_estimate:
346 ; FMA-RECIP: # %bb.0:
347 ; FMA-RECIP-NEXT: vmovaps {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
348 ; FMA-RECIP-NEXT: vdivps %xmm0, %xmm1, %xmm0
349 ; FMA-RECIP-NEXT: retq
351 ; BDVER2-LABEL: v4f32_no_estimate:
353 ; BDVER2-NEXT: vmovaps {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
354 ; BDVER2-NEXT: vdivps %xmm0, %xmm1, %xmm0
357 ; BTVER2-LABEL: v4f32_no_estimate:
359 ; BTVER2-NEXT: vmovaps {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
360 ; BTVER2-NEXT: vdivps %xmm0, %xmm1, %xmm0
363 ; SANDY-LABEL: v4f32_no_estimate:
365 ; SANDY-NEXT: vmovaps {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
366 ; SANDY-NEXT: vdivps %xmm0, %xmm1, %xmm0
369 ; HASWELL-LABEL: v4f32_no_estimate:
371 ; HASWELL-NEXT: vbroadcastss {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
372 ; HASWELL-NEXT: vdivps %xmm0, %xmm1, %xmm0
375 ; HASWELL-NO-FMA-LABEL: v4f32_no_estimate:
376 ; HASWELL-NO-FMA: # %bb.0:
377 ; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
378 ; HASWELL-NO-FMA-NEXT: vdivps %xmm0, %xmm1, %xmm0
379 ; HASWELL-NO-FMA-NEXT: retq
381 ; AVX512-LABEL: v4f32_no_estimate:
383 ; AVX512-NEXT: vbroadcastss {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
384 ; AVX512-NEXT: vdivps %xmm0, %xmm1, %xmm0
386 %div = fdiv fast <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>, %x
; 4 x float splat(1.0)/x with one refinement step: rcpps estimate plus one
; Newton-Raphson iteration. KNL and SKX get separate check prefixes here
; because SKX folds the splat constant as a memory operand of the FMA while
; KNL broadcasts it into a register first.
390 define <4 x float> @v4f32_one_step(<4 x float> %x) #1 {
391 ; SSE-LABEL: v4f32_one_step:
393 ; SSE-NEXT: rcpps %xmm0, %xmm2
394 ; SSE-NEXT: mulps %xmm2, %xmm0
395 ; SSE-NEXT: movaps {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
396 ; SSE-NEXT: subps %xmm0, %xmm1
397 ; SSE-NEXT: mulps %xmm2, %xmm1
398 ; SSE-NEXT: addps %xmm2, %xmm1
399 ; SSE-NEXT: movaps %xmm1, %xmm0
402 ; AVX-RECIP-LABEL: v4f32_one_step:
403 ; AVX-RECIP: # %bb.0:
404 ; AVX-RECIP-NEXT: vrcpps %xmm0, %xmm1
405 ; AVX-RECIP-NEXT: vmulps %xmm1, %xmm0, %xmm0
406 ; AVX-RECIP-NEXT: vmovaps {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
407 ; AVX-RECIP-NEXT: vsubps %xmm0, %xmm2, %xmm0
408 ; AVX-RECIP-NEXT: vmulps %xmm0, %xmm1, %xmm0
409 ; AVX-RECIP-NEXT: vaddps %xmm0, %xmm1, %xmm0
410 ; AVX-RECIP-NEXT: retq
412 ; FMA-RECIP-LABEL: v4f32_one_step:
413 ; FMA-RECIP: # %bb.0:
414 ; FMA-RECIP-NEXT: vrcpps %xmm0, %xmm1
415 ; FMA-RECIP-NEXT: vfnmadd213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) + mem
416 ; FMA-RECIP-NEXT: vfmadd132ps {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1
417 ; FMA-RECIP-NEXT: retq
419 ; BDVER2-LABEL: v4f32_one_step:
421 ; BDVER2-NEXT: vrcpps %xmm0, %xmm1
422 ; BDVER2-NEXT: vfnmaddps {{.*}}(%rip), %xmm1, %xmm0, %xmm0
423 ; BDVER2-NEXT: vfmaddps %xmm1, %xmm0, %xmm1, %xmm0
426 ; BTVER2-LABEL: v4f32_one_step:
428 ; BTVER2-NEXT: vmovaps {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
429 ; BTVER2-NEXT: vrcpps %xmm0, %xmm1
430 ; BTVER2-NEXT: vmulps %xmm1, %xmm0, %xmm0
431 ; BTVER2-NEXT: vsubps %xmm0, %xmm2, %xmm0
432 ; BTVER2-NEXT: vmulps %xmm0, %xmm1, %xmm0
433 ; BTVER2-NEXT: vaddps %xmm0, %xmm1, %xmm0
436 ; SANDY-LABEL: v4f32_one_step:
438 ; SANDY-NEXT: vrcpps %xmm0, %xmm1
439 ; SANDY-NEXT: vmulps %xmm1, %xmm0, %xmm0
440 ; SANDY-NEXT: vmovaps {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
441 ; SANDY-NEXT: vsubps %xmm0, %xmm2, %xmm0
442 ; SANDY-NEXT: vmulps %xmm0, %xmm1, %xmm0
443 ; SANDY-NEXT: vaddps %xmm0, %xmm1, %xmm0
446 ; HASWELL-LABEL: v4f32_one_step:
448 ; HASWELL-NEXT: vrcpps %xmm0, %xmm1
449 ; HASWELL-NEXT: vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
450 ; HASWELL-NEXT: vfnmadd213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2
451 ; HASWELL-NEXT: vfmadd132ps {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1
454 ; HASWELL-NO-FMA-LABEL: v4f32_one_step:
455 ; HASWELL-NO-FMA: # %bb.0:
456 ; HASWELL-NO-FMA-NEXT: vrcpps %xmm0, %xmm1
457 ; HASWELL-NO-FMA-NEXT: vmulps %xmm1, %xmm0, %xmm0
458 ; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
459 ; HASWELL-NO-FMA-NEXT: vsubps %xmm0, %xmm2, %xmm0
460 ; HASWELL-NO-FMA-NEXT: vmulps %xmm0, %xmm1, %xmm0
461 ; HASWELL-NO-FMA-NEXT: vaddps %xmm0, %xmm1, %xmm0
462 ; HASWELL-NO-FMA-NEXT: retq
464 ; KNL-LABEL: v4f32_one_step:
466 ; KNL-NEXT: vrcpps %xmm0, %xmm1
467 ; KNL-NEXT: vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
468 ; KNL-NEXT: vfnmadd213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2
469 ; KNL-NEXT: vfmadd132ps {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1
472 ; SKX-LABEL: v4f32_one_step:
474 ; SKX-NEXT: vrcpps %xmm0, %xmm1
475 ; SKX-NEXT: vfnmadd213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) + mem
476 ; SKX-NEXT: vfmadd132ps {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1
478 %div = fdiv fast <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>, %x
; 4 x float x/y with both operands variable, one refinement step — the packed
; analogue of f32_one_step_variables (numerator folded into the iteration).
482 define <4 x float> @v4f32_one_step_variables(<4 x float> %x, <4 x float> %y) #1 {
483 ; SSE-LABEL: v4f32_one_step_variables:
485 ; SSE-NEXT: rcpps %xmm1, %xmm2
486 ; SSE-NEXT: movaps %xmm0, %xmm3
487 ; SSE-NEXT: mulps %xmm2, %xmm3
488 ; SSE-NEXT: mulps %xmm3, %xmm1
489 ; SSE-NEXT: subps %xmm1, %xmm0
490 ; SSE-NEXT: mulps %xmm2, %xmm0
491 ; SSE-NEXT: addps %xmm3, %xmm0
494 ; AVX-RECIP-LABEL: v4f32_one_step_variables:
495 ; AVX-RECIP: # %bb.0:
496 ; AVX-RECIP-NEXT: vrcpps %xmm1, %xmm2
497 ; AVX-RECIP-NEXT: vmulps %xmm2, %xmm0, %xmm3
498 ; AVX-RECIP-NEXT: vmulps %xmm3, %xmm1, %xmm1
499 ; AVX-RECIP-NEXT: vsubps %xmm1, %xmm0, %xmm0
500 ; AVX-RECIP-NEXT: vmulps %xmm0, %xmm2, %xmm0
501 ; AVX-RECIP-NEXT: vaddps %xmm0, %xmm3, %xmm0
502 ; AVX-RECIP-NEXT: retq
504 ; FMA-RECIP-LABEL: v4f32_one_step_variables:
505 ; FMA-RECIP: # %bb.0:
506 ; FMA-RECIP-NEXT: vrcpps %xmm1, %xmm2
507 ; FMA-RECIP-NEXT: vmulps %xmm2, %xmm0, %xmm3
508 ; FMA-RECIP-NEXT: vfnmadd213ps {{.*#+}} xmm1 = -(xmm3 * xmm1) + xmm0
509 ; FMA-RECIP-NEXT: vfmadd213ps {{.*#+}} xmm2 = (xmm1 * xmm2) + xmm3
510 ; FMA-RECIP-NEXT: vmovaps %xmm2, %xmm0
511 ; FMA-RECIP-NEXT: retq
513 ; BDVER2-LABEL: v4f32_one_step_variables:
515 ; BDVER2-NEXT: vrcpps %xmm1, %xmm2
516 ; BDVER2-NEXT: vmulps %xmm2, %xmm0, %xmm3
517 ; BDVER2-NEXT: vfnmaddps %xmm0, %xmm3, %xmm1, %xmm0
518 ; BDVER2-NEXT: vfmaddps %xmm3, %xmm0, %xmm2, %xmm0
521 ; BTVER2-LABEL: v4f32_one_step_variables:
523 ; BTVER2-NEXT: vrcpps %xmm1, %xmm2
524 ; BTVER2-NEXT: vmulps %xmm2, %xmm0, %xmm3
525 ; BTVER2-NEXT: vmulps %xmm3, %xmm1, %xmm1
526 ; BTVER2-NEXT: vsubps %xmm1, %xmm0, %xmm0
527 ; BTVER2-NEXT: vmulps %xmm0, %xmm2, %xmm0
528 ; BTVER2-NEXT: vaddps %xmm0, %xmm3, %xmm0
531 ; SANDY-LABEL: v4f32_one_step_variables:
533 ; SANDY-NEXT: vrcpps %xmm1, %xmm2
534 ; SANDY-NEXT: vmulps %xmm2, %xmm0, %xmm3
535 ; SANDY-NEXT: vmulps %xmm3, %xmm1, %xmm1
536 ; SANDY-NEXT: vsubps %xmm1, %xmm0, %xmm0
537 ; SANDY-NEXT: vmulps %xmm0, %xmm2, %xmm0
538 ; SANDY-NEXT: vaddps %xmm0, %xmm3, %xmm0
541 ; HASWELL-LABEL: v4f32_one_step_variables:
543 ; HASWELL-NEXT: vrcpps %xmm1, %xmm2
544 ; HASWELL-NEXT: vmulps %xmm2, %xmm0, %xmm3
545 ; HASWELL-NEXT: vfnmadd213ps {{.*#+}} xmm1 = -(xmm3 * xmm1) + xmm0
546 ; HASWELL-NEXT: vfmadd213ps {{.*#+}} xmm2 = (xmm1 * xmm2) + xmm3
547 ; HASWELL-NEXT: vmovaps %xmm2, %xmm0
550 ; HASWELL-NO-FMA-LABEL: v4f32_one_step_variables:
551 ; HASWELL-NO-FMA: # %bb.0:
552 ; HASWELL-NO-FMA-NEXT: vrcpps %xmm1, %xmm2
553 ; HASWELL-NO-FMA-NEXT: vmulps %xmm2, %xmm0, %xmm3
554 ; HASWELL-NO-FMA-NEXT: vmulps %xmm3, %xmm1, %xmm1
555 ; HASWELL-NO-FMA-NEXT: vsubps %xmm1, %xmm0, %xmm0
556 ; HASWELL-NO-FMA-NEXT: vmulps %xmm0, %xmm2, %xmm0
557 ; HASWELL-NO-FMA-NEXT: vaddps %xmm0, %xmm3, %xmm0
558 ; HASWELL-NO-FMA-NEXT: retq
560 ; AVX512-LABEL: v4f32_one_step_variables:
562 ; AVX512-NEXT: vrcpps %xmm1, %xmm2
563 ; AVX512-NEXT: vmulps %xmm2, %xmm0, %xmm3
564 ; AVX512-NEXT: vfnmadd213ps {{.*#+}} xmm1 = -(xmm3 * xmm1) + xmm0
565 ; AVX512-NEXT: vfmadd213ps {{.*#+}} xmm2 = (xmm1 * xmm2) + xmm3
566 ; AVX512-NEXT: vmovaps %xmm2, %xmm0
568 %div = fdiv fast <4 x float> %x, %y
; 4 x float splat(1.0)/x under attribute set #2: rcpps estimate plus TWO
; Newton-Raphson refinement iterations (packed analogue of f32_two_step).
572 define <4 x float> @v4f32_two_step(<4 x float> %x) #2 {
573 ; SSE-LABEL: v4f32_two_step:
575 ; SSE-NEXT: rcpps %xmm0, %xmm2
576 ; SSE-NEXT: movaps %xmm0, %xmm3
577 ; SSE-NEXT: mulps %xmm2, %xmm3
578 ; SSE-NEXT: movaps {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
579 ; SSE-NEXT: movaps %xmm1, %xmm4
580 ; SSE-NEXT: subps %xmm3, %xmm4
581 ; SSE-NEXT: mulps %xmm2, %xmm4
582 ; SSE-NEXT: addps %xmm2, %xmm4
583 ; SSE-NEXT: mulps %xmm4, %xmm0
584 ; SSE-NEXT: subps %xmm0, %xmm1
585 ; SSE-NEXT: mulps %xmm4, %xmm1
586 ; SSE-NEXT: addps %xmm4, %xmm1
587 ; SSE-NEXT: movaps %xmm1, %xmm0
590 ; AVX-RECIP-LABEL: v4f32_two_step:
591 ; AVX-RECIP: # %bb.0:
592 ; AVX-RECIP-NEXT: vrcpps %xmm0, %xmm1
593 ; AVX-RECIP-NEXT: vmulps %xmm1, %xmm0, %xmm2
594 ; AVX-RECIP-NEXT: vmovaps {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
595 ; AVX-RECIP-NEXT: vsubps %xmm2, %xmm3, %xmm2
596 ; AVX-RECIP-NEXT: vmulps %xmm2, %xmm1, %xmm2
597 ; AVX-RECIP-NEXT: vaddps %xmm2, %xmm1, %xmm1
598 ; AVX-RECIP-NEXT: vmulps %xmm1, %xmm0, %xmm0
599 ; AVX-RECIP-NEXT: vsubps %xmm0, %xmm3, %xmm0
600 ; AVX-RECIP-NEXT: vmulps %xmm0, %xmm1, %xmm0
601 ; AVX-RECIP-NEXT: vaddps %xmm0, %xmm1, %xmm0
602 ; AVX-RECIP-NEXT: retq
604 ; FMA-RECIP-LABEL: v4f32_two_step:
605 ; FMA-RECIP: # %bb.0:
606 ; FMA-RECIP-NEXT: vrcpps %xmm0, %xmm1
607 ; FMA-RECIP-NEXT: vmovaps {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
608 ; FMA-RECIP-NEXT: vmovaps %xmm1, %xmm3
609 ; FMA-RECIP-NEXT: vfnmadd213ps {{.*#+}} xmm3 = -(xmm0 * xmm3) + xmm2
610 ; FMA-RECIP-NEXT: vfmadd132ps {{.*#+}} xmm3 = (xmm3 * xmm1) + xmm1
611 ; FMA-RECIP-NEXT: vfnmadd213ps {{.*#+}} xmm0 = -(xmm3 * xmm0) + xmm2
612 ; FMA-RECIP-NEXT: vfmadd132ps {{.*#+}} xmm0 = (xmm0 * xmm3) + xmm3
613 ; FMA-RECIP-NEXT: retq
615 ; BDVER2-LABEL: v4f32_two_step:
617 ; BDVER2-NEXT: vrcpps %xmm0, %xmm1
618 ; BDVER2-NEXT: vmovaps {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
619 ; BDVER2-NEXT: vfnmaddps %xmm2, %xmm1, %xmm0, %xmm3
620 ; BDVER2-NEXT: vfmaddps %xmm1, %xmm3, %xmm1, %xmm1
621 ; BDVER2-NEXT: vfnmaddps %xmm2, %xmm1, %xmm0, %xmm0
622 ; BDVER2-NEXT: vfmaddps %xmm1, %xmm0, %xmm1, %xmm0
625 ; BTVER2-LABEL: v4f32_two_step:
627 ; BTVER2-NEXT: vmovaps {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
628 ; BTVER2-NEXT: vrcpps %xmm0, %xmm1
629 ; BTVER2-NEXT: vmulps %xmm1, %xmm0, %xmm2
630 ; BTVER2-NEXT: vsubps %xmm2, %xmm3, %xmm2
631 ; BTVER2-NEXT: vmulps %xmm2, %xmm1, %xmm2
632 ; BTVER2-NEXT: vaddps %xmm2, %xmm1, %xmm1
633 ; BTVER2-NEXT: vmulps %xmm1, %xmm0, %xmm0
634 ; BTVER2-NEXT: vsubps %xmm0, %xmm3, %xmm0
635 ; BTVER2-NEXT: vmulps %xmm0, %xmm1, %xmm0
636 ; BTVER2-NEXT: vaddps %xmm0, %xmm1, %xmm0
639 ; SANDY-LABEL: v4f32_two_step:
641 ; SANDY-NEXT: vrcpps %xmm0, %xmm1
642 ; SANDY-NEXT: vmulps %xmm1, %xmm0, %xmm2
643 ; SANDY-NEXT: vmovaps {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
644 ; SANDY-NEXT: vsubps %xmm2, %xmm3, %xmm2
645 ; SANDY-NEXT: vmulps %xmm2, %xmm1, %xmm2
646 ; SANDY-NEXT: vaddps %xmm2, %xmm1, %xmm1
647 ; SANDY-NEXT: vmulps %xmm1, %xmm0, %xmm0
648 ; SANDY-NEXT: vsubps %xmm0, %xmm3, %xmm0
649 ; SANDY-NEXT: vmulps %xmm0, %xmm1, %xmm0
650 ; SANDY-NEXT: vaddps %xmm0, %xmm1, %xmm0
653 ; HASWELL-LABEL: v4f32_two_step:
655 ; HASWELL-NEXT: vrcpps %xmm0, %xmm1
656 ; HASWELL-NEXT: vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
657 ; HASWELL-NEXT: vmovaps %xmm1, %xmm3
658 ; HASWELL-NEXT: vfnmadd213ps {{.*#+}} xmm3 = -(xmm0 * xmm3) + xmm2
659 ; HASWELL-NEXT: vfmadd132ps {{.*#+}} xmm3 = (xmm3 * xmm1) + xmm1
660 ; HASWELL-NEXT: vfnmadd213ps {{.*#+}} xmm0 = -(xmm3 * xmm0) + xmm2
661 ; HASWELL-NEXT: vfmadd132ps {{.*#+}} xmm0 = (xmm0 * xmm3) + xmm3
664 ; HASWELL-NO-FMA-LABEL: v4f32_two_step:
665 ; HASWELL-NO-FMA: # %bb.0:
666 ; HASWELL-NO-FMA-NEXT: vrcpps %xmm0, %xmm1
667 ; HASWELL-NO-FMA-NEXT: vmulps %xmm1, %xmm0, %xmm2
668 ; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
669 ; HASWELL-NO-FMA-NEXT: vsubps %xmm2, %xmm3, %xmm2
670 ; HASWELL-NO-FMA-NEXT: vmulps %xmm2, %xmm1, %xmm2
671 ; HASWELL-NO-FMA-NEXT: vaddps %xmm2, %xmm1, %xmm1
672 ; HASWELL-NO-FMA-NEXT: vmulps %xmm1, %xmm0, %xmm0
673 ; HASWELL-NO-FMA-NEXT: vsubps %xmm0, %xmm3, %xmm0
674 ; HASWELL-NO-FMA-NEXT: vmulps %xmm0, %xmm1, %xmm0
675 ; HASWELL-NO-FMA-NEXT: vaddps %xmm0, %xmm1, %xmm0
676 ; HASWELL-NO-FMA-NEXT: retq
678 ; AVX512-LABEL: v4f32_two_step:
680 ; AVX512-NEXT: vrcpps %xmm0, %xmm1
681 ; AVX512-NEXT: vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
682 ; AVX512-NEXT: vmovaps %xmm1, %xmm3
683 ; AVX512-NEXT: vfnmadd213ps {{.*#+}} xmm3 = -(xmm0 * xmm3) + xmm2
684 ; AVX512-NEXT: vfmadd132ps {{.*#+}} xmm3 = (xmm3 * xmm1) + xmm1
685 ; AVX512-NEXT: vfnmadd213ps {{.*#+}} xmm0 = -(xmm3 * xmm0) + xmm2
686 ; AVX512-NEXT: vfmadd132ps {{.*#+}} xmm0 = (xmm0 * xmm3) + xmm3
688 %div = fdiv fast <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>, %x
; 8 x float splat(1.0)/x with estimates disabled: SSE legalizes the 256-bit
; divide into two 128-bit divps; AVX targets emit a single ymm-wide vdivps.
692 define <8 x float> @v8f32_no_estimate(<8 x float> %x) #0 {
693 ; SSE-LABEL: v8f32_no_estimate:
695 ; SSE-NEXT: movaps {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
696 ; SSE-NEXT: movaps %xmm2, %xmm3
697 ; SSE-NEXT: divps %xmm0, %xmm3
698 ; SSE-NEXT: divps %xmm1, %xmm2
699 ; SSE-NEXT: movaps %xmm3, %xmm0
700 ; SSE-NEXT: movaps %xmm2, %xmm1
703 ; AVX-RECIP-LABEL: v8f32_no_estimate:
704 ; AVX-RECIP: # %bb.0:
705 ; AVX-RECIP-NEXT: vmovaps {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
706 ; AVX-RECIP-NEXT: vdivps %ymm0, %ymm1, %ymm0
707 ; AVX-RECIP-NEXT: retq
709 ; FMA-RECIP-LABEL: v8f32_no_estimate:
710 ; FMA-RECIP: # %bb.0:
711 ; FMA-RECIP-NEXT: vmovaps {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
712 ; FMA-RECIP-NEXT: vdivps %ymm0, %ymm1, %ymm0
713 ; FMA-RECIP-NEXT: retq
715 ; BDVER2-LABEL: v8f32_no_estimate:
717 ; BDVER2-NEXT: vmovaps {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
718 ; BDVER2-NEXT: vdivps %ymm0, %ymm1, %ymm0
721 ; BTVER2-LABEL: v8f32_no_estimate:
723 ; BTVER2-NEXT: vmovaps {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
724 ; BTVER2-NEXT: vdivps %ymm0, %ymm1, %ymm0
727 ; SANDY-LABEL: v8f32_no_estimate:
729 ; SANDY-NEXT: vmovaps {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
730 ; SANDY-NEXT: vdivps %ymm0, %ymm1, %ymm0
733 ; HASWELL-LABEL: v8f32_no_estimate:
735 ; HASWELL-NEXT: vbroadcastss {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
736 ; HASWELL-NEXT: vdivps %ymm0, %ymm1, %ymm0
739 ; HASWELL-NO-FMA-LABEL: v8f32_no_estimate:
740 ; HASWELL-NO-FMA: # %bb.0:
741 ; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
742 ; HASWELL-NO-FMA-NEXT: vdivps %ymm0, %ymm1, %ymm0
743 ; HASWELL-NO-FMA-NEXT: retq
745 ; AVX512-LABEL: v8f32_no_estimate:
747 ; AVX512-NEXT: vbroadcastss {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
748 ; AVX512-NEXT: vdivps %ymm0, %ymm1, %ymm0
750 %div = fdiv fast <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %x
; 8 x float splat(1.0)/x with one refinement step: SSE performs two separate
; 128-bit rcpps+refinement sequences (one per half); AVX targets do a single
; ymm-wide sequence. KNL and SKX are checked separately (SKX folds the splat
; constant as an FMA memory operand).
754 define <8 x float> @v8f32_one_step(<8 x float> %x) #1 {
755 ; SSE-LABEL: v8f32_one_step:
757 ; SSE-NEXT: rcpps %xmm0, %xmm4
758 ; SSE-NEXT: mulps %xmm4, %xmm0
759 ; SSE-NEXT: movaps {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
760 ; SSE-NEXT: movaps %xmm2, %xmm3
761 ; SSE-NEXT: subps %xmm0, %xmm3
762 ; SSE-NEXT: mulps %xmm4, %xmm3
763 ; SSE-NEXT: addps %xmm4, %xmm3
764 ; SSE-NEXT: rcpps %xmm1, %xmm0
765 ; SSE-NEXT: mulps %xmm0, %xmm1
766 ; SSE-NEXT: subps %xmm1, %xmm2
767 ; SSE-NEXT: mulps %xmm0, %xmm2
768 ; SSE-NEXT: addps %xmm0, %xmm2
769 ; SSE-NEXT: movaps %xmm3, %xmm0
770 ; SSE-NEXT: movaps %xmm2, %xmm1
773 ; AVX-RECIP-LABEL: v8f32_one_step:
774 ; AVX-RECIP: # %bb.0:
775 ; AVX-RECIP-NEXT: vrcpps %ymm0, %ymm1
776 ; AVX-RECIP-NEXT: vmulps %ymm1, %ymm0, %ymm0
777 ; AVX-RECIP-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
778 ; AVX-RECIP-NEXT: vsubps %ymm0, %ymm2, %ymm0
779 ; AVX-RECIP-NEXT: vmulps %ymm0, %ymm1, %ymm0
780 ; AVX-RECIP-NEXT: vaddps %ymm0, %ymm1, %ymm0
781 ; AVX-RECIP-NEXT: retq
783 ; FMA-RECIP-LABEL: v8f32_one_step:
784 ; FMA-RECIP: # %bb.0:
785 ; FMA-RECIP-NEXT: vrcpps %ymm0, %ymm1
786 ; FMA-RECIP-NEXT: vfnmadd213ps {{.*#+}} ymm0 = -(ymm1 * ymm0) + mem
787 ; FMA-RECIP-NEXT: vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm1) + ymm1
788 ; FMA-RECIP-NEXT: retq
790 ; BDVER2-LABEL: v8f32_one_step:
792 ; BDVER2-NEXT: vrcpps %ymm0, %ymm1
793 ; BDVER2-NEXT: vfnmaddps {{.*}}(%rip), %ymm1, %ymm0, %ymm0
794 ; BDVER2-NEXT: vfmaddps %ymm1, %ymm0, %ymm1, %ymm0
797 ; BTVER2-LABEL: v8f32_one_step:
799 ; BTVER2-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
800 ; BTVER2-NEXT: vrcpps %ymm0, %ymm1
801 ; BTVER2-NEXT: vmulps %ymm1, %ymm0, %ymm0
802 ; BTVER2-NEXT: vsubps %ymm0, %ymm2, %ymm0
803 ; BTVER2-NEXT: vmulps %ymm0, %ymm1, %ymm0
804 ; BTVER2-NEXT: vaddps %ymm0, %ymm1, %ymm0
807 ; SANDY-LABEL: v8f32_one_step:
809 ; SANDY-NEXT: vrcpps %ymm0, %ymm1
810 ; SANDY-NEXT: vmulps %ymm1, %ymm0, %ymm0
811 ; SANDY-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
812 ; SANDY-NEXT: vsubps %ymm0, %ymm2, %ymm0
813 ; SANDY-NEXT: vmulps %ymm0, %ymm1, %ymm0
814 ; SANDY-NEXT: vaddps %ymm0, %ymm1, %ymm0
817 ; HASWELL-LABEL: v8f32_one_step:
819 ; HASWELL-NEXT: vrcpps %ymm0, %ymm1
820 ; HASWELL-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
821 ; HASWELL-NEXT: vfnmadd213ps {{.*#+}} ymm0 = -(ymm1 * ymm0) + ymm2
822 ; HASWELL-NEXT: vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm1) + ymm1
825 ; HASWELL-NO-FMA-LABEL: v8f32_one_step:
826 ; HASWELL-NO-FMA: # %bb.0:
827 ; HASWELL-NO-FMA-NEXT: vrcpps %ymm0, %ymm1
828 ; HASWELL-NO-FMA-NEXT: vmulps %ymm1, %ymm0, %ymm0
829 ; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
830 ; HASWELL-NO-FMA-NEXT: vsubps %ymm0, %ymm2, %ymm0
831 ; HASWELL-NO-FMA-NEXT: vmulps %ymm0, %ymm1, %ymm0
832 ; HASWELL-NO-FMA-NEXT: vaddps %ymm0, %ymm1, %ymm0
833 ; HASWELL-NO-FMA-NEXT: retq
835 ; KNL-LABEL: v8f32_one_step:
837 ; KNL-NEXT: vrcpps %ymm0, %ymm1
838 ; KNL-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
839 ; KNL-NEXT: vfnmadd213ps {{.*#+}} ymm0 = -(ymm1 * ymm0) + ymm2
840 ; KNL-NEXT: vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm1) + ymm1
843 ; SKX-LABEL: v8f32_one_step:
845 ; SKX-NEXT: vrcpps %ymm0, %ymm1
846 ; SKX-NEXT: vfnmadd213ps {{.*#+}} ymm0 = -(ymm1 * ymm0) + mem
847 ; SKX-NEXT: vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm1) + ymm1
849 %div = fdiv fast <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %x
; v8f32_two_step: 1.0 / x for <8 x float> using the hardware reciprocal
; estimate (rcpps/vrcpps) refined by TWO Newton-Raphson iterations, as
; requested by attribute #2 ("vec-divf:2"). Each iteration is the
; (1 - x*e)*e + e pattern, fused into vfnmadd/vfmadd where FMA is available.
; NOTE(review): assertions below are autogenerated — do not hand-edit them;
; rerun utils/update_llc_test_checks.py instead.
853 define <8 x float> @v8f32_two_step(<8 x float> %x) #2 {
854 ; SSE-LABEL: v8f32_two_step:
856 ; SSE-NEXT: movaps %xmm1, %xmm2
857 ; SSE-NEXT: rcpps %xmm0, %xmm3
858 ; SSE-NEXT: movaps %xmm0, %xmm4
859 ; SSE-NEXT: mulps %xmm3, %xmm4
860 ; SSE-NEXT: movaps {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
861 ; SSE-NEXT: movaps %xmm1, %xmm5
862 ; SSE-NEXT: subps %xmm4, %xmm5
863 ; SSE-NEXT: mulps %xmm3, %xmm5
864 ; SSE-NEXT: addps %xmm3, %xmm5
865 ; SSE-NEXT: mulps %xmm5, %xmm0
866 ; SSE-NEXT: movaps %xmm1, %xmm3
867 ; SSE-NEXT: subps %xmm0, %xmm3
868 ; SSE-NEXT: mulps %xmm5, %xmm3
869 ; SSE-NEXT: addps %xmm5, %xmm3
870 ; SSE-NEXT: rcpps %xmm2, %xmm0
871 ; SSE-NEXT: movaps %xmm2, %xmm4
872 ; SSE-NEXT: mulps %xmm0, %xmm4
873 ; SSE-NEXT: movaps %xmm1, %xmm5
874 ; SSE-NEXT: subps %xmm4, %xmm5
875 ; SSE-NEXT: mulps %xmm0, %xmm5
876 ; SSE-NEXT: addps %xmm0, %xmm5
877 ; SSE-NEXT: mulps %xmm5, %xmm2
878 ; SSE-NEXT: subps %xmm2, %xmm1
879 ; SSE-NEXT: mulps %xmm5, %xmm1
880 ; SSE-NEXT: addps %xmm5, %xmm1
881 ; SSE-NEXT: movaps %xmm3, %xmm0
884 ; AVX-RECIP-LABEL: v8f32_two_step:
885 ; AVX-RECIP: # %bb.0:
886 ; AVX-RECIP-NEXT: vrcpps %ymm0, %ymm1
887 ; AVX-RECIP-NEXT: vmulps %ymm1, %ymm0, %ymm2
888 ; AVX-RECIP-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
889 ; AVX-RECIP-NEXT: vsubps %ymm2, %ymm3, %ymm2
890 ; AVX-RECIP-NEXT: vmulps %ymm2, %ymm1, %ymm2
891 ; AVX-RECIP-NEXT: vaddps %ymm2, %ymm1, %ymm1
892 ; AVX-RECIP-NEXT: vmulps %ymm1, %ymm0, %ymm0
893 ; AVX-RECIP-NEXT: vsubps %ymm0, %ymm3, %ymm0
894 ; AVX-RECIP-NEXT: vmulps %ymm0, %ymm1, %ymm0
895 ; AVX-RECIP-NEXT: vaddps %ymm0, %ymm1, %ymm0
896 ; AVX-RECIP-NEXT: retq
898 ; FMA-RECIP-LABEL: v8f32_two_step:
899 ; FMA-RECIP: # %bb.0:
900 ; FMA-RECIP-NEXT: vrcpps %ymm0, %ymm1
901 ; FMA-RECIP-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
902 ; FMA-RECIP-NEXT: vmovaps %ymm1, %ymm3
903 ; FMA-RECIP-NEXT: vfnmadd213ps {{.*#+}} ymm3 = -(ymm0 * ymm3) + ymm2
904 ; FMA-RECIP-NEXT: vfmadd132ps {{.*#+}} ymm3 = (ymm3 * ymm1) + ymm1
905 ; FMA-RECIP-NEXT: vfnmadd213ps {{.*#+}} ymm0 = -(ymm3 * ymm0) + ymm2
906 ; FMA-RECIP-NEXT: vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm3) + ymm3
907 ; FMA-RECIP-NEXT: retq
909 ; BDVER2-LABEL: v8f32_two_step:
911 ; BDVER2-NEXT: vrcpps %ymm0, %ymm1
912 ; BDVER2-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
913 ; BDVER2-NEXT: vfnmaddps %ymm2, %ymm1, %ymm0, %ymm3
914 ; BDVER2-NEXT: vfmaddps %ymm1, %ymm3, %ymm1, %ymm1
915 ; BDVER2-NEXT: vfnmaddps %ymm2, %ymm1, %ymm0, %ymm0
916 ; BDVER2-NEXT: vfmaddps %ymm1, %ymm0, %ymm1, %ymm0
919 ; BTVER2-LABEL: v8f32_two_step:
921 ; BTVER2-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
922 ; BTVER2-NEXT: vrcpps %ymm0, %ymm1
923 ; BTVER2-NEXT: vmulps %ymm1, %ymm0, %ymm2
924 ; BTVER2-NEXT: vsubps %ymm2, %ymm3, %ymm2
925 ; BTVER2-NEXT: vmulps %ymm2, %ymm1, %ymm2
926 ; BTVER2-NEXT: vaddps %ymm2, %ymm1, %ymm1
927 ; BTVER2-NEXT: vmulps %ymm1, %ymm0, %ymm0
928 ; BTVER2-NEXT: vsubps %ymm0, %ymm3, %ymm0
929 ; BTVER2-NEXT: vmulps %ymm0, %ymm1, %ymm0
930 ; BTVER2-NEXT: vaddps %ymm0, %ymm1, %ymm0
933 ; SANDY-LABEL: v8f32_two_step:
935 ; SANDY-NEXT: vrcpps %ymm0, %ymm1
936 ; SANDY-NEXT: vmulps %ymm1, %ymm0, %ymm2
937 ; SANDY-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
938 ; SANDY-NEXT: vsubps %ymm2, %ymm3, %ymm2
939 ; SANDY-NEXT: vmulps %ymm2, %ymm1, %ymm2
940 ; SANDY-NEXT: vaddps %ymm2, %ymm1, %ymm1
941 ; SANDY-NEXT: vmulps %ymm1, %ymm0, %ymm0
942 ; SANDY-NEXT: vsubps %ymm0, %ymm3, %ymm0
943 ; SANDY-NEXT: vmulps %ymm0, %ymm1, %ymm0
944 ; SANDY-NEXT: vaddps %ymm0, %ymm1, %ymm0
947 ; HASWELL-LABEL: v8f32_two_step:
949 ; HASWELL-NEXT: vrcpps %ymm0, %ymm1
950 ; HASWELL-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
951 ; HASWELL-NEXT: vmovaps %ymm1, %ymm3
952 ; HASWELL-NEXT: vfnmadd213ps {{.*#+}} ymm3 = -(ymm0 * ymm3) + ymm2
953 ; HASWELL-NEXT: vfmadd132ps {{.*#+}} ymm3 = (ymm3 * ymm1) + ymm1
954 ; HASWELL-NEXT: vfnmadd213ps {{.*#+}} ymm0 = -(ymm3 * ymm0) + ymm2
955 ; HASWELL-NEXT: vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm3) + ymm3
958 ; HASWELL-NO-FMA-LABEL: v8f32_two_step:
959 ; HASWELL-NO-FMA: # %bb.0:
960 ; HASWELL-NO-FMA-NEXT: vrcpps %ymm0, %ymm1
961 ; HASWELL-NO-FMA-NEXT: vmulps %ymm1, %ymm0, %ymm2
962 ; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
963 ; HASWELL-NO-FMA-NEXT: vsubps %ymm2, %ymm3, %ymm2
964 ; HASWELL-NO-FMA-NEXT: vmulps %ymm2, %ymm1, %ymm2
965 ; HASWELL-NO-FMA-NEXT: vaddps %ymm2, %ymm1, %ymm1
966 ; HASWELL-NO-FMA-NEXT: vmulps %ymm1, %ymm0, %ymm0
967 ; HASWELL-NO-FMA-NEXT: vsubps %ymm0, %ymm3, %ymm0
968 ; HASWELL-NO-FMA-NEXT: vmulps %ymm0, %ymm1, %ymm0
969 ; HASWELL-NO-FMA-NEXT: vaddps %ymm0, %ymm1, %ymm0
970 ; HASWELL-NO-FMA-NEXT: retq
972 ; AVX512-LABEL: v8f32_two_step:
974 ; AVX512-NEXT: vrcpps %ymm0, %ymm1
975 ; AVX512-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
976 ; AVX512-NEXT: vmovaps %ymm1, %ymm3
977 ; AVX512-NEXT: vfnmadd213ps {{.*#+}} ymm3 = -(ymm0 * ymm3) + ymm2
978 ; AVX512-NEXT: vfmadd132ps {{.*#+}} ymm3 = (ymm3 * ymm1) + ymm1
979 ; AVX512-NEXT: vfnmadd213ps {{.*#+}} ymm0 = -(ymm3 * ymm0) + ymm2
980 ; AVX512-NEXT: vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm3) + ymm3
982 %div = fdiv fast <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %x
; v16f32_no_estimate: 1.0 / x for <16 x float> with reciprocal estimates
; explicitly DISABLED (attribute #0: "!divf,!vec-divf"), so the fdiv must
; lower to real division instructions: four divps on SSE, two ymm vdivps on
; AVX targets, and a single zmm vdivps on AVX512.
; NOTE(review): assertions below are autogenerated — regenerate with
; utils/update_llc_test_checks.py rather than editing by hand.
986 define <16 x float> @v16f32_no_estimate(<16 x float> %x) #0 {
987 ; SSE-LABEL: v16f32_no_estimate:
989 ; SSE-NEXT: movaps {{.*#+}} xmm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
990 ; SSE-NEXT: movaps %xmm4, %xmm5
991 ; SSE-NEXT: divps %xmm0, %xmm5
992 ; SSE-NEXT: movaps %xmm4, %xmm6
993 ; SSE-NEXT: divps %xmm1, %xmm6
994 ; SSE-NEXT: movaps %xmm4, %xmm7
995 ; SSE-NEXT: divps %xmm2, %xmm7
996 ; SSE-NEXT: divps %xmm3, %xmm4
997 ; SSE-NEXT: movaps %xmm5, %xmm0
998 ; SSE-NEXT: movaps %xmm6, %xmm1
999 ; SSE-NEXT: movaps %xmm7, %xmm2
1000 ; SSE-NEXT: movaps %xmm4, %xmm3
1003 ; AVX-RECIP-LABEL: v16f32_no_estimate:
1004 ; AVX-RECIP: # %bb.0:
1005 ; AVX-RECIP-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
1006 ; AVX-RECIP-NEXT: vdivps %ymm0, %ymm2, %ymm0
1007 ; AVX-RECIP-NEXT: vdivps %ymm1, %ymm2, %ymm1
1008 ; AVX-RECIP-NEXT: retq
1010 ; FMA-RECIP-LABEL: v16f32_no_estimate:
1011 ; FMA-RECIP: # %bb.0:
1012 ; FMA-RECIP-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
1013 ; FMA-RECIP-NEXT: vdivps %ymm0, %ymm2, %ymm0
1014 ; FMA-RECIP-NEXT: vdivps %ymm1, %ymm2, %ymm1
1015 ; FMA-RECIP-NEXT: retq
1017 ; BDVER2-LABEL: v16f32_no_estimate:
1019 ; BDVER2-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
1020 ; BDVER2-NEXT: vdivps %ymm0, %ymm2, %ymm0
1021 ; BDVER2-NEXT: vdivps %ymm1, %ymm2, %ymm1
1024 ; BTVER2-LABEL: v16f32_no_estimate:
1026 ; BTVER2-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
1027 ; BTVER2-NEXT: vdivps %ymm0, %ymm2, %ymm0
1028 ; BTVER2-NEXT: vdivps %ymm1, %ymm2, %ymm1
1031 ; SANDY-LABEL: v16f32_no_estimate:
1033 ; SANDY-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
1034 ; SANDY-NEXT: vdivps %ymm0, %ymm2, %ymm0
1035 ; SANDY-NEXT: vdivps %ymm1, %ymm2, %ymm1
1038 ; HASWELL-LABEL: v16f32_no_estimate:
1040 ; HASWELL-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
1041 ; HASWELL-NEXT: vdivps %ymm0, %ymm2, %ymm0
1042 ; HASWELL-NEXT: vdivps %ymm1, %ymm2, %ymm1
1043 ; HASWELL-NEXT: retq
1045 ; HASWELL-NO-FMA-LABEL: v16f32_no_estimate:
1046 ; HASWELL-NO-FMA: # %bb.0:
1047 ; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
1048 ; HASWELL-NO-FMA-NEXT: vdivps %ymm0, %ymm2, %ymm0
1049 ; HASWELL-NO-FMA-NEXT: vdivps %ymm1, %ymm2, %ymm1
1050 ; HASWELL-NO-FMA-NEXT: retq
1052 ; AVX512-LABEL: v16f32_no_estimate:
1054 ; AVX512-NEXT: vbroadcastss {{.*#+}} zmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
1055 ; AVX512-NEXT: vdivps %zmm0, %zmm1, %zmm0
1057 %div = fdiv fast <16 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %x
1058 ret <16 x float> %div
; v16f32_one_step: 1.0 / x for <16 x float> using the reciprocal estimate
; (rcpps per 128/256-bit chunk; single vrcp14ps zmm on AVX512) refined by ONE
; Newton-Raphson step, per attribute #1 ("divf,vec-divf"). The refinement is
; e + e*(1 - x*e), fused into vfnmadd/vfmadd on FMA-capable targets.
; NOTE(review): assertions below are autogenerated — regenerate with
; utils/update_llc_test_checks.py rather than editing by hand.
1061 define <16 x float> @v16f32_one_step(<16 x float> %x) #1 {
1062 ; SSE-LABEL: v16f32_one_step:
1064 ; SSE-NEXT: movaps %xmm3, %xmm4
1065 ; SSE-NEXT: movaps %xmm0, %xmm5
1066 ; SSE-NEXT: rcpps %xmm0, %xmm6
1067 ; SSE-NEXT: mulps %xmm6, %xmm5
1068 ; SSE-NEXT: movaps {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
1069 ; SSE-NEXT: movaps %xmm3, %xmm0
1070 ; SSE-NEXT: subps %xmm5, %xmm0
1071 ; SSE-NEXT: mulps %xmm6, %xmm0
1072 ; SSE-NEXT: addps %xmm6, %xmm0
1073 ; SSE-NEXT: rcpps %xmm1, %xmm6
1074 ; SSE-NEXT: mulps %xmm6, %xmm1
1075 ; SSE-NEXT: movaps %xmm3, %xmm5
1076 ; SSE-NEXT: subps %xmm1, %xmm5
1077 ; SSE-NEXT: mulps %xmm6, %xmm5
1078 ; SSE-NEXT: addps %xmm6, %xmm5
1079 ; SSE-NEXT: rcpps %xmm2, %xmm1
1080 ; SSE-NEXT: mulps %xmm1, %xmm2
1081 ; SSE-NEXT: movaps %xmm3, %xmm6
1082 ; SSE-NEXT: subps %xmm2, %xmm6
1083 ; SSE-NEXT: mulps %xmm1, %xmm6
1084 ; SSE-NEXT: addps %xmm1, %xmm6
1085 ; SSE-NEXT: rcpps %xmm4, %xmm1
1086 ; SSE-NEXT: mulps %xmm1, %xmm4
1087 ; SSE-NEXT: subps %xmm4, %xmm3
1088 ; SSE-NEXT: mulps %xmm1, %xmm3
1089 ; SSE-NEXT: addps %xmm1, %xmm3
1090 ; SSE-NEXT: movaps %xmm5, %xmm1
1091 ; SSE-NEXT: movaps %xmm6, %xmm2
1094 ; AVX-RECIP-LABEL: v16f32_one_step:
1095 ; AVX-RECIP: # %bb.0:
1096 ; AVX-RECIP-NEXT: vrcpps %ymm0, %ymm2
1097 ; AVX-RECIP-NEXT: vmulps %ymm2, %ymm0, %ymm0
1098 ; AVX-RECIP-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
1099 ; AVX-RECIP-NEXT: vsubps %ymm0, %ymm3, %ymm0
1100 ; AVX-RECIP-NEXT: vmulps %ymm0, %ymm2, %ymm0
1101 ; AVX-RECIP-NEXT: vaddps %ymm0, %ymm2, %ymm0
1102 ; AVX-RECIP-NEXT: vrcpps %ymm1, %ymm2
1103 ; AVX-RECIP-NEXT: vmulps %ymm2, %ymm1, %ymm1
1104 ; AVX-RECIP-NEXT: vsubps %ymm1, %ymm3, %ymm1
1105 ; AVX-RECIP-NEXT: vmulps %ymm1, %ymm2, %ymm1
1106 ; AVX-RECIP-NEXT: vaddps %ymm1, %ymm2, %ymm1
1107 ; AVX-RECIP-NEXT: retq
1109 ; FMA-RECIP-LABEL: v16f32_one_step:
1110 ; FMA-RECIP: # %bb.0:
1111 ; FMA-RECIP-NEXT: vrcpps %ymm0, %ymm2
1112 ; FMA-RECIP-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
1113 ; FMA-RECIP-NEXT: vfnmadd213ps {{.*#+}} ymm0 = -(ymm2 * ymm0) + ymm3
1114 ; FMA-RECIP-NEXT: vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm2) + ymm2
1115 ; FMA-RECIP-NEXT: vrcpps %ymm1, %ymm2
1116 ; FMA-RECIP-NEXT: vfnmadd213ps {{.*#+}} ymm1 = -(ymm2 * ymm1) + ymm3
1117 ; FMA-RECIP-NEXT: vfmadd132ps {{.*#+}} ymm1 = (ymm1 * ymm2) + ymm2
1118 ; FMA-RECIP-NEXT: retq
1120 ; BDVER2-LABEL: v16f32_one_step:
1122 ; BDVER2-NEXT: vrcpps %ymm0, %ymm2
1123 ; BDVER2-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
1124 ; BDVER2-NEXT: vrcpps %ymm1, %ymm4
1125 ; BDVER2-NEXT: vfnmaddps %ymm3, %ymm2, %ymm0, %ymm0
1126 ; BDVER2-NEXT: vfnmaddps %ymm3, %ymm4, %ymm1, %ymm1
1127 ; BDVER2-NEXT: vfmaddps %ymm2, %ymm0, %ymm2, %ymm0
1128 ; BDVER2-NEXT: vfmaddps %ymm4, %ymm1, %ymm4, %ymm1
1131 ; BTVER2-LABEL: v16f32_one_step:
1133 ; BTVER2-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
1134 ; BTVER2-NEXT: vrcpps %ymm0, %ymm2
1135 ; BTVER2-NEXT: vrcpps %ymm1, %ymm4
1136 ; BTVER2-NEXT: vmulps %ymm2, %ymm0, %ymm0
1137 ; BTVER2-NEXT: vmulps %ymm4, %ymm1, %ymm1
1138 ; BTVER2-NEXT: vsubps %ymm0, %ymm3, %ymm0
1139 ; BTVER2-NEXT: vsubps %ymm1, %ymm3, %ymm1
1140 ; BTVER2-NEXT: vmulps %ymm0, %ymm2, %ymm0
1141 ; BTVER2-NEXT: vmulps %ymm1, %ymm4, %ymm1
1142 ; BTVER2-NEXT: vaddps %ymm0, %ymm2, %ymm0
1143 ; BTVER2-NEXT: vaddps %ymm1, %ymm4, %ymm1
1146 ; SANDY-LABEL: v16f32_one_step:
1148 ; SANDY-NEXT: vrcpps %ymm0, %ymm2
1149 ; SANDY-NEXT: vmulps %ymm2, %ymm0, %ymm0
1150 ; SANDY-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
1151 ; SANDY-NEXT: vsubps %ymm0, %ymm3, %ymm0
1152 ; SANDY-NEXT: vmulps %ymm0, %ymm2, %ymm0
1153 ; SANDY-NEXT: vaddps %ymm0, %ymm2, %ymm0
1154 ; SANDY-NEXT: vrcpps %ymm1, %ymm2
1155 ; SANDY-NEXT: vmulps %ymm2, %ymm1, %ymm1
1156 ; SANDY-NEXT: vsubps %ymm1, %ymm3, %ymm1
1157 ; SANDY-NEXT: vmulps %ymm1, %ymm2, %ymm1
1158 ; SANDY-NEXT: vaddps %ymm1, %ymm2, %ymm1
1161 ; HASWELL-LABEL: v16f32_one_step:
1163 ; HASWELL-NEXT: vrcpps %ymm0, %ymm2
1164 ; HASWELL-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
1165 ; HASWELL-NEXT: vrcpps %ymm1, %ymm4
1166 ; HASWELL-NEXT: vfnmadd213ps {{.*#+}} ymm0 = -(ymm2 * ymm0) + ymm3
1167 ; HASWELL-NEXT: vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm2) + ymm2
1168 ; HASWELL-NEXT: vfnmadd213ps {{.*#+}} ymm1 = -(ymm4 * ymm1) + ymm3
1169 ; HASWELL-NEXT: vfmadd132ps {{.*#+}} ymm1 = (ymm1 * ymm4) + ymm4
1170 ; HASWELL-NEXT: retq
1172 ; HASWELL-NO-FMA-LABEL: v16f32_one_step:
1173 ; HASWELL-NO-FMA: # %bb.0:
1174 ; HASWELL-NO-FMA-NEXT: vrcpps %ymm0, %ymm2
1175 ; HASWELL-NO-FMA-NEXT: vmulps %ymm2, %ymm0, %ymm0
1176 ; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
1177 ; HASWELL-NO-FMA-NEXT: vsubps %ymm0, %ymm3, %ymm0
1178 ; HASWELL-NO-FMA-NEXT: vmulps %ymm0, %ymm2, %ymm0
1179 ; HASWELL-NO-FMA-NEXT: vaddps %ymm0, %ymm2, %ymm0
1180 ; HASWELL-NO-FMA-NEXT: vrcpps %ymm1, %ymm2
1181 ; HASWELL-NO-FMA-NEXT: vmulps %ymm2, %ymm1, %ymm1
1182 ; HASWELL-NO-FMA-NEXT: vsubps %ymm1, %ymm3, %ymm1
1183 ; HASWELL-NO-FMA-NEXT: vmulps %ymm1, %ymm2, %ymm1
1184 ; HASWELL-NO-FMA-NEXT: vaddps %ymm1, %ymm2, %ymm1
1185 ; HASWELL-NO-FMA-NEXT: retq
1187 ; AVX512-LABEL: v16f32_one_step:
1189 ; AVX512-NEXT: vrcp14ps %zmm0, %zmm1
1190 ; AVX512-NEXT: vfnmadd213ps {{.*#+}} zmm0 = -(zmm1 * zmm0) + mem
1191 ; AVX512-NEXT: vfmadd132ps {{.*#+}} zmm0 = (zmm0 * zmm1) + zmm1
1193 %div = fdiv fast <16 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %x
1194 ret <16 x float> %div
; v16f32_two_step: 1.0 / x for <16 x float> using the reciprocal estimate
; refined by TWO Newton-Raphson steps (attribute #2: "vec-divf:2"). Each
; 128/256-bit chunk repeats the e + e*(1 - x*e) refinement twice; AVX512 does
; the whole vector in zmm registers starting from vrcp14ps.
; NOTE(review): assertions below are autogenerated — regenerate with
; utils/update_llc_test_checks.py rather than editing by hand.
1197 define <16 x float> @v16f32_two_step(<16 x float> %x) #2 {
1198 ; SSE-LABEL: v16f32_two_step:
1200 ; SSE-NEXT: movaps %xmm3, %xmm4
1201 ; SSE-NEXT: movaps %xmm1, %xmm5
1202 ; SSE-NEXT: movaps %xmm0, %xmm1
1203 ; SSE-NEXT: rcpps %xmm0, %xmm0
1204 ; SSE-NEXT: movaps %xmm1, %xmm6
1205 ; SSE-NEXT: mulps %xmm0, %xmm6
1206 ; SSE-NEXT: movaps {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
1207 ; SSE-NEXT: movaps %xmm3, %xmm7
1208 ; SSE-NEXT: subps %xmm6, %xmm7
1209 ; SSE-NEXT: mulps %xmm0, %xmm7
1210 ; SSE-NEXT: addps %xmm0, %xmm7
1211 ; SSE-NEXT: mulps %xmm7, %xmm1
1212 ; SSE-NEXT: movaps %xmm3, %xmm0
1213 ; SSE-NEXT: subps %xmm1, %xmm0
1214 ; SSE-NEXT: mulps %xmm7, %xmm0
1215 ; SSE-NEXT: addps %xmm7, %xmm0
1216 ; SSE-NEXT: rcpps %xmm5, %xmm1
1217 ; SSE-NEXT: movaps %xmm5, %xmm6
1218 ; SSE-NEXT: mulps %xmm1, %xmm6
1219 ; SSE-NEXT: movaps %xmm3, %xmm7
1220 ; SSE-NEXT: subps %xmm6, %xmm7
1221 ; SSE-NEXT: mulps %xmm1, %xmm7
1222 ; SSE-NEXT: addps %xmm1, %xmm7
1223 ; SSE-NEXT: mulps %xmm7, %xmm5
1224 ; SSE-NEXT: movaps %xmm3, %xmm1
1225 ; SSE-NEXT: subps %xmm5, %xmm1
1226 ; SSE-NEXT: mulps %xmm7, %xmm1
1227 ; SSE-NEXT: addps %xmm7, %xmm1
1228 ; SSE-NEXT: rcpps %xmm2, %xmm5
1229 ; SSE-NEXT: movaps %xmm2, %xmm6
1230 ; SSE-NEXT: mulps %xmm5, %xmm6
1231 ; SSE-NEXT: movaps %xmm3, %xmm7
1232 ; SSE-NEXT: subps %xmm6, %xmm7
1233 ; SSE-NEXT: mulps %xmm5, %xmm7
1234 ; SSE-NEXT: addps %xmm5, %xmm7
1235 ; SSE-NEXT: mulps %xmm7, %xmm2
1236 ; SSE-NEXT: movaps %xmm3, %xmm5
1237 ; SSE-NEXT: subps %xmm2, %xmm5
1238 ; SSE-NEXT: mulps %xmm7, %xmm5
1239 ; SSE-NEXT: addps %xmm7, %xmm5
1240 ; SSE-NEXT: rcpps %xmm4, %xmm2
1241 ; SSE-NEXT: movaps %xmm4, %xmm6
1242 ; SSE-NEXT: mulps %xmm2, %xmm6
1243 ; SSE-NEXT: movaps %xmm3, %xmm7
1244 ; SSE-NEXT: subps %xmm6, %xmm7
1245 ; SSE-NEXT: mulps %xmm2, %xmm7
1246 ; SSE-NEXT: addps %xmm2, %xmm7
1247 ; SSE-NEXT: mulps %xmm7, %xmm4
1248 ; SSE-NEXT: subps %xmm4, %xmm3
1249 ; SSE-NEXT: mulps %xmm7, %xmm3
1250 ; SSE-NEXT: addps %xmm7, %xmm3
1251 ; SSE-NEXT: movaps %xmm5, %xmm2
1254 ; AVX-RECIP-LABEL: v16f32_two_step:
1255 ; AVX-RECIP: # %bb.0:
1256 ; AVX-RECIP-NEXT: vrcpps %ymm0, %ymm2
1257 ; AVX-RECIP-NEXT: vmulps %ymm2, %ymm0, %ymm3
1258 ; AVX-RECIP-NEXT: vmovaps {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
1259 ; AVX-RECIP-NEXT: vsubps %ymm3, %ymm4, %ymm3
1260 ; AVX-RECIP-NEXT: vmulps %ymm3, %ymm2, %ymm3
1261 ; AVX-RECIP-NEXT: vaddps %ymm3, %ymm2, %ymm2
1262 ; AVX-RECIP-NEXT: vmulps %ymm2, %ymm0, %ymm0
1263 ; AVX-RECIP-NEXT: vsubps %ymm0, %ymm4, %ymm0
1264 ; AVX-RECIP-NEXT: vmulps %ymm0, %ymm2, %ymm0
1265 ; AVX-RECIP-NEXT: vaddps %ymm0, %ymm2, %ymm0
1266 ; AVX-RECIP-NEXT: vrcpps %ymm1, %ymm2
1267 ; AVX-RECIP-NEXT: vmulps %ymm2, %ymm1, %ymm3
1268 ; AVX-RECIP-NEXT: vsubps %ymm3, %ymm4, %ymm3
1269 ; AVX-RECIP-NEXT: vmulps %ymm3, %ymm2, %ymm3
1270 ; AVX-RECIP-NEXT: vaddps %ymm3, %ymm2, %ymm2
1271 ; AVX-RECIP-NEXT: vmulps %ymm2, %ymm1, %ymm1
1272 ; AVX-RECIP-NEXT: vsubps %ymm1, %ymm4, %ymm1
1273 ; AVX-RECIP-NEXT: vmulps %ymm1, %ymm2, %ymm1
1274 ; AVX-RECIP-NEXT: vaddps %ymm1, %ymm2, %ymm1
1275 ; AVX-RECIP-NEXT: retq
1277 ; FMA-RECIP-LABEL: v16f32_two_step:
1278 ; FMA-RECIP: # %bb.0:
1279 ; FMA-RECIP-NEXT: vrcpps %ymm0, %ymm2
1280 ; FMA-RECIP-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
1281 ; FMA-RECIP-NEXT: vmovaps %ymm2, %ymm4
1282 ; FMA-RECIP-NEXT: vfnmadd213ps {{.*#+}} ymm4 = -(ymm0 * ymm4) + ymm3
1283 ; FMA-RECIP-NEXT: vfmadd132ps {{.*#+}} ymm4 = (ymm4 * ymm2) + ymm2
1284 ; FMA-RECIP-NEXT: vfnmadd213ps {{.*#+}} ymm0 = -(ymm4 * ymm0) + ymm3
1285 ; FMA-RECIP-NEXT: vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm4) + ymm4
1286 ; FMA-RECIP-NEXT: vrcpps %ymm1, %ymm2
1287 ; FMA-RECIP-NEXT: vmovaps %ymm2, %ymm4
1288 ; FMA-RECIP-NEXT: vfnmadd213ps {{.*#+}} ymm4 = -(ymm1 * ymm4) + ymm3
1289 ; FMA-RECIP-NEXT: vfmadd132ps {{.*#+}} ymm4 = (ymm4 * ymm2) + ymm2
1290 ; FMA-RECIP-NEXT: vfnmadd213ps {{.*#+}} ymm1 = -(ymm4 * ymm1) + ymm3
1291 ; FMA-RECIP-NEXT: vfmadd132ps {{.*#+}} ymm1 = (ymm1 * ymm4) + ymm4
1292 ; FMA-RECIP-NEXT: retq
1294 ; BDVER2-LABEL: v16f32_two_step:
1296 ; BDVER2-NEXT: vrcpps %ymm0, %ymm2
1297 ; BDVER2-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
1298 ; BDVER2-NEXT: vfnmaddps %ymm3, %ymm2, %ymm0, %ymm4
1299 ; BDVER2-NEXT: vfmaddps %ymm2, %ymm4, %ymm2, %ymm2
1300 ; BDVER2-NEXT: vfnmaddps %ymm3, %ymm2, %ymm0, %ymm0
1301 ; BDVER2-NEXT: vfmaddps %ymm2, %ymm0, %ymm2, %ymm0
1302 ; BDVER2-NEXT: vrcpps %ymm1, %ymm2
1303 ; BDVER2-NEXT: vfnmaddps %ymm3, %ymm2, %ymm1, %ymm4
1304 ; BDVER2-NEXT: vfmaddps %ymm2, %ymm4, %ymm2, %ymm2
1305 ; BDVER2-NEXT: vfnmaddps %ymm3, %ymm2, %ymm1, %ymm1
1306 ; BDVER2-NEXT: vfmaddps %ymm2, %ymm1, %ymm2, %ymm1
1309 ; BTVER2-LABEL: v16f32_two_step:
1311 ; BTVER2-NEXT: vmovaps {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
1312 ; BTVER2-NEXT: vrcpps %ymm0, %ymm2
1313 ; BTVER2-NEXT: vmulps %ymm2, %ymm0, %ymm3
1314 ; BTVER2-NEXT: vsubps %ymm3, %ymm4, %ymm3
1315 ; BTVER2-NEXT: vmulps %ymm3, %ymm2, %ymm3
1316 ; BTVER2-NEXT: vaddps %ymm3, %ymm2, %ymm2
1317 ; BTVER2-NEXT: vmulps %ymm2, %ymm0, %ymm0
1318 ; BTVER2-NEXT: vsubps %ymm0, %ymm4, %ymm0
1319 ; BTVER2-NEXT: vmulps %ymm0, %ymm2, %ymm0
1320 ; BTVER2-NEXT: vaddps %ymm0, %ymm2, %ymm0
1321 ; BTVER2-NEXT: vrcpps %ymm1, %ymm2
1322 ; BTVER2-NEXT: vmulps %ymm2, %ymm1, %ymm3
1323 ; BTVER2-NEXT: vsubps %ymm3, %ymm4, %ymm3
1324 ; BTVER2-NEXT: vmulps %ymm3, %ymm2, %ymm3
1325 ; BTVER2-NEXT: vaddps %ymm3, %ymm2, %ymm2
1326 ; BTVER2-NEXT: vmulps %ymm2, %ymm1, %ymm1
1327 ; BTVER2-NEXT: vsubps %ymm1, %ymm4, %ymm1
1328 ; BTVER2-NEXT: vmulps %ymm1, %ymm2, %ymm1
1329 ; BTVER2-NEXT: vaddps %ymm1, %ymm2, %ymm1
1332 ; SANDY-LABEL: v16f32_two_step:
1334 ; SANDY-NEXT: vrcpps %ymm0, %ymm2
1335 ; SANDY-NEXT: vmulps %ymm2, %ymm0, %ymm3
1336 ; SANDY-NEXT: vmovaps {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
1337 ; SANDY-NEXT: vsubps %ymm3, %ymm4, %ymm3
1338 ; SANDY-NEXT: vmulps %ymm3, %ymm2, %ymm3
1339 ; SANDY-NEXT: vaddps %ymm3, %ymm2, %ymm2
1340 ; SANDY-NEXT: vmulps %ymm2, %ymm0, %ymm0
1341 ; SANDY-NEXT: vsubps %ymm0, %ymm4, %ymm0
1342 ; SANDY-NEXT: vmulps %ymm0, %ymm2, %ymm0
1343 ; SANDY-NEXT: vaddps %ymm0, %ymm2, %ymm0
1344 ; SANDY-NEXT: vrcpps %ymm1, %ymm2
1345 ; SANDY-NEXT: vmulps %ymm2, %ymm1, %ymm3
1346 ; SANDY-NEXT: vsubps %ymm3, %ymm4, %ymm3
1347 ; SANDY-NEXT: vmulps %ymm3, %ymm2, %ymm3
1348 ; SANDY-NEXT: vaddps %ymm3, %ymm2, %ymm2
1349 ; SANDY-NEXT: vmulps %ymm2, %ymm1, %ymm1
1350 ; SANDY-NEXT: vsubps %ymm1, %ymm4, %ymm1
1351 ; SANDY-NEXT: vmulps %ymm1, %ymm2, %ymm1
1352 ; SANDY-NEXT: vaddps %ymm1, %ymm2, %ymm1
1355 ; HASWELL-LABEL: v16f32_two_step:
1357 ; HASWELL-NEXT: vrcpps %ymm0, %ymm2
1358 ; HASWELL-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
1359 ; HASWELL-NEXT: vmovaps %ymm2, %ymm4
1360 ; HASWELL-NEXT: vfnmadd213ps {{.*#+}} ymm4 = -(ymm0 * ymm4) + ymm3
1361 ; HASWELL-NEXT: vfmadd132ps {{.*#+}} ymm4 = (ymm4 * ymm2) + ymm2
1362 ; HASWELL-NEXT: vfnmadd213ps {{.*#+}} ymm0 = -(ymm4 * ymm0) + ymm3
1363 ; HASWELL-NEXT: vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm4) + ymm4
1364 ; HASWELL-NEXT: vrcpps %ymm1, %ymm2
1365 ; HASWELL-NEXT: vmovaps %ymm2, %ymm4
1366 ; HASWELL-NEXT: vfnmadd213ps {{.*#+}} ymm4 = -(ymm1 * ymm4) + ymm3
1367 ; HASWELL-NEXT: vfmadd132ps {{.*#+}} ymm4 = (ymm4 * ymm2) + ymm2
1368 ; HASWELL-NEXT: vfnmadd213ps {{.*#+}} ymm1 = -(ymm4 * ymm1) + ymm3
1369 ; HASWELL-NEXT: vfmadd132ps {{.*#+}} ymm1 = (ymm1 * ymm4) + ymm4
1370 ; HASWELL-NEXT: retq
1372 ; HASWELL-NO-FMA-LABEL: v16f32_two_step:
1373 ; HASWELL-NO-FMA: # %bb.0:
1374 ; HASWELL-NO-FMA-NEXT: vrcpps %ymm0, %ymm2
1375 ; HASWELL-NO-FMA-NEXT: vmulps %ymm2, %ymm0, %ymm3
1376 ; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
1377 ; HASWELL-NO-FMA-NEXT: vsubps %ymm3, %ymm4, %ymm3
1378 ; HASWELL-NO-FMA-NEXT: vmulps %ymm3, %ymm2, %ymm3
1379 ; HASWELL-NO-FMA-NEXT: vaddps %ymm3, %ymm2, %ymm2
1380 ; HASWELL-NO-FMA-NEXT: vmulps %ymm2, %ymm0, %ymm0
1381 ; HASWELL-NO-FMA-NEXT: vsubps %ymm0, %ymm4, %ymm0
1382 ; HASWELL-NO-FMA-NEXT: vmulps %ymm0, %ymm2, %ymm0
1383 ; HASWELL-NO-FMA-NEXT: vaddps %ymm0, %ymm2, %ymm0
1384 ; HASWELL-NO-FMA-NEXT: vrcpps %ymm1, %ymm2
1385 ; HASWELL-NO-FMA-NEXT: vmulps %ymm2, %ymm1, %ymm3
1386 ; HASWELL-NO-FMA-NEXT: vsubps %ymm3, %ymm4, %ymm3
1387 ; HASWELL-NO-FMA-NEXT: vmulps %ymm3, %ymm2, %ymm3
1388 ; HASWELL-NO-FMA-NEXT: vaddps %ymm3, %ymm2, %ymm2
1389 ; HASWELL-NO-FMA-NEXT: vmulps %ymm2, %ymm1, %ymm1
1390 ; HASWELL-NO-FMA-NEXT: vsubps %ymm1, %ymm4, %ymm1
1391 ; HASWELL-NO-FMA-NEXT: vmulps %ymm1, %ymm2, %ymm1
1392 ; HASWELL-NO-FMA-NEXT: vaddps %ymm1, %ymm2, %ymm1
1393 ; HASWELL-NO-FMA-NEXT: retq
1395 ; AVX512-LABEL: v16f32_two_step:
1397 ; AVX512-NEXT: vrcp14ps %zmm0, %zmm1
1398 ; AVX512-NEXT: vbroadcastss {{.*#+}} zmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
1399 ; AVX512-NEXT: vmovaps %zmm1, %zmm3
1400 ; AVX512-NEXT: vfnmadd213ps {{.*#+}} zmm3 = -(zmm0 * zmm3) + zmm2
1401 ; AVX512-NEXT: vfmadd132ps {{.*#+}} zmm3 = (zmm3 * zmm1) + zmm1
1402 ; AVX512-NEXT: vfnmadd213ps {{.*#+}} zmm0 = -(zmm3 * zmm0) + zmm2
1403 ; AVX512-NEXT: vfmadd132ps {{.*#+}} zmm0 = (zmm0 * zmm3) + zmm3
1405 %div = fdiv fast <16 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %x
1406 ret <16 x float> %div
; Attribute groups controlling the "reciprocal-estimates" lowering:
;   #0 — "!divf,!vec-divf": estimates disabled; fdiv lowers to real division.
;   #1 — "divf,vec-divf": estimates enabled with the default refinement count.
;   #2 — "divf:2,vec-divf:2": estimates with two Newton-Raphson steps.
1409 attributes #0 = { "unsafe-fp-math"="true" "reciprocal-estimates"="!divf,!vec-divf" }
1410 attributes #1 = { "unsafe-fp-math"="true" "reciprocal-estimates"="divf,vec-divf" }
1411 attributes #2 = { "unsafe-fp-math"="true" "reciprocal-estimates"="divf:2,vec-divf:2" }