1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=CHECK --check-prefix=SSE --check-prefix=SSE-RECIP
3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX-RECIP
4 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fma | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=FMA-RECIP
5 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=btver2 -print-schedule | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=BTVER2
6 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=sandybridge -print-schedule | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=SANDY
7 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=haswell -print-schedule | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=HASWELL
8 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=haswell -mattr=-fma | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=HASWELL-NO-FMA
9 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -print-schedule | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX512 --check-prefix=KNL
10 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=skx -print-schedule | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX512 --check-prefix=SKX
12 ; If the target's divss/divps instructions are substantially
13 ; slower than rcpss/rcpps with a Newton-Raphson refinement,
14 ; we should generate the estimate sequence.
16 ; See PR21385 ( http://llvm.org/bugs/show_bug.cgi?id=21385 )
17 ; for details about the accuracy, speed, and implementation
18 ; differences of x86 reciprocal estimates.
; Scalar reciprocal (fdiv fast float 1.0, %x) compiled with attribute group #0.
; Every run configuration below is expected to load the constant operand and
; issue a real divide (divss / vdivss); no rcpss estimate appears in any of
; the check blocks for this function.
; NOTE(review) - attribute group #0 is defined outside this view; presumably it
; disables reciprocal estimates - confirm against the attribute definitions.
20 define float @f32_no_estimate(float %x) #0 {
21 ; SSE-LABEL: f32_no_estimate:
23 ; SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
24 ; SSE-NEXT: divss %xmm0, %xmm1
25 ; SSE-NEXT: movaps %xmm1, %xmm0
28 ; AVX-RECIP-LABEL: f32_no_estimate:
30 ; AVX-RECIP-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
31 ; AVX-RECIP-NEXT: vdivss %xmm0, %xmm1, %xmm0
32 ; AVX-RECIP-NEXT: retq
34 ; FMA-RECIP-LABEL: f32_no_estimate:
36 ; FMA-RECIP-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
37 ; FMA-RECIP-NEXT: vdivss %xmm0, %xmm1, %xmm0
38 ; FMA-RECIP-NEXT: retq
40 ; BTVER2-LABEL: f32_no_estimate:
42 ; BTVER2-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [5:1.00]
43 ; BTVER2-NEXT: vdivss %xmm0, %xmm1, %xmm0 # sched: [19:19.00]
44 ; BTVER2-NEXT: retq # sched: [4:1.00]
46 ; SANDY-LABEL: f32_no_estimate:
48 ; SANDY-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [6:0.50]
49 ; SANDY-NEXT: vdivss %xmm0, %xmm1, %xmm0 # sched: [14:1.00]
50 ; SANDY-NEXT: retq # sched: [1:1.00]
52 ; HASWELL-LABEL: f32_no_estimate:
54 ; HASWELL-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [1:0.50]
55 ; HASWELL-NEXT: vdivss %xmm0, %xmm1, %xmm0 # sched: [13:1.00]
56 ; HASWELL-NEXT: retq # sched: [2:1.00]
58 ; HASWELL-NO-FMA-LABEL: f32_no_estimate:
59 ; HASWELL-NO-FMA: # BB#0:
60 ; HASWELL-NO-FMA-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
61 ; HASWELL-NO-FMA-NEXT: vdivss %xmm0, %xmm1, %xmm0
62 ; HASWELL-NO-FMA-NEXT: retq
64 ; KNL-LABEL: f32_no_estimate:
66 ; KNL-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [1:0.50]
67 ; KNL-NEXT: vdivss %xmm0, %xmm1, %xmm0 # sched: [13:1.00]
68 ; KNL-NEXT: retq # sched: [2:1.00]
70 ; SKX-LABEL: f32_no_estimate:
72 ; SKX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [5:0.50]
73 ; SKX-NEXT: vdivss %xmm0, %xmm1, %xmm0 # sched: [11:1.00]
74 ; SKX-NEXT: retq # sched: [7:1.00]
75 %div = fdiv fast float 1.0, %x
; Scalar reciprocal (fdiv fast float 1.0, %x) compiled with attribute group #1.
; The checks expect an rcpss / vrcpss estimate followed by a single refinement
; step. Configurations without FMA spell the step as mul, subtract-from-
; constant, mul, add; the FMA-capable ones (FMA-RECIP, HASWELL, KNL, SKX) use
; a vfnmadd213ss + vfmadd132ss pair with the constant folded in as a
; RIP-relative memory operand.
; NOTE(review) - attribute group #1 is defined outside this view; presumably it
; requests one refinement step - confirm against the attribute definitions.
79 define float @f32_one_step(float %x) #1 {
80 ; SSE-LABEL: f32_one_step:
82 ; SSE-NEXT: rcpss %xmm0, %xmm2
83 ; SSE-NEXT: mulss %xmm2, %xmm0
84 ; SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
85 ; SSE-NEXT: subss %xmm0, %xmm1
86 ; SSE-NEXT: mulss %xmm2, %xmm1
87 ; SSE-NEXT: addss %xmm2, %xmm1
88 ; SSE-NEXT: movaps %xmm1, %xmm0
91 ; AVX-RECIP-LABEL: f32_one_step:
93 ; AVX-RECIP-NEXT: vrcpss %xmm0, %xmm0, %xmm1
94 ; AVX-RECIP-NEXT: vmulss %xmm1, %xmm0, %xmm0
95 ; AVX-RECIP-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
96 ; AVX-RECIP-NEXT: vsubss %xmm0, %xmm2, %xmm0
97 ; AVX-RECIP-NEXT: vmulss %xmm0, %xmm1, %xmm0
98 ; AVX-RECIP-NEXT: vaddss %xmm0, %xmm1, %xmm0
99 ; AVX-RECIP-NEXT: retq
101 ; FMA-RECIP-LABEL: f32_one_step:
103 ; FMA-RECIP-NEXT: vrcpss %xmm0, %xmm0, %xmm1
104 ; FMA-RECIP-NEXT: vfnmadd213ss {{.*}}(%rip), %xmm1, %xmm0
105 ; FMA-RECIP-NEXT: vfmadd132ss %xmm1, %xmm1, %xmm0
106 ; FMA-RECIP-NEXT: retq
108 ; BTVER2-LABEL: f32_one_step:
110 ; BTVER2-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [5:1.00]
111 ; BTVER2-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [2:1.00]
112 ; BTVER2-NEXT: vmulss %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
113 ; BTVER2-NEXT: vsubss %xmm0, %xmm2, %xmm0 # sched: [3:1.00]
114 ; BTVER2-NEXT: vmulss %xmm0, %xmm1, %xmm0 # sched: [2:1.00]
115 ; BTVER2-NEXT: vaddss %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
116 ; BTVER2-NEXT: retq # sched: [4:1.00]
118 ; SANDY-LABEL: f32_one_step:
120 ; SANDY-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
121 ; SANDY-NEXT: vmulss %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
122 ; SANDY-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [6:0.50]
123 ; SANDY-NEXT: vsubss %xmm0, %xmm2, %xmm0 # sched: [3:1.00]
124 ; SANDY-NEXT: vmulss %xmm0, %xmm1, %xmm0 # sched: [5:1.00]
125 ; SANDY-NEXT: vaddss %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
126 ; SANDY-NEXT: retq # sched: [1:1.00]
128 ; HASWELL-LABEL: f32_one_step:
130 ; HASWELL-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
131 ; HASWELL-NEXT: vfnmadd213ss {{.*}}(%rip), %xmm1, %xmm0 # sched: [5:0.50]
132 ; HASWELL-NEXT: vfmadd132ss %xmm1, %xmm1, %xmm0 # sched: [5:0.50]
133 ; HASWELL-NEXT: retq # sched: [2:1.00]
135 ; HASWELL-NO-FMA-LABEL: f32_one_step:
136 ; HASWELL-NO-FMA: # BB#0:
137 ; HASWELL-NO-FMA-NEXT: vrcpss %xmm0, %xmm0, %xmm1
138 ; HASWELL-NO-FMA-NEXT: vmulss %xmm1, %xmm0, %xmm0
139 ; HASWELL-NO-FMA-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
140 ; HASWELL-NO-FMA-NEXT: vsubss %xmm0, %xmm2, %xmm0
141 ; HASWELL-NO-FMA-NEXT: vmulss %xmm0, %xmm1, %xmm0
142 ; HASWELL-NO-FMA-NEXT: vaddss %xmm0, %xmm1, %xmm0
143 ; HASWELL-NO-FMA-NEXT: retq
145 ; KNL-LABEL: f32_one_step:
147 ; KNL-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
148 ; KNL-NEXT: vfnmadd213ss {{.*}}(%rip), %xmm1, %xmm0 # sched: [5:0.50]
149 ; KNL-NEXT: vfmadd132ss %xmm1, %xmm1, %xmm0 # sched: [5:0.50]
150 ; KNL-NEXT: retq # sched: [2:1.00]
152 ; SKX-LABEL: f32_one_step:
154 ; SKX-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [4:1.00]
155 ; SKX-NEXT: vfnmadd213ss {{.*}}(%rip), %xmm1, %xmm0 # sched: [9:0.50]
156 ; SKX-NEXT: vfmadd132ss %xmm1, %xmm1, %xmm0 # sched: [4:0.33]
157 ; SKX-NEXT: retq # sched: [7:1.00]
158 %div = fdiv fast float 1.0, %x
; Scalar reciprocal (fdiv fast float 1.0, %x) compiled with attribute group #2.
; The checks expect an rcpss / vrcpss estimate followed by two back-to-back
; refinement steps. The constant is loaded into a register once and reused by
; both steps. Non-FMA configurations use mul/sub/mul/add per step; the
; FMA-capable ones (FMA-RECIP, HASWELL, KNL, SKX) use a vfnmadd213ss +
; vfmadd132ss pair per step, with a vmovaps copy of the initial estimate.
; NOTE(review) - attribute group #2 is defined outside this view; presumably it
; requests two refinement steps - confirm against the attribute definitions.
162 define float @f32_two_step(float %x) #2 {
163 ; SSE-LABEL: f32_two_step:
165 ; SSE-NEXT: rcpss %xmm0, %xmm2
166 ; SSE-NEXT: movaps %xmm0, %xmm3
167 ; SSE-NEXT: mulss %xmm2, %xmm3
168 ; SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
169 ; SSE-NEXT: movaps %xmm1, %xmm4
170 ; SSE-NEXT: subss %xmm3, %xmm4
171 ; SSE-NEXT: mulss %xmm2, %xmm4
172 ; SSE-NEXT: addss %xmm2, %xmm4
173 ; SSE-NEXT: mulss %xmm4, %xmm0
174 ; SSE-NEXT: subss %xmm0, %xmm1
175 ; SSE-NEXT: mulss %xmm4, %xmm1
176 ; SSE-NEXT: addss %xmm4, %xmm1
177 ; SSE-NEXT: movaps %xmm1, %xmm0
180 ; AVX-RECIP-LABEL: f32_two_step:
182 ; AVX-RECIP-NEXT: vrcpss %xmm0, %xmm0, %xmm1
183 ; AVX-RECIP-NEXT: vmulss %xmm1, %xmm0, %xmm2
184 ; AVX-RECIP-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
185 ; AVX-RECIP-NEXT: vsubss %xmm2, %xmm3, %xmm2
186 ; AVX-RECIP-NEXT: vmulss %xmm2, %xmm1, %xmm2
187 ; AVX-RECIP-NEXT: vaddss %xmm2, %xmm1, %xmm1
188 ; AVX-RECIP-NEXT: vmulss %xmm1, %xmm0, %xmm0
189 ; AVX-RECIP-NEXT: vsubss %xmm0, %xmm3, %xmm0
190 ; AVX-RECIP-NEXT: vmulss %xmm0, %xmm1, %xmm0
191 ; AVX-RECIP-NEXT: vaddss %xmm0, %xmm1, %xmm0
192 ; AVX-RECIP-NEXT: retq
194 ; FMA-RECIP-LABEL: f32_two_step:
196 ; FMA-RECIP-NEXT: vrcpss %xmm0, %xmm0, %xmm1
197 ; FMA-RECIP-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
198 ; FMA-RECIP-NEXT: vmovaps %xmm1, %xmm3
199 ; FMA-RECIP-NEXT: vfnmadd213ss %xmm2, %xmm0, %xmm3
200 ; FMA-RECIP-NEXT: vfmadd132ss %xmm1, %xmm1, %xmm3
201 ; FMA-RECIP-NEXT: vfnmadd213ss %xmm2, %xmm3, %xmm0
202 ; FMA-RECIP-NEXT: vfmadd132ss %xmm3, %xmm3, %xmm0
203 ; FMA-RECIP-NEXT: retq
205 ; BTVER2-LABEL: f32_two_step:
207 ; BTVER2-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero sched: [5:1.00]
208 ; BTVER2-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [2:1.00]
209 ; BTVER2-NEXT: vmulss %xmm1, %xmm0, %xmm2 # sched: [2:1.00]
210 ; BTVER2-NEXT: vsubss %xmm2, %xmm3, %xmm2 # sched: [3:1.00]
211 ; BTVER2-NEXT: vmulss %xmm2, %xmm1, %xmm2 # sched: [2:1.00]
212 ; BTVER2-NEXT: vaddss %xmm2, %xmm1, %xmm1 # sched: [3:1.00]
213 ; BTVER2-NEXT: vmulss %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
214 ; BTVER2-NEXT: vsubss %xmm0, %xmm3, %xmm0 # sched: [3:1.00]
215 ; BTVER2-NEXT: vmulss %xmm0, %xmm1, %xmm0 # sched: [2:1.00]
216 ; BTVER2-NEXT: vaddss %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
217 ; BTVER2-NEXT: retq # sched: [4:1.00]
219 ; SANDY-LABEL: f32_two_step:
221 ; SANDY-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
222 ; SANDY-NEXT: vmulss %xmm1, %xmm0, %xmm2 # sched: [5:1.00]
223 ; SANDY-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero sched: [6:0.50]
224 ; SANDY-NEXT: vsubss %xmm2, %xmm3, %xmm2 # sched: [3:1.00]
225 ; SANDY-NEXT: vmulss %xmm2, %xmm1, %xmm2 # sched: [5:1.00]
226 ; SANDY-NEXT: vaddss %xmm2, %xmm1, %xmm1 # sched: [3:1.00]
227 ; SANDY-NEXT: vmulss %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
228 ; SANDY-NEXT: vsubss %xmm0, %xmm3, %xmm0 # sched: [3:1.00]
229 ; SANDY-NEXT: vmulss %xmm0, %xmm1, %xmm0 # sched: [5:1.00]
230 ; SANDY-NEXT: vaddss %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
231 ; SANDY-NEXT: retq # sched: [1:1.00]
233 ; HASWELL-LABEL: f32_two_step:
235 ; HASWELL-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
236 ; HASWELL-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [1:0.50]
237 ; HASWELL-NEXT: vmovaps %xmm1, %xmm3 # sched: [1:1.00]
238 ; HASWELL-NEXT: vfnmadd213ss %xmm2, %xmm0, %xmm3 # sched: [5:0.50]
239 ; HASWELL-NEXT: vfmadd132ss %xmm1, %xmm1, %xmm3 # sched: [5:0.50]
240 ; HASWELL-NEXT: vfnmadd213ss %xmm2, %xmm3, %xmm0 # sched: [5:0.50]
241 ; HASWELL-NEXT: vfmadd132ss %xmm3, %xmm3, %xmm0 # sched: [5:0.50]
242 ; HASWELL-NEXT: retq # sched: [2:1.00]
244 ; HASWELL-NO-FMA-LABEL: f32_two_step:
245 ; HASWELL-NO-FMA: # BB#0:
246 ; HASWELL-NO-FMA-NEXT: vrcpss %xmm0, %xmm0, %xmm1
247 ; HASWELL-NO-FMA-NEXT: vmulss %xmm1, %xmm0, %xmm2
248 ; HASWELL-NO-FMA-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
249 ; HASWELL-NO-FMA-NEXT: vsubss %xmm2, %xmm3, %xmm2
250 ; HASWELL-NO-FMA-NEXT: vmulss %xmm2, %xmm1, %xmm2
251 ; HASWELL-NO-FMA-NEXT: vaddss %xmm2, %xmm1, %xmm1
252 ; HASWELL-NO-FMA-NEXT: vmulss %xmm1, %xmm0, %xmm0
253 ; HASWELL-NO-FMA-NEXT: vsubss %xmm0, %xmm3, %xmm0
254 ; HASWELL-NO-FMA-NEXT: vmulss %xmm0, %xmm1, %xmm0
255 ; HASWELL-NO-FMA-NEXT: vaddss %xmm0, %xmm1, %xmm0
256 ; HASWELL-NO-FMA-NEXT: retq
258 ; KNL-LABEL: f32_two_step:
260 ; KNL-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
261 ; KNL-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [1:0.50]
262 ; KNL-NEXT: vmovaps %xmm1, %xmm3 # sched: [1:1.00]
263 ; KNL-NEXT: vfnmadd213ss %xmm2, %xmm0, %xmm3 # sched: [5:0.50]
264 ; KNL-NEXT: vfmadd132ss %xmm1, %xmm1, %xmm3 # sched: [5:0.50]
265 ; KNL-NEXT: vfnmadd213ss %xmm2, %xmm3, %xmm0 # sched: [5:0.50]
266 ; KNL-NEXT: vfmadd132ss %xmm3, %xmm3, %xmm0 # sched: [5:0.50]
267 ; KNL-NEXT: retq # sched: [2:1.00]
269 ; SKX-LABEL: f32_two_step:
271 ; SKX-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [4:1.00]
272 ; SKX-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [5:0.50]
273 ; SKX-NEXT: vmovaps %xmm1, %xmm3 # sched: [1:1.00]
274 ; SKX-NEXT: vfnmadd213ss %xmm2, %xmm0, %xmm3 # sched: [4:0.33]
275 ; SKX-NEXT: vfmadd132ss %xmm1, %xmm1, %xmm3 # sched: [4:0.33]
276 ; SKX-NEXT: vfnmadd213ss %xmm2, %xmm3, %xmm0 # sched: [4:0.33]
277 ; SKX-NEXT: vfmadd132ss %xmm3, %xmm3, %xmm0 # sched: [4:0.33]
278 ; SKX-NEXT: retq # sched: [7:1.00]
279 %div = fdiv fast float 1.0, %x
; Packed <4 x float> reciprocal (fdiv fast of a splat of 1.0 by %x) compiled
; with attribute group #0. Every run configuration is expected to issue a real
; divps / vdivps; no rcpps estimate appears in any check block. The splat
; constant is materialized with movaps / vmovaps on the SSE, AVX-RECIP,
; FMA-RECIP, BTVER2 and SANDY runs, and with vbroadcastss on the HASWELL,
; HASWELL-NO-FMA, KNL and SKX runs.
; NOTE(review) - attribute group #0 is defined outside this view; presumably it
; disables reciprocal estimates - confirm against the attribute definitions.
283 define <4 x float> @v4f32_no_estimate(<4 x float> %x) #0 {
284 ; SSE-LABEL: v4f32_no_estimate:
286 ; SSE-NEXT: movaps {{.*#+}} xmm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
287 ; SSE-NEXT: divps %xmm0, %xmm1
288 ; SSE-NEXT: movaps %xmm1, %xmm0
291 ; AVX-RECIP-LABEL: v4f32_no_estimate:
293 ; AVX-RECIP-NEXT: vmovaps {{.*#+}} xmm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
294 ; AVX-RECIP-NEXT: vdivps %xmm0, %xmm1, %xmm0
295 ; AVX-RECIP-NEXT: retq
297 ; FMA-RECIP-LABEL: v4f32_no_estimate:
299 ; FMA-RECIP-NEXT: vmovaps {{.*#+}} xmm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
300 ; FMA-RECIP-NEXT: vdivps %xmm0, %xmm1, %xmm0
301 ; FMA-RECIP-NEXT: retq
303 ; BTVER2-LABEL: v4f32_no_estimate:
305 ; BTVER2-NEXT: vmovaps {{.*#+}} xmm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [5:1.00]
306 ; BTVER2-NEXT: vdivps %xmm0, %xmm1, %xmm0 # sched: [19:19.00]
307 ; BTVER2-NEXT: retq # sched: [4:1.00]
309 ; SANDY-LABEL: v4f32_no_estimate:
311 ; SANDY-NEXT: vmovaps {{.*#+}} xmm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [6:0.50]
312 ; SANDY-NEXT: vdivps %xmm0, %xmm1, %xmm0 # sched: [14:1.00]
313 ; SANDY-NEXT: retq # sched: [1:1.00]
315 ; HASWELL-LABEL: v4f32_no_estimate:
317 ; HASWELL-NEXT: vbroadcastss {{.*#+}} xmm1 = [1,1,1,1] sched: [1:0.50]
318 ; HASWELL-NEXT: vdivps %xmm0, %xmm1, %xmm0 # sched: [13:1.00]
319 ; HASWELL-NEXT: retq # sched: [2:1.00]
321 ; HASWELL-NO-FMA-LABEL: v4f32_no_estimate:
322 ; HASWELL-NO-FMA: # BB#0:
323 ; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*#+}} xmm1 = [1,1,1,1]
324 ; HASWELL-NO-FMA-NEXT: vdivps %xmm0, %xmm1, %xmm0
325 ; HASWELL-NO-FMA-NEXT: retq
327 ; KNL-LABEL: v4f32_no_estimate:
329 ; KNL-NEXT: vbroadcastss {{.*#+}} xmm1 = [1,1,1,1] sched: [1:0.50]
330 ; KNL-NEXT: vdivps %xmm0, %xmm1, %xmm0 # sched: [13:1.00]
331 ; KNL-NEXT: retq # sched: [2:1.00]
333 ; SKX-LABEL: v4f32_no_estimate:
335 ; SKX-NEXT: vbroadcastss {{.*#+}} xmm1 = [1,1,1,1] sched: [6:0.50]
336 ; SKX-NEXT: vdivps %xmm0, %xmm1, %xmm0 # sched: [11:1.00]
337 ; SKX-NEXT: retq # sched: [7:1.00]
338 %div = fdiv fast <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>, %x
; Packed <4 x float> reciprocal (fdiv fast of a splat of 1.0 by %x) compiled
; with attribute group #1. The checks expect an rcpps / vrcpps estimate
; followed by a single refinement step. Non-FMA configurations use
; mul/sub/mul/add; FMA-capable ones use vfnmadd213ps + vfmadd132ps. The
; constant operand differs between FMA runs - FMA-RECIP folds it as a
; RIP-relative memory operand, SKX folds it as a {1to4} broadcast memory
; operand, and HASWELL / KNL load it with vbroadcastss first.
; NOTE(review) - attribute group #1 is defined outside this view; presumably it
; requests one refinement step - confirm against the attribute definitions.
342 define <4 x float> @v4f32_one_step(<4 x float> %x) #1 {
343 ; SSE-LABEL: v4f32_one_step:
345 ; SSE-NEXT: rcpps %xmm0, %xmm2
346 ; SSE-NEXT: mulps %xmm2, %xmm0
347 ; SSE-NEXT: movaps {{.*#+}} xmm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
348 ; SSE-NEXT: subps %xmm0, %xmm1
349 ; SSE-NEXT: mulps %xmm2, %xmm1
350 ; SSE-NEXT: addps %xmm2, %xmm1
351 ; SSE-NEXT: movaps %xmm1, %xmm0
354 ; AVX-RECIP-LABEL: v4f32_one_step:
356 ; AVX-RECIP-NEXT: vrcpps %xmm0, %xmm1
357 ; AVX-RECIP-NEXT: vmulps %xmm1, %xmm0, %xmm0
358 ; AVX-RECIP-NEXT: vmovaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
359 ; AVX-RECIP-NEXT: vsubps %xmm0, %xmm2, %xmm0
360 ; AVX-RECIP-NEXT: vmulps %xmm0, %xmm1, %xmm0
361 ; AVX-RECIP-NEXT: vaddps %xmm0, %xmm1, %xmm0
362 ; AVX-RECIP-NEXT: retq
364 ; FMA-RECIP-LABEL: v4f32_one_step:
366 ; FMA-RECIP-NEXT: vrcpps %xmm0, %xmm1
367 ; FMA-RECIP-NEXT: vfnmadd213ps {{.*}}(%rip), %xmm1, %xmm0
368 ; FMA-RECIP-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm0
369 ; FMA-RECIP-NEXT: retq
371 ; BTVER2-LABEL: v4f32_one_step:
373 ; BTVER2-NEXT: vmovaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [5:1.00]
374 ; BTVER2-NEXT: vrcpps %xmm0, %xmm1 # sched: [2:1.00]
375 ; BTVER2-NEXT: vmulps %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
376 ; BTVER2-NEXT: vsubps %xmm0, %xmm2, %xmm0 # sched: [3:1.00]
377 ; BTVER2-NEXT: vmulps %xmm0, %xmm1, %xmm0 # sched: [2:1.00]
378 ; BTVER2-NEXT: vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
379 ; BTVER2-NEXT: retq # sched: [4:1.00]
381 ; SANDY-LABEL: v4f32_one_step:
383 ; SANDY-NEXT: vrcpps %xmm0, %xmm1 # sched: [5:1.00]
384 ; SANDY-NEXT: vmulps %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
385 ; SANDY-NEXT: vmovaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [6:0.50]
386 ; SANDY-NEXT: vsubps %xmm0, %xmm2, %xmm0 # sched: [3:1.00]
387 ; SANDY-NEXT: vmulps %xmm0, %xmm1, %xmm0 # sched: [5:1.00]
388 ; SANDY-NEXT: vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
389 ; SANDY-NEXT: retq # sched: [1:1.00]
391 ; HASWELL-LABEL: v4f32_one_step:
393 ; HASWELL-NEXT: vrcpps %xmm0, %xmm1 # sched: [5:1.00]
394 ; HASWELL-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [1:0.50]
395 ; HASWELL-NEXT: vfnmadd213ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
396 ; HASWELL-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm0 # sched: [5:0.50]
397 ; HASWELL-NEXT: retq # sched: [2:1.00]
399 ; HASWELL-NO-FMA-LABEL: v4f32_one_step:
400 ; HASWELL-NO-FMA: # BB#0:
401 ; HASWELL-NO-FMA-NEXT: vrcpps %xmm0, %xmm1
402 ; HASWELL-NO-FMA-NEXT: vmulps %xmm1, %xmm0, %xmm0
403 ; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,1,1,1]
404 ; HASWELL-NO-FMA-NEXT: vsubps %xmm0, %xmm2, %xmm0
405 ; HASWELL-NO-FMA-NEXT: vmulps %xmm0, %xmm1, %xmm0
406 ; HASWELL-NO-FMA-NEXT: vaddps %xmm0, %xmm1, %xmm0
407 ; HASWELL-NO-FMA-NEXT: retq
409 ; KNL-LABEL: v4f32_one_step:
411 ; KNL-NEXT: vrcpps %xmm0, %xmm1 # sched: [5:1.00]
412 ; KNL-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [1:0.50]
413 ; KNL-NEXT: vfnmadd213ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
414 ; KNL-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm0 # sched: [5:0.50]
415 ; KNL-NEXT: retq # sched: [2:1.00]
417 ; SKX-LABEL: v4f32_one_step:
419 ; SKX-NEXT: vrcpps %xmm0, %xmm1 # sched: [4:1.00]
420 ; SKX-NEXT: vfnmadd213ps {{.*}}(%rip){1to4}, %xmm1, %xmm0 # sched: [10:0.50]
421 ; SKX-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm0 # sched: [4:0.33]
422 ; SKX-NEXT: retq # sched: [7:1.00]
423 %div = fdiv fast <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>, %x
; Packed <4 x float> reciprocal (fdiv fast of a splat of 1.0 by %x) compiled
; with attribute group #2. The checks expect an rcpps / vrcpps estimate
; followed by two back-to-back refinement steps, with the splat constant held
; in a register and reused by both steps. Non-FMA configurations use
; mul/sub/mul/add per step; the FMA-capable ones (FMA-RECIP, HASWELL, KNL,
; SKX) use a vfnmadd213ps + vfmadd132ps pair per step.
; NOTE(review) - attribute group #2 is defined outside this view; presumably it
; requests two refinement steps - confirm against the attribute definitions.
427 define <4 x float> @v4f32_two_step(<4 x float> %x) #2 {
428 ; SSE-LABEL: v4f32_two_step:
430 ; SSE-NEXT: rcpps %xmm0, %xmm2
431 ; SSE-NEXT: movaps %xmm0, %xmm3
432 ; SSE-NEXT: mulps %xmm2, %xmm3
433 ; SSE-NEXT: movaps {{.*#+}} xmm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
434 ; SSE-NEXT: movaps %xmm1, %xmm4
435 ; SSE-NEXT: subps %xmm3, %xmm4
436 ; SSE-NEXT: mulps %xmm2, %xmm4
437 ; SSE-NEXT: addps %xmm2, %xmm4
438 ; SSE-NEXT: mulps %xmm4, %xmm0
439 ; SSE-NEXT: subps %xmm0, %xmm1
440 ; SSE-NEXT: mulps %xmm4, %xmm1
441 ; SSE-NEXT: addps %xmm4, %xmm1
442 ; SSE-NEXT: movaps %xmm1, %xmm0
445 ; AVX-RECIP-LABEL: v4f32_two_step:
447 ; AVX-RECIP-NEXT: vrcpps %xmm0, %xmm1
448 ; AVX-RECIP-NEXT: vmulps %xmm1, %xmm0, %xmm2
449 ; AVX-RECIP-NEXT: vmovaps {{.*#+}} xmm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
450 ; AVX-RECIP-NEXT: vsubps %xmm2, %xmm3, %xmm2
451 ; AVX-RECIP-NEXT: vmulps %xmm2, %xmm1, %xmm2
452 ; AVX-RECIP-NEXT: vaddps %xmm2, %xmm1, %xmm1
453 ; AVX-RECIP-NEXT: vmulps %xmm1, %xmm0, %xmm0
454 ; AVX-RECIP-NEXT: vsubps %xmm0, %xmm3, %xmm0
455 ; AVX-RECIP-NEXT: vmulps %xmm0, %xmm1, %xmm0
456 ; AVX-RECIP-NEXT: vaddps %xmm0, %xmm1, %xmm0
457 ; AVX-RECIP-NEXT: retq
459 ; FMA-RECIP-LABEL: v4f32_two_step:
461 ; FMA-RECIP-NEXT: vrcpps %xmm0, %xmm1
462 ; FMA-RECIP-NEXT: vmovaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
463 ; FMA-RECIP-NEXT: vmovaps %xmm1, %xmm3
464 ; FMA-RECIP-NEXT: vfnmadd213ps %xmm2, %xmm0, %xmm3
465 ; FMA-RECIP-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm3
466 ; FMA-RECIP-NEXT: vfnmadd213ps %xmm2, %xmm3, %xmm0
467 ; FMA-RECIP-NEXT: vfmadd132ps %xmm3, %xmm3, %xmm0
468 ; FMA-RECIP-NEXT: retq
470 ; BTVER2-LABEL: v4f32_two_step:
472 ; BTVER2-NEXT: vmovaps {{.*#+}} xmm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [5:1.00]
473 ; BTVER2-NEXT: vrcpps %xmm0, %xmm1 # sched: [2:1.00]
474 ; BTVER2-NEXT: vmulps %xmm1, %xmm0, %xmm2 # sched: [2:1.00]
475 ; BTVER2-NEXT: vsubps %xmm2, %xmm3, %xmm2 # sched: [3:1.00]
476 ; BTVER2-NEXT: vmulps %xmm2, %xmm1, %xmm2 # sched: [2:1.00]
477 ; BTVER2-NEXT: vaddps %xmm2, %xmm1, %xmm1 # sched: [3:1.00]
478 ; BTVER2-NEXT: vmulps %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
479 ; BTVER2-NEXT: vsubps %xmm0, %xmm3, %xmm0 # sched: [3:1.00]
480 ; BTVER2-NEXT: vmulps %xmm0, %xmm1, %xmm0 # sched: [2:1.00]
481 ; BTVER2-NEXT: vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
482 ; BTVER2-NEXT: retq # sched: [4:1.00]
484 ; SANDY-LABEL: v4f32_two_step:
486 ; SANDY-NEXT: vrcpps %xmm0, %xmm1 # sched: [5:1.00]
487 ; SANDY-NEXT: vmulps %xmm1, %xmm0, %xmm2 # sched: [5:1.00]
488 ; SANDY-NEXT: vmovaps {{.*#+}} xmm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [6:0.50]
489 ; SANDY-NEXT: vsubps %xmm2, %xmm3, %xmm2 # sched: [3:1.00]
490 ; SANDY-NEXT: vmulps %xmm2, %xmm1, %xmm2 # sched: [5:1.00]
491 ; SANDY-NEXT: vaddps %xmm2, %xmm1, %xmm1 # sched: [3:1.00]
492 ; SANDY-NEXT: vmulps %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
493 ; SANDY-NEXT: vsubps %xmm0, %xmm3, %xmm0 # sched: [3:1.00]
494 ; SANDY-NEXT: vmulps %xmm0, %xmm1, %xmm0 # sched: [5:1.00]
495 ; SANDY-NEXT: vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
496 ; SANDY-NEXT: retq # sched: [1:1.00]
498 ; HASWELL-LABEL: v4f32_two_step:
500 ; HASWELL-NEXT: vrcpps %xmm0, %xmm1 # sched: [5:1.00]
501 ; HASWELL-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [1:0.50]
502 ; HASWELL-NEXT: vmovaps %xmm1, %xmm3 # sched: [1:1.00]
503 ; HASWELL-NEXT: vfnmadd213ps %xmm2, %xmm0, %xmm3 # sched: [5:0.50]
504 ; HASWELL-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm3 # sched: [5:0.50]
505 ; HASWELL-NEXT: vfnmadd213ps %xmm2, %xmm3, %xmm0 # sched: [5:0.50]
506 ; HASWELL-NEXT: vfmadd132ps %xmm3, %xmm3, %xmm0 # sched: [5:0.50]
507 ; HASWELL-NEXT: retq # sched: [2:1.00]
509 ; HASWELL-NO-FMA-LABEL: v4f32_two_step:
510 ; HASWELL-NO-FMA: # BB#0:
511 ; HASWELL-NO-FMA-NEXT: vrcpps %xmm0, %xmm1
512 ; HASWELL-NO-FMA-NEXT: vmulps %xmm1, %xmm0, %xmm2
513 ; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*#+}} xmm3 = [1,1,1,1]
514 ; HASWELL-NO-FMA-NEXT: vsubps %xmm2, %xmm3, %xmm2
515 ; HASWELL-NO-FMA-NEXT: vmulps %xmm2, %xmm1, %xmm2
516 ; HASWELL-NO-FMA-NEXT: vaddps %xmm2, %xmm1, %xmm1
517 ; HASWELL-NO-FMA-NEXT: vmulps %xmm1, %xmm0, %xmm0
518 ; HASWELL-NO-FMA-NEXT: vsubps %xmm0, %xmm3, %xmm0
519 ; HASWELL-NO-FMA-NEXT: vmulps %xmm0, %xmm1, %xmm0
520 ; HASWELL-NO-FMA-NEXT: vaddps %xmm0, %xmm1, %xmm0
521 ; HASWELL-NO-FMA-NEXT: retq
523 ; KNL-LABEL: v4f32_two_step:
525 ; KNL-NEXT: vrcpps %xmm0, %xmm1 # sched: [5:1.00]
526 ; KNL-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [1:0.50]
527 ; KNL-NEXT: vmovaps %xmm1, %xmm3 # sched: [1:1.00]
528 ; KNL-NEXT: vfnmadd213ps %xmm2, %xmm0, %xmm3 # sched: [5:0.50]
529 ; KNL-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm3 # sched: [5:0.50]
530 ; KNL-NEXT: vfnmadd213ps %xmm2, %xmm3, %xmm0 # sched: [5:0.50]
531 ; KNL-NEXT: vfmadd132ps %xmm3, %xmm3, %xmm0 # sched: [5:0.50]
532 ; KNL-NEXT: retq # sched: [2:1.00]
534 ; SKX-LABEL: v4f32_two_step:
536 ; SKX-NEXT: vrcpps %xmm0, %xmm1 # sched: [4:1.00]
537 ; SKX-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [6:0.50]
538 ; SKX-NEXT: vmovaps %xmm1, %xmm3 # sched: [1:1.00]
539 ; SKX-NEXT: vfnmadd213ps %xmm2, %xmm0, %xmm3 # sched: [4:0.33]
540 ; SKX-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm3 # sched: [4:0.33]
541 ; SKX-NEXT: vfnmadd213ps %xmm2, %xmm3, %xmm0 # sched: [4:0.33]
542 ; SKX-NEXT: vfmadd132ps %xmm3, %xmm3, %xmm0 # sched: [4:0.33]
543 ; SKX-NEXT: retq # sched: [7:1.00]
544 %div = fdiv fast <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>, %x
; Packed <8 x float> reciprocal (fdiv fast of a splat of 1.0 by %x) compiled
; with attribute group #0. No rcpps estimate is expected in any check block.
; The SSE run performs two divps operations, one with xmm0 and one with xmm1 as
; the divisor, sharing a single loaded constant; every AVX-family run performs
; one ymm-wide vdivps. The constant is loaded with vmovaps on AVX-RECIP,
; FMA-RECIP, BTVER2 and SANDY, and with vbroadcastss on HASWELL,
; HASWELL-NO-FMA, KNL and SKX.
; NOTE(review) - attribute group #0 is defined outside this view; presumably it
; disables reciprocal estimates - confirm against the attribute definitions.
548 define <8 x float> @v8f32_no_estimate(<8 x float> %x) #0 {
549 ; SSE-LABEL: v8f32_no_estimate:
551 ; SSE-NEXT: movaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
552 ; SSE-NEXT: movaps %xmm2, %xmm3
553 ; SSE-NEXT: divps %xmm0, %xmm3
554 ; SSE-NEXT: divps %xmm1, %xmm2
555 ; SSE-NEXT: movaps %xmm3, %xmm0
556 ; SSE-NEXT: movaps %xmm2, %xmm1
559 ; AVX-RECIP-LABEL: v8f32_no_estimate:
561 ; AVX-RECIP-NEXT: vmovaps {{.*#+}} ymm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
562 ; AVX-RECIP-NEXT: vdivps %ymm0, %ymm1, %ymm0
563 ; AVX-RECIP-NEXT: retq
565 ; FMA-RECIP-LABEL: v8f32_no_estimate:
567 ; FMA-RECIP-NEXT: vmovaps {{.*#+}} ymm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
568 ; FMA-RECIP-NEXT: vdivps %ymm0, %ymm1, %ymm0
569 ; FMA-RECIP-NEXT: retq
571 ; BTVER2-LABEL: v8f32_no_estimate:
573 ; BTVER2-NEXT: vmovaps {{.*#+}} ymm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [5:1.00]
574 ; BTVER2-NEXT: vdivps %ymm0, %ymm1, %ymm0 # sched: [38:38.00]
575 ; BTVER2-NEXT: retq # sched: [4:1.00]
577 ; SANDY-LABEL: v8f32_no_estimate:
579 ; SANDY-NEXT: vmovaps {{.*#+}} ymm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [7:0.50]
580 ; SANDY-NEXT: vdivps %ymm0, %ymm1, %ymm0 # sched: [29:2.00]
581 ; SANDY-NEXT: retq # sched: [1:1.00]
583 ; HASWELL-LABEL: v8f32_no_estimate:
585 ; HASWELL-NEXT: vbroadcastss {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1] sched: [1:0.50]
586 ; HASWELL-NEXT: vdivps %ymm0, %ymm1, %ymm0 # sched: [21:2.00]
587 ; HASWELL-NEXT: retq # sched: [2:1.00]
589 ; HASWELL-NO-FMA-LABEL: v8f32_no_estimate:
590 ; HASWELL-NO-FMA: # BB#0:
591 ; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1]
592 ; HASWELL-NO-FMA-NEXT: vdivps %ymm0, %ymm1, %ymm0
593 ; HASWELL-NO-FMA-NEXT: retq
595 ; KNL-LABEL: v8f32_no_estimate:
597 ; KNL-NEXT: vbroadcastss {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1] sched: [1:0.50]
598 ; KNL-NEXT: vdivps %ymm0, %ymm1, %ymm0 # sched: [21:2.00]
599 ; KNL-NEXT: retq # sched: [2:1.00]
601 ; SKX-LABEL: v8f32_no_estimate:
603 ; SKX-NEXT: vbroadcastss {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1] sched: [7:0.50]
604 ; SKX-NEXT: vdivps %ymm0, %ymm1, %ymm0 # sched: [11:1.00]
605 ; SKX-NEXT: retq # sched: [7:1.00]
606 %div = fdiv fast <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %x
; Packed <8 x float> reciprocal (fdiv fast of a splat of 1.0 by %x) compiled
; with attribute group #1. The SSE run is expected to do the estimate plus one
; refinement step twice - once on xmm0 and once on xmm1 - sharing a single
; loaded constant. The AVX-family runs do one ymm-wide vrcpps plus one
; refinement step - mul/sub/mul/add without FMA, or vfnmadd213ps +
; vfmadd132ps with FMA. FMA-RECIP folds the constant as a RIP-relative memory
; operand, SKX folds it as a {1to8} broadcast memory operand, and HASWELL /
; KNL load it with vbroadcastss first.
; NOTE(review) - attribute group #1 is defined outside this view; presumably it
; requests one refinement step - confirm against the attribute definitions.
610 define <8 x float> @v8f32_one_step(<8 x float> %x) #1 {
611 ; SSE-LABEL: v8f32_one_step:
613 ; SSE-NEXT: rcpps %xmm0, %xmm4
614 ; SSE-NEXT: mulps %xmm4, %xmm0
615 ; SSE-NEXT: movaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
616 ; SSE-NEXT: movaps %xmm2, %xmm3
617 ; SSE-NEXT: subps %xmm0, %xmm3
618 ; SSE-NEXT: mulps %xmm4, %xmm3
619 ; SSE-NEXT: addps %xmm4, %xmm3
620 ; SSE-NEXT: rcpps %xmm1, %xmm0
621 ; SSE-NEXT: mulps %xmm0, %xmm1
622 ; SSE-NEXT: subps %xmm1, %xmm2
623 ; SSE-NEXT: mulps %xmm0, %xmm2
624 ; SSE-NEXT: addps %xmm0, %xmm2
625 ; SSE-NEXT: movaps %xmm3, %xmm0
626 ; SSE-NEXT: movaps %xmm2, %xmm1
629 ; AVX-RECIP-LABEL: v8f32_one_step:
631 ; AVX-RECIP-NEXT: vrcpps %ymm0, %ymm1
632 ; AVX-RECIP-NEXT: vmulps %ymm1, %ymm0, %ymm0
633 ; AVX-RECIP-NEXT: vmovaps {{.*#+}} ymm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
634 ; AVX-RECIP-NEXT: vsubps %ymm0, %ymm2, %ymm0
635 ; AVX-RECIP-NEXT: vmulps %ymm0, %ymm1, %ymm0
636 ; AVX-RECIP-NEXT: vaddps %ymm0, %ymm1, %ymm0
637 ; AVX-RECIP-NEXT: retq
639 ; FMA-RECIP-LABEL: v8f32_one_step:
641 ; FMA-RECIP-NEXT: vrcpps %ymm0, %ymm1
642 ; FMA-RECIP-NEXT: vfnmadd213ps {{.*}}(%rip), %ymm1, %ymm0
643 ; FMA-RECIP-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm0
644 ; FMA-RECIP-NEXT: retq
646 ; BTVER2-LABEL: v8f32_one_step:
648 ; BTVER2-NEXT: vmovaps {{.*#+}} ymm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [5:1.00]
649 ; BTVER2-NEXT: vrcpps %ymm0, %ymm1 # sched: [2:2.00]
650 ; BTVER2-NEXT: vmulps %ymm1, %ymm0, %ymm0 # sched: [2:2.00]
651 ; BTVER2-NEXT: vsubps %ymm0, %ymm2, %ymm0 # sched: [3:2.00]
652 ; BTVER2-NEXT: vmulps %ymm0, %ymm1, %ymm0 # sched: [2:2.00]
653 ; BTVER2-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:2.00]
654 ; BTVER2-NEXT: retq # sched: [4:1.00]
656 ; SANDY-LABEL: v8f32_one_step:
658 ; SANDY-NEXT: vrcpps %ymm0, %ymm1 # sched: [7:2.00]
659 ; SANDY-NEXT: vmulps %ymm1, %ymm0, %ymm0 # sched: [5:1.00]
660 ; SANDY-NEXT: vmovaps {{.*#+}} ymm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [7:0.50]
661 ; SANDY-NEXT: vsubps %ymm0, %ymm2, %ymm0 # sched: [3:1.00]
662 ; SANDY-NEXT: vmulps %ymm0, %ymm1, %ymm0 # sched: [5:1.00]
663 ; SANDY-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
664 ; SANDY-NEXT: retq # sched: [1:1.00]
666 ; HASWELL-LABEL: v8f32_one_step:
668 ; HASWELL-NEXT: vrcpps %ymm0, %ymm1 # sched: [11:2.00]
669 ; HASWELL-NEXT: vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [1:0.50]
670 ; HASWELL-NEXT: vfnmadd213ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
671 ; HASWELL-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm0 # sched: [5:0.50]
672 ; HASWELL-NEXT: retq # sched: [2:1.00]
674 ; HASWELL-NO-FMA-LABEL: v8f32_one_step:
675 ; HASWELL-NO-FMA: # BB#0:
676 ; HASWELL-NO-FMA-NEXT: vrcpps %ymm0, %ymm1
677 ; HASWELL-NO-FMA-NEXT: vmulps %ymm1, %ymm0, %ymm0
678 ; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1]
679 ; HASWELL-NO-FMA-NEXT: vsubps %ymm0, %ymm2, %ymm0
680 ; HASWELL-NO-FMA-NEXT: vmulps %ymm0, %ymm1, %ymm0
681 ; HASWELL-NO-FMA-NEXT: vaddps %ymm0, %ymm1, %ymm0
682 ; HASWELL-NO-FMA-NEXT: retq
684 ; KNL-LABEL: v8f32_one_step:
686 ; KNL-NEXT: vrcpps %ymm0, %ymm1 # sched: [11:2.00]
687 ; KNL-NEXT: vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [1:0.50]
688 ; KNL-NEXT: vfnmadd213ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
689 ; KNL-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm0 # sched: [5:0.50]
690 ; KNL-NEXT: retq # sched: [2:1.00]
692 ; SKX-LABEL: v8f32_one_step:
694 ; SKX-NEXT: vrcpps %ymm0, %ymm1 # sched: [4:1.00]
695 ; SKX-NEXT: vfnmadd213ps {{.*}}(%rip){1to8}, %ymm1, %ymm0 # sched: [11:0.50]
696 ; SKX-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm0 # sched: [4:0.33]
697 ; SKX-NEXT: retq # sched: [7:1.00]
698 %div = fdiv fast <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %x
; Reciprocal of an 8 x float vector (1.0 / %x in every lane), compiled under
; attribute group #2 ("reciprocal-estimates"="divf:2,vec-divf:2").
; Every check block below expects an rcpps/vrcpps estimate followed by two
; rounds of refinement: a mul/sub/mul/add sequence on the runs without FMA, or
; a vfnmadd213ps + vfmadd132ps pair on the runs with FMA.
; The SSE2 run works on two 4-lane xmm halves (xmm0 and xmm1), so the whole
; estimate-and-refine sequence shows up twice in that block.
; The check lines are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
702 define <8 x float> @v8f32_two_step(<8 x float> %x) #2 {
703 ; SSE-LABEL: v8f32_two_step:
705 ; SSE-NEXT: movaps %xmm1, %xmm2
706 ; SSE-NEXT: rcpps %xmm0, %xmm3
707 ; SSE-NEXT: movaps %xmm0, %xmm4
708 ; SSE-NEXT: mulps %xmm3, %xmm4
709 ; SSE-NEXT: movaps {{.*#+}} xmm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
710 ; SSE-NEXT: movaps %xmm1, %xmm5
711 ; SSE-NEXT: subps %xmm4, %xmm5
712 ; SSE-NEXT: mulps %xmm3, %xmm5
713 ; SSE-NEXT: addps %xmm3, %xmm5
714 ; SSE-NEXT: mulps %xmm5, %xmm0
715 ; SSE-NEXT: movaps %xmm1, %xmm3
716 ; SSE-NEXT: subps %xmm0, %xmm3
717 ; SSE-NEXT: mulps %xmm5, %xmm3
718 ; SSE-NEXT: addps %xmm5, %xmm3
719 ; SSE-NEXT: rcpps %xmm2, %xmm0
720 ; SSE-NEXT: movaps %xmm2, %xmm4
721 ; SSE-NEXT: mulps %xmm0, %xmm4
722 ; SSE-NEXT: movaps %xmm1, %xmm5
723 ; SSE-NEXT: subps %xmm4, %xmm5
724 ; SSE-NEXT: mulps %xmm0, %xmm5
725 ; SSE-NEXT: addps %xmm0, %xmm5
726 ; SSE-NEXT: mulps %xmm5, %xmm2
727 ; SSE-NEXT: subps %xmm2, %xmm1
728 ; SSE-NEXT: mulps %xmm5, %xmm1
729 ; SSE-NEXT: addps %xmm5, %xmm1
730 ; SSE-NEXT: movaps %xmm3, %xmm0
733 ; AVX-RECIP-LABEL: v8f32_two_step:
735 ; AVX-RECIP-NEXT: vrcpps %ymm0, %ymm1
736 ; AVX-RECIP-NEXT: vmulps %ymm1, %ymm0, %ymm2
737 ; AVX-RECIP-NEXT: vmovaps {{.*#+}} ymm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
738 ; AVX-RECIP-NEXT: vsubps %ymm2, %ymm3, %ymm2
739 ; AVX-RECIP-NEXT: vmulps %ymm2, %ymm1, %ymm2
740 ; AVX-RECIP-NEXT: vaddps %ymm2, %ymm1, %ymm1
741 ; AVX-RECIP-NEXT: vmulps %ymm1, %ymm0, %ymm0
742 ; AVX-RECIP-NEXT: vsubps %ymm0, %ymm3, %ymm0
743 ; AVX-RECIP-NEXT: vmulps %ymm0, %ymm1, %ymm0
744 ; AVX-RECIP-NEXT: vaddps %ymm0, %ymm1, %ymm0
745 ; AVX-RECIP-NEXT: retq
747 ; FMA-RECIP-LABEL: v8f32_two_step:
749 ; FMA-RECIP-NEXT: vrcpps %ymm0, %ymm1
750 ; FMA-RECIP-NEXT: vmovaps {{.*#+}} ymm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
751 ; FMA-RECIP-NEXT: vmovaps %ymm1, %ymm3
752 ; FMA-RECIP-NEXT: vfnmadd213ps %ymm2, %ymm0, %ymm3
753 ; FMA-RECIP-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm3
754 ; FMA-RECIP-NEXT: vfnmadd213ps %ymm2, %ymm3, %ymm0
755 ; FMA-RECIP-NEXT: vfmadd132ps %ymm3, %ymm3, %ymm0
756 ; FMA-RECIP-NEXT: retq
758 ; BTVER2-LABEL: v8f32_two_step:
760 ; BTVER2-NEXT: vmovaps {{.*#+}} ymm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [5:1.00]
761 ; BTVER2-NEXT: vrcpps %ymm0, %ymm1 # sched: [2:2.00]
762 ; BTVER2-NEXT: vmulps %ymm1, %ymm0, %ymm2 # sched: [2:2.00]
763 ; BTVER2-NEXT: vsubps %ymm2, %ymm3, %ymm2 # sched: [3:2.00]
764 ; BTVER2-NEXT: vmulps %ymm2, %ymm1, %ymm2 # sched: [2:2.00]
765 ; BTVER2-NEXT: vaddps %ymm2, %ymm1, %ymm1 # sched: [3:2.00]
766 ; BTVER2-NEXT: vmulps %ymm1, %ymm0, %ymm0 # sched: [2:2.00]
767 ; BTVER2-NEXT: vsubps %ymm0, %ymm3, %ymm0 # sched: [3:2.00]
768 ; BTVER2-NEXT: vmulps %ymm0, %ymm1, %ymm0 # sched: [2:2.00]
769 ; BTVER2-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:2.00]
770 ; BTVER2-NEXT: retq # sched: [4:1.00]
772 ; SANDY-LABEL: v8f32_two_step:
774 ; SANDY-NEXT: vrcpps %ymm0, %ymm1 # sched: [7:2.00]
775 ; SANDY-NEXT: vmulps %ymm1, %ymm0, %ymm2 # sched: [5:1.00]
776 ; SANDY-NEXT: vmovaps {{.*#+}} ymm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [7:0.50]
777 ; SANDY-NEXT: vsubps %ymm2, %ymm3, %ymm2 # sched: [3:1.00]
778 ; SANDY-NEXT: vmulps %ymm2, %ymm1, %ymm2 # sched: [5:1.00]
779 ; SANDY-NEXT: vaddps %ymm2, %ymm1, %ymm1 # sched: [3:1.00]
780 ; SANDY-NEXT: vmulps %ymm1, %ymm0, %ymm0 # sched: [5:1.00]
781 ; SANDY-NEXT: vsubps %ymm0, %ymm3, %ymm0 # sched: [3:1.00]
782 ; SANDY-NEXT: vmulps %ymm0, %ymm1, %ymm0 # sched: [5:1.00]
783 ; SANDY-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
784 ; SANDY-NEXT: retq # sched: [1:1.00]
786 ; HASWELL-LABEL: v8f32_two_step:
788 ; HASWELL-NEXT: vrcpps %ymm0, %ymm1 # sched: [11:2.00]
789 ; HASWELL-NEXT: vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [1:0.50]
790 ; HASWELL-NEXT: vmovaps %ymm1, %ymm3 # sched: [1:1.00]
791 ; HASWELL-NEXT: vfnmadd213ps %ymm2, %ymm0, %ymm3 # sched: [5:0.50]
792 ; HASWELL-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm3 # sched: [5:0.50]
793 ; HASWELL-NEXT: vfnmadd213ps %ymm2, %ymm3, %ymm0 # sched: [5:0.50]
794 ; HASWELL-NEXT: vfmadd132ps %ymm3, %ymm3, %ymm0 # sched: [5:0.50]
795 ; HASWELL-NEXT: retq # sched: [2:1.00]
797 ; HASWELL-NO-FMA-LABEL: v8f32_two_step:
798 ; HASWELL-NO-FMA: # BB#0:
799 ; HASWELL-NO-FMA-NEXT: vrcpps %ymm0, %ymm1
800 ; HASWELL-NO-FMA-NEXT: vmulps %ymm1, %ymm0, %ymm2
801 ; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1]
802 ; HASWELL-NO-FMA-NEXT: vsubps %ymm2, %ymm3, %ymm2
803 ; HASWELL-NO-FMA-NEXT: vmulps %ymm2, %ymm1, %ymm2
804 ; HASWELL-NO-FMA-NEXT: vaddps %ymm2, %ymm1, %ymm1
805 ; HASWELL-NO-FMA-NEXT: vmulps %ymm1, %ymm0, %ymm0
806 ; HASWELL-NO-FMA-NEXT: vsubps %ymm0, %ymm3, %ymm0
807 ; HASWELL-NO-FMA-NEXT: vmulps %ymm0, %ymm1, %ymm0
808 ; HASWELL-NO-FMA-NEXT: vaddps %ymm0, %ymm1, %ymm0
809 ; HASWELL-NO-FMA-NEXT: retq
811 ; KNL-LABEL: v8f32_two_step:
813 ; KNL-NEXT: vrcpps %ymm0, %ymm1 # sched: [11:2.00]
814 ; KNL-NEXT: vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [1:0.50]
815 ; KNL-NEXT: vmovaps %ymm1, %ymm3 # sched: [1:1.00]
816 ; KNL-NEXT: vfnmadd213ps %ymm2, %ymm0, %ymm3 # sched: [5:0.50]
817 ; KNL-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm3 # sched: [5:0.50]
818 ; KNL-NEXT: vfnmadd213ps %ymm2, %ymm3, %ymm0 # sched: [5:0.50]
819 ; KNL-NEXT: vfmadd132ps %ymm3, %ymm3, %ymm0 # sched: [5:0.50]
820 ; KNL-NEXT: retq # sched: [2:1.00]
822 ; SKX-LABEL: v8f32_two_step:
824 ; SKX-NEXT: vrcpps %ymm0, %ymm1 # sched: [4:1.00]
825 ; SKX-NEXT: vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [7:0.50]
826 ; SKX-NEXT: vmovaps %ymm1, %ymm3 # sched: [1:1.00]
827 ; SKX-NEXT: vfnmadd213ps %ymm2, %ymm0, %ymm3 # sched: [4:0.33]
828 ; SKX-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm3 # sched: [4:0.33]
829 ; SKX-NEXT: vfnmadd213ps %ymm2, %ymm3, %ymm0 # sched: [4:0.33]
830 ; SKX-NEXT: vfmadd132ps %ymm3, %ymm3, %ymm0 # sched: [4:0.33]
831 ; SKX-NEXT: retq # sched: [7:1.00]
; The operation under test: a splat of 1.0 divided lane-wise by %x, carrying
; the `fast` fast-math flag.
832 %div = fdiv fast <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %x
; Attribute groups referenced by the test functions. Each one sets
; "unsafe-fp-math"="true"; they differ only in the "reciprocal-estimates" string.
;
; #0 - "!divf,!vec-divf": used by f32_no_estimate, whose checks expect a real
; divss instead of an estimate sequence.
836 attributes #0 = { "unsafe-fp-math"="true" "reciprocal-estimates"="!divf,!vec-divf" }
; #1 - "divf,vec-divf" with no explicit step count.
; NOTE(review): presumably this leaves the number of refinement steps to the
; target default - confirm against the tests that use #1.
837 attributes #1 = { "unsafe-fp-math"="true" "reciprocal-estimates"="divf,vec-divf" }
; #2 - "divf:2,vec-divf:2": used by v8f32_two_step, whose checks expect the
; estimate to be followed by two refinement rounds.
838 attributes #2 = { "unsafe-fp-math"="true" "reciprocal-estimates"="divf:2,vec-divf:2" }