1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 -print-schedule | FileCheck %s --check-prefix=CHECK --check-prefix=SSE --check-prefix=SSE-RECIP
3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx -print-schedule | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX-RECIP
4 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fma -print-schedule | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=FMA-RECIP
5 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -print-schedule | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=BDVER2
6 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=btver2 -print-schedule | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=BTVER2
7 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=sandybridge -print-schedule | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=SANDY
8 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=haswell -print-schedule | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=HASWELL
9 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=haswell -print-schedule -mattr=-fma | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=HASWELL-NO-FMA
10 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -print-schedule | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX512 --check-prefix=KNL
11 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=skx -print-schedule | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX512 --check-prefix=SKX
13 ; It's the extra tests coverage for recip as discussed on D26855.
15 define float @f32_no_step_2(float %x) #3 {
16 ; SSE-LABEL: f32_no_step_2:
18 ; SSE-NEXT: rcpss %xmm0, %xmm0
19 ; SSE-NEXT: mulss {{.*}}(%rip), %xmm0
22 ; AVX-RECIP-LABEL: f32_no_step_2:
24 ; AVX-RECIP-NEXT: vrcpss %xmm0, %xmm0, %xmm0
25 ; AVX-RECIP-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0
26 ; AVX-RECIP-NEXT: retq
28 ; FMA-RECIP-LABEL: f32_no_step_2:
30 ; FMA-RECIP-NEXT: vrcpss %xmm0, %xmm0, %xmm0
31 ; FMA-RECIP-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0
32 ; FMA-RECIP-NEXT: retq
34 ; BDVER2-LABEL: f32_no_step_2:
36 ; BDVER2-NEXT: vrcpss %xmm0, %xmm0, %xmm0 # sched: [5:1.00]
37 ; BDVER2-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [10:1.00]
38 ; BDVER2-NEXT: retq # sched: [5:1.00]
40 ; BTVER2-LABEL: f32_no_step_2:
42 ; BTVER2-NEXT: vrcpss %xmm0, %xmm0, %xmm0 # sched: [2:1.00]
43 ; BTVER2-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [7:1.00]
44 ; BTVER2-NEXT: retq # sched: [4:1.00]
46 ; SANDY-LABEL: f32_no_step_2:
48 ; SANDY-NEXT: vrcpss %xmm0, %xmm0, %xmm0 # sched: [5:1.00]
49 ; SANDY-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [11:1.00]
50 ; SANDY-NEXT: retq # sched: [1:1.00]
52 ; HASWELL-LABEL: f32_no_step_2:
54 ; HASWELL-NEXT: vrcpss %xmm0, %xmm0, %xmm0 # sched: [5:1.00]
55 ; HASWELL-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [10:0.50]
56 ; HASWELL-NEXT: retq # sched: [7:1.00]
58 ; HASWELL-NO-FMA-LABEL: f32_no_step_2:
59 ; HASWELL-NO-FMA: # %bb.0:
60 ; HASWELL-NO-FMA-NEXT: vrcpss %xmm0, %xmm0, %xmm0 # sched: [5:1.00]
61 ; HASWELL-NO-FMA-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [10:0.50]
62 ; HASWELL-NO-FMA-NEXT: retq # sched: [7:1.00]
64 ; KNL-LABEL: f32_no_step_2:
66 ; KNL-NEXT: vrcpss %xmm0, %xmm0, %xmm0 # sched: [5:1.00]
67 ; KNL-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [10:0.50]
68 ; KNL-NEXT: retq # sched: [7:1.00]
70 ; SKX-LABEL: f32_no_step_2:
72 ; SKX-NEXT: vrcpss %xmm0, %xmm0, %xmm0 # sched: [4:1.00]
73 ; SKX-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [9:0.50]
74 ; SKX-NEXT: retq # sched: [7:1.00]
75 %div = fdiv fast float 1234.0, %x
79 define float @f32_one_step_2(float %x) #1 {
80 ; SSE-LABEL: f32_one_step_2:
82 ; SSE-NEXT: rcpss %xmm0, %xmm2
83 ; SSE-NEXT: mulss %xmm2, %xmm0
84 ; SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
85 ; SSE-NEXT: subss %xmm0, %xmm1
86 ; SSE-NEXT: mulss %xmm2, %xmm1
87 ; SSE-NEXT: addss %xmm2, %xmm1
88 ; SSE-NEXT: mulss {{.*}}(%rip), %xmm1
89 ; SSE-NEXT: movaps %xmm1, %xmm0
92 ; AVX-RECIP-LABEL: f32_one_step_2:
94 ; AVX-RECIP-NEXT: vrcpss %xmm0, %xmm0, %xmm1
95 ; AVX-RECIP-NEXT: vmulss %xmm1, %xmm0, %xmm0
96 ; AVX-RECIP-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
97 ; AVX-RECIP-NEXT: vsubss %xmm0, %xmm2, %xmm0
98 ; AVX-RECIP-NEXT: vmulss %xmm0, %xmm1, %xmm0
99 ; AVX-RECIP-NEXT: vaddss %xmm0, %xmm1, %xmm0
100 ; AVX-RECIP-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0
101 ; AVX-RECIP-NEXT: retq
103 ; FMA-RECIP-LABEL: f32_one_step_2:
104 ; FMA-RECIP: # %bb.0:
105 ; FMA-RECIP-NEXT: vrcpss %xmm0, %xmm0, %xmm1
106 ; FMA-RECIP-NEXT: vfnmadd213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) + mem
107 ; FMA-RECIP-NEXT: vfmadd132ss {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1
108 ; FMA-RECIP-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0
109 ; FMA-RECIP-NEXT: retq
111 ; BDVER2-LABEL: f32_one_step_2:
113 ; BDVER2-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
114 ; BDVER2-NEXT: vfnmaddss {{.*}}(%rip), %xmm1, %xmm0, %xmm0 # sched: [10:0.50]
115 ; BDVER2-NEXT: vfmaddss %xmm1, %xmm0, %xmm1, %xmm0 # sched: [5:0.50]
116 ; BDVER2-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [10:1.00]
117 ; BDVER2-NEXT: retq # sched: [5:1.00]
119 ; BTVER2-LABEL: f32_one_step_2:
121 ; BTVER2-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [5:1.00]
122 ; BTVER2-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [2:1.00]
123 ; BTVER2-NEXT: vmulss %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
124 ; BTVER2-NEXT: vsubss %xmm0, %xmm2, %xmm0 # sched: [3:1.00]
125 ; BTVER2-NEXT: vmulss %xmm0, %xmm1, %xmm0 # sched: [2:1.00]
126 ; BTVER2-NEXT: vaddss %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
127 ; BTVER2-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [7:1.00]
128 ; BTVER2-NEXT: retq # sched: [4:1.00]
130 ; SANDY-LABEL: f32_one_step_2:
132 ; SANDY-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
133 ; SANDY-NEXT: vmulss %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
134 ; SANDY-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [6:0.50]
135 ; SANDY-NEXT: vsubss %xmm0, %xmm2, %xmm0 # sched: [3:1.00]
136 ; SANDY-NEXT: vmulss %xmm0, %xmm1, %xmm0 # sched: [5:1.00]
137 ; SANDY-NEXT: vaddss %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
138 ; SANDY-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [11:1.00]
139 ; SANDY-NEXT: retq # sched: [1:1.00]
141 ; HASWELL-LABEL: f32_one_step_2:
143 ; HASWELL-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
144 ; HASWELL-NEXT: vfnmadd213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) + mem sched: [10:0.50]
145 ; HASWELL-NEXT: vfmadd132ss {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1 sched: [5:0.50]
146 ; HASWELL-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [10:0.50]
147 ; HASWELL-NEXT: retq # sched: [7:1.00]
149 ; HASWELL-NO-FMA-LABEL: f32_one_step_2:
150 ; HASWELL-NO-FMA: # %bb.0:
151 ; HASWELL-NO-FMA-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
152 ; HASWELL-NO-FMA-NEXT: vmulss %xmm1, %xmm0, %xmm0 # sched: [5:0.50]
153 ; HASWELL-NO-FMA-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [5:0.50]
154 ; HASWELL-NO-FMA-NEXT: vsubss %xmm0, %xmm2, %xmm0 # sched: [3:1.00]
155 ; HASWELL-NO-FMA-NEXT: vmulss %xmm0, %xmm1, %xmm0 # sched: [5:0.50]
156 ; HASWELL-NO-FMA-NEXT: vaddss %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
157 ; HASWELL-NO-FMA-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [10:0.50]
158 ; HASWELL-NO-FMA-NEXT: retq # sched: [7:1.00]
160 ; KNL-LABEL: f32_one_step_2:
162 ; KNL-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
163 ; KNL-NEXT: vfnmadd213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) + mem sched: [10:0.50]
164 ; KNL-NEXT: vfmadd132ss {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1 sched: [5:0.50]
165 ; KNL-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [10:0.50]
166 ; KNL-NEXT: retq # sched: [7:1.00]
168 ; SKX-LABEL: f32_one_step_2:
170 ; SKX-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [4:1.00]
171 ; SKX-NEXT: vfnmadd213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) + mem sched: [9:0.50]
172 ; SKX-NEXT: vfmadd132ss {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1 sched: [4:0.50]
173 ; SKX-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [9:0.50]
174 ; SKX-NEXT: retq # sched: [7:1.00]
175 %div = fdiv fast float 3456.0, %x
179 define float @f32_one_step_2_divs(float %x) #1 {
180 ; SSE-LABEL: f32_one_step_2_divs:
182 ; SSE-NEXT: rcpss %xmm0, %xmm1
183 ; SSE-NEXT: mulss %xmm1, %xmm0
184 ; SSE-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
185 ; SSE-NEXT: subss %xmm0, %xmm2
186 ; SSE-NEXT: mulss %xmm1, %xmm2
187 ; SSE-NEXT: addss %xmm1, %xmm2
188 ; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
189 ; SSE-NEXT: mulss %xmm2, %xmm0
190 ; SSE-NEXT: mulss %xmm2, %xmm0
193 ; AVX-RECIP-LABEL: f32_one_step_2_divs:
194 ; AVX-RECIP: # %bb.0:
195 ; AVX-RECIP-NEXT: vrcpss %xmm0, %xmm0, %xmm1
196 ; AVX-RECIP-NEXT: vmulss %xmm1, %xmm0, %xmm0
197 ; AVX-RECIP-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
198 ; AVX-RECIP-NEXT: vsubss %xmm0, %xmm2, %xmm0
199 ; AVX-RECIP-NEXT: vmulss %xmm0, %xmm1, %xmm0
200 ; AVX-RECIP-NEXT: vaddss %xmm0, %xmm1, %xmm0
201 ; AVX-RECIP-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm1
202 ; AVX-RECIP-NEXT: vmulss %xmm0, %xmm1, %xmm0
203 ; AVX-RECIP-NEXT: retq
205 ; FMA-RECIP-LABEL: f32_one_step_2_divs:
206 ; FMA-RECIP: # %bb.0:
207 ; FMA-RECIP-NEXT: vrcpss %xmm0, %xmm0, %xmm1
208 ; FMA-RECIP-NEXT: vfnmadd213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) + mem
209 ; FMA-RECIP-NEXT: vfmadd132ss {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1
210 ; FMA-RECIP-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm1
211 ; FMA-RECIP-NEXT: vmulss %xmm0, %xmm1, %xmm0
212 ; FMA-RECIP-NEXT: retq
214 ; BDVER2-LABEL: f32_one_step_2_divs:
216 ; BDVER2-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
217 ; BDVER2-NEXT: vfnmaddss {{.*}}(%rip), %xmm1, %xmm0, %xmm0 # sched: [10:0.50]
218 ; BDVER2-NEXT: vfmaddss %xmm1, %xmm0, %xmm1, %xmm0 # sched: [5:0.50]
219 ; BDVER2-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm1 # sched: [10:1.00]
220 ; BDVER2-NEXT: vmulss %xmm0, %xmm1, %xmm0 # sched: [5:1.00]
221 ; BDVER2-NEXT: retq # sched: [5:1.00]
223 ; BTVER2-LABEL: f32_one_step_2_divs:
225 ; BTVER2-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [5:1.00]
226 ; BTVER2-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [2:1.00]
227 ; BTVER2-NEXT: vmulss %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
228 ; BTVER2-NEXT: vsubss %xmm0, %xmm2, %xmm0 # sched: [3:1.00]
229 ; BTVER2-NEXT: vmulss %xmm0, %xmm1, %xmm0 # sched: [2:1.00]
230 ; BTVER2-NEXT: vaddss %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
231 ; BTVER2-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm1 # sched: [7:1.00]
232 ; BTVER2-NEXT: vmulss %xmm0, %xmm1, %xmm0 # sched: [2:1.00]
233 ; BTVER2-NEXT: retq # sched: [4:1.00]
235 ; SANDY-LABEL: f32_one_step_2_divs:
237 ; SANDY-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
238 ; SANDY-NEXT: vmulss %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
239 ; SANDY-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [6:0.50]
240 ; SANDY-NEXT: vsubss %xmm0, %xmm2, %xmm0 # sched: [3:1.00]
241 ; SANDY-NEXT: vmulss %xmm0, %xmm1, %xmm0 # sched: [5:1.00]
242 ; SANDY-NEXT: vaddss %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
243 ; SANDY-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm1 # sched: [11:1.00]
244 ; SANDY-NEXT: vmulss %xmm0, %xmm1, %xmm0 # sched: [5:1.00]
245 ; SANDY-NEXT: retq # sched: [1:1.00]
247 ; HASWELL-LABEL: f32_one_step_2_divs:
249 ; HASWELL-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
250 ; HASWELL-NEXT: vfnmadd213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) + mem sched: [10:0.50]
251 ; HASWELL-NEXT: vfmadd132ss {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1 sched: [5:0.50]
252 ; HASWELL-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm1 # sched: [10:0.50]
253 ; HASWELL-NEXT: vmulss %xmm0, %xmm1, %xmm0 # sched: [5:0.50]
254 ; HASWELL-NEXT: retq # sched: [7:1.00]
256 ; HASWELL-NO-FMA-LABEL: f32_one_step_2_divs:
257 ; HASWELL-NO-FMA: # %bb.0:
258 ; HASWELL-NO-FMA-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
259 ; HASWELL-NO-FMA-NEXT: vmulss %xmm1, %xmm0, %xmm0 # sched: [5:0.50]
260 ; HASWELL-NO-FMA-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [5:0.50]
261 ; HASWELL-NO-FMA-NEXT: vsubss %xmm0, %xmm2, %xmm0 # sched: [3:1.00]
262 ; HASWELL-NO-FMA-NEXT: vmulss %xmm0, %xmm1, %xmm0 # sched: [5:0.50]
263 ; HASWELL-NO-FMA-NEXT: vaddss %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
264 ; HASWELL-NO-FMA-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm1 # sched: [10:0.50]
265 ; HASWELL-NO-FMA-NEXT: vmulss %xmm0, %xmm1, %xmm0 # sched: [5:0.50]
266 ; HASWELL-NO-FMA-NEXT: retq # sched: [7:1.00]
268 ; KNL-LABEL: f32_one_step_2_divs:
270 ; KNL-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
271 ; KNL-NEXT: vfnmadd213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) + mem sched: [10:0.50]
272 ; KNL-NEXT: vfmadd132ss {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1 sched: [5:0.50]
273 ; KNL-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm1 # sched: [10:0.50]
274 ; KNL-NEXT: vmulss %xmm0, %xmm1, %xmm0 # sched: [5:0.50]
275 ; KNL-NEXT: retq # sched: [7:1.00]
277 ; SKX-LABEL: f32_one_step_2_divs:
279 ; SKX-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [4:1.00]
280 ; SKX-NEXT: vfnmadd213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) + mem sched: [9:0.50]
281 ; SKX-NEXT: vfmadd132ss {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1 sched: [4:0.50]
282 ; SKX-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm1 # sched: [9:0.50]
283 ; SKX-NEXT: vmulss %xmm0, %xmm1, %xmm0 # sched: [4:0.50]
284 ; SKX-NEXT: retq # sched: [7:1.00]
285 %div = fdiv fast float 3456.0, %x
286 %div2 = fdiv fast float %div, %x
290 define float @f32_two_step_2(float %x) #2 {
291 ; SSE-LABEL: f32_two_step_2:
293 ; SSE-NEXT: rcpss %xmm0, %xmm2
294 ; SSE-NEXT: movaps %xmm0, %xmm3
295 ; SSE-NEXT: mulss %xmm2, %xmm3
296 ; SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
297 ; SSE-NEXT: movaps %xmm1, %xmm4
298 ; SSE-NEXT: subss %xmm3, %xmm4
299 ; SSE-NEXT: mulss %xmm2, %xmm4
300 ; SSE-NEXT: addss %xmm2, %xmm4
301 ; SSE-NEXT: mulss %xmm4, %xmm0
302 ; SSE-NEXT: subss %xmm0, %xmm1
303 ; SSE-NEXT: mulss %xmm4, %xmm1
304 ; SSE-NEXT: addss %xmm4, %xmm1
305 ; SSE-NEXT: mulss {{.*}}(%rip), %xmm1
306 ; SSE-NEXT: movaps %xmm1, %xmm0
309 ; AVX-RECIP-LABEL: f32_two_step_2:
310 ; AVX-RECIP: # %bb.0:
311 ; AVX-RECIP-NEXT: vrcpss %xmm0, %xmm0, %xmm1
312 ; AVX-RECIP-NEXT: vmulss %xmm1, %xmm0, %xmm2
313 ; AVX-RECIP-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
314 ; AVX-RECIP-NEXT: vsubss %xmm2, %xmm3, %xmm2
315 ; AVX-RECIP-NEXT: vmulss %xmm2, %xmm1, %xmm2
316 ; AVX-RECIP-NEXT: vaddss %xmm2, %xmm1, %xmm1
317 ; AVX-RECIP-NEXT: vmulss %xmm1, %xmm0, %xmm0
318 ; AVX-RECIP-NEXT: vsubss %xmm0, %xmm3, %xmm0
319 ; AVX-RECIP-NEXT: vmulss %xmm0, %xmm1, %xmm0
320 ; AVX-RECIP-NEXT: vaddss %xmm0, %xmm1, %xmm0
321 ; AVX-RECIP-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0
322 ; AVX-RECIP-NEXT: retq
324 ; FMA-RECIP-LABEL: f32_two_step_2:
325 ; FMA-RECIP: # %bb.0:
326 ; FMA-RECIP-NEXT: vrcpss %xmm0, %xmm0, %xmm1
327 ; FMA-RECIP-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
328 ; FMA-RECIP-NEXT: vmovaps %xmm1, %xmm3
329 ; FMA-RECIP-NEXT: vfnmadd213ss {{.*#+}} xmm3 = -(xmm0 * xmm3) + xmm2
330 ; FMA-RECIP-NEXT: vfmadd132ss {{.*#+}} xmm3 = (xmm3 * xmm1) + xmm1
331 ; FMA-RECIP-NEXT: vfnmadd213ss {{.*#+}} xmm0 = -(xmm3 * xmm0) + xmm2
332 ; FMA-RECIP-NEXT: vfmadd132ss {{.*#+}} xmm0 = (xmm0 * xmm3) + xmm3
333 ; FMA-RECIP-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0
334 ; FMA-RECIP-NEXT: retq
336 ; BDVER2-LABEL: f32_two_step_2:
338 ; BDVER2-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
339 ; BDVER2-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [5:0.50]
340 ; BDVER2-NEXT: vfnmaddss %xmm2, %xmm1, %xmm0, %xmm3 # sched: [5:0.50]
341 ; BDVER2-NEXT: vfmaddss %xmm1, %xmm3, %xmm1, %xmm1 # sched: [5:0.50]
342 ; BDVER2-NEXT: vfnmaddss %xmm2, %xmm1, %xmm0, %xmm0 # sched: [5:0.50]
343 ; BDVER2-NEXT: vfmaddss %xmm1, %xmm0, %xmm1, %xmm0 # sched: [5:0.50]
344 ; BDVER2-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [10:1.00]
345 ; BDVER2-NEXT: retq # sched: [5:1.00]
347 ; BTVER2-LABEL: f32_two_step_2:
349 ; BTVER2-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero sched: [5:1.00]
350 ; BTVER2-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [2:1.00]
351 ; BTVER2-NEXT: vmulss %xmm1, %xmm0, %xmm2 # sched: [2:1.00]
352 ; BTVER2-NEXT: vsubss %xmm2, %xmm3, %xmm2 # sched: [3:1.00]
353 ; BTVER2-NEXT: vmulss %xmm2, %xmm1, %xmm2 # sched: [2:1.00]
354 ; BTVER2-NEXT: vaddss %xmm2, %xmm1, %xmm1 # sched: [3:1.00]
355 ; BTVER2-NEXT: vmulss %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
356 ; BTVER2-NEXT: vsubss %xmm0, %xmm3, %xmm0 # sched: [3:1.00]
357 ; BTVER2-NEXT: vmulss %xmm0, %xmm1, %xmm0 # sched: [2:1.00]
358 ; BTVER2-NEXT: vaddss %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
359 ; BTVER2-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [7:1.00]
360 ; BTVER2-NEXT: retq # sched: [4:1.00]
362 ; SANDY-LABEL: f32_two_step_2:
364 ; SANDY-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
365 ; SANDY-NEXT: vmulss %xmm1, %xmm0, %xmm2 # sched: [5:1.00]
366 ; SANDY-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero sched: [6:0.50]
367 ; SANDY-NEXT: vsubss %xmm2, %xmm3, %xmm2 # sched: [3:1.00]
368 ; SANDY-NEXT: vmulss %xmm2, %xmm1, %xmm2 # sched: [5:1.00]
369 ; SANDY-NEXT: vaddss %xmm2, %xmm1, %xmm1 # sched: [3:1.00]
370 ; SANDY-NEXT: vmulss %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
371 ; SANDY-NEXT: vsubss %xmm0, %xmm3, %xmm0 # sched: [3:1.00]
372 ; SANDY-NEXT: vmulss %xmm0, %xmm1, %xmm0 # sched: [5:1.00]
373 ; SANDY-NEXT: vaddss %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
374 ; SANDY-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [11:1.00]
375 ; SANDY-NEXT: retq # sched: [1:1.00]
377 ; HASWELL-LABEL: f32_two_step_2:
379 ; HASWELL-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
380 ; HASWELL-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [5:0.50]
381 ; HASWELL-NEXT: vmovaps %xmm1, %xmm3 # sched: [1:1.00]
382 ; HASWELL-NEXT: vfnmadd213ss {{.*#+}} xmm3 = -(xmm0 * xmm3) + xmm2 sched: [5:0.50]
383 ; HASWELL-NEXT: vfmadd132ss {{.*#+}} xmm3 = (xmm3 * xmm1) + xmm1 sched: [5:0.50]
384 ; HASWELL-NEXT: vfnmadd213ss {{.*#+}} xmm0 = -(xmm3 * xmm0) + xmm2 sched: [5:0.50]
385 ; HASWELL-NEXT: vfmadd132ss {{.*#+}} xmm0 = (xmm0 * xmm3) + xmm3 sched: [5:0.50]
386 ; HASWELL-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [10:0.50]
387 ; HASWELL-NEXT: retq # sched: [7:1.00]
389 ; HASWELL-NO-FMA-LABEL: f32_two_step_2:
390 ; HASWELL-NO-FMA: # %bb.0:
391 ; HASWELL-NO-FMA-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
392 ; HASWELL-NO-FMA-NEXT: vmulss %xmm1, %xmm0, %xmm2 # sched: [5:0.50]
393 ; HASWELL-NO-FMA-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero sched: [5:0.50]
394 ; HASWELL-NO-FMA-NEXT: vsubss %xmm2, %xmm3, %xmm2 # sched: [3:1.00]
395 ; HASWELL-NO-FMA-NEXT: vmulss %xmm2, %xmm1, %xmm2 # sched: [5:0.50]
396 ; HASWELL-NO-FMA-NEXT: vaddss %xmm2, %xmm1, %xmm1 # sched: [3:1.00]
397 ; HASWELL-NO-FMA-NEXT: vmulss %xmm1, %xmm0, %xmm0 # sched: [5:0.50]
398 ; HASWELL-NO-FMA-NEXT: vsubss %xmm0, %xmm3, %xmm0 # sched: [3:1.00]
399 ; HASWELL-NO-FMA-NEXT: vmulss %xmm0, %xmm1, %xmm0 # sched: [5:0.50]
400 ; HASWELL-NO-FMA-NEXT: vaddss %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
401 ; HASWELL-NO-FMA-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [10:0.50]
402 ; HASWELL-NO-FMA-NEXT: retq # sched: [7:1.00]
404 ; KNL-LABEL: f32_two_step_2:
406 ; KNL-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
407 ; KNL-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [5:0.50]
408 ; KNL-NEXT: vmovaps %xmm1, %xmm3 # sched: [1:1.00]
409 ; KNL-NEXT: vfnmadd213ss {{.*#+}} xmm3 = -(xmm0 * xmm3) + xmm2 sched: [5:0.50]
410 ; KNL-NEXT: vfmadd132ss {{.*#+}} xmm3 = (xmm3 * xmm1) + xmm1 sched: [5:0.50]
411 ; KNL-NEXT: vfnmadd213ss {{.*#+}} xmm0 = -(xmm3 * xmm0) + xmm2 sched: [5:0.50]
412 ; KNL-NEXT: vfmadd132ss {{.*#+}} xmm0 = (xmm0 * xmm3) + xmm3 sched: [5:0.50]
413 ; KNL-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [10:0.50]
414 ; KNL-NEXT: retq # sched: [7:1.00]
416 ; SKX-LABEL: f32_two_step_2:
418 ; SKX-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [4:1.00]
419 ; SKX-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [5:0.50]
420 ; SKX-NEXT: vmovaps %xmm1, %xmm3 # sched: [1:0.33]
421 ; SKX-NEXT: vfnmadd213ss {{.*#+}} xmm3 = -(xmm0 * xmm3) + xmm2 sched: [4:0.50]
422 ; SKX-NEXT: vfmadd132ss {{.*#+}} xmm3 = (xmm3 * xmm1) + xmm1 sched: [4:0.50]
423 ; SKX-NEXT: vfnmadd213ss {{.*#+}} xmm0 = -(xmm3 * xmm0) + xmm2 sched: [4:0.50]
424 ; SKX-NEXT: vfmadd132ss {{.*#+}} xmm0 = (xmm0 * xmm3) + xmm3 sched: [4:0.50]
425 ; SKX-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [9:0.50]
426 ; SKX-NEXT: retq # sched: [7:1.00]
427 %div = fdiv fast float 6789.0, %x
431 define <4 x float> @v4f32_one_step2(<4 x float> %x) #1 {
432 ; SSE-LABEL: v4f32_one_step2:
434 ; SSE-NEXT: rcpps %xmm0, %xmm2
435 ; SSE-NEXT: mulps %xmm2, %xmm0
436 ; SSE-NEXT: movaps {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
437 ; SSE-NEXT: subps %xmm0, %xmm1
438 ; SSE-NEXT: mulps %xmm2, %xmm1
439 ; SSE-NEXT: addps %xmm2, %xmm1
440 ; SSE-NEXT: mulps {{.*}}(%rip), %xmm1
441 ; SSE-NEXT: movaps %xmm1, %xmm0
444 ; AVX-RECIP-LABEL: v4f32_one_step2:
445 ; AVX-RECIP: # %bb.0:
446 ; AVX-RECIP-NEXT: vrcpps %xmm0, %xmm1
447 ; AVX-RECIP-NEXT: vmulps %xmm1, %xmm0, %xmm0
448 ; AVX-RECIP-NEXT: vmovaps {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
449 ; AVX-RECIP-NEXT: vsubps %xmm0, %xmm2, %xmm0
450 ; AVX-RECIP-NEXT: vmulps %xmm0, %xmm1, %xmm0
451 ; AVX-RECIP-NEXT: vaddps %xmm0, %xmm1, %xmm0
452 ; AVX-RECIP-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0
453 ; AVX-RECIP-NEXT: retq
455 ; FMA-RECIP-LABEL: v4f32_one_step2:
456 ; FMA-RECIP: # %bb.0:
457 ; FMA-RECIP-NEXT: vrcpps %xmm0, %xmm1
458 ; FMA-RECIP-NEXT: vfnmadd213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) + mem
459 ; FMA-RECIP-NEXT: vfmadd132ps {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1
460 ; FMA-RECIP-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0
461 ; FMA-RECIP-NEXT: retq
463 ; BDVER2-LABEL: v4f32_one_step2:
465 ; BDVER2-NEXT: vrcpps %xmm0, %xmm1 # sched: [5:1.00]
466 ; BDVER2-NEXT: vfnmaddps {{.*}}(%rip), %xmm1, %xmm0, %xmm0 # sched: [10:0.50]
467 ; BDVER2-NEXT: vfmaddps %xmm1, %xmm0, %xmm1, %xmm0 # sched: [5:0.50]
468 ; BDVER2-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [10:1.00]
469 ; BDVER2-NEXT: retq # sched: [5:1.00]
471 ; BTVER2-LABEL: v4f32_one_step2:
473 ; BTVER2-NEXT: vmovaps {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [5:1.00]
474 ; BTVER2-NEXT: vrcpps %xmm0, %xmm1 # sched: [2:1.00]
475 ; BTVER2-NEXT: vmulps %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
476 ; BTVER2-NEXT: vsubps %xmm0, %xmm2, %xmm0 # sched: [3:1.00]
477 ; BTVER2-NEXT: vmulps %xmm0, %xmm1, %xmm0 # sched: [2:1.00]
478 ; BTVER2-NEXT: vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
479 ; BTVER2-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [7:1.00]
480 ; BTVER2-NEXT: retq # sched: [4:1.00]
482 ; SANDY-LABEL: v4f32_one_step2:
484 ; SANDY-NEXT: vrcpps %xmm0, %xmm1 # sched: [5:1.00]
485 ; SANDY-NEXT: vmulps %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
486 ; SANDY-NEXT: vmovaps {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [6:0.50]
487 ; SANDY-NEXT: vsubps %xmm0, %xmm2, %xmm0 # sched: [3:1.00]
488 ; SANDY-NEXT: vmulps %xmm0, %xmm1, %xmm0 # sched: [5:1.00]
489 ; SANDY-NEXT: vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
490 ; SANDY-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [11:1.00]
491 ; SANDY-NEXT: retq # sched: [1:1.00]
493 ; HASWELL-LABEL: v4f32_one_step2:
495 ; HASWELL-NEXT: vrcpps %xmm0, %xmm1 # sched: [5:1.00]
496 ; HASWELL-NEXT: vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [6:0.50]
497 ; HASWELL-NEXT: vfnmadd213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2 sched: [5:0.50]
498 ; HASWELL-NEXT: vfmadd132ps {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1 sched: [5:0.50]
499 ; HASWELL-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [11:0.50]
500 ; HASWELL-NEXT: retq # sched: [7:1.00]
502 ; HASWELL-NO-FMA-LABEL: v4f32_one_step2:
503 ; HASWELL-NO-FMA: # %bb.0:
504 ; HASWELL-NO-FMA-NEXT: vrcpps %xmm0, %xmm1 # sched: [5:1.00]
505 ; HASWELL-NO-FMA-NEXT: vmulps %xmm1, %xmm0, %xmm0 # sched: [5:0.50]
506 ; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [6:0.50]
507 ; HASWELL-NO-FMA-NEXT: vsubps %xmm0, %xmm2, %xmm0 # sched: [3:1.00]
508 ; HASWELL-NO-FMA-NEXT: vmulps %xmm0, %xmm1, %xmm0 # sched: [5:0.50]
509 ; HASWELL-NO-FMA-NEXT: vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
510 ; HASWELL-NO-FMA-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [11:0.50]
511 ; HASWELL-NO-FMA-NEXT: retq # sched: [7:1.00]
513 ; KNL-LABEL: v4f32_one_step2:
515 ; KNL-NEXT: vrcpps %xmm0, %xmm1 # sched: [5:1.00]
516 ; KNL-NEXT: vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [6:0.50]
517 ; KNL-NEXT: vfnmadd213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2 sched: [5:0.50]
518 ; KNL-NEXT: vfmadd132ps {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1 sched: [5:0.50]
519 ; KNL-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [11:0.50]
520 ; KNL-NEXT: retq # sched: [7:1.00]
522 ; SKX-LABEL: v4f32_one_step2:
524 ; SKX-NEXT: vrcpps %xmm0, %xmm1 # sched: [4:1.00]
525 ; SKX-NEXT: vfnmadd213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) + mem sched: [10:0.50]
526 ; SKX-NEXT: vfmadd132ps {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1 sched: [4:0.50]
527 ; SKX-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [10:0.50]
528 ; SKX-NEXT: retq # sched: [7:1.00]
529 %div = fdiv fast <4 x float> <float 1.0, float 2.0, float 3.0, float 4.0>, %x
533 define <4 x float> @v4f32_one_step_2_divs(<4 x float> %x) #1 {
534 ; SSE-LABEL: v4f32_one_step_2_divs:
536 ; SSE-NEXT: rcpps %xmm0, %xmm1
537 ; SSE-NEXT: mulps %xmm1, %xmm0
538 ; SSE-NEXT: movaps {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
539 ; SSE-NEXT: subps %xmm0, %xmm2
540 ; SSE-NEXT: mulps %xmm1, %xmm2
541 ; SSE-NEXT: addps %xmm1, %xmm2
542 ; SSE-NEXT: movaps {{.*#+}} xmm0 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0]
543 ; SSE-NEXT: mulps %xmm2, %xmm0
544 ; SSE-NEXT: mulps %xmm2, %xmm0
547 ; AVX-RECIP-LABEL: v4f32_one_step_2_divs:
548 ; AVX-RECIP: # %bb.0:
549 ; AVX-RECIP-NEXT: vrcpps %xmm0, %xmm1
550 ; AVX-RECIP-NEXT: vmulps %xmm1, %xmm0, %xmm0
551 ; AVX-RECIP-NEXT: vmovaps {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
552 ; AVX-RECIP-NEXT: vsubps %xmm0, %xmm2, %xmm0
553 ; AVX-RECIP-NEXT: vmulps %xmm0, %xmm1, %xmm0
554 ; AVX-RECIP-NEXT: vaddps %xmm0, %xmm1, %xmm0
555 ; AVX-RECIP-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm1
556 ; AVX-RECIP-NEXT: vmulps %xmm0, %xmm1, %xmm0
557 ; AVX-RECIP-NEXT: retq
559 ; FMA-RECIP-LABEL: v4f32_one_step_2_divs:
560 ; FMA-RECIP: # %bb.0:
561 ; FMA-RECIP-NEXT: vrcpps %xmm0, %xmm1
562 ; FMA-RECIP-NEXT: vfnmadd213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) + mem
563 ; FMA-RECIP-NEXT: vfmadd132ps {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1
564 ; FMA-RECIP-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm1
565 ; FMA-RECIP-NEXT: vmulps %xmm0, %xmm1, %xmm0
566 ; FMA-RECIP-NEXT: retq
568 ; BDVER2-LABEL: v4f32_one_step_2_divs:
570 ; BDVER2-NEXT: vrcpps %xmm0, %xmm1 # sched: [5:1.00]
571 ; BDVER2-NEXT: vfnmaddps {{.*}}(%rip), %xmm1, %xmm0, %xmm0 # sched: [10:0.50]
572 ; BDVER2-NEXT: vfmaddps %xmm1, %xmm0, %xmm1, %xmm0 # sched: [5:0.50]
573 ; BDVER2-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm1 # sched: [10:1.00]
574 ; BDVER2-NEXT: vmulps %xmm0, %xmm1, %xmm0 # sched: [5:1.00]
575 ; BDVER2-NEXT: retq # sched: [5:1.00]
577 ; BTVER2-LABEL: v4f32_one_step_2_divs:
579 ; BTVER2-NEXT: vmovaps {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [5:1.00]
580 ; BTVER2-NEXT: vrcpps %xmm0, %xmm1 # sched: [2:1.00]
581 ; BTVER2-NEXT: vmulps %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
582 ; BTVER2-NEXT: vsubps %xmm0, %xmm2, %xmm0 # sched: [3:1.00]
583 ; BTVER2-NEXT: vmulps %xmm0, %xmm1, %xmm0 # sched: [2:1.00]
584 ; BTVER2-NEXT: vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
585 ; BTVER2-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm1 # sched: [7:1.00]
586 ; BTVER2-NEXT: vmulps %xmm0, %xmm1, %xmm0 # sched: [2:1.00]
587 ; BTVER2-NEXT: retq # sched: [4:1.00]
589 ; SANDY-LABEL: v4f32_one_step_2_divs:
591 ; SANDY-NEXT: vrcpps %xmm0, %xmm1 # sched: [5:1.00]
592 ; SANDY-NEXT: vmulps %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
593 ; SANDY-NEXT: vmovaps {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [6:0.50]
594 ; SANDY-NEXT: vsubps %xmm0, %xmm2, %xmm0 # sched: [3:1.00]
595 ; SANDY-NEXT: vmulps %xmm0, %xmm1, %xmm0 # sched: [5:1.00]
596 ; SANDY-NEXT: vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
597 ; SANDY-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm1 # sched: [11:1.00]
598 ; SANDY-NEXT: vmulps %xmm0, %xmm1, %xmm0 # sched: [5:1.00]
599 ; SANDY-NEXT: retq # sched: [1:1.00]
601 ; HASWELL-LABEL: v4f32_one_step_2_divs:
603 ; HASWELL-NEXT: vrcpps %xmm0, %xmm1 # sched: [5:1.00]
604 ; HASWELL-NEXT: vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [6:0.50]
605 ; HASWELL-NEXT: vfnmadd213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2 sched: [5:0.50]
606 ; HASWELL-NEXT: vfmadd132ps {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1 sched: [5:0.50]
607 ; HASWELL-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm1 # sched: [11:0.50]
608 ; HASWELL-NEXT: vmulps %xmm0, %xmm1, %xmm0 # sched: [5:0.50]
609 ; HASWELL-NEXT: retq # sched: [7:1.00]
611 ; HASWELL-NO-FMA-LABEL: v4f32_one_step_2_divs:
612 ; HASWELL-NO-FMA: # %bb.0:
613 ; HASWELL-NO-FMA-NEXT: vrcpps %xmm0, %xmm1 # sched: [5:1.00]
614 ; HASWELL-NO-FMA-NEXT: vmulps %xmm1, %xmm0, %xmm0 # sched: [5:0.50]
615 ; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [6:0.50]
616 ; HASWELL-NO-FMA-NEXT: vsubps %xmm0, %xmm2, %xmm0 # sched: [3:1.00]
617 ; HASWELL-NO-FMA-NEXT: vmulps %xmm0, %xmm1, %xmm0 # sched: [5:0.50]
618 ; HASWELL-NO-FMA-NEXT: vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
619 ; HASWELL-NO-FMA-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm1 # sched: [11:0.50]
620 ; HASWELL-NO-FMA-NEXT: vmulps %xmm0, %xmm1, %xmm0 # sched: [5:0.50]
621 ; HASWELL-NO-FMA-NEXT: retq # sched: [7:1.00]
623 ; KNL-LABEL: v4f32_one_step_2_divs:
625 ; KNL-NEXT: vrcpps %xmm0, %xmm1 # sched: [5:1.00]
626 ; KNL-NEXT: vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [6:0.50]
627 ; KNL-NEXT: vfnmadd213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2 sched: [5:0.50]
628 ; KNL-NEXT: vfmadd132ps {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1 sched: [5:0.50]
629 ; KNL-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm1 # sched: [11:0.50]
630 ; KNL-NEXT: vmulps %xmm0, %xmm1, %xmm0 # sched: [5:0.50]
631 ; KNL-NEXT: retq # sched: [7:1.00]
633 ; SKX-LABEL: v4f32_one_step_2_divs:
635 ; SKX-NEXT: vrcpps %xmm0, %xmm1 # sched: [4:1.00]
636 ; SKX-NEXT: vfnmadd213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) + mem sched: [10:0.50]
637 ; SKX-NEXT: vfmadd132ps {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1 sched: [4:0.50]
638 ; SKX-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm1 # sched: [10:0.50]
639 ; SKX-NEXT: vmulps %xmm0, %xmm1, %xmm0 # sched: [4:0.50]
640 ; SKX-NEXT: retq # sched: [7:1.00]
641 %div = fdiv fast <4 x float> <float 1.0, float 2.0, float 3.0, float 4.0>, %x
642 %div2 = fdiv fast <4 x float> %div, %x
643 ret <4 x float> %div2
646 define <4 x float> @v4f32_two_step2(<4 x float> %x) #2 {
647 ; SSE-LABEL: v4f32_two_step2:
649 ; SSE-NEXT: rcpps %xmm0, %xmm2
650 ; SSE-NEXT: movaps %xmm0, %xmm3
651 ; SSE-NEXT: mulps %xmm2, %xmm3
652 ; SSE-NEXT: movaps {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
653 ; SSE-NEXT: movaps %xmm1, %xmm4
654 ; SSE-NEXT: subps %xmm3, %xmm4
655 ; SSE-NEXT: mulps %xmm2, %xmm4
656 ; SSE-NEXT: addps %xmm2, %xmm4
657 ; SSE-NEXT: mulps %xmm4, %xmm0
658 ; SSE-NEXT: subps %xmm0, %xmm1
659 ; SSE-NEXT: mulps %xmm4, %xmm1
660 ; SSE-NEXT: addps %xmm4, %xmm1
661 ; SSE-NEXT: mulps {{.*}}(%rip), %xmm1
662 ; SSE-NEXT: movaps %xmm1, %xmm0
665 ; AVX-RECIP-LABEL: v4f32_two_step2:
666 ; AVX-RECIP: # %bb.0:
667 ; AVX-RECIP-NEXT: vrcpps %xmm0, %xmm1
668 ; AVX-RECIP-NEXT: vmulps %xmm1, %xmm0, %xmm2
669 ; AVX-RECIP-NEXT: vmovaps {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
670 ; AVX-RECIP-NEXT: vsubps %xmm2, %xmm3, %xmm2
671 ; AVX-RECIP-NEXT: vmulps %xmm2, %xmm1, %xmm2
672 ; AVX-RECIP-NEXT: vaddps %xmm2, %xmm1, %xmm1
673 ; AVX-RECIP-NEXT: vmulps %xmm1, %xmm0, %xmm0
674 ; AVX-RECIP-NEXT: vsubps %xmm0, %xmm3, %xmm0
675 ; AVX-RECIP-NEXT: vmulps %xmm0, %xmm1, %xmm0
676 ; AVX-RECIP-NEXT: vaddps %xmm0, %xmm1, %xmm0
677 ; AVX-RECIP-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0
678 ; AVX-RECIP-NEXT: retq
680 ; FMA-RECIP-LABEL: v4f32_two_step2:
681 ; FMA-RECIP: # %bb.0:
682 ; FMA-RECIP-NEXT: vrcpps %xmm0, %xmm1
683 ; FMA-RECIP-NEXT: vmovaps {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
684 ; FMA-RECIP-NEXT: vmovaps %xmm1, %xmm3
685 ; FMA-RECIP-NEXT: vfnmadd213ps {{.*#+}} xmm3 = -(xmm0 * xmm3) + xmm2
686 ; FMA-RECIP-NEXT: vfmadd132ps {{.*#+}} xmm3 = (xmm3 * xmm1) + xmm1
687 ; FMA-RECIP-NEXT: vfnmadd213ps {{.*#+}} xmm0 = -(xmm3 * xmm0) + xmm2
688 ; FMA-RECIP-NEXT: vfmadd132ps {{.*#+}} xmm0 = (xmm0 * xmm3) + xmm3
689 ; FMA-RECIP-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0
690 ; FMA-RECIP-NEXT: retq
692 ; BDVER2-LABEL: v4f32_two_step2:
694 ; BDVER2-NEXT: vrcpps %xmm0, %xmm1 # sched: [5:1.00]
695 ; BDVER2-NEXT: vmovaps {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [5:0.50]
696 ; BDVER2-NEXT: vfnmaddps %xmm2, %xmm1, %xmm0, %xmm3 # sched: [5:0.50]
697 ; BDVER2-NEXT: vfmaddps %xmm1, %xmm3, %xmm1, %xmm1 # sched: [5:0.50]
698 ; BDVER2-NEXT: vfnmaddps %xmm2, %xmm1, %xmm0, %xmm0 # sched: [5:0.50]
699 ; BDVER2-NEXT: vfmaddps %xmm1, %xmm0, %xmm1, %xmm0 # sched: [5:0.50]
700 ; BDVER2-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [10:1.00]
701 ; BDVER2-NEXT: retq # sched: [5:1.00]
703 ; BTVER2-LABEL: v4f32_two_step2:
705 ; BTVER2-NEXT: vmovaps {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [5:1.00]
706 ; BTVER2-NEXT: vrcpps %xmm0, %xmm1 # sched: [2:1.00]
707 ; BTVER2-NEXT: vmulps %xmm1, %xmm0, %xmm2 # sched: [2:1.00]
708 ; BTVER2-NEXT: vsubps %xmm2, %xmm3, %xmm2 # sched: [3:1.00]
709 ; BTVER2-NEXT: vmulps %xmm2, %xmm1, %xmm2 # sched: [2:1.00]
710 ; BTVER2-NEXT: vaddps %xmm2, %xmm1, %xmm1 # sched: [3:1.00]
711 ; BTVER2-NEXT: vmulps %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
712 ; BTVER2-NEXT: vsubps %xmm0, %xmm3, %xmm0 # sched: [3:1.00]
713 ; BTVER2-NEXT: vmulps %xmm0, %xmm1, %xmm0 # sched: [2:1.00]
714 ; BTVER2-NEXT: vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
715 ; BTVER2-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [7:1.00]
716 ; BTVER2-NEXT: retq # sched: [4:1.00]
718 ; SANDY-LABEL: v4f32_two_step2:
720 ; SANDY-NEXT: vrcpps %xmm0, %xmm1 # sched: [5:1.00]
721 ; SANDY-NEXT: vmulps %xmm1, %xmm0, %xmm2 # sched: [5:1.00]
722 ; SANDY-NEXT: vmovaps {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [6:0.50]
723 ; SANDY-NEXT: vsubps %xmm2, %xmm3, %xmm2 # sched: [3:1.00]
724 ; SANDY-NEXT: vmulps %xmm2, %xmm1, %xmm2 # sched: [5:1.00]
725 ; SANDY-NEXT: vaddps %xmm2, %xmm1, %xmm1 # sched: [3:1.00]
726 ; SANDY-NEXT: vmulps %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
727 ; SANDY-NEXT: vsubps %xmm0, %xmm3, %xmm0 # sched: [3:1.00]
728 ; SANDY-NEXT: vmulps %xmm0, %xmm1, %xmm0 # sched: [5:1.00]
729 ; SANDY-NEXT: vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
730 ; SANDY-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [11:1.00]
731 ; SANDY-NEXT: retq # sched: [1:1.00]
733 ; HASWELL-LABEL: v4f32_two_step2:
735 ; HASWELL-NEXT: vrcpps %xmm0, %xmm1 # sched: [5:1.00]
736 ; HASWELL-NEXT: vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [6:0.50]
737 ; HASWELL-NEXT: vmovaps %xmm1, %xmm3 # sched: [1:1.00]
738 ; HASWELL-NEXT: vfnmadd213ps {{.*#+}} xmm3 = -(xmm0 * xmm3) + xmm2 sched: [5:0.50]
739 ; HASWELL-NEXT: vfmadd132ps {{.*#+}} xmm3 = (xmm3 * xmm1) + xmm1 sched: [5:0.50]
740 ; HASWELL-NEXT: vfnmadd213ps {{.*#+}} xmm0 = -(xmm3 * xmm0) + xmm2 sched: [5:0.50]
741 ; HASWELL-NEXT: vfmadd132ps {{.*#+}} xmm0 = (xmm0 * xmm3) + xmm3 sched: [5:0.50]
742 ; HASWELL-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [11:0.50]
743 ; HASWELL-NEXT: retq # sched: [7:1.00]
745 ; HASWELL-NO-FMA-LABEL: v4f32_two_step2:
746 ; HASWELL-NO-FMA: # %bb.0:
747 ; HASWELL-NO-FMA-NEXT: vrcpps %xmm0, %xmm1 # sched: [5:1.00]
748 ; HASWELL-NO-FMA-NEXT: vmulps %xmm1, %xmm0, %xmm2 # sched: [5:0.50]
749 ; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [6:0.50]
750 ; HASWELL-NO-FMA-NEXT: vsubps %xmm2, %xmm3, %xmm2 # sched: [3:1.00]
751 ; HASWELL-NO-FMA-NEXT: vmulps %xmm2, %xmm1, %xmm2 # sched: [5:0.50]
752 ; HASWELL-NO-FMA-NEXT: vaddps %xmm2, %xmm1, %xmm1 # sched: [3:1.00]
753 ; HASWELL-NO-FMA-NEXT: vmulps %xmm1, %xmm0, %xmm0 # sched: [5:0.50]
754 ; HASWELL-NO-FMA-NEXT: vsubps %xmm0, %xmm3, %xmm0 # sched: [3:1.00]
755 ; HASWELL-NO-FMA-NEXT: vmulps %xmm0, %xmm1, %xmm0 # sched: [5:0.50]
756 ; HASWELL-NO-FMA-NEXT: vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
757 ; HASWELL-NO-FMA-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [11:0.50]
758 ; HASWELL-NO-FMA-NEXT: retq # sched: [7:1.00]
760 ; KNL-LABEL: v4f32_two_step2:
762 ; KNL-NEXT: vrcpps %xmm0, %xmm1 # sched: [5:1.00]
763 ; KNL-NEXT: vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [6:0.50]
764 ; KNL-NEXT: vmovaps %xmm1, %xmm3 # sched: [1:1.00]
765 ; KNL-NEXT: vfnmadd213ps {{.*#+}} xmm3 = -(xmm0 * xmm3) + xmm2 sched: [5:0.50]
766 ; KNL-NEXT: vfmadd132ps {{.*#+}} xmm3 = (xmm3 * xmm1) + xmm1 sched: [5:0.50]
767 ; KNL-NEXT: vfnmadd213ps {{.*#+}} xmm0 = -(xmm3 * xmm0) + xmm2 sched: [5:0.50]
768 ; KNL-NEXT: vfmadd132ps {{.*#+}} xmm0 = (xmm0 * xmm3) + xmm3 sched: [5:0.50]
769 ; KNL-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [11:0.50]
770 ; KNL-NEXT: retq # sched: [7:1.00]
772 ; SKX-LABEL: v4f32_two_step2:
774 ; SKX-NEXT: vrcpps %xmm0, %xmm1 # sched: [4:1.00]
775 ; SKX-NEXT: vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [6:0.50]
776 ; SKX-NEXT: vmovaps %xmm1, %xmm3 # sched: [1:0.33]
777 ; SKX-NEXT: vfnmadd213ps {{.*#+}} xmm3 = -(xmm0 * xmm3) + xmm2 sched: [4:0.50]
778 ; SKX-NEXT: vfmadd132ps {{.*#+}} xmm3 = (xmm3 * xmm1) + xmm1 sched: [4:0.50]
779 ; SKX-NEXT: vfnmadd213ps {{.*#+}} xmm0 = -(xmm3 * xmm0) + xmm2 sched: [4:0.50]
780 ; SKX-NEXT: vfmadd132ps {{.*#+}} xmm0 = (xmm0 * xmm3) + xmm3 sched: [4:0.50]
781 ; SKX-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [10:0.50]
782 ; SKX-NEXT: retq # sched: [7:1.00]
783 %div = fdiv fast <4 x float> <float 1.0, float 2.0, float 3.0, float 4.0>, %x
787 define <8 x float> @v8f32_one_step2(<8 x float> %x) #1 {
788 ; SSE-LABEL: v8f32_one_step2:
790 ; SSE-NEXT: rcpps %xmm1, %xmm4
791 ; SSE-NEXT: mulps %xmm4, %xmm1
792 ; SSE-NEXT: movaps {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
793 ; SSE-NEXT: movaps %xmm2, %xmm3
794 ; SSE-NEXT: subps %xmm1, %xmm3
795 ; SSE-NEXT: mulps %xmm4, %xmm3
796 ; SSE-NEXT: addps %xmm4, %xmm3
797 ; SSE-NEXT: rcpps %xmm0, %xmm1
798 ; SSE-NEXT: mulps %xmm1, %xmm0
799 ; SSE-NEXT: subps %xmm0, %xmm2
800 ; SSE-NEXT: mulps %xmm1, %xmm2
801 ; SSE-NEXT: addps %xmm1, %xmm2
802 ; SSE-NEXT: mulps {{.*}}(%rip), %xmm2
803 ; SSE-NEXT: mulps {{.*}}(%rip), %xmm3
804 ; SSE-NEXT: movaps %xmm2, %xmm0
805 ; SSE-NEXT: movaps %xmm3, %xmm1
808 ; AVX-RECIP-LABEL: v8f32_one_step2:
809 ; AVX-RECIP: # %bb.0:
810 ; AVX-RECIP-NEXT: vrcpps %ymm0, %ymm1
811 ; AVX-RECIP-NEXT: vmulps %ymm1, %ymm0, %ymm0
812 ; AVX-RECIP-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
813 ; AVX-RECIP-NEXT: vsubps %ymm0, %ymm2, %ymm0
814 ; AVX-RECIP-NEXT: vmulps %ymm0, %ymm1, %ymm0
815 ; AVX-RECIP-NEXT: vaddps %ymm0, %ymm1, %ymm0
816 ; AVX-RECIP-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0
817 ; AVX-RECIP-NEXT: retq
819 ; FMA-RECIP-LABEL: v8f32_one_step2:
820 ; FMA-RECIP: # %bb.0:
821 ; FMA-RECIP-NEXT: vrcpps %ymm0, %ymm1
822 ; FMA-RECIP-NEXT: vfnmadd213ps {{.*#+}} ymm0 = -(ymm1 * ymm0) + mem
823 ; FMA-RECIP-NEXT: vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm1) + ymm1
824 ; FMA-RECIP-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0
825 ; FMA-RECIP-NEXT: retq
827 ; BDVER2-LABEL: v8f32_one_step2:
829 ; BDVER2-NEXT: vrcpps %ymm0, %ymm1 # sched: [5:2.00]
830 ; BDVER2-NEXT: vfnmaddps {{.*}}(%rip), %ymm1, %ymm0, %ymm0 # sched: [10:1.00]
831 ; BDVER2-NEXT: vfmaddps %ymm1, %ymm0, %ymm1, %ymm0 # sched: [5:0.50]
832 ; BDVER2-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [10:2.00]
833 ; BDVER2-NEXT: retq # sched: [5:1.00]
835 ; BTVER2-LABEL: v8f32_one_step2:
837 ; BTVER2-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [5:1.00]
838 ; BTVER2-NEXT: vrcpps %ymm0, %ymm1 # sched: [2:2.00]
839 ; BTVER2-NEXT: vmulps %ymm1, %ymm0, %ymm0 # sched: [2:2.00]
840 ; BTVER2-NEXT: vsubps %ymm0, %ymm2, %ymm0 # sched: [3:2.00]
841 ; BTVER2-NEXT: vmulps %ymm0, %ymm1, %ymm0 # sched: [2:2.00]
842 ; BTVER2-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:2.00]
843 ; BTVER2-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [7:2.00]
844 ; BTVER2-NEXT: retq # sched: [4:1.00]
846 ; SANDY-LABEL: v8f32_one_step2:
848 ; SANDY-NEXT: vrcpps %ymm0, %ymm1 # sched: [7:2.00]
849 ; SANDY-NEXT: vmulps %ymm1, %ymm0, %ymm0 # sched: [5:1.00]
850 ; SANDY-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [7:0.50]
851 ; SANDY-NEXT: vsubps %ymm0, %ymm2, %ymm0 # sched: [3:1.00]
852 ; SANDY-NEXT: vmulps %ymm0, %ymm1, %ymm0 # sched: [5:1.00]
853 ; SANDY-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
854 ; SANDY-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [12:1.00]
855 ; SANDY-NEXT: retq # sched: [1:1.00]
857 ; HASWELL-LABEL: v8f32_one_step2:
859 ; HASWELL-NEXT: vrcpps %ymm0, %ymm1 # sched: [11:2.00]
860 ; HASWELL-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [7:0.50]
861 ; HASWELL-NEXT: vfnmadd213ps {{.*#+}} ymm0 = -(ymm1 * ymm0) + ymm2 sched: [5:0.50]
862 ; HASWELL-NEXT: vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm1) + ymm1 sched: [5:0.50]
863 ; HASWELL-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [12:0.50]
864 ; HASWELL-NEXT: retq # sched: [7:1.00]
866 ; HASWELL-NO-FMA-LABEL: v8f32_one_step2:
867 ; HASWELL-NO-FMA: # %bb.0:
868 ; HASWELL-NO-FMA-NEXT: vrcpps %ymm0, %ymm1 # sched: [11:2.00]
869 ; HASWELL-NO-FMA-NEXT: vmulps %ymm1, %ymm0, %ymm0 # sched: [5:0.50]
870 ; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [7:0.50]
871 ; HASWELL-NO-FMA-NEXT: vsubps %ymm0, %ymm2, %ymm0 # sched: [3:1.00]
872 ; HASWELL-NO-FMA-NEXT: vmulps %ymm0, %ymm1, %ymm0 # sched: [5:0.50]
873 ; HASWELL-NO-FMA-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
874 ; HASWELL-NO-FMA-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [12:0.50]
875 ; HASWELL-NO-FMA-NEXT: retq # sched: [7:1.00]
877 ; KNL-LABEL: v8f32_one_step2:
879 ; KNL-NEXT: vrcpps %ymm0, %ymm1 # sched: [11:2.00]
880 ; KNL-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [7:0.50]
881 ; KNL-NEXT: vfnmadd213ps {{.*#+}} ymm0 = -(ymm1 * ymm0) + ymm2 sched: [5:0.50]
882 ; KNL-NEXT: vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm1) + ymm1 sched: [5:0.50]
883 ; KNL-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [12:0.50]
884 ; KNL-NEXT: retq # sched: [7:1.00]
886 ; SKX-LABEL: v8f32_one_step2:
888 ; SKX-NEXT: vrcpps %ymm0, %ymm1 # sched: [4:1.00]
889 ; SKX-NEXT: vfnmadd213ps {{.*#+}} ymm0 = -(ymm1 * ymm0) + mem sched: [11:0.50]
890 ; SKX-NEXT: vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm1) + ymm1 sched: [4:0.50]
891 ; SKX-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [11:0.50]
892 ; SKX-NEXT: retq # sched: [7:1.00]
893 %div = fdiv fast <8 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0>, %x
897 define <8 x float> @v8f32_one_step_2_divs(<8 x float> %x) #1 {
898 ; SSE-LABEL: v8f32_one_step_2_divs:
900 ; SSE-NEXT: rcpps %xmm0, %xmm2
901 ; SSE-NEXT: mulps %xmm2, %xmm0
902 ; SSE-NEXT: movaps {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
903 ; SSE-NEXT: movaps %xmm3, %xmm4
904 ; SSE-NEXT: subps %xmm0, %xmm4
905 ; SSE-NEXT: mulps %xmm2, %xmm4
906 ; SSE-NEXT: addps %xmm2, %xmm4
907 ; SSE-NEXT: rcpps %xmm1, %xmm0
908 ; SSE-NEXT: mulps %xmm0, %xmm1
909 ; SSE-NEXT: subps %xmm1, %xmm3
910 ; SSE-NEXT: mulps %xmm0, %xmm3
911 ; SSE-NEXT: addps %xmm0, %xmm3
912 ; SSE-NEXT: movaps {{.*#+}} xmm1 = [5.0E+0,6.0E+0,7.0E+0,8.0E+0]
913 ; SSE-NEXT: mulps %xmm3, %xmm1
914 ; SSE-NEXT: movaps {{.*#+}} xmm0 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0]
915 ; SSE-NEXT: mulps %xmm4, %xmm0
916 ; SSE-NEXT: mulps %xmm4, %xmm0
917 ; SSE-NEXT: mulps %xmm3, %xmm1
920 ; AVX-RECIP-LABEL: v8f32_one_step_2_divs:
921 ; AVX-RECIP: # %bb.0:
922 ; AVX-RECIP-NEXT: vrcpps %ymm0, %ymm1
923 ; AVX-RECIP-NEXT: vmulps %ymm1, %ymm0, %ymm0
924 ; AVX-RECIP-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
925 ; AVX-RECIP-NEXT: vsubps %ymm0, %ymm2, %ymm0
926 ; AVX-RECIP-NEXT: vmulps %ymm0, %ymm1, %ymm0
927 ; AVX-RECIP-NEXT: vaddps %ymm0, %ymm1, %ymm0
928 ; AVX-RECIP-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm1
929 ; AVX-RECIP-NEXT: vmulps %ymm0, %ymm1, %ymm0
930 ; AVX-RECIP-NEXT: retq
932 ; FMA-RECIP-LABEL: v8f32_one_step_2_divs:
933 ; FMA-RECIP: # %bb.0:
934 ; FMA-RECIP-NEXT: vrcpps %ymm0, %ymm1
935 ; FMA-RECIP-NEXT: vfnmadd213ps {{.*#+}} ymm0 = -(ymm1 * ymm0) + mem
936 ; FMA-RECIP-NEXT: vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm1) + ymm1
937 ; FMA-RECIP-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm1
938 ; FMA-RECIP-NEXT: vmulps %ymm0, %ymm1, %ymm0
939 ; FMA-RECIP-NEXT: retq
941 ; BDVER2-LABEL: v8f32_one_step_2_divs:
943 ; BDVER2-NEXT: vrcpps %ymm0, %ymm1 # sched: [5:2.00]
944 ; BDVER2-NEXT: vfnmaddps {{.*}}(%rip), %ymm1, %ymm0, %ymm0 # sched: [10:1.00]
945 ; BDVER2-NEXT: vfmaddps %ymm1, %ymm0, %ymm1, %ymm0 # sched: [5:0.50]
946 ; BDVER2-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm1 # sched: [10:2.00]
947 ; BDVER2-NEXT: vmulps %ymm0, %ymm1, %ymm0 # sched: [5:2.00]
948 ; BDVER2-NEXT: retq # sched: [5:1.00]
950 ; BTVER2-LABEL: v8f32_one_step_2_divs:
952 ; BTVER2-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [5:1.00]
953 ; BTVER2-NEXT: vrcpps %ymm0, %ymm1 # sched: [2:2.00]
954 ; BTVER2-NEXT: vmulps %ymm1, %ymm0, %ymm0 # sched: [2:2.00]
955 ; BTVER2-NEXT: vsubps %ymm0, %ymm2, %ymm0 # sched: [3:2.00]
956 ; BTVER2-NEXT: vmulps %ymm0, %ymm1, %ymm0 # sched: [2:2.00]
957 ; BTVER2-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:2.00]
958 ; BTVER2-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm1 # sched: [7:2.00]
959 ; BTVER2-NEXT: vmulps %ymm0, %ymm1, %ymm0 # sched: [2:2.00]
960 ; BTVER2-NEXT: retq # sched: [4:1.00]
962 ; SANDY-LABEL: v8f32_one_step_2_divs:
964 ; SANDY-NEXT: vrcpps %ymm0, %ymm1 # sched: [7:2.00]
965 ; SANDY-NEXT: vmulps %ymm1, %ymm0, %ymm0 # sched: [5:1.00]
966 ; SANDY-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [7:0.50]
967 ; SANDY-NEXT: vsubps %ymm0, %ymm2, %ymm0 # sched: [3:1.00]
968 ; SANDY-NEXT: vmulps %ymm0, %ymm1, %ymm0 # sched: [5:1.00]
969 ; SANDY-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
970 ; SANDY-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm1 # sched: [12:1.00]
971 ; SANDY-NEXT: vmulps %ymm0, %ymm1, %ymm0 # sched: [5:1.00]
972 ; SANDY-NEXT: retq # sched: [1:1.00]
974 ; HASWELL-LABEL: v8f32_one_step_2_divs:
976 ; HASWELL-NEXT: vrcpps %ymm0, %ymm1 # sched: [11:2.00]
977 ; HASWELL-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [7:0.50]
978 ; HASWELL-NEXT: vfnmadd213ps {{.*#+}} ymm0 = -(ymm1 * ymm0) + ymm2 sched: [5:0.50]
979 ; HASWELL-NEXT: vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm1) + ymm1 sched: [5:0.50]
980 ; HASWELL-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm1 # sched: [12:0.50]
981 ; HASWELL-NEXT: vmulps %ymm0, %ymm1, %ymm0 # sched: [5:0.50]
982 ; HASWELL-NEXT: retq # sched: [7:1.00]
984 ; HASWELL-NO-FMA-LABEL: v8f32_one_step_2_divs:
985 ; HASWELL-NO-FMA: # %bb.0:
986 ; HASWELL-NO-FMA-NEXT: vrcpps %ymm0, %ymm1 # sched: [11:2.00]
987 ; HASWELL-NO-FMA-NEXT: vmulps %ymm1, %ymm0, %ymm0 # sched: [5:0.50]
988 ; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [7:0.50]
989 ; HASWELL-NO-FMA-NEXT: vsubps %ymm0, %ymm2, %ymm0 # sched: [3:1.00]
990 ; HASWELL-NO-FMA-NEXT: vmulps %ymm0, %ymm1, %ymm0 # sched: [5:0.50]
991 ; HASWELL-NO-FMA-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
992 ; HASWELL-NO-FMA-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm1 # sched: [12:0.50]
993 ; HASWELL-NO-FMA-NEXT: vmulps %ymm0, %ymm1, %ymm0 # sched: [5:0.50]
994 ; HASWELL-NO-FMA-NEXT: retq # sched: [7:1.00]
996 ; KNL-LABEL: v8f32_one_step_2_divs:
998 ; KNL-NEXT: vrcpps %ymm0, %ymm1 # sched: [11:2.00]
999 ; KNL-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [7:0.50]
1000 ; KNL-NEXT: vfnmadd213ps {{.*#+}} ymm0 = -(ymm1 * ymm0) + ymm2 sched: [5:0.50]
1001 ; KNL-NEXT: vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm1) + ymm1 sched: [5:0.50]
1002 ; KNL-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm1 # sched: [12:0.50]
1003 ; KNL-NEXT: vmulps %ymm0, %ymm1, %ymm0 # sched: [5:0.50]
1004 ; KNL-NEXT: retq # sched: [7:1.00]
1006 ; SKX-LABEL: v8f32_one_step_2_divs:
1008 ; SKX-NEXT: vrcpps %ymm0, %ymm1 # sched: [4:1.00]
1009 ; SKX-NEXT: vfnmadd213ps {{.*#+}} ymm0 = -(ymm1 * ymm0) + mem sched: [11:0.50]
1010 ; SKX-NEXT: vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm1) + ymm1 sched: [4:0.50]
1011 ; SKX-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm1 # sched: [11:0.50]
1012 ; SKX-NEXT: vmulps %ymm0, %ymm1, %ymm0 # sched: [4:0.50]
1013 ; SKX-NEXT: retq # sched: [7:1.00]
1014 %div = fdiv fast <8 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0>, %x
1015 %div2 = fdiv fast <8 x float> %div, %x
1016 ret <8 x float> %div2
1019 define <8 x float> @v8f32_two_step2(<8 x float> %x) #2 {
1020 ; SSE-LABEL: v8f32_two_step2:
1022 ; SSE-NEXT: movaps %xmm0, %xmm2
1023 ; SSE-NEXT: rcpps %xmm1, %xmm3
1024 ; SSE-NEXT: movaps %xmm1, %xmm4
1025 ; SSE-NEXT: mulps %xmm3, %xmm4
1026 ; SSE-NEXT: movaps {{.*#+}} xmm0 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
1027 ; SSE-NEXT: movaps %xmm0, %xmm5
1028 ; SSE-NEXT: subps %xmm4, %xmm5
1029 ; SSE-NEXT: mulps %xmm3, %xmm5
1030 ; SSE-NEXT: addps %xmm3, %xmm5
1031 ; SSE-NEXT: mulps %xmm5, %xmm1
1032 ; SSE-NEXT: movaps %xmm0, %xmm3
1033 ; SSE-NEXT: subps %xmm1, %xmm3
1034 ; SSE-NEXT: mulps %xmm5, %xmm3
1035 ; SSE-NEXT: addps %xmm5, %xmm3
1036 ; SSE-NEXT: rcpps %xmm2, %xmm1
1037 ; SSE-NEXT: movaps %xmm2, %xmm4
1038 ; SSE-NEXT: mulps %xmm1, %xmm4
1039 ; SSE-NEXT: movaps %xmm0, %xmm5
1040 ; SSE-NEXT: subps %xmm4, %xmm5
1041 ; SSE-NEXT: mulps %xmm1, %xmm5
1042 ; SSE-NEXT: addps %xmm1, %xmm5
1043 ; SSE-NEXT: mulps %xmm5, %xmm2
1044 ; SSE-NEXT: subps %xmm2, %xmm0
1045 ; SSE-NEXT: mulps %xmm5, %xmm0
1046 ; SSE-NEXT: addps %xmm5, %xmm0
1047 ; SSE-NEXT: mulps {{.*}}(%rip), %xmm0
1048 ; SSE-NEXT: mulps {{.*}}(%rip), %xmm3
1049 ; SSE-NEXT: movaps %xmm3, %xmm1
1052 ; AVX-RECIP-LABEL: v8f32_two_step2:
1053 ; AVX-RECIP: # %bb.0:
1054 ; AVX-RECIP-NEXT: vrcpps %ymm0, %ymm1
1055 ; AVX-RECIP-NEXT: vmulps %ymm1, %ymm0, %ymm2
1056 ; AVX-RECIP-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
1057 ; AVX-RECIP-NEXT: vsubps %ymm2, %ymm3, %ymm2
1058 ; AVX-RECIP-NEXT: vmulps %ymm2, %ymm1, %ymm2
1059 ; AVX-RECIP-NEXT: vaddps %ymm2, %ymm1, %ymm1
1060 ; AVX-RECIP-NEXT: vmulps %ymm1, %ymm0, %ymm0
1061 ; AVX-RECIP-NEXT: vsubps %ymm0, %ymm3, %ymm0
1062 ; AVX-RECIP-NEXT: vmulps %ymm0, %ymm1, %ymm0
1063 ; AVX-RECIP-NEXT: vaddps %ymm0, %ymm1, %ymm0
1064 ; AVX-RECIP-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0
1065 ; AVX-RECIP-NEXT: retq
1067 ; FMA-RECIP-LABEL: v8f32_two_step2:
1068 ; FMA-RECIP: # %bb.0:
1069 ; FMA-RECIP-NEXT: vrcpps %ymm0, %ymm1
1070 ; FMA-RECIP-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
1071 ; FMA-RECIP-NEXT: vmovaps %ymm1, %ymm3
1072 ; FMA-RECIP-NEXT: vfnmadd213ps {{.*#+}} ymm3 = -(ymm0 * ymm3) + ymm2
1073 ; FMA-RECIP-NEXT: vfmadd132ps {{.*#+}} ymm3 = (ymm3 * ymm1) + ymm1
1074 ; FMA-RECIP-NEXT: vfnmadd213ps {{.*#+}} ymm0 = -(ymm3 * ymm0) + ymm2
1075 ; FMA-RECIP-NEXT: vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm3) + ymm3
1076 ; FMA-RECIP-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0
1077 ; FMA-RECIP-NEXT: retq
1079 ; BDVER2-LABEL: v8f32_two_step2:
1081 ; BDVER2-NEXT: vrcpps %ymm0, %ymm1 # sched: [5:2.00]
1082 ; BDVER2-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [5:0.50]
1083 ; BDVER2-NEXT: vfnmaddps %ymm2, %ymm1, %ymm0, %ymm3 # sched: [5:0.50]
1084 ; BDVER2-NEXT: vfmaddps %ymm1, %ymm3, %ymm1, %ymm1 # sched: [5:0.50]
1085 ; BDVER2-NEXT: vfnmaddps %ymm2, %ymm1, %ymm0, %ymm0 # sched: [5:0.50]
1086 ; BDVER2-NEXT: vfmaddps %ymm1, %ymm0, %ymm1, %ymm0 # sched: [5:0.50]
1087 ; BDVER2-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [10:2.00]
1088 ; BDVER2-NEXT: retq # sched: [5:1.00]
1090 ; BTVER2-LABEL: v8f32_two_step2:
1092 ; BTVER2-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [5:1.00]
1093 ; BTVER2-NEXT: vrcpps %ymm0, %ymm1 # sched: [2:2.00]
1094 ; BTVER2-NEXT: vmulps %ymm1, %ymm0, %ymm2 # sched: [2:2.00]
1095 ; BTVER2-NEXT: vsubps %ymm2, %ymm3, %ymm2 # sched: [3:2.00]
1096 ; BTVER2-NEXT: vmulps %ymm2, %ymm1, %ymm2 # sched: [2:2.00]
1097 ; BTVER2-NEXT: vaddps %ymm2, %ymm1, %ymm1 # sched: [3:2.00]
1098 ; BTVER2-NEXT: vmulps %ymm1, %ymm0, %ymm0 # sched: [2:2.00]
1099 ; BTVER2-NEXT: vsubps %ymm0, %ymm3, %ymm0 # sched: [3:2.00]
1100 ; BTVER2-NEXT: vmulps %ymm0, %ymm1, %ymm0 # sched: [2:2.00]
1101 ; BTVER2-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:2.00]
1102 ; BTVER2-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [7:2.00]
1103 ; BTVER2-NEXT: retq # sched: [4:1.00]
1105 ; SANDY-LABEL: v8f32_two_step2:
1107 ; SANDY-NEXT: vrcpps %ymm0, %ymm1 # sched: [7:2.00]
1108 ; SANDY-NEXT: vmulps %ymm1, %ymm0, %ymm2 # sched: [5:1.00]
1109 ; SANDY-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [7:0.50]
1110 ; SANDY-NEXT: vsubps %ymm2, %ymm3, %ymm2 # sched: [3:1.00]
1111 ; SANDY-NEXT: vmulps %ymm2, %ymm1, %ymm2 # sched: [5:1.00]
1112 ; SANDY-NEXT: vaddps %ymm2, %ymm1, %ymm1 # sched: [3:1.00]
1113 ; SANDY-NEXT: vmulps %ymm1, %ymm0, %ymm0 # sched: [5:1.00]
1114 ; SANDY-NEXT: vsubps %ymm0, %ymm3, %ymm0 # sched: [3:1.00]
1115 ; SANDY-NEXT: vmulps %ymm0, %ymm1, %ymm0 # sched: [5:1.00]
1116 ; SANDY-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
1117 ; SANDY-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [12:1.00]
1118 ; SANDY-NEXT: retq # sched: [1:1.00]
1120 ; HASWELL-LABEL: v8f32_two_step2:
1122 ; HASWELL-NEXT: vrcpps %ymm0, %ymm1 # sched: [11:2.00]
1123 ; HASWELL-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [7:0.50]
1124 ; HASWELL-NEXT: vmovaps %ymm1, %ymm3 # sched: [1:1.00]
1125 ; HASWELL-NEXT: vfnmadd213ps {{.*#+}} ymm3 = -(ymm0 * ymm3) + ymm2 sched: [5:0.50]
1126 ; HASWELL-NEXT: vfmadd132ps {{.*#+}} ymm3 = (ymm3 * ymm1) + ymm1 sched: [5:0.50]
1127 ; HASWELL-NEXT: vfnmadd213ps {{.*#+}} ymm0 = -(ymm3 * ymm0) + ymm2 sched: [5:0.50]
1128 ; HASWELL-NEXT: vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm3) + ymm3 sched: [5:0.50]
1129 ; HASWELL-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [12:0.50]
1130 ; HASWELL-NEXT: retq # sched: [7:1.00]
1132 ; HASWELL-NO-FMA-LABEL: v8f32_two_step2:
1133 ; HASWELL-NO-FMA: # %bb.0:
1134 ; HASWELL-NO-FMA-NEXT: vrcpps %ymm0, %ymm1 # sched: [11:2.00]
1135 ; HASWELL-NO-FMA-NEXT: vmulps %ymm1, %ymm0, %ymm2 # sched: [5:0.50]
1136 ; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [7:0.50]
1137 ; HASWELL-NO-FMA-NEXT: vsubps %ymm2, %ymm3, %ymm2 # sched: [3:1.00]
1138 ; HASWELL-NO-FMA-NEXT: vmulps %ymm2, %ymm1, %ymm2 # sched: [5:0.50]
1139 ; HASWELL-NO-FMA-NEXT: vaddps %ymm2, %ymm1, %ymm1 # sched: [3:1.00]
1140 ; HASWELL-NO-FMA-NEXT: vmulps %ymm1, %ymm0, %ymm0 # sched: [5:0.50]
1141 ; HASWELL-NO-FMA-NEXT: vsubps %ymm0, %ymm3, %ymm0 # sched: [3:1.00]
1142 ; HASWELL-NO-FMA-NEXT: vmulps %ymm0, %ymm1, %ymm0 # sched: [5:0.50]
1143 ; HASWELL-NO-FMA-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
1144 ; HASWELL-NO-FMA-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [12:0.50]
1145 ; HASWELL-NO-FMA-NEXT: retq # sched: [7:1.00]
1147 ; KNL-LABEL: v8f32_two_step2:
1149 ; KNL-NEXT: vrcpps %ymm0, %ymm1 # sched: [11:2.00]
1150 ; KNL-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [7:0.50]
1151 ; KNL-NEXT: vmovaps %ymm1, %ymm3 # sched: [1:1.00]
1152 ; KNL-NEXT: vfnmadd213ps {{.*#+}} ymm3 = -(ymm0 * ymm3) + ymm2 sched: [5:0.50]
1153 ; KNL-NEXT: vfmadd132ps {{.*#+}} ymm3 = (ymm3 * ymm1) + ymm1 sched: [5:0.50]
1154 ; KNL-NEXT: vfnmadd213ps {{.*#+}} ymm0 = -(ymm3 * ymm0) + ymm2 sched: [5:0.50]
1155 ; KNL-NEXT: vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm3) + ymm3 sched: [5:0.50]
1156 ; KNL-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [12:0.50]
1157 ; KNL-NEXT: retq # sched: [7:1.00]
1159 ; SKX-LABEL: v8f32_two_step2:
1161 ; SKX-NEXT: vrcpps %ymm0, %ymm1 # sched: [4:1.00]
1162 ; SKX-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [7:0.50]
1163 ; SKX-NEXT: vmovaps %ymm1, %ymm3 # sched: [1:0.33]
1164 ; SKX-NEXT: vfnmadd213ps {{.*#+}} ymm3 = -(ymm0 * ymm3) + ymm2 sched: [4:0.50]
1165 ; SKX-NEXT: vfmadd132ps {{.*#+}} ymm3 = (ymm3 * ymm1) + ymm1 sched: [4:0.50]
1166 ; SKX-NEXT: vfnmadd213ps {{.*#+}} ymm0 = -(ymm3 * ymm0) + ymm2 sched: [4:0.50]
1167 ; SKX-NEXT: vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm3) + ymm3 sched: [4:0.50]
1168 ; SKX-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [11:0.50]
1169 ; SKX-NEXT: retq # sched: [7:1.00]
1170 %div = fdiv fast <8 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0>, %x
1171 ret <8 x float> %div
1174 define <8 x float> @v8f32_no_step(<8 x float> %x) #3 {
1175 ; SSE-LABEL: v8f32_no_step:
1177 ; SSE-NEXT: rcpps %xmm0, %xmm0
1178 ; SSE-NEXT: rcpps %xmm1, %xmm1
1181 ; AVX-RECIP-LABEL: v8f32_no_step:
1182 ; AVX-RECIP: # %bb.0:
1183 ; AVX-RECIP-NEXT: vrcpps %ymm0, %ymm0
1184 ; AVX-RECIP-NEXT: retq
1186 ; FMA-RECIP-LABEL: v8f32_no_step:
1187 ; FMA-RECIP: # %bb.0:
1188 ; FMA-RECIP-NEXT: vrcpps %ymm0, %ymm0
1189 ; FMA-RECIP-NEXT: retq
1191 ; BDVER2-LABEL: v8f32_no_step:
1193 ; BDVER2-NEXT: vrcpps %ymm0, %ymm0 # sched: [5:2.00]
1194 ; BDVER2-NEXT: retq # sched: [5:1.00]
1196 ; BTVER2-LABEL: v8f32_no_step:
1198 ; BTVER2-NEXT: vrcpps %ymm0, %ymm0 # sched: [2:2.00]
1199 ; BTVER2-NEXT: retq # sched: [4:1.00]
1201 ; SANDY-LABEL: v8f32_no_step:
1203 ; SANDY-NEXT: vrcpps %ymm0, %ymm0 # sched: [7:2.00]
1204 ; SANDY-NEXT: retq # sched: [1:1.00]
1206 ; HASWELL-LABEL: v8f32_no_step:
1208 ; HASWELL-NEXT: vrcpps %ymm0, %ymm0 # sched: [11:2.00]
1209 ; HASWELL-NEXT: retq # sched: [7:1.00]
1211 ; HASWELL-NO-FMA-LABEL: v8f32_no_step:
1212 ; HASWELL-NO-FMA: # %bb.0:
1213 ; HASWELL-NO-FMA-NEXT: vrcpps %ymm0, %ymm0 # sched: [11:2.00]
1214 ; HASWELL-NO-FMA-NEXT: retq # sched: [7:1.00]
1216 ; KNL-LABEL: v8f32_no_step:
1218 ; KNL-NEXT: vrcpps %ymm0, %ymm0 # sched: [11:2.00]
1219 ; KNL-NEXT: retq # sched: [7:1.00]
1221 ; SKX-LABEL: v8f32_no_step:
1223 ; SKX-NEXT: vrcpps %ymm0, %ymm0 # sched: [4:1.00]
1224 ; SKX-NEXT: retq # sched: [7:1.00]
1225 %div = fdiv fast <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %x
1226 ret <8 x float> %div
1229 define <8 x float> @v8f32_no_step2(<8 x float> %x) #3 {
1230 ; SSE-LABEL: v8f32_no_step2:
1232 ; SSE-NEXT: rcpps %xmm1, %xmm1
1233 ; SSE-NEXT: rcpps %xmm0, %xmm0
1234 ; SSE-NEXT: mulps {{.*}}(%rip), %xmm0
1235 ; SSE-NEXT: mulps {{.*}}(%rip), %xmm1
1238 ; AVX-RECIP-LABEL: v8f32_no_step2:
1239 ; AVX-RECIP: # %bb.0:
1240 ; AVX-RECIP-NEXT: vrcpps %ymm0, %ymm0
1241 ; AVX-RECIP-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0
1242 ; AVX-RECIP-NEXT: retq
1244 ; FMA-RECIP-LABEL: v8f32_no_step2:
1245 ; FMA-RECIP: # %bb.0:
1246 ; FMA-RECIP-NEXT: vrcpps %ymm0, %ymm0
1247 ; FMA-RECIP-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0
1248 ; FMA-RECIP-NEXT: retq
1250 ; BDVER2-LABEL: v8f32_no_step2:
1252 ; BDVER2-NEXT: vrcpps %ymm0, %ymm0 # sched: [5:2.00]
1253 ; BDVER2-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [10:2.00]
1254 ; BDVER2-NEXT: retq # sched: [5:1.00]
1256 ; BTVER2-LABEL: v8f32_no_step2:
1258 ; BTVER2-NEXT: vrcpps %ymm0, %ymm0 # sched: [2:2.00]
1259 ; BTVER2-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [7:2.00]
1260 ; BTVER2-NEXT: retq # sched: [4:1.00]
1262 ; SANDY-LABEL: v8f32_no_step2:
1264 ; SANDY-NEXT: vrcpps %ymm0, %ymm0 # sched: [7:2.00]
1265 ; SANDY-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [12:1.00]
1266 ; SANDY-NEXT: retq # sched: [1:1.00]
1268 ; HASWELL-LABEL: v8f32_no_step2:
1270 ; HASWELL-NEXT: vrcpps %ymm0, %ymm0 # sched: [11:2.00]
1271 ; HASWELL-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [12:0.50]
1272 ; HASWELL-NEXT: retq # sched: [7:1.00]
1274 ; HASWELL-NO-FMA-LABEL: v8f32_no_step2:
1275 ; HASWELL-NO-FMA: # %bb.0:
1276 ; HASWELL-NO-FMA-NEXT: vrcpps %ymm0, %ymm0 # sched: [11:2.00]
1277 ; HASWELL-NO-FMA-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [12:0.50]
1278 ; HASWELL-NO-FMA-NEXT: retq # sched: [7:1.00]
1280 ; KNL-LABEL: v8f32_no_step2:
1282 ; KNL-NEXT: vrcpps %ymm0, %ymm0 # sched: [11:2.00]
1283 ; KNL-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [12:0.50]
1284 ; KNL-NEXT: retq # sched: [7:1.00]
1286 ; SKX-LABEL: v8f32_no_step2:
1288 ; SKX-NEXT: vrcpps %ymm0, %ymm0 # sched: [4:1.00]
1289 ; SKX-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [11:0.50]
1290 ; SKX-NEXT: retq # sched: [7:1.00]
1291 %div = fdiv fast <8 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0>, %x
1292 ret <8 x float> %div
1295 define <16 x float> @v16f32_one_step2(<16 x float> %x) #1 {
1296 ; SSE-LABEL: v16f32_one_step2:
1298 ; SSE-NEXT: movaps %xmm3, %xmm4
1299 ; SSE-NEXT: movaps %xmm2, %xmm5
1300 ; SSE-NEXT: movaps %xmm0, %xmm6
1301 ; SSE-NEXT: rcpps %xmm3, %xmm2
1302 ; SSE-NEXT: mulps %xmm2, %xmm4
1303 ; SSE-NEXT: movaps {{.*#+}} xmm0 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
1304 ; SSE-NEXT: movaps %xmm0, %xmm3
1305 ; SSE-NEXT: subps %xmm4, %xmm3
1306 ; SSE-NEXT: mulps %xmm2, %xmm3
1307 ; SSE-NEXT: addps %xmm2, %xmm3
1308 ; SSE-NEXT: rcpps %xmm5, %xmm4
1309 ; SSE-NEXT: mulps %xmm4, %xmm5
1310 ; SSE-NEXT: movaps %xmm0, %xmm2
1311 ; SSE-NEXT: subps %xmm5, %xmm2
1312 ; SSE-NEXT: mulps %xmm4, %xmm2
1313 ; SSE-NEXT: addps %xmm4, %xmm2
1314 ; SSE-NEXT: rcpps %xmm1, %xmm5
1315 ; SSE-NEXT: mulps %xmm5, %xmm1
1316 ; SSE-NEXT: movaps %xmm0, %xmm4
1317 ; SSE-NEXT: subps %xmm1, %xmm4
1318 ; SSE-NEXT: mulps %xmm5, %xmm4
1319 ; SSE-NEXT: addps %xmm5, %xmm4
1320 ; SSE-NEXT: rcpps %xmm6, %xmm1
1321 ; SSE-NEXT: mulps %xmm1, %xmm6
1322 ; SSE-NEXT: subps %xmm6, %xmm0
1323 ; SSE-NEXT: mulps %xmm1, %xmm0
1324 ; SSE-NEXT: addps %xmm1, %xmm0
1325 ; SSE-NEXT: mulps {{.*}}(%rip), %xmm0
1326 ; SSE-NEXT: mulps {{.*}}(%rip), %xmm4
1327 ; SSE-NEXT: mulps {{.*}}(%rip), %xmm2
1328 ; SSE-NEXT: mulps {{.*}}(%rip), %xmm3
1329 ; SSE-NEXT: movaps %xmm4, %xmm1
1332 ; AVX-RECIP-LABEL: v16f32_one_step2:
1333 ; AVX-RECIP: # %bb.0:
1334 ; AVX-RECIP-NEXT: vrcpps %ymm1, %ymm2
1335 ; AVX-RECIP-NEXT: vmulps %ymm2, %ymm1, %ymm1
1336 ; AVX-RECIP-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
1337 ; AVX-RECIP-NEXT: vsubps %ymm1, %ymm3, %ymm1
1338 ; AVX-RECIP-NEXT: vmulps %ymm1, %ymm2, %ymm1
1339 ; AVX-RECIP-NEXT: vaddps %ymm1, %ymm2, %ymm1
1340 ; AVX-RECIP-NEXT: vrcpps %ymm0, %ymm2
1341 ; AVX-RECIP-NEXT: vmulps %ymm2, %ymm0, %ymm0
1342 ; AVX-RECIP-NEXT: vsubps %ymm0, %ymm3, %ymm0
1343 ; AVX-RECIP-NEXT: vmulps %ymm0, %ymm2, %ymm0
1344 ; AVX-RECIP-NEXT: vaddps %ymm0, %ymm2, %ymm0
1345 ; AVX-RECIP-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0
1346 ; AVX-RECIP-NEXT: vmulps {{.*}}(%rip), %ymm1, %ymm1
1347 ; AVX-RECIP-NEXT: retq
1349 ; FMA-RECIP-LABEL: v16f32_one_step2:
1350 ; FMA-RECIP: # %bb.0:
1351 ; FMA-RECIP-NEXT: vrcpps %ymm1, %ymm2
1352 ; FMA-RECIP-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
1353 ; FMA-RECIP-NEXT: vfnmadd213ps {{.*#+}} ymm1 = -(ymm2 * ymm1) + ymm3
1354 ; FMA-RECIP-NEXT: vfmadd132ps {{.*#+}} ymm1 = (ymm1 * ymm2) + ymm2
1355 ; FMA-RECIP-NEXT: vrcpps %ymm0, %ymm2
1356 ; FMA-RECIP-NEXT: vfnmadd213ps {{.*#+}} ymm0 = -(ymm2 * ymm0) + ymm3
1357 ; FMA-RECIP-NEXT: vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm2) + ymm2
1358 ; FMA-RECIP-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0
1359 ; FMA-RECIP-NEXT: vmulps {{.*}}(%rip), %ymm1, %ymm1
1360 ; FMA-RECIP-NEXT: retq
1362 ; BDVER2-LABEL: v16f32_one_step2:
1364 ; BDVER2-NEXT: vrcpps %ymm1, %ymm2 # sched: [5:2.00]
1365 ; BDVER2-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [5:0.50]
1366 ; BDVER2-NEXT: vrcpps %ymm0, %ymm4 # sched: [5:2.00]
1367 ; BDVER2-NEXT: vfnmaddps %ymm3, %ymm2, %ymm1, %ymm1 # sched: [5:0.50]
1368 ; BDVER2-NEXT: vfnmaddps %ymm3, %ymm4, %ymm0, %ymm0 # sched: [5:0.50]
1369 ; BDVER2-NEXT: vfmaddps %ymm2, %ymm1, %ymm2, %ymm1 # sched: [5:0.50]
1370 ; BDVER2-NEXT: vfmaddps %ymm4, %ymm0, %ymm4, %ymm0 # sched: [5:0.50]
1371 ; BDVER2-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [10:2.00]
1372 ; BDVER2-NEXT: vmulps {{.*}}(%rip), %ymm1, %ymm1 # sched: [10:2.00]
1373 ; BDVER2-NEXT: retq # sched: [5:1.00]
1375 ; BTVER2-LABEL: v16f32_one_step2:
1377 ; BTVER2-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [5:1.00]
1378 ; BTVER2-NEXT: vrcpps %ymm1, %ymm2 # sched: [2:2.00]
1379 ; BTVER2-NEXT: vrcpps %ymm0, %ymm4 # sched: [2:2.00]
1380 ; BTVER2-NEXT: vmulps %ymm2, %ymm1, %ymm1 # sched: [2:2.00]
1381 ; BTVER2-NEXT: vmulps %ymm4, %ymm0, %ymm0 # sched: [2:2.00]
1382 ; BTVER2-NEXT: vsubps %ymm1, %ymm3, %ymm1 # sched: [3:2.00]
1383 ; BTVER2-NEXT: vsubps %ymm0, %ymm3, %ymm0 # sched: [3:2.00]
1384 ; BTVER2-NEXT: vmulps %ymm1, %ymm2, %ymm1 # sched: [2:2.00]
1385 ; BTVER2-NEXT: vmulps %ymm0, %ymm4, %ymm0 # sched: [2:2.00]
1386 ; BTVER2-NEXT: vaddps %ymm1, %ymm2, %ymm1 # sched: [3:2.00]
1387 ; BTVER2-NEXT: vaddps %ymm0, %ymm4, %ymm0 # sched: [3:2.00]
1388 ; BTVER2-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [7:2.00]
1389 ; BTVER2-NEXT: vmulps {{.*}}(%rip), %ymm1, %ymm1 # sched: [7:2.00]
1390 ; BTVER2-NEXT: retq # sched: [4:1.00]
1392 ; SANDY-LABEL: v16f32_one_step2:
1394 ; SANDY-NEXT: vrcpps %ymm1, %ymm2 # sched: [7:2.00]
1395 ; SANDY-NEXT: vmulps %ymm2, %ymm1, %ymm1 # sched: [5:1.00]
1396 ; SANDY-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [7:0.50]
1397 ; SANDY-NEXT: vsubps %ymm1, %ymm3, %ymm1 # sched: [3:1.00]
1398 ; SANDY-NEXT: vmulps %ymm1, %ymm2, %ymm1 # sched: [5:1.00]
1399 ; SANDY-NEXT: vaddps %ymm1, %ymm2, %ymm1 # sched: [3:1.00]
1400 ; SANDY-NEXT: vrcpps %ymm0, %ymm2 # sched: [7:2.00]
1401 ; SANDY-NEXT: vmulps %ymm2, %ymm0, %ymm0 # sched: [5:1.00]
1402 ; SANDY-NEXT: vsubps %ymm0, %ymm3, %ymm0 # sched: [3:1.00]
1403 ; SANDY-NEXT: vmulps %ymm0, %ymm2, %ymm0 # sched: [5:1.00]
1404 ; SANDY-NEXT: vaddps %ymm0, %ymm2, %ymm0 # sched: [3:1.00]
1405 ; SANDY-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [12:1.00]
1406 ; SANDY-NEXT: vmulps {{.*}}(%rip), %ymm1, %ymm1 # sched: [12:1.00]
1407 ; SANDY-NEXT: retq # sched: [1:1.00]
1409 ; HASWELL-LABEL: v16f32_one_step2:
1411 ; HASWELL-NEXT: vrcpps %ymm1, %ymm2 # sched: [11:2.00]
1412 ; HASWELL-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [7:0.50]
1413 ; HASWELL-NEXT: vrcpps %ymm0, %ymm4 # sched: [11:2.00]
1414 ; HASWELL-NEXT: vfnmadd213ps {{.*#+}} ymm1 = -(ymm2 * ymm1) + ymm3 sched: [5:0.50]
1415 ; HASWELL-NEXT: vfmadd132ps {{.*#+}} ymm1 = (ymm1 * ymm2) + ymm2 sched: [5:0.50]
1416 ; HASWELL-NEXT: vfnmadd213ps {{.*#+}} ymm0 = -(ymm4 * ymm0) + ymm3 sched: [5:0.50]
1417 ; HASWELL-NEXT: vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm4) + ymm4 sched: [5:0.50]
1418 ; HASWELL-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [12:0.50]
1419 ; HASWELL-NEXT: vmulps {{.*}}(%rip), %ymm1, %ymm1 # sched: [12:0.50]
1420 ; HASWELL-NEXT: retq # sched: [7:1.00]
1422 ; HASWELL-NO-FMA-LABEL: v16f32_one_step2:
1423 ; HASWELL-NO-FMA: # %bb.0:
1424 ; HASWELL-NO-FMA-NEXT: vrcpps %ymm1, %ymm2 # sched: [11:2.00]
1425 ; HASWELL-NO-FMA-NEXT: vmulps %ymm2, %ymm1, %ymm1 # sched: [5:0.50]
1426 ; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [7:0.50]
1427 ; HASWELL-NO-FMA-NEXT: vsubps %ymm1, %ymm3, %ymm1 # sched: [3:1.00]
1428 ; HASWELL-NO-FMA-NEXT: vmulps %ymm1, %ymm2, %ymm1 # sched: [5:0.50]
1429 ; HASWELL-NO-FMA-NEXT: vaddps %ymm1, %ymm2, %ymm1 # sched: [3:1.00]
1430 ; HASWELL-NO-FMA-NEXT: vrcpps %ymm0, %ymm2 # sched: [11:2.00]
1431 ; HASWELL-NO-FMA-NEXT: vmulps %ymm2, %ymm0, %ymm0 # sched: [5:0.50]
1432 ; HASWELL-NO-FMA-NEXT: vsubps %ymm0, %ymm3, %ymm0 # sched: [3:1.00]
1433 ; HASWELL-NO-FMA-NEXT: vmulps %ymm0, %ymm2, %ymm0 # sched: [5:0.50]
1434 ; HASWELL-NO-FMA-NEXT: vaddps %ymm0, %ymm2, %ymm0 # sched: [3:1.00]
1435 ; HASWELL-NO-FMA-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [12:0.50]
1436 ; HASWELL-NO-FMA-NEXT: vmulps {{.*}}(%rip), %ymm1, %ymm1 # sched: [12:0.50]
1437 ; HASWELL-NO-FMA-NEXT: retq # sched: [7:1.00]
1439 ; KNL-LABEL: v16f32_one_step2:
1441 ; KNL-NEXT: vrcp14ps %zmm0, %zmm1 # sched: [11:2.00]
1442 ; KNL-NEXT: vfnmadd213ps {{.*#+}} zmm0 = -(zmm1 * zmm0) + mem sched: [12:0.50]
1443 ; KNL-NEXT: vfmadd132ps {{.*#+}} zmm0 = (zmm0 * zmm1) + zmm1 sched: [5:0.50]
1444 ; KNL-NEXT: vmulps {{.*}}(%rip), %zmm0, %zmm0 # sched: [12:0.50]
1445 ; KNL-NEXT: retq # sched: [7:1.00]
1447 ; SKX-LABEL: v16f32_one_step2:
1449 ; SKX-NEXT: vrcp14ps %zmm0, %zmm1 # sched: [4:2.00]
1450 ; SKX-NEXT: vfnmadd213ps {{.*#+}} zmm0 = -(zmm1 * zmm0) + mem sched: [11:0.50]
1451 ; SKX-NEXT: vfmadd132ps {{.*#+}} zmm0 = (zmm0 * zmm1) + zmm1 sched: [4:0.50]
1452 ; SKX-NEXT: vmulps {{.*}}(%rip), %zmm0, %zmm0 # sched: [11:0.50]
1453 ; SKX-NEXT: retq # sched: [7:1.00]
1454 %div = fdiv fast <16 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0, float 9.0, float 10.0, float 11.0, float 12.0, float 13.0, float 14.0, float 15.0, float 16.0>, %x
1455 ret <16 x float> %div
1458 define <16 x float> @v16f32_one_step_2_divs(<16 x float> %x) #1 {
1459 ; SSE-LABEL: v16f32_one_step_2_divs:
1461 ; SSE-NEXT: rcpps %xmm0, %xmm6
1462 ; SSE-NEXT: mulps %xmm6, %xmm0
1463 ; SSE-NEXT: movaps {{.*#+}} xmm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
1464 ; SSE-NEXT: movaps %xmm4, %xmm5
1465 ; SSE-NEXT: subps %xmm0, %xmm5
1466 ; SSE-NEXT: mulps %xmm6, %xmm5
1467 ; SSE-NEXT: addps %xmm6, %xmm5
1468 ; SSE-NEXT: rcpps %xmm1, %xmm0
1469 ; SSE-NEXT: mulps %xmm0, %xmm1
1470 ; SSE-NEXT: movaps %xmm4, %xmm6
1471 ; SSE-NEXT: subps %xmm1, %xmm6
1472 ; SSE-NEXT: mulps %xmm0, %xmm6
1473 ; SSE-NEXT: addps %xmm0, %xmm6
1474 ; SSE-NEXT: rcpps %xmm2, %xmm0
1475 ; SSE-NEXT: mulps %xmm0, %xmm2
1476 ; SSE-NEXT: movaps %xmm4, %xmm7
1477 ; SSE-NEXT: subps %xmm2, %xmm7
1478 ; SSE-NEXT: mulps %xmm0, %xmm7
1479 ; SSE-NEXT: addps %xmm0, %xmm7
1480 ; SSE-NEXT: rcpps %xmm3, %xmm0
1481 ; SSE-NEXT: mulps %xmm0, %xmm3
1482 ; SSE-NEXT: subps %xmm3, %xmm4
1483 ; SSE-NEXT: mulps %xmm0, %xmm4
1484 ; SSE-NEXT: addps %xmm0, %xmm4
1485 ; SSE-NEXT: movaps {{.*#+}} xmm3 = [1.3E+1,1.4E+1,1.5E+1,1.6E+1]
1486 ; SSE-NEXT: mulps %xmm4, %xmm3
1487 ; SSE-NEXT: movaps {{.*#+}} xmm2 = [9.0E+0,1.0E+1,1.1E+1,1.2E+1]
1488 ; SSE-NEXT: mulps %xmm7, %xmm2
1489 ; SSE-NEXT: movaps {{.*#+}} xmm1 = [5.0E+0,6.0E+0,7.0E+0,8.0E+0]
1490 ; SSE-NEXT: mulps %xmm6, %xmm1
1491 ; SSE-NEXT: movaps {{.*#+}} xmm0 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0]
1492 ; SSE-NEXT: mulps %xmm5, %xmm0
1493 ; SSE-NEXT: mulps %xmm5, %xmm0
1494 ; SSE-NEXT: mulps %xmm6, %xmm1
1495 ; SSE-NEXT: mulps %xmm7, %xmm2
1496 ; SSE-NEXT: mulps %xmm4, %xmm3
1499 ; AVX-RECIP-LABEL: v16f32_one_step_2_divs:
1500 ; AVX-RECIP: # %bb.0:
1501 ; AVX-RECIP-NEXT: vrcpps %ymm0, %ymm2
1502 ; AVX-RECIP-NEXT: vmulps %ymm2, %ymm0, %ymm0
1503 ; AVX-RECIP-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
1504 ; AVX-RECIP-NEXT: vsubps %ymm0, %ymm3, %ymm0
1505 ; AVX-RECIP-NEXT: vmulps %ymm0, %ymm2, %ymm0
1506 ; AVX-RECIP-NEXT: vaddps %ymm0, %ymm2, %ymm0
1507 ; AVX-RECIP-NEXT: vrcpps %ymm1, %ymm2
1508 ; AVX-RECIP-NEXT: vmulps %ymm2, %ymm1, %ymm1
1509 ; AVX-RECIP-NEXT: vsubps %ymm1, %ymm3, %ymm1
1510 ; AVX-RECIP-NEXT: vmulps %ymm1, %ymm2, %ymm1
1511 ; AVX-RECIP-NEXT: vaddps %ymm1, %ymm2, %ymm1
1512 ; AVX-RECIP-NEXT: vmulps {{.*}}(%rip), %ymm1, %ymm2
1513 ; AVX-RECIP-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm3
1514 ; AVX-RECIP-NEXT: vmulps %ymm0, %ymm3, %ymm0
1515 ; AVX-RECIP-NEXT: vmulps %ymm1, %ymm2, %ymm1
1516 ; AVX-RECIP-NEXT: retq
1518 ; FMA-RECIP-LABEL: v16f32_one_step_2_divs:
1519 ; FMA-RECIP: # %bb.0:
1520 ; FMA-RECIP-NEXT: vrcpps %ymm0, %ymm2
1521 ; FMA-RECIP-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
1522 ; FMA-RECIP-NEXT: vfnmadd213ps {{.*#+}} ymm0 = -(ymm2 * ymm0) + ymm3
1523 ; FMA-RECIP-NEXT: vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm2) + ymm2
1524 ; FMA-RECIP-NEXT: vrcpps %ymm1, %ymm2
1525 ; FMA-RECIP-NEXT: vfnmadd213ps {{.*#+}} ymm1 = -(ymm2 * ymm1) + ymm3
1526 ; FMA-RECIP-NEXT: vfmadd132ps {{.*#+}} ymm1 = (ymm1 * ymm2) + ymm2
1527 ; FMA-RECIP-NEXT: vmulps {{.*}}(%rip), %ymm1, %ymm2
1528 ; FMA-RECIP-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm3
1529 ; FMA-RECIP-NEXT: vmulps %ymm0, %ymm3, %ymm0
1530 ; FMA-RECIP-NEXT: vmulps %ymm1, %ymm2, %ymm1
1531 ; FMA-RECIP-NEXT: retq
1533 ; BDVER2-LABEL: v16f32_one_step_2_divs:
1535 ; BDVER2-NEXT: vrcpps %ymm0, %ymm2 # sched: [5:2.00]
1536 ; BDVER2-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [5:0.50]
1537 ; BDVER2-NEXT: vfnmaddps %ymm3, %ymm2, %ymm0, %ymm0 # sched: [5:0.50]
1538 ; BDVER2-NEXT: vfmaddps %ymm2, %ymm0, %ymm2, %ymm0 # sched: [5:0.50]
1539 ; BDVER2-NEXT: vrcpps %ymm1, %ymm2 # sched: [5:2.00]
1540 ; BDVER2-NEXT: vfnmaddps %ymm3, %ymm2, %ymm1, %ymm1 # sched: [5:0.50]
1541 ; BDVER2-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm3 # sched: [10:2.00]
1542 ; BDVER2-NEXT: vfmaddps %ymm2, %ymm1, %ymm2, %ymm1 # sched: [5:0.50]
1543 ; BDVER2-NEXT: vmulps {{.*}}(%rip), %ymm1, %ymm2 # sched: [10:2.00]
1544 ; BDVER2-NEXT: vmulps %ymm0, %ymm3, %ymm0 # sched: [5:2.00]
1545 ; BDVER2-NEXT: vmulps %ymm1, %ymm2, %ymm1 # sched: [5:2.00]
1546 ; BDVER2-NEXT: retq # sched: [5:1.00]
1548 ; BTVER2-LABEL: v16f32_one_step_2_divs:
1550 ; BTVER2-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [5:1.00]
1551 ; BTVER2-NEXT: vrcpps %ymm0, %ymm2 # sched: [2:2.00]
1552 ; BTVER2-NEXT: vmulps %ymm2, %ymm0, %ymm0 # sched: [2:2.00]
1553 ; BTVER2-NEXT: vsubps %ymm0, %ymm3, %ymm0 # sched: [3:2.00]
1554 ; BTVER2-NEXT: vmulps %ymm0, %ymm2, %ymm0 # sched: [2:2.00]
1555 ; BTVER2-NEXT: vaddps %ymm0, %ymm2, %ymm0 # sched: [3:2.00]
1556 ; BTVER2-NEXT: vrcpps %ymm1, %ymm2 # sched: [2:2.00]
1557 ; BTVER2-NEXT: vmulps %ymm2, %ymm1, %ymm1 # sched: [2:2.00]
1558 ; BTVER2-NEXT: vsubps %ymm1, %ymm3, %ymm1 # sched: [3:2.00]
1559 ; BTVER2-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm3 # sched: [7:2.00]
1560 ; BTVER2-NEXT: vmulps %ymm1, %ymm2, %ymm1 # sched: [2:2.00]
1561 ; BTVER2-NEXT: vaddps %ymm1, %ymm2, %ymm1 # sched: [3:2.00]
1562 ; BTVER2-NEXT: vmulps {{.*}}(%rip), %ymm1, %ymm2 # sched: [7:2.00]
1563 ; BTVER2-NEXT: vmulps %ymm0, %ymm3, %ymm0 # sched: [2:2.00]
1564 ; BTVER2-NEXT: vmulps %ymm1, %ymm2, %ymm1 # sched: [2:2.00]
1565 ; BTVER2-NEXT: retq # sched: [4:1.00]
1567 ; SANDY-LABEL: v16f32_one_step_2_divs:
1569 ; SANDY-NEXT: vrcpps %ymm0, %ymm2 # sched: [7:2.00]
1570 ; SANDY-NEXT: vmulps %ymm2, %ymm0, %ymm0 # sched: [5:1.00]
1571 ; SANDY-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [7:0.50]
1572 ; SANDY-NEXT: vsubps %ymm0, %ymm3, %ymm0 # sched: [3:1.00]
1573 ; SANDY-NEXT: vrcpps %ymm1, %ymm4 # sched: [7:2.00]
1574 ; SANDY-NEXT: vmulps %ymm0, %ymm2, %ymm0 # sched: [5:1.00]
1575 ; SANDY-NEXT: vaddps %ymm0, %ymm2, %ymm0 # sched: [3:1.00]
1576 ; SANDY-NEXT: vmulps %ymm4, %ymm1, %ymm1 # sched: [5:1.00]
1577 ; SANDY-NEXT: vsubps %ymm1, %ymm3, %ymm1 # sched: [3:1.00]
1578 ; SANDY-NEXT: vmulps %ymm1, %ymm4, %ymm1 # sched: [5:1.00]
1579 ; SANDY-NEXT: vaddps %ymm1, %ymm4, %ymm1 # sched: [3:1.00]
1580 ; SANDY-NEXT: vmulps {{.*}}(%rip), %ymm1, %ymm2 # sched: [12:1.00]
1581 ; SANDY-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm3 # sched: [12:1.00]
1582 ; SANDY-NEXT: vmulps %ymm0, %ymm3, %ymm0 # sched: [5:1.00]
1583 ; SANDY-NEXT: vmulps %ymm1, %ymm2, %ymm1 # sched: [5:1.00]
1584 ; SANDY-NEXT: retq # sched: [1:1.00]
1586 ; HASWELL-LABEL: v16f32_one_step_2_divs:
1588 ; HASWELL-NEXT: vrcpps %ymm0, %ymm2 # sched: [11:2.00]
1589 ; HASWELL-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [7:0.50]
1590 ; HASWELL-NEXT: vfnmadd213ps {{.*#+}} ymm0 = -(ymm2 * ymm0) + ymm3 sched: [5:0.50]
1591 ; HASWELL-NEXT: vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm2) + ymm2 sched: [5:0.50]
1592 ; HASWELL-NEXT: vrcpps %ymm1, %ymm2 # sched: [11:2.00]
1593 ; HASWELL-NEXT: vfnmadd213ps {{.*#+}} ymm1 = -(ymm2 * ymm1) + ymm3 sched: [5:0.50]
1594 ; HASWELL-NEXT: vfmadd132ps {{.*#+}} ymm1 = (ymm1 * ymm2) + ymm2 sched: [5:0.50]
1595 ; HASWELL-NEXT: vmulps {{.*}}(%rip), %ymm1, %ymm2 # sched: [12:0.50]
1596 ; HASWELL-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm3 # sched: [12:0.50]
1597 ; HASWELL-NEXT: vmulps %ymm0, %ymm3, %ymm0 # sched: [5:0.50]
1598 ; HASWELL-NEXT: vmulps %ymm1, %ymm2, %ymm1 # sched: [5:0.50]
1599 ; HASWELL-NEXT: retq # sched: [7:1.00]
1601 ; HASWELL-NO-FMA-LABEL: v16f32_one_step_2_divs:
1602 ; HASWELL-NO-FMA: # %bb.0:
1603 ; HASWELL-NO-FMA-NEXT: vrcpps %ymm0, %ymm2 # sched: [11:2.00]
1604 ; HASWELL-NO-FMA-NEXT: vmulps %ymm2, %ymm0, %ymm0 # sched: [5:0.50]
1605 ; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [7:0.50]
1606 ; HASWELL-NO-FMA-NEXT: vsubps %ymm0, %ymm3, %ymm0 # sched: [3:1.00]
1607 ; HASWELL-NO-FMA-NEXT: vrcpps %ymm1, %ymm4 # sched: [11:2.00]
1608 ; HASWELL-NO-FMA-NEXT: vmulps %ymm0, %ymm2, %ymm0 # sched: [5:0.50]
1609 ; HASWELL-NO-FMA-NEXT: vaddps %ymm0, %ymm2, %ymm0 # sched: [3:1.00]
1610 ; HASWELL-NO-FMA-NEXT: vmulps %ymm4, %ymm1, %ymm1 # sched: [5:0.50]
1611 ; HASWELL-NO-FMA-NEXT: vsubps %ymm1, %ymm3, %ymm1 # sched: [3:1.00]
1612 ; HASWELL-NO-FMA-NEXT: vmulps %ymm1, %ymm4, %ymm1 # sched: [5:0.50]
1613 ; HASWELL-NO-FMA-NEXT: vaddps %ymm1, %ymm4, %ymm1 # sched: [3:1.00]
1614 ; HASWELL-NO-FMA-NEXT: vmulps {{.*}}(%rip), %ymm1, %ymm2 # sched: [12:0.50]
1615 ; HASWELL-NO-FMA-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm3 # sched: [12:0.50]
1616 ; HASWELL-NO-FMA-NEXT: vmulps %ymm0, %ymm3, %ymm0 # sched: [5:0.50]
1617 ; HASWELL-NO-FMA-NEXT: vmulps %ymm1, %ymm2, %ymm1 # sched: [5:0.50]
1618 ; HASWELL-NO-FMA-NEXT: retq # sched: [7:1.00]
1620 ; KNL-LABEL: v16f32_one_step_2_divs:
1622 ; KNL-NEXT: vrcp14ps %zmm0, %zmm1 # sched: [11:2.00]
1623 ; KNL-NEXT: vfnmadd213ps {{.*#+}} zmm0 = -(zmm1 * zmm0) + mem sched: [12:0.50]
1624 ; KNL-NEXT: vfmadd132ps {{.*#+}} zmm0 = (zmm0 * zmm1) + zmm1 sched: [5:0.50]
1625 ; KNL-NEXT: vmulps {{.*}}(%rip), %zmm0, %zmm1 # sched: [12:0.50]
1626 ; KNL-NEXT: vmulps %zmm0, %zmm1, %zmm0 # sched: [5:0.50]
1627 ; KNL-NEXT: retq # sched: [7:1.00]
1629 ; SKX-LABEL: v16f32_one_step_2_divs:
1631 ; SKX-NEXT: vrcp14ps %zmm0, %zmm1 # sched: [4:2.00]
1632 ; SKX-NEXT: vfnmadd213ps {{.*#+}} zmm0 = -(zmm1 * zmm0) + mem sched: [11:0.50]
1633 ; SKX-NEXT: vfmadd132ps {{.*#+}} zmm0 = (zmm0 * zmm1) + zmm1 sched: [4:0.50]
1634 ; SKX-NEXT: vmulps {{.*}}(%rip), %zmm0, %zmm1 # sched: [11:0.50]
1635 ; SKX-NEXT: vmulps %zmm0, %zmm1, %zmm0 # sched: [4:0.50]
1636 ; SKX-NEXT: retq # sched: [7:1.00]
1637 %div = fdiv fast <16 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0, float 9.0, float 10.0, float 11.0, float 12.0, float 13.0, float 14.0, float 15.0, float 16.0>, %x
1638 %div2 = fdiv fast <16 x float> %div, %x
1639 ret <16 x float> %div2
1642 define <16 x float> @v16f32_two_step2(<16 x float> %x) #2 {
1643 ; SSE-LABEL: v16f32_two_step2:
1645 ; SSE-NEXT: movaps %xmm3, %xmm6
1646 ; SSE-NEXT: movaps %xmm2, %xmm5
1647 ; SSE-NEXT: movaps %xmm0, %xmm4
1648 ; SSE-NEXT: rcpps %xmm3, %xmm2
1649 ; SSE-NEXT: mulps %xmm2, %xmm3
1650 ; SSE-NEXT: movaps {{.*#+}} xmm0 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
1651 ; SSE-NEXT: movaps %xmm0, %xmm7
1652 ; SSE-NEXT: subps %xmm3, %xmm7
1653 ; SSE-NEXT: mulps %xmm2, %xmm7
1654 ; SSE-NEXT: addps %xmm2, %xmm7
1655 ; SSE-NEXT: mulps %xmm7, %xmm6
1656 ; SSE-NEXT: movaps %xmm0, %xmm3
1657 ; SSE-NEXT: subps %xmm6, %xmm3
1658 ; SSE-NEXT: mulps %xmm7, %xmm3
1659 ; SSE-NEXT: addps %xmm7, %xmm3
1660 ; SSE-NEXT: rcpps %xmm5, %xmm2
1661 ; SSE-NEXT: movaps %xmm5, %xmm6
1662 ; SSE-NEXT: mulps %xmm2, %xmm6
1663 ; SSE-NEXT: movaps %xmm0, %xmm7
1664 ; SSE-NEXT: subps %xmm6, %xmm7
1665 ; SSE-NEXT: mulps %xmm2, %xmm7
1666 ; SSE-NEXT: addps %xmm2, %xmm7
1667 ; SSE-NEXT: mulps %xmm7, %xmm5
1668 ; SSE-NEXT: movaps %xmm0, %xmm2
1669 ; SSE-NEXT: subps %xmm5, %xmm2
1670 ; SSE-NEXT: mulps %xmm7, %xmm2
1671 ; SSE-NEXT: addps %xmm7, %xmm2
1672 ; SSE-NEXT: rcpps %xmm1, %xmm5
1673 ; SSE-NEXT: movaps %xmm1, %xmm6
1674 ; SSE-NEXT: mulps %xmm5, %xmm6
1675 ; SSE-NEXT: movaps %xmm0, %xmm7
1676 ; SSE-NEXT: subps %xmm6, %xmm7
1677 ; SSE-NEXT: mulps %xmm5, %xmm7
1678 ; SSE-NEXT: addps %xmm5, %xmm7
1679 ; SSE-NEXT: mulps %xmm7, %xmm1
1680 ; SSE-NEXT: movaps %xmm0, %xmm5
1681 ; SSE-NEXT: subps %xmm1, %xmm5
1682 ; SSE-NEXT: mulps %xmm7, %xmm5
1683 ; SSE-NEXT: addps %xmm7, %xmm5
1684 ; SSE-NEXT: rcpps %xmm4, %xmm1
1685 ; SSE-NEXT: movaps %xmm4, %xmm6
1686 ; SSE-NEXT: mulps %xmm1, %xmm6
1687 ; SSE-NEXT: movaps %xmm0, %xmm7
1688 ; SSE-NEXT: subps %xmm6, %xmm7
1689 ; SSE-NEXT: mulps %xmm1, %xmm7
1690 ; SSE-NEXT: addps %xmm1, %xmm7
1691 ; SSE-NEXT: mulps %xmm7, %xmm4
1692 ; SSE-NEXT: subps %xmm4, %xmm0
1693 ; SSE-NEXT: mulps %xmm7, %xmm0
1694 ; SSE-NEXT: addps %xmm7, %xmm0
1695 ; SSE-NEXT: mulps {{.*}}(%rip), %xmm0
1696 ; SSE-NEXT: mulps {{.*}}(%rip), %xmm5
1697 ; SSE-NEXT: mulps {{.*}}(%rip), %xmm2
1698 ; SSE-NEXT: mulps {{.*}}(%rip), %xmm3
1699 ; SSE-NEXT: movaps %xmm5, %xmm1
1702 ; AVX-RECIP-LABEL: v16f32_two_step2:
1703 ; AVX-RECIP: # %bb.0:
1704 ; AVX-RECIP-NEXT: vrcpps %ymm1, %ymm2
1705 ; AVX-RECIP-NEXT: vmulps %ymm2, %ymm1, %ymm3
1706 ; AVX-RECIP-NEXT: vmovaps {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
1707 ; AVX-RECIP-NEXT: vsubps %ymm3, %ymm4, %ymm3
1708 ; AVX-RECIP-NEXT: vmulps %ymm3, %ymm2, %ymm3
1709 ; AVX-RECIP-NEXT: vaddps %ymm3, %ymm2, %ymm2
1710 ; AVX-RECIP-NEXT: vmulps %ymm2, %ymm1, %ymm1
1711 ; AVX-RECIP-NEXT: vsubps %ymm1, %ymm4, %ymm1
1712 ; AVX-RECIP-NEXT: vmulps %ymm1, %ymm2, %ymm1
1713 ; AVX-RECIP-NEXT: vaddps %ymm1, %ymm2, %ymm1
1714 ; AVX-RECIP-NEXT: vrcpps %ymm0, %ymm2
1715 ; AVX-RECIP-NEXT: vmulps %ymm2, %ymm0, %ymm3
1716 ; AVX-RECIP-NEXT: vsubps %ymm3, %ymm4, %ymm3
1717 ; AVX-RECIP-NEXT: vmulps %ymm3, %ymm2, %ymm3
1718 ; AVX-RECIP-NEXT: vaddps %ymm3, %ymm2, %ymm2
1719 ; AVX-RECIP-NEXT: vmulps %ymm2, %ymm0, %ymm0
1720 ; AVX-RECIP-NEXT: vsubps %ymm0, %ymm4, %ymm0
1721 ; AVX-RECIP-NEXT: vmulps %ymm0, %ymm2, %ymm0
1722 ; AVX-RECIP-NEXT: vaddps %ymm0, %ymm2, %ymm0
1723 ; AVX-RECIP-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0
1724 ; AVX-RECIP-NEXT: vmulps {{.*}}(%rip), %ymm1, %ymm1
1725 ; AVX-RECIP-NEXT: retq
1727 ; FMA-RECIP-LABEL: v16f32_two_step2:
1728 ; FMA-RECIP: # %bb.0:
1729 ; FMA-RECIP-NEXT: vrcpps %ymm1, %ymm2
1730 ; FMA-RECIP-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
1731 ; FMA-RECIP-NEXT: vmovaps %ymm2, %ymm4
1732 ; FMA-RECIP-NEXT: vfnmadd213ps {{.*#+}} ymm4 = -(ymm1 * ymm4) + ymm3
1733 ; FMA-RECIP-NEXT: vfmadd132ps {{.*#+}} ymm4 = (ymm4 * ymm2) + ymm2
1734 ; FMA-RECIP-NEXT: vfnmadd213ps {{.*#+}} ymm1 = -(ymm4 * ymm1) + ymm3
1735 ; FMA-RECIP-NEXT: vfmadd132ps {{.*#+}} ymm1 = (ymm1 * ymm4) + ymm4
1736 ; FMA-RECIP-NEXT: vrcpps %ymm0, %ymm2
1737 ; FMA-RECIP-NEXT: vmovaps %ymm2, %ymm4
1738 ; FMA-RECIP-NEXT: vfnmadd213ps {{.*#+}} ymm4 = -(ymm0 * ymm4) + ymm3
1739 ; FMA-RECIP-NEXT: vfmadd132ps {{.*#+}} ymm4 = (ymm4 * ymm2) + ymm2
1740 ; FMA-RECIP-NEXT: vfnmadd213ps {{.*#+}} ymm0 = -(ymm4 * ymm0) + ymm3
1741 ; FMA-RECIP-NEXT: vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm4) + ymm4
1742 ; FMA-RECIP-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0
1743 ; FMA-RECIP-NEXT: vmulps {{.*}}(%rip), %ymm1, %ymm1
1744 ; FMA-RECIP-NEXT: retq
1746 ; BDVER2-LABEL: v16f32_two_step2:
1748 ; BDVER2-NEXT: vrcpps %ymm1, %ymm2 # sched: [5:2.00]
1749 ; BDVER2-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [5:0.50]
1750 ; BDVER2-NEXT: vfnmaddps %ymm3, %ymm2, %ymm1, %ymm4 # sched: [5:0.50]
1751 ; BDVER2-NEXT: vfmaddps %ymm2, %ymm4, %ymm2, %ymm2 # sched: [5:0.50]
1752 ; BDVER2-NEXT: vfnmaddps %ymm3, %ymm2, %ymm1, %ymm1 # sched: [5:0.50]
1753 ; BDVER2-NEXT: vfmaddps %ymm2, %ymm1, %ymm2, %ymm1 # sched: [5:0.50]
1754 ; BDVER2-NEXT: vrcpps %ymm0, %ymm2 # sched: [5:2.00]
1755 ; BDVER2-NEXT: vmulps {{.*}}(%rip), %ymm1, %ymm1 # sched: [10:2.00]
1756 ; BDVER2-NEXT: vfnmaddps %ymm3, %ymm2, %ymm0, %ymm4 # sched: [5:0.50]
1757 ; BDVER2-NEXT: vfmaddps %ymm2, %ymm4, %ymm2, %ymm2 # sched: [5:0.50]
1758 ; BDVER2-NEXT: vfnmaddps %ymm3, %ymm2, %ymm0, %ymm0 # sched: [5:0.50]
1759 ; BDVER2-NEXT: vfmaddps %ymm2, %ymm0, %ymm2, %ymm0 # sched: [5:0.50]
1760 ; BDVER2-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [10:2.00]
1761 ; BDVER2-NEXT: retq # sched: [5:1.00]
1763 ; BTVER2-LABEL: v16f32_two_step2:
1765 ; BTVER2-NEXT: vmovaps {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [5:1.00]
1766 ; BTVER2-NEXT: vrcpps %ymm1, %ymm2 # sched: [2:2.00]
1767 ; BTVER2-NEXT: vmulps %ymm2, %ymm1, %ymm3 # sched: [2:2.00]
1768 ; BTVER2-NEXT: vsubps %ymm3, %ymm4, %ymm3 # sched: [3:2.00]
1769 ; BTVER2-NEXT: vmulps %ymm3, %ymm2, %ymm3 # sched: [2:2.00]
1770 ; BTVER2-NEXT: vaddps %ymm3, %ymm2, %ymm2 # sched: [3:2.00]
1771 ; BTVER2-NEXT: vmulps %ymm2, %ymm1, %ymm1 # sched: [2:2.00]
1772 ; BTVER2-NEXT: vsubps %ymm1, %ymm4, %ymm1 # sched: [3:2.00]
1773 ; BTVER2-NEXT: vmulps %ymm1, %ymm2, %ymm1 # sched: [2:2.00]
1774 ; BTVER2-NEXT: vaddps %ymm1, %ymm2, %ymm1 # sched: [3:2.00]
1775 ; BTVER2-NEXT: vrcpps %ymm0, %ymm2 # sched: [2:2.00]
1776 ; BTVER2-NEXT: vmulps {{.*}}(%rip), %ymm1, %ymm1 # sched: [7:2.00]
1777 ; BTVER2-NEXT: vmulps %ymm2, %ymm0, %ymm3 # sched: [2:2.00]
1778 ; BTVER2-NEXT: vsubps %ymm3, %ymm4, %ymm3 # sched: [3:2.00]
1779 ; BTVER2-NEXT: vmulps %ymm3, %ymm2, %ymm3 # sched: [2:2.00]
1780 ; BTVER2-NEXT: vaddps %ymm3, %ymm2, %ymm2 # sched: [3:2.00]
1781 ; BTVER2-NEXT: vmulps %ymm2, %ymm0, %ymm0 # sched: [2:2.00]
1782 ; BTVER2-NEXT: vsubps %ymm0, %ymm4, %ymm0 # sched: [3:2.00]
1783 ; BTVER2-NEXT: vmulps %ymm0, %ymm2, %ymm0 # sched: [2:2.00]
1784 ; BTVER2-NEXT: vaddps %ymm0, %ymm2, %ymm0 # sched: [3:2.00]
1785 ; BTVER2-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [7:2.00]
1786 ; BTVER2-NEXT: retq # sched: [4:1.00]
1788 ; SANDY-LABEL: v16f32_two_step2:
1790 ; SANDY-NEXT: vrcpps %ymm1, %ymm2 # sched: [7:2.00]
1791 ; SANDY-NEXT: vmulps %ymm2, %ymm1, %ymm3 # sched: [5:1.00]
1792 ; SANDY-NEXT: vmovaps {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [7:0.50]
1793 ; SANDY-NEXT: vsubps %ymm3, %ymm4, %ymm3 # sched: [3:1.00]
1794 ; SANDY-NEXT: vmulps %ymm3, %ymm2, %ymm3 # sched: [5:1.00]
1795 ; SANDY-NEXT: vaddps %ymm3, %ymm2, %ymm2 # sched: [3:1.00]
1796 ; SANDY-NEXT: vmulps %ymm2, %ymm1, %ymm1 # sched: [5:1.00]
1797 ; SANDY-NEXT: vsubps %ymm1, %ymm4, %ymm1 # sched: [3:1.00]
1798 ; SANDY-NEXT: vmulps %ymm1, %ymm2, %ymm1 # sched: [5:1.00]
1799 ; SANDY-NEXT: vaddps %ymm1, %ymm2, %ymm1 # sched: [3:1.00]
1800 ; SANDY-NEXT: vrcpps %ymm0, %ymm2 # sched: [7:2.00]
1801 ; SANDY-NEXT: vmulps %ymm2, %ymm0, %ymm3 # sched: [5:1.00]
1802 ; SANDY-NEXT: vsubps %ymm3, %ymm4, %ymm3 # sched: [3:1.00]
1803 ; SANDY-NEXT: vmulps %ymm3, %ymm2, %ymm3 # sched: [5:1.00]
1804 ; SANDY-NEXT: vaddps %ymm3, %ymm2, %ymm2 # sched: [3:1.00]
1805 ; SANDY-NEXT: vmulps %ymm2, %ymm0, %ymm0 # sched: [5:1.00]
1806 ; SANDY-NEXT: vsubps %ymm0, %ymm4, %ymm0 # sched: [3:1.00]
1807 ; SANDY-NEXT: vmulps %ymm0, %ymm2, %ymm0 # sched: [5:1.00]
1808 ; SANDY-NEXT: vaddps %ymm0, %ymm2, %ymm0 # sched: [3:1.00]
1809 ; SANDY-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [12:1.00]
1810 ; SANDY-NEXT: vmulps {{.*}}(%rip), %ymm1, %ymm1 # sched: [12:1.00]
1811 ; SANDY-NEXT: retq # sched: [1:1.00]
1813 ; HASWELL-LABEL: v16f32_two_step2:
1815 ; HASWELL-NEXT: vrcpps %ymm1, %ymm2 # sched: [11:2.00]
1816 ; HASWELL-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [7:0.50]
1817 ; HASWELL-NEXT: vmovaps %ymm2, %ymm4 # sched: [1:1.00]
1818 ; HASWELL-NEXT: vfnmadd213ps {{.*#+}} ymm4 = -(ymm1 * ymm4) + ymm3 sched: [5:0.50]
1819 ; HASWELL-NEXT: vfmadd132ps {{.*#+}} ymm4 = (ymm4 * ymm2) + ymm2 sched: [5:0.50]
1820 ; HASWELL-NEXT: vfnmadd213ps {{.*#+}} ymm1 = -(ymm4 * ymm1) + ymm3 sched: [5:0.50]
1821 ; HASWELL-NEXT: vfmadd132ps {{.*#+}} ymm1 = (ymm1 * ymm4) + ymm4 sched: [5:0.50]
1822 ; HASWELL-NEXT: vrcpps %ymm0, %ymm2 # sched: [11:2.00]
1823 ; HASWELL-NEXT: vmovaps %ymm2, %ymm4 # sched: [1:1.00]
1824 ; HASWELL-NEXT: vfnmadd213ps {{.*#+}} ymm4 = -(ymm0 * ymm4) + ymm3 sched: [5:0.50]
1825 ; HASWELL-NEXT: vfmadd132ps {{.*#+}} ymm4 = (ymm4 * ymm2) + ymm2 sched: [5:0.50]
1826 ; HASWELL-NEXT: vfnmadd213ps {{.*#+}} ymm0 = -(ymm4 * ymm0) + ymm3 sched: [5:0.50]
1827 ; HASWELL-NEXT: vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm4) + ymm4 sched: [5:0.50]
1828 ; HASWELL-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [12:0.50]
1829 ; HASWELL-NEXT: vmulps {{.*}}(%rip), %ymm1, %ymm1 # sched: [12:0.50]
1830 ; HASWELL-NEXT: retq # sched: [7:1.00]
1832 ; HASWELL-NO-FMA-LABEL: v16f32_two_step2:
1833 ; HASWELL-NO-FMA: # %bb.0:
1834 ; HASWELL-NO-FMA-NEXT: vrcpps %ymm1, %ymm2 # sched: [11:2.00]
1835 ; HASWELL-NO-FMA-NEXT: vmulps %ymm2, %ymm1, %ymm3 # sched: [5:0.50]
1836 ; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [7:0.50]
1837 ; HASWELL-NO-FMA-NEXT: vsubps %ymm3, %ymm4, %ymm3 # sched: [3:1.00]
1838 ; HASWELL-NO-FMA-NEXT: vmulps %ymm3, %ymm2, %ymm3 # sched: [5:0.50]
1839 ; HASWELL-NO-FMA-NEXT: vaddps %ymm3, %ymm2, %ymm2 # sched: [3:1.00]
1840 ; HASWELL-NO-FMA-NEXT: vmulps %ymm2, %ymm1, %ymm1 # sched: [5:0.50]
1841 ; HASWELL-NO-FMA-NEXT: vsubps %ymm1, %ymm4, %ymm1 # sched: [3:1.00]
1842 ; HASWELL-NO-FMA-NEXT: vmulps %ymm1, %ymm2, %ymm1 # sched: [5:0.50]
1843 ; HASWELL-NO-FMA-NEXT: vaddps %ymm1, %ymm2, %ymm1 # sched: [3:1.00]
1844 ; HASWELL-NO-FMA-NEXT: vrcpps %ymm0, %ymm2 # sched: [11:2.00]
1845 ; HASWELL-NO-FMA-NEXT: vmulps %ymm2, %ymm0, %ymm3 # sched: [5:0.50]
1846 ; HASWELL-NO-FMA-NEXT: vsubps %ymm3, %ymm4, %ymm3 # sched: [3:1.00]
1847 ; HASWELL-NO-FMA-NEXT: vmulps %ymm3, %ymm2, %ymm3 # sched: [5:0.50]
1848 ; HASWELL-NO-FMA-NEXT: vaddps %ymm3, %ymm2, %ymm2 # sched: [3:1.00]
1849 ; HASWELL-NO-FMA-NEXT: vmulps %ymm2, %ymm0, %ymm0 # sched: [5:0.50]
1850 ; HASWELL-NO-FMA-NEXT: vsubps %ymm0, %ymm4, %ymm0 # sched: [3:1.00]
1851 ; HASWELL-NO-FMA-NEXT: vmulps %ymm0, %ymm2, %ymm0 # sched: [5:0.50]
1852 ; HASWELL-NO-FMA-NEXT: vaddps %ymm0, %ymm2, %ymm0 # sched: [3:1.00]
1853 ; HASWELL-NO-FMA-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [12:0.50]
1854 ; HASWELL-NO-FMA-NEXT: vmulps {{.*}}(%rip), %ymm1, %ymm1 # sched: [12:0.50]
1855 ; HASWELL-NO-FMA-NEXT: retq # sched: [7:1.00]
1857 ; KNL-LABEL: v16f32_two_step2:
1859 ; KNL-NEXT: vrcp14ps %zmm0, %zmm1 # sched: [11:2.00]
1860 ; KNL-NEXT: vbroadcastss {{.*#+}} zmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [10:1.00]
1861 ; KNL-NEXT: vmovaps %zmm1, %zmm3 # sched: [1:1.00]
1862 ; KNL-NEXT: vfnmadd213ps {{.*#+}} zmm3 = -(zmm0 * zmm3) + zmm2 sched: [5:0.50]
1863 ; KNL-NEXT: vfmadd132ps {{.*#+}} zmm3 = (zmm3 * zmm1) + zmm1 sched: [5:0.50]
1864 ; KNL-NEXT: vfnmadd213ps {{.*#+}} zmm0 = -(zmm3 * zmm0) + zmm2 sched: [5:0.50]
1865 ; KNL-NEXT: vfmadd132ps {{.*#+}} zmm0 = (zmm0 * zmm3) + zmm3 sched: [5:0.50]
1866 ; KNL-NEXT: vmulps {{.*}}(%rip), %zmm0, %zmm0 # sched: [12:0.50]
1867 ; KNL-NEXT: retq # sched: [7:1.00]
1869 ; SKX-LABEL: v16f32_two_step2:
1871 ; SKX-NEXT: vrcp14ps %zmm0, %zmm1 # sched: [4:2.00]
1872 ; SKX-NEXT: vbroadcastss {{.*#+}} zmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [8:0.50]
1873 ; SKX-NEXT: vmovaps %zmm1, %zmm3 # sched: [1:0.33]
1874 ; SKX-NEXT: vfnmadd213ps {{.*#+}} zmm3 = -(zmm0 * zmm3) + zmm2 sched: [4:0.50]
1875 ; SKX-NEXT: vfmadd132ps {{.*#+}} zmm3 = (zmm3 * zmm1) + zmm1 sched: [4:0.50]
1876 ; SKX-NEXT: vfnmadd213ps {{.*#+}} zmm0 = -(zmm3 * zmm0) + zmm2 sched: [4:0.50]
1877 ; SKX-NEXT: vfmadd132ps {{.*#+}} zmm0 = (zmm0 * zmm3) + zmm3 sched: [4:0.50]
1878 ; SKX-NEXT: vmulps {{.*}}(%rip), %zmm0, %zmm0 # sched: [11:0.50]
1879 ; SKX-NEXT: retq # sched: [7:1.00]
1880 %div = fdiv fast <16 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0, float 9.0, float 10.0, float 11.0, float 12.0, float 13.0, float 14.0, float 15.0, float 16.0>, %x
1881 ret <16 x float> %div
1884 define <16 x float> @v16f32_no_step(<16 x float> %x) #3 {
1885 ; SSE-LABEL: v16f32_no_step:
1887 ; SSE-NEXT: rcpps %xmm0, %xmm0
1888 ; SSE-NEXT: rcpps %xmm1, %xmm1
1889 ; SSE-NEXT: rcpps %xmm2, %xmm2
1890 ; SSE-NEXT: rcpps %xmm3, %xmm3
1893 ; AVX-RECIP-LABEL: v16f32_no_step:
1894 ; AVX-RECIP: # %bb.0:
1895 ; AVX-RECIP-NEXT: vrcpps %ymm0, %ymm0
1896 ; AVX-RECIP-NEXT: vrcpps %ymm1, %ymm1
1897 ; AVX-RECIP-NEXT: retq
1899 ; FMA-RECIP-LABEL: v16f32_no_step:
1900 ; FMA-RECIP: # %bb.0:
1901 ; FMA-RECIP-NEXT: vrcpps %ymm0, %ymm0
1902 ; FMA-RECIP-NEXT: vrcpps %ymm1, %ymm1
1903 ; FMA-RECIP-NEXT: retq
1905 ; BDVER2-LABEL: v16f32_no_step:
1907 ; BDVER2-NEXT: vrcpps %ymm0, %ymm0 # sched: [5:2.00]
1908 ; BDVER2-NEXT: vrcpps %ymm1, %ymm1 # sched: [5:2.00]
1909 ; BDVER2-NEXT: retq # sched: [5:1.00]
1911 ; BTVER2-LABEL: v16f32_no_step:
1913 ; BTVER2-NEXT: vrcpps %ymm0, %ymm0 # sched: [2:2.00]
1914 ; BTVER2-NEXT: vrcpps %ymm1, %ymm1 # sched: [2:2.00]
1915 ; BTVER2-NEXT: retq # sched: [4:1.00]
1917 ; SANDY-LABEL: v16f32_no_step:
1919 ; SANDY-NEXT: vrcpps %ymm0, %ymm0 # sched: [7:2.00]
1920 ; SANDY-NEXT: vrcpps %ymm1, %ymm1 # sched: [7:2.00]
1921 ; SANDY-NEXT: retq # sched: [1:1.00]
1923 ; HASWELL-LABEL: v16f32_no_step:
1925 ; HASWELL-NEXT: vrcpps %ymm0, %ymm0 # sched: [11:2.00]
1926 ; HASWELL-NEXT: vrcpps %ymm1, %ymm1 # sched: [11:2.00]
1927 ; HASWELL-NEXT: retq # sched: [7:1.00]
1929 ; HASWELL-NO-FMA-LABEL: v16f32_no_step:
1930 ; HASWELL-NO-FMA: # %bb.0:
1931 ; HASWELL-NO-FMA-NEXT: vrcpps %ymm0, %ymm0 # sched: [11:2.00]
1932 ; HASWELL-NO-FMA-NEXT: vrcpps %ymm1, %ymm1 # sched: [11:2.00]
1933 ; HASWELL-NO-FMA-NEXT: retq # sched: [7:1.00]
1935 ; KNL-LABEL: v16f32_no_step:
1937 ; KNL-NEXT: vrcp14ps %zmm0, %zmm0 # sched: [11:2.00]
1938 ; KNL-NEXT: retq # sched: [7:1.00]
1940 ; SKX-LABEL: v16f32_no_step:
1942 ; SKX-NEXT: vrcp14ps %zmm0, %zmm0 # sched: [4:2.00]
1943 ; SKX-NEXT: retq # sched: [7:1.00]
1944 %div = fdiv fast <16 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %x
1945 ret <16 x float> %div
1948 define <16 x float> @v16f32_no_step2(<16 x float> %x) #3 {
1949 ; SSE-LABEL: v16f32_no_step2:
1951 ; SSE-NEXT: rcpps %xmm3, %xmm3
1952 ; SSE-NEXT: rcpps %xmm2, %xmm2
1953 ; SSE-NEXT: rcpps %xmm1, %xmm1
1954 ; SSE-NEXT: rcpps %xmm0, %xmm0
1955 ; SSE-NEXT: mulps {{.*}}(%rip), %xmm0
1956 ; SSE-NEXT: mulps {{.*}}(%rip), %xmm1
1957 ; SSE-NEXT: mulps {{.*}}(%rip), %xmm2
1958 ; SSE-NEXT: mulps {{.*}}(%rip), %xmm3
1961 ; AVX-RECIP-LABEL: v16f32_no_step2:
1962 ; AVX-RECIP: # %bb.0:
1963 ; AVX-RECIP-NEXT: vrcpps %ymm1, %ymm1
1964 ; AVX-RECIP-NEXT: vrcpps %ymm0, %ymm0
1965 ; AVX-RECIP-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0
1966 ; AVX-RECIP-NEXT: vmulps {{.*}}(%rip), %ymm1, %ymm1
1967 ; AVX-RECIP-NEXT: retq
1969 ; FMA-RECIP-LABEL: v16f32_no_step2:
1970 ; FMA-RECIP: # %bb.0:
1971 ; FMA-RECIP-NEXT: vrcpps %ymm1, %ymm1
1972 ; FMA-RECIP-NEXT: vrcpps %ymm0, %ymm0
1973 ; FMA-RECIP-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0
1974 ; FMA-RECIP-NEXT: vmulps {{.*}}(%rip), %ymm1, %ymm1
1975 ; FMA-RECIP-NEXT: retq
1977 ; BDVER2-LABEL: v16f32_no_step2:
1979 ; BDVER2-NEXT: vrcpps %ymm1, %ymm1 # sched: [5:2.00]
1980 ; BDVER2-NEXT: vrcpps %ymm0, %ymm0 # sched: [5:2.00]
1981 ; BDVER2-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [10:2.00]
1982 ; BDVER2-NEXT: vmulps {{.*}}(%rip), %ymm1, %ymm1 # sched: [10:2.00]
1983 ; BDVER2-NEXT: retq # sched: [5:1.00]
1985 ; BTVER2-LABEL: v16f32_no_step2:
1987 ; BTVER2-NEXT: vrcpps %ymm1, %ymm1 # sched: [2:2.00]
1988 ; BTVER2-NEXT: vrcpps %ymm0, %ymm0 # sched: [2:2.00]
1989 ; BTVER2-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [7:2.00]
1990 ; BTVER2-NEXT: vmulps {{.*}}(%rip), %ymm1, %ymm1 # sched: [7:2.00]
1991 ; BTVER2-NEXT: retq # sched: [4:1.00]
1993 ; SANDY-LABEL: v16f32_no_step2:
1995 ; SANDY-NEXT: vrcpps %ymm1, %ymm1 # sched: [7:2.00]
1996 ; SANDY-NEXT: vrcpps %ymm0, %ymm0 # sched: [7:2.00]
1997 ; SANDY-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [12:1.00]
1998 ; SANDY-NEXT: vmulps {{.*}}(%rip), %ymm1, %ymm1 # sched: [12:1.00]
1999 ; SANDY-NEXT: retq # sched: [1:1.00]
2001 ; HASWELL-LABEL: v16f32_no_step2:
2003 ; HASWELL-NEXT: vrcpps %ymm1, %ymm1 # sched: [11:2.00]
2004 ; HASWELL-NEXT: vrcpps %ymm0, %ymm0 # sched: [11:2.00]
2005 ; HASWELL-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [12:0.50]
2006 ; HASWELL-NEXT: vmulps {{.*}}(%rip), %ymm1, %ymm1 # sched: [12:0.50]
2007 ; HASWELL-NEXT: retq # sched: [7:1.00]
2009 ; HASWELL-NO-FMA-LABEL: v16f32_no_step2:
2010 ; HASWELL-NO-FMA: # %bb.0:
2011 ; HASWELL-NO-FMA-NEXT: vrcpps %ymm1, %ymm1 # sched: [11:2.00]
2012 ; HASWELL-NO-FMA-NEXT: vrcpps %ymm0, %ymm0 # sched: [11:2.00]
2013 ; HASWELL-NO-FMA-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [12:0.50]
2014 ; HASWELL-NO-FMA-NEXT: vmulps {{.*}}(%rip), %ymm1, %ymm1 # sched: [12:0.50]
2015 ; HASWELL-NO-FMA-NEXT: retq # sched: [7:1.00]
2017 ; KNL-LABEL: v16f32_no_step2:
2019 ; KNL-NEXT: vrcp14ps %zmm0, %zmm0 # sched: [11:2.00]
2020 ; KNL-NEXT: vmulps {{.*}}(%rip), %zmm0, %zmm0 # sched: [12:0.50]
2021 ; KNL-NEXT: retq # sched: [7:1.00]
2023 ; SKX-LABEL: v16f32_no_step2:
2025 ; SKX-NEXT: vrcp14ps %zmm0, %zmm0 # sched: [4:2.00]
2026 ; SKX-NEXT: vmulps {{.*}}(%rip), %zmm0, %zmm0 # sched: [11:0.50]
2027 ; SKX-NEXT: retq # sched: [7:1.00]
2028 %div = fdiv fast <16 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0, float 9.0, float 10.0, float 11.0, float 12.0, float 13.0, float 14.0, float 15.0, float 16.0>, %x
2029 ret <16 x float> %div
2032 attributes #0 = { "unsafe-fp-math"="true" "reciprocal-estimates"="!divf,!vec-divf" }
2033 attributes #1 = { "unsafe-fp-math"="true" "reciprocal-estimates"="divf,vec-divf" }
2034 attributes #2 = { "unsafe-fp-math"="true" "reciprocal-estimates"="divf:2,vec-divf:2" }
2035 attributes #3 = { "unsafe-fp-math"="true" "reciprocal-estimates"="divf:0,vec-divf:0" }