; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX-RECIP
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fma | FileCheck %s --check-prefixes=AVX,FMA-RECIP
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=bdver2 | FileCheck %s --check-prefixes=AVX,BDVER2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=btver2 | FileCheck %s --check-prefixes=AVX,BTVER2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=sandybridge | FileCheck %s --check-prefixes=AVX,SANDY
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=haswell | FileCheck %s --check-prefixes=AVX,HASWELL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=haswell -mattr=-fma | FileCheck %s --check-prefixes=AVX,HASWELL-NO-FMA
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl | FileCheck %s --check-prefixes=AVX,AVX512,KNL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=skx | FileCheck %s --check-prefixes=AVX,AVX512,SKX

; Extra test coverage for recip codegen, as discussed on D26855.
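;
; A note on the expected math, as an informal sketch (names like x, c, r0, r1
; are not part of the test): each fdiv below is expanded using the hardware
; reciprocal estimate r0 = rcp(x), optionally refined by Newton-Raphson steps
; of the form
;   r1 = r0 + r0 * (1 - x * r0)
; The no_step / one_step / two_step functions request 0, 1, or 2 such steps,
; presumably via the reciprocal-estimates function attributes #1-#3, which
; are defined outside this excerpt.
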
define float @f32_no_step_2(float %x) #3 {
; SSE-LABEL: f32_no_step_2:
; SSE: # %bb.0:
; SSE-NEXT: rcpss %xmm0, %xmm0
; SSE-NEXT: mulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: f32_no_step_2:
; AVX: # %bb.0:
; AVX-NEXT: vrcpss %xmm0, %xmm0, %xmm0
; AVX-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT: retq
  %div = fdiv fast float 1234.0, %x
  ret float %div
}

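; One refinement step, with the constant numerator folded into the estimate.
; A sketch of the expected expansion (t and r are scratch names, not from the
; test): for c = 3456.0 and r0 = rcp(x),
;   t = c * r0
;   r = t + r0 * (c - x * t)   ; == c * (r0 + r0 * (1 - x * r0))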
define float @f32_one_step_2(float %x) #1 {
; SSE-LABEL: f32_one_step_2:
; SSE: # %bb.0:
; SSE-NEXT: rcpss %xmm0, %xmm2
; SSE-NEXT: movss {{.*#+}} xmm1 = [3.456E+3,0.0E+0,0.0E+0,0.0E+0]
; SSE-NEXT: movaps %xmm2, %xmm3
; SSE-NEXT: mulss %xmm1, %xmm3
; SSE-NEXT: mulss %xmm3, %xmm0
; SSE-NEXT: subss %xmm0, %xmm1
; SSE-NEXT: mulss %xmm2, %xmm1
; SSE-NEXT: addss %xmm3, %xmm1
; SSE-NEXT: movaps %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-RECIP-LABEL: f32_one_step_2:
; AVX-RECIP: # %bb.0:
; AVX-RECIP-NEXT: vrcpss %xmm0, %xmm0, %xmm1
; AVX-RECIP-NEXT: vmovss {{.*#+}} xmm2 = [3.456E+3,0.0E+0,0.0E+0,0.0E+0]
; AVX-RECIP-NEXT: vmulss %xmm2, %xmm1, %xmm3
; AVX-RECIP-NEXT: vmulss %xmm3, %xmm0, %xmm0
; AVX-RECIP-NEXT: vsubss %xmm0, %xmm2, %xmm0
; AVX-RECIP-NEXT: vmulss %xmm0, %xmm1, %xmm0
; AVX-RECIP-NEXT: vaddss %xmm0, %xmm3, %xmm0
; AVX-RECIP-NEXT: retq
;
; FMA-RECIP-LABEL: f32_one_step_2:
; FMA-RECIP: # %bb.0:
; FMA-RECIP-NEXT: vrcpss %xmm0, %xmm0, %xmm1
; FMA-RECIP-NEXT: vmovss {{.*#+}} xmm2 = [3.456E+3,0.0E+0,0.0E+0,0.0E+0]
; FMA-RECIP-NEXT: vmulss %xmm2, %xmm1, %xmm3
; FMA-RECIP-NEXT: vfmsub213ss {{.*#+}} xmm0 = (xmm3 * xmm0) - xmm2
; FMA-RECIP-NEXT: vfnmadd213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm3
; FMA-RECIP-NEXT: retq
;
; BDVER2-LABEL: f32_one_step_2:
; BDVER2: # %bb.0:
; BDVER2-NEXT: vrcpss %xmm0, %xmm0, %xmm1
; BDVER2-NEXT: vmovss {{.*#+}} xmm2 = [3.456E+3,0.0E+0,0.0E+0,0.0E+0]
; BDVER2-NEXT: vmulss %xmm2, %xmm1, %xmm3
; BDVER2-NEXT: vfmsubss {{.*#+}} xmm0 = (xmm0 * xmm3) - xmm2
; BDVER2-NEXT: vfnmaddss {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm3
; BDVER2-NEXT: retq
;
; BTVER2-LABEL: f32_one_step_2:
; BTVER2: # %bb.0:
; BTVER2-NEXT: vmovss {{.*#+}} xmm2 = [3.456E+3,0.0E+0,0.0E+0,0.0E+0]
; BTVER2-NEXT: vrcpss %xmm0, %xmm0, %xmm1
; BTVER2-NEXT: vmulss %xmm2, %xmm1, %xmm3
; BTVER2-NEXT: vmulss %xmm3, %xmm0, %xmm0
; BTVER2-NEXT: vsubss %xmm0, %xmm2, %xmm0
; BTVER2-NEXT: vmulss %xmm0, %xmm1, %xmm0
; BTVER2-NEXT: vaddss %xmm0, %xmm3, %xmm0
; BTVER2-NEXT: retq
;
; SANDY-LABEL: f32_one_step_2:
; SANDY: # %bb.0:
; SANDY-NEXT: vrcpss %xmm0, %xmm0, %xmm1
; SANDY-NEXT: vmovss {{.*#+}} xmm2 = [3.456E+3,0.0E+0,0.0E+0,0.0E+0]
; SANDY-NEXT: vmulss %xmm2, %xmm1, %xmm3
; SANDY-NEXT: vmulss %xmm3, %xmm0, %xmm0
; SANDY-NEXT: vsubss %xmm0, %xmm2, %xmm0
; SANDY-NEXT: vmulss %xmm0, %xmm1, %xmm0
; SANDY-NEXT: vaddss %xmm0, %xmm3, %xmm0
; SANDY-NEXT: retq
;
; HASWELL-LABEL: f32_one_step_2:
; HASWELL: # %bb.0:
; HASWELL-NEXT: vrcpss %xmm0, %xmm0, %xmm1
; HASWELL-NEXT: vmovss {{.*#+}} xmm2 = [3.456E+3,0.0E+0,0.0E+0,0.0E+0]
; HASWELL-NEXT: vmulss %xmm2, %xmm1, %xmm3
; HASWELL-NEXT: vfmsub213ss {{.*#+}} xmm0 = (xmm3 * xmm0) - xmm2
; HASWELL-NEXT: vfnmadd213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm3
; HASWELL-NEXT: retq
;
; HASWELL-NO-FMA-LABEL: f32_one_step_2:
; HASWELL-NO-FMA: # %bb.0:
; HASWELL-NO-FMA-NEXT: vrcpss %xmm0, %xmm0, %xmm1
; HASWELL-NO-FMA-NEXT: vmovss {{.*#+}} xmm2 = [3.456E+3,0.0E+0,0.0E+0,0.0E+0]
; HASWELL-NO-FMA-NEXT: vmulss %xmm2, %xmm1, %xmm3
; HASWELL-NO-FMA-NEXT: vmulss %xmm3, %xmm0, %xmm0
; HASWELL-NO-FMA-NEXT: vsubss %xmm0, %xmm2, %xmm0
; HASWELL-NO-FMA-NEXT: vmulss %xmm0, %xmm1, %xmm0
; HASWELL-NO-FMA-NEXT: vaddss %xmm0, %xmm3, %xmm0
; HASWELL-NO-FMA-NEXT: retq
;
; AVX512-LABEL: f32_one_step_2:
; AVX512: # %bb.0:
; AVX512-NEXT: vrcpss %xmm0, %xmm0, %xmm1
; AVX512-NEXT: vmovss {{.*#+}} xmm2 = [3.456E+3,0.0E+0,0.0E+0,0.0E+0]
; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm3
; AVX512-NEXT: vfmsub213ss {{.*#+}} xmm0 = (xmm3 * xmm0) - xmm2
; AVX512-NEXT: vfnmadd213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm3
; AVX512-NEXT: retq
  %div = fdiv fast float 3456.0, %x
  ret float %div
}

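; Two divisions by the same divisor: c/x, then (c/x)/x. The reciprocal should
; be computed and refined once and reused for both divides; a sketch of the
; expected tail (informal names, not from the test):
;   r = refine(rcp(x))      ; one Newton-Raphson step
;   result = (c * r) * r    ; == c * (1/x)^2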
define float @f32_one_step_2_divs(float %x) #1 {
; SSE-LABEL: f32_one_step_2_divs:
; SSE: # %bb.0:
; SSE-NEXT: rcpss %xmm0, %xmm1
; SSE-NEXT: mulss %xmm1, %xmm0
; SSE-NEXT: movss {{.*#+}} xmm2 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0]
; SSE-NEXT: subss %xmm0, %xmm2
; SSE-NEXT: mulss %xmm1, %xmm2
; SSE-NEXT: addss %xmm1, %xmm2
; SSE-NEXT: movss {{.*#+}} xmm0 = [3.456E+3,0.0E+0,0.0E+0,0.0E+0]
; SSE-NEXT: mulss %xmm2, %xmm0
; SSE-NEXT: mulss %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX-RECIP-LABEL: f32_one_step_2_divs:
; AVX-RECIP: # %bb.0:
; AVX-RECIP-NEXT: vrcpss %xmm0, %xmm0, %xmm1
; AVX-RECIP-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX-RECIP-NEXT: vmovss {{.*#+}} xmm2 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0]
; AVX-RECIP-NEXT: vsubss %xmm0, %xmm2, %xmm0
; AVX-RECIP-NEXT: vmulss %xmm0, %xmm1, %xmm0
; AVX-RECIP-NEXT: vaddss %xmm0, %xmm1, %xmm0
; AVX-RECIP-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; AVX-RECIP-NEXT: vmulss %xmm0, %xmm1, %xmm0
; AVX-RECIP-NEXT: retq
;
; FMA-RECIP-LABEL: f32_one_step_2_divs:
; FMA-RECIP: # %bb.0:
; FMA-RECIP-NEXT: vrcpss %xmm0, %xmm0, %xmm1
; FMA-RECIP-NEXT: vfmsub213ss {{.*#+}} xmm0 = (xmm1 * xmm0) - mem
; FMA-RECIP-NEXT: vfnmadd132ss {{.*#+}} xmm0 = -(xmm0 * xmm1) + xmm1
; FMA-RECIP-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; FMA-RECIP-NEXT: vmulss %xmm0, %xmm1, %xmm0
; FMA-RECIP-NEXT: retq
;
; BDVER2-LABEL: f32_one_step_2_divs:
; BDVER2: # %bb.0:
; BDVER2-NEXT: vrcpss %xmm0, %xmm0, %xmm1
; BDVER2-NEXT: vfmsubss {{.*#+}} xmm0 = (xmm0 * xmm1) - mem
; BDVER2-NEXT: vfnmaddss {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm1
; BDVER2-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; BDVER2-NEXT: vmulss %xmm0, %xmm1, %xmm0
; BDVER2-NEXT: retq
;
; BTVER2-LABEL: f32_one_step_2_divs:
; BTVER2: # %bb.0:
; BTVER2-NEXT: vmovss {{.*#+}} xmm2 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0]
; BTVER2-NEXT: vrcpss %xmm0, %xmm0, %xmm1
; BTVER2-NEXT: vmulss %xmm1, %xmm0, %xmm0
; BTVER2-NEXT: vsubss %xmm0, %xmm2, %xmm0
; BTVER2-NEXT: vmulss %xmm0, %xmm1, %xmm0
; BTVER2-NEXT: vaddss %xmm0, %xmm1, %xmm0
; BTVER2-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; BTVER2-NEXT: vmulss %xmm0, %xmm1, %xmm0
; BTVER2-NEXT: retq
;
; SANDY-LABEL: f32_one_step_2_divs:
; SANDY: # %bb.0:
; SANDY-NEXT: vrcpss %xmm0, %xmm0, %xmm1
; SANDY-NEXT: vmulss %xmm1, %xmm0, %xmm0
; SANDY-NEXT: vmovss {{.*#+}} xmm2 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0]
; SANDY-NEXT: vsubss %xmm0, %xmm2, %xmm0
; SANDY-NEXT: vmulss %xmm0, %xmm1, %xmm0
; SANDY-NEXT: vaddss %xmm0, %xmm1, %xmm0
; SANDY-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; SANDY-NEXT: vmulss %xmm0, %xmm1, %xmm0
; SANDY-NEXT: retq
;
; HASWELL-LABEL: f32_one_step_2_divs:
; HASWELL: # %bb.0:
; HASWELL-NEXT: vrcpss %xmm0, %xmm0, %xmm1
; HASWELL-NEXT: vfmsub213ss {{.*#+}} xmm0 = (xmm1 * xmm0) - mem
; HASWELL-NEXT: vfnmadd132ss {{.*#+}} xmm0 = -(xmm0 * xmm1) + xmm1
; HASWELL-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; HASWELL-NEXT: vmulss %xmm0, %xmm1, %xmm0
; HASWELL-NEXT: retq
;
; HASWELL-NO-FMA-LABEL: f32_one_step_2_divs:
; HASWELL-NO-FMA: # %bb.0:
; HASWELL-NO-FMA-NEXT: vrcpss %xmm0, %xmm0, %xmm1
; HASWELL-NO-FMA-NEXT: vmulss %xmm1, %xmm0, %xmm0
; HASWELL-NO-FMA-NEXT: vmovss {{.*#+}} xmm2 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0]
; HASWELL-NO-FMA-NEXT: vsubss %xmm0, %xmm2, %xmm0
; HASWELL-NO-FMA-NEXT: vmulss %xmm0, %xmm1, %xmm0
; HASWELL-NO-FMA-NEXT: vaddss %xmm0, %xmm1, %xmm0
; HASWELL-NO-FMA-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; HASWELL-NO-FMA-NEXT: vmulss %xmm0, %xmm1, %xmm0
; HASWELL-NO-FMA-NEXT: retq
;
; AVX512-LABEL: f32_one_step_2_divs:
; AVX512: # %bb.0:
; AVX512-NEXT: vrcpss %xmm0, %xmm0, %xmm1
; AVX512-NEXT: vfmsub213ss {{.*#+}} xmm0 = (xmm1 * xmm0) - mem
; AVX512-NEXT: vfnmadd132ss {{.*#+}} xmm0 = -(xmm0 * xmm1) + xmm1
; AVX512-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; AVX512-NEXT: vmulss %xmm0, %xmm1, %xmm0
; AVX512-NEXT: retq
  %div = fdiv fast float 3456.0, %x
  %div2 = fdiv fast float %div, %x
  ret float %div2
}

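; Two refinement steps: a plain Newton-Raphson iteration against 1.0 first,
; then a second iteration with the constant numerator folded in. Informally
; (names not from the test):
;   r1 = r0 + r0 * (1 - x * r0)
;   t  = c * r1
;   r2 = t + r1 * (c - x * t)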
define float @f32_two_step_2(float %x) #2 {
; SSE-LABEL: f32_two_step_2:
; SSE: # %bb.0:
; SSE-NEXT: rcpss %xmm0, %xmm1
; SSE-NEXT: movaps %xmm0, %xmm2
; SSE-NEXT: mulss %xmm1, %xmm2
; SSE-NEXT: movss {{.*#+}} xmm3 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0]
; SSE-NEXT: subss %xmm2, %xmm3
; SSE-NEXT: mulss %xmm1, %xmm3
; SSE-NEXT: addss %xmm1, %xmm3
; SSE-NEXT: movss {{.*#+}} xmm1 = [6.789E+3,0.0E+0,0.0E+0,0.0E+0]
; SSE-NEXT: movaps %xmm3, %xmm2
; SSE-NEXT: mulss %xmm1, %xmm2
; SSE-NEXT: mulss %xmm2, %xmm0
; SSE-NEXT: subss %xmm0, %xmm1
; SSE-NEXT: mulss %xmm3, %xmm1
; SSE-NEXT: addss %xmm2, %xmm1
; SSE-NEXT: movaps %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-RECIP-LABEL: f32_two_step_2:
; AVX-RECIP: # %bb.0:
; AVX-RECIP-NEXT: vrcpss %xmm0, %xmm0, %xmm1
; AVX-RECIP-NEXT: vmulss %xmm1, %xmm0, %xmm2
; AVX-RECIP-NEXT: vmovss {{.*#+}} xmm3 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0]
; AVX-RECIP-NEXT: vsubss %xmm2, %xmm3, %xmm2
; AVX-RECIP-NEXT: vmulss %xmm2, %xmm1, %xmm2
; AVX-RECIP-NEXT: vaddss %xmm2, %xmm1, %xmm1
; AVX-RECIP-NEXT: vmovss {{.*#+}} xmm2 = [6.789E+3,0.0E+0,0.0E+0,0.0E+0]
; AVX-RECIP-NEXT: vmulss %xmm2, %xmm1, %xmm3
; AVX-RECIP-NEXT: vmulss %xmm3, %xmm0, %xmm0
; AVX-RECIP-NEXT: vsubss %xmm0, %xmm2, %xmm0
; AVX-RECIP-NEXT: vmulss %xmm0, %xmm1, %xmm0
; AVX-RECIP-NEXT: vaddss %xmm0, %xmm3, %xmm0
; AVX-RECIP-NEXT: retq
;
; FMA-RECIP-LABEL: f32_two_step_2:
; FMA-RECIP: # %bb.0:
; FMA-RECIP-NEXT: vrcpss %xmm0, %xmm0, %xmm1
; FMA-RECIP-NEXT: vmovss {{.*#+}} xmm2 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0]
; FMA-RECIP-NEXT: vfmsub231ss {{.*#+}} xmm2 = (xmm0 * xmm1) - xmm2
; FMA-RECIP-NEXT: vfnmadd132ss {{.*#+}} xmm2 = -(xmm2 * xmm1) + xmm1
; FMA-RECIP-NEXT: vmovss {{.*#+}} xmm1 = [6.789E+3,0.0E+0,0.0E+0,0.0E+0]
; FMA-RECIP-NEXT: vmulss %xmm1, %xmm2, %xmm3
; FMA-RECIP-NEXT: vfmsub213ss {{.*#+}} xmm0 = (xmm3 * xmm0) - xmm1
; FMA-RECIP-NEXT: vfnmadd213ss {{.*#+}} xmm0 = -(xmm2 * xmm0) + xmm3
; FMA-RECIP-NEXT: retq
;
; BDVER2-LABEL: f32_two_step_2:
; BDVER2: # %bb.0:
; BDVER2-NEXT: vrcpss %xmm0, %xmm0, %xmm1
; BDVER2-NEXT: vfmsubss {{.*#+}} xmm2 = (xmm0 * xmm1) - mem
; BDVER2-NEXT: vmovss {{.*#+}} xmm4 = [6.789E+3,0.0E+0,0.0E+0,0.0E+0]
; BDVER2-NEXT: vfnmaddss {{.*#+}} xmm1 = -(xmm1 * xmm2) + xmm1
; BDVER2-NEXT: vmulss %xmm4, %xmm1, %xmm3
; BDVER2-NEXT: vfmsubss {{.*#+}} xmm0 = (xmm0 * xmm3) - xmm4
; BDVER2-NEXT: vfnmaddss {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm3
; BDVER2-NEXT: retq
;
; BTVER2-LABEL: f32_two_step_2:
; BTVER2: # %bb.0:
; BTVER2-NEXT: vmovss {{.*#+}} xmm3 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0]
; BTVER2-NEXT: vrcpss %xmm0, %xmm0, %xmm1
; BTVER2-NEXT: vmovss {{.*#+}} xmm4 = [6.789E+3,0.0E+0,0.0E+0,0.0E+0]
; BTVER2-NEXT: vmulss %xmm1, %xmm0, %xmm2
; BTVER2-NEXT: vsubss %xmm2, %xmm3, %xmm2
; BTVER2-NEXT: vmulss %xmm2, %xmm1, %xmm2
; BTVER2-NEXT: vaddss %xmm2, %xmm1, %xmm1
; BTVER2-NEXT: vmulss %xmm4, %xmm1, %xmm3
; BTVER2-NEXT: vmulss %xmm3, %xmm0, %xmm0
; BTVER2-NEXT: vsubss %xmm0, %xmm4, %xmm0
; BTVER2-NEXT: vmulss %xmm0, %xmm1, %xmm0
; BTVER2-NEXT: vaddss %xmm0, %xmm3, %xmm0
; BTVER2-NEXT: retq
;
; SANDY-LABEL: f32_two_step_2:
; SANDY: # %bb.0:
; SANDY-NEXT: vrcpss %xmm0, %xmm0, %xmm1
; SANDY-NEXT: vmulss %xmm1, %xmm0, %xmm2
; SANDY-NEXT: vmovss {{.*#+}} xmm3 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0]
; SANDY-NEXT: vsubss %xmm2, %xmm3, %xmm2
; SANDY-NEXT: vmulss %xmm2, %xmm1, %xmm2
; SANDY-NEXT: vaddss %xmm2, %xmm1, %xmm1
; SANDY-NEXT: vmovss {{.*#+}} xmm2 = [6.789E+3,0.0E+0,0.0E+0,0.0E+0]
; SANDY-NEXT: vmulss %xmm2, %xmm1, %xmm3
; SANDY-NEXT: vmulss %xmm3, %xmm0, %xmm0
; SANDY-NEXT: vsubss %xmm0, %xmm2, %xmm0
; SANDY-NEXT: vmulss %xmm0, %xmm1, %xmm0
; SANDY-NEXT: vaddss %xmm0, %xmm3, %xmm0
; SANDY-NEXT: retq
;
; HASWELL-LABEL: f32_two_step_2:
; HASWELL: # %bb.0:
; HASWELL-NEXT: vrcpss %xmm0, %xmm0, %xmm1
; HASWELL-NEXT: vmovss {{.*#+}} xmm2 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0]
; HASWELL-NEXT: vfmsub231ss {{.*#+}} xmm2 = (xmm0 * xmm1) - xmm2
; HASWELL-NEXT: vfnmadd132ss {{.*#+}} xmm2 = -(xmm2 * xmm1) + xmm1
; HASWELL-NEXT: vmovss {{.*#+}} xmm1 = [6.789E+3,0.0E+0,0.0E+0,0.0E+0]
; HASWELL-NEXT: vmulss %xmm1, %xmm2, %xmm3
; HASWELL-NEXT: vfmsub213ss {{.*#+}} xmm0 = (xmm3 * xmm0) - xmm1
; HASWELL-NEXT: vfnmadd213ss {{.*#+}} xmm0 = -(xmm2 * xmm0) + xmm3
; HASWELL-NEXT: retq
;
; HASWELL-NO-FMA-LABEL: f32_two_step_2:
; HASWELL-NO-FMA: # %bb.0:
; HASWELL-NO-FMA-NEXT: vrcpss %xmm0, %xmm0, %xmm1
; HASWELL-NO-FMA-NEXT: vmulss %xmm1, %xmm0, %xmm2
; HASWELL-NO-FMA-NEXT: vmovss {{.*#+}} xmm3 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0]
; HASWELL-NO-FMA-NEXT: vsubss %xmm2, %xmm3, %xmm2
; HASWELL-NO-FMA-NEXT: vmulss %xmm2, %xmm1, %xmm2
; HASWELL-NO-FMA-NEXT: vaddss %xmm2, %xmm1, %xmm1
; HASWELL-NO-FMA-NEXT: vmovss {{.*#+}} xmm2 = [6.789E+3,0.0E+0,0.0E+0,0.0E+0]
; HASWELL-NO-FMA-NEXT: vmulss %xmm2, %xmm1, %xmm3
; HASWELL-NO-FMA-NEXT: vmulss %xmm3, %xmm0, %xmm0
; HASWELL-NO-FMA-NEXT: vsubss %xmm0, %xmm2, %xmm0
; HASWELL-NO-FMA-NEXT: vmulss %xmm0, %xmm1, %xmm0
; HASWELL-NO-FMA-NEXT: vaddss %xmm0, %xmm3, %xmm0
; HASWELL-NO-FMA-NEXT: retq
;
; AVX512-LABEL: f32_two_step_2:
; AVX512: # %bb.0:
; AVX512-NEXT: vrcpss %xmm0, %xmm0, %xmm1
; AVX512-NEXT: vmovss {{.*#+}} xmm2 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0]
; AVX512-NEXT: vfmsub231ss {{.*#+}} xmm2 = (xmm0 * xmm1) - xmm2
; AVX512-NEXT: vfnmadd132ss {{.*#+}} xmm2 = -(xmm2 * xmm1) + xmm1
; AVX512-NEXT: vmovss {{.*#+}} xmm1 = [6.789E+3,0.0E+0,0.0E+0,0.0E+0]
; AVX512-NEXT: vmulss %xmm1, %xmm2, %xmm3
; AVX512-NEXT: vfmsub213ss {{.*#+}} xmm0 = (xmm3 * xmm0) - xmm1
; AVX512-NEXT: vfnmadd213ss {{.*#+}} xmm0 = -(xmm2 * xmm0) + xmm3
; AVX512-NEXT: retq
  %div = fdiv fast float 6789.0, %x
  ret float %div
}

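; The 128-bit vector tests below apply the same expansions lane-wise, with
; the packed rcpps estimate and a non-uniform constant numerator. Informally:
;   r = refine(rcpps(x))
;   result[i] = c[i] * r[i], for c = <1.0, 2.0, 3.0, 4.0>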
define <4 x float> @v4f32_one_step2(<4 x float> %x) #1 {
; SSE-LABEL: v4f32_one_step2:
; SSE: # %bb.0:
; SSE-NEXT: rcpps %xmm0, %xmm2
; SSE-NEXT: movaps {{.*#+}} xmm1 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0]
; SSE-NEXT: movaps %xmm2, %xmm3
; SSE-NEXT: mulps %xmm1, %xmm3
; SSE-NEXT: mulps %xmm3, %xmm0
; SSE-NEXT: subps %xmm0, %xmm1
; SSE-NEXT: mulps %xmm2, %xmm1
; SSE-NEXT: addps %xmm3, %xmm1
; SSE-NEXT: movaps %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-RECIP-LABEL: v4f32_one_step2:
; AVX-RECIP: # %bb.0:
; AVX-RECIP-NEXT: vrcpps %xmm0, %xmm1
; AVX-RECIP-NEXT: vmovaps {{.*#+}} xmm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0]
; AVX-RECIP-NEXT: vmulps %xmm2, %xmm1, %xmm3
; AVX-RECIP-NEXT: vmulps %xmm3, %xmm0, %xmm0
; AVX-RECIP-NEXT: vsubps %xmm0, %xmm2, %xmm0
; AVX-RECIP-NEXT: vmulps %xmm0, %xmm1, %xmm0
; AVX-RECIP-NEXT: vaddps %xmm0, %xmm3, %xmm0
; AVX-RECIP-NEXT: retq
;
; FMA-RECIP-LABEL: v4f32_one_step2:
; FMA-RECIP: # %bb.0:
; FMA-RECIP-NEXT: vrcpps %xmm0, %xmm1
; FMA-RECIP-NEXT: vmovaps {{.*#+}} xmm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0]
; FMA-RECIP-NEXT: vmulps %xmm2, %xmm1, %xmm3
; FMA-RECIP-NEXT: vfmsub213ps {{.*#+}} xmm0 = (xmm3 * xmm0) - xmm2
; FMA-RECIP-NEXT: vfnmadd213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm3
; FMA-RECIP-NEXT: retq
;
; BDVER2-LABEL: v4f32_one_step2:
; BDVER2: # %bb.0:
; BDVER2-NEXT: vrcpps %xmm0, %xmm1
; BDVER2-NEXT: vmovaps {{.*#+}} xmm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0]
; BDVER2-NEXT: vmulps %xmm2, %xmm1, %xmm3
; BDVER2-NEXT: vfmsubps {{.*#+}} xmm0 = (xmm0 * xmm3) - xmm2
; BDVER2-NEXT: vfnmaddps {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm3
; BDVER2-NEXT: retq
;
; BTVER2-LABEL: v4f32_one_step2:
; BTVER2: # %bb.0:
; BTVER2-NEXT: vmovaps {{.*#+}} xmm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0]
; BTVER2-NEXT: vrcpps %xmm0, %xmm1
; BTVER2-NEXT: vmulps %xmm2, %xmm1, %xmm3
; BTVER2-NEXT: vmulps %xmm3, %xmm0, %xmm0
; BTVER2-NEXT: vsubps %xmm0, %xmm2, %xmm0
; BTVER2-NEXT: vmulps %xmm0, %xmm1, %xmm0
; BTVER2-NEXT: vaddps %xmm0, %xmm3, %xmm0
; BTVER2-NEXT: retq
;
; SANDY-LABEL: v4f32_one_step2:
; SANDY: # %bb.0:
; SANDY-NEXT: vrcpps %xmm0, %xmm1
; SANDY-NEXT: vmovaps {{.*#+}} xmm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0]
; SANDY-NEXT: vmulps %xmm2, %xmm1, %xmm3
; SANDY-NEXT: vmulps %xmm3, %xmm0, %xmm0
; SANDY-NEXT: vsubps %xmm0, %xmm2, %xmm0
; SANDY-NEXT: vmulps %xmm0, %xmm1, %xmm0
; SANDY-NEXT: vaddps %xmm0, %xmm3, %xmm0
; SANDY-NEXT: retq
;
; HASWELL-LABEL: v4f32_one_step2:
; HASWELL: # %bb.0:
; HASWELL-NEXT: vrcpps %xmm0, %xmm1
; HASWELL-NEXT: vmovaps {{.*#+}} xmm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0]
; HASWELL-NEXT: vmulps %xmm2, %xmm1, %xmm3
; HASWELL-NEXT: vfmsub213ps {{.*#+}} xmm0 = (xmm3 * xmm0) - xmm2
; HASWELL-NEXT: vfnmadd213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm3
; HASWELL-NEXT: retq
;
; HASWELL-NO-FMA-LABEL: v4f32_one_step2:
; HASWELL-NO-FMA: # %bb.0:
; HASWELL-NO-FMA-NEXT: vrcpps %xmm0, %xmm1
; HASWELL-NO-FMA-NEXT: vmovaps {{.*#+}} xmm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0]
; HASWELL-NO-FMA-NEXT: vmulps %xmm2, %xmm1, %xmm3
; HASWELL-NO-FMA-NEXT: vmulps %xmm3, %xmm0, %xmm0
; HASWELL-NO-FMA-NEXT: vsubps %xmm0, %xmm2, %xmm0
; HASWELL-NO-FMA-NEXT: vmulps %xmm0, %xmm1, %xmm0
; HASWELL-NO-FMA-NEXT: vaddps %xmm0, %xmm3, %xmm0
; HASWELL-NO-FMA-NEXT: retq
;
; AVX512-LABEL: v4f32_one_step2:
; AVX512: # %bb.0:
; AVX512-NEXT: vrcpps %xmm0, %xmm1
; AVX512-NEXT: vmovaps {{.*#+}} xmm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0]
; AVX512-NEXT: vmulps %xmm2, %xmm1, %xmm3
; AVX512-NEXT: vfmsub213ps {{.*#+}} xmm0 = (xmm3 * xmm0) - xmm2
; AVX512-NEXT: vfnmadd213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm3
; AVX512-NEXT: retq
  %div = fdiv fast <4 x float> <float 1.0, float 2.0, float 3.0, float 4.0>, %x
  ret <4 x float> %div
}

define <4 x float> @v4f32_one_step_2_divs(<4 x float> %x) #1 {
; SSE-LABEL: v4f32_one_step_2_divs:
; SSE: # %bb.0:
; SSE-NEXT: rcpps %xmm0, %xmm1
; SSE-NEXT: mulps %xmm1, %xmm0
; SSE-NEXT: movaps {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; SSE-NEXT: subps %xmm0, %xmm2
; SSE-NEXT: mulps %xmm1, %xmm2
; SSE-NEXT: addps %xmm1, %xmm2
; SSE-NEXT: movaps {{.*#+}} xmm0 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0]
; SSE-NEXT: mulps %xmm2, %xmm0
; SSE-NEXT: mulps %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX-RECIP-LABEL: v4f32_one_step_2_divs:
; AVX-RECIP: # %bb.0:
; AVX-RECIP-NEXT: vrcpps %xmm0, %xmm1
; AVX-RECIP-NEXT: vmulps %xmm1, %xmm0, %xmm0
; AVX-RECIP-NEXT: vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; AVX-RECIP-NEXT: vsubps %xmm0, %xmm2, %xmm0
; AVX-RECIP-NEXT: vmulps %xmm0, %xmm1, %xmm0
; AVX-RECIP-NEXT: vaddps %xmm0, %xmm1, %xmm0
; AVX-RECIP-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; AVX-RECIP-NEXT: vmulps %xmm0, %xmm1, %xmm0
; AVX-RECIP-NEXT: retq
;
; FMA-RECIP-LABEL: v4f32_one_step_2_divs:
; FMA-RECIP: # %bb.0:
; FMA-RECIP-NEXT: vrcpps %xmm0, %xmm1
; FMA-RECIP-NEXT: vfmsub213ps {{.*#+}} xmm0 = (xmm1 * xmm0) - mem
; FMA-RECIP-NEXT: vfnmadd132ps {{.*#+}} xmm0 = -(xmm0 * xmm1) + xmm1
; FMA-RECIP-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; FMA-RECIP-NEXT: vmulps %xmm0, %xmm1, %xmm0
; FMA-RECIP-NEXT: retq
;
; BDVER2-LABEL: v4f32_one_step_2_divs:
; BDVER2: # %bb.0:
; BDVER2-NEXT: vrcpps %xmm0, %xmm1
; BDVER2-NEXT: vfmsubps {{.*#+}} xmm0 = (xmm0 * xmm1) - mem
; BDVER2-NEXT: vfnmaddps {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm1
; BDVER2-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; BDVER2-NEXT: vmulps %xmm0, %xmm1, %xmm0
; BDVER2-NEXT: retq
;
; BTVER2-LABEL: v4f32_one_step_2_divs:
; BTVER2: # %bb.0:
; BTVER2-NEXT: vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; BTVER2-NEXT: vrcpps %xmm0, %xmm1
; BTVER2-NEXT: vmulps %xmm1, %xmm0, %xmm0
; BTVER2-NEXT: vsubps %xmm0, %xmm2, %xmm0
; BTVER2-NEXT: vmulps %xmm0, %xmm1, %xmm0
; BTVER2-NEXT: vaddps %xmm0, %xmm1, %xmm0
; BTVER2-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; BTVER2-NEXT: vmulps %xmm0, %xmm1, %xmm0
; BTVER2-NEXT: retq
;
; SANDY-LABEL: v4f32_one_step_2_divs:
; SANDY: # %bb.0:
; SANDY-NEXT: vrcpps %xmm0, %xmm1
; SANDY-NEXT: vmulps %xmm1, %xmm0, %xmm0
; SANDY-NEXT: vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; SANDY-NEXT: vsubps %xmm0, %xmm2, %xmm0
; SANDY-NEXT: vmulps %xmm0, %xmm1, %xmm0
; SANDY-NEXT: vaddps %xmm0, %xmm1, %xmm0
; SANDY-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; SANDY-NEXT: vmulps %xmm0, %xmm1, %xmm0
; SANDY-NEXT: retq
;
; HASWELL-LABEL: v4f32_one_step_2_divs:
; HASWELL: # %bb.0:
; HASWELL-NEXT: vrcpps %xmm0, %xmm1
; HASWELL-NEXT: vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; HASWELL-NEXT: vfmsub231ps {{.*#+}} xmm2 = (xmm1 * xmm0) - xmm2
; HASWELL-NEXT: vfnmadd132ps {{.*#+}} xmm2 = -(xmm2 * xmm1) + xmm1
; HASWELL-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm0
; HASWELL-NEXT: vmulps %xmm2, %xmm0, %xmm0
; HASWELL-NEXT: retq
;
; HASWELL-NO-FMA-LABEL: v4f32_one_step_2_divs:
; HASWELL-NO-FMA: # %bb.0:
; HASWELL-NO-FMA-NEXT: vrcpps %xmm0, %xmm1
; HASWELL-NO-FMA-NEXT: vmulps %xmm1, %xmm0, %xmm0
; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; HASWELL-NO-FMA-NEXT: vsubps %xmm0, %xmm2, %xmm0
; HASWELL-NO-FMA-NEXT: vmulps %xmm0, %xmm1, %xmm0
; HASWELL-NO-FMA-NEXT: vaddps %xmm0, %xmm1, %xmm0
; HASWELL-NO-FMA-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; HASWELL-NO-FMA-NEXT: vmulps %xmm0, %xmm1, %xmm0
; HASWELL-NO-FMA-NEXT: retq
;
; KNL-LABEL: v4f32_one_step_2_divs:
; KNL: # %bb.0:
; KNL-NEXT: vrcpps %xmm0, %xmm1
; KNL-NEXT: vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; KNL-NEXT: vfmsub231ps {{.*#+}} xmm2 = (xmm1 * xmm0) - xmm2
; KNL-NEXT: vfnmadd132ps {{.*#+}} xmm2 = -(xmm2 * xmm1) + xmm1
; KNL-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm0
; KNL-NEXT: vmulps %xmm2, %xmm0, %xmm0
; KNL-NEXT: retq
;
; SKX-LABEL: v4f32_one_step_2_divs:
; SKX: # %bb.0:
; SKX-NEXT: vrcpps %xmm0, %xmm1
; SKX-NEXT: vfmsub213ps {{.*#+}} xmm0 = (xmm1 * xmm0) - mem
; SKX-NEXT: vfnmadd132ps {{.*#+}} xmm0 = -(xmm0 * xmm1) + xmm1
; SKX-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; SKX-NEXT: vmulps %xmm0, %xmm1, %xmm0
; SKX-NEXT: retq
  %div = fdiv fast <4 x float> <float 1.0, float 2.0, float 3.0, float 4.0>, %x
  %div2 = fdiv fast <4 x float> %div, %x
  ret <4 x float> %div2
}

define <4 x float> @v4f32_two_step2(<4 x float> %x) #2 {
; SSE-LABEL: v4f32_two_step2:
; SSE: # %bb.0:
; SSE-NEXT: rcpps %xmm0, %xmm1
; SSE-NEXT: movaps %xmm0, %xmm2
; SSE-NEXT: mulps %xmm1, %xmm2
; SSE-NEXT: movaps {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; SSE-NEXT: subps %xmm2, %xmm3
; SSE-NEXT: mulps %xmm1, %xmm3
; SSE-NEXT: addps %xmm1, %xmm3
; SSE-NEXT: movaps {{.*#+}} xmm1 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0]
; SSE-NEXT: movaps %xmm3, %xmm2
; SSE-NEXT: mulps %xmm1, %xmm2
; SSE-NEXT: mulps %xmm2, %xmm0
; SSE-NEXT: subps %xmm0, %xmm1
; SSE-NEXT: mulps %xmm3, %xmm1
; SSE-NEXT: addps %xmm2, %xmm1
; SSE-NEXT: movaps %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-RECIP-LABEL: v4f32_two_step2:
; AVX-RECIP: # %bb.0:
; AVX-RECIP-NEXT: vrcpps %xmm0, %xmm1
; AVX-RECIP-NEXT: vmulps %xmm1, %xmm0, %xmm2
; AVX-RECIP-NEXT: vbroadcastss {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; AVX-RECIP-NEXT: vsubps %xmm2, %xmm3, %xmm2
; AVX-RECIP-NEXT: vmulps %xmm2, %xmm1, %xmm2
; AVX-RECIP-NEXT: vaddps %xmm2, %xmm1, %xmm1
; AVX-RECIP-NEXT: vmovaps {{.*#+}} xmm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0]
; AVX-RECIP-NEXT: vmulps %xmm2, %xmm1, %xmm3
; AVX-RECIP-NEXT: vmulps %xmm3, %xmm0, %xmm0
; AVX-RECIP-NEXT: vsubps %xmm0, %xmm2, %xmm0
; AVX-RECIP-NEXT: vmulps %xmm0, %xmm1, %xmm0
; AVX-RECIP-NEXT: vaddps %xmm0, %xmm3, %xmm0
; AVX-RECIP-NEXT: retq
;
; FMA-RECIP-LABEL: v4f32_two_step2:
; FMA-RECIP: # %bb.0:
; FMA-RECIP-NEXT: vrcpps %xmm0, %xmm1
; FMA-RECIP-NEXT: vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; FMA-RECIP-NEXT: vfmsub231ps {{.*#+}} xmm2 = (xmm0 * xmm1) - xmm2
; FMA-RECIP-NEXT: vfnmadd132ps {{.*#+}} xmm2 = -(xmm2 * xmm1) + xmm1
; FMA-RECIP-NEXT: vmovaps {{.*#+}} xmm1 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0]
; FMA-RECIP-NEXT: vmulps %xmm1, %xmm2, %xmm3
; FMA-RECIP-NEXT: vfmsub213ps {{.*#+}} xmm0 = (xmm3 * xmm0) - xmm1
; FMA-RECIP-NEXT: vfnmadd213ps {{.*#+}} xmm0 = -(xmm2 * xmm0) + xmm3
; FMA-RECIP-NEXT: retq
;
; BDVER2-LABEL: v4f32_two_step2:
; BDVER2: # %bb.0:
; BDVER2-NEXT: vrcpps %xmm0, %xmm1
; BDVER2-NEXT: vfmsubps {{.*#+}} xmm2 = (xmm0 * xmm1) - mem
; BDVER2-NEXT: vmovaps {{.*#+}} xmm4 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0]
; BDVER2-NEXT: vfnmaddps {{.*#+}} xmm1 = -(xmm1 * xmm2) + xmm1
; BDVER2-NEXT: vmulps %xmm4, %xmm1, %xmm3
; BDVER2-NEXT: vfmsubps {{.*#+}} xmm0 = (xmm0 * xmm3) - xmm4
; BDVER2-NEXT: vfnmaddps {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm3
; BDVER2-NEXT: retq
;
; BTVER2-LABEL: v4f32_two_step2:
; BTVER2: # %bb.0:
; BTVER2-NEXT: vbroadcastss {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; BTVER2-NEXT: vrcpps %xmm0, %xmm1
; BTVER2-NEXT: vmovaps {{.*#+}} xmm4 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0]
; BTVER2-NEXT: vmulps %xmm1, %xmm0, %xmm2
; BTVER2-NEXT: vsubps %xmm2, %xmm3, %xmm2
; BTVER2-NEXT: vmulps %xmm2, %xmm1, %xmm2
; BTVER2-NEXT: vaddps %xmm2, %xmm1, %xmm1
; BTVER2-NEXT: vmulps %xmm4, %xmm1, %xmm3
; BTVER2-NEXT: vmulps %xmm3, %xmm0, %xmm0
; BTVER2-NEXT: vsubps %xmm0, %xmm4, %xmm0
; BTVER2-NEXT: vmulps %xmm0, %xmm1, %xmm0
; BTVER2-NEXT: vaddps %xmm0, %xmm3, %xmm0
; BTVER2-NEXT: retq
;
; SANDY-LABEL: v4f32_two_step2:
; SANDY: # %bb.0:
; SANDY-NEXT: vrcpps %xmm0, %xmm1
; SANDY-NEXT: vmulps %xmm1, %xmm0, %xmm2
; SANDY-NEXT: vbroadcastss {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; SANDY-NEXT: vsubps %xmm2, %xmm3, %xmm2
; SANDY-NEXT: vmulps %xmm2, %xmm1, %xmm2
; SANDY-NEXT: vaddps %xmm2, %xmm1, %xmm1
; SANDY-NEXT: vmovaps {{.*#+}} xmm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0]
; SANDY-NEXT: vmulps %xmm2, %xmm1, %xmm3
; SANDY-NEXT: vmulps %xmm3, %xmm0, %xmm0
; SANDY-NEXT: vsubps %xmm0, %xmm2, %xmm0
; SANDY-NEXT: vmulps %xmm0, %xmm1, %xmm0
; SANDY-NEXT: vaddps %xmm0, %xmm3, %xmm0
; SANDY-NEXT: retq
;
; HASWELL-LABEL: v4f32_two_step2:
; HASWELL: # %bb.0:
; HASWELL-NEXT: vrcpps %xmm0, %xmm1
; HASWELL-NEXT: vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; HASWELL-NEXT: vfmsub231ps {{.*#+}} xmm2 = (xmm0 * xmm1) - xmm2
; HASWELL-NEXT: vfnmadd132ps {{.*#+}} xmm2 = -(xmm2 * xmm1) + xmm1
; HASWELL-NEXT: vmovaps {{.*#+}} xmm1 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0]
; HASWELL-NEXT: vmulps %xmm1, %xmm2, %xmm3
; HASWELL-NEXT: vfmsub213ps {{.*#+}} xmm0 = (xmm3 * xmm0) - xmm1
; HASWELL-NEXT: vfnmadd213ps {{.*#+}} xmm0 = -(xmm2 * xmm0) + xmm3
; HASWELL-NEXT: retq
;
; HASWELL-NO-FMA-LABEL: v4f32_two_step2:
; HASWELL-NO-FMA: # %bb.0:
; HASWELL-NO-FMA-NEXT: vrcpps %xmm0, %xmm1
; HASWELL-NO-FMA-NEXT: vmulps %xmm1, %xmm0, %xmm2
; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; HASWELL-NO-FMA-NEXT: vsubps %xmm2, %xmm3, %xmm2
; HASWELL-NO-FMA-NEXT: vmulps %xmm2, %xmm1, %xmm2
; HASWELL-NO-FMA-NEXT: vaddps %xmm2, %xmm1, %xmm1
; HASWELL-NO-FMA-NEXT: vmovaps {{.*#+}} xmm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0]
; HASWELL-NO-FMA-NEXT: vmulps %xmm2, %xmm1, %xmm3
; HASWELL-NO-FMA-NEXT: vmulps %xmm3, %xmm0, %xmm0
; HASWELL-NO-FMA-NEXT: vsubps %xmm0, %xmm2, %xmm0
; HASWELL-NO-FMA-NEXT: vmulps %xmm0, %xmm1, %xmm0
; HASWELL-NO-FMA-NEXT: vaddps %xmm0, %xmm3, %xmm0
; HASWELL-NO-FMA-NEXT: retq
;
; AVX512-LABEL: v4f32_two_step2:
; AVX512: # %bb.0:
; AVX512-NEXT: vrcpps %xmm0, %xmm1
; AVX512-NEXT: vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; AVX512-NEXT: vfmsub231ps {{.*#+}} xmm2 = (xmm0 * xmm1) - xmm2
; AVX512-NEXT: vfnmadd132ps {{.*#+}} xmm2 = -(xmm2 * xmm1) + xmm1
; AVX512-NEXT: vmovaps {{.*#+}} xmm1 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0]
; AVX512-NEXT: vmulps %xmm1, %xmm2, %xmm3
; AVX512-NEXT: vfmsub213ps {{.*#+}} xmm0 = (xmm3 * xmm0) - xmm1
; AVX512-NEXT: vfnmadd213ps {{.*#+}} xmm0 = -(xmm2 * xmm0) + xmm3
; AVX512-NEXT: retq
  %div = fdiv fast <4 x float> <float 1.0, float 2.0, float 3.0, float 4.0>, %x
  ret <4 x float> %div
}

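; 256-bit versions: plain SSE has no 256-bit registers, so the expansion is
; expected to split into two 128-bit halves, each with its own rcpps estimate
; and constant half-vector; the AVX targets keep a single ymm computation.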
define <8 x float> @v8f32_one_step2(<8 x float> %x) #1 {
; SSE-LABEL: v8f32_one_step2:
; SSE: # %bb.0:
; SSE-NEXT: rcpps %xmm0, %xmm3
; SSE-NEXT: movaps {{.*#+}} xmm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0]
; SSE-NEXT: movaps %xmm3, %xmm4
; SSE-NEXT: mulps %xmm2, %xmm4
; SSE-NEXT: mulps %xmm4, %xmm0
; SSE-NEXT: subps %xmm0, %xmm2
; SSE-NEXT: mulps %xmm3, %xmm2
; SSE-NEXT: addps %xmm4, %xmm2
; SSE-NEXT: rcpps %xmm1, %xmm0
; SSE-NEXT: movaps {{.*#+}} xmm3 = [5.0E+0,6.0E+0,7.0E+0,8.0E+0]
; SSE-NEXT: movaps %xmm0, %xmm4
; SSE-NEXT: mulps %xmm3, %xmm4
; SSE-NEXT: mulps %xmm4, %xmm1
; SSE-NEXT: subps %xmm1, %xmm3
; SSE-NEXT: mulps %xmm0, %xmm3
; SSE-NEXT: addps %xmm4, %xmm3
; SSE-NEXT: movaps %xmm2, %xmm0
; SSE-NEXT: movaps %xmm3, %xmm1
; SSE-NEXT: retq
;
; AVX-RECIP-LABEL: v8f32_one_step2:
; AVX-RECIP: # %bb.0:
; AVX-RECIP-NEXT: vrcpps %ymm0, %ymm1
; AVX-RECIP-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0]
; AVX-RECIP-NEXT: vmulps %ymm2, %ymm1, %ymm3
; AVX-RECIP-NEXT: vmulps %ymm3, %ymm0, %ymm0
; AVX-RECIP-NEXT: vsubps %ymm0, %ymm2, %ymm0
; AVX-RECIP-NEXT: vmulps %ymm0, %ymm1, %ymm0
; AVX-RECIP-NEXT: vaddps %ymm0, %ymm3, %ymm0
; AVX-RECIP-NEXT: retq
;
; FMA-RECIP-LABEL: v8f32_one_step2:
; FMA-RECIP: # %bb.0:
; FMA-RECIP-NEXT: vrcpps %ymm0, %ymm1
; FMA-RECIP-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0]
; FMA-RECIP-NEXT: vmulps %ymm2, %ymm1, %ymm3
; FMA-RECIP-NEXT: vfmsub213ps {{.*#+}} ymm0 = (ymm3 * ymm0) - ymm2
; FMA-RECIP-NEXT: vfnmadd213ps {{.*#+}} ymm0 = -(ymm1 * ymm0) + ymm3
; FMA-RECIP-NEXT: retq
;
; BDVER2-LABEL: v8f32_one_step2:
; BDVER2: # %bb.0:
; BDVER2-NEXT: vrcpps %ymm0, %ymm1
; BDVER2-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0]
; BDVER2-NEXT: vmulps %ymm2, %ymm1, %ymm3
; BDVER2-NEXT: vfmsubps {{.*#+}} ymm0 = (ymm0 * ymm3) - ymm2
; BDVER2-NEXT: vfnmaddps {{.*#+}} ymm0 = -(ymm1 * ymm0) + ymm3
; BDVER2-NEXT: retq
;
; BTVER2-LABEL: v8f32_one_step2:
; BTVER2: # %bb.0:
; BTVER2-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0]
; BTVER2-NEXT: vrcpps %ymm0, %ymm1
; BTVER2-NEXT: vmulps %ymm2, %ymm1, %ymm3
; BTVER2-NEXT: vmulps %ymm3, %ymm0, %ymm0
; BTVER2-NEXT: vsubps %ymm0, %ymm2, %ymm0
; BTVER2-NEXT: vmulps %ymm0, %ymm1, %ymm0
; BTVER2-NEXT: vaddps %ymm0, %ymm3, %ymm0
; BTVER2-NEXT: retq
;
; SANDY-LABEL: v8f32_one_step2:
; SANDY: # %bb.0:
; SANDY-NEXT: vrcpps %ymm0, %ymm1
; SANDY-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0]
; SANDY-NEXT: vmulps %ymm2, %ymm1, %ymm3
; SANDY-NEXT: vmulps %ymm3, %ymm0, %ymm0
; SANDY-NEXT: vsubps %ymm0, %ymm2, %ymm0
; SANDY-NEXT: vmulps %ymm0, %ymm1, %ymm0
; SANDY-NEXT: vaddps %ymm0, %ymm3, %ymm0
; SANDY-NEXT: retq
;
; HASWELL-LABEL: v8f32_one_step2:
; HASWELL: # %bb.0:
; HASWELL-NEXT: vrcpps %ymm0, %ymm1
; HASWELL-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0]
; HASWELL-NEXT: vmulps %ymm2, %ymm1, %ymm3
; HASWELL-NEXT: vfmsub213ps {{.*#+}} ymm0 = (ymm3 * ymm0) - ymm2
; HASWELL-NEXT: vfnmadd213ps {{.*#+}} ymm0 = -(ymm1 * ymm0) + ymm3
; HASWELL-NEXT: retq
;
; HASWELL-NO-FMA-LABEL: v8f32_one_step2:
; HASWELL-NO-FMA: # %bb.0:
; HASWELL-NO-FMA-NEXT: vrcpps %ymm0, %ymm1
; HASWELL-NO-FMA-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0]
; HASWELL-NO-FMA-NEXT: vmulps %ymm2, %ymm1, %ymm3
; HASWELL-NO-FMA-NEXT: vmulps %ymm3, %ymm0, %ymm0
; HASWELL-NO-FMA-NEXT: vsubps %ymm0, %ymm2, %ymm0
; HASWELL-NO-FMA-NEXT: vmulps %ymm0, %ymm1, %ymm0
; HASWELL-NO-FMA-NEXT: vaddps %ymm0, %ymm3, %ymm0
; HASWELL-NO-FMA-NEXT: retq
;
; AVX512-LABEL: v8f32_one_step2:
; AVX512: # %bb.0:
; AVX512-NEXT: vrcpps %ymm0, %ymm1
; AVX512-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0]
; AVX512-NEXT: vmulps %ymm2, %ymm1, %ymm3
; AVX512-NEXT: vfmsub213ps {{.*#+}} ymm0 = (ymm3 * ymm0) - ymm2
; AVX512-NEXT: vfnmadd213ps {{.*#+}} ymm0 = -(ymm1 * ymm0) + ymm3
; AVX512-NEXT: retq
  %div = fdiv fast <8 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0>, %x
  ret <8 x float> %div
}

define <8 x float> @v8f32_one_step_2_divs(<8 x float> %x) #1 {
; SSE-LABEL: v8f32_one_step_2_divs:
; SSE: # %bb.0:
; SSE-NEXT: rcpps %xmm0, %xmm2
; SSE-NEXT: mulps %xmm2, %xmm0
; SSE-NEXT: movaps {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; SSE-NEXT: movaps %xmm3, %xmm4
; SSE-NEXT: subps %xmm0, %xmm4
; SSE-NEXT: mulps %xmm2, %xmm4
; SSE-NEXT: addps %xmm2, %xmm4
; SSE-NEXT: rcpps %xmm1, %xmm0
; SSE-NEXT: mulps %xmm0, %xmm1
; SSE-NEXT: subps %xmm1, %xmm3
; SSE-NEXT: mulps %xmm0, %xmm3
; SSE-NEXT: addps %xmm0, %xmm3
; SSE-NEXT: movaps {{.*#+}} xmm1 = [5.0E+0,6.0E+0,7.0E+0,8.0E+0]
; SSE-NEXT: mulps %xmm3, %xmm1
; SSE-NEXT: movaps {{.*#+}} xmm0 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0]
; SSE-NEXT: mulps %xmm4, %xmm0
; SSE-NEXT: mulps %xmm4, %xmm0
; SSE-NEXT: mulps %xmm3, %xmm1
; SSE-NEXT: retq
;
; AVX-RECIP-LABEL: v8f32_one_step_2_divs:
; AVX-RECIP: # %bb.0:
; AVX-RECIP-NEXT: vrcpps %ymm0, %ymm1
; AVX-RECIP-NEXT: vmulps %ymm1, %ymm0, %ymm0
; AVX-RECIP-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; AVX-RECIP-NEXT: vsubps %ymm0, %ymm2, %ymm0
; AVX-RECIP-NEXT: vmulps %ymm0, %ymm1, %ymm0
; AVX-RECIP-NEXT: vaddps %ymm0, %ymm1, %ymm0
; AVX-RECIP-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1
; AVX-RECIP-NEXT: vmulps %ymm0, %ymm1, %ymm0
; AVX-RECIP-NEXT: retq
;
; FMA-RECIP-LABEL: v8f32_one_step_2_divs:
; FMA-RECIP: # %bb.0:
; FMA-RECIP-NEXT: vrcpps %ymm0, %ymm1
; FMA-RECIP-NEXT: vfmsub213ps {{.*#+}} ymm0 = (ymm1 * ymm0) - mem
; FMA-RECIP-NEXT: vfnmadd132ps {{.*#+}} ymm0 = -(ymm0 * ymm1) + ymm1
; FMA-RECIP-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1
; FMA-RECIP-NEXT: vmulps %ymm0, %ymm1, %ymm0
; FMA-RECIP-NEXT: retq
;
; BDVER2-LABEL: v8f32_one_step_2_divs:
; BDVER2: # %bb.0:
; BDVER2-NEXT: vrcpps %ymm0, %ymm1
; BDVER2-NEXT: vfmsubps {{.*#+}} ymm0 = (ymm0 * ymm1) - mem
; BDVER2-NEXT: vfnmaddps {{.*#+}} ymm0 = -(ymm1 * ymm0) + ymm1
; BDVER2-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1
; BDVER2-NEXT: vmulps %ymm0, %ymm1, %ymm0
; BDVER2-NEXT: retq
;
; BTVER2-LABEL: v8f32_one_step_2_divs:
; BTVER2: # %bb.0:
; BTVER2-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; BTVER2-NEXT: vrcpps %ymm0, %ymm1
; BTVER2-NEXT: vmulps %ymm1, %ymm0, %ymm0
; BTVER2-NEXT: vsubps %ymm0, %ymm2, %ymm0
; BTVER2-NEXT: vmulps %ymm0, %ymm1, %ymm0
; BTVER2-NEXT: vaddps %ymm0, %ymm1, %ymm0
; BTVER2-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1
; BTVER2-NEXT: vmulps %ymm0, %ymm1, %ymm0
; BTVER2-NEXT: retq
;
; SANDY-LABEL: v8f32_one_step_2_divs:
; SANDY: # %bb.0:
; SANDY-NEXT: vrcpps %ymm0, %ymm1
; SANDY-NEXT: vmulps %ymm1, %ymm0, %ymm0
; SANDY-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; SANDY-NEXT: vsubps %ymm0, %ymm2, %ymm0
; SANDY-NEXT: vmulps %ymm0, %ymm1, %ymm0
; SANDY-NEXT: vaddps %ymm0, %ymm1, %ymm0
; SANDY-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1
; SANDY-NEXT: vmulps %ymm0, %ymm1, %ymm0
; SANDY-NEXT: retq
;
; HASWELL-LABEL: v8f32_one_step_2_divs:
; HASWELL: # %bb.0:
; HASWELL-NEXT: vrcpps %ymm0, %ymm1
; HASWELL-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; HASWELL-NEXT: vfmsub231ps {{.*#+}} ymm2 = (ymm1 * ymm0) - ymm2
; HASWELL-NEXT: vfnmadd132ps {{.*#+}} ymm2 = -(ymm2 * ymm1) + ymm1
; HASWELL-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm0
; HASWELL-NEXT: vmulps %ymm2, %ymm0, %ymm0
; HASWELL-NEXT: retq
;
; HASWELL-NO-FMA-LABEL: v8f32_one_step_2_divs:
; HASWELL-NO-FMA: # %bb.0:
; HASWELL-NO-FMA-NEXT: vrcpps %ymm0, %ymm1
; HASWELL-NO-FMA-NEXT: vmulps %ymm1, %ymm0, %ymm0
; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; HASWELL-NO-FMA-NEXT: vsubps %ymm0, %ymm2, %ymm0
; HASWELL-NO-FMA-NEXT: vmulps %ymm0, %ymm1, %ymm0
; HASWELL-NO-FMA-NEXT: vaddps %ymm0, %ymm1, %ymm0
; HASWELL-NO-FMA-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1
; HASWELL-NO-FMA-NEXT: vmulps %ymm0, %ymm1, %ymm0
; HASWELL-NO-FMA-NEXT: retq
;
; KNL-LABEL: v8f32_one_step_2_divs:
; KNL: # %bb.0:
; KNL-NEXT: vrcpps %ymm0, %ymm1
; KNL-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; KNL-NEXT: vfmsub231ps {{.*#+}} ymm2 = (ymm1 * ymm0) - ymm2
; KNL-NEXT: vfnmadd132ps {{.*#+}} ymm2 = -(ymm2 * ymm1) + ymm1
; KNL-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm0
; KNL-NEXT: vmulps %ymm2, %ymm0, %ymm0
; KNL-NEXT: retq
;
; SKX-LABEL: v8f32_one_step_2_divs:
; SKX: # %bb.0:
; SKX-NEXT: vrcpps %ymm0, %ymm1
; SKX-NEXT: vfmsub213ps {{.*#+}} ymm0 = (ymm1 * ymm0) - mem
; SKX-NEXT: vfnmadd132ps {{.*#+}} ymm0 = -(ymm0 * ymm1) + ymm1
; SKX-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1
; SKX-NEXT: vmulps %ymm0, %ymm1, %ymm0
; SKX-NEXT: retq
  %div = fdiv fast <8 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0>, %x
  %div2 = fdiv fast <8 x float> %div, %x
  ret <8 x float> %div2
}

define <8 x float> @v8f32_two_step2(<8 x float> %x) #2 {
; SSE-LABEL: v8f32_two_step2:
; SSE: # %bb.0:
; SSE-NEXT: rcpps %xmm0, %xmm2
; SSE-NEXT: movaps %xmm0, %xmm3
; SSE-NEXT: mulps %xmm2, %xmm3
; SSE-NEXT: movaps {{.*#+}} xmm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; SSE-NEXT: movaps %xmm4, %xmm5
; SSE-NEXT: subps %xmm3, %xmm5
; SSE-NEXT: mulps %xmm2, %xmm5
; SSE-NEXT: addps %xmm2, %xmm5
; SSE-NEXT: movaps {{.*#+}} xmm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0]
; SSE-NEXT: movaps %xmm5, %xmm3
; SSE-NEXT: mulps %xmm2, %xmm3
; SSE-NEXT: mulps %xmm3, %xmm0
; SSE-NEXT: subps %xmm0, %xmm2
; SSE-NEXT: mulps %xmm5, %xmm2
; SSE-NEXT: addps %xmm3, %xmm2
; SSE-NEXT: rcpps %xmm1, %xmm0
; SSE-NEXT: movaps %xmm1, %xmm3
; SSE-NEXT: mulps %xmm0, %xmm3
; SSE-NEXT: subps %xmm3, %xmm4
; SSE-NEXT: mulps %xmm0, %xmm4
; SSE-NEXT: addps %xmm0, %xmm4
; SSE-NEXT: movaps {{.*#+}} xmm3 = [5.0E+0,6.0E+0,7.0E+0,8.0E+0]
; SSE-NEXT: movaps %xmm4, %xmm0
; SSE-NEXT: mulps %xmm3, %xmm0
; SSE-NEXT: mulps %xmm0, %xmm1
; SSE-NEXT: subps %xmm1, %xmm3
; SSE-NEXT: mulps %xmm4, %xmm3
; SSE-NEXT: addps %xmm0, %xmm3
; SSE-NEXT: movaps %xmm2, %xmm0
; SSE-NEXT: movaps %xmm3, %xmm1
; SSE-NEXT: retq
;
; AVX-RECIP-LABEL: v8f32_two_step2:
; AVX-RECIP: # %bb.0:
; AVX-RECIP-NEXT: vrcpps %ymm0, %ymm1
; AVX-RECIP-NEXT: vmulps %ymm1, %ymm0, %ymm2
; AVX-RECIP-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; AVX-RECIP-NEXT: vsubps %ymm2, %ymm3, %ymm2
; AVX-RECIP-NEXT: vmulps %ymm2, %ymm1, %ymm2
; AVX-RECIP-NEXT: vaddps %ymm2, %ymm1, %ymm1
; AVX-RECIP-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0]
; AVX-RECIP-NEXT: vmulps %ymm2, %ymm1, %ymm3
; AVX-RECIP-NEXT: vmulps %ymm3, %ymm0, %ymm0
; AVX-RECIP-NEXT: vsubps %ymm0, %ymm2, %ymm0
; AVX-RECIP-NEXT: vmulps %ymm0, %ymm1, %ymm0
; AVX-RECIP-NEXT: vaddps %ymm0, %ymm3, %ymm0
; AVX-RECIP-NEXT: retq
;
; FMA-RECIP-LABEL: v8f32_two_step2:
; FMA-RECIP: # %bb.0:
; FMA-RECIP-NEXT: vrcpps %ymm0, %ymm1
; FMA-RECIP-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; FMA-RECIP-NEXT: vfmsub231ps {{.*#+}} ymm2 = (ymm0 * ymm1) - ymm2
; FMA-RECIP-NEXT: vfnmadd132ps {{.*#+}} ymm2 = -(ymm2 * ymm1) + ymm1
; FMA-RECIP-NEXT: vmovaps {{.*#+}} ymm1 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0]
; FMA-RECIP-NEXT: vmulps %ymm1, %ymm2, %ymm3
; FMA-RECIP-NEXT: vfmsub213ps {{.*#+}} ymm0 = (ymm3 * ymm0) - ymm1
; FMA-RECIP-NEXT: vfnmadd213ps {{.*#+}} ymm0 = -(ymm2 * ymm0) + ymm3
; FMA-RECIP-NEXT: retq
;
; BDVER2-LABEL: v8f32_two_step2:
; BDVER2: # %bb.0:
; BDVER2-NEXT: vrcpps %ymm0, %ymm1
; BDVER2-NEXT: vfmsubps {{.*#+}} ymm2 = (ymm0 * ymm1) - mem
; BDVER2-NEXT: vmovaps {{.*#+}} ymm4 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0]
; BDVER2-NEXT: vfnmaddps {{.*#+}} ymm1 = -(ymm1 * ymm2) + ymm1
; BDVER2-NEXT: vmulps %ymm4, %ymm1, %ymm3
; BDVER2-NEXT: vfmsubps {{.*#+}} ymm0 = (ymm0 * ymm3) - ymm4
; BDVER2-NEXT: vfnmaddps {{.*#+}} ymm0 = -(ymm1 * ymm0) + ymm3
; BDVER2-NEXT: retq
;
; BTVER2-LABEL: v8f32_two_step2:
; BTVER2: # %bb.0:
; BTVER2-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; BTVER2-NEXT: vrcpps %ymm0, %ymm1
; BTVER2-NEXT: vmovaps {{.*#+}} ymm4 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0]
; BTVER2-NEXT: vmulps %ymm1, %ymm0, %ymm2
; BTVER2-NEXT: vsubps %ymm2, %ymm3, %ymm2
; BTVER2-NEXT: vmulps %ymm2, %ymm1, %ymm2
; BTVER2-NEXT: vaddps %ymm2, %ymm1, %ymm1
; BTVER2-NEXT: vmulps %ymm4, %ymm1, %ymm3
; BTVER2-NEXT: vmulps %ymm3, %ymm0, %ymm0
; BTVER2-NEXT: vsubps %ymm0, %ymm4, %ymm0
; BTVER2-NEXT: vmulps %ymm0, %ymm1, %ymm0
; BTVER2-NEXT: vaddps %ymm0, %ymm3, %ymm0
; BTVER2-NEXT: retq
;
; SANDY-LABEL: v8f32_two_step2:
; SANDY: # %bb.0:
; SANDY-NEXT: vrcpps %ymm0, %ymm1
; SANDY-NEXT: vmulps %ymm1, %ymm0, %ymm2
; SANDY-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; SANDY-NEXT: vsubps %ymm2, %ymm3, %ymm2
; SANDY-NEXT: vmulps %ymm2, %ymm1, %ymm2
; SANDY-NEXT: vaddps %ymm2, %ymm1, %ymm1
; SANDY-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0]
; SANDY-NEXT: vmulps %ymm2, %ymm1, %ymm3
; SANDY-NEXT: vmulps %ymm3, %ymm0, %ymm0
; SANDY-NEXT: vsubps %ymm0, %ymm2, %ymm0
; SANDY-NEXT: vmulps %ymm0, %ymm1, %ymm0
; SANDY-NEXT: vaddps %ymm0, %ymm3, %ymm0
; SANDY-NEXT: retq
;
; HASWELL-LABEL: v8f32_two_step2:
; HASWELL: # %bb.0:
; HASWELL-NEXT: vrcpps %ymm0, %ymm1
; HASWELL-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; HASWELL-NEXT: vfmsub231ps {{.*#+}} ymm2 = (ymm0 * ymm1) - ymm2
; HASWELL-NEXT: vfnmadd132ps {{.*#+}} ymm2 = -(ymm2 * ymm1) + ymm1
; HASWELL-NEXT: vmovaps {{.*#+}} ymm1 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0]
; HASWELL-NEXT: vmulps %ymm1, %ymm2, %ymm3
; HASWELL-NEXT: vfmsub213ps {{.*#+}} ymm0 = (ymm3 * ymm0) - ymm1
; HASWELL-NEXT: vfnmadd213ps {{.*#+}} ymm0 = -(ymm2 * ymm0) + ymm3
; HASWELL-NEXT: retq
;
; HASWELL-NO-FMA-LABEL: v8f32_two_step2:
; HASWELL-NO-FMA: # %bb.0:
; HASWELL-NO-FMA-NEXT: vrcpps %ymm0, %ymm1
; HASWELL-NO-FMA-NEXT: vmulps %ymm1, %ymm0, %ymm2
; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; HASWELL-NO-FMA-NEXT: vsubps %ymm2, %ymm3, %ymm2
; HASWELL-NO-FMA-NEXT: vmulps %ymm2, %ymm1, %ymm2
; HASWELL-NO-FMA-NEXT: vaddps %ymm2, %ymm1, %ymm1
; HASWELL-NO-FMA-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0]
; HASWELL-NO-FMA-NEXT: vmulps %ymm2, %ymm1, %ymm3
; HASWELL-NO-FMA-NEXT: vmulps %ymm3, %ymm0, %ymm0
; HASWELL-NO-FMA-NEXT: vsubps %ymm0, %ymm2, %ymm0
; HASWELL-NO-FMA-NEXT: vmulps %ymm0, %ymm1, %ymm0
; HASWELL-NO-FMA-NEXT: vaddps %ymm0, %ymm3, %ymm0
; HASWELL-NO-FMA-NEXT: retq
;
; AVX512-LABEL: v8f32_two_step2:
; AVX512: # %bb.0:
; AVX512-NEXT: vrcpps %ymm0, %ymm1
; AVX512-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; AVX512-NEXT: vfmsub231ps {{.*#+}} ymm2 = (ymm0 * ymm1) - ymm2
; AVX512-NEXT: vfnmadd132ps {{.*#+}} ymm2 = -(ymm2 * ymm1) + ymm1
; AVX512-NEXT: vmovaps {{.*#+}} ymm1 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0]
; AVX512-NEXT: vmulps %ymm1, %ymm2, %ymm3
; AVX512-NEXT: vfmsub213ps {{.*#+}} ymm0 = (ymm3 * ymm0) - ymm1
; AVX512-NEXT: vfnmadd213ps {{.*#+}} ymm0 = -(ymm2 * ymm0) + ymm3
; AVX512-NEXT: retq
  %div = fdiv fast <8 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0>, %x
  ret <8 x float> %div
}

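; No-step tests: the estimate is used unrefined. With an all-ones numerator
; the rcp result should be returned directly; v8f32_no_step2 (second below)
; still needs a final multiply by its non-uniform constant vector:
;   result = rcp(x)             ; v8f32_no_step
;   result = c * rcp(x)         ; v8f32_no_step2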
define <8 x float> @v8f32_no_step(<8 x float> %x) #3 {
; SSE-LABEL: v8f32_no_step:
; SSE: # %bb.0:
; SSE-NEXT: rcpps %xmm0, %xmm0
; SSE-NEXT: rcpps %xmm1, %xmm1
; SSE-NEXT: retq
;
; AVX-LABEL: v8f32_no_step:
; AVX: # %bb.0:
; AVX-NEXT: vrcpps %ymm0, %ymm0
; AVX-NEXT: retq
  %div = fdiv fast <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %x
  ret <8 x float> %div
}

define <8 x float> @v8f32_no_step2(<8 x float> %x) #3 {
; SSE-LABEL: v8f32_no_step2:
; SSE: # %bb.0:
; SSE-NEXT: rcpps %xmm0, %xmm0
; SSE-NEXT: mulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT: rcpps %xmm1, %xmm1
; SSE-NEXT: mulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE-NEXT: retq
;
; AVX-LABEL: v8f32_no_step2:
; AVX: # %bb.0:
; AVX-NEXT: vrcpps %ymm0, %ymm0
; AVX-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX-NEXT: retq
  %div = fdiv fast <8 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0>, %x
  ret <8 x float> %div
}

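; 512-bit tests: AVX512 targets should use the vrcp14ps zmm estimate with a
; single refinement sequence, while narrower targets split the computation
; into two ymm halves (AVX) or four xmm quarters (SSE), each with its own
; estimate and constant slice.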
define <16 x float> @v16f32_one_step2(<16 x float> %x) #1 {
; SSE-LABEL: v16f32_one_step2:
; SSE: # %bb.0:
; SSE-NEXT: movaps %xmm1, %xmm4
; SSE-NEXT: movaps %xmm0, %xmm1
; SSE-NEXT: rcpps %xmm0, %xmm5
; SSE-NEXT: movaps {{.*#+}} xmm0 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0]
; SSE-NEXT: movaps %xmm5, %xmm6
; SSE-NEXT: mulps %xmm0, %xmm6
; SSE-NEXT: mulps %xmm6, %xmm1
; SSE-NEXT: subps %xmm1, %xmm0
; SSE-NEXT: mulps %xmm5, %xmm0
; SSE-NEXT: addps %xmm6, %xmm0
; SSE-NEXT: rcpps %xmm4, %xmm5
; SSE-NEXT: movaps {{.*#+}} xmm1 = [5.0E+0,6.0E+0,7.0E+0,8.0E+0]
; SSE-NEXT: movaps %xmm5, %xmm6
; SSE-NEXT: mulps %xmm1, %xmm6
; SSE-NEXT: mulps %xmm6, %xmm4
; SSE-NEXT: subps %xmm4, %xmm1
; SSE-NEXT: mulps %xmm5, %xmm1
; SSE-NEXT: addps %xmm6, %xmm1
; SSE-NEXT: rcpps %xmm2, %xmm5
; SSE-NEXT: movaps {{.*#+}} xmm4 = [9.0E+0,1.0E+1,1.1E+1,1.2E+1]
; SSE-NEXT: movaps %xmm5, %xmm6
; SSE-NEXT: mulps %xmm4, %xmm6
; SSE-NEXT: mulps %xmm6, %xmm2
; SSE-NEXT: subps %xmm2, %xmm4
; SSE-NEXT: mulps %xmm5, %xmm4
; SSE-NEXT: addps %xmm6, %xmm4
; SSE-NEXT: rcpps %xmm3, %xmm2
; SSE-NEXT: movaps {{.*#+}} xmm5 = [1.3E+1,1.4E+1,1.5E+1,1.6E+1]
; SSE-NEXT: movaps %xmm2, %xmm6
; SSE-NEXT: mulps %xmm5, %xmm6
; SSE-NEXT: mulps %xmm6, %xmm3
; SSE-NEXT: subps %xmm3, %xmm5
; SSE-NEXT: mulps %xmm2, %xmm5
; SSE-NEXT: addps %xmm6, %xmm5
; SSE-NEXT: movaps %xmm4, %xmm2
; SSE-NEXT: movaps %xmm5, %xmm3
; SSE-NEXT: retq
;
; AVX-RECIP-LABEL: v16f32_one_step2:
; AVX-RECIP: # %bb.0:
; AVX-RECIP-NEXT: vrcpps %ymm0, %ymm2
; AVX-RECIP-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0]
; AVX-RECIP-NEXT: vmulps %ymm3, %ymm2, %ymm4
; AVX-RECIP-NEXT: vmulps %ymm4, %ymm0, %ymm0
; AVX-RECIP-NEXT: vsubps %ymm0, %ymm3, %ymm0
; AVX-RECIP-NEXT: vmulps %ymm0, %ymm2, %ymm0
; AVX-RECIP-NEXT: vaddps %ymm0, %ymm4, %ymm0
; AVX-RECIP-NEXT: vrcpps %ymm1, %ymm2
; AVX-RECIP-NEXT: vmovaps {{.*#+}} ymm3 = [9.0E+0,1.0E+1,1.1E+1,1.2E+1,1.3E+1,1.4E+1,1.5E+1,1.6E+1]
; AVX-RECIP-NEXT: vmulps %ymm3, %ymm2, %ymm4
; AVX-RECIP-NEXT: vmulps %ymm4, %ymm1, %ymm1
; AVX-RECIP-NEXT: vsubps %ymm1, %ymm3, %ymm1
; AVX-RECIP-NEXT: vmulps %ymm1, %ymm2, %ymm1
; AVX-RECIP-NEXT: vaddps %ymm1, %ymm4, %ymm1
; AVX-RECIP-NEXT: retq
;
; FMA-RECIP-LABEL: v16f32_one_step2:
; FMA-RECIP: # %bb.0:
; FMA-RECIP-NEXT: vrcpps %ymm0, %ymm2
; FMA-RECIP-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0]
; FMA-RECIP-NEXT: vmulps %ymm3, %ymm2, %ymm4
; FMA-RECIP-NEXT: vfmsub213ps {{.*#+}} ymm0 = (ymm4 * ymm0) - ymm3
; FMA-RECIP-NEXT: vfnmadd213ps {{.*#+}} ymm0 = -(ymm2 * ymm0) + ymm4
; FMA-RECIP-NEXT: vrcpps %ymm1, %ymm2
; FMA-RECIP-NEXT: vmovaps {{.*#+}} ymm3 = [9.0E+0,1.0E+1,1.1E+1,1.2E+1,1.3E+1,1.4E+1,1.5E+1,1.6E+1]
; FMA-RECIP-NEXT: vmulps %ymm3, %ymm2, %ymm4
; FMA-RECIP-NEXT: vfmsub213ps {{.*#+}} ymm1 = (ymm4 * ymm1) - ymm3
; FMA-RECIP-NEXT: vfnmadd213ps {{.*#+}} ymm1 = -(ymm2 * ymm1) + ymm4
; FMA-RECIP-NEXT: retq
;
; BDVER2-LABEL: v16f32_one_step2:
; BDVER2: # %bb.0:
; BDVER2-NEXT: vrcpps %ymm0, %ymm2
; BDVER2-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0]
; BDVER2-NEXT: vrcpps %ymm1, %ymm5
; BDVER2-NEXT: vmulps %ymm3, %ymm2, %ymm4
; BDVER2-NEXT: vfmsubps {{.*#+}} ymm0 = (ymm0 * ymm4) - ymm3
; BDVER2-NEXT: vmovaps {{.*#+}} ymm3 = [9.0E+0,1.0E+1,1.1E+1,1.2E+1,1.3E+1,1.4E+1,1.5E+1,1.6E+1]
; BDVER2-NEXT: vfnmaddps {{.*#+}} ymm0 = -(ymm2 * ymm0) + ymm4
; BDVER2-NEXT: vmulps %ymm3, %ymm5, %ymm4
; BDVER2-NEXT: vfmsubps {{.*#+}} ymm1 = (ymm1 * ymm4) - ymm3
; BDVER2-NEXT: vfnmaddps {{.*#+}} ymm1 = -(ymm5 * ymm1) + ymm4
; BDVER2-NEXT: retq
;
; BTVER2-LABEL: v16f32_one_step2:
; BTVER2: # %bb.0:
; BTVER2-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0]
; BTVER2-NEXT: vrcpps %ymm0, %ymm2
; BTVER2-NEXT: vmulps %ymm3, %ymm2, %ymm4
; BTVER2-NEXT: vmulps %ymm4, %ymm0, %ymm0
; BTVER2-NEXT: vsubps %ymm0, %ymm3, %ymm0
; BTVER2-NEXT: vmovaps {{.*#+}} ymm3 = [9.0E+0,1.0E+1,1.1E+1,1.2E+1,1.3E+1,1.4E+1,1.5E+1,1.6E+1]
; BTVER2-NEXT: vmulps %ymm0, %ymm2, %ymm0
; BTVER2-NEXT: vrcpps %ymm1, %ymm2
; BTVER2-NEXT: vmulps %ymm3, %ymm2, %ymm5
; BTVER2-NEXT: vaddps %ymm0, %ymm4, %ymm0
; BTVER2-NEXT: vmulps %ymm5, %ymm1, %ymm1
; BTVER2-NEXT: vsubps %ymm1, %ymm3, %ymm1
; BTVER2-NEXT: vmulps %ymm1, %ymm2, %ymm1
; BTVER2-NEXT: vaddps %ymm1, %ymm5, %ymm1
; BTVER2-NEXT: retq
;
; SANDY-LABEL: v16f32_one_step2:
; SANDY: # %bb.0:
; SANDY-NEXT: vrcpps %ymm0, %ymm2
; SANDY-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0]
; SANDY-NEXT: vmulps %ymm3, %ymm2, %ymm4
; SANDY-NEXT: vmulps %ymm4, %ymm0, %ymm0
; SANDY-NEXT: vsubps %ymm0, %ymm3, %ymm0
; SANDY-NEXT: vmulps %ymm0, %ymm2, %ymm0
; SANDY-NEXT: vaddps %ymm0, %ymm4, %ymm0
; SANDY-NEXT: vrcpps %ymm1, %ymm2
; SANDY-NEXT: vmovaps {{.*#+}} ymm3 = [9.0E+0,1.0E+1,1.1E+1,1.2E+1,1.3E+1,1.4E+1,1.5E+1,1.6E+1]
; SANDY-NEXT: vmulps %ymm3, %ymm2, %ymm4
; SANDY-NEXT: vmulps %ymm4, %ymm1, %ymm1
; SANDY-NEXT: vsubps %ymm1, %ymm3, %ymm1
; SANDY-NEXT: vmulps %ymm1, %ymm2, %ymm1
; SANDY-NEXT: vaddps %ymm1, %ymm4, %ymm1
; SANDY-NEXT: retq
;
; HASWELL-LABEL: v16f32_one_step2:
; HASWELL: # %bb.0:
; HASWELL-NEXT: vrcpps %ymm0, %ymm2
; HASWELL-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0]
; HASWELL-NEXT: vmulps %ymm3, %ymm2, %ymm4
; HASWELL-NEXT: vfmsub213ps {{.*#+}} ymm0 = (ymm4 * ymm0) - ymm3
; HASWELL-NEXT: vfnmadd213ps {{.*#+}} ymm0 = -(ymm2 * ymm0) + ymm4
; HASWELL-NEXT: vrcpps %ymm1, %ymm2
; HASWELL-NEXT: vmovaps {{.*#+}} ymm3 = [9.0E+0,1.0E+1,1.1E+1,1.2E+1,1.3E+1,1.4E+1,1.5E+1,1.6E+1]
; HASWELL-NEXT: vmulps %ymm3, %ymm2, %ymm4
; HASWELL-NEXT: vfmsub213ps {{.*#+}} ymm1 = (ymm4 * ymm1) - ymm3
; HASWELL-NEXT: vfnmadd213ps {{.*#+}} ymm1 = -(ymm2 * ymm1) + ymm4
; HASWELL-NEXT: retq
;
; HASWELL-NO-FMA-LABEL: v16f32_one_step2:
; HASWELL-NO-FMA: # %bb.0:
; HASWELL-NO-FMA-NEXT: vrcpps %ymm0, %ymm2
; HASWELL-NO-FMA-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0]
; HASWELL-NO-FMA-NEXT: vmulps %ymm3, %ymm2, %ymm4
; HASWELL-NO-FMA-NEXT: vmulps %ymm4, %ymm0, %ymm0
; HASWELL-NO-FMA-NEXT: vsubps %ymm0, %ymm3, %ymm0
; HASWELL-NO-FMA-NEXT: vrcpps %ymm1, %ymm3
; HASWELL-NO-FMA-NEXT: vmulps %ymm0, %ymm2, %ymm0
; HASWELL-NO-FMA-NEXT: vaddps %ymm0, %ymm4, %ymm0
; HASWELL-NO-FMA-NEXT: vmovaps {{.*#+}} ymm2 = [9.0E+0,1.0E+1,1.1E+1,1.2E+1,1.3E+1,1.4E+1,1.5E+1,1.6E+1]
; HASWELL-NO-FMA-NEXT: vmulps %ymm2, %ymm3, %ymm4
; HASWELL-NO-FMA-NEXT: vmulps %ymm4, %ymm1, %ymm1
; HASWELL-NO-FMA-NEXT: vsubps %ymm1, %ymm2, %ymm1
; HASWELL-NO-FMA-NEXT: vmulps %ymm1, %ymm3, %ymm1
; HASWELL-NO-FMA-NEXT: vaddps %ymm1, %ymm4, %ymm1
; HASWELL-NO-FMA-NEXT: retq
;
; AVX512-LABEL: v16f32_one_step2:
; AVX512: # %bb.0:
; AVX512-NEXT: vrcp14ps %zmm0, %zmm1
; AVX512-NEXT: vmovaps {{.*#+}} zmm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0,9.0E+0,1.0E+1,1.1E+1,1.2E+1,1.3E+1,1.4E+1,1.5E+1,1.6E+1]
; AVX512-NEXT: vmulps %zmm2, %zmm1, %zmm3
; AVX512-NEXT: vfmsub213ps {{.*#+}} zmm0 = (zmm3 * zmm0) - zmm2
; AVX512-NEXT: vfnmadd213ps {{.*#+}} zmm0 = -(zmm1 * zmm0) + zmm3
; AVX512-NEXT: retq
  %div = fdiv fast <16 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0, float 9.0, float 10.0, float 11.0, float 12.0, float 13.0, float 14.0, float 15.0, float 16.0>, %x
  ret <16 x float> %div
}

1285 define <16 x float> @v16f32_one_step_2_divs(<16 x float> %x) #1 {
1286 ; SSE-LABEL: v16f32_one_step_2_divs:
1288 ; SSE-NEXT: rcpps %xmm0, %xmm6
1289 ; SSE-NEXT: mulps %xmm6, %xmm0
1290 ; SSE-NEXT: movaps {{.*#+}} xmm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
1291 ; SSE-NEXT: movaps %xmm4, %xmm5
1292 ; SSE-NEXT: subps %xmm0, %xmm5
1293 ; SSE-NEXT: mulps %xmm6, %xmm5
1294 ; SSE-NEXT: addps %xmm6, %xmm5
1295 ; SSE-NEXT: rcpps %xmm1, %xmm0
1296 ; SSE-NEXT: mulps %xmm0, %xmm1
1297 ; SSE-NEXT: movaps %xmm4, %xmm6
1298 ; SSE-NEXT: subps %xmm1, %xmm6
1299 ; SSE-NEXT: mulps %xmm0, %xmm6
1300 ; SSE-NEXT: addps %xmm0, %xmm6
1301 ; SSE-NEXT: rcpps %xmm2, %xmm0
1302 ; SSE-NEXT: mulps %xmm0, %xmm2
1303 ; SSE-NEXT: movaps %xmm4, %xmm7
1304 ; SSE-NEXT: subps %xmm2, %xmm7
1305 ; SSE-NEXT: mulps %xmm0, %xmm7
1306 ; SSE-NEXT: addps %xmm0, %xmm7
1307 ; SSE-NEXT: rcpps %xmm3, %xmm0
1308 ; SSE-NEXT: mulps %xmm0, %xmm3
1309 ; SSE-NEXT: subps %xmm3, %xmm4
1310 ; SSE-NEXT: mulps %xmm0, %xmm4
1311 ; SSE-NEXT: addps %xmm0, %xmm4
1312 ; SSE-NEXT: movaps {{.*#+}} xmm3 = [1.3E+1,1.4E+1,1.5E+1,1.6E+1]
1313 ; SSE-NEXT: mulps %xmm4, %xmm3
1314 ; SSE-NEXT: movaps {{.*#+}} xmm2 = [9.0E+0,1.0E+1,1.1E+1,1.2E+1]
1315 ; SSE-NEXT: mulps %xmm7, %xmm2
1316 ; SSE-NEXT: movaps {{.*#+}} xmm1 = [5.0E+0,6.0E+0,7.0E+0,8.0E+0]
1317 ; SSE-NEXT: mulps %xmm6, %xmm1
1318 ; SSE-NEXT: movaps {{.*#+}} xmm0 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0]
1319 ; SSE-NEXT: mulps %xmm5, %xmm0
1320 ; SSE-NEXT: mulps %xmm5, %xmm0
1321 ; SSE-NEXT: mulps %xmm6, %xmm1
1322 ; SSE-NEXT: mulps %xmm7, %xmm2
1323 ; SSE-NEXT: mulps %xmm4, %xmm3
1326 ; AVX-RECIP-LABEL: v16f32_one_step_2_divs:
1327 ; AVX-RECIP: # %bb.0:
1328 ; AVX-RECIP-NEXT: vrcpps %ymm0, %ymm2
1329 ; AVX-RECIP-NEXT: vmulps %ymm2, %ymm0, %ymm0
1330 ; AVX-RECIP-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
1331 ; AVX-RECIP-NEXT: vsubps %ymm0, %ymm3, %ymm0
1332 ; AVX-RECIP-NEXT: vmulps %ymm0, %ymm2, %ymm0
1333 ; AVX-RECIP-NEXT: vaddps %ymm0, %ymm2, %ymm0
1334 ; AVX-RECIP-NEXT: vrcpps %ymm1, %ymm2
1335 ; AVX-RECIP-NEXT: vmulps %ymm2, %ymm1, %ymm1
1336 ; AVX-RECIP-NEXT: vsubps %ymm1, %ymm3, %ymm1
1337 ; AVX-RECIP-NEXT: vmulps %ymm1, %ymm2, %ymm1
1338 ; AVX-RECIP-NEXT: vaddps %ymm1, %ymm2, %ymm1
1339 ; AVX-RECIP-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm2
1340 ; AVX-RECIP-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm3
1341 ; AVX-RECIP-NEXT: vmulps %ymm0, %ymm3, %ymm0
1342 ; AVX-RECIP-NEXT: vmulps %ymm1, %ymm2, %ymm1
1343 ; AVX-RECIP-NEXT: retq
; FMA-RECIP-LABEL: v16f32_one_step_2_divs:
; FMA-RECIP: # %bb.0:
; FMA-RECIP-NEXT: vrcpps %ymm0, %ymm2
; FMA-RECIP-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; FMA-RECIP-NEXT: vfmsub213ps {{.*#+}} ymm0 = (ymm2 * ymm0) - ymm3
; FMA-RECIP-NEXT: vfnmadd132ps {{.*#+}} ymm0 = -(ymm0 * ymm2) + ymm2
; FMA-RECIP-NEXT: vrcpps %ymm1, %ymm2
; FMA-RECIP-NEXT: vfmsub213ps {{.*#+}} ymm1 = (ymm2 * ymm1) - ymm3
; FMA-RECIP-NEXT: vfnmadd132ps {{.*#+}} ymm1 = -(ymm1 * ymm2) + ymm2
; FMA-RECIP-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm2
; FMA-RECIP-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm3
; FMA-RECIP-NEXT: vmulps %ymm0, %ymm3, %ymm0
; FMA-RECIP-NEXT: vmulps %ymm1, %ymm2, %ymm1
; FMA-RECIP-NEXT: retq
;
; BDVER2-LABEL: v16f32_one_step_2_divs:
; BDVER2: # %bb.0:
; BDVER2-NEXT: vrcpps %ymm0, %ymm2
; BDVER2-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; BDVER2-NEXT: vfmsubps {{.*#+}} ymm0 = (ymm0 * ymm2) - ymm3
; BDVER2-NEXT: vfnmaddps {{.*#+}} ymm0 = -(ymm2 * ymm0) + ymm2
; BDVER2-NEXT: vrcpps %ymm1, %ymm2
; BDVER2-NEXT: vfmsubps {{.*#+}} ymm1 = (ymm1 * ymm2) - ymm3
; BDVER2-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm3
; BDVER2-NEXT: vfnmaddps {{.*#+}} ymm1 = -(ymm2 * ymm1) + ymm2
; BDVER2-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm2
; BDVER2-NEXT: vmulps %ymm0, %ymm3, %ymm0
; BDVER2-NEXT: vmulps %ymm1, %ymm2, %ymm1
; BDVER2-NEXT: retq
;
; BTVER2-LABEL: v16f32_one_step_2_divs:
; BTVER2: # %bb.0:
; BTVER2-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; BTVER2-NEXT: vrcpps %ymm0, %ymm2
; BTVER2-NEXT: vmulps %ymm2, %ymm0, %ymm0
; BTVER2-NEXT: vsubps %ymm0, %ymm3, %ymm0
; BTVER2-NEXT: vmulps %ymm0, %ymm2, %ymm0
; BTVER2-NEXT: vaddps %ymm0, %ymm2, %ymm0
; BTVER2-NEXT: vrcpps %ymm1, %ymm2
; BTVER2-NEXT: vmulps %ymm2, %ymm1, %ymm1
; BTVER2-NEXT: vsubps %ymm1, %ymm3, %ymm1
; BTVER2-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm3
; BTVER2-NEXT: vmulps %ymm1, %ymm2, %ymm1
; BTVER2-NEXT: vaddps %ymm1, %ymm2, %ymm1
; BTVER2-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm2
; BTVER2-NEXT: vmulps %ymm0, %ymm3, %ymm0
; BTVER2-NEXT: vmulps %ymm1, %ymm2, %ymm1
; BTVER2-NEXT: retq
;
; SANDY-LABEL: v16f32_one_step_2_divs:
; SANDY: # %bb.0:
; SANDY-NEXT: vrcpps %ymm0, %ymm2
; SANDY-NEXT: vmulps %ymm2, %ymm0, %ymm0
; SANDY-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; SANDY-NEXT: vsubps %ymm0, %ymm3, %ymm0
; SANDY-NEXT: vmulps %ymm0, %ymm2, %ymm0
; SANDY-NEXT: vaddps %ymm0, %ymm2, %ymm0
; SANDY-NEXT: vrcpps %ymm1, %ymm2
; SANDY-NEXT: vmulps %ymm2, %ymm1, %ymm1
; SANDY-NEXT: vsubps %ymm1, %ymm3, %ymm1
; SANDY-NEXT: vmulps %ymm1, %ymm2, %ymm1
; SANDY-NEXT: vaddps %ymm1, %ymm2, %ymm1
; SANDY-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm2
; SANDY-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm3
; SANDY-NEXT: vmulps %ymm0, %ymm3, %ymm0
; SANDY-NEXT: vmulps %ymm1, %ymm2, %ymm1
; SANDY-NEXT: retq
;
; HASWELL-LABEL: v16f32_one_step_2_divs:
; HASWELL: # %bb.0:
; HASWELL-NEXT: vrcpps %ymm0, %ymm2
; HASWELL-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; HASWELL-NEXT: vfmsub213ps {{.*#+}} ymm0 = (ymm2 * ymm0) - ymm3
; HASWELL-NEXT: vfnmadd132ps {{.*#+}} ymm0 = -(ymm0 * ymm2) + ymm2
; HASWELL-NEXT: vrcpps %ymm1, %ymm2
; HASWELL-NEXT: vfmsub213ps {{.*#+}} ymm1 = (ymm2 * ymm1) - ymm3
; HASWELL-NEXT: vfnmadd132ps {{.*#+}} ymm1 = -(ymm1 * ymm2) + ymm2
; HASWELL-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm2
; HASWELL-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm3
; HASWELL-NEXT: vmulps %ymm0, %ymm3, %ymm0
; HASWELL-NEXT: vmulps %ymm1, %ymm2, %ymm1
; HASWELL-NEXT: retq
;
; HASWELL-NO-FMA-LABEL: v16f32_one_step_2_divs:
; HASWELL-NO-FMA: # %bb.0:
; HASWELL-NO-FMA-NEXT: vrcpps %ymm0, %ymm2
; HASWELL-NO-FMA-NEXT: vmulps %ymm2, %ymm0, %ymm0
; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; HASWELL-NO-FMA-NEXT: vsubps %ymm0, %ymm3, %ymm0
; HASWELL-NO-FMA-NEXT: vrcpps %ymm1, %ymm4
; HASWELL-NO-FMA-NEXT: vmulps %ymm0, %ymm2, %ymm0
; HASWELL-NO-FMA-NEXT: vaddps %ymm0, %ymm2, %ymm0
; HASWELL-NO-FMA-NEXT: vmulps %ymm4, %ymm1, %ymm1
; HASWELL-NO-FMA-NEXT: vsubps %ymm1, %ymm3, %ymm1
; HASWELL-NO-FMA-NEXT: vmulps %ymm1, %ymm4, %ymm1
; HASWELL-NO-FMA-NEXT: vaddps %ymm1, %ymm4, %ymm1
; HASWELL-NO-FMA-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm2
; HASWELL-NO-FMA-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm3
; HASWELL-NO-FMA-NEXT: vmulps %ymm0, %ymm3, %ymm0
; HASWELL-NO-FMA-NEXT: vmulps %ymm1, %ymm2, %ymm1
; HASWELL-NO-FMA-NEXT: retq
;
; AVX512-LABEL: v16f32_one_step_2_divs:
; AVX512: # %bb.0:
; AVX512-NEXT: vrcp14ps %zmm0, %zmm1
; AVX512-NEXT: vfmsub213ps {{.*#+}} zmm0 = (zmm1 * zmm0) - mem
; AVX512-NEXT: vfnmadd132ps {{.*#+}} zmm0 = -(zmm0 * zmm1) + zmm1
; AVX512-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1
; AVX512-NEXT: vmulps %zmm0, %zmm1, %zmm0
; AVX512-NEXT: retq
  %div = fdiv fast <16 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0, float 9.0, float 10.0, float 11.0, float 12.0, float 13.0, float 14.0, float 15.0, float 16.0>, %x
  %div2 = fdiv fast <16 x float> %div, %x
  ret <16 x float> %div2
}

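; v16f32_two_step2 is built with attribute #2 ("divf:2,vec-divf:2"), which asks
; for two refinement steps per estimate: an unscaled step e' = e + e*(1 - x*e)
; followed by a scaled step t = c*e'; t + e'*(c - x*t) that folds in the
; non-uniform numerator.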
define <16 x float> @v16f32_two_step2(<16 x float> %x) #2 {
; SSE-LABEL: v16f32_two_step2:
; SSE: # %bb.0:
; SSE-NEXT: movaps %xmm1, %xmm4
; SSE-NEXT: movaps %xmm0, %xmm1
; SSE-NEXT: rcpps %xmm0, %xmm0
; SSE-NEXT: movaps %xmm1, %xmm5
; SSE-NEXT: mulps %xmm0, %xmm5
; SSE-NEXT: movaps {{.*#+}} xmm6 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; SSE-NEXT: movaps %xmm6, %xmm7
; SSE-NEXT: subps %xmm5, %xmm7
; SSE-NEXT: mulps %xmm0, %xmm7
; SSE-NEXT: addps %xmm0, %xmm7
; SSE-NEXT: movaps {{.*#+}} xmm0 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0]
; SSE-NEXT: movaps %xmm7, %xmm5
; SSE-NEXT: mulps %xmm0, %xmm5
; SSE-NEXT: mulps %xmm5, %xmm1
; SSE-NEXT: subps %xmm1, %xmm0
; SSE-NEXT: mulps %xmm7, %xmm0
; SSE-NEXT: addps %xmm5, %xmm0
; SSE-NEXT: rcpps %xmm4, %xmm1
; SSE-NEXT: movaps %xmm4, %xmm5
; SSE-NEXT: mulps %xmm1, %xmm5
; SSE-NEXT: movaps %xmm6, %xmm7
; SSE-NEXT: subps %xmm5, %xmm7
; SSE-NEXT: mulps %xmm1, %xmm7
; SSE-NEXT: addps %xmm1, %xmm7
; SSE-NEXT: movaps {{.*#+}} xmm1 = [5.0E+0,6.0E+0,7.0E+0,8.0E+0]
; SSE-NEXT: movaps %xmm7, %xmm5
; SSE-NEXT: mulps %xmm1, %xmm5
; SSE-NEXT: mulps %xmm5, %xmm4
; SSE-NEXT: subps %xmm4, %xmm1
; SSE-NEXT: mulps %xmm7, %xmm1
; SSE-NEXT: addps %xmm5, %xmm1
; SSE-NEXT: rcpps %xmm2, %xmm4
; SSE-NEXT: movaps %xmm2, %xmm5
; SSE-NEXT: mulps %xmm4, %xmm5
; SSE-NEXT: movaps %xmm6, %xmm7
; SSE-NEXT: subps %xmm5, %xmm7
; SSE-NEXT: mulps %xmm4, %xmm7
; SSE-NEXT: addps %xmm4, %xmm7
; SSE-NEXT: movaps {{.*#+}} xmm4 = [9.0E+0,1.0E+1,1.1E+1,1.2E+1]
; SSE-NEXT: movaps %xmm7, %xmm5
; SSE-NEXT: mulps %xmm4, %xmm5
; SSE-NEXT: mulps %xmm5, %xmm2
; SSE-NEXT: subps %xmm2, %xmm4
; SSE-NEXT: mulps %xmm7, %xmm4
; SSE-NEXT: addps %xmm5, %xmm4
; SSE-NEXT: rcpps %xmm3, %xmm2
; SSE-NEXT: movaps %xmm3, %xmm5
; SSE-NEXT: mulps %xmm2, %xmm5
; SSE-NEXT: subps %xmm5, %xmm6
; SSE-NEXT: mulps %xmm2, %xmm6
; SSE-NEXT: addps %xmm2, %xmm6
; SSE-NEXT: movaps {{.*#+}} xmm5 = [1.3E+1,1.4E+1,1.5E+1,1.6E+1]
; SSE-NEXT: movaps %xmm6, %xmm2
; SSE-NEXT: mulps %xmm5, %xmm2
; SSE-NEXT: mulps %xmm2, %xmm3
; SSE-NEXT: subps %xmm3, %xmm5
; SSE-NEXT: mulps %xmm6, %xmm5
; SSE-NEXT: addps %xmm2, %xmm5
; SSE-NEXT: movaps %xmm4, %xmm2
; SSE-NEXT: movaps %xmm5, %xmm3
; SSE-NEXT: retq
;
; AVX-RECIP-LABEL: v16f32_two_step2:
; AVX-RECIP: # %bb.0:
; AVX-RECIP-NEXT: vrcpps %ymm0, %ymm2
; AVX-RECIP-NEXT: vmulps %ymm2, %ymm0, %ymm3
; AVX-RECIP-NEXT: vbroadcastss {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; AVX-RECIP-NEXT: vsubps %ymm3, %ymm4, %ymm3
; AVX-RECIP-NEXT: vmulps %ymm3, %ymm2, %ymm3
; AVX-RECIP-NEXT: vaddps %ymm3, %ymm2, %ymm2
; AVX-RECIP-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0]
; AVX-RECIP-NEXT: vmulps %ymm3, %ymm2, %ymm5
; AVX-RECIP-NEXT: vmulps %ymm5, %ymm0, %ymm0
; AVX-RECIP-NEXT: vsubps %ymm0, %ymm3, %ymm0
; AVX-RECIP-NEXT: vmulps %ymm0, %ymm2, %ymm0
; AVX-RECIP-NEXT: vaddps %ymm0, %ymm5, %ymm0
; AVX-RECIP-NEXT: vrcpps %ymm1, %ymm2
; AVX-RECIP-NEXT: vmulps %ymm2, %ymm1, %ymm3
; AVX-RECIP-NEXT: vsubps %ymm3, %ymm4, %ymm3
; AVX-RECIP-NEXT: vmulps %ymm3, %ymm2, %ymm3
; AVX-RECIP-NEXT: vaddps %ymm3, %ymm2, %ymm2
; AVX-RECIP-NEXT: vmovaps {{.*#+}} ymm3 = [9.0E+0,1.0E+1,1.1E+1,1.2E+1,1.3E+1,1.4E+1,1.5E+1,1.6E+1]
; AVX-RECIP-NEXT: vmulps %ymm3, %ymm2, %ymm4
; AVX-RECIP-NEXT: vmulps %ymm4, %ymm1, %ymm1
; AVX-RECIP-NEXT: vsubps %ymm1, %ymm3, %ymm1
; AVX-RECIP-NEXT: vmulps %ymm1, %ymm2, %ymm1
; AVX-RECIP-NEXT: vaddps %ymm1, %ymm4, %ymm1
; AVX-RECIP-NEXT: retq
;
; FMA-RECIP-LABEL: v16f32_two_step2:
; FMA-RECIP: # %bb.0:
; FMA-RECIP-NEXT: vrcpps %ymm0, %ymm2
; FMA-RECIP-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; FMA-RECIP-NEXT: vmovaps %ymm2, %ymm4
; FMA-RECIP-NEXT: vfmsub213ps {{.*#+}} ymm4 = (ymm0 * ymm4) - ymm3
; FMA-RECIP-NEXT: vfnmadd132ps {{.*#+}} ymm4 = -(ymm4 * ymm2) + ymm2
; FMA-RECIP-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0]
; FMA-RECIP-NEXT: vmulps %ymm2, %ymm4, %ymm5
; FMA-RECIP-NEXT: vfmsub213ps {{.*#+}} ymm0 = (ymm5 * ymm0) - ymm2
; FMA-RECIP-NEXT: vfnmadd213ps {{.*#+}} ymm0 = -(ymm4 * ymm0) + ymm5
; FMA-RECIP-NEXT: vrcpps %ymm1, %ymm2
; FMA-RECIP-NEXT: vfmsub231ps {{.*#+}} ymm3 = (ymm1 * ymm2) - ymm3
; FMA-RECIP-NEXT: vfnmadd132ps {{.*#+}} ymm3 = -(ymm3 * ymm2) + ymm2
; FMA-RECIP-NEXT: vmovaps {{.*#+}} ymm2 = [9.0E+0,1.0E+1,1.1E+1,1.2E+1,1.3E+1,1.4E+1,1.5E+1,1.6E+1]
; FMA-RECIP-NEXT: vmulps %ymm2, %ymm3, %ymm4
; FMA-RECIP-NEXT: vfmsub213ps {{.*#+}} ymm1 = (ymm4 * ymm1) - ymm2
; FMA-RECIP-NEXT: vfnmadd213ps {{.*#+}} ymm1 = -(ymm3 * ymm1) + ymm4
; FMA-RECIP-NEXT: retq
;
; BDVER2-LABEL: v16f32_two_step2:
; BDVER2: # %bb.0:
; BDVER2-NEXT: vrcpps %ymm0, %ymm2
; BDVER2-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; BDVER2-NEXT: vfmsubps {{.*#+}} ymm4 = (ymm0 * ymm2) - ymm3
; BDVER2-NEXT: vfnmaddps {{.*#+}} ymm2 = -(ymm2 * ymm4) + ymm2
; BDVER2-NEXT: vmovaps {{.*#+}} ymm4 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0]
; BDVER2-NEXT: vmulps %ymm4, %ymm2, %ymm5
; BDVER2-NEXT: vfmsubps {{.*#+}} ymm0 = (ymm0 * ymm5) - ymm4
; BDVER2-NEXT: vfnmaddps {{.*#+}} ymm0 = -(ymm2 * ymm0) + ymm5
; BDVER2-NEXT: vrcpps %ymm1, %ymm2
; BDVER2-NEXT: vmovaps {{.*#+}} ymm5 = [9.0E+0,1.0E+1,1.1E+1,1.2E+1,1.3E+1,1.4E+1,1.5E+1,1.6E+1]
; BDVER2-NEXT: vfmsubps {{.*#+}} ymm3 = (ymm1 * ymm2) - ymm3
; BDVER2-NEXT: vfnmaddps {{.*#+}} ymm2 = -(ymm2 * ymm3) + ymm2
; BDVER2-NEXT: vmulps %ymm5, %ymm2, %ymm4
; BDVER2-NEXT: vfmsubps {{.*#+}} ymm1 = (ymm1 * ymm4) - ymm5
; BDVER2-NEXT: vfnmaddps {{.*#+}} ymm1 = -(ymm2 * ymm1) + ymm4
; BDVER2-NEXT: retq
;
; BTVER2-LABEL: v16f32_two_step2:
; BTVER2: # %bb.0:
; BTVER2-NEXT: vbroadcastss {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; BTVER2-NEXT: vrcpps %ymm0, %ymm2
; BTVER2-NEXT: vmulps %ymm2, %ymm0, %ymm3
; BTVER2-NEXT: vsubps %ymm3, %ymm4, %ymm3
; BTVER2-NEXT: vmulps %ymm3, %ymm2, %ymm3
; BTVER2-NEXT: vaddps %ymm3, %ymm2, %ymm2
; BTVER2-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0]
; BTVER2-NEXT: vmulps %ymm3, %ymm2, %ymm5
; BTVER2-NEXT: vmulps %ymm5, %ymm0, %ymm0
; BTVER2-NEXT: vsubps %ymm0, %ymm3, %ymm0
; BTVER2-NEXT: vmulps %ymm0, %ymm2, %ymm0
; BTVER2-NEXT: vrcpps %ymm1, %ymm2
; BTVER2-NEXT: vmulps %ymm2, %ymm1, %ymm3
; BTVER2-NEXT: vaddps %ymm0, %ymm5, %ymm0
; BTVER2-NEXT: vmovaps {{.*#+}} ymm5 = [9.0E+0,1.0E+1,1.1E+1,1.2E+1,1.3E+1,1.4E+1,1.5E+1,1.6E+1]
; BTVER2-NEXT: vsubps %ymm3, %ymm4, %ymm3
; BTVER2-NEXT: vmulps %ymm3, %ymm2, %ymm3
; BTVER2-NEXT: vaddps %ymm3, %ymm2, %ymm2
; BTVER2-NEXT: vmulps %ymm5, %ymm2, %ymm4
; BTVER2-NEXT: vmulps %ymm4, %ymm1, %ymm1
; BTVER2-NEXT: vsubps %ymm1, %ymm5, %ymm1
; BTVER2-NEXT: vmulps %ymm1, %ymm2, %ymm1
; BTVER2-NEXT: vaddps %ymm1, %ymm4, %ymm1
; BTVER2-NEXT: retq
;
; SANDY-LABEL: v16f32_two_step2:
; SANDY: # %bb.0:
; SANDY-NEXT: vrcpps %ymm0, %ymm2
; SANDY-NEXT: vmulps %ymm2, %ymm0, %ymm3
; SANDY-NEXT: vbroadcastss {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; SANDY-NEXT: vsubps %ymm3, %ymm4, %ymm3
; SANDY-NEXT: vmulps %ymm3, %ymm2, %ymm3
; SANDY-NEXT: vaddps %ymm3, %ymm2, %ymm2
; SANDY-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0]
; SANDY-NEXT: vmulps %ymm3, %ymm2, %ymm5
; SANDY-NEXT: vmulps %ymm5, %ymm0, %ymm0
; SANDY-NEXT: vsubps %ymm0, %ymm3, %ymm0
; SANDY-NEXT: vmulps %ymm0, %ymm2, %ymm0
; SANDY-NEXT: vaddps %ymm0, %ymm5, %ymm0
; SANDY-NEXT: vrcpps %ymm1, %ymm2
; SANDY-NEXT: vmulps %ymm2, %ymm1, %ymm3
; SANDY-NEXT: vsubps %ymm3, %ymm4, %ymm3
; SANDY-NEXT: vmulps %ymm3, %ymm2, %ymm3
; SANDY-NEXT: vaddps %ymm3, %ymm2, %ymm2
; SANDY-NEXT: vmovaps {{.*#+}} ymm3 = [9.0E+0,1.0E+1,1.1E+1,1.2E+1,1.3E+1,1.4E+1,1.5E+1,1.6E+1]
; SANDY-NEXT: vmulps %ymm3, %ymm2, %ymm4
; SANDY-NEXT: vmulps %ymm4, %ymm1, %ymm1
; SANDY-NEXT: vsubps %ymm1, %ymm3, %ymm1
; SANDY-NEXT: vmulps %ymm1, %ymm2, %ymm1
; SANDY-NEXT: vaddps %ymm1, %ymm4, %ymm1
; SANDY-NEXT: retq
;
; HASWELL-LABEL: v16f32_two_step2:
; HASWELL: # %bb.0:
; HASWELL-NEXT: vrcpps %ymm0, %ymm2
; HASWELL-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; HASWELL-NEXT: vmovaps %ymm2, %ymm4
; HASWELL-NEXT: vfmsub213ps {{.*#+}} ymm4 = (ymm0 * ymm4) - ymm3
; HASWELL-NEXT: vfnmadd132ps {{.*#+}} ymm4 = -(ymm4 * ymm2) + ymm2
; HASWELL-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0]
; HASWELL-NEXT: vmulps %ymm2, %ymm4, %ymm5
; HASWELL-NEXT: vrcpps %ymm1, %ymm6
; HASWELL-NEXT: vfmsub213ps {{.*#+}} ymm0 = (ymm5 * ymm0) - ymm2
; HASWELL-NEXT: vfnmadd213ps {{.*#+}} ymm0 = -(ymm4 * ymm0) + ymm5
; HASWELL-NEXT: vfmsub231ps {{.*#+}} ymm3 = (ymm1 * ymm6) - ymm3
; HASWELL-NEXT: vfnmadd132ps {{.*#+}} ymm3 = -(ymm3 * ymm6) + ymm6
; HASWELL-NEXT: vmovaps {{.*#+}} ymm2 = [9.0E+0,1.0E+1,1.1E+1,1.2E+1,1.3E+1,1.4E+1,1.5E+1,1.6E+1]
; HASWELL-NEXT: vmulps %ymm2, %ymm3, %ymm4
; HASWELL-NEXT: vfmsub213ps {{.*#+}} ymm1 = (ymm4 * ymm1) - ymm2
; HASWELL-NEXT: vfnmadd213ps {{.*#+}} ymm1 = -(ymm3 * ymm1) + ymm4
; HASWELL-NEXT: retq
;
; HASWELL-NO-FMA-LABEL: v16f32_two_step2:
; HASWELL-NO-FMA: # %bb.0:
; HASWELL-NO-FMA-NEXT: vrcpps %ymm0, %ymm2
; HASWELL-NO-FMA-NEXT: vmulps %ymm2, %ymm0, %ymm3
; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; HASWELL-NO-FMA-NEXT: vsubps %ymm3, %ymm4, %ymm3
; HASWELL-NO-FMA-NEXT: vmulps %ymm3, %ymm2, %ymm3
; HASWELL-NO-FMA-NEXT: vaddps %ymm3, %ymm2, %ymm2
; HASWELL-NO-FMA-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0]
; HASWELL-NO-FMA-NEXT: vmulps %ymm3, %ymm2, %ymm5
; HASWELL-NO-FMA-NEXT: vmulps %ymm5, %ymm0, %ymm0
; HASWELL-NO-FMA-NEXT: vsubps %ymm0, %ymm3, %ymm0
; HASWELL-NO-FMA-NEXT: vrcpps %ymm1, %ymm3
; HASWELL-NO-FMA-NEXT: vmulps %ymm0, %ymm2, %ymm0
; HASWELL-NO-FMA-NEXT: vaddps %ymm0, %ymm5, %ymm0
; HASWELL-NO-FMA-NEXT: vmulps %ymm3, %ymm1, %ymm2
; HASWELL-NO-FMA-NEXT: vsubps %ymm2, %ymm4, %ymm2
; HASWELL-NO-FMA-NEXT: vmulps %ymm2, %ymm3, %ymm2
; HASWELL-NO-FMA-NEXT: vaddps %ymm2, %ymm3, %ymm2
; HASWELL-NO-FMA-NEXT: vmovaps {{.*#+}} ymm3 = [9.0E+0,1.0E+1,1.1E+1,1.2E+1,1.3E+1,1.4E+1,1.5E+1,1.6E+1]
; HASWELL-NO-FMA-NEXT: vmulps %ymm3, %ymm2, %ymm4
; HASWELL-NO-FMA-NEXT: vmulps %ymm4, %ymm1, %ymm1
; HASWELL-NO-FMA-NEXT: vsubps %ymm1, %ymm3, %ymm1
; HASWELL-NO-FMA-NEXT: vmulps %ymm1, %ymm2, %ymm1
; HASWELL-NO-FMA-NEXT: vaddps %ymm1, %ymm4, %ymm1
; HASWELL-NO-FMA-NEXT: retq
;
; AVX512-LABEL: v16f32_two_step2:
; AVX512: # %bb.0:
; AVX512-NEXT: vrcp14ps %zmm0, %zmm1
; AVX512-NEXT: vbroadcastss {{.*#+}} zmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; AVX512-NEXT: vfmsub231ps {{.*#+}} zmm2 = (zmm0 * zmm1) - zmm2
; AVX512-NEXT: vfnmadd132ps {{.*#+}} zmm2 = -(zmm2 * zmm1) + zmm1
; AVX512-NEXT: vmovaps {{.*#+}} zmm1 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0,9.0E+0,1.0E+1,1.1E+1,1.2E+1,1.3E+1,1.4E+1,1.5E+1,1.6E+1]
; AVX512-NEXT: vmulps %zmm1, %zmm2, %zmm3
; AVX512-NEXT: vfmsub213ps {{.*#+}} zmm0 = (zmm3 * zmm0) - zmm1
; AVX512-NEXT: vfnmadd213ps {{.*#+}} zmm0 = -(zmm2 * zmm0) + zmm3
; AVX512-NEXT: retq
  %div = fdiv fast <16 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0, float 9.0, float 10.0, float 11.0, float 12.0, float 13.0, float 14.0, float 15.0, float 16.0>, %x
  ret <16 x float> %div
}

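; v16f32_no_step uses attribute #3 ("divf:0,vec-divf:0"): the hardware
; estimate is used with no refinement at all, and since the numerator is
; all-ones the rcp result is returned directly.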
define <16 x float> @v16f32_no_step(<16 x float> %x) #3 {
; SSE-LABEL: v16f32_no_step:
; SSE: # %bb.0:
; SSE-NEXT: rcpps %xmm0, %xmm0
; SSE-NEXT: rcpps %xmm1, %xmm1
; SSE-NEXT: rcpps %xmm2, %xmm2
; SSE-NEXT: rcpps %xmm3, %xmm3
; SSE-NEXT: retq
;
; AVX-RECIP-LABEL: v16f32_no_step:
; AVX-RECIP: # %bb.0:
; AVX-RECIP-NEXT: vrcpps %ymm0, %ymm0
; AVX-RECIP-NEXT: vrcpps %ymm1, %ymm1
; AVX-RECIP-NEXT: retq
;
; FMA-RECIP-LABEL: v16f32_no_step:
; FMA-RECIP: # %bb.0:
; FMA-RECIP-NEXT: vrcpps %ymm0, %ymm0
; FMA-RECIP-NEXT: vrcpps %ymm1, %ymm1
; FMA-RECIP-NEXT: retq
;
; BDVER2-LABEL: v16f32_no_step:
; BDVER2: # %bb.0:
; BDVER2-NEXT: vrcpps %ymm0, %ymm0
; BDVER2-NEXT: vrcpps %ymm1, %ymm1
; BDVER2-NEXT: retq
;
; BTVER2-LABEL: v16f32_no_step:
; BTVER2: # %bb.0:
; BTVER2-NEXT: vrcpps %ymm0, %ymm0
; BTVER2-NEXT: vrcpps %ymm1, %ymm1
; BTVER2-NEXT: retq
;
; SANDY-LABEL: v16f32_no_step:
; SANDY: # %bb.0:
; SANDY-NEXT: vrcpps %ymm0, %ymm0
; SANDY-NEXT: vrcpps %ymm1, %ymm1
; SANDY-NEXT: retq
;
; HASWELL-LABEL: v16f32_no_step:
; HASWELL: # %bb.0:
; HASWELL-NEXT: vrcpps %ymm0, %ymm0
; HASWELL-NEXT: vrcpps %ymm1, %ymm1
; HASWELL-NEXT: retq
;
; HASWELL-NO-FMA-LABEL: v16f32_no_step:
; HASWELL-NO-FMA: # %bb.0:
; HASWELL-NO-FMA-NEXT: vrcpps %ymm0, %ymm0
; HASWELL-NO-FMA-NEXT: vrcpps %ymm1, %ymm1
; HASWELL-NO-FMA-NEXT: retq
;
; AVX512-LABEL: v16f32_no_step:
; AVX512: # %bb.0:
; AVX512-NEXT: vrcp14ps %zmm0, %zmm0
; AVX512-NEXT: retq
  %div = fdiv fast <16 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %x
  ret <16 x float> %div
}

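; v16f32_no_step2 also requests zero refinement steps, but the non-uniform
; numerator still requires one multiply by the constant vector after each
; estimate.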
define <16 x float> @v16f32_no_step2(<16 x float> %x) #3 {
; SSE-LABEL: v16f32_no_step2:
; SSE: # %bb.0:
; SSE-NEXT: rcpps %xmm0, %xmm0
; SSE-NEXT: mulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT: rcpps %xmm1, %xmm1
; SSE-NEXT: mulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE-NEXT: rcpps %xmm2, %xmm2
; SSE-NEXT: mulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE-NEXT: rcpps %xmm3, %xmm3
; SSE-NEXT: mulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
; SSE-NEXT: retq
;
; AVX-RECIP-LABEL: v16f32_no_step2:
; AVX-RECIP: # %bb.0:
; AVX-RECIP-NEXT: vrcpps %ymm0, %ymm0
; AVX-RECIP-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX-RECIP-NEXT: vrcpps %ymm1, %ymm1
; AVX-RECIP-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX-RECIP-NEXT: retq
;
; FMA-RECIP-LABEL: v16f32_no_step2:
; FMA-RECIP: # %bb.0:
; FMA-RECIP-NEXT: vrcpps %ymm0, %ymm0
; FMA-RECIP-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; FMA-RECIP-NEXT: vrcpps %ymm1, %ymm1
; FMA-RECIP-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; FMA-RECIP-NEXT: retq
;
; BDVER2-LABEL: v16f32_no_step2:
; BDVER2: # %bb.0:
; BDVER2-NEXT: vrcpps %ymm0, %ymm0
; BDVER2-NEXT: vrcpps %ymm1, %ymm1
; BDVER2-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; BDVER2-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; BDVER2-NEXT: retq
;
; BTVER2-LABEL: v16f32_no_step2:
; BTVER2: # %bb.0:
; BTVER2-NEXT: vrcpps %ymm0, %ymm0
; BTVER2-NEXT: vrcpps %ymm1, %ymm1
; BTVER2-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; BTVER2-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; BTVER2-NEXT: retq
;
; SANDY-LABEL: v16f32_no_step2:
; SANDY: # %bb.0:
; SANDY-NEXT: vrcpps %ymm0, %ymm0
; SANDY-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; SANDY-NEXT: vrcpps %ymm1, %ymm1
; SANDY-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; SANDY-NEXT: retq
;
; HASWELL-LABEL: v16f32_no_step2:
; HASWELL: # %bb.0:
; HASWELL-NEXT: vrcpps %ymm0, %ymm0
; HASWELL-NEXT: vrcpps %ymm1, %ymm1
; HASWELL-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; HASWELL-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; HASWELL-NEXT: retq
;
; HASWELL-NO-FMA-LABEL: v16f32_no_step2:
; HASWELL-NO-FMA: # %bb.0:
; HASWELL-NO-FMA-NEXT: vrcpps %ymm0, %ymm0
; HASWELL-NO-FMA-NEXT: vrcpps %ymm1, %ymm1
; HASWELL-NO-FMA-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; HASWELL-NO-FMA-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; HASWELL-NO-FMA-NEXT: retq
;
; AVX512-LABEL: v16f32_no_step2:
; AVX512: # %bb.0:
; AVX512-NEXT: vrcp14ps %zmm0, %zmm0
; AVX512-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; AVX512-NEXT: retq
  %div = fdiv fast <16 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0, float 9.0, float 10.0, float 11.0, float 12.0, float 13.0, float 14.0, float 15.0, float 16.0>, %x
  ret <16 x float> %div
}

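; The "reciprocal-estimates" strings drive the transform: a bare "divf" or
; "vec-divf" enables the estimate with the default (one) refinement step, a
; ":N" suffix sets the step count explicitly, and a "!" prefix disables the
; estimate for that operation type.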
attributes #0 = { "unsafe-fp-math"="true" "reciprocal-estimates"="!divf,!vec-divf" }
attributes #1 = { "unsafe-fp-math"="true" "reciprocal-estimates"="divf,vec-divf" }
attributes #2 = { "unsafe-fp-math"="true" "reciprocal-estimates"="divf:2,vec-divf:2" }
attributes #3 = { "unsafe-fp-math"="true" "reciprocal-estimates"="divf:0,vec-divf:0" }