; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=CHECK --check-prefix=SSE
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX512

declare double @__sqrt_finite(double)
declare float @__sqrtf_finite(float)
declare x86_fp80 @__sqrtl_finite(x86_fp80)
declare float @llvm.sqrt.f32(float)
declare <4 x float> @llvm.sqrt.v4f32(<4 x float>)
declare <8 x float> @llvm.sqrt.v8f32(<8 x float>)
declare <16 x float> @llvm.sqrt.v16f32(<16 x float>)
define double @finite_f64_no_estimate(double %d) #0 {
; SSE-LABEL: finite_f64_no_estimate:
; SSE:       # %bb.0:
; SSE-NEXT:    sqrtsd %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: finite_f64_no_estimate:
; AVX:       # %bb.0:
; AVX-NEXT:    vsqrtsd %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %call = tail call double @__sqrt_finite(double %d) #2
  ret double %call
}

; No estimates for doubles.
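; x86 provides reciprocal square-root estimate instructions (rsqrtss/rsqrtps
; and the AVX-512 forms) only for single precision, so even though attribute
; #1 enables estimates, the f64 sqrt below still lowers to a real sqrtsd.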
define double @finite_f64_estimate(double %d) #1 {
; SSE-LABEL: finite_f64_estimate:
; SSE:       # %bb.0:
; SSE-NEXT:    sqrtsd %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: finite_f64_estimate:
; AVX:       # %bb.0:
; AVX-NEXT:    vsqrtsd %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %call = tail call double @__sqrt_finite(double %d) #2
  ret double %call
}

define float @finite_f32_no_estimate(float %f) #0 {
; SSE-LABEL: finite_f32_no_estimate:
; SSE:       # %bb.0:
; SSE-NEXT:    sqrtss %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: finite_f32_no_estimate:
; AVX:       # %bb.0:
; AVX-NEXT:    vsqrtss %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %call = tail call float @__sqrtf_finite(float %f) #2
  ret float %call
}
define float @finite_f32_estimate(float %f) #1 {
; SSE-LABEL: finite_f32_estimate:
; SSE:       # %bb.0:
; SSE-NEXT:    rsqrtss %xmm0, %xmm1
; SSE-NEXT:    movaps %xmm0, %xmm2
; SSE-NEXT:    mulss %xmm1, %xmm2
; SSE-NEXT:    movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
; SSE-NEXT:    mulss %xmm2, %xmm3
; SSE-NEXT:    mulss %xmm1, %xmm2
; SSE-NEXT:    addss {{.*}}(%rip), %xmm2
; SSE-NEXT:    mulss %xmm3, %xmm2
; SSE-NEXT:    xorps %xmm1, %xmm1
; SSE-NEXT:    cmpeqss %xmm1, %xmm0
; SSE-NEXT:    andnps %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: finite_f32_estimate:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vrsqrtss %xmm0, %xmm0, %xmm1
; AVX1-NEXT:    vmulss %xmm1, %xmm0, %xmm2
; AVX1-NEXT:    vmulss %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vaddss {{.*}}(%rip), %xmm1, %xmm1
; AVX1-NEXT:    vmulss {{.*}}(%rip), %xmm2, %xmm2
; AVX1-NEXT:    vmulss %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vxorps %xmm2, %xmm2, %xmm2
; AVX1-NEXT:    vcmpeqss %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vandnps %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX512-LABEL: finite_f32_estimate:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vrsqrtss %xmm0, %xmm0, %xmm1
; AVX512-NEXT:    vmulss %xmm1, %xmm0, %xmm2
; AVX512-NEXT:    vfmadd213ss {{.*#+}} xmm1 = (xmm2 * xmm1) + mem
; AVX512-NEXT:    vmulss {{.*}}(%rip), %xmm2, %xmm2
; AVX512-NEXT:    vmulss %xmm1, %xmm2, %xmm1
; AVX512-NEXT:    vxorps %xmm2, %xmm2, %xmm2
; AVX512-NEXT:    vcmpeqss %xmm2, %xmm0, %k1
; AVX512-NEXT:    vmovss %xmm2, %xmm0, %xmm1 {%k1}
; AVX512-NEXT:    vmovaps %xmm1, %xmm0
; AVX512-NEXT:    retq
  %call = tail call float @__sqrtf_finite(float %f) #2
  ret float %call
}
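; finite_f32_estimate above computes sqrt(x) as x * e1, where e0 = rsqrt(x)
; and e1 is one Newton-Raphson refinement step. A sketch of the identity:
;   e1 = e0 * (1.5 - 0.5 * x * e0 * e0) = -0.5 * e0 * (x * e0 * e0 - 3.0)
; Factored over t = x * e0, the checked sequence computes
;   sqrt(x) ~= (-0.5 * t) * (t * e0 - 3.0)
; The -0.5 and -3.0 memory constants are visible in the vector tests below.
; The cmpeqss/andnps (or AVX-512 masked-move) tail selects 0.0 when x == 0,
; since the estimate path would otherwise produce 0 * inf = NaN.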
define x86_fp80 @finite_f80_no_estimate(x86_fp80 %ld) #0 {
; CHECK-LABEL: finite_f80_no_estimate:
; CHECK:       # %bb.0:
; CHECK-NEXT:    fldt {{[0-9]+}}(%rsp)
; CHECK-NEXT:    fsqrt
; CHECK-NEXT:    retq
  %call = tail call x86_fp80 @__sqrtl_finite(x86_fp80 %ld) #2
  ret x86_fp80 %call
}

; Don't die on the impossible.

define x86_fp80 @finite_f80_estimate_but_no(x86_fp80 %ld) #1 {
; CHECK-LABEL: finite_f80_estimate_but_no:
; CHECK:       # %bb.0:
; CHECK-NEXT:    fldt {{[0-9]+}}(%rsp)
; CHECK-NEXT:    fsqrt
; CHECK-NEXT:    retq
  %call = tail call x86_fp80 @__sqrtl_finite(x86_fp80 %ld) #2
  ret x86_fp80 %call
}

; PR34994 - https://bugs.llvm.org/show_bug.cgi?id=34994
define float @sqrtf_check_denorms(float %x) #3 {
; SSE-LABEL: sqrtf_check_denorms:
; SSE:       # %bb.0:
; SSE-NEXT:    rsqrtss %xmm0, %xmm1
; SSE-NEXT:    movaps %xmm0, %xmm2
; SSE-NEXT:    mulss %xmm1, %xmm2
; SSE-NEXT:    movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
; SSE-NEXT:    mulss %xmm2, %xmm3
; SSE-NEXT:    mulss %xmm1, %xmm2
; SSE-NEXT:    addss {{.*}}(%rip), %xmm2
; SSE-NEXT:    mulss %xmm3, %xmm2
; SSE-NEXT:    andps {{.*}}(%rip), %xmm0
; SSE-NEXT:    cmpltss {{.*}}(%rip), %xmm0
; SSE-NEXT:    andnps %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: sqrtf_check_denorms:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vrsqrtss %xmm0, %xmm0, %xmm1
; AVX1-NEXT:    vmulss %xmm1, %xmm0, %xmm2
; AVX1-NEXT:    vmulss %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vaddss {{.*}}(%rip), %xmm1, %xmm1
; AVX1-NEXT:    vmulss {{.*}}(%rip), %xmm2, %xmm2
; AVX1-NEXT:    vmulss %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vandps {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    vcmpltss {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    vandnps %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX512-LABEL: sqrtf_check_denorms:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vrsqrtss %xmm0, %xmm0, %xmm1
; AVX512-NEXT:    vmulss %xmm1, %xmm0, %xmm2
; AVX512-NEXT:    vfmadd213ss {{.*#+}} xmm1 = (xmm2 * xmm1) + mem
; AVX512-NEXT:    vmulss {{.*}}(%rip), %xmm2, %xmm2
; AVX512-NEXT:    vmulss %xmm1, %xmm2, %xmm1
; AVX512-NEXT:    vbroadcastss {{.*#+}} xmm2 = [NaN,NaN,NaN,NaN]
; AVX512-NEXT:    vandps %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vcmpltss {{.*}}(%rip), %xmm0, %k1
; AVX512-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX512-NEXT:    vmovss %xmm0, %xmm0, %xmm1 {%k1}
; AVX512-NEXT:    vmovaps %xmm1, %xmm0
; AVX512-NEXT:    retq
  %call = tail call float @__sqrtf_finite(float %x) #2
  ret float %call
}
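; PR34994: rsqrtss/rsqrtps treat denormal source operands as zero, so a plain
; x == 0 check is not enough when "denormal-fp-math"="ieee" keeps denormals
; alive. The sequences in sqrtf_check_denorms above (and the vector variant
; below) instead mask off the sign bit and compare |x| against the smallest
; normal float (1.17549435E-38), selecting 0.0 for anything smaller.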
define <4 x float> @sqrt_v4f32_check_denorms(<4 x float> %x) #3 {
; SSE-LABEL: sqrt_v4f32_check_denorms:
; SSE:       # %bb.0:
; SSE-NEXT:    rsqrtps %xmm0, %xmm2
; SSE-NEXT:    movaps %xmm0, %xmm1
; SSE-NEXT:    mulps %xmm2, %xmm1
; SSE-NEXT:    movaps {{.*#+}} xmm3 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
; SSE-NEXT:    mulps %xmm1, %xmm3
; SSE-NEXT:    mulps %xmm2, %xmm1
; SSE-NEXT:    addps {{.*}}(%rip), %xmm1
; SSE-NEXT:    mulps %xmm3, %xmm1
; SSE-NEXT:    andps {{.*}}(%rip), %xmm0
; SSE-NEXT:    movaps {{.*#+}} xmm2 = [1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38]
; SSE-NEXT:    cmpleps %xmm0, %xmm2
; SSE-NEXT:    andps %xmm2, %xmm1
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: sqrt_v4f32_check_denorms:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vrsqrtps %xmm0, %xmm1
; AVX1-NEXT:    vmulps %xmm1, %xmm0, %xmm2
; AVX1-NEXT:    vmulps {{.*}}(%rip), %xmm2, %xmm3
; AVX1-NEXT:    vmulps %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vaddps {{.*}}(%rip), %xmm1, %xmm1
; AVX1-NEXT:    vmulps %xmm1, %xmm3, %xmm1
; AVX1-NEXT:    vandps {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    vmovaps {{.*#+}} xmm2 = [1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38]
; AVX1-NEXT:    vcmpleps %xmm0, %xmm2, %xmm0
; AVX1-NEXT:    vandps %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX512-LABEL: sqrt_v4f32_check_denorms:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vrsqrtps %xmm0, %xmm1
; AVX512-NEXT:    vmulps %xmm1, %xmm0, %xmm2
; AVX512-NEXT:    vbroadcastss {{.*#+}} xmm3 = [-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0]
; AVX512-NEXT:    vfmadd231ps {{.*#+}} xmm3 = (xmm2 * xmm1) + xmm3
; AVX512-NEXT:    vbroadcastss {{.*#+}} xmm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
; AVX512-NEXT:    vmulps %xmm3, %xmm1, %xmm1
; AVX512-NEXT:    vmulps %xmm1, %xmm2, %xmm1
; AVX512-NEXT:    vbroadcastss {{.*#+}} xmm2 = [NaN,NaN,NaN,NaN]
; AVX512-NEXT:    vandps %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38]
; AVX512-NEXT:    vcmpleps %xmm0, %xmm2, %xmm0
; AVX512-NEXT:    vandps %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %call = tail call <4 x float> @llvm.sqrt.v4f32(<4 x float> %x) #2
  ret <4 x float> %call
}
define float @f32_no_estimate(float %x) #0 {
; SSE-LABEL: f32_no_estimate:
; SSE:       # %bb.0:
; SSE-NEXT:    sqrtss %xmm0, %xmm1
; SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE-NEXT:    divss %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: f32_no_estimate:
; AVX:       # %bb.0:
; AVX-NEXT:    vsqrtss %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; AVX-NEXT:    vdivss %xmm0, %xmm1, %xmm0
; AVX-NEXT:    retq
  %sqrt = tail call float @llvm.sqrt.f32(float %x)
  %div = fdiv fast float 1.0, %sqrt
  ret float %div
}
define float @f32_estimate(float %x) #1 {
; SSE-LABEL: f32_estimate:
; SSE:       # %bb.0:
; SSE-NEXT:    rsqrtss %xmm0, %xmm1
; SSE-NEXT:    movaps %xmm1, %xmm2
; SSE-NEXT:    mulss %xmm1, %xmm2
; SSE-NEXT:    mulss %xmm0, %xmm2
; SSE-NEXT:    addss {{.*}}(%rip), %xmm2
; SSE-NEXT:    mulss {{.*}}(%rip), %xmm1
; SSE-NEXT:    mulss %xmm2, %xmm1
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: f32_estimate:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vrsqrtss %xmm0, %xmm0, %xmm1
; AVX1-NEXT:    vmulss %xmm1, %xmm1, %xmm2
; AVX1-NEXT:    vmulss %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vaddss {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    vmulss {{.*}}(%rip), %xmm1, %xmm1
; AVX1-NEXT:    vmulss %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    retq
;
; AVX512-LABEL: f32_estimate:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vrsqrtss %xmm0, %xmm0, %xmm1
; AVX512-NEXT:    vmulss %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vfmadd213ss {{.*#+}} xmm0 = (xmm1 * xmm0) + mem
; AVX512-NEXT:    vmulss {{.*}}(%rip), %xmm1, %xmm1
; AVX512-NEXT:    vmulss %xmm0, %xmm1, %xmm0
; AVX512-NEXT:    retq
  %sqrt = tail call float @llvm.sqrt.f32(float %x)
  %div = fdiv fast float 1.0, %sqrt
  ret float %div
}
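; f32_estimate above computes 1.0 / sqrt(x) directly as the refined estimate
;   e1 = -0.5 * e0 * (x * e0 * e0 - 3.0),    e0 = rsqrt(x)
; with no final multiply by x. Unlike the __sqrtf_finite lowerings, there is
; no zero/denormal guard here: the fdiv is marked 'fast', so the x == 0 case
; (where the refinement step yields NaN) is assumed not to matter.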
define <4 x float> @v4f32_no_estimate(<4 x float> %x) #0 {
; SSE-LABEL: v4f32_no_estimate:
; SSE:       # %bb.0:
; SSE-NEXT:    sqrtps %xmm0, %xmm1
; SSE-NEXT:    movaps {{.*#+}} xmm0 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; SSE-NEXT:    divps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: v4f32_no_estimate:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vsqrtps %xmm0, %xmm0
; AVX1-NEXT:    vmovaps {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; AVX1-NEXT:    vdivps %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    retq
;
; AVX512-LABEL: v4f32_no_estimate:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vsqrtps %xmm0, %xmm0
; AVX512-NEXT:    vbroadcastss {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; AVX512-NEXT:    vdivps %xmm0, %xmm1, %xmm0
; AVX512-NEXT:    retq
  %sqrt = tail call <4 x float> @llvm.sqrt.v4f32(<4 x float> %x)
  %div = fdiv fast <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>, %sqrt
  ret <4 x float> %div
}
define <4 x float> @v4f32_estimate(<4 x float> %x) #1 {
; SSE-LABEL: v4f32_estimate:
; SSE:       # %bb.0:
; SSE-NEXT:    rsqrtps %xmm0, %xmm1
; SSE-NEXT:    movaps %xmm1, %xmm2
; SSE-NEXT:    mulps %xmm1, %xmm2
; SSE-NEXT:    mulps %xmm0, %xmm2
; SSE-NEXT:    addps {{.*}}(%rip), %xmm2
; SSE-NEXT:    mulps {{.*}}(%rip), %xmm1
; SSE-NEXT:    mulps %xmm2, %xmm1
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: v4f32_estimate:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vrsqrtps %xmm0, %xmm1
; AVX1-NEXT:    vmulps %xmm1, %xmm1, %xmm2
; AVX1-NEXT:    vmulps %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vaddps {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    vmulps {{.*}}(%rip), %xmm1, %xmm1
; AVX1-NEXT:    vmulps %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    retq
;
; AVX512-LABEL: v4f32_estimate:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vrsqrtps %xmm0, %xmm1
; AVX512-NEXT:    vmulps %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vbroadcastss {{.*#+}} xmm2 = [-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0]
; AVX512-NEXT:    vfmadd213ps {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2
; AVX512-NEXT:    vbroadcastss {{.*#+}} xmm2 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
; AVX512-NEXT:    vmulps %xmm0, %xmm2, %xmm0
; AVX512-NEXT:    vmulps %xmm0, %xmm1, %xmm0
; AVX512-NEXT:    retq
  %sqrt = tail call <4 x float> @llvm.sqrt.v4f32(<4 x float> %x)
  %div = fdiv fast <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>, %sqrt
  ret <4 x float> %div
}
define <8 x float> @v8f32_no_estimate(<8 x float> %x) #0 {
; SSE-LABEL: v8f32_no_estimate:
; SSE:       # %bb.0:
; SSE-NEXT:    sqrtps %xmm1, %xmm2
; SSE-NEXT:    sqrtps %xmm0, %xmm3
; SSE-NEXT:    movaps {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    divps %xmm3, %xmm0
; SSE-NEXT:    divps %xmm2, %xmm1
; SSE-NEXT:    retq
;
; AVX1-LABEL: v8f32_no_estimate:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vsqrtps %ymm0, %ymm0
; AVX1-NEXT:    vmovaps {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; AVX1-NEXT:    vdivps %ymm0, %ymm1, %ymm0
; AVX1-NEXT:    retq
;
; AVX512-LABEL: v8f32_no_estimate:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vsqrtps %ymm0, %ymm0
; AVX512-NEXT:    vbroadcastss {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; AVX512-NEXT:    vdivps %ymm0, %ymm1, %ymm0
; AVX512-NEXT:    retq
  %sqrt = tail call <8 x float> @llvm.sqrt.v8f32(<8 x float> %x)
  %div = fdiv fast <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %sqrt
  ret <8 x float> %div
}
define <8 x float> @v8f32_estimate(<8 x float> %x) #1 {
; SSE-LABEL: v8f32_estimate:
; SSE:       # %bb.0:
; SSE-NEXT:    rsqrtps %xmm0, %xmm3
; SSE-NEXT:    movaps {{.*#+}} xmm4 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
; SSE-NEXT:    movaps %xmm3, %xmm2
; SSE-NEXT:    mulps %xmm3, %xmm2
; SSE-NEXT:    mulps %xmm0, %xmm2
; SSE-NEXT:    movaps {{.*#+}} xmm0 = [-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0]
; SSE-NEXT:    addps %xmm0, %xmm2
; SSE-NEXT:    mulps %xmm4, %xmm2
; SSE-NEXT:    mulps %xmm3, %xmm2
; SSE-NEXT:    rsqrtps %xmm1, %xmm5
; SSE-NEXT:    movaps %xmm5, %xmm3
; SSE-NEXT:    mulps %xmm5, %xmm3
; SSE-NEXT:    mulps %xmm1, %xmm3
; SSE-NEXT:    addps %xmm0, %xmm3
; SSE-NEXT:    mulps %xmm4, %xmm3
; SSE-NEXT:    mulps %xmm5, %xmm3
; SSE-NEXT:    movaps %xmm2, %xmm0
; SSE-NEXT:    movaps %xmm3, %xmm1
; SSE-NEXT:    retq
;
; AVX1-LABEL: v8f32_estimate:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vrsqrtps %ymm0, %ymm1
; AVX1-NEXT:    vmulps %ymm1, %ymm1, %ymm2
; AVX1-NEXT:    vmulps %ymm2, %ymm0, %ymm0
; AVX1-NEXT:    vaddps {{.*}}(%rip), %ymm0, %ymm0
; AVX1-NEXT:    vmulps {{.*}}(%rip), %ymm1, %ymm1
; AVX1-NEXT:    vmulps %ymm0, %ymm1, %ymm0
; AVX1-NEXT:    retq
;
; AVX512-LABEL: v8f32_estimate:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vrsqrtps %ymm0, %ymm1
; AVX512-NEXT:    vmulps %ymm1, %ymm0, %ymm0
; AVX512-NEXT:    vbroadcastss {{.*#+}} ymm2 = [-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0]
; AVX512-NEXT:    vfmadd213ps {{.*#+}} ymm0 = (ymm1 * ymm0) + ymm2
; AVX512-NEXT:    vbroadcastss {{.*#+}} ymm2 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
; AVX512-NEXT:    vmulps %ymm0, %ymm2, %ymm0
; AVX512-NEXT:    vmulps %ymm0, %ymm1, %ymm0
; AVX512-NEXT:    retq
  %sqrt = tail call <8 x float> @llvm.sqrt.v8f32(<8 x float> %x)
  %div = fdiv fast <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %sqrt
  ret <8 x float> %div
}
define <16 x float> @v16f32_no_estimate(<16 x float> %x) #0 {
; SSE-LABEL: v16f32_no_estimate:
; SSE:       # %bb.0:
; SSE-NEXT:    sqrtps %xmm3, %xmm4
; SSE-NEXT:    sqrtps %xmm2, %xmm5
; SSE-NEXT:    sqrtps %xmm1, %xmm2
; SSE-NEXT:    sqrtps %xmm0, %xmm1
; SSE-NEXT:    movaps {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; SSE-NEXT:    movaps %xmm3, %xmm0
; SSE-NEXT:    divps %xmm1, %xmm0
; SSE-NEXT:    movaps %xmm3, %xmm1
; SSE-NEXT:    divps %xmm2, %xmm1
; SSE-NEXT:    movaps %xmm3, %xmm2
; SSE-NEXT:    divps %xmm5, %xmm2
; SSE-NEXT:    divps %xmm4, %xmm3
; SSE-NEXT:    retq
;
; AVX1-LABEL: v16f32_no_estimate:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vsqrtps %ymm1, %ymm1
; AVX1-NEXT:    vsqrtps %ymm0, %ymm0
; AVX1-NEXT:    vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; AVX1-NEXT:    vdivps %ymm0, %ymm2, %ymm0
; AVX1-NEXT:    vdivps %ymm1, %ymm2, %ymm1
; AVX1-NEXT:    retq
;
; AVX512-LABEL: v16f32_no_estimate:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vsqrtps %zmm0, %zmm0
; AVX512-NEXT:    vbroadcastss {{.*#+}} zmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; AVX512-NEXT:    vdivps %zmm0, %zmm1, %zmm0
; AVX512-NEXT:    retq
  %sqrt = tail call <16 x float> @llvm.sqrt.v16f32(<16 x float> %x)
  %div = fdiv fast <16 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %sqrt
  ret <16 x float> %div
}
define <16 x float> @v16f32_estimate(<16 x float> %x) #1 {
; SSE-LABEL: v16f32_estimate:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps %xmm1, %xmm4
; SSE-NEXT:    movaps %xmm0, %xmm1
; SSE-NEXT:    rsqrtps %xmm0, %xmm5
; SSE-NEXT:    movaps {{.*#+}} xmm6 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
; SSE-NEXT:    movaps %xmm5, %xmm0
; SSE-NEXT:    mulps %xmm5, %xmm0
; SSE-NEXT:    mulps %xmm1, %xmm0
; SSE-NEXT:    movaps {{.*#+}} xmm7 = [-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0]
; SSE-NEXT:    addps %xmm7, %xmm0
; SSE-NEXT:    mulps %xmm6, %xmm0
; SSE-NEXT:    mulps %xmm5, %xmm0
; SSE-NEXT:    rsqrtps %xmm4, %xmm5
; SSE-NEXT:    movaps %xmm5, %xmm1
; SSE-NEXT:    mulps %xmm5, %xmm1
; SSE-NEXT:    mulps %xmm4, %xmm1
; SSE-NEXT:    addps %xmm7, %xmm1
; SSE-NEXT:    mulps %xmm6, %xmm1
; SSE-NEXT:    mulps %xmm5, %xmm1
; SSE-NEXT:    rsqrtps %xmm2, %xmm5
; SSE-NEXT:    movaps %xmm5, %xmm4
; SSE-NEXT:    mulps %xmm5, %xmm4
; SSE-NEXT:    mulps %xmm2, %xmm4
; SSE-NEXT:    addps %xmm7, %xmm4
; SSE-NEXT:    mulps %xmm6, %xmm4
; SSE-NEXT:    mulps %xmm5, %xmm4
; SSE-NEXT:    rsqrtps %xmm3, %xmm2
; SSE-NEXT:    movaps %xmm2, %xmm5
; SSE-NEXT:    mulps %xmm2, %xmm5
; SSE-NEXT:    mulps %xmm3, %xmm5
; SSE-NEXT:    addps %xmm7, %xmm5
; SSE-NEXT:    mulps %xmm6, %xmm5
; SSE-NEXT:    mulps %xmm2, %xmm5
; SSE-NEXT:    movaps %xmm4, %xmm2
; SSE-NEXT:    movaps %xmm5, %xmm3
; SSE-NEXT:    retq
;
; AVX1-LABEL: v16f32_estimate:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vrsqrtps %ymm0, %ymm2
; AVX1-NEXT:    vmovaps {{.*#+}} ymm3 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
; AVX1-NEXT:    vmulps %ymm2, %ymm2, %ymm4
; AVX1-NEXT:    vmulps %ymm4, %ymm0, %ymm0
; AVX1-NEXT:    vmovaps {{.*#+}} ymm4 = [-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0]
; AVX1-NEXT:    vaddps %ymm4, %ymm0, %ymm0
; AVX1-NEXT:    vmulps %ymm0, %ymm3, %ymm0
; AVX1-NEXT:    vmulps %ymm0, %ymm2, %ymm0
; AVX1-NEXT:    vrsqrtps %ymm1, %ymm2
; AVX1-NEXT:    vmulps %ymm2, %ymm2, %ymm5
; AVX1-NEXT:    vmulps %ymm5, %ymm1, %ymm1
; AVX1-NEXT:    vaddps %ymm4, %ymm1, %ymm1
; AVX1-NEXT:    vmulps %ymm1, %ymm3, %ymm1
; AVX1-NEXT:    vmulps %ymm1, %ymm2, %ymm1
; AVX1-NEXT:    retq
;
; AVX512-LABEL: v16f32_estimate:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vrsqrt14ps %zmm0, %zmm1
; AVX512-NEXT:    vmulps %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vfmadd213ps {{.*#+}} zmm0 = (zmm1 * zmm0) + mem
; AVX512-NEXT:    vmulps {{.*}}(%rip){1to16}, %zmm1, %zmm1
; AVX512-NEXT:    vmulps %zmm0, %zmm1, %zmm0
; AVX512-NEXT:    retq
  %sqrt = tail call <16 x float> @llvm.sqrt.v16f32(<16 x float> %x)
  %div = fdiv fast <16 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %sqrt
  ret <16 x float> %div
}
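; With AVX-512, the 512-bit case uses vrsqrt14ps, whose estimate has a
; relative error of at most 2^-14 (versus roughly 2^-12 for legacy rsqrtps);
; one Newton-Raphson refinement step is still emitted for it above.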
attributes #0 = { "unsafe-fp-math"="true" "reciprocal-estimates"="!sqrtf,!vec-sqrtf,!divf,!vec-divf" }
attributes #1 = { "unsafe-fp-math"="true" "reciprocal-estimates"="sqrt,vec-sqrt" }
attributes #2 = { nounwind readnone }
attributes #3 = { "unsafe-fp-math"="true" "reciprocal-estimates"="sqrt,vec-sqrt" "denormal-fp-math"="ieee" }
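; In the "reciprocal-estimates" strings, a '!' prefix disables the named
; estimate, so #0 forces real sqrt/div instructions, while bare names such as
; "sqrt,vec-sqrt" enable scalar and vector square-root estimates. #3 adds
; "denormal-fp-math"="ieee" so denormal inputs are not assumed to be flushed,
; which is what triggers the widened guards in the check_denorms tests.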