; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64 -mattr=+sse2 | FileCheck %s --check-prefix=CHECK --check-prefix=SSE
; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64 -mattr=+avx | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64 -mattr=+avx512f | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX512

declare double @__sqrt_finite(double)
declare float @__sqrtf_finite(float)
declare x86_fp80 @__sqrtl_finite(x86_fp80)
declare float @llvm.sqrt.f32(float)
declare <4 x float> @llvm.sqrt.v4f32(<4 x float>)
declare <8 x float> @llvm.sqrt.v8f32(<8 x float>)
declare <16 x float> @llvm.sqrt.v16f32(<16 x float>)
declare double @llvm.sqrt.f64(double)
declare <2 x double> @llvm.sqrt.v2f64(<2 x double>)

declare float @llvm.fabs.f32(float)
declare <4 x float> @llvm.fabs.v4f32(<4 x float>)
declare double @llvm.fabs.f64(double)

define double @finite_f64_no_estimate(double %d) #0 {
; SSE-LABEL: finite_f64_no_estimate:
; SSE:       # %bb.0:
; SSE-NEXT:    sqrtsd %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: finite_f64_no_estimate:
; AVX:       # %bb.0:
; AVX-NEXT:    vsqrtsd %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %call = tail call double @__sqrt_finite(double %d) #2
  ret double %call
}

; No estimates for doubles.
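; Even with estimates enabled (attribute #1 below), f64 square roots are
; expected to lower to a real sqrt instruction: only f32 reciprocal
; square-root estimates are used here, and a single refinement step would
; not reach double precision anyway.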
define double @finite_f64_estimate(double %d) #1 {
; SSE-LABEL: finite_f64_estimate:
; SSE:       # %bb.0:
; SSE-NEXT:    sqrtsd %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: finite_f64_estimate:
; AVX:       # %bb.0:
; AVX-NEXT:    vsqrtsd %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %call = tail call double @__sqrt_finite(double %d) #2
  ret double %call
}

define float @finite_f32_no_estimate(float %f) #0 {
; SSE-LABEL: finite_f32_no_estimate:
; SSE:       # %bb.0:
; SSE-NEXT:    sqrtss %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: finite_f32_no_estimate:
; AVX:       # %bb.0:
; AVX-NEXT:    vsqrtss %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %call = tail call float @__sqrtf_finite(float %f) #2
  ret float %call
}

define float @finite_f32_estimate_ieee(float %f) #1 {
; SSE-LABEL: finite_f32_estimate_ieee:
; SSE:       # %bb.0:
; SSE-NEXT:    sqrtss %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: finite_f32_estimate_ieee:
; AVX:       # %bb.0:
; AVX-NEXT:    vsqrtss %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %call = tail call float @__sqrtf_finite(float %f) #2
  ret float %call
}

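; With 'ninf afn' on the call, the sqrt is expanded to the hardware
; reciprocal-square-root estimate plus one Newton-Raphson step. Reading
; the checks below: with e = rsqrt(x) and s = x*e, the refined value is
;   sqrt(x) ~= (-0.5 * s) * (s*e - 3.0) = 0.5 * s * (3.0 - x*e*e)
; which is where the -5.0E-1 and -3.0E+0 constant-pool loads come from.
; The trailing cmpltss/andnps (or masked-move) sequence zeroes the result
; when |x| is below the smallest normal float, so that sqrt(+0.0) yields
; +0.0 instead of 0 * inf = NaN.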
define float @finite_f32_estimate_ieee_ninf(float %f) #1 {
; SSE-LABEL: finite_f32_estimate_ieee_ninf:
; SSE:       # %bb.0:
; SSE-NEXT:    rsqrtss %xmm0, %xmm1
; SSE-NEXT:    movaps %xmm0, %xmm2
; SSE-NEXT:    mulss %xmm1, %xmm2
; SSE-NEXT:    movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
; SSE-NEXT:    mulss %xmm2, %xmm3
; SSE-NEXT:    mulss %xmm1, %xmm2
; SSE-NEXT:    addss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT:    mulss %xmm3, %xmm2
; SSE-NEXT:    cmpltss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT:    andnps %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: finite_f32_estimate_ieee_ninf:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vrsqrtss %xmm0, %xmm0, %xmm1
; AVX1-NEXT:    vmulss %xmm1, %xmm0, %xmm2
; AVX1-NEXT:    vmulss %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vaddss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX1-NEXT:    vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    vmulss %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vcmpltss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    vandnps %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX512-LABEL: finite_f32_estimate_ieee_ninf:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vrsqrtss %xmm0, %xmm0, %xmm1
; AVX512-NEXT:    vmulss %xmm1, %xmm0, %xmm2
; AVX512-NEXT:    vfmadd213ss {{.*#+}} xmm1 = (xmm2 * xmm1) + mem
; AVX512-NEXT:    vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX512-NEXT:    vmulss %xmm1, %xmm2, %xmm1
; AVX512-NEXT:    vbroadcastss {{.*#+}} xmm2 = [NaN,NaN,NaN,NaN]
; AVX512-NEXT:    vandps %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vcmpltss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1
; AVX512-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX512-NEXT:    vmovss %xmm0, %xmm1, %xmm1 {%k1}
; AVX512-NEXT:    vmovaps %xmm1, %xmm0
; AVX512-NEXT:    retq
  %call = tail call ninf afn float @__sqrtf_finite(float %f) #2
  ret float %call
}

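; The _daz variants below use "denormal-fp-math"="ieee,preserve-sign"
; (attribute #4), under which denormal inputs may be treated as zero, so
; the guard simplifies to an exact compare against +0.0 (cmpeqss) instead
; of the |x| < smallest-normal test above.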
define float @finite_f32_estimate_daz(float %f) #4 {
; SSE-LABEL: finite_f32_estimate_daz:
; SSE:       # %bb.0:
; SSE-NEXT:    sqrtss %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: finite_f32_estimate_daz:
; AVX:       # %bb.0:
; AVX-NEXT:    vsqrtss %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %call = tail call float @__sqrtf_finite(float %f) #2
  ret float %call
}

define float @finite_f32_estimate_daz_ninf(float %f) #4 {
; SSE-LABEL: finite_f32_estimate_daz_ninf:
; SSE:       # %bb.0:
; SSE-NEXT:    rsqrtss %xmm0, %xmm1
; SSE-NEXT:    movaps %xmm0, %xmm2
; SSE-NEXT:    mulss %xmm1, %xmm2
; SSE-NEXT:    movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
; SSE-NEXT:    mulss %xmm2, %xmm3
; SSE-NEXT:    mulss %xmm1, %xmm2
; SSE-NEXT:    addss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE-NEXT:    mulss %xmm3, %xmm2
; SSE-NEXT:    xorps %xmm1, %xmm1
; SSE-NEXT:    cmpeqss %xmm1, %xmm0
; SSE-NEXT:    andnps %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: finite_f32_estimate_daz_ninf:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vrsqrtss %xmm0, %xmm0, %xmm1
; AVX1-NEXT:    vmulss %xmm1, %xmm0, %xmm2
; AVX1-NEXT:    vmulss %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vaddss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX1-NEXT:    vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX1-NEXT:    vmulss %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vxorps %xmm2, %xmm2, %xmm2
; AVX1-NEXT:    vcmpeqss %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vandnps %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX512-LABEL: finite_f32_estimate_daz_ninf:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vrsqrtss %xmm0, %xmm0, %xmm1
; AVX512-NEXT:    vmulss %xmm1, %xmm0, %xmm2
; AVX512-NEXT:    vfmadd213ss {{.*#+}} xmm1 = (xmm2 * xmm1) + mem
; AVX512-NEXT:    vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX512-NEXT:    vmulss %xmm1, %xmm2, %xmm1
; AVX512-NEXT:    vxorps %xmm2, %xmm2, %xmm2
; AVX512-NEXT:    vcmpeqss %xmm2, %xmm0, %k1
; AVX512-NEXT:    vmovss %xmm2, %xmm1, %xmm1 {%k1}
; AVX512-NEXT:    vmovaps %xmm1, %xmm0
; AVX512-NEXT:    retq
  %call = tail call ninf afn float @__sqrtf_finite(float %f) #2
  ret float %call
}

define x86_fp80 @finite_f80_no_estimate(x86_fp80 %ld) #0 {
; CHECK-LABEL: finite_f80_no_estimate:
; CHECK:       # %bb.0:
; CHECK-NEXT:    fldt {{[0-9]+}}(%rsp)
; CHECK-NEXT:    fsqrt
; CHECK-NEXT:    retq
  %call = tail call x86_fp80 @__sqrtl_finite(x86_fp80 %ld) #2
  ret x86_fp80 %call
}

; Don't die on the impossible.
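; No estimate sequence exists for x86_fp80, so asking for one has to fall
; back gracefully to the x87 fsqrt lowering below.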
define x86_fp80 @finite_f80_estimate_but_no(x86_fp80 %ld) #1 {
; CHECK-LABEL: finite_f80_estimate_but_no:
; CHECK:       # %bb.0:
; CHECK-NEXT:    fldt {{[0-9]+}}(%rsp)
; CHECK-NEXT:    fsqrt
; CHECK-NEXT:    retq
  %call = tail call x86_fp80 @__sqrtl_finite(x86_fp80 %ld) #2
  ret x86_fp80 %call
}

; PR34994 - https://bugs.llvm.org/show_bug.cgi?id=34994
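; The denormal tests below stem from that report: an estimate-based sqrt
; expansion must still produce the right answer for zero and denormal
; inputs, which is what the guard code in the ninf variants checks.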
define float @sqrtf_check_denorms(float %x) #3 {
; SSE-LABEL: sqrtf_check_denorms:
; SSE:       # %bb.0:
; SSE-NEXT:    sqrtss %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: sqrtf_check_denorms:
; AVX:       # %bb.0:
; AVX-NEXT:    vsqrtss %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %call = tail call float @__sqrtf_finite(float %x) #2
  ret float %call
}

define float @sqrtf_check_denorms_ninf(float %x) #3 {
; SSE-LABEL: sqrtf_check_denorms_ninf:
; SSE:       # %bb.0:
; SSE-NEXT:    rsqrtss %xmm0, %xmm1
; SSE-NEXT:    movaps %xmm0, %xmm2
; SSE-NEXT:    mulss %xmm1, %xmm2
; SSE-NEXT:    movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
; SSE-NEXT:    mulss %xmm2, %xmm3
; SSE-NEXT:    mulss %xmm1, %xmm2
; SSE-NEXT:    addss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT:    mulss %xmm3, %xmm2
; SSE-NEXT:    cmpltss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT:    andnps %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: sqrtf_check_denorms_ninf:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vrsqrtss %xmm0, %xmm0, %xmm1
; AVX1-NEXT:    vmulss %xmm1, %xmm0, %xmm2
; AVX1-NEXT:    vmulss %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vaddss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX1-NEXT:    vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    vmulss %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vcmpltss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    vandnps %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX512-LABEL: sqrtf_check_denorms_ninf:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vrsqrtss %xmm0, %xmm0, %xmm1
; AVX512-NEXT:    vmulss %xmm1, %xmm0, %xmm2
; AVX512-NEXT:    vfmadd213ss {{.*#+}} xmm1 = (xmm2 * xmm1) + mem
; AVX512-NEXT:    vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX512-NEXT:    vmulss %xmm1, %xmm2, %xmm1
; AVX512-NEXT:    vbroadcastss {{.*#+}} xmm2 = [NaN,NaN,NaN,NaN]
; AVX512-NEXT:    vandps %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vcmpltss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1
; AVX512-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX512-NEXT:    vmovss %xmm0, %xmm1, %xmm1 {%k1}
; AVX512-NEXT:    vmovaps %xmm1, %xmm0
; AVX512-NEXT:    retq
  %call = tail call ninf afn float @__sqrtf_finite(float %x) #2
  ret float %call
}

define <4 x float> @sqrt_v4f32_check_denorms(<4 x float> %x) #3 {
; SSE-LABEL: sqrt_v4f32_check_denorms:
; SSE:       # %bb.0:
; SSE-NEXT:    sqrtps %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: sqrt_v4f32_check_denorms:
; AVX:       # %bb.0:
; AVX-NEXT:    vsqrtps %xmm0, %xmm0
; AVX-NEXT:    retq
  %call = tail call <4 x float> @llvm.sqrt.v4f32(<4 x float> %x) #2
  ret <4 x float> %call
}

define <4 x float> @sqrt_v4f32_check_denorms_ninf(<4 x float> %x) #3 {
; SSE-LABEL: sqrt_v4f32_check_denorms_ninf:
; SSE:       # %bb.0:
; SSE-NEXT:    rsqrtps %xmm0, %xmm2
; SSE-NEXT:    movaps %xmm0, %xmm1
; SSE-NEXT:    mulps %xmm2, %xmm1
; SSE-NEXT:    movaps {{.*#+}} xmm3 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
; SSE-NEXT:    mulps %xmm1, %xmm3
; SSE-NEXT:    mulps %xmm2, %xmm1
; SSE-NEXT:    addps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT:    mulps %xmm3, %xmm1
; SSE-NEXT:    movaps {{.*#+}} xmm2 = [1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38]
; SSE-NEXT:    cmpleps %xmm0, %xmm2
; SSE-NEXT:    andps %xmm2, %xmm1
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: sqrt_v4f32_check_denorms_ninf:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vrsqrtps %xmm0, %xmm1
; AVX1-NEXT:    vmulps %xmm1, %xmm0, %xmm2
; AVX1-NEXT:    vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm3
; AVX1-NEXT:    vmulps %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    vmulps %xmm1, %xmm3, %xmm1
; AVX1-NEXT:    vmovaps {{.*#+}} xmm2 = [1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38]
; AVX1-NEXT:    vcmpleps %xmm0, %xmm2, %xmm0
; AVX1-NEXT:    vandps %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX512-LABEL: sqrt_v4f32_check_denorms_ninf:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vrsqrtps %xmm0, %xmm1
; AVX512-NEXT:    vmulps %xmm1, %xmm0, %xmm2
; AVX512-NEXT:    vbroadcastss {{.*#+}} xmm3 = [-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0]
; AVX512-NEXT:    vfmadd231ps {{.*#+}} xmm3 = (xmm2 * xmm1) + xmm3
; AVX512-NEXT:    vbroadcastss {{.*#+}} xmm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
; AVX512-NEXT:    vmulps %xmm1, %xmm2, %xmm1
; AVX512-NEXT:    vmulps %xmm3, %xmm1, %xmm1
; AVX512-NEXT:    vbroadcastss {{.*#+}} xmm2 = [NaN,NaN,NaN,NaN]
; AVX512-NEXT:    vandps %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38]
; AVX512-NEXT:    vcmpleps %xmm0, %xmm2, %xmm0
; AVX512-NEXT:    vandps %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %call = tail call ninf afn <4 x float> @llvm.sqrt.v4f32(<4 x float> %x) #2
  ret <4 x float> %call
}

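; In the checks above, 1.17549435E-38 is the smallest normal f32
; (FLT_MIN), and the [NaN,NaN,NaN,NaN] broadcast is the 0x7FFFFFFF fabs
; mask, which merely prints as NaN.
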
define float @f32_no_estimate(float %x) #0 {
; SSE-LABEL: f32_no_estimate:
; SSE:       # %bb.0:
; SSE-NEXT:    sqrtss %xmm0, %xmm1
; SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE-NEXT:    divss %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: f32_no_estimate:
; AVX:       # %bb.0:
; AVX-NEXT:    vsqrtss %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; AVX-NEXT:    vdivss %xmm0, %xmm1, %xmm0
; AVX-NEXT:    retq
  %sqrt = tail call float @llvm.sqrt.f32(float %x)
  %div = fdiv fast float 1.0, %sqrt
  ret float %div
}

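; For 1/sqrt(x) under fast-math, no zero guard is needed and the
; expansion is just the refinement itself: with e = rsqrt(x),
;   1/sqrt(x) ~= (-0.5 * e) * (x*e*e - 3.0) = 0.5 * e * (3.0 - x*e*e)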
define float @f32_estimate(float %x) #1 {
; SSE-LABEL: f32_estimate:
; SSE:       # %bb.0:
; SSE-NEXT:    rsqrtss %xmm0, %xmm1
; SSE-NEXT:    mulss %xmm1, %xmm0
; SSE-NEXT:    mulss %xmm1, %xmm0
; SSE-NEXT:    addss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT:    mulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE-NEXT:    mulss %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: f32_estimate:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vrsqrtss %xmm0, %xmm0, %xmm1
; AVX1-NEXT:    vmulss %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vmulss %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vaddss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX1-NEXT:    vmulss %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    retq
;
; AVX512-LABEL: f32_estimate:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vrsqrtss %xmm0, %xmm0, %xmm1
; AVX512-NEXT:    vmulss %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vfmadd213ss {{.*#+}} xmm0 = (xmm1 * xmm0) + mem
; AVX512-NEXT:    vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX512-NEXT:    vmulss %xmm0, %xmm1, %xmm0
; AVX512-NEXT:    retq
  %sqrt = tail call float @llvm.sqrt.f32(float %x)
  %div = fdiv fast float 1.0, %sqrt
  ret float %div
}

define <4 x float> @v4f32_no_estimate(<4 x float> %x) #0 {
; SSE-LABEL: v4f32_no_estimate:
; SSE:       # %bb.0:
; SSE-NEXT:    sqrtps %xmm0, %xmm1
; SSE-NEXT:    movaps {{.*#+}} xmm0 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; SSE-NEXT:    divps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: v4f32_no_estimate:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vsqrtps %xmm0, %xmm0
; AVX1-NEXT:    vmovaps {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; AVX1-NEXT:    vdivps %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    retq
;
; AVX512-LABEL: v4f32_no_estimate:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vsqrtps %xmm0, %xmm0
; AVX512-NEXT:    vbroadcastss {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; AVX512-NEXT:    vdivps %xmm0, %xmm1, %xmm0
; AVX512-NEXT:    retq
  %sqrt = tail call <4 x float> @llvm.sqrt.v4f32(<4 x float> %x)
  %div = fdiv fast <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>, %sqrt
  ret <4 x float> %div
}

define <4 x float> @v4f32_estimate(<4 x float> %x) #1 {
; SSE-LABEL: v4f32_estimate:
; SSE:       # %bb.0:
; SSE-NEXT:    rsqrtps %xmm0, %xmm1
; SSE-NEXT:    mulps %xmm1, %xmm0
; SSE-NEXT:    mulps %xmm1, %xmm0
; SSE-NEXT:    addps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT:    mulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE-NEXT:    mulps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: v4f32_estimate:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vrsqrtps %xmm0, %xmm1
; AVX1-NEXT:    vmulps %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vmulps %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX1-NEXT:    vmulps %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    retq
;
; AVX512-LABEL: v4f32_estimate:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vrsqrtps %xmm0, %xmm1
; AVX512-NEXT:    vmulps %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vbroadcastss {{.*#+}} xmm2 = [-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0]
; AVX512-NEXT:    vfmadd231ps {{.*#+}} xmm2 = (xmm1 * xmm0) + xmm2
; AVX512-NEXT:    vbroadcastss {{.*#+}} xmm0 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
; AVX512-NEXT:    vmulps %xmm0, %xmm1, %xmm0
; AVX512-NEXT:    vmulps %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %sqrt = tail call <4 x float> @llvm.sqrt.v4f32(<4 x float> %x)
  %div = fdiv fast <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>, %sqrt
  ret <4 x float> %div
}

define <8 x float> @v8f32_no_estimate(<8 x float> %x) #0 {
; SSE-LABEL: v8f32_no_estimate:
; SSE:       # %bb.0:
; SSE-NEXT:    sqrtps %xmm1, %xmm2
; SSE-NEXT:    sqrtps %xmm0, %xmm3
; SSE-NEXT:    movaps {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    divps %xmm3, %xmm0
; SSE-NEXT:    divps %xmm2, %xmm1
; SSE-NEXT:    retq
;
; AVX1-LABEL: v8f32_no_estimate:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vsqrtps %ymm0, %ymm0
; AVX1-NEXT:    vmovaps {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; AVX1-NEXT:    vdivps %ymm0, %ymm1, %ymm0
; AVX1-NEXT:    retq
;
; AVX512-LABEL: v8f32_no_estimate:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vsqrtps %ymm0, %ymm0
; AVX512-NEXT:    vbroadcastss {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; AVX512-NEXT:    vdivps %ymm0, %ymm1, %ymm0
; AVX512-NEXT:    retq
  %sqrt = tail call <8 x float> @llvm.sqrt.v8f32(<8 x float> %x)
  %div = fdiv fast <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %sqrt
  ret <8 x float> %div
}

define <8 x float> @v8f32_estimate(<8 x float> %x) #1 {
; SSE-LABEL: v8f32_estimate:
; SSE:       # %bb.0:
; SSE-NEXT:    rsqrtps %xmm0, %xmm2
; SSE-NEXT:    movaps {{.*#+}} xmm3 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
; SSE-NEXT:    mulps %xmm2, %xmm0
; SSE-NEXT:    mulps %xmm2, %xmm0
; SSE-NEXT:    mulps %xmm3, %xmm2
; SSE-NEXT:    movaps {{.*#+}} xmm4 = [-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0]
; SSE-NEXT:    addps %xmm4, %xmm0
; SSE-NEXT:    mulps %xmm2, %xmm0
; SSE-NEXT:    rsqrtps %xmm1, %xmm2
; SSE-NEXT:    mulps %xmm2, %xmm3
; SSE-NEXT:    mulps %xmm2, %xmm1
; SSE-NEXT:    mulps %xmm2, %xmm1
; SSE-NEXT:    addps %xmm4, %xmm1
; SSE-NEXT:    mulps %xmm3, %xmm1
; SSE-NEXT:    retq
;
; AVX1-LABEL: v8f32_estimate:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vrsqrtps %ymm0, %ymm1
; AVX1-NEXT:    vmulps %ymm1, %ymm0, %ymm0
; AVX1-NEXT:    vmulps %ymm1, %ymm0, %ymm0
; AVX1-NEXT:    vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX1-NEXT:    vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX1-NEXT:    vmulps %ymm0, %ymm1, %ymm0
; AVX1-NEXT:    retq
;
; AVX512-LABEL: v8f32_estimate:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vrsqrtps %ymm0, %ymm1
; AVX512-NEXT:    vmulps %ymm1, %ymm0, %ymm0
; AVX512-NEXT:    vbroadcastss {{.*#+}} ymm2 = [-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0]
; AVX512-NEXT:    vfmadd231ps {{.*#+}} ymm2 = (ymm1 * ymm0) + ymm2
; AVX512-NEXT:    vbroadcastss {{.*#+}} ymm0 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
; AVX512-NEXT:    vmulps %ymm0, %ymm1, %ymm0
; AVX512-NEXT:    vmulps %ymm2, %ymm0, %ymm0
; AVX512-NEXT:    retq
  %sqrt = tail call <8 x float> @llvm.sqrt.v8f32(<8 x float> %x)
  %div = fdiv fast <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %sqrt
  ret <8 x float> %div
}

define <16 x float> @v16f32_no_estimate(<16 x float> %x) #0 {
; SSE-LABEL: v16f32_no_estimate:
; SSE:       # %bb.0:
; SSE-NEXT:    sqrtps %xmm3, %xmm4
; SSE-NEXT:    sqrtps %xmm2, %xmm5
; SSE-NEXT:    sqrtps %xmm1, %xmm2
; SSE-NEXT:    sqrtps %xmm0, %xmm1
; SSE-NEXT:    movaps {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; SSE-NEXT:    movaps %xmm3, %xmm0
; SSE-NEXT:    divps %xmm1, %xmm0
; SSE-NEXT:    movaps %xmm3, %xmm1
; SSE-NEXT:    divps %xmm2, %xmm1
; SSE-NEXT:    movaps %xmm3, %xmm2
; SSE-NEXT:    divps %xmm5, %xmm2
; SSE-NEXT:    divps %xmm4, %xmm3
; SSE-NEXT:    retq
;
; AVX1-LABEL: v16f32_no_estimate:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vsqrtps %ymm1, %ymm1
; AVX1-NEXT:    vsqrtps %ymm0, %ymm0
; AVX1-NEXT:    vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; AVX1-NEXT:    vdivps %ymm0, %ymm2, %ymm0
; AVX1-NEXT:    vdivps %ymm1, %ymm2, %ymm1
; AVX1-NEXT:    retq
;
; AVX512-LABEL: v16f32_no_estimate:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vsqrtps %zmm0, %zmm0
; AVX512-NEXT:    vbroadcastss {{.*#+}} zmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; AVX512-NEXT:    vdivps %zmm0, %zmm1, %zmm0
; AVX512-NEXT:    retq
  %sqrt = tail call <16 x float> @llvm.sqrt.v16f32(<16 x float> %x)
  %div = fdiv fast <16 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %sqrt
  ret <16 x float> %div
}

define <16 x float> @v16f32_estimate(<16 x float> %x) #1 {
; SSE-LABEL: v16f32_estimate:
; SSE:       # %bb.0:
; SSE-NEXT:    rsqrtps %xmm0, %xmm5
; SSE-NEXT:    movaps {{.*#+}} xmm4 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
; SSE-NEXT:    mulps %xmm5, %xmm0
; SSE-NEXT:    mulps %xmm5, %xmm0
; SSE-NEXT:    movaps %xmm5, %xmm6
; SSE-NEXT:    mulps %xmm4, %xmm6
; SSE-NEXT:    movaps {{.*#+}} xmm5 = [-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0]
; SSE-NEXT:    addps %xmm5, %xmm0
; SSE-NEXT:    mulps %xmm6, %xmm0
; SSE-NEXT:    rsqrtps %xmm1, %xmm6
; SSE-NEXT:    mulps %xmm6, %xmm1
; SSE-NEXT:    mulps %xmm6, %xmm1
; SSE-NEXT:    mulps %xmm4, %xmm6
; SSE-NEXT:    addps %xmm5, %xmm1
; SSE-NEXT:    mulps %xmm6, %xmm1
; SSE-NEXT:    rsqrtps %xmm2, %xmm6
; SSE-NEXT:    mulps %xmm6, %xmm2
; SSE-NEXT:    mulps %xmm6, %xmm2
; SSE-NEXT:    mulps %xmm4, %xmm6
; SSE-NEXT:    addps %xmm5, %xmm2
; SSE-NEXT:    mulps %xmm6, %xmm2
; SSE-NEXT:    rsqrtps %xmm3, %xmm6
; SSE-NEXT:    mulps %xmm6, %xmm4
; SSE-NEXT:    mulps %xmm6, %xmm3
; SSE-NEXT:    mulps %xmm6, %xmm3
; SSE-NEXT:    addps %xmm5, %xmm3
; SSE-NEXT:    mulps %xmm4, %xmm3
; SSE-NEXT:    retq
;
; AVX1-LABEL: v16f32_estimate:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vrsqrtps %ymm0, %ymm2
; AVX1-NEXT:    vmovaps {{.*#+}} ymm3 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
; AVX1-NEXT:    vmulps %ymm3, %ymm2, %ymm4
; AVX1-NEXT:    vmulps %ymm2, %ymm0, %ymm0
; AVX1-NEXT:    vmulps %ymm2, %ymm0, %ymm0
; AVX1-NEXT:    vmovaps {{.*#+}} ymm2 = [-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0]
; AVX1-NEXT:    vaddps %ymm2, %ymm0, %ymm0
; AVX1-NEXT:    vmulps %ymm0, %ymm4, %ymm0
; AVX1-NEXT:    vrsqrtps %ymm1, %ymm4
; AVX1-NEXT:    vmulps %ymm3, %ymm4, %ymm3
; AVX1-NEXT:    vmulps %ymm4, %ymm1, %ymm1
; AVX1-NEXT:    vmulps %ymm4, %ymm1, %ymm1
; AVX1-NEXT:    vaddps %ymm2, %ymm1, %ymm1
; AVX1-NEXT:    vmulps %ymm1, %ymm3, %ymm1
; AVX1-NEXT:    retq
;
; AVX512-LABEL: v16f32_estimate:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vrsqrt14ps %zmm0, %zmm1
; AVX512-NEXT:    vmulps %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vfmadd213ps {{.*#+}} zmm0 = (zmm1 * zmm0) + mem
; AVX512-NEXT:    vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1
; AVX512-NEXT:    vmulps %zmm0, %zmm1, %zmm0
; AVX512-NEXT:    retq
  %sqrt = tail call <16 x float> @llvm.sqrt.v16f32(<16 x float> %x)
  %div = fdiv fast <16 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %sqrt
  ret <16 x float> %div
}

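; ISA scaling in the v16f32 estimate above: SSE expands it as four
; 128-bit rsqrtps blocks, AVX1 as two 256-bit blocks, and AVX-512 as a
; single vrsqrt14ps (a tighter, 2^-14 relative-error estimate) plus one
; refinement over the whole zmm register.
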
; x / (fabs(y) * sqrt(z)) --> x * rsqrt(y*y*z)
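; (Since |y| = sqrt(y*y), the divisor fabs(y) * sqrt(z) equals
; sqrt(y*y*z) wherever it is defined, so the whole expression becomes one
; rsqrt estimate fed by two extra multiplies.)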
define float @div_sqrt_fabs_f32(float %x, float %y, float %z) {
; SSE-LABEL: div_sqrt_fabs_f32:
; SSE:       # %bb.0:
; SSE-NEXT:    mulss %xmm1, %xmm1
; SSE-NEXT:    mulss %xmm2, %xmm1
; SSE-NEXT:    xorps %xmm2, %xmm2
; SSE-NEXT:    rsqrtss %xmm1, %xmm2
; SSE-NEXT:    mulss %xmm2, %xmm1
; SSE-NEXT:    mulss %xmm2, %xmm1
; SSE-NEXT:    addss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE-NEXT:    mulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE-NEXT:    mulss %xmm0, %xmm2
; SSE-NEXT:    mulss %xmm1, %xmm2
; SSE-NEXT:    movaps %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: div_sqrt_fabs_f32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmulss %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vmulss %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vrsqrtss %xmm1, %xmm1, %xmm2
; AVX1-NEXT:    vmulss %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vmulss %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vaddss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX1-NEXT:    vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX1-NEXT:    vmulss %xmm0, %xmm2, %xmm0
; AVX1-NEXT:    vmulss %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    retq
;
; AVX512-LABEL: div_sqrt_fabs_f32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmulss %xmm1, %xmm1, %xmm1
; AVX512-NEXT:    vmulss %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vrsqrtss %xmm1, %xmm1, %xmm2
; AVX512-NEXT:    vmulss %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vfmadd213ss {{.*#+}} xmm1 = (xmm2 * xmm1) + mem
; AVX512-NEXT:    vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX512-NEXT:    vmulss %xmm0, %xmm2, %xmm0
; AVX512-NEXT:    vmulss %xmm0, %xmm1, %xmm0
; AVX512-NEXT:    retq
  %s = call fast float @llvm.sqrt.f32(float %z)
  %a = call fast float @llvm.fabs.f32(float %y)
  %m = fmul fast float %s, %a
  %d = fdiv fast float %x, %m
  ret float %d
}

; x / (fabs(y) * sqrt(z)) --> x * rsqrt(y*y*z)
define <4 x float> @div_sqrt_fabs_v4f32(<4 x float> %x, <4 x float> %y, <4 x float> %z) {
; SSE-LABEL: div_sqrt_fabs_v4f32:
; SSE:       # %bb.0:
; SSE-NEXT:    mulps %xmm1, %xmm1
; SSE-NEXT:    mulps %xmm2, %xmm1
; SSE-NEXT:    rsqrtps %xmm1, %xmm2
; SSE-NEXT:    mulps %xmm2, %xmm1
; SSE-NEXT:    mulps %xmm2, %xmm1
; SSE-NEXT:    addps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE-NEXT:    mulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE-NEXT:    mulps %xmm1, %xmm2
; SSE-NEXT:    mulps %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: div_sqrt_fabs_v4f32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmulps %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vmulps %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vrsqrtps %xmm1, %xmm2
; AVX1-NEXT:    vmulps %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vmulps %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX1-NEXT:    vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX1-NEXT:    vmulps %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vmulps %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX512-LABEL: div_sqrt_fabs_v4f32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmulps %xmm1, %xmm1, %xmm1
; AVX512-NEXT:    vmulps %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vrsqrtps %xmm1, %xmm2
; AVX512-NEXT:    vmulps %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vbroadcastss {{.*#+}} xmm3 = [-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0]
; AVX512-NEXT:    vfmadd231ps {{.*#+}} xmm3 = (xmm2 * xmm1) + xmm3
; AVX512-NEXT:    vbroadcastss {{.*#+}} xmm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
; AVX512-NEXT:    vmulps %xmm1, %xmm2, %xmm1
; AVX512-NEXT:    vmulps %xmm3, %xmm1, %xmm1
; AVX512-NEXT:    vmulps %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %s = call <4 x float> @llvm.sqrt.v4f32(<4 x float> %z)
  %a = call <4 x float> @llvm.fabs.v4f32(<4 x float> %y)
  %m = fmul contract reassoc <4 x float> %a, %s
  %d = fdiv contract reassoc arcp <4 x float> %x, %m
  ret <4 x float> %d
}

; This has 'arcp' but does not have 'reassoc' FMF.
; We allow converting the sqrt to an estimate, but
; do not pull the divisor into the estimate.
; x / (fabs(y) * sqrt(z)) --> x * rsqrt(z) / fabs(y)
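; (Without 'reassoc' on the fmul, fabs(y) cannot be folded into the sqrt
; operand as above; 'arcp' on the fdiv still allows estimating rsqrt(z)
; and then dividing the estimate by |y|.)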
define <4 x float> @div_sqrt_fabs_v4f32_fmf(<4 x float> %x, <4 x float> %y, <4 x float> %z) {
; SSE-LABEL: div_sqrt_fabs_v4f32_fmf:
; SSE:       # %bb.0:
; SSE-NEXT:    rsqrtps %xmm2, %xmm3
; SSE-NEXT:    mulps %xmm3, %xmm2
; SSE-NEXT:    mulps %xmm3, %xmm2
; SSE-NEXT:    addps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE-NEXT:    mulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
; SSE-NEXT:    mulps %xmm2, %xmm3
; SSE-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE-NEXT:    divps %xmm1, %xmm3
; SSE-NEXT:    mulps %xmm3, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: div_sqrt_fabs_v4f32_fmf:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vrsqrtps %xmm2, %xmm3
; AVX1-NEXT:    vmulps %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vmulps %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX1-NEXT:    vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3
; AVX1-NEXT:    vmulps %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX1-NEXT:    vdivps %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vmulps %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX512-LABEL: div_sqrt_fabs_v4f32_fmf:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vrsqrtps %xmm2, %xmm3
; AVX512-NEXT:    vbroadcastss {{.*#+}} xmm4 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
; AVX512-NEXT:    vmulps %xmm4, %xmm3, %xmm4
; AVX512-NEXT:    vmulps %xmm3, %xmm2, %xmm2
; AVX512-NEXT:    vmulps %xmm3, %xmm2, %xmm2
; AVX512-NEXT:    vbroadcastss {{.*#+}} xmm3 = [-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0]
; AVX512-NEXT:    vaddps %xmm3, %xmm2, %xmm2
; AVX512-NEXT:    vbroadcastss {{.*#+}} xmm3 = [NaN,NaN,NaN,NaN]
; AVX512-NEXT:    vmulps %xmm2, %xmm4, %xmm2
; AVX512-NEXT:    vandps %xmm3, %xmm1, %xmm1
; AVX512-NEXT:    vdivps %xmm1, %xmm2, %xmm1
; AVX512-NEXT:    vmulps %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %s = call <4 x float> @llvm.sqrt.v4f32(<4 x float> %z)
  %a = call <4 x float> @llvm.fabs.v4f32(<4 x float> %y)
  %m = fmul <4 x float> %a, %s
  %d = fdiv arcp <4 x float> %x, %m
  ret <4 x float> %d
}

; No estimates for f64, so do not convert fabs into an fmul.
define double @div_sqrt_fabs_f64(double %x, double %y, double %z) {
; SSE-LABEL: div_sqrt_fabs_f64:
; SSE:       # %bb.0:
; SSE-NEXT:    sqrtsd %xmm2, %xmm2
; SSE-NEXT:    andpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE-NEXT:    mulsd %xmm2, %xmm1
; SSE-NEXT:    divsd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: div_sqrt_fabs_f64:
; AVX:       # %bb.0:
; AVX-NEXT:    vsqrtsd %xmm2, %xmm2, %xmm2
; AVX-NEXT:    vandpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX-NEXT:    vmulsd %xmm1, %xmm2, %xmm1
; AVX-NEXT:    vdivsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %s = call fast double @llvm.sqrt.f64(double %z)
  %a = call fast double @llvm.fabs.f64(double %y)
  %m = fmul fast double %s, %a
  %d = fdiv fast double %x, %m
  ret double %d
}

; This is a special case for the general pattern above -
; if the sqrt operand is the same as the other mul op,
; then fabs may be omitted.
; x / (y * sqrt(y)) --> x * rsqrt(y*y*y)
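; (sqrt(y) is already NaN for negative y, so y may be treated as
; non-negative here: y == fabs(y), and y * sqrt(y) == sqrt(y*y*y).)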
define float @div_sqrt_f32(float %x, float %y) {
; SSE-LABEL: div_sqrt_f32:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps %xmm1, %xmm2
; SSE-NEXT:    mulss %xmm1, %xmm2
; SSE-NEXT:    mulss %xmm1, %xmm2
; SSE-NEXT:    xorps %xmm1, %xmm1
; SSE-NEXT:    rsqrtss %xmm2, %xmm1
; SSE-NEXT:    mulss %xmm1, %xmm2
; SSE-NEXT:    mulss %xmm1, %xmm2
; SSE-NEXT:    addss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE-NEXT:    mulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE-NEXT:    mulss %xmm0, %xmm1
; SSE-NEXT:    mulss %xmm2, %xmm1
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: div_sqrt_f32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmulss %xmm1, %xmm1, %xmm2
; AVX1-NEXT:    vmulss %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vrsqrtss %xmm1, %xmm1, %xmm2
; AVX1-NEXT:    vmulss %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vmulss %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vaddss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX1-NEXT:    vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX1-NEXT:    vmulss %xmm0, %xmm2, %xmm0
; AVX1-NEXT:    vmulss %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    retq
;
; AVX512-LABEL: div_sqrt_f32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmulss %xmm1, %xmm1, %xmm2
; AVX512-NEXT:    vmulss %xmm1, %xmm2, %xmm1
; AVX512-NEXT:    vrsqrtss %xmm1, %xmm1, %xmm2
; AVX512-NEXT:    vmulss %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vfmadd213ss {{.*#+}} xmm1 = (xmm2 * xmm1) + mem
; AVX512-NEXT:    vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX512-NEXT:    vmulss %xmm0, %xmm2, %xmm0
; AVX512-NEXT:    vmulss %xmm0, %xmm1, %xmm0
; AVX512-NEXT:    retq
  %s = call fast float @llvm.sqrt.f32(float %y)
  %m = fmul fast float %s, %y
  %d = fdiv fast float %x, %m
  ret float %d
}

; This is a special case for the general pattern above -
; if the sqrt operand is the same as the other mul op,
; then fabs may be omitted.
; x / (y * sqrt(y)) --> x * rsqrt(y*y*y)
define <4 x float> @div_sqrt_v4f32(<4 x float> %x, <4 x float> %y) {
; SSE-LABEL: div_sqrt_v4f32:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps %xmm1, %xmm2
; SSE-NEXT:    mulps %xmm1, %xmm2
; SSE-NEXT:    mulps %xmm1, %xmm2
; SSE-NEXT:    rsqrtps %xmm2, %xmm1
; SSE-NEXT:    mulps %xmm1, %xmm2
; SSE-NEXT:    mulps %xmm1, %xmm2
; SSE-NEXT:    addps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE-NEXT:    mulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE-NEXT:    mulps %xmm2, %xmm1
; SSE-NEXT:    mulps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: div_sqrt_v4f32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmulps %xmm1, %xmm1, %xmm2
; AVX1-NEXT:    vmulps %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vrsqrtps %xmm1, %xmm2
; AVX1-NEXT:    vmulps %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vmulps %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX1-NEXT:    vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX1-NEXT:    vmulps %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vmulps %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX512-LABEL: div_sqrt_v4f32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmulps %xmm1, %xmm1, %xmm2
; AVX512-NEXT:    vmulps %xmm1, %xmm2, %xmm1
; AVX512-NEXT:    vrsqrtps %xmm1, %xmm2
; AVX512-NEXT:    vmulps %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vbroadcastss {{.*#+}} xmm3 = [-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0]
; AVX512-NEXT:    vfmadd231ps {{.*#+}} xmm3 = (xmm2 * xmm1) + xmm3
; AVX512-NEXT:    vbroadcastss {{.*#+}} xmm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
; AVX512-NEXT:    vmulps %xmm1, %xmm2, %xmm1
; AVX512-NEXT:    vmulps %xmm3, %xmm1, %xmm1
; AVX512-NEXT:    vmulps %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %s = call <4 x float> @llvm.sqrt.v4f32(<4 x float> %y)
  %m = fmul contract reassoc <4 x float> %y, %s
  %d = fdiv contract reassoc arcp <4 x float> %x, %m
  ret <4 x float> %d
}

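; x / sqrt(x) == sqrt(x) wherever sqrt(x) is defined and non-zero, so
; with the right flags the division below folds away and only a single
; sqrt instruction remains.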
define double @sqrt_fdiv_common_operand(double %x) nounwind {
; SSE-LABEL: sqrt_fdiv_common_operand:
; SSE:       # %bb.0:
; SSE-NEXT:    sqrtsd %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: sqrt_fdiv_common_operand:
; AVX:       # %bb.0:
; AVX-NEXT:    vsqrtsd %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %sqrt = call fast double @llvm.sqrt.f64(double %x)
  %r = fdiv fast double %x, %sqrt
  ret double %r
}

define <2 x double> @sqrt_fdiv_common_operand_vec(<2 x double> %x) nounwind {
; SSE-LABEL: sqrt_fdiv_common_operand_vec:
; SSE:       # %bb.0:
; SSE-NEXT:    sqrtpd %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: sqrt_fdiv_common_operand_vec:
; AVX:       # %bb.0:
; AVX-NEXT:    vsqrtpd %xmm0, %xmm0
; AVX-NEXT:    retq
  %sqrt = call <2 x double> @llvm.sqrt.v2f64(<2 x double> %x)
  %r = fdiv arcp nsz reassoc <2 x double> %x, %sqrt
  ret <2 x double> %r
}

define double @sqrt_fdiv_common_operand_extra_use(double %x, double* %p) nounwind {
; SSE-LABEL: sqrt_fdiv_common_operand_extra_use:
; SSE:       # %bb.0:
; SSE-NEXT:    sqrtsd %xmm0, %xmm0
; SSE-NEXT:    movsd %xmm0, (%rdi)
; SSE-NEXT:    retq
;
; AVX-LABEL: sqrt_fdiv_common_operand_extra_use:
; AVX:       # %bb.0:
; AVX-NEXT:    vsqrtsd %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vmovsd %xmm0, (%rdi)
; AVX-NEXT:    retq
  %sqrt = call fast double @llvm.sqrt.f64(double %x)
  store double %sqrt, double* %p
  %r = fdiv fast double %x, %sqrt
  ret double %r
}

define double @sqrt_simplify_before_recip(double %x, double* %p) nounwind {
; SSE-LABEL: sqrt_simplify_before_recip:
; SSE:       # %bb.0:
; SSE-NEXT:    sqrtsd %xmm0, %xmm0
; SSE-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
; SSE-NEXT:    divsd %xmm0, %xmm1
; SSE-NEXT:    movsd %xmm1, (%rdi)
; SSE-NEXT:    retq
;
; AVX-LABEL: sqrt_simplify_before_recip:
; AVX:       # %bb.0:
; AVX-NEXT:    vsqrtsd %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
; AVX-NEXT:    vdivsd %xmm0, %xmm1, %xmm1
; AVX-NEXT:    vmovsd %xmm1, (%rdi)
; AVX-NEXT:    retq
  %sqrt = tail call fast double @llvm.sqrt.f64(double %x)
  %rsqrt = fdiv fast double 1.0, %sqrt
  %sqrt_fast = fdiv fast double %x, %sqrt
  store double %rsqrt, double* %p, align 8
  ret double %sqrt_fast
}

define <2 x double> @sqrt_simplify_before_recip_vec(<2 x double> %x, <2 x double>* %p) nounwind {
; SSE-LABEL: sqrt_simplify_before_recip_vec:
; SSE:       # %bb.0:
; SSE-NEXT:    sqrtpd %xmm0, %xmm0
; SSE-NEXT:    movapd {{.*#+}} xmm1 = [1.0E+0,1.0E+0]
; SSE-NEXT:    divpd %xmm0, %xmm1
; SSE-NEXT:    movupd %xmm1, (%rdi)
; SSE-NEXT:    retq
;
; AVX-LABEL: sqrt_simplify_before_recip_vec:
; AVX:       # %bb.0:
; AVX-NEXT:    vsqrtpd %xmm0, %xmm0
; AVX-NEXT:    vmovapd {{.*#+}} xmm1 = [1.0E+0,1.0E+0]
; AVX-NEXT:    vdivpd %xmm0, %xmm1, %xmm1
; AVX-NEXT:    vmovupd %xmm1, (%rdi)
; AVX-NEXT:    retq
  %sqrt = tail call fast <2 x double> @llvm.sqrt.v2f64(<2 x double> %x)
  %rsqrt = fdiv fast <2 x double> <double 1.0, double 1.0>, %sqrt
  %sqrt_fast = fdiv fast <2 x double> %x, %sqrt
  store <2 x double> %rsqrt, <2 x double>* %p, align 8
  ret <2 x double> %sqrt_fast
}

define double @sqrt_simplify_before_recip_order(double %x, double* %p) nounwind {
; SSE-LABEL: sqrt_simplify_before_recip_order:
; SSE:       # %bb.0:
; SSE-NEXT:    sqrtsd %xmm0, %xmm0
; SSE-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
; SSE-NEXT:    divsd %xmm0, %xmm1
; SSE-NEXT:    movsd %xmm1, (%rdi)
; SSE-NEXT:    retq
;
; AVX-LABEL: sqrt_simplify_before_recip_order:
; AVX:       # %bb.0:
; AVX-NEXT:    vsqrtsd %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
; AVX-NEXT:    vdivsd %xmm0, %xmm1, %xmm1
; AVX-NEXT:    vmovsd %xmm1, (%rdi)
; AVX-NEXT:    retq
  %sqrt = tail call fast double @llvm.sqrt.f64(double %x)
  %sqrt_fast = fdiv fast double %x, %sqrt
  %rsqrt = fdiv fast double 42.0, %sqrt
  store double %rsqrt, double* %p, align 8
  ret double %sqrt_fast
}

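; A quick key to the attributes below: #0 disables square-root and
; division estimates ("!sqrtf,..."), #1 enables them ("sqrt,vec-sqrt"),
; #3 adds "denormal-fp-math"="ieee" (guard against denormal inputs), and
; #4 uses "ieee,preserve-sign", allowing denormal inputs to be treated as
; zero.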
attributes #0 = { "unsafe-fp-math"="true" "reciprocal-estimates"="!sqrtf,!vec-sqrtf,!divf,!vec-divf" }
attributes #1 = { "unsafe-fp-math"="true" "reciprocal-estimates"="sqrt,vec-sqrt" }
attributes #2 = { nounwind readnone }
attributes #3 = { "unsafe-fp-math"="true" "reciprocal-estimates"="sqrt,vec-sqrt" "denormal-fp-math"="ieee" }
attributes #4 = { "unsafe-fp-math"="true" "reciprocal-estimates"="sqrt,vec-sqrt" "denormal-fp-math"="ieee,preserve-sign" }