; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64--linux-gnu -mcpu=x86-64 -mattr=+sse2 | FileCheck %s --check-prefix=CHECK --check-prefix=SSE
; RUN: llc < %s -mtriple=x86_64--linux-gnu -mcpu=x86-64 -mattr=+avx | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64--linux-gnu -mcpu=x86-64 -mattr=+avx512f | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX512

declare double @__sqrt_finite(double)
declare float @__sqrtf_finite(float)
declare x86_fp80 @__sqrtl_finite(x86_fp80)
declare float @llvm.sqrt.f32(float)
declare <4 x float> @llvm.sqrt.v4f32(<4 x float>)
declare <8 x float> @llvm.sqrt.v8f32(<8 x float>)
declare <16 x float> @llvm.sqrt.v16f32(<16 x float>)
declare double @llvm.sqrt.f64(double)
declare <2 x double> @llvm.sqrt.v2f64(<2 x double>)

declare float @llvm.fabs.f32(float)
declare <4 x float> @llvm.fabs.v4f32(<4 x float>)
declare double @llvm.fabs.f64(double)

define double @finite_f64_no_estimate(double %d) #0 {
; SSE-LABEL: finite_f64_no_estimate:
; SSE:       # %bb.0:
; SSE-NEXT:    sqrtsd %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: finite_f64_no_estimate:
; AVX:       # %bb.0:
; AVX-NEXT:    vsqrtsd %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %call = tail call double @__sqrt_finite(double %d) #2
  ret double %call
}

; No estimates for doubles.

define double @finite_f64_estimate(double %d) #1 {
; SSE-LABEL: finite_f64_estimate:
; SSE:       # %bb.0:
; SSE-NEXT:    sqrtsd %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: finite_f64_estimate:
; AVX:       # %bb.0:
; AVX-NEXT:    vsqrtsd %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %call = tail call double @__sqrt_finite(double %d) #2
  ret double %call
}

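; Note: as the checks above show, the backend forms no rsqrt-estimate sequence
; for f64 on any of these subtargets (even the AVX512 run produces vsqrtsd),
; so with estimates enabled via attribute #1 an f64 sqrt still lowers to a
; plain hardware square root.
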
define float @finite_f32_no_estimate(float %f) #0 {
; SSE-LABEL: finite_f32_no_estimate:
; SSE:       # %bb.0:
; SSE-NEXT:    sqrtss %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: finite_f32_no_estimate:
; AVX:       # %bb.0:
; AVX-NEXT:    vsqrtss %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %call = tail call float @__sqrtf_finite(float %f) #2
  ret float %call
}

define float @finite_f32_estimate_ieee(float %f) #1 {
; SSE-LABEL: finite_f32_estimate_ieee:
; SSE:       # %bb.0:
; SSE-NEXT:    sqrtss %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: finite_f32_estimate_ieee:
; AVX:       # %bb.0:
; AVX-NEXT:    vsqrtss %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %call = tail call float @__sqrtf_finite(float %f) #2
  ret float %call
}

define float @finite_f32_estimate_ieee_ninf(float %f) #1 {
; SSE-LABEL: finite_f32_estimate_ieee_ninf:
; SSE:       # %bb.0:
; SSE-NEXT:    sqrtss %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: finite_f32_estimate_ieee_ninf:
; AVX:       # %bb.0:
; AVX-NEXT:    vsqrtss %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %call = tail call ninf afn float @__sqrtf_finite(float %f) #2
  ret float %call
}

define float @finite_f32_estimate_daz(float %f) #4 {
; SSE-LABEL: finite_f32_estimate_daz:
; SSE:       # %bb.0:
; SSE-NEXT:    sqrtss %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: finite_f32_estimate_daz:
; AVX:       # %bb.0:
; AVX-NEXT:    vsqrtss %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %call = tail call float @__sqrtf_finite(float %f) #2
  ret float %call
}

define float @finite_f32_estimate_daz_ninf(float %f) #4 {
; SSE-LABEL: finite_f32_estimate_daz_ninf:
; SSE:       # %bb.0:
; SSE-NEXT:    sqrtss %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: finite_f32_estimate_daz_ninf:
; AVX:       # %bb.0:
; AVX-NEXT:    vsqrtss %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %call = tail call ninf afn float @__sqrtf_finite(float %f) #2
  ret float %call
}

define x86_fp80 @finite_f80_no_estimate(x86_fp80 %ld) #0 {
; CHECK-LABEL: finite_f80_no_estimate:
; CHECK:       # %bb.0:
; CHECK-NEXT:    fldt {{[0-9]+}}(%rsp)
; CHECK-NEXT:    fsqrt
; CHECK-NEXT:    retq
  %call = tail call x86_fp80 @__sqrtl_finite(x86_fp80 %ld) #2
  ret x86_fp80 %call
}

; Don't die on the impossible.

define x86_fp80 @finite_f80_estimate_but_no(x86_fp80 %ld) #1 {
; CHECK-LABEL: finite_f80_estimate_but_no:
; CHECK:       # %bb.0:
; CHECK-NEXT:    fldt {{[0-9]+}}(%rsp)
; CHECK-NEXT:    fsqrt
; CHECK-NEXT:    retq
  %call = tail call x86_fp80 @__sqrtl_finite(x86_fp80 %ld) #2
  ret x86_fp80 %call
}

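; Note: x87 has no reciprocal square-root estimate instruction, so there is
; nothing to expand an x86_fp80 estimate into; requesting estimates must
; still produce a correct fsqrt rather than crashing the backend.
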
; PR34994 - https://bugs.llvm.org/show_bug.cgi?id=34994

define float @sqrtf_check_denorms(float %x) #3 {
; SSE-LABEL: sqrtf_check_denorms:
; SSE:       # %bb.0:
; SSE-NEXT:    sqrtss %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: sqrtf_check_denorms:
; AVX:       # %bb.0:
; AVX-NEXT:    vsqrtss %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %call = tail call float @__sqrtf_finite(float %x) #2
  ret float %call
}

define float @sqrtf_check_denorms_ninf(float %x) #3 {
; SSE-LABEL: sqrtf_check_denorms_ninf:
; SSE:       # %bb.0:
; SSE-NEXT:    sqrtss %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: sqrtf_check_denorms_ninf:
; AVX:       # %bb.0:
; AVX-NEXT:    vsqrtss %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %call = tail call ninf afn float @__sqrtf_finite(float %x) #2
  ret float %call
}

define <4 x float> @sqrt_v4f32_check_denorms(<4 x float> %x) #3 {
; SSE-LABEL: sqrt_v4f32_check_denorms:
; SSE:       # %bb.0:
; SSE-NEXT:    sqrtps %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: sqrt_v4f32_check_denorms:
; AVX:       # %bb.0:
; AVX-NEXT:    vsqrtps %xmm0, %xmm0
; AVX-NEXT:    retq
  %call = tail call <4 x float> @llvm.sqrt.v4f32(<4 x float> %x) #2
  ret <4 x float> %call
}

define <4 x float> @sqrt_v4f32_check_denorms_ieee_ninf(<4 x float> %x) #3 {
; SSE-LABEL: sqrt_v4f32_check_denorms_ieee_ninf:
; SSE:       # %bb.0:
; SSE-NEXT:    rsqrtps %xmm0, %xmm1
; SSE-NEXT:    movaps %xmm0, %xmm2
; SSE-NEXT:    mulps %xmm1, %xmm2
; SSE-NEXT:    movaps {{.*#+}} xmm3 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
; SSE-NEXT:    mulps %xmm2, %xmm3
; SSE-NEXT:    mulps %xmm1, %xmm2
; SSE-NEXT:    addps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE-NEXT:    mulps %xmm3, %xmm2
; SSE-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT:    movaps {{.*#+}} xmm1 = [1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38]
; SSE-NEXT:    cmpleps %xmm0, %xmm1
; SSE-NEXT:    andps %xmm2, %xmm1
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: sqrt_v4f32_check_denorms_ieee_ninf:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vrsqrtps %xmm0, %xmm1
; AVX1-NEXT:    vmulps %xmm1, %xmm0, %xmm2
; AVX1-NEXT:    vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm3
; AVX1-NEXT:    vmulps %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX1-NEXT:    vmulps %xmm1, %xmm3, %xmm1
; AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38]
; AVX1-NEXT:    vcmpleps %xmm0, %xmm2, %xmm0
; AVX1-NEXT:    vandps %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX512-LABEL: sqrt_v4f32_check_denorms_ieee_ninf:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vrsqrtps %xmm0, %xmm1
; AVX512-NEXT:    vmulps %xmm1, %xmm0, %xmm2
; AVX512-NEXT:    vbroadcastss {{.*#+}} xmm3 = [-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0]
; AVX512-NEXT:    vfmadd231ps {{.*#+}} xmm3 = (xmm2 * xmm1) + xmm3
; AVX512-NEXT:    vbroadcastss {{.*#+}} xmm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
; AVX512-NEXT:    vmulps %xmm1, %xmm2, %xmm1
; AVX512-NEXT:    vmulps %xmm3, %xmm1, %xmm1
; AVX512-NEXT:    vbroadcastss {{.*#+}} xmm2 = [NaN,NaN,NaN,NaN]
; AVX512-NEXT:    vandps %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38]
; AVX512-NEXT:    vcmpleps %xmm0, %xmm2, %xmm0
; AVX512-NEXT:    vandps %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %call = tail call ninf afn <4 x float> @llvm.sqrt.v4f32(<4 x float> %x) #2
  ret <4 x float> %call
}

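; Note on the expansion above: the trailing andps/cmpleps pair is a guard for
; denormal and zero inputs. The [NaN,NaN,NaN,NaN] broadcast is the 0x7FFFFFFF
; mask (fabs, clearing the sign bit), 1.17549435E-38 is the smallest normal
; float, and the compare result zeroes any lane whose |x| falls below the
; normal range instead of letting the estimate produce garbage (see the
; PR34994 reference above).
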
define <4 x float> @sqrt_v4f32_check_denorms_dynamic_ninf(<4 x float> %x) #6 {
; SSE-LABEL: sqrt_v4f32_check_denorms_dynamic_ninf:
; SSE:       # %bb.0:
; SSE-NEXT:    rsqrtps %xmm0, %xmm1
; SSE-NEXT:    movaps %xmm0, %xmm2
; SSE-NEXT:    mulps %xmm1, %xmm2
; SSE-NEXT:    movaps {{.*#+}} xmm3 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
; SSE-NEXT:    mulps %xmm2, %xmm3
; SSE-NEXT:    mulps %xmm1, %xmm2
; SSE-NEXT:    addps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE-NEXT:    mulps %xmm3, %xmm2
; SSE-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT:    movaps {{.*#+}} xmm1 = [1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38]
; SSE-NEXT:    cmpleps %xmm0, %xmm1
; SSE-NEXT:    andps %xmm2, %xmm1
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: sqrt_v4f32_check_denorms_dynamic_ninf:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vrsqrtps %xmm0, %xmm1
; AVX1-NEXT:    vmulps %xmm1, %xmm0, %xmm2
; AVX1-NEXT:    vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm3
; AVX1-NEXT:    vmulps %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX1-NEXT:    vmulps %xmm1, %xmm3, %xmm1
; AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38]
; AVX1-NEXT:    vcmpleps %xmm0, %xmm2, %xmm0
; AVX1-NEXT:    vandps %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX512-LABEL: sqrt_v4f32_check_denorms_dynamic_ninf:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vrsqrtps %xmm0, %xmm1
; AVX512-NEXT:    vmulps %xmm1, %xmm0, %xmm2
; AVX512-NEXT:    vbroadcastss {{.*#+}} xmm3 = [-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0]
; AVX512-NEXT:    vfmadd231ps {{.*#+}} xmm3 = (xmm2 * xmm1) + xmm3
; AVX512-NEXT:    vbroadcastss {{.*#+}} xmm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
; AVX512-NEXT:    vmulps %xmm1, %xmm2, %xmm1
; AVX512-NEXT:    vmulps %xmm3, %xmm1, %xmm1
; AVX512-NEXT:    vbroadcastss {{.*#+}} xmm2 = [NaN,NaN,NaN,NaN]
; AVX512-NEXT:    vandps %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38]
; AVX512-NEXT:    vcmpleps %xmm0, %xmm2, %xmm0
; AVX512-NEXT:    vandps %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %call = tail call ninf afn <4 x float> @llvm.sqrt.v4f32(<4 x float> %x) #2
  ret <4 x float> %call
}

define float @f32_no_estimate(float %x) #0 {
; SSE-LABEL: f32_no_estimate:
; SSE:       # %bb.0:
; SSE-NEXT:    sqrtss %xmm0, %xmm1
; SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE-NEXT:    divss %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: f32_no_estimate:
; AVX:       # %bb.0:
; AVX-NEXT:    vsqrtss %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; AVX-NEXT:    vdivss %xmm0, %xmm1, %xmm0
; AVX-NEXT:    retq
  %sqrt = tail call float @llvm.sqrt.f32(float %x)
  %div = fdiv fast float 1.0, %sqrt
  ret float %div
}

define float @f32_estimate(float %x) #1 {
; SSE-LABEL: f32_estimate:
; SSE:       # %bb.0:
; SSE-NEXT:    rsqrtss %xmm0, %xmm1
; SSE-NEXT:    mulss %xmm1, %xmm0
; SSE-NEXT:    mulss %xmm1, %xmm0
; SSE-NEXT:    addss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT:    mulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE-NEXT:    mulss %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: f32_estimate:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vrsqrtss %xmm0, %xmm0, %xmm1
; AVX1-NEXT:    vmulss %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vmulss %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vaddss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX1-NEXT:    vmulss %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    retq
;
; AVX512-LABEL: f32_estimate:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vrsqrtss %xmm0, %xmm0, %xmm1
; AVX512-NEXT:    vmulss %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vfmadd213ss {{.*#+}} xmm0 = (xmm1 * xmm0) + mem
; AVX512-NEXT:    vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX512-NEXT:    vmulss %xmm0, %xmm1, %xmm0
; AVX512-NEXT:    retq
  %sqrt = tail call float @llvm.sqrt.f32(float %x)
  %div = fdiv fast float 1.0, %sqrt
  ret float %div
}

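; The constants loaded in the sequences above are -0.5 and -3.0: with
; e = rsqrt-estimate(x), one Newton-Raphson refinement step computes
;
;   rsqrt(x) ~= (-0.5 * e) * (x * e * e - 3.0)
;             = 0.5 * e * (3.0 - x * e * e)
;
; which roughly squares the ~2^-12 relative error of the hardware estimate,
; bringing the result close to full single precision.
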
define float @f32_estimate2(float %x) #5 {
; SSE-LABEL: f32_estimate2:
; SSE:       # %bb.0:
; SSE-NEXT:    sqrtss %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: f32_estimate2:
; AVX:       # %bb.0:
; AVX-NEXT:    vsqrtss %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %sqrt = tail call fast float @llvm.sqrt.f32(float %x)
  ret float %sqrt
}

define <4 x float> @v4f32_no_estimate(<4 x float> %x) #0 {
; SSE-LABEL: v4f32_no_estimate:
; SSE:       # %bb.0:
; SSE-NEXT:    sqrtps %xmm0, %xmm1
; SSE-NEXT:    movaps {{.*#+}} xmm0 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; SSE-NEXT:    divps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: v4f32_no_estimate:
; AVX:       # %bb.0:
; AVX-NEXT:    vsqrtps %xmm0, %xmm0
; AVX-NEXT:    vbroadcastss {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; AVX-NEXT:    vdivps %xmm0, %xmm1, %xmm0
; AVX-NEXT:    retq
  %sqrt = tail call <4 x float> @llvm.sqrt.v4f32(<4 x float> %x)
  %div = fdiv fast <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>, %sqrt
  ret <4 x float> %div
}

define <4 x float> @v4f32_estimate(<4 x float> %x) #1 {
; SSE-LABEL: v4f32_estimate:
; SSE:       # %bb.0:
; SSE-NEXT:    rsqrtps %xmm0, %xmm1
; SSE-NEXT:    mulps %xmm1, %xmm0
; SSE-NEXT:    mulps %xmm1, %xmm0
; SSE-NEXT:    addps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT:    mulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE-NEXT:    mulps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: v4f32_estimate:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vrsqrtps %xmm0, %xmm1
; AVX1-NEXT:    vmulps %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vmulps %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX1-NEXT:    vmulps %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    retq
;
; AVX512-LABEL: v4f32_estimate:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vrsqrtps %xmm0, %xmm1
; AVX512-NEXT:    vmulps %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vbroadcastss {{.*#+}} xmm2 = [-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0]
; AVX512-NEXT:    vfmadd231ps {{.*#+}} xmm2 = (xmm1 * xmm0) + xmm2
; AVX512-NEXT:    vbroadcastss {{.*#+}} xmm0 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
; AVX512-NEXT:    vmulps %xmm0, %xmm1, %xmm0
; AVX512-NEXT:    vmulps %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %sqrt = tail call <4 x float> @llvm.sqrt.v4f32(<4 x float> %x)
  %div = fdiv fast <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>, %sqrt
  ret <4 x float> %div
}

define <4 x float> @v4f32_estimate2(<4 x float> %x) #5 {
; SSE-LABEL: v4f32_estimate2:
; SSE:       # %bb.0:
; SSE-NEXT:    rsqrtps %xmm0, %xmm2
; SSE-NEXT:    mulps %xmm0, %xmm2
; SSE-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT:    movaps {{.*#+}} xmm1 = [1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38]
; SSE-NEXT:    cmpleps %xmm0, %xmm1
; SSE-NEXT:    andps %xmm2, %xmm1
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: v4f32_estimate2:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vrsqrtps %xmm0, %xmm1
; AVX1-NEXT:    vmulps %xmm1, %xmm0, %xmm1
; AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38]
; AVX1-NEXT:    vcmpleps %xmm0, %xmm2, %xmm0
; AVX1-NEXT:    vandps %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX512-LABEL: v4f32_estimate2:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vrsqrtps %xmm0, %xmm1
; AVX512-NEXT:    vmulps %xmm1, %xmm0, %xmm1
; AVX512-NEXT:    vbroadcastss {{.*#+}} xmm2 = [NaN,NaN,NaN,NaN]
; AVX512-NEXT:    vandps %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38]
; AVX512-NEXT:    vcmpleps %xmm0, %xmm2, %xmm0
; AVX512-NEXT:    vandps %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %sqrt = tail call fast <4 x float> @llvm.sqrt.v4f32(<4 x float> %x)
  ret <4 x float> %sqrt
}

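; With "reciprocal-estimates"="all:0" (attribute #5), zero refinement steps
; are requested, so sqrt(x) is approximated directly as x * rsqrt(x) with no
; Newton-Raphson iteration; only the denormal/zero guard described above
; remains around the raw estimate.
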
define <8 x float> @v8f32_no_estimate(<8 x float> %x) #0 {
; SSE-LABEL: v8f32_no_estimate:
; SSE:       # %bb.0:
; SSE-NEXT:    sqrtps %xmm1, %xmm2
; SSE-NEXT:    sqrtps %xmm0, %xmm3
; SSE-NEXT:    movaps {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    divps %xmm3, %xmm0
; SSE-NEXT:    divps %xmm2, %xmm1
; SSE-NEXT:    retq
;
; AVX-LABEL: v8f32_no_estimate:
; AVX:       # %bb.0:
; AVX-NEXT:    vsqrtps %ymm0, %ymm0
; AVX-NEXT:    vbroadcastss {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; AVX-NEXT:    vdivps %ymm0, %ymm1, %ymm0
; AVX-NEXT:    retq
  %sqrt = tail call <8 x float> @llvm.sqrt.v8f32(<8 x float> %x)
  %div = fdiv fast <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %sqrt
  ret <8 x float> %div
}

define <8 x float> @v8f32_estimate(<8 x float> %x) #1 {
; SSE-LABEL: v8f32_estimate:
; SSE:       # %bb.0:
; SSE-NEXT:    rsqrtps %xmm0, %xmm2
; SSE-NEXT:    movaps {{.*#+}} xmm3 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
; SSE-NEXT:    mulps %xmm2, %xmm0
; SSE-NEXT:    mulps %xmm2, %xmm0
; SSE-NEXT:    mulps %xmm3, %xmm2
; SSE-NEXT:    movaps {{.*#+}} xmm4 = [-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0]
; SSE-NEXT:    addps %xmm4, %xmm0
; SSE-NEXT:    mulps %xmm2, %xmm0
; SSE-NEXT:    rsqrtps %xmm1, %xmm2
; SSE-NEXT:    mulps %xmm2, %xmm3
; SSE-NEXT:    mulps %xmm2, %xmm1
; SSE-NEXT:    mulps %xmm2, %xmm1
; SSE-NEXT:    addps %xmm4, %xmm1
; SSE-NEXT:    mulps %xmm3, %xmm1
; SSE-NEXT:    retq
;
; AVX1-LABEL: v8f32_estimate:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vrsqrtps %ymm0, %ymm1
; AVX1-NEXT:    vmulps %ymm1, %ymm0, %ymm0
; AVX1-NEXT:    vmulps %ymm1, %ymm0, %ymm0
; AVX1-NEXT:    vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX1-NEXT:    vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX1-NEXT:    vmulps %ymm0, %ymm1, %ymm0
; AVX1-NEXT:    retq
;
; AVX512-LABEL: v8f32_estimate:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vrsqrtps %ymm0, %ymm1
; AVX512-NEXT:    vmulps %ymm1, %ymm0, %ymm0
; AVX512-NEXT:    vbroadcastss {{.*#+}} ymm2 = [-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0]
; AVX512-NEXT:    vbroadcastss {{.*#+}} ymm3 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
; AVX512-NEXT:    vfmadd231ps {{.*#+}} ymm2 = (ymm1 * ymm0) + ymm2
; AVX512-NEXT:    vmulps %ymm3, %ymm1, %ymm0
; AVX512-NEXT:    vmulps %ymm2, %ymm0, %ymm0
; AVX512-NEXT:    retq
  %sqrt = tail call <8 x float> @llvm.sqrt.v8f32(<8 x float> %x)
  %div = fdiv fast <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %sqrt
  ret <8 x float> %div
}

define <16 x float> @v16f32_no_estimate(<16 x float> %x) #0 {
; SSE-LABEL: v16f32_no_estimate:
; SSE:       # %bb.0:
; SSE-NEXT:    sqrtps %xmm3, %xmm4
; SSE-NEXT:    sqrtps %xmm2, %xmm5
; SSE-NEXT:    sqrtps %xmm1, %xmm2
; SSE-NEXT:    sqrtps %xmm0, %xmm1
; SSE-NEXT:    movaps {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; SSE-NEXT:    movaps %xmm3, %xmm0
; SSE-NEXT:    divps %xmm1, %xmm0
; SSE-NEXT:    movaps %xmm3, %xmm1
; SSE-NEXT:    divps %xmm2, %xmm1
; SSE-NEXT:    movaps %xmm3, %xmm2
; SSE-NEXT:    divps %xmm5, %xmm2
; SSE-NEXT:    divps %xmm4, %xmm3
; SSE-NEXT:    retq
;
; AVX1-LABEL: v16f32_no_estimate:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vsqrtps %ymm1, %ymm1
; AVX1-NEXT:    vsqrtps %ymm0, %ymm0
; AVX1-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; AVX1-NEXT:    vdivps %ymm0, %ymm2, %ymm0
; AVX1-NEXT:    vdivps %ymm1, %ymm2, %ymm1
; AVX1-NEXT:    retq
;
; AVX512-LABEL: v16f32_no_estimate:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vsqrtps %zmm0, %zmm0
; AVX512-NEXT:    vbroadcastss {{.*#+}} zmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; AVX512-NEXT:    vdivps %zmm0, %zmm1, %zmm0
; AVX512-NEXT:    retq
  %sqrt = tail call <16 x float> @llvm.sqrt.v16f32(<16 x float> %x)
  %div = fdiv fast <16 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %sqrt
  ret <16 x float> %div
}

define <16 x float> @v16f32_estimate(<16 x float> %x) #1 {
; SSE-LABEL: v16f32_estimate:
; SSE:       # %bb.0:
; SSE-NEXT:    rsqrtps %xmm0, %xmm5
; SSE-NEXT:    movaps {{.*#+}} xmm4 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
; SSE-NEXT:    mulps %xmm5, %xmm0
; SSE-NEXT:    mulps %xmm5, %xmm0
; SSE-NEXT:    movaps %xmm5, %xmm6
; SSE-NEXT:    mulps %xmm4, %xmm6
; SSE-NEXT:    movaps {{.*#+}} xmm5 = [-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0]
; SSE-NEXT:    addps %xmm5, %xmm0
; SSE-NEXT:    mulps %xmm6, %xmm0
; SSE-NEXT:    rsqrtps %xmm1, %xmm6
; SSE-NEXT:    mulps %xmm6, %xmm1
; SSE-NEXT:    mulps %xmm6, %xmm1
; SSE-NEXT:    mulps %xmm4, %xmm6
; SSE-NEXT:    addps %xmm5, %xmm1
; SSE-NEXT:    mulps %xmm6, %xmm1
; SSE-NEXT:    rsqrtps %xmm2, %xmm6
; SSE-NEXT:    mulps %xmm6, %xmm2
; SSE-NEXT:    mulps %xmm6, %xmm2
; SSE-NEXT:    mulps %xmm4, %xmm6
; SSE-NEXT:    addps %xmm5, %xmm2
; SSE-NEXT:    mulps %xmm6, %xmm2
; SSE-NEXT:    rsqrtps %xmm3, %xmm6
; SSE-NEXT:    mulps %xmm6, %xmm4
; SSE-NEXT:    mulps %xmm6, %xmm3
; SSE-NEXT:    mulps %xmm6, %xmm3
; SSE-NEXT:    addps %xmm5, %xmm3
; SSE-NEXT:    mulps %xmm4, %xmm3
; SSE-NEXT:    retq
;
; AVX1-LABEL: v16f32_estimate:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vrsqrtps %ymm0, %ymm2
; AVX1-NEXT:    vbroadcastss {{.*#+}} ymm3 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
; AVX1-NEXT:    vmulps %ymm3, %ymm2, %ymm4
; AVX1-NEXT:    vmulps %ymm2, %ymm0, %ymm0
; AVX1-NEXT:    vmulps %ymm2, %ymm0, %ymm0
; AVX1-NEXT:    vbroadcastss {{.*#+}} ymm2 = [-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0]
; AVX1-NEXT:    vaddps %ymm2, %ymm0, %ymm0
; AVX1-NEXT:    vrsqrtps %ymm1, %ymm5
; AVX1-NEXT:    vmulps %ymm0, %ymm4, %ymm0
; AVX1-NEXT:    vmulps %ymm3, %ymm5, %ymm3
; AVX1-NEXT:    vmulps %ymm5, %ymm1, %ymm1
; AVX1-NEXT:    vmulps %ymm5, %ymm1, %ymm1
; AVX1-NEXT:    vaddps %ymm2, %ymm1, %ymm1
; AVX1-NEXT:    vmulps %ymm1, %ymm3, %ymm1
; AVX1-NEXT:    retq
;
; AVX512-LABEL: v16f32_estimate:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vrsqrt14ps %zmm0, %zmm1
; AVX512-NEXT:    vmulps %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vfmadd213ps {{.*#+}} zmm0 = (zmm1 * zmm0) + mem
; AVX512-NEXT:    vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1
; AVX512-NEXT:    vmulps %zmm0, %zmm1, %zmm0
; AVX512-NEXT:    retq
  %sqrt = tail call <16 x float> @llvm.sqrt.v16f32(<16 x float> %x)
  %div = fdiv fast <16 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %sqrt
  ret <16 x float> %div
}

; x / (fabs(y) * sqrt(z)) --> x * rsqrt(y*y*z)

define float @div_sqrt_fabs_f32(float %x, float %y, float %z) {
; SSE-LABEL: div_sqrt_fabs_f32:
; SSE:       # %bb.0:
; SSE-NEXT:    mulss %xmm1, %xmm1
; SSE-NEXT:    mulss %xmm2, %xmm1
; SSE-NEXT:    xorps %xmm2, %xmm2
; SSE-NEXT:    rsqrtss %xmm1, %xmm2
; SSE-NEXT:    mulss %xmm2, %xmm1
; SSE-NEXT:    mulss %xmm2, %xmm1
; SSE-NEXT:    addss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE-NEXT:    mulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE-NEXT:    mulss %xmm2, %xmm0
; SSE-NEXT:    mulss %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: div_sqrt_fabs_f32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmulss %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vmulss %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vrsqrtss %xmm1, %xmm1, %xmm2
; AVX1-NEXT:    vmulss %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vmulss %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vaddss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX1-NEXT:    vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX1-NEXT:    vmulss %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vmulss %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX512-LABEL: div_sqrt_fabs_f32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmulss %xmm1, %xmm1, %xmm1
; AVX512-NEXT:    vmulss %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vrsqrtss %xmm1, %xmm1, %xmm2
; AVX512-NEXT:    vmulss %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vfmadd213ss {{.*#+}} xmm1 = (xmm2 * xmm1) + mem
; AVX512-NEXT:    vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX512-NEXT:    vmulss %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vmulss %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %s = call fast float @llvm.sqrt.f32(float %z)
  %a = call fast float @llvm.fabs.f32(float %y)
  %m = fmul fast float %s, %a
  %d = fdiv fast float %x, %m
  ret float %d
}

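; The transform leans on the identity |y| = sqrt(y*y):
;
;   x / (|y| * sqrt(z)) = x / (sqrt(y*y) * sqrt(z)) = x / sqrt(y*y*z)
;                       = x * rsqrt(y*y*z)
;
; so the fabs disappears into a multiply, and a single rsqrt estimate (plus
; refinement) handles the entire divisor.
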
; x / (fabs(y) * sqrt(z)) --> x * rsqrt(y*y*z)

define <4 x float> @div_sqrt_fabs_v4f32(<4 x float> %x, <4 x float> %y, <4 x float> %z) {
; SSE-LABEL: div_sqrt_fabs_v4f32:
; SSE:       # %bb.0:
; SSE-NEXT:    mulps %xmm1, %xmm1
; SSE-NEXT:    mulps %xmm2, %xmm1
; SSE-NEXT:    rsqrtps %xmm1, %xmm2
; SSE-NEXT:    mulps %xmm2, %xmm1
; SSE-NEXT:    mulps %xmm2, %xmm1
; SSE-NEXT:    addps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE-NEXT:    mulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE-NEXT:    mulps %xmm1, %xmm2
; SSE-NEXT:    mulps %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: div_sqrt_fabs_v4f32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmulps %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vmulps %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vrsqrtps %xmm1, %xmm2
; AVX1-NEXT:    vmulps %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vmulps %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX1-NEXT:    vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX1-NEXT:    vmulps %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vmulps %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX512-LABEL: div_sqrt_fabs_v4f32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmulps %xmm1, %xmm1, %xmm1
; AVX512-NEXT:    vmulps %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vrsqrtps %xmm1, %xmm2
; AVX512-NEXT:    vmulps %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vbroadcastss {{.*#+}} xmm3 = [-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0]
; AVX512-NEXT:    vfmadd231ps {{.*#+}} xmm3 = (xmm2 * xmm1) + xmm3
; AVX512-NEXT:    vbroadcastss {{.*#+}} xmm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
; AVX512-NEXT:    vmulps %xmm1, %xmm2, %xmm1
; AVX512-NEXT:    vmulps %xmm3, %xmm1, %xmm1
; AVX512-NEXT:    vmulps %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %s = call <4 x float> @llvm.sqrt.v4f32(<4 x float> %z)
  %a = call <4 x float> @llvm.fabs.v4f32(<4 x float> %y)
  %m = fmul contract reassoc <4 x float> %a, %s
  %d = fdiv contract reassoc arcp <4 x float> %x, %m
  ret <4 x float> %d
}

; This has 'arcp' but does not have 'reassoc' FMF.
; We allow converting the sqrt to an estimate, but
; do not pull the divisor into the estimate.
; x / (fabs(y) * sqrt(z)) --> x * rsqrt(z) / fabs(y)

define <4 x float> @div_sqrt_fabs_v4f32_fmf(<4 x float> %x, <4 x float> %y, <4 x float> %z) {
; SSE-LABEL: div_sqrt_fabs_v4f32_fmf:
; SSE:       # %bb.0:
; SSE-NEXT:    rsqrtps %xmm2, %xmm3
; SSE-NEXT:    mulps %xmm3, %xmm2
; SSE-NEXT:    mulps %xmm3, %xmm2
; SSE-NEXT:    addps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE-NEXT:    mulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
; SSE-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE-NEXT:    mulps %xmm2, %xmm3
; SSE-NEXT:    divps %xmm1, %xmm3
; SSE-NEXT:    mulps %xmm3, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: div_sqrt_fabs_v4f32_fmf:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vrsqrtps %xmm2, %xmm3
; AVX1-NEXT:    vmulps %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vmulps %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX1-NEXT:    vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3
; AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX1-NEXT:    vmulps %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vdivps %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vmulps %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX512-LABEL: div_sqrt_fabs_v4f32_fmf:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vrsqrtps %xmm2, %xmm3
; AVX512-NEXT:    vbroadcastss {{.*#+}} xmm4 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
; AVX512-NEXT:    vmulps %xmm4, %xmm3, %xmm4
; AVX512-NEXT:    vmulps %xmm3, %xmm2, %xmm2
; AVX512-NEXT:    vmulps %xmm3, %xmm2, %xmm2
; AVX512-NEXT:    vbroadcastss {{.*#+}} xmm3 = [-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0]
; AVX512-NEXT:    vaddps %xmm3, %xmm2, %xmm2
; AVX512-NEXT:    vmulps %xmm2, %xmm4, %xmm2
; AVX512-NEXT:    vbroadcastss {{.*#+}} xmm3 = [NaN,NaN,NaN,NaN]
; AVX512-NEXT:    vandps %xmm3, %xmm1, %xmm1
; AVX512-NEXT:    vdivps %xmm1, %xmm2, %xmm1
; AVX512-NEXT:    vmulps %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %s = call <4 x float> @llvm.sqrt.v4f32(<4 x float> %z)
  %a = call <4 x float> @llvm.fabs.v4f32(<4 x float> %y)
  %m = fmul <4 x float> %a, %s
  %d = fdiv arcp <4 x float> %x, %m
  ret <4 x float> %d
}

; No estimates for f64, so do not convert fabs into an fmul.

define double @div_sqrt_fabs_f64(double %x, double %y, double %z) {
; SSE-LABEL: div_sqrt_fabs_f64:
; SSE:       # %bb.0:
; SSE-NEXT:    andpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE-NEXT:    sqrtsd %xmm2, %xmm2
; SSE-NEXT:    mulsd %xmm2, %xmm1
; SSE-NEXT:    divsd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: div_sqrt_fabs_f64:
; AVX:       # %bb.0:
; AVX-NEXT:    vandpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX-NEXT:    vsqrtsd %xmm2, %xmm2, %xmm2
; AVX-NEXT:    vmulsd %xmm1, %xmm2, %xmm1
; AVX-NEXT:    vdivsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %s = call fast double @llvm.sqrt.f64(double %z)
  %a = call fast double @llvm.fabs.f64(double %y)
  %m = fmul fast double %s, %a
  %d = fdiv fast double %x, %m
  ret double %d
}

; This is a special case for the general pattern above -
; if the sqrt operand is the same as the other mul op,
; then fabs may be omitted.
; x / (y * sqrt(y)) --> x * rsqrt(y*y*y)

define float @div_sqrt_f32(float %x, float %y) {
; SSE-LABEL: div_sqrt_f32:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps %xmm1, %xmm2
; SSE-NEXT:    mulss %xmm1, %xmm2
; SSE-NEXT:    mulss %xmm1, %xmm2
; SSE-NEXT:    xorps %xmm1, %xmm1
; SSE-NEXT:    rsqrtss %xmm2, %xmm1
; SSE-NEXT:    mulss %xmm1, %xmm2
; SSE-NEXT:    mulss %xmm1, %xmm2
; SSE-NEXT:    addss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE-NEXT:    mulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE-NEXT:    mulss %xmm1, %xmm0
; SSE-NEXT:    mulss %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: div_sqrt_f32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmulss %xmm1, %xmm1, %xmm2
; AVX1-NEXT:    vmulss %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vrsqrtss %xmm1, %xmm1, %xmm2
; AVX1-NEXT:    vmulss %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vmulss %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vaddss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX1-NEXT:    vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX1-NEXT:    vmulss %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vmulss %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX512-LABEL: div_sqrt_f32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmulss %xmm1, %xmm1, %xmm2
; AVX512-NEXT:    vmulss %xmm1, %xmm2, %xmm1
; AVX512-NEXT:    vrsqrtss %xmm1, %xmm1, %xmm2
; AVX512-NEXT:    vmulss %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vfmadd213ss {{.*#+}} xmm1 = (xmm2 * xmm1) + mem
; AVX512-NEXT:    vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX512-NEXT:    vmulss %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vmulss %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %s = call fast float @llvm.sqrt.f32(float %y)
  %m = fmul fast float %s, %y
  %d = fdiv fast float %x, %m
  ret float %d
}

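; When the sqrt operand is also the other multiplicand, squaring makes the
; fabs unnecessary: for the non-negative y that these fast-math flags assume,
;
;   x / (y * sqrt(y)) = x / sqrt(y*y*y) = x * rsqrt(y*y*y)
;
; and a negative y would already produce NaN from sqrt(y) anyway.
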
; This is a special case for the general pattern above -
; if the sqrt operand is the same as the other mul op,
; then fabs may be omitted.
; x / (y * sqrt(y)) --> x * rsqrt(y*y*y)

define <4 x float> @div_sqrt_v4f32(<4 x float> %x, <4 x float> %y) {
; SSE-LABEL: div_sqrt_v4f32:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps %xmm1, %xmm2
; SSE-NEXT:    mulps %xmm1, %xmm2
; SSE-NEXT:    mulps %xmm1, %xmm2
; SSE-NEXT:    rsqrtps %xmm2, %xmm1
; SSE-NEXT:    mulps %xmm1, %xmm2
; SSE-NEXT:    mulps %xmm1, %xmm2
; SSE-NEXT:    addps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE-NEXT:    mulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE-NEXT:    mulps %xmm2, %xmm1
; SSE-NEXT:    mulps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: div_sqrt_v4f32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmulps %xmm1, %xmm1, %xmm2
; AVX1-NEXT:    vmulps %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vrsqrtps %xmm1, %xmm2
; AVX1-NEXT:    vmulps %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vmulps %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX1-NEXT:    vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX1-NEXT:    vmulps %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vmulps %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX512-LABEL: div_sqrt_v4f32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmulps %xmm1, %xmm1, %xmm2
; AVX512-NEXT:    vmulps %xmm1, %xmm2, %xmm1
; AVX512-NEXT:    vrsqrtps %xmm1, %xmm2
; AVX512-NEXT:    vmulps %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vbroadcastss {{.*#+}} xmm3 = [-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0]
; AVX512-NEXT:    vfmadd231ps {{.*#+}} xmm3 = (xmm2 * xmm1) + xmm3
; AVX512-NEXT:    vbroadcastss {{.*#+}} xmm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
; AVX512-NEXT:    vmulps %xmm1, %xmm2, %xmm1
; AVX512-NEXT:    vmulps %xmm3, %xmm1, %xmm1
; AVX512-NEXT:    vmulps %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %s = call <4 x float> @llvm.sqrt.v4f32(<4 x float> %y)
  %m = fmul contract reassoc <4 x float> %y, %s
  %d = fdiv contract reassoc arcp <4 x float> %x, %m
  ret <4 x float> %d
}

define double @sqrt_fdiv_common_operand(double %x) nounwind {
; SSE-LABEL: sqrt_fdiv_common_operand:
; SSE:       # %bb.0:
; SSE-NEXT:    sqrtsd %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: sqrt_fdiv_common_operand:
; AVX:       # %bb.0:
; AVX-NEXT:    vsqrtsd %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %sqrt = call fast double @llvm.sqrt.f64(double %x)
  %r = fdiv fast double %x, %sqrt
  ret double %r
}

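; The division folds away entirely because x / sqrt(x) == sqrt(x) (for the
; non-negative x that fast-math assumes), so a single sqrtsd is all that
; remains.
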
define <2 x double> @sqrt_fdiv_common_operand_vec(<2 x double> %x) nounwind {
; SSE-LABEL: sqrt_fdiv_common_operand_vec:
; SSE:       # %bb.0:
; SSE-NEXT:    sqrtpd %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: sqrt_fdiv_common_operand_vec:
; AVX:       # %bb.0:
; AVX-NEXT:    vsqrtpd %xmm0, %xmm0
; AVX-NEXT:    retq
  %sqrt = call <2 x double> @llvm.sqrt.v2f64(<2 x double> %x)
  %r = fdiv arcp nsz reassoc <2 x double> %x, %sqrt
  ret <2 x double> %r
}

define double @sqrt_fdiv_common_operand_extra_use(double %x, ptr %p) nounwind {
; SSE-LABEL: sqrt_fdiv_common_operand_extra_use:
; SSE:       # %bb.0:
; SSE-NEXT:    sqrtsd %xmm0, %xmm0
; SSE-NEXT:    movsd %xmm0, (%rdi)
; SSE-NEXT:    retq
;
; AVX-LABEL: sqrt_fdiv_common_operand_extra_use:
; AVX:       # %bb.0:
; AVX-NEXT:    vsqrtsd %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vmovsd %xmm0, (%rdi)
; AVX-NEXT:    retq
  %sqrt = call fast double @llvm.sqrt.f64(double %x)
  store double %sqrt, ptr %p
  %r = fdiv fast double %x, %sqrt
  ret double %r
}

define double @sqrt_simplify_before_recip(double %x, ptr %p) nounwind {
; SSE-LABEL: sqrt_simplify_before_recip:
; SSE:       # %bb.0:
; SSE-NEXT:    sqrtsd %xmm0, %xmm0
; SSE-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
; SSE-NEXT:    divsd %xmm0, %xmm1
; SSE-NEXT:    movsd %xmm1, (%rdi)
; SSE-NEXT:    retq
;
; AVX-LABEL: sqrt_simplify_before_recip:
; AVX:       # %bb.0:
; AVX-NEXT:    vsqrtsd %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
; AVX-NEXT:    vdivsd %xmm0, %xmm1, %xmm1
; AVX-NEXT:    vmovsd %xmm1, (%rdi)
; AVX-NEXT:    retq
  %sqrt = tail call fast double @llvm.sqrt.f64(double %x)
  %rsqrt = fdiv fast double 1.0, %sqrt
  %sqrt_fast = fdiv fast double %x, %sqrt
  store double %rsqrt, ptr %p, align 8
  ret double %sqrt_fast
}

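; Both users of %sqrt survive on one sqrtsd result: x/sqrt(x) simplifies to
; sqrt(x) itself (the return value), while 1/sqrt(x) still needs the divide
; whose result is stored through %p.
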
define <2 x double> @sqrt_simplify_before_recip_vec(<2 x double> %x, ptr %p) nounwind {
; SSE-LABEL: sqrt_simplify_before_recip_vec:
; SSE:       # %bb.0:
; SSE-NEXT:    sqrtpd %xmm0, %xmm0
; SSE-NEXT:    movapd {{.*#+}} xmm1 = [1.0E+0,1.0E+0]
; SSE-NEXT:    divpd %xmm0, %xmm1
; SSE-NEXT:    movupd %xmm1, (%rdi)
; SSE-NEXT:    retq
;
; AVX-LABEL: sqrt_simplify_before_recip_vec:
; AVX:       # %bb.0:
; AVX-NEXT:    vsqrtpd %xmm0, %xmm0
; AVX-NEXT:    vmovddup {{.*#+}} xmm1 = [1.0E+0,1.0E+0]
; AVX-NEXT:    # xmm1 = mem[0,0]
; AVX-NEXT:    vdivpd %xmm0, %xmm1, %xmm1
; AVX-NEXT:    vmovupd %xmm1, (%rdi)
; AVX-NEXT:    retq
  %sqrt = tail call fast <2 x double> @llvm.sqrt.v2f64(<2 x double> %x)
  %rsqrt = fdiv fast <2 x double> <double 1.0, double 1.0>, %sqrt
  %sqrt_fast = fdiv fast <2 x double> %x, %sqrt
  store <2 x double> %rsqrt, ptr %p, align 8
  ret <2 x double> %sqrt_fast
}

define double @sqrt_simplify_before_recip_order(double %x, ptr %p) nounwind {
; SSE-LABEL: sqrt_simplify_before_recip_order:
; SSE:       # %bb.0:
; SSE-NEXT:    sqrtsd %xmm0, %xmm0
; SSE-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
; SSE-NEXT:    divsd %xmm0, %xmm1
; SSE-NEXT:    movsd %xmm1, (%rdi)
; SSE-NEXT:    retq
;
; AVX-LABEL: sqrt_simplify_before_recip_order:
; AVX:       # %bb.0:
; AVX-NEXT:    vsqrtsd %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
; AVX-NEXT:    vdivsd %xmm0, %xmm1, %xmm1
; AVX-NEXT:    vmovsd %xmm1, (%rdi)
; AVX-NEXT:    retq
  %sqrt = tail call fast double @llvm.sqrt.f64(double %x)
  %sqrt_fast = fdiv fast double %x, %sqrt
  %rsqrt = fdiv fast double 42.0, %sqrt
  store double %rsqrt, ptr %p, align 8
  ret double %sqrt_fast
}

attributes #0 = { "unsafe-fp-math"="true" "reciprocal-estimates"="!sqrtf,!vec-sqrtf,!divf,!vec-divf" }
attributes #1 = { "unsafe-fp-math"="true" "reciprocal-estimates"="sqrt,vec-sqrt" }
attributes #2 = { nounwind readnone }
attributes #3 = { "unsafe-fp-math"="true" "reciprocal-estimates"="sqrt,vec-sqrt" "denormal-fp-math"="preserve-sign,ieee" }
attributes #4 = { "unsafe-fp-math"="true" "reciprocal-estimates"="sqrt,vec-sqrt" "denormal-fp-math"="ieee,preserve-sign" }
attributes #5 = { "unsafe-fp-math"="true" "reciprocal-estimates"="all:0" }
attributes #6 = { "unsafe-fp-math"="true" "reciprocal-estimates"="sqrt,vec-sqrt" "denormal-fp-math"="preserve-sign,dynamic" }
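
; Notes on the attribute strings used above:
;   "reciprocal-estimates" names the operations that may use hardware estimate
;   instructions; a '!' prefix disables an entry, and a ":N" suffix sets the
;   number of Newton-Raphson refinement steps ("all:0" = every estimate kind,
;   zero steps).
;   "denormal-fp-math" is "<output mode>,<input mode>": "preserve-sign,ieee"
;   flushes denormal results but honors denormal inputs, while
;   "ieee,preserve-sign" treats denormal inputs as zero (DAZ).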