; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-- -mcpu=nehalem | FileCheck %s --check-prefixes=NHM
; RUN: llc < %s -mtriple=x86_64-- -mcpu=sandybridge | FileCheck %s --check-prefixes=FAST-SCALAR,SNB
; RUN: llc < %s -mtriple=x86_64-- -mcpu=broadwell | FileCheck %s --check-prefixes=FAST-SCALAR,BDW
; RUN: llc < %s -mtriple=x86_64-- -mcpu=skylake | FileCheck %s --check-prefixes=FAST-SCALAR,FAST-VECTOR
; RUN: llc < %s -mtriple=x86_64-- -mcpu=znver1 | FileCheck %s --check-prefixes=FAST-SCALAR,FAST-VECTOR
; RUN: llc < %s -mtriple=x86_64-- -mcpu=znver3 | FileCheck %s --check-prefixes=FAST-SCALAR,FAST-VECTOR
; RUN: llc < %s -mtriple=x86_64-- -mcpu=znver4 | FileCheck %s --check-prefixes=FAST-SCALAR,FAST-VECTOR
; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64 | FileCheck %s --check-prefixes=X86-64
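
; Fast-math sqrt calls, with and without denormal inputs flushed to zero.
; Targets tuned for fast square-root hardware (FAST-SCALAR / FAST-VECTOR)
; are expected to select plain (v)sqrtss / (v)sqrtps; the remaining targets
; expand to a reciprocal-square-root estimate plus one Newton-Raphson
; refinement step, roughly:
;   r = rsqrt(x);  sqrt(x) ~= (x * r) * (-0.5 * ((x * r) * r - 3.0))
; followed by a mask that zeroes the result where the estimate would be
; wrong: x == 0 and, when denormal inputs are not flushed, |x| below the
; smallest normal float (1.17549435E-38).
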
define float @f32_no_daz(float %f) #0 {
; NHM-LABEL: f32_no_daz:
; NHM:       # %bb.0:
; NHM-NEXT:    rsqrtss %xmm0, %xmm1
; NHM-NEXT:    movaps %xmm0, %xmm2
; NHM-NEXT:    mulss %xmm1, %xmm2
; NHM-NEXT:    movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
; NHM-NEXT:    mulss %xmm2, %xmm3
; NHM-NEXT:    mulss %xmm1, %xmm2
; NHM-NEXT:    addss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; NHM-NEXT:    mulss %xmm3, %xmm2
; NHM-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; NHM-NEXT:    cmpltss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; NHM-NEXT:    andnps %xmm2, %xmm0
; NHM-NEXT:    retq
;
; FAST-SCALAR-LABEL: f32_no_daz:
; FAST-SCALAR:       # %bb.0:
; FAST-SCALAR-NEXT:    vsqrtss %xmm0, %xmm0, %xmm0
; FAST-SCALAR-NEXT:    retq
;
; X86-64-LABEL: f32_no_daz:
; X86-64:       # %bb.0:
; X86-64-NEXT:    sqrtss %xmm0, %xmm0
; X86-64-NEXT:    retq
  %call = tail call fast float @llvm.sqrt.f32(float %f) #2
  ret float %call
}
define <4 x float> @v4f32_no_daz(<4 x float> %f) #0 {
; NHM-LABEL: v4f32_no_daz:
; NHM:       # %bb.0:
; NHM-NEXT:    rsqrtps %xmm0, %xmm1
; NHM-NEXT:    movaps %xmm0, %xmm2
; NHM-NEXT:    mulps %xmm1, %xmm2
; NHM-NEXT:    movaps {{.*#+}} xmm3 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
; NHM-NEXT:    mulps %xmm2, %xmm3
; NHM-NEXT:    mulps %xmm1, %xmm2
; NHM-NEXT:    addps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; NHM-NEXT:    mulps %xmm3, %xmm2
; NHM-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; NHM-NEXT:    movaps {{.*#+}} xmm1 = [1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38]
; NHM-NEXT:    cmpleps %xmm0, %xmm1
; NHM-NEXT:    andps %xmm2, %xmm1
; NHM-NEXT:    movaps %xmm1, %xmm0
; NHM-NEXT:    retq
;
; SNB-LABEL: v4f32_no_daz:
; SNB:       # %bb.0:
; SNB-NEXT:    vrsqrtps %xmm0, %xmm1
; SNB-NEXT:    vmulps %xmm1, %xmm0, %xmm2
; SNB-NEXT:    vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm3
; SNB-NEXT:    vmulps %xmm1, %xmm2, %xmm1
; SNB-NEXT:    vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; SNB-NEXT:    vmulps %xmm1, %xmm3, %xmm1
; SNB-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; SNB-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38]
; SNB-NEXT:    vcmpleps %xmm0, %xmm2, %xmm0
; SNB-NEXT:    vandps %xmm1, %xmm0, %xmm0
; SNB-NEXT:    retq
;
; BDW-LABEL: v4f32_no_daz:
; BDW:       # %bb.0:
; BDW-NEXT:    vrsqrtps %xmm0, %xmm1
; BDW-NEXT:    vmulps %xmm1, %xmm0, %xmm2
; BDW-NEXT:    vbroadcastss {{.*#+}} xmm3 = [-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0]
; BDW-NEXT:    vfmadd231ps {{.*#+}} xmm3 = (xmm2 * xmm1) + xmm3
; BDW-NEXT:    vbroadcastss {{.*#+}} xmm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
; BDW-NEXT:    vmulps %xmm1, %xmm2, %xmm1
; BDW-NEXT:    vmulps %xmm3, %xmm1, %xmm1
; BDW-NEXT:    vbroadcastss {{.*#+}} xmm2 = [NaN,NaN,NaN,NaN]
; BDW-NEXT:    vandps %xmm2, %xmm0, %xmm0
; BDW-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38]
; BDW-NEXT:    vcmpleps %xmm0, %xmm2, %xmm0
; BDW-NEXT:    vandps %xmm1, %xmm0, %xmm0
; BDW-NEXT:    retq
;
; FAST-VECTOR-LABEL: v4f32_no_daz:
; FAST-VECTOR:       # %bb.0:
; FAST-VECTOR-NEXT:    vsqrtps %xmm0, %xmm0
; FAST-VECTOR-NEXT:    retq
;
; X86-64-LABEL: v4f32_no_daz:
; X86-64:       # %bb.0:
; X86-64-NEXT:    rsqrtps %xmm0, %xmm1
; X86-64-NEXT:    movaps %xmm0, %xmm2
; X86-64-NEXT:    mulps %xmm1, %xmm2
; X86-64-NEXT:    movaps {{.*#+}} xmm3 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
; X86-64-NEXT:    mulps %xmm2, %xmm3
; X86-64-NEXT:    mulps %xmm1, %xmm2
; X86-64-NEXT:    addps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; X86-64-NEXT:    mulps %xmm3, %xmm2
; X86-64-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; X86-64-NEXT:    movaps {{.*#+}} xmm1 = [1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38]
; X86-64-NEXT:    cmpleps %xmm0, %xmm1
; X86-64-NEXT:    andps %xmm2, %xmm1
; X86-64-NEXT:    movaps %xmm1, %xmm0
; X86-64-NEXT:    retq
  %call = tail call fast <4 x float> @llvm.sqrt.v4f32(<4 x float> %f) #2
  ret <4 x float> %call
}
define <8 x float> @v8f32_no_daz(<8 x float> %f) #0 {
; NHM-LABEL: v8f32_no_daz:
; NHM:       # %bb.0:
; NHM-NEXT:    rsqrtps %xmm0, %xmm2
; NHM-NEXT:    movaps %xmm0, %xmm4
; NHM-NEXT:    mulps %xmm2, %xmm4
; NHM-NEXT:    movaps {{.*#+}} xmm5 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
; NHM-NEXT:    movaps %xmm4, %xmm3
; NHM-NEXT:    mulps %xmm5, %xmm3
; NHM-NEXT:    mulps %xmm2, %xmm4
; NHM-NEXT:    movaps {{.*#+}} xmm6 = [-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0]
; NHM-NEXT:    addps %xmm6, %xmm4
; NHM-NEXT:    mulps %xmm3, %xmm4
; NHM-NEXT:    movaps {{.*#+}} xmm7 = [NaN,NaN,NaN,NaN]
; NHM-NEXT:    andps %xmm7, %xmm0
; NHM-NEXT:    movaps {{.*#+}} xmm2 = [1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38]
; NHM-NEXT:    movaps %xmm2, %xmm3
; NHM-NEXT:    cmpleps %xmm0, %xmm3
; NHM-NEXT:    andps %xmm4, %xmm3
; NHM-NEXT:    rsqrtps %xmm1, %xmm0
; NHM-NEXT:    movaps %xmm1, %xmm4
; NHM-NEXT:    mulps %xmm0, %xmm4
; NHM-NEXT:    mulps %xmm4, %xmm5
; NHM-NEXT:    mulps %xmm0, %xmm4
; NHM-NEXT:    addps %xmm6, %xmm4
; NHM-NEXT:    mulps %xmm5, %xmm4
; NHM-NEXT:    andps %xmm7, %xmm1
; NHM-NEXT:    cmpleps %xmm1, %xmm2
; NHM-NEXT:    andps %xmm4, %xmm2
; NHM-NEXT:    movaps %xmm3, %xmm0
; NHM-NEXT:    movaps %xmm2, %xmm1
; NHM-NEXT:    retq
;
; SNB-LABEL: v8f32_no_daz:
; SNB:       # %bb.0:
; SNB-NEXT:    vrsqrtps %ymm0, %ymm1
; SNB-NEXT:    vmulps %ymm1, %ymm0, %ymm2
; SNB-NEXT:    vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm3
; SNB-NEXT:    vmulps %ymm1, %ymm2, %ymm1
; SNB-NEXT:    vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; SNB-NEXT:    vmulps %ymm1, %ymm3, %ymm1
; SNB-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; SNB-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38]
; SNB-NEXT:    vcmpleps %ymm0, %ymm2, %ymm0
; SNB-NEXT:    vandps %ymm1, %ymm0, %ymm0
; SNB-NEXT:    retq
;
; BDW-LABEL: v8f32_no_daz:
; BDW:       # %bb.0:
; BDW-NEXT:    vrsqrtps %ymm0, %ymm1
; BDW-NEXT:    vmulps %ymm1, %ymm0, %ymm2
; BDW-NEXT:    vbroadcastss {{.*#+}} ymm3 = [-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0]
; BDW-NEXT:    vfmadd231ps {{.*#+}} ymm3 = (ymm2 * ymm1) + ymm3
; BDW-NEXT:    vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
; BDW-NEXT:    vmulps %ymm1, %ymm2, %ymm1
; BDW-NEXT:    vmulps %ymm3, %ymm1, %ymm1
; BDW-NEXT:    vbroadcastss {{.*#+}} ymm2 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN]
; BDW-NEXT:    vandps %ymm2, %ymm0, %ymm0
; BDW-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38]
; BDW-NEXT:    vcmpleps %ymm0, %ymm2, %ymm0
; BDW-NEXT:    vandps %ymm1, %ymm0, %ymm0
; BDW-NEXT:    retq
;
; FAST-VECTOR-LABEL: v8f32_no_daz:
; FAST-VECTOR:       # %bb.0:
; FAST-VECTOR-NEXT:    vsqrtps %ymm0, %ymm0
; FAST-VECTOR-NEXT:    retq
;
; X86-64-LABEL: v8f32_no_daz:
; X86-64:       # %bb.0:
; X86-64-NEXT:    rsqrtps %xmm0, %xmm2
; X86-64-NEXT:    movaps %xmm0, %xmm4
; X86-64-NEXT:    mulps %xmm2, %xmm4
; X86-64-NEXT:    movaps {{.*#+}} xmm5 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
; X86-64-NEXT:    movaps %xmm4, %xmm3
; X86-64-NEXT:    mulps %xmm5, %xmm3
; X86-64-NEXT:    mulps %xmm2, %xmm4
; X86-64-NEXT:    movaps {{.*#+}} xmm6 = [-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0]
; X86-64-NEXT:    addps %xmm6, %xmm4
; X86-64-NEXT:    mulps %xmm3, %xmm4
; X86-64-NEXT:    movaps {{.*#+}} xmm7 = [NaN,NaN,NaN,NaN]
; X86-64-NEXT:    andps %xmm7, %xmm0
; X86-64-NEXT:    movaps {{.*#+}} xmm2 = [1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38]
; X86-64-NEXT:    movaps %xmm2, %xmm3
; X86-64-NEXT:    cmpleps %xmm0, %xmm3
; X86-64-NEXT:    andps %xmm4, %xmm3
; X86-64-NEXT:    rsqrtps %xmm1, %xmm0
; X86-64-NEXT:    movaps %xmm1, %xmm4
; X86-64-NEXT:    mulps %xmm0, %xmm4
; X86-64-NEXT:    mulps %xmm4, %xmm5
; X86-64-NEXT:    mulps %xmm0, %xmm4
; X86-64-NEXT:    addps %xmm6, %xmm4
; X86-64-NEXT:    mulps %xmm5, %xmm4
; X86-64-NEXT:    andps %xmm7, %xmm1
; X86-64-NEXT:    cmpleps %xmm1, %xmm2
; X86-64-NEXT:    andps %xmm4, %xmm2
; X86-64-NEXT:    movaps %xmm3, %xmm0
; X86-64-NEXT:    movaps %xmm2, %xmm1
; X86-64-NEXT:    retq
  %call = tail call fast <8 x float> @llvm.sqrt.v8f32(<8 x float> %f) #2
  ret <8 x float> %call
}

; Repeat all tests with denorms-as-zero enabled.
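; With denormal inputs treated as zero, the guard against tiny inputs is
; expected to simplify: instead of comparing |x| against the smallest normal
; float, the expansions below only special-case an input of zero
; (cmpeqss / cmpneqps against zero).
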
define float @f32_daz(float %f) #1 {
; NHM-LABEL: f32_daz:
; NHM:       # %bb.0:
; NHM-NEXT:    rsqrtss %xmm0, %xmm1
; NHM-NEXT:    movaps %xmm0, %xmm2
; NHM-NEXT:    mulss %xmm1, %xmm2
; NHM-NEXT:    movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
; NHM-NEXT:    mulss %xmm2, %xmm3
; NHM-NEXT:    mulss %xmm1, %xmm2
; NHM-NEXT:    addss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; NHM-NEXT:    mulss %xmm3, %xmm2
; NHM-NEXT:    xorps %xmm1, %xmm1
; NHM-NEXT:    cmpeqss %xmm1, %xmm0
; NHM-NEXT:    andnps %xmm2, %xmm0
; NHM-NEXT:    retq
;
; FAST-SCALAR-LABEL: f32_daz:
; FAST-SCALAR:       # %bb.0:
; FAST-SCALAR-NEXT:    vsqrtss %xmm0, %xmm0, %xmm0
; FAST-SCALAR-NEXT:    retq
;
; X86-64-LABEL: f32_daz:
; X86-64:       # %bb.0:
; X86-64-NEXT:    sqrtss %xmm0, %xmm0
; X86-64-NEXT:    retq
  %call = tail call fast float @llvm.sqrt.f32(float %f) #2
  ret float %call
}
define <4 x float> @v4f32_daz(<4 x float> %f) #1 {
; NHM-LABEL: v4f32_daz:
; NHM:       # %bb.0:
; NHM-NEXT:    rsqrtps %xmm0, %xmm1
; NHM-NEXT:    movaps %xmm0, %xmm2
; NHM-NEXT:    mulps %xmm1, %xmm2
; NHM-NEXT:    movaps {{.*#+}} xmm3 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
; NHM-NEXT:    mulps %xmm2, %xmm3
; NHM-NEXT:    mulps %xmm1, %xmm2
; NHM-NEXT:    addps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; NHM-NEXT:    mulps %xmm3, %xmm2
; NHM-NEXT:    xorps %xmm1, %xmm1
; NHM-NEXT:    cmpneqps %xmm1, %xmm0
; NHM-NEXT:    andps %xmm2, %xmm0
; NHM-NEXT:    retq
;
; SNB-LABEL: v4f32_daz:
; SNB:       # %bb.0:
; SNB-NEXT:    vrsqrtps %xmm0, %xmm1
; SNB-NEXT:    vmulps %xmm1, %xmm0, %xmm2
; SNB-NEXT:    vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm3
; SNB-NEXT:    vmulps %xmm1, %xmm2, %xmm1
; SNB-NEXT:    vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; SNB-NEXT:    vmulps %xmm1, %xmm3, %xmm1
; SNB-NEXT:    vxorps %xmm2, %xmm2, %xmm2
; SNB-NEXT:    vcmpneqps %xmm2, %xmm0, %xmm0
; SNB-NEXT:    vandps %xmm1, %xmm0, %xmm0
; SNB-NEXT:    retq
;
; BDW-LABEL: v4f32_daz:
; BDW:       # %bb.0:
; BDW-NEXT:    vrsqrtps %xmm0, %xmm1
; BDW-NEXT:    vmulps %xmm1, %xmm0, %xmm2
; BDW-NEXT:    vbroadcastss {{.*#+}} xmm3 = [-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0]
; BDW-NEXT:    vfmadd231ps {{.*#+}} xmm3 = (xmm2 * xmm1) + xmm3
; BDW-NEXT:    vbroadcastss {{.*#+}} xmm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
; BDW-NEXT:    vmulps %xmm1, %xmm2, %xmm1
; BDW-NEXT:    vmulps %xmm3, %xmm1, %xmm1
; BDW-NEXT:    vxorps %xmm2, %xmm2, %xmm2
; BDW-NEXT:    vcmpneqps %xmm2, %xmm0, %xmm0
; BDW-NEXT:    vandps %xmm1, %xmm0, %xmm0
; BDW-NEXT:    retq
;
; FAST-VECTOR-LABEL: v4f32_daz:
; FAST-VECTOR:       # %bb.0:
; FAST-VECTOR-NEXT:    vsqrtps %xmm0, %xmm0
; FAST-VECTOR-NEXT:    retq
;
; X86-64-LABEL: v4f32_daz:
; X86-64:       # %bb.0:
; X86-64-NEXT:    rsqrtps %xmm0, %xmm1
; X86-64-NEXT:    movaps %xmm0, %xmm2
; X86-64-NEXT:    mulps %xmm1, %xmm2
; X86-64-NEXT:    movaps {{.*#+}} xmm3 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
; X86-64-NEXT:    mulps %xmm2, %xmm3
; X86-64-NEXT:    mulps %xmm1, %xmm2
; X86-64-NEXT:    addps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; X86-64-NEXT:    mulps %xmm3, %xmm2
; X86-64-NEXT:    xorps %xmm1, %xmm1
; X86-64-NEXT:    cmpneqps %xmm1, %xmm0
; X86-64-NEXT:    andps %xmm2, %xmm0
; X86-64-NEXT:    retq
  %call = tail call fast <4 x float> @llvm.sqrt.v4f32(<4 x float> %f) #2
  ret <4 x float> %call
}
define <8 x float> @v8f32_daz(<8 x float> %f) #1 {
; NHM-LABEL: v8f32_daz:
; NHM:       # %bb.0:
; NHM-NEXT:    rsqrtps %xmm0, %xmm2
; NHM-NEXT:    movaps %xmm0, %xmm3
; NHM-NEXT:    mulps %xmm2, %xmm3
; NHM-NEXT:    movaps {{.*#+}} xmm4 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
; NHM-NEXT:    movaps %xmm3, %xmm5
; NHM-NEXT:    mulps %xmm4, %xmm5
; NHM-NEXT:    mulps %xmm2, %xmm3
; NHM-NEXT:    movaps {{.*#+}} xmm2 = [-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0]
; NHM-NEXT:    addps %xmm2, %xmm3
; NHM-NEXT:    mulps %xmm5, %xmm3
; NHM-NEXT:    xorps %xmm5, %xmm5
; NHM-NEXT:    cmpneqps %xmm5, %xmm0
; NHM-NEXT:    andps %xmm3, %xmm0
; NHM-NEXT:    rsqrtps %xmm1, %xmm3
; NHM-NEXT:    movaps %xmm1, %xmm6
; NHM-NEXT:    mulps %xmm3, %xmm6
; NHM-NEXT:    mulps %xmm6, %xmm4
; NHM-NEXT:    mulps %xmm3, %xmm6
; NHM-NEXT:    addps %xmm2, %xmm6
; NHM-NEXT:    mulps %xmm4, %xmm6
; NHM-NEXT:    cmpneqps %xmm5, %xmm1
; NHM-NEXT:    andps %xmm6, %xmm1
; NHM-NEXT:    retq
;
; SNB-LABEL: v8f32_daz:
; SNB:       # %bb.0:
; SNB-NEXT:    vrsqrtps %ymm0, %ymm1
; SNB-NEXT:    vmulps %ymm1, %ymm0, %ymm2
; SNB-NEXT:    vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm3
; SNB-NEXT:    vmulps %ymm1, %ymm2, %ymm1
; SNB-NEXT:    vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; SNB-NEXT:    vmulps %ymm1, %ymm3, %ymm1
; SNB-NEXT:    vxorps %xmm2, %xmm2, %xmm2
; SNB-NEXT:    vcmpneqps %ymm2, %ymm0, %ymm0
; SNB-NEXT:    vandps %ymm1, %ymm0, %ymm0
; SNB-NEXT:    retq
;
; BDW-LABEL: v8f32_daz:
; BDW:       # %bb.0:
; BDW-NEXT:    vrsqrtps %ymm0, %ymm1
; BDW-NEXT:    vmulps %ymm1, %ymm0, %ymm2
; BDW-NEXT:    vbroadcastss {{.*#+}} ymm3 = [-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0]
; BDW-NEXT:    vfmadd231ps {{.*#+}} ymm3 = (ymm2 * ymm1) + ymm3
; BDW-NEXT:    vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
; BDW-NEXT:    vmulps %ymm1, %ymm2, %ymm1
; BDW-NEXT:    vmulps %ymm3, %ymm1, %ymm1
; BDW-NEXT:    vxorps %xmm2, %xmm2, %xmm2
; BDW-NEXT:    vcmpneqps %ymm2, %ymm0, %ymm0
; BDW-NEXT:    vandps %ymm1, %ymm0, %ymm0
; BDW-NEXT:    retq
;
; FAST-VECTOR-LABEL: v8f32_daz:
; FAST-VECTOR:       # %bb.0:
; FAST-VECTOR-NEXT:    vsqrtps %ymm0, %ymm0
; FAST-VECTOR-NEXT:    retq
;
; X86-64-LABEL: v8f32_daz:
; X86-64:       # %bb.0:
; X86-64-NEXT:    rsqrtps %xmm0, %xmm2
; X86-64-NEXT:    movaps %xmm0, %xmm3
; X86-64-NEXT:    mulps %xmm2, %xmm3
; X86-64-NEXT:    movaps {{.*#+}} xmm4 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
; X86-64-NEXT:    movaps %xmm3, %xmm5
; X86-64-NEXT:    mulps %xmm4, %xmm5
; X86-64-NEXT:    mulps %xmm2, %xmm3
; X86-64-NEXT:    movaps {{.*#+}} xmm2 = [-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0]
; X86-64-NEXT:    addps %xmm2, %xmm3
; X86-64-NEXT:    mulps %xmm5, %xmm3
; X86-64-NEXT:    xorps %xmm5, %xmm5
; X86-64-NEXT:    cmpneqps %xmm5, %xmm0
; X86-64-NEXT:    andps %xmm3, %xmm0
; X86-64-NEXT:    rsqrtps %xmm1, %xmm3
; X86-64-NEXT:    movaps %xmm1, %xmm6
; X86-64-NEXT:    mulps %xmm3, %xmm6
; X86-64-NEXT:    mulps %xmm6, %xmm4
; X86-64-NEXT:    mulps %xmm3, %xmm6
; X86-64-NEXT:    addps %xmm2, %xmm6
; X86-64-NEXT:    mulps %xmm4, %xmm6
; X86-64-NEXT:    cmpneqps %xmm5, %xmm1
; X86-64-NEXT:    andps %xmm6, %xmm1
; X86-64-NEXT:    retq
  %call = tail call fast <8 x float> @llvm.sqrt.v8f32(<8 x float> %f) #2
  ret <8 x float> %call
}

declare float @llvm.sqrt.f32(float) #2
declare <4 x float> @llvm.sqrt.v4f32(<4 x float>) #2
declare <8 x float> @llvm.sqrt.v8f32(<8 x float>) #2

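; "denormal-fp-math" is "<output mode>,<input mode>"; the "preserve-sign"
; input mode in #1 is what lets the _daz functions above assume that
; denormal inputs read as zero.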
attributes #0 = { "denormal-fp-math"="ieee,ieee" }
attributes #1 = { "denormal-fp-math"="ieee,preserve-sign" }
attributes #2 = { nounwind readnone }