; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 -O3 | FileCheck %s --check-prefixes=SSE,SSE-X86
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 -O3 | FileCheck %s --check-prefixes=SSE,SSE-X64
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+fma -O3 | FileCheck %s --check-prefixes=AVX
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+fma -O3 | FileCheck %s --check-prefixes=AVX
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512f -mattr=+avx512vl -O3 | FileCheck %s --check-prefixes=AVX
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f -mattr=+avx512vl -O3 | FileCheck %s --check-prefixes=AVX

declare <2 x double> @llvm.experimental.constrained.fadd.v2f64(<2 x double>, <2 x double>, metadata, metadata)
declare <4 x float> @llvm.experimental.constrained.fadd.v4f32(<4 x float>, <4 x float>, metadata, metadata)
declare <2 x double> @llvm.experimental.constrained.fsub.v2f64(<2 x double>, <2 x double>, metadata, metadata)
declare <4 x float> @llvm.experimental.constrained.fsub.v4f32(<4 x float>, <4 x float>, metadata, metadata)
declare <2 x double> @llvm.experimental.constrained.fmul.v2f64(<2 x double>, <2 x double>, metadata, metadata)
declare <4 x float> @llvm.experimental.constrained.fmul.v4f32(<4 x float>, <4 x float>, metadata, metadata)
declare <2 x double> @llvm.experimental.constrained.fdiv.v2f64(<2 x double>, <2 x double>, metadata, metadata)
declare <4 x float> @llvm.experimental.constrained.fdiv.v4f32(<4 x float>, <4 x float>, metadata, metadata)
declare <2 x double> @llvm.experimental.constrained.sqrt.v2f64(<2 x double>, metadata, metadata)
declare <4 x float> @llvm.experimental.constrained.sqrt.v4f32(<4 x float>, metadata, metadata)
declare float @llvm.experimental.constrained.fptrunc.f32.f64(double, metadata, metadata)
declare <2 x float> @llvm.experimental.constrained.fptrunc.v2f32.v2f64(<2 x double>, metadata, metadata)
declare double @llvm.experimental.constrained.fpext.f64.f32(float, metadata)
declare <2 x double> @llvm.experimental.constrained.fpext.v2f64.v2f32(<2 x float>, metadata)
declare <2 x double> @llvm.experimental.constrained.fma.v2f64(<2 x double>, <2 x double>, <2 x double>, metadata, metadata)
declare <4 x float> @llvm.experimental.constrained.fma.v4f32(<4 x float>, <4 x float>, <4 x float>, metadata, metadata)
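
; f1/f2: constrained fadd should select a plain addpd/addps (vaddpd/vaddps under AVX).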
define <2 x double> @f1(<2 x double> %a, <2 x double> %b) #0 {
; SSE-LABEL: f1:
; SSE:       # %bb.0:
; SSE-NEXT:    addpd %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: f1:
; AVX:       # %bb.0:
; AVX-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %ret = call <2 x double> @llvm.experimental.constrained.fadd.v2f64(<2 x double> %a, <2 x double> %b,
                                                                     metadata !"round.dynamic",
                                                                     metadata !"fpexcept.strict") #0
  ret <2 x double> %ret
}

define <4 x float> @f2(<4 x float> %a, <4 x float> %b) #0 {
; SSE-LABEL: f2:
; SSE:       # %bb.0:
; SSE-NEXT:    addps %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: f2:
; AVX:       # %bb.0:
; AVX-NEXT:    vaddps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %ret = call <4 x float> @llvm.experimental.constrained.fadd.v4f32(<4 x float> %a, <4 x float> %b,
                                                                    metadata !"round.dynamic",
                                                                    metadata !"fpexcept.strict") #0
  ret <4 x float> %ret
}
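
; f3/f4: constrained fsub should select subpd/subps (vsubpd/vsubps under AVX).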
define <2 x double> @f3(<2 x double> %a, <2 x double> %b) #0 {
; SSE-LABEL: f3:
; SSE:       # %bb.0:
; SSE-NEXT:    subpd %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: f3:
; AVX:       # %bb.0:
; AVX-NEXT:    vsubpd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %ret = call <2 x double> @llvm.experimental.constrained.fsub.v2f64(<2 x double> %a, <2 x double> %b,
                                                                     metadata !"round.dynamic",
                                                                     metadata !"fpexcept.strict") #0
  ret <2 x double> %ret
}

define <4 x float> @f4(<4 x float> %a, <4 x float> %b) #0 {
; SSE-LABEL: f4:
; SSE:       # %bb.0:
; SSE-NEXT:    subps %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: f4:
; AVX:       # %bb.0:
; AVX-NEXT:    vsubps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %ret = call <4 x float> @llvm.experimental.constrained.fsub.v4f32(<4 x float> %a, <4 x float> %b,
                                                                    metadata !"round.dynamic",
                                                                    metadata !"fpexcept.strict") #0
  ret <4 x float> %ret
}
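
; f5/f6: constrained fmul should select mulpd/mulps (vmulpd/vmulps under AVX).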
define <2 x double> @f5(<2 x double> %a, <2 x double> %b) #0 {
; SSE-LABEL: f5:
; SSE:       # %bb.0:
; SSE-NEXT:    mulpd %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: f5:
; AVX:       # %bb.0:
; AVX-NEXT:    vmulpd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %ret = call <2 x double> @llvm.experimental.constrained.fmul.v2f64(<2 x double> %a, <2 x double> %b,
                                                                     metadata !"round.dynamic",
                                                                     metadata !"fpexcept.strict") #0
  ret <2 x double> %ret
}

define <4 x float> @f6(<4 x float> %a, <4 x float> %b) #0 {
; SSE-LABEL: f6:
; SSE:       # %bb.0:
; SSE-NEXT:    mulps %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: f6:
; AVX:       # %bb.0:
; AVX-NEXT:    vmulps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %ret = call <4 x float> @llvm.experimental.constrained.fmul.v4f32(<4 x float> %a, <4 x float> %b,
                                                                    metadata !"round.dynamic",
                                                                    metadata !"fpexcept.strict") #0
  ret <4 x float> %ret
}
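
; f7/f8: constrained fdiv should select divpd/divps (vdivpd/vdivps under AVX).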
define <2 x double> @f7(<2 x double> %a, <2 x double> %b) #0 {
; SSE-LABEL: f7:
; SSE:       # %bb.0:
; SSE-NEXT:    divpd %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: f7:
; AVX:       # %bb.0:
; AVX-NEXT:    vdivpd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %ret = call <2 x double> @llvm.experimental.constrained.fdiv.v2f64(<2 x double> %a, <2 x double> %b,
                                                                     metadata !"round.dynamic",
                                                                     metadata !"fpexcept.strict") #0
  ret <2 x double> %ret
}

define <4 x float> @f8(<4 x float> %a, <4 x float> %b) #0 {
; SSE-LABEL: f8:
; SSE:       # %bb.0:
; SSE-NEXT:    divps %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: f8:
; AVX:       # %bb.0:
; AVX-NEXT:    vdivps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %ret = call <4 x float> @llvm.experimental.constrained.fdiv.v4f32(<4 x float> %a, <4 x float> %b,
                                                                    metadata !"round.dynamic",
                                                                    metadata !"fpexcept.strict") #0
  ret <4 x float> %ret
}
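
; f9/f10: constrained sqrt should select sqrtpd/sqrtps (vsqrtpd/vsqrtps under AVX).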
define <2 x double> @f9(<2 x double> %a) #0 {
; SSE-LABEL: f9:
; SSE:       # %bb.0:
; SSE-NEXT:    sqrtpd %xmm0, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: f9:
; AVX:       # %bb.0:
; AVX-NEXT:    vsqrtpd %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %sqrt = call <2 x double> @llvm.experimental.constrained.sqrt.v2f64(
                              <2 x double> %a,
                              metadata !"round.dynamic",
                              metadata !"fpexcept.strict") #0
  ret <2 x double> %sqrt
}

define <4 x float> @f10(<4 x float> %a) #0 {
; SSE-LABEL: f10:
; SSE:       # %bb.0:
; SSE-NEXT:    sqrtps %xmm0, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: f10:
; AVX:       # %bb.0:
; AVX-NEXT:    vsqrtps %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %sqrt = call <4 x float> @llvm.experimental.constrained.sqrt.v4f32(
                              <4 x float> %a,
                              metadata !"round.dynamic",
                              metadata !"fpexcept.strict") #0
  ret <4 x float> %sqrt
}
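
; f11: constrained fptrunc of the low double of %a0, inserted into %a1, should select cvtsd2ss.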
define <4 x float> @f11(<2 x double> %a0, <4 x float> %a1) #0 {
; SSE-LABEL: f11:
; SSE:       # %bb.0:
; SSE-NEXT:    cvtsd2ss %xmm0, %xmm1
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: f11:
; AVX:       # %bb.0:
; AVX-NEXT:    vcvtsd2ss %xmm0, %xmm1, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %ext = extractelement <2 x double> %a0, i32 0
  %cvt = call float @llvm.experimental.constrained.fptrunc.f32.f64(double %ext,
                                                                   metadata !"round.dynamic",
                                                                   metadata !"fpexcept.strict") #0
  %res = insertelement <4 x float> %a1, float %cvt, i32 0
  ret <4 x float> %res
}
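
; f12: constrained fpext of the low float of %a1, inserted into %a0, should select cvtss2sd.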
define <2 x double> @f12(<2 x double> %a0, <4 x float> %a1) #0 {
; SSE-LABEL: f12:
; SSE:       # %bb.0:
; SSE-NEXT:    cvtss2sd %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: f12:
; AVX:       # %bb.0:
; AVX-NEXT:    vcvtss2sd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %ext = extractelement <4 x float> %a1, i32 0
  %cvt = call double @llvm.experimental.constrained.fpext.f64.f32(float %ext,
                                                                  metadata !"fpexcept.strict") #0
  %res = insertelement <2 x double> %a0, double %cvt, i32 0
  ret <2 x double> %res
}
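
; f13/f14: constrained fma. Plain SSE2 has no FMA instruction, so the vectors are
; scalarized into libcalls to fmaf/fma; with +fma or AVX512 a single
; vfmadd213ps/vfmadd213pd is selected instead.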
define <4 x float> @f13(<4 x float> %a, <4 x float> %b, <4 x float> %c) #0 {
; SSE-X86-LABEL: f13:
; SSE-X86:       # %bb.0:
; SSE-X86-NEXT:    subl $100, %esp
; SSE-X86-NEXT:    .cfi_def_cfa_offset 104
; SSE-X86-NEXT:    movups %xmm2, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; SSE-X86-NEXT:    movups %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; SSE-X86-NEXT:    movups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; SSE-X86-NEXT:    movss %xmm2, {{[0-9]+}}(%esp)
; SSE-X86-NEXT:    movss %xmm1, {{[0-9]+}}(%esp)
; SSE-X86-NEXT:    movss %xmm0, (%esp)
; SSE-X86-NEXT:    calll fmaf
; SSE-X86-NEXT:    fstpt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Spill
; SSE-X86-NEXT:    movups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-X86-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE-X86-NEXT:    movss %xmm0, {{[0-9]+}}(%esp)
; SSE-X86-NEXT:    movups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-X86-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE-X86-NEXT:    movss %xmm0, {{[0-9]+}}(%esp)
; SSE-X86-NEXT:    movups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-X86-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE-X86-NEXT:    movss %xmm0, (%esp)
; SSE-X86-NEXT:    calll fmaf
; SSE-X86-NEXT:    fstpt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Spill
; SSE-X86-NEXT:    movups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-X86-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE-X86-NEXT:    movss %xmm0, {{[0-9]+}}(%esp)
; SSE-X86-NEXT:    movups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-X86-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE-X86-NEXT:    movss %xmm0, {{[0-9]+}}(%esp)
; SSE-X86-NEXT:    movups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-X86-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE-X86-NEXT:    movss %xmm0, (%esp)
; SSE-X86-NEXT:    calll fmaf
; SSE-X86-NEXT:    movups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-X86-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
; SSE-X86-NEXT:    movss %xmm0, {{[0-9]+}}(%esp)
; SSE-X86-NEXT:    movups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-X86-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
; SSE-X86-NEXT:    movss %xmm0, {{[0-9]+}}(%esp)
; SSE-X86-NEXT:    movups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-X86-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
; SSE-X86-NEXT:    movss %xmm0, (%esp)
; SSE-X86-NEXT:    fstps {{[0-9]+}}(%esp)
; SSE-X86-NEXT:    fldt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Reload
; SSE-X86-NEXT:    fstps {{[0-9]+}}(%esp)
; SSE-X86-NEXT:    fldt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Reload
; SSE-X86-NEXT:    fstps {{[0-9]+}}(%esp)
; SSE-X86-NEXT:    calll fmaf
; SSE-X86-NEXT:    fstps {{[0-9]+}}(%esp)
; SSE-X86-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE-X86-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE-X86-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE-X86-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE-X86-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSE-X86-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE-X86-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE-X86-NEXT:    addl $100, %esp
; SSE-X86-NEXT:    .cfi_def_cfa_offset 4
; SSE-X86-NEXT:    retl
;
; SSE-X64-LABEL: f13:
; SSE-X64:       # %bb.0:
; SSE-X64-NEXT:    subq $88, %rsp
; SSE-X64-NEXT:    .cfi_def_cfa_offset 96
; SSE-X64-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-X64-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-X64-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-X64-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE-X64-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; SSE-X64-NEXT:    shufps {{.*#+}} xmm2 = xmm2[3,3,3,3]
; SSE-X64-NEXT:    callq fmaf@PLT
; SSE-X64-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
; SSE-X64-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-X64-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE-X64-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE-X64-NEXT:    movhlps {{.*#+}} xmm1 = xmm1[1,1]
; SSE-X64-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; SSE-X64-NEXT:    movhlps {{.*#+}} xmm2 = xmm2[1,1]
; SSE-X64-NEXT:    callq fmaf@PLT
; SSE-X64-NEXT:    unpcklps (%rsp), %xmm0 # 16-byte Folded Reload
; SSE-X64-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; SSE-X64-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
; SSE-X64-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-X64-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE-X64-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; SSE-X64-NEXT:    callq fmaf@PLT
; SSE-X64-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-X64-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-X64-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
; SSE-X64-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE-X64-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1,1,1]
; SSE-X64-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; SSE-X64-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,1,1,1]
; SSE-X64-NEXT:    callq fmaf@PLT
; SSE-X64-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE-X64-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE-X64-NEXT:    unpcklpd (%rsp), %xmm1 # 16-byte Folded Reload
; SSE-X64-NEXT:    # xmm1 = xmm1[0],mem[0]
; SSE-X64-NEXT:    movaps %xmm1, %xmm0
; SSE-X64-NEXT:    addq $88, %rsp
; SSE-X64-NEXT:    .cfi_def_cfa_offset 8
; SSE-X64-NEXT:    retq
;
; AVX-LABEL: f13:
; AVX:       # %bb.0:
; AVX-NEXT:    vfmadd213ps {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2
; AVX-NEXT:    ret{{[l|q]}}
  %res = call <4 x float> @llvm.experimental.constrained.fma.v4f32(<4 x float> %a, <4 x float> %b, <4 x float> %c,
                                                                   metadata !"round.dynamic",
                                                                   metadata !"fpexcept.strict") #0
  ret <4 x float> %res
}

define <2 x double> @f14(<2 x double> %a, <2 x double> %b, <2 x double> %c) #0 {
; SSE-X86-LABEL: f14:
; SSE-X86:       # %bb.0:
; SSE-X86-NEXT:    pushl %ebp
; SSE-X86-NEXT:    .cfi_def_cfa_offset 8
; SSE-X86-NEXT:    .cfi_offset %ebp, -8
; SSE-X86-NEXT:    movl %esp, %ebp
; SSE-X86-NEXT:    .cfi_def_cfa_register %ebp
; SSE-X86-NEXT:    andl $-16, %esp
; SSE-X86-NEXT:    subl $112, %esp
; SSE-X86-NEXT:    movaps %xmm2, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; SSE-X86-NEXT:    movaps %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; SSE-X86-NEXT:    movaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; SSE-X86-NEXT:    movlps %xmm2, {{[0-9]+}}(%esp)
; SSE-X86-NEXT:    movlps %xmm1, {{[0-9]+}}(%esp)
; SSE-X86-NEXT:    movlps %xmm0, (%esp)
; SSE-X86-NEXT:    calll fma
; SSE-X86-NEXT:    movaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-X86-NEXT:    movhps %xmm0, {{[0-9]+}}(%esp)
; SSE-X86-NEXT:    movaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-X86-NEXT:    movhps %xmm0, {{[0-9]+}}(%esp)
; SSE-X86-NEXT:    movaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-X86-NEXT:    movhps %xmm0, (%esp)
; SSE-X86-NEXT:    fstpl {{[0-9]+}}(%esp)
; SSE-X86-NEXT:    calll fma
; SSE-X86-NEXT:    fstpl {{[0-9]+}}(%esp)
; SSE-X86-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
; SSE-X86-NEXT:    movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
; SSE-X86-NEXT:    movl %ebp, %esp
; SSE-X86-NEXT:    popl %ebp
; SSE-X86-NEXT:    .cfi_def_cfa %esp, 4
; SSE-X86-NEXT:    retl
;
; SSE-X64-LABEL: f14:
; SSE-X64:       # %bb.0:
; SSE-X64-NEXT:    subq $72, %rsp
; SSE-X64-NEXT:    .cfi_def_cfa_offset 80
; SSE-X64-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-X64-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-X64-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
; SSE-X64-NEXT:    callq fma@PLT
; SSE-X64-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-X64-NEXT:    movaps (%rsp), %xmm0 # 16-byte Reload
; SSE-X64-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE-X64-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE-X64-NEXT:    movhlps {{.*#+}} xmm1 = xmm1[1,1]
; SSE-X64-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; SSE-X64-NEXT:    movhlps {{.*#+}} xmm2 = xmm2[1,1]
; SSE-X64-NEXT:    callq fma@PLT
; SSE-X64-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE-X64-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE-X64-NEXT:    movaps %xmm1, %xmm0
; SSE-X64-NEXT:    addq $72, %rsp
; SSE-X64-NEXT:    .cfi_def_cfa_offset 8
; SSE-X64-NEXT:    retq
;
; AVX-LABEL: f14:
; AVX:       # %bb.0:
; AVX-NEXT:    vfmadd213pd {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2
; AVX-NEXT:    ret{{[l|q]}}
  %res = call <2 x double> @llvm.experimental.constrained.fma.v2f64(<2 x double> %a, <2 x double> %b, <2 x double> %c,
                                                                    metadata !"round.dynamic",
                                                                    metadata !"fpexcept.strict") #0
  ret <2 x double> %res
}
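
; f15: constrained vector fpext v2f32 -> v2f64 should select cvtps2pd.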
define <2 x double> @f15(<2 x float> %a) #0 {
; SSE-LABEL: f15:
; SSE:       # %bb.0:
; SSE-NEXT:    cvtps2pd %xmm0, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: f15:
; AVX:       # %bb.0:
; AVX-NEXT:    vcvtps2pd %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %ret = call <2 x double> @llvm.experimental.constrained.fpext.v2f64.v2f32(
                              <2 x float> %a,
                              metadata !"fpexcept.strict") #0
  ret <2 x double> %ret
}
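
; f16: constrained vector fptrunc v2f64 -> v2f32 should select cvtpd2ps.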
define <2 x float> @f16(<2 x double> %a) #0 {
; SSE-LABEL: f16:
; SSE:       # %bb.0:
; SSE-NEXT:    cvtpd2ps %xmm0, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: f16:
; AVX:       # %bb.0:
; AVX-NEXT:    vcvtpd2ps %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %ret = call <2 x float> @llvm.experimental.constrained.fptrunc.v2f32.v2f64(
                              <2 x double> %a,
                              metadata !"round.dynamic",
                              metadata !"fpexcept.strict") #0
  ret <2 x float> %ret
}

attributes #0 = { strictfp }