; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 -O3 | FileCheck %s --check-prefixes=SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+f16c -O3 | FileCheck %s --check-prefixes=AVX,F16C
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f -O3 | FileCheck %s --check-prefixes=AVX,AVX512
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512fp16 -mattr=+avx512vl -O3 | FileCheck %s --check-prefixes=X86
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512fp16 -mattr=+avx512vl -O3 | FileCheck %s --check-prefixes=X64

declare half @llvm.experimental.constrained.fadd.f16(half, half, metadata, metadata)
declare half @llvm.experimental.constrained.fsub.f16(half, half, metadata, metadata)
declare half @llvm.experimental.constrained.fmul.f16(half, half, metadata, metadata)
declare half @llvm.experimental.constrained.fdiv.f16(half, half, metadata, metadata)
declare float @llvm.experimental.constrained.fpext.f32.f16(half, metadata)
declare double @llvm.experimental.constrained.fpext.f64.f16(half, metadata)
declare half @llvm.experimental.constrained.fptrunc.f16.f32(float, metadata, metadata)
declare half @llvm.experimental.constrained.fptrunc.f16.f64(double, metadata, metadata)
declare half @llvm.experimental.constrained.sqrt.f16(half, metadata, metadata)
declare half @llvm.experimental.constrained.fma.f16(half, half, half, metadata, metadata)

define half @fadd_f16(half %a, half %b) nounwind strictfp {
; SSE2-LABEL: fadd_f16:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pushq %rax
; SSE2-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    callq __extendhfsf2@PLT
; SSE2-NEXT:    movss %xmm0, (%rsp) # 4-byte Spill
; SSE2-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
; SSE2-NEXT:    # xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT:    callq __extendhfsf2@PLT
; SSE2-NEXT:    addss (%rsp), %xmm0 # 4-byte Folded Reload
; SSE2-NEXT:    callq __truncsfhf2@PLT
; SSE2-NEXT:    popq %rax
; SSE2-NEXT:    retq
;
; AVX-LABEL: fadd_f16:
; AVX:       # %bb.0:
; AVX-NEXT:    vpextrw $0, %xmm0, %eax
; AVX-NEXT:    vpextrw $0, %xmm1, %ecx
; AVX-NEXT:    movzwl %cx, %ecx
; AVX-NEXT:    vmovd %ecx, %xmm0
; AVX-NEXT:    vcvtph2ps %xmm0, %xmm0
; AVX-NEXT:    movzwl %ax, %eax
; AVX-NEXT:    vmovd %eax, %xmm1
; AVX-NEXT:    vcvtph2ps %xmm1, %xmm1
; AVX-NEXT:    vaddss %xmm0, %xmm1, %xmm0
; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
; AVX-NEXT:    vmovd %xmm0, %eax
; AVX-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm0
; AVX-NEXT:    retq
;
; X86-LABEL: fadd_f16:
; X86:       # %bb.0:
; X86-NEXT:    vmovsh {{[0-9]+}}(%esp), %xmm0
; X86-NEXT:    vaddsh {{[0-9]+}}(%esp), %xmm0, %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: fadd_f16:
; X64:       # %bb.0:
; X64-NEXT:    vaddsh %xmm1, %xmm0, %xmm0
; X64-NEXT:    retq
  %ret = call half @llvm.experimental.constrained.fadd.f16(half %a, half %b,
                                                           metadata !"round.dynamic",
                                                           metadata !"fpexcept.strict") #0
  ret half %ret
}

define half @fsub_f16(half %a, half %b) nounwind strictfp {
; SSE2-LABEL: fsub_f16:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pushq %rax
; SSE2-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    callq __extendhfsf2@PLT
; SSE2-NEXT:    movss %xmm0, (%rsp) # 4-byte Spill
; SSE2-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
; SSE2-NEXT:    # xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT:    callq __extendhfsf2@PLT
; SSE2-NEXT:    subss (%rsp), %xmm0 # 4-byte Folded Reload
; SSE2-NEXT:    callq __truncsfhf2@PLT
; SSE2-NEXT:    popq %rax
; SSE2-NEXT:    retq
;
; AVX-LABEL: fsub_f16:
; AVX:       # %bb.0:
; AVX-NEXT:    vpextrw $0, %xmm0, %eax
; AVX-NEXT:    vpextrw $0, %xmm1, %ecx
; AVX-NEXT:    movzwl %cx, %ecx
; AVX-NEXT:    vmovd %ecx, %xmm0
; AVX-NEXT:    vcvtph2ps %xmm0, %xmm0
; AVX-NEXT:    movzwl %ax, %eax
; AVX-NEXT:    vmovd %eax, %xmm1
; AVX-NEXT:    vcvtph2ps %xmm1, %xmm1
; AVX-NEXT:    vsubss %xmm0, %xmm1, %xmm0
; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
; AVX-NEXT:    vmovd %xmm0, %eax
; AVX-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm0
; AVX-NEXT:    retq
;
; X86-LABEL: fsub_f16:
; X86:       # %bb.0:
; X86-NEXT:    vmovsh {{[0-9]+}}(%esp), %xmm0
; X86-NEXT:    vsubsh {{[0-9]+}}(%esp), %xmm0, %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: fsub_f16:
; X64:       # %bb.0:
; X64-NEXT:    vsubsh %xmm1, %xmm0, %xmm0
; X64-NEXT:    retq
  %ret = call half @llvm.experimental.constrained.fsub.f16(half %a, half %b,
                                                           metadata !"round.dynamic",
                                                           metadata !"fpexcept.strict") #0
  ret half %ret
}

define half @fmul_f16(half %a, half %b) nounwind strictfp {
; SSE2-LABEL: fmul_f16:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pushq %rax
; SSE2-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    callq __extendhfsf2@PLT
; SSE2-NEXT:    movss %xmm0, (%rsp) # 4-byte Spill
; SSE2-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
; SSE2-NEXT:    # xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT:    callq __extendhfsf2@PLT
; SSE2-NEXT:    mulss (%rsp), %xmm0 # 4-byte Folded Reload
; SSE2-NEXT:    callq __truncsfhf2@PLT
; SSE2-NEXT:    popq %rax
; SSE2-NEXT:    retq
;
; AVX-LABEL: fmul_f16:
; AVX:       # %bb.0:
; AVX-NEXT:    vpextrw $0, %xmm0, %eax
; AVX-NEXT:    vpextrw $0, %xmm1, %ecx
; AVX-NEXT:    movzwl %cx, %ecx
; AVX-NEXT:    vmovd %ecx, %xmm0
; AVX-NEXT:    vcvtph2ps %xmm0, %xmm0
; AVX-NEXT:    movzwl %ax, %eax
; AVX-NEXT:    vmovd %eax, %xmm1
; AVX-NEXT:    vcvtph2ps %xmm1, %xmm1
; AVX-NEXT:    vmulss %xmm0, %xmm1, %xmm0
; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
; AVX-NEXT:    vmovd %xmm0, %eax
; AVX-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm0
; AVX-NEXT:    retq
;
; X86-LABEL: fmul_f16:
; X86:       # %bb.0:
; X86-NEXT:    vmovsh {{[0-9]+}}(%esp), %xmm0
; X86-NEXT:    vmulsh {{[0-9]+}}(%esp), %xmm0, %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: fmul_f16:
; X64:       # %bb.0:
; X64-NEXT:    vmulsh %xmm1, %xmm0, %xmm0
; X64-NEXT:    retq
  %ret = call half @llvm.experimental.constrained.fmul.f16(half %a, half %b,
                                                           metadata !"round.dynamic",
                                                           metadata !"fpexcept.strict") #0
  ret half %ret
}

define half @fdiv_f16(half %a, half %b) nounwind strictfp {
; SSE2-LABEL: fdiv_f16:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pushq %rax
; SSE2-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    callq __extendhfsf2@PLT
; SSE2-NEXT:    movss %xmm0, (%rsp) # 4-byte Spill
; SSE2-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
; SSE2-NEXT:    # xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT:    callq __extendhfsf2@PLT
; SSE2-NEXT:    divss (%rsp), %xmm0 # 4-byte Folded Reload
; SSE2-NEXT:    callq __truncsfhf2@PLT
; SSE2-NEXT:    popq %rax
; SSE2-NEXT:    retq
;
; AVX-LABEL: fdiv_f16:
; AVX:       # %bb.0:
; AVX-NEXT:    vpextrw $0, %xmm0, %eax
; AVX-NEXT:    vpextrw $0, %xmm1, %ecx
; AVX-NEXT:    movzwl %cx, %ecx
; AVX-NEXT:    vmovd %ecx, %xmm0
; AVX-NEXT:    vcvtph2ps %xmm0, %xmm0
; AVX-NEXT:    movzwl %ax, %eax
; AVX-NEXT:    vmovd %eax, %xmm1
; AVX-NEXT:    vcvtph2ps %xmm1, %xmm1
; AVX-NEXT:    vdivss %xmm0, %xmm1, %xmm0
; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
; AVX-NEXT:    vmovd %xmm0, %eax
; AVX-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm0
; AVX-NEXT:    retq
;
; X86-LABEL: fdiv_f16:
; X86:       # %bb.0:
; X86-NEXT:    vmovsh {{[0-9]+}}(%esp), %xmm0
; X86-NEXT:    vdivsh {{[0-9]+}}(%esp), %xmm0, %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: fdiv_f16:
; X64:       # %bb.0:
; X64-NEXT:    vdivsh %xmm1, %xmm0, %xmm0
; X64-NEXT:    retq
  %ret = call half @llvm.experimental.constrained.fdiv.f16(half %a, half %b,
                                                           metadata !"round.dynamic",
                                                           metadata !"fpexcept.strict") #0
  ret half %ret
}

define void @fpext_f16_to_f32(ptr %val, ptr %ret) nounwind strictfp {
; SSE2-LABEL: fpext_f16_to_f32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pushq %rbx
; SSE2-NEXT:    movq %rsi, %rbx
; SSE2-NEXT:    pinsrw $0, (%rdi), %xmm0
; SSE2-NEXT:    callq __extendhfsf2@PLT
; SSE2-NEXT:    movd %xmm0, (%rbx)
; SSE2-NEXT:    popq %rbx
; SSE2-NEXT:    retq
;
; AVX-LABEL: fpext_f16_to_f32:
; AVX:       # %bb.0:
; AVX-NEXT:    movzwl (%rdi), %eax
; AVX-NEXT:    vmovd %eax, %xmm0
; AVX-NEXT:    vcvtph2ps %xmm0, %xmm0
; AVX-NEXT:    vmovss %xmm0, (%rsi)
; AVX-NEXT:    retq
;
; X86-LABEL: fpext_f16_to_f32:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    vmovsh (%ecx), %xmm0
; X86-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
; X86-NEXT:    vmovss %xmm0, (%eax)
; X86-NEXT:    retl
;
; X64-LABEL: fpext_f16_to_f32:
; X64:       # %bb.0:
; X64-NEXT:    vmovsh (%rdi), %xmm0
; X64-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
; X64-NEXT:    vmovss %xmm0, (%rsi)
; X64-NEXT:    retq
  %1 = load half, ptr %val, align 4
  %res = call float @llvm.experimental.constrained.fpext.f32.f16(half %1,
                                                                 metadata !"fpexcept.strict") #0
  store float %res, ptr %ret, align 8
  ret void
}

define void @fpext_f16_to_f64(ptr %val, ptr %ret) nounwind strictfp {
; SSE2-LABEL: fpext_f16_to_f64:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pushq %rbx
; SSE2-NEXT:    movq %rsi, %rbx
; SSE2-NEXT:    pinsrw $0, (%rdi), %xmm0
; SSE2-NEXT:    callq __extendhfsf2@PLT
; SSE2-NEXT:    cvtss2sd %xmm0, %xmm0
; SSE2-NEXT:    movsd %xmm0, (%rbx)
; SSE2-NEXT:    popq %rbx
; SSE2-NEXT:    retq
;
; AVX-LABEL: fpext_f16_to_f64:
; AVX:       # %bb.0:
; AVX-NEXT:    movzwl (%rdi), %eax
; AVX-NEXT:    vmovd %eax, %xmm0
; AVX-NEXT:    vcvtph2ps %xmm0, %xmm0
; AVX-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vmovsd %xmm0, (%rsi)
; AVX-NEXT:    retq
;
; X86-LABEL: fpext_f16_to_f64:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    vmovsh (%ecx), %xmm0
; X86-NEXT:    vcvtsh2sd %xmm0, %xmm0, %xmm0
; X86-NEXT:    vmovsd %xmm0, (%eax)
; X86-NEXT:    retl
;
; X64-LABEL: fpext_f16_to_f64:
; X64:       # %bb.0:
; X64-NEXT:    vmovsh (%rdi), %xmm0
; X64-NEXT:    vcvtsh2sd %xmm0, %xmm0, %xmm0
; X64-NEXT:    vmovsd %xmm0, (%rsi)
; X64-NEXT:    retq
  %1 = load half, ptr %val, align 4
  %res = call double @llvm.experimental.constrained.fpext.f64.f16(half %1,
                                                                  metadata !"fpexcept.strict") #0
  store double %res, ptr %ret, align 8
  ret void
}

define void @fptrunc_float_to_f16(ptr %val, ptr %ret) nounwind strictfp {
; SSE2-LABEL: fptrunc_float_to_f16:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pushq %rbx
; SSE2-NEXT:    movq %rsi, %rbx
; SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT:    callq __truncsfhf2@PLT
; SSE2-NEXT:    pextrw $0, %xmm0, %eax
; SSE2-NEXT:    movw %ax, (%rbx)
; SSE2-NEXT:    popq %rbx
; SSE2-NEXT:    retq
;
; AVX-LABEL: fptrunc_float_to_f16:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
; AVX-NEXT:    vmovd %xmm0, %eax
; AVX-NEXT:    movw %ax, (%rsi)
; AVX-NEXT:    retq
;
; X86-LABEL: fptrunc_float_to_f16:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
; X86-NEXT:    vmovsh %xmm0, (%eax)
; X86-NEXT:    retl
;
; X64-LABEL: fptrunc_float_to_f16:
; X64:       # %bb.0:
; X64-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
; X64-NEXT:    vmovsh %xmm0, (%rsi)
; X64-NEXT:    retq
  %1 = load float, ptr %val, align 8
  %res = call half @llvm.experimental.constrained.fptrunc.f16.f32(float %1,
                                                                  metadata !"round.dynamic",
                                                                  metadata !"fpexcept.strict") #0
  store half %res, ptr %ret, align 4
  ret void
}

define void @fptrunc_double_to_f16(ptr %val, ptr %ret) nounwind strictfp {
; SSE2-LABEL: fptrunc_double_to_f16:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pushq %rbx
; SSE2-NEXT:    movq %rsi, %rbx
; SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
; SSE2-NEXT:    callq __truncdfhf2@PLT
; SSE2-NEXT:    pextrw $0, %xmm0, %eax
; SSE2-NEXT:    movw %ax, (%rbx)
; SSE2-NEXT:    popq %rbx
; SSE2-NEXT:    retq
;
; AVX-LABEL: fptrunc_double_to_f16:
; AVX:       # %bb.0:
; AVX-NEXT:    pushq %rbx
; AVX-NEXT:    movq %rsi, %rbx
; AVX-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT:    callq __truncdfhf2@PLT
; AVX-NEXT:    vpextrw $0, %xmm0, (%rbx)
; AVX-NEXT:    popq %rbx
; AVX-NEXT:    retq
;
; X86-LABEL: fptrunc_double_to_f16:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; X86-NEXT:    vcvtsd2sh %xmm0, %xmm0, %xmm0
; X86-NEXT:    vmovsh %xmm0, (%eax)
; X86-NEXT:    retl
;
; X64-LABEL: fptrunc_double_to_f16:
; X64:       # %bb.0:
; X64-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; X64-NEXT:    vcvtsd2sh %xmm0, %xmm0, %xmm0
; X64-NEXT:    vmovsh %xmm0, (%rsi)
; X64-NEXT:    retq
  %1 = load double, ptr %val, align 8
  %res = call half @llvm.experimental.constrained.fptrunc.f16.f64(double %1,
                                                                  metadata !"round.dynamic",
                                                                  metadata !"fpexcept.strict") #0
  store half %res, ptr %ret, align 4
  ret void
}

define void @fsqrt_f16(ptr %a) nounwind strictfp {
; SSE2-LABEL: fsqrt_f16:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pushq %rbx
; SSE2-NEXT:    movq %rdi, %rbx
; SSE2-NEXT:    pinsrw $0, (%rdi), %xmm0
; SSE2-NEXT:    callq __extendhfsf2@PLT
; SSE2-NEXT:    sqrtss %xmm0, %xmm0
; SSE2-NEXT:    callq __truncsfhf2@PLT
; SSE2-NEXT:    pextrw $0, %xmm0, %eax
; SSE2-NEXT:    movw %ax, (%rbx)
; SSE2-NEXT:    popq %rbx
; SSE2-NEXT:    retq
;
; AVX-LABEL: fsqrt_f16:
; AVX:       # %bb.0:
; AVX-NEXT:    movzwl (%rdi), %eax
; AVX-NEXT:    vmovd %eax, %xmm0
; AVX-NEXT:    vcvtph2ps %xmm0, %xmm0
; AVX-NEXT:    vsqrtss %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
; AVX-NEXT:    vmovd %xmm0, %eax
; AVX-NEXT:    movw %ax, (%rdi)
; AVX-NEXT:    retq
;
; X86-LABEL: fsqrt_f16:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vmovsh (%eax), %xmm0
; X86-NEXT:    vsqrtsh %xmm0, %xmm0, %xmm0
; X86-NEXT:    vmovsh %xmm0, (%eax)
; X86-NEXT:    retl
;
; X64-LABEL: fsqrt_f16:
; X64:       # %bb.0:
; X64-NEXT:    vmovsh (%rdi), %xmm0
; X64-NEXT:    vsqrtsh %xmm0, %xmm0, %xmm0
; X64-NEXT:    vmovsh %xmm0, (%rdi)
; X64-NEXT:    retq
  %1 = load half, ptr %a, align 4
  %res = call half @llvm.experimental.constrained.sqrt.f16(half %1,
                                                           metadata !"round.dynamic",
                                                           metadata !"fpexcept.strict") #0
  store half %res, ptr %a, align 4
  ret void
}

define half @fma_f16(half %a, half %b, half %c) nounwind strictfp {
; SSE2-LABEL: fma_f16:
; SSE2:       # %bb.0:
; SSE2-NEXT:    subq $24, %rsp
; SSE2-NEXT:    movss %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; SSE2-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    callq __extendhfsf2@PLT
; SSE2-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; SSE2-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
; SSE2-NEXT:    # xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT:    callq __extendhfsf2@PLT
; SSE2-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; SSE2-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
; SSE2-NEXT:    # xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT:    callq __extendhfsf2@PLT
; SSE2-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload
; SSE2-NEXT:    # xmm1 = mem[0],zero,zero,zero
; SSE2-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 4-byte Reload
; SSE2-NEXT:    # xmm2 = mem[0],zero,zero,zero
; SSE2-NEXT:    callq fmaf@PLT
; SSE2-NEXT:    callq __truncsfhf2@PLT
; SSE2-NEXT:    addq $24, %rsp
; SSE2-NEXT:    retq
;
; F16C-LABEL: fma_f16:
; F16C:       # %bb.0:
; F16C-NEXT:    pushq %rax
; F16C-NEXT:    vpextrw $0, %xmm0, %eax
; F16C-NEXT:    vpextrw $0, %xmm1, %ecx
; F16C-NEXT:    vpextrw $0, %xmm2, %edx
; F16C-NEXT:    movzwl %dx, %edx
; F16C-NEXT:    vmovd %edx, %xmm0
; F16C-NEXT:    vcvtph2ps %xmm0, %xmm2
; F16C-NEXT:    movzwl %cx, %ecx
; F16C-NEXT:    vmovd %ecx, %xmm0
; F16C-NEXT:    vcvtph2ps %xmm0, %xmm1
; F16C-NEXT:    movzwl %ax, %eax
; F16C-NEXT:    vmovd %eax, %xmm0
; F16C-NEXT:    vcvtph2ps %xmm0, %xmm0
; F16C-NEXT:    callq fmaf@PLT
; F16C-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; F16C-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; F16C-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
; F16C-NEXT:    vmovd %xmm0, %eax
; F16C-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm0
; F16C-NEXT:    popq %rax
; F16C-NEXT:    retq
;
; AVX512-LABEL: fma_f16:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpextrw $0, %xmm1, %eax
; AVX512-NEXT:    vpextrw $0, %xmm0, %ecx
; AVX512-NEXT:    vpextrw $0, %xmm2, %edx
; AVX512-NEXT:    movzwl %dx, %edx
; AVX512-NEXT:    vmovd %edx, %xmm0
; AVX512-NEXT:    vcvtph2ps %xmm0, %xmm0
; AVX512-NEXT:    movzwl %cx, %ecx
; AVX512-NEXT:    vmovd %ecx, %xmm1
; AVX512-NEXT:    vcvtph2ps %xmm1, %xmm1
; AVX512-NEXT:    movzwl %ax, %eax
; AVX512-NEXT:    vmovd %eax, %xmm2
; AVX512-NEXT:    vcvtph2ps %xmm2, %xmm2
; AVX512-NEXT:    vfmadd213ss {{.*#+}} xmm2 = (xmm1 * xmm2) + xmm0
; AVX512-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX512-NEXT:    vblendps {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3]
; AVX512-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
; AVX512-NEXT:    vmovd %xmm0, %eax
; AVX512-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm0
; AVX512-NEXT:    retq
;
; X86-LABEL: fma_f16:
; X86:       # %bb.0:
; X86-NEXT:    vmovsh {{[0-9]+}}(%esp), %xmm1
; X86-NEXT:    vmovsh {{[0-9]+}}(%esp), %xmm0
; X86-NEXT:    vfmadd213sh {{[0-9]+}}(%esp), %xmm1, %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: fma_f16:
; X64:       # %bb.0:
; X64-NEXT:    vfmadd213sh %xmm2, %xmm1, %xmm0
; X64-NEXT:    retq
  %res = call half @llvm.experimental.constrained.fma.f16(half %a, half %b, half %c,
                                                          metadata !"round.dynamic",
                                                          metadata !"fpexcept.strict") #0
  ret half %res
}

attributes #0 = { strictfp }