1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+mmx,+sse2 | FileCheck %s --check-prefix=X86
3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+mmx,+sse2 | FileCheck %s --check-prefix=X64
; Codegen test: four double arguments are reinterpreted as x86_mmx values and
; fed through a chain of MMX add/multiply intrinsics in which an all-zero
; constant appears twice as an operand (see %8 and %21 in the body).
; The autogenerated checks pin the lowering for both llc configurations:
;   - the zero multiplicand of %8 is materialized in a register with pxor,
;   - the zero addend of %21 is read from a constant-pool entry (LCPI...),
;   - the second argument (%6) is spilled to the stack early and folded back
;     in as the memory operand of the last pmuludq.
; NOTE(review): the terminating ret and the closing brace are not visible in
; this excerpt; %27 is presumably the returned value -- confirm in the full file.
5 define double @mmx_zero(double, double, double, double) nounwind {
9 ; X86-NEXT: movl %esp, %ebp
10 ; X86-NEXT: andl $-8, %esp
11 ; X86-NEXT: subl $16, %esp
12 ; X86-NEXT: movq 8(%ebp), %mm0
13 ; X86-NEXT: movq 16(%ebp), %mm5
14 ; X86-NEXT: movq %mm5, (%esp) # 8-byte Spill
15 ; X86-NEXT: movq %mm0, %mm3
16 ; X86-NEXT: paddd %mm5, %mm3
17 ; X86-NEXT: pxor %mm1, %mm1
18 ; X86-NEXT: movq %mm3, %mm6
19 ; X86-NEXT: pmuludq %mm1, %mm6
20 ; X86-NEXT: movq 24(%ebp), %mm4
21 ; X86-NEXT: movq %mm6, %mm2
22 ; X86-NEXT: paddd %mm4, %mm2
23 ; X86-NEXT: paddw %mm2, %mm0
24 ; X86-NEXT: movq %mm5, %mm1
25 ; X86-NEXT: paddw %mm0, %mm1
26 ; X86-NEXT: movq 32(%ebp), %mm5
27 ; X86-NEXT: movq %mm1, %mm7
28 ; X86-NEXT: pmuludq %mm5, %mm7
29 ; X86-NEXT: paddw %mm4, %mm7
30 ; X86-NEXT: paddw %mm7, %mm5
31 ; X86-NEXT: paddw %mm5, %mm2
32 ; X86-NEXT: paddw %mm2, %mm0
33 ; X86-NEXT: paddw %mm6, %mm0
34 ; X86-NEXT: pmuludq %mm3, %mm0
35 ; X86-NEXT: paddw {{\.?LCPI[0-9]+_[0-9]+}}, %mm0
36 ; X86-NEXT: paddw %mm1, %mm0
37 ; X86-NEXT: pmuludq %mm7, %mm0
38 ; X86-NEXT: pmuludq (%esp), %mm0 # 8-byte Folded Reload
39 ; X86-NEXT: paddw %mm5, %mm0
40 ; X86-NEXT: paddw %mm2, %mm0
41 ; X86-NEXT: movq2dq %mm0, %xmm0
42 ; X86-NEXT: movsd %xmm0, {{[0-9]+}}(%esp)
43 ; X86-NEXT: fldl {{[0-9]+}}(%esp)
44 ; X86-NEXT: movl %ebp, %esp
48 ; X64-LABEL: mmx_zero:
50 ; X64-NEXT: movdq2q %xmm0, %mm0
51 ; X64-NEXT: movdq2q %xmm1, %mm5
52 ; X64-NEXT: movq %mm5, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
53 ; X64-NEXT: movq %mm0, %mm3
54 ; X64-NEXT: paddd %mm5, %mm3
55 ; X64-NEXT: pxor %mm1, %mm1
56 ; X64-NEXT: movq %mm3, %mm6
57 ; X64-NEXT: pmuludq %mm1, %mm6
58 ; X64-NEXT: movdq2q %xmm2, %mm4
59 ; X64-NEXT: movq %mm6, %mm2
60 ; X64-NEXT: paddd %mm4, %mm2
61 ; X64-NEXT: paddw %mm2, %mm0
62 ; X64-NEXT: movq %mm5, %mm1
63 ; X64-NEXT: paddw %mm0, %mm1
64 ; X64-NEXT: movdq2q %xmm3, %mm5
65 ; X64-NEXT: movq %mm1, %mm7
66 ; X64-NEXT: pmuludq %mm5, %mm7
67 ; X64-NEXT: paddw %mm4, %mm7
68 ; X64-NEXT: paddw %mm7, %mm5
69 ; X64-NEXT: paddw %mm5, %mm2
70 ; X64-NEXT: paddw %mm2, %mm0
71 ; X64-NEXT: paddw %mm6, %mm0
72 ; X64-NEXT: pmuludq %mm3, %mm0
73 ; X64-NEXT: paddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %mm0
74 ; X64-NEXT: paddw %mm1, %mm0
75 ; X64-NEXT: pmuludq %mm7, %mm0
76 ; X64-NEXT: pmuludq {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
77 ; X64-NEXT: paddw %mm5, %mm0
78 ; X64-NEXT: paddw %mm2, %mm0
79 ; X64-NEXT: movq2dq %mm0, %xmm0
; Reinterpret the first two double arguments as MMX values and add them.
81 %5 = bitcast double %0 to x86_mmx
82 %6 = bitcast double %1 to x86_mmx
83 %7 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %5, x86_mmx %6)
; First zero operand: multiply by an all-zero x86_mmx constant. In the checks
; this zero is the register cleared by pxor.
84 %8 = tail call x86_mmx @llvm.x86.mmx.pmulu.dq(x86_mmx %7, x86_mmx bitcast (double 0.000000e+00 to x86_mmx))
85 %9 = bitcast double %2 to x86_mmx
86 %10 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %8, x86_mmx %9)
87 %11 = tail call x86_mmx @llvm.x86.mmx.padd.w(x86_mmx %5, x86_mmx %10)
88 %12 = tail call x86_mmx @llvm.x86.mmx.padd.w(x86_mmx %6, x86_mmx %11)
89 %13 = bitcast double %3 to x86_mmx
90 %14 = tail call x86_mmx @llvm.x86.mmx.pmulu.dq(x86_mmx %12, x86_mmx %13)
; %15, %16 and %17 are each used again later (by %23, %25 and %26), so several
; intermediate values stay live simultaneously through the rest of the chain.
91 %15 = tail call x86_mmx @llvm.x86.mmx.padd.w(x86_mmx %14, x86_mmx %9)
92 %16 = tail call x86_mmx @llvm.x86.mmx.padd.w(x86_mmx %15, x86_mmx %13)
93 %17 = tail call x86_mmx @llvm.x86.mmx.padd.w(x86_mmx %16, x86_mmx %10)
94 %18 = tail call x86_mmx @llvm.x86.mmx.padd.w(x86_mmx %17, x86_mmx %11)
95 %19 = tail call x86_mmx @llvm.x86.mmx.padd.w(x86_mmx %18, x86_mmx %8)
96 %20 = tail call x86_mmx @llvm.x86.mmx.pmulu.dq(x86_mmx %19, x86_mmx %7)
; Second zero operand: add an all-zero x86_mmx constant. In the checks this
; zero is the constant-pool (LCPI...) memory operand of paddw.
97 %21 = tail call x86_mmx @llvm.x86.mmx.padd.w(x86_mmx %20, x86_mmx bitcast (double 0.000000e+00 to x86_mmx))
98 %22 = tail call x86_mmx @llvm.x86.mmx.padd.w(x86_mmx %21, x86_mmx %12)
99 %23 = tail call x86_mmx @llvm.x86.mmx.pmulu.dq(x86_mmx %22, x86_mmx %15)
; Late reuse of %6 (the second argument): this is the multiply whose memory
; operand is the "Folded Reload" of the earlier spill in the checks.
100 %24 = tail call x86_mmx @llvm.x86.mmx.pmulu.dq(x86_mmx %23, x86_mmx %6)
101 %25 = tail call x86_mmx @llvm.x86.mmx.padd.w(x86_mmx %24, x86_mmx %16)
102 %26 = tail call x86_mmx @llvm.x86.mmx.padd.w(x86_mmx %25, x86_mmx %17)
; Reinterpret the final MMX value as a double.
103 %27 = bitcast x86_mmx %26 to double
; Declarations of the MMX intrinsics called by @mmx_zero. Each takes two
; x86_mmx operands and returns an x86_mmx; the checks in this file show them
; selected as paddd, paddw and pmuludq respectively.
107 declare x86_mmx @llvm.x86.mmx.padd.d(x86_mmx, x86_mmx)
108 declare x86_mmx @llvm.x86.mmx.padd.w(x86_mmx, x86_mmx)
109 declare x86_mmx @llvm.x86.mmx.pmulu.dq(x86_mmx, x86_mmx)