; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+mmx,+sse2 | FileCheck %s --check-prefix=X86
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+mmx,+sse2 -code-model=small | FileCheck %s --check-prefix=X64
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+mmx,+sse2 -code-model=medium | FileCheck %s --check-prefix=X64
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+mmx,+sse2 -code-model=large | FileCheck %s --check-prefix=X64-LARGE
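; The zero operand of the MMX intrinsics below is a bitcast of double 0.0.
; In the small and medium code models (which produce identical output and
; share the X64 prefix) it is expected to fold as a RIP-relative
; constant-pool operand of paddw; in the large code model the constant pool
; is not assumed to be RIP-reachable, so the zero should instead be
; materialized in a register with pxor.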
define double @mmx_zero(double, double, double, double) nounwind {
; X86-LABEL: mmx_zero:
; X86:       # %bb.0:
; X86-NEXT:    pushl %ebp
; X86-NEXT:    movl %esp, %ebp
; X86-NEXT:    andl $-8, %esp
; X86-NEXT:    subl $16, %esp
; X86-NEXT:    movq 8(%ebp), %mm0
; X86-NEXT:    movq 16(%ebp), %mm5
; X86-NEXT:    movq %mm5, (%esp) # 8-byte Spill
; X86-NEXT:    movq %mm0, %mm3
; X86-NEXT:    paddd %mm5, %mm3
; X86-NEXT:    pxor %mm1, %mm1
; X86-NEXT:    movq %mm3, %mm6
; X86-NEXT:    pmuludq %mm1, %mm6
; X86-NEXT:    movq 24(%ebp), %mm4
; X86-NEXT:    movq %mm6, %mm2
; X86-NEXT:    paddd %mm4, %mm2
; X86-NEXT:    paddw %mm2, %mm0
; X86-NEXT:    movq %mm5, %mm1
; X86-NEXT:    paddw %mm0, %mm1
; X86-NEXT:    movq 32(%ebp), %mm5
; X86-NEXT:    movq %mm1, %mm7
; X86-NEXT:    pmuludq %mm5, %mm7
; X86-NEXT:    paddw %mm4, %mm7
; X86-NEXT:    paddw %mm7, %mm5
; X86-NEXT:    paddw %mm5, %mm2
; X86-NEXT:    paddw %mm2, %mm0
; X86-NEXT:    paddw %mm6, %mm0
; X86-NEXT:    pmuludq %mm3, %mm0
; X86-NEXT:    paddw {{\.?LCPI[0-9]+_[0-9]+}}, %mm0
; X86-NEXT:    paddw %mm1, %mm0
; X86-NEXT:    pmuludq %mm7, %mm0
; X86-NEXT:    pmuludq (%esp), %mm0 # 8-byte Folded Reload
; X86-NEXT:    paddw %mm5, %mm0
; X86-NEXT:    paddw %mm2, %mm0
; X86-NEXT:    movq2dq %mm0, %xmm0
; X86-NEXT:    movsd %xmm0, {{[0-9]+}}(%esp)
; X86-NEXT:    fldl {{[0-9]+}}(%esp)
; X86-NEXT:    movl %ebp, %esp
; X86-NEXT:    popl %ebp
; X86-NEXT:    retl
;
; X64-LABEL: mmx_zero:
; X64:       # %bb.0:
; X64-NEXT:    movdq2q %xmm0, %mm0
; X64-NEXT:    movdq2q %xmm1, %mm5
; X64-NEXT:    movq %mm5, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; X64-NEXT:    movq %mm0, %mm3
; X64-NEXT:    paddd %mm5, %mm3
; X64-NEXT:    pxor %mm1, %mm1
; X64-NEXT:    movq %mm3, %mm6
; X64-NEXT:    pmuludq %mm1, %mm6
; X64-NEXT:    movdq2q %xmm2, %mm4
; X64-NEXT:    movq %mm6, %mm2
; X64-NEXT:    paddd %mm4, %mm2
; X64-NEXT:    paddw %mm2, %mm0
; X64-NEXT:    movq %mm5, %mm1
; X64-NEXT:    paddw %mm0, %mm1
; X64-NEXT:    movdq2q %xmm3, %mm5
; X64-NEXT:    movq %mm1, %mm7
; X64-NEXT:    pmuludq %mm5, %mm7
; X64-NEXT:    paddw %mm4, %mm7
; X64-NEXT:    paddw %mm7, %mm5
; X64-NEXT:    paddw %mm5, %mm2
; X64-NEXT:    paddw %mm2, %mm0
; X64-NEXT:    paddw %mm6, %mm0
; X64-NEXT:    pmuludq %mm3, %mm0
; X64-NEXT:    paddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %mm0
; X64-NEXT:    paddw %mm1, %mm0
; X64-NEXT:    pmuludq %mm7, %mm0
; X64-NEXT:    pmuludq {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
; X64-NEXT:    paddw %mm5, %mm0
; X64-NEXT:    paddw %mm2, %mm0
; X64-NEXT:    movq2dq %mm0, %xmm0
; X64-NEXT:    retq
;
; X64-LARGE-LABEL: mmx_zero:
; X64-LARGE:       # %bb.0:
; X64-LARGE-NEXT:    movdq2q %xmm0, %mm0
; X64-LARGE-NEXT:    movdq2q %xmm1, %mm5
; X64-LARGE-NEXT:    movq %mm5, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; X64-LARGE-NEXT:    movq %mm0, %mm3
; X64-LARGE-NEXT:    paddd %mm5, %mm3
; X64-LARGE-NEXT:    pxor %mm1, %mm1
; X64-LARGE-NEXT:    movq %mm3, %mm6
; X64-LARGE-NEXT:    pmuludq %mm1, %mm6
; X64-LARGE-NEXT:    movdq2q %xmm2, %mm4
; X64-LARGE-NEXT:    movq %mm6, %mm2
; X64-LARGE-NEXT:    paddd %mm4, %mm2
; X64-LARGE-NEXT:    paddw %mm2, %mm0
; X64-LARGE-NEXT:    movq %mm5, %mm1
; X64-LARGE-NEXT:    paddw %mm0, %mm1
; X64-LARGE-NEXT:    movdq2q %xmm3, %mm5
; X64-LARGE-NEXT:    movq %mm1, %mm7
; X64-LARGE-NEXT:    pmuludq %mm5, %mm7
; X64-LARGE-NEXT:    paddw %mm4, %mm7
; X64-LARGE-NEXT:    paddw %mm7, %mm5
; X64-LARGE-NEXT:    paddw %mm5, %mm2
; X64-LARGE-NEXT:    paddw %mm2, %mm0
; X64-LARGE-NEXT:    paddw %mm6, %mm0
; X64-LARGE-NEXT:    pmuludq %mm3, %mm0
; X64-LARGE-NEXT:    pxor %mm3, %mm3
; X64-LARGE-NEXT:    paddw %mm3, %mm0
; X64-LARGE-NEXT:    paddw %mm1, %mm0
; X64-LARGE-NEXT:    pmuludq %mm7, %mm0
; X64-LARGE-NEXT:    pmuludq {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
; X64-LARGE-NEXT:    paddw %mm5, %mm0
; X64-LARGE-NEXT:    paddw %mm2, %mm0
; X64-LARGE-NEXT:    movq2dq %mm0, %xmm0
; X64-LARGE-NEXT:    retq
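; The long chain of padd/pmulu.dq calls below keeps enough <1 x i64> values
; live at once to force an 8-byte spill and a folded reload of one MMX
; register, as checked by the Spill/Folded Reload lines above.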
  %5 = bitcast double %0 to <1 x i64>
  %6 = bitcast double %1 to <1 x i64>
  %7 = tail call <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64> %5, <1 x i64> %6)
  %8 = tail call <1 x i64> @llvm.x86.mmx.pmulu.dq(<1 x i64> %7, <1 x i64> bitcast (double 0.000000e+00 to <1 x i64>))
  %9 = bitcast double %2 to <1 x i64>
  %10 = tail call <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64> %8, <1 x i64> %9)
  %11 = tail call <1 x i64> @llvm.x86.mmx.padd.w(<1 x i64> %5, <1 x i64> %10)
  %12 = tail call <1 x i64> @llvm.x86.mmx.padd.w(<1 x i64> %6, <1 x i64> %11)
  %13 = bitcast double %3 to <1 x i64>
  %14 = tail call <1 x i64> @llvm.x86.mmx.pmulu.dq(<1 x i64> %12, <1 x i64> %13)
  %15 = tail call <1 x i64> @llvm.x86.mmx.padd.w(<1 x i64> %14, <1 x i64> %9)
  %16 = tail call <1 x i64> @llvm.x86.mmx.padd.w(<1 x i64> %15, <1 x i64> %13)
  %17 = tail call <1 x i64> @llvm.x86.mmx.padd.w(<1 x i64> %16, <1 x i64> %10)
  %18 = tail call <1 x i64> @llvm.x86.mmx.padd.w(<1 x i64> %17, <1 x i64> %11)
  %19 = tail call <1 x i64> @llvm.x86.mmx.padd.w(<1 x i64> %18, <1 x i64> %8)
  %20 = tail call <1 x i64> @llvm.x86.mmx.pmulu.dq(<1 x i64> %19, <1 x i64> %7)
  %21 = tail call <1 x i64> @llvm.x86.mmx.padd.w(<1 x i64> %20, <1 x i64> bitcast (double 0.000000e+00 to <1 x i64>))
  %22 = tail call <1 x i64> @llvm.x86.mmx.padd.w(<1 x i64> %21, <1 x i64> %12)
  %23 = tail call <1 x i64> @llvm.x86.mmx.pmulu.dq(<1 x i64> %22, <1 x i64> %15)
  %24 = tail call <1 x i64> @llvm.x86.mmx.pmulu.dq(<1 x i64> %23, <1 x i64> %6)
  %25 = tail call <1 x i64> @llvm.x86.mmx.padd.w(<1 x i64> %24, <1 x i64> %16)
  %26 = tail call <1 x i64> @llvm.x86.mmx.padd.w(<1 x i64> %25, <1 x i64> %17)
  %27 = bitcast <1 x i64> %26 to double
  ret double %27
}

declare <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64>, <1 x i64>)
declare <1 x i64> @llvm.x86.mmx.padd.w(<1 x i64>, <1 x i64>)
declare <1 x i64> @llvm.x86.mmx.pmulu.dq(<1 x i64>, <1 x i64>)