1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+mmx,+sse2 | FileCheck %s --check-prefix=X86
3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+mmx,+sse2 | FileCheck %s --check-prefix=X64
5 ; If we are transferring XMM conversion results to MMX registers, we can use the MMX equivalents
6 ; (CVTPD2PI/CVTTPD2PI + CVTPS2PI/CVTTPS2PI) without affecting rounding/exceptions, etc.
; cvt_v2f64_v2i32: converts the <2 x double> argument with the
; llvm.x86.sse2.cvtpd2dq intrinsic, reinterprets the low 64 bits of the
; <4 x i32> result as x86_mmx, adds that value to itself with
; llvm.x86.mmx.padd.d, and stores the sum through the <1 x i64>* argument.
; The assertions for both run lines expect the conversion to be emitted as
; cvtpd2pi writing an MMX register directly.
8 define void @cvt_v2f64_v2i32(<2 x double>, <1 x i64>*) nounwind {
9 ; X86-LABEL: cvt_v2f64_v2i32:
11 ; X86-NEXT: pushl %ebp
12 ; X86-NEXT: movl %esp, %ebp
13 ; X86-NEXT: andl $-8, %esp
14 ; X86-NEXT: subl $8, %esp
15 ; X86-NEXT: movl 8(%ebp), %eax
16 ; X86-NEXT: cvtpd2pi %xmm0, %mm0
17 ; X86-NEXT: paddd %mm0, %mm0
18 ; X86-NEXT: movq %mm0, (%esp)
19 ; X86-NEXT: movl (%esp), %ecx
20 ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
21 ; X86-NEXT: movl %edx, 4(%eax)
22 ; X86-NEXT: movl %ecx, (%eax)
23 ; X86-NEXT: movl %ebp, %esp
27 ; X64-LABEL: cvt_v2f64_v2i32:
29 ; X64-NEXT: cvtpd2pi %xmm0, %mm0
30 ; X64-NEXT: paddd %mm0, %mm0
31 ; X64-NEXT: movq %mm0, (%rdi)
; Rounding conversion intrinsic; only element 0 of the <2 x i64> view (the
; two converted i32 lanes) is forwarded to the MMX add.
33 %3 = tail call <4 x i32> @llvm.x86.sse2.cvtpd2dq(<2 x double> %0)
34 %4 = bitcast <4 x i32> %3 to <2 x i64>
35 %5 = extractelement <2 x i64> %4, i32 0
36 %6 = bitcast i64 %5 to x86_mmx
37 %7 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %6, x86_mmx %6)
38 %8 = bitcast x86_mmx %7 to i64
39 %9 = insertelement <1 x i64> undef, i64 %8, i32 0
40 store <1 x i64> %9, <1 x i64>* %1
; cvtt_v2f64_v2i32: same data flow as the cvtpd2dq variant, but the
; conversion uses the llvm.x86.sse2.cvttpd2dq intrinsic. The low 64 bits of
; the result are treated as x86_mmx, added to themselves with
; llvm.x86.mmx.padd.d, and stored through the <1 x i64>* argument.
; The assertions for both run lines expect cvttpd2pi into an MMX register.
44 define void @cvtt_v2f64_v2i32(<2 x double>, <1 x i64>*) nounwind {
45 ; X86-LABEL: cvtt_v2f64_v2i32:
47 ; X86-NEXT: pushl %ebp
48 ; X86-NEXT: movl %esp, %ebp
49 ; X86-NEXT: andl $-8, %esp
50 ; X86-NEXT: subl $8, %esp
51 ; X86-NEXT: movl 8(%ebp), %eax
52 ; X86-NEXT: cvttpd2pi %xmm0, %mm0
53 ; X86-NEXT: paddd %mm0, %mm0
54 ; X86-NEXT: movq %mm0, (%esp)
55 ; X86-NEXT: movl (%esp), %ecx
56 ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
57 ; X86-NEXT: movl %edx, 4(%eax)
58 ; X86-NEXT: movl %ecx, (%eax)
59 ; X86-NEXT: movl %ebp, %esp
63 ; X64-LABEL: cvtt_v2f64_v2i32:
65 ; X64-NEXT: cvttpd2pi %xmm0, %mm0
66 ; X64-NEXT: paddd %mm0, %mm0
67 ; X64-NEXT: movq %mm0, (%rdi)
; Only element 0 of the <2 x i64> view of the conversion result is used.
69 %3 = tail call <4 x i32> @llvm.x86.sse2.cvttpd2dq(<2 x double> %0)
70 %4 = bitcast <4 x i32> %3 to <2 x i64>
71 %5 = extractelement <2 x i64> %4, i32 0
72 %6 = bitcast i64 %5 to x86_mmx
73 %7 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %6, x86_mmx %6)
74 %8 = bitcast x86_mmx %7 to i64
75 %9 = insertelement <1 x i64> undef, i64 %8, i32 0
76 store <1 x i64> %9, <1 x i64>* %1
; fptosi_v2f64_v2i32: uses the generic fptosi instruction (no target
; intrinsic) to convert <2 x double> to <2 x i32>, bitcasts the result
; straight to x86_mmx, adds it to itself with llvm.x86.mmx.padd.d, and
; stores the sum through the <1 x i64>* argument.
; The assertions for both run lines expect the same cvttpd2pi sequence as
; the cvttpd2dq intrinsic test.
80 define void @fptosi_v2f64_v2i32(<2 x double>, <1 x i64>*) nounwind {
81 ; X86-LABEL: fptosi_v2f64_v2i32:
83 ; X86-NEXT: pushl %ebp
84 ; X86-NEXT: movl %esp, %ebp
85 ; X86-NEXT: andl $-8, %esp
86 ; X86-NEXT: subl $8, %esp
87 ; X86-NEXT: movl 8(%ebp), %eax
88 ; X86-NEXT: cvttpd2pi %xmm0, %mm0
89 ; X86-NEXT: paddd %mm0, %mm0
90 ; X86-NEXT: movq %mm0, (%esp)
91 ; X86-NEXT: movl (%esp), %ecx
92 ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
93 ; X86-NEXT: movl %edx, 4(%eax)
94 ; X86-NEXT: movl %ecx, (%eax)
95 ; X86-NEXT: movl %ebp, %esp
99 ; X64-LABEL: fptosi_v2f64_v2i32:
101 ; X64-NEXT: cvttpd2pi %xmm0, %mm0
102 ; X64-NEXT: paddd %mm0, %mm0
103 ; X64-NEXT: movq %mm0, (%rdi)
; The <2 x i32> result is already 64 bits wide, so it is bitcast to x86_mmx
; without an intermediate extractelement.
105 %3 = fptosi <2 x double> %0 to <2 x i32>
106 %4 = bitcast <2 x i32> %3 to x86_mmx
107 %5 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %4, x86_mmx %4)
108 %6 = bitcast x86_mmx %5 to i64
109 %7 = insertelement <1 x i64> undef, i64 %6, i32 0
110 store <1 x i64> %7, <1 x i64>* %1
; cvt_v2f32_v2i32: converts the <4 x float> argument with the
; llvm.x86.sse2.cvtps2dq intrinsic, keeps only the low 64 bits (two i32
; lanes) as x86_mmx, adds that value to itself with llvm.x86.mmx.padd.d, and
; stores the sum through the <1 x i64>* argument.
; The assertions for both run lines expect cvtps2pi into an MMX register.
114 define void @cvt_v2f32_v2i32(<4 x float>, <1 x i64>*) nounwind {
115 ; X86-LABEL: cvt_v2f32_v2i32:
117 ; X86-NEXT: pushl %ebp
118 ; X86-NEXT: movl %esp, %ebp
119 ; X86-NEXT: andl $-8, %esp
120 ; X86-NEXT: subl $8, %esp
121 ; X86-NEXT: movl 8(%ebp), %eax
122 ; X86-NEXT: cvtps2pi %xmm0, %mm0
123 ; X86-NEXT: paddd %mm0, %mm0
124 ; X86-NEXT: movq %mm0, (%esp)
125 ; X86-NEXT: movl (%esp), %ecx
126 ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
127 ; X86-NEXT: movl %edx, 4(%eax)
128 ; X86-NEXT: movl %ecx, (%eax)
129 ; X86-NEXT: movl %ebp, %esp
130 ; X86-NEXT: popl %ebp
133 ; X64-LABEL: cvt_v2f32_v2i32:
135 ; X64-NEXT: cvtps2pi %xmm0, %mm0
136 ; X64-NEXT: paddd %mm0, %mm0
137 ; X64-NEXT: movq %mm0, (%rdi)
; All four lanes are converted in IR, but only element 0 of the <2 x i64>
; view (lanes 0 and 1) reaches the MMX add.
139 %3 = tail call <4 x i32> @llvm.x86.sse2.cvtps2dq(<4 x float> %0)
140 %4 = bitcast <4 x i32> %3 to <2 x i64>
141 %5 = extractelement <2 x i64> %4, i32 0
142 %6 = bitcast i64 %5 to x86_mmx
143 %7 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %6, x86_mmx %6)
144 %8 = bitcast x86_mmx %7 to i64
145 %9 = insertelement <1 x i64> undef, i64 %8, i32 0
146 store <1 x i64> %9, <1 x i64>* %1
; cvtt_v2f32_v2i32: same data flow as the cvtps2dq variant, but the
; conversion uses the llvm.x86.sse2.cvttps2dq intrinsic. The low 64 bits of
; the result are treated as x86_mmx, added to themselves with
; llvm.x86.mmx.padd.d, and stored through the <1 x i64>* argument.
; The assertions for both run lines expect cvttps2pi into an MMX register.
150 define void @cvtt_v2f32_v2i32(<4 x float>, <1 x i64>*) nounwind {
151 ; X86-LABEL: cvtt_v2f32_v2i32:
153 ; X86-NEXT: pushl %ebp
154 ; X86-NEXT: movl %esp, %ebp
155 ; X86-NEXT: andl $-8, %esp
156 ; X86-NEXT: subl $8, %esp
157 ; X86-NEXT: movl 8(%ebp), %eax
158 ; X86-NEXT: cvttps2pi %xmm0, %mm0
159 ; X86-NEXT: paddd %mm0, %mm0
160 ; X86-NEXT: movq %mm0, (%esp)
161 ; X86-NEXT: movl (%esp), %ecx
162 ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
163 ; X86-NEXT: movl %edx, 4(%eax)
164 ; X86-NEXT: movl %ecx, (%eax)
165 ; X86-NEXT: movl %ebp, %esp
166 ; X86-NEXT: popl %ebp
169 ; X64-LABEL: cvtt_v2f32_v2i32:
171 ; X64-NEXT: cvttps2pi %xmm0, %mm0
172 ; X64-NEXT: paddd %mm0, %mm0
173 ; X64-NEXT: movq %mm0, (%rdi)
; Only element 0 of the <2 x i64> view of the conversion result is used.
175 %3 = tail call <4 x i32> @llvm.x86.sse2.cvttps2dq(<4 x float> %0)
176 %4 = bitcast <4 x i32> %3 to <2 x i64>
177 %5 = extractelement <2 x i64> %4, i32 0
178 %6 = bitcast i64 %5 to x86_mmx
179 %7 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %6, x86_mmx %6)
180 %8 = bitcast x86_mmx %7 to i64
181 %9 = insertelement <1 x i64> undef, i64 %8, i32 0
182 store <1 x i64> %9, <1 x i64>* %1
; fptosi_v4f32_v4i32: converts <4 x float> to <4 x i32> with the generic
; fptosi instruction, selects lanes 0 and 1 with a shufflevector, bitcasts
; that <2 x i32> to x86_mmx, adds it to itself with llvm.x86.mmx.padd.d, and
; stores the sum through the <1 x i64>* argument.
; The assertions for both run lines expect cvttps2pi into an MMX register.
186 define void @fptosi_v4f32_v4i32(<4 x float>, <1 x i64>*) nounwind {
187 ; X86-LABEL: fptosi_v4f32_v4i32:
189 ; X86-NEXT: pushl %ebp
190 ; X86-NEXT: movl %esp, %ebp
191 ; X86-NEXT: andl $-8, %esp
192 ; X86-NEXT: subl $8, %esp
193 ; X86-NEXT: movl 8(%ebp), %eax
194 ; X86-NEXT: cvttps2pi %xmm0, %mm0
195 ; X86-NEXT: paddd %mm0, %mm0
196 ; X86-NEXT: movq %mm0, (%esp)
197 ; X86-NEXT: movl (%esp), %ecx
198 ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
199 ; X86-NEXT: movl %edx, 4(%eax)
200 ; X86-NEXT: movl %ecx, (%eax)
201 ; X86-NEXT: movl %ebp, %esp
202 ; X86-NEXT: popl %ebp
205 ; X64-LABEL: fptosi_v4f32_v4i32:
207 ; X64-NEXT: cvttps2pi %xmm0, %mm0
208 ; X64-NEXT: paddd %mm0, %mm0
209 ; X64-NEXT: movq %mm0, (%rdi)
; The low half is taken with a shufflevector (mask <0, 1>) rather than a
; bitcast + extractelement.
211 %3 = fptosi <4 x float> %0 to <4 x i32>
212 %4 = shufflevector <4 x i32> %3, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
213 %5 = bitcast <2 x i32> %4 to x86_mmx
214 %6 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %5, x86_mmx %5)
215 %7 = bitcast x86_mmx %6 to i64
216 %8 = insertelement <1 x i64> undef, i64 %7, i32 0
217 store <1 x i64> %8, <1 x i64>* %1
; fptosi_v2f32_v2i32: converts <4 x float> to <4 x i32> with the generic
; fptosi instruction, takes the low 64 bits via bitcast to <2 x i64> and
; extractelement 0, reinterprets them as x86_mmx, adds the value to itself
; with llvm.x86.mmx.padd.d, and stores the sum through the <1 x i64>*
; argument.
; The assertions for both run lines expect cvttps2pi into an MMX register.
221 define void @fptosi_v2f32_v2i32(<4 x float>, <1 x i64>*) nounwind {
222 ; X86-LABEL: fptosi_v2f32_v2i32:
224 ; X86-NEXT: pushl %ebp
225 ; X86-NEXT: movl %esp, %ebp
226 ; X86-NEXT: andl $-8, %esp
227 ; X86-NEXT: subl $8, %esp
228 ; X86-NEXT: movl 8(%ebp), %eax
229 ; X86-NEXT: cvttps2pi %xmm0, %mm0
230 ; X86-NEXT: paddd %mm0, %mm0
231 ; X86-NEXT: movq %mm0, (%esp)
232 ; X86-NEXT: movl (%esp), %ecx
233 ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
234 ; X86-NEXT: movl %edx, 4(%eax)
235 ; X86-NEXT: movl %ecx, (%eax)
236 ; X86-NEXT: movl %ebp, %esp
237 ; X86-NEXT: popl %ebp
240 ; X64-LABEL: fptosi_v2f32_v2i32:
242 ; X64-NEXT: cvttps2pi %xmm0, %mm0
243 ; X64-NEXT: paddd %mm0, %mm0
244 ; X64-NEXT: movq %mm0, (%rdi)
; The low half is taken with bitcast + extractelement rather than a
; shufflevector.
246 %3 = fptosi <4 x float> %0 to <4 x i32>
247 %4 = bitcast <4 x i32> %3 to <2 x i64>
248 %5 = extractelement <2 x i64> %4, i32 0
249 %6 = bitcast i64 %5 to x86_mmx
250 %7 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %6, x86_mmx %6)
251 %8 = bitcast x86_mmx %7 to i64
252 %9 = insertelement <1 x i64> undef, i64 %8, i32 0
253 store <1 x i64> %9, <1 x i64>* %1
257 ; FIXME: If we are transferring MMX registers to XMM for conversion we could use the MMX equivalents
258 ; (CVTPI2PD + CVTPI2PS) without affecting rounding/exceptions etc.
; sitofp_v2i32_v2f64: loads an x86_mmx value through the pointer argument,
; adds it to itself with llvm.x86.mmx.padd.d, moves the 64-bit result into
; the low lanes of a <4 x i32>, selects lanes 0 and 1, and converts them to
; <2 x double> with the generic sitofp instruction.
; Expected output: the i686 run spills the MMX register to the stack and
; feeds cvtdq2pd from memory; the x86_64 run uses movq2dq followed by
; cvtdq2pd. Neither uses an MMX-source conversion instruction.
260 define <2 x double> @sitofp_v2i32_v2f64(<1 x i64>*) nounwind {
261 ; X86-LABEL: sitofp_v2i32_v2f64:
263 ; X86-NEXT: pushl %ebp
264 ; X86-NEXT: movl %esp, %ebp
265 ; X86-NEXT: andl $-8, %esp
266 ; X86-NEXT: subl $8, %esp
267 ; X86-NEXT: movl 8(%ebp), %eax
268 ; X86-NEXT: movq (%eax), %mm0
269 ; X86-NEXT: paddd %mm0, %mm0
270 ; X86-NEXT: movq %mm0, (%esp)
271 ; X86-NEXT: cvtdq2pd (%esp), %xmm0
272 ; X86-NEXT: movl %ebp, %esp
273 ; X86-NEXT: popl %ebp
276 ; X64-LABEL: sitofp_v2i32_v2f64:
278 ; X64-NEXT: movq (%rdi), %mm0
279 ; X64-NEXT: paddd %mm0, %mm0
280 ; X64-NEXT: movq2dq %mm0, %xmm0
281 ; X64-NEXT: cvtdq2pd %xmm0, %xmm0
; The pointer is reinterpreted as x86_mmx* so the load itself is an MMX load.
283 %2 = bitcast <1 x i64>* %0 to x86_mmx*
284 %3 = load x86_mmx, x86_mmx* %2, align 8
285 %4 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %3, x86_mmx %3)
286 %5 = bitcast x86_mmx %4 to i64
287 %6 = insertelement <2 x i64> undef, i64 %5, i32 0
288 %7 = bitcast <2 x i64> %6 to <4 x i32>
289 %8 = shufflevector <4 x i32> %7, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
290 %9 = sitofp <2 x i32> %8 to <2 x double>
; sitofp_v2i32_v2f32: loads an x86_mmx value through the pointer argument,
; adds it to itself with llvm.x86.mmx.padd.d, bitcasts the result to
; <2 x i32>, widens it to <4 x i32> by shuffling with a zero vector (upper
; two lanes come from zeroinitializer), and converts to <4 x float> with the
; generic sitofp instruction.
; Expected output: both runs store the MMX register to the stack, reload the
; low 64 bits with movsd, and convert with cvtdq2ps.
294 define <4 x float> @sitofp_v2i32_v2f32(<1 x i64>*) nounwind {
295 ; X86-LABEL: sitofp_v2i32_v2f32:
297 ; X86-NEXT: pushl %ebp
298 ; X86-NEXT: movl %esp, %ebp
299 ; X86-NEXT: andl $-8, %esp
300 ; X86-NEXT: subl $8, %esp
301 ; X86-NEXT: movl 8(%ebp), %eax
302 ; X86-NEXT: movq (%eax), %mm0
303 ; X86-NEXT: paddd %mm0, %mm0
304 ; X86-NEXT: movq %mm0, (%esp)
305 ; X86-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
306 ; X86-NEXT: cvtdq2ps %xmm0, %xmm0
307 ; X86-NEXT: movl %ebp, %esp
308 ; X86-NEXT: popl %ebp
311 ; X64-LABEL: sitofp_v2i32_v2f32:
313 ; X64-NEXT: movq (%rdi), %mm0
314 ; X64-NEXT: paddd %mm0, %mm0
315 ; X64-NEXT: movq %mm0, -{{[0-9]+}}(%rsp)
316 ; X64-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
317 ; X64-NEXT: cvtdq2ps %xmm0, %xmm0
; Shuffle mask indices 2 and 3 select from the second (all-zero) operand.
319 %2 = bitcast <1 x i64>* %0 to x86_mmx*
320 %3 = load x86_mmx, x86_mmx* %2, align 8
321 %4 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %3, x86_mmx %3)
322 %5 = bitcast x86_mmx %4 to <2 x i32>
323 %6 = shufflevector <2 x i32> %5, <2 x i32> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
324 %7 = sitofp <4 x i32> %6 to <4 x float>
; cvt_v2i32_v2f32: loads an x86_mmx value through the pointer argument, adds
; it to itself with llvm.x86.mmx.padd.d, places the 64-bit result in lane 0
; of a <2 x i64> with lane 1 set to zero, reinterprets that as <4 x i32>,
; and converts with the llvm.x86.sse2.cvtdq2ps intrinsic.
; Expected output: the i686 run goes through the stack and a movsd reload;
; the x86_64 run moves MMX -> %rax -> %xmm0 with two movq instructions.
; Both then use cvtdq2ps.
328 define <4 x float> @cvt_v2i32_v2f32(<1 x i64>*) nounwind {
329 ; X86-LABEL: cvt_v2i32_v2f32:
331 ; X86-NEXT: pushl %ebp
332 ; X86-NEXT: movl %esp, %ebp
333 ; X86-NEXT: andl $-8, %esp
334 ; X86-NEXT: subl $8, %esp
335 ; X86-NEXT: movl 8(%ebp), %eax
336 ; X86-NEXT: movq (%eax), %mm0
337 ; X86-NEXT: paddd %mm0, %mm0
338 ; X86-NEXT: movq %mm0, (%esp)
339 ; X86-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
340 ; X86-NEXT: cvtdq2ps %xmm0, %xmm0
341 ; X86-NEXT: movl %ebp, %esp
342 ; X86-NEXT: popl %ebp
345 ; X64-LABEL: cvt_v2i32_v2f32:
347 ; X64-NEXT: movq (%rdi), %mm0
348 ; X64-NEXT: paddd %mm0, %mm0
349 ; X64-NEXT: movq %mm0, %rax
350 ; X64-NEXT: movq %rax, %xmm0
351 ; X64-NEXT: cvtdq2ps %xmm0, %xmm0
; The upper 64 bits are explicitly zeroed (i64 0 inserted at index 1) before
; the intrinsic call.
353 %2 = bitcast <1 x i64>* %0 to x86_mmx*
354 %3 = load x86_mmx, x86_mmx* %2, align 8
355 %4 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %3, x86_mmx %3)
356 %5 = bitcast x86_mmx %4 to i64
357 %6 = insertelement <2 x i64> undef, i64 %5, i32 0
358 %7 = insertelement <2 x i64> %6, i64 0, i32 1
359 %8 = bitcast <2 x i64> %7 to <4 x i32>
360 %9 = tail call <4 x float> @llvm.x86.sse2.cvtdq2ps(<4 x i32> %8)
; Declarations of the target intrinsics called by the tests in this file:
; the MMX packed 32-bit add, the four SSE2 floating-point -> i32 conversions
; (cvtpd2dq/cvttpd2dq/cvtps2dq/cvttps2dq), and the SSE2 i32 -> float
; conversion (cvtdq2ps).
364 declare x86_mmx @llvm.x86.mmx.padd.d(x86_mmx, x86_mmx)
365 declare <4 x i32> @llvm.x86.sse2.cvtpd2dq(<2 x double>)
366 declare <4 x i32> @llvm.x86.sse2.cvttpd2dq(<2 x double>)
367 declare <4 x i32> @llvm.x86.sse2.cvtps2dq(<4 x float>)
368 declare <4 x i32> @llvm.x86.sse2.cvttps2dq(<4 x float>)
369 declare <4 x float> @llvm.x86.sse2.cvtdq2ps(<4 x i32>)