; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X86
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X64
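
; f1: with 'fast' on every fadd, (a + a) + (a + a) reassociates to 4.0 * a, so
; both targets fold the whole function into a single mulsd by a constant-pool value.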
define double @f1(double %a) {
; X86-LABEL: f1:
; X86: # %bb.0:
; X86-NEXT: pushl %ebp
; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: .cfi_offset %ebp, -8
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: .cfi_def_cfa_register %ebp
; X86-NEXT: andl $-8, %esp
; X86-NEXT: subl $8, %esp
; X86-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; X86-NEXT: mulsd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-NEXT: movsd %xmm0, (%esp)
; X86-NEXT: fldl (%esp)
; X86-NEXT: movl %ebp, %esp
; X86-NEXT: popl %ebp
; X86-NEXT: .cfi_def_cfa %esp, 4
; X86-NEXT: retl
;
; X64-LABEL: f1:
; X64: # %bb.0:
; X64-NEXT: mulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; X64-NEXT: retq
  %1 = fadd fast double %a, %a
  %2 = fadd fast double %a, %a
  %3 = fadd fast double %1, %2
  ret double %3
}
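
; f2: the same sum, but llvm.arithmetic.fence on the first add blocks
; reassociation across the fence: the adds stay as addsd and the fence
; survives as the #ARITH_FENCE pseudo.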
define double @f2(double %a) {
; X86-LABEL: f2:
; X86: # %bb.0:
; X86-NEXT: pushl %ebp
; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: .cfi_offset %ebp, -8
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: .cfi_def_cfa_register %ebp
; X86-NEXT: andl $-8, %esp
; X86-NEXT: subl $8, %esp
; X86-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; X86-NEXT: addsd %xmm0, %xmm0
; X86-NEXT: movapd %xmm0, %xmm1
; X86-NEXT: #ARITH_FENCE
; X86-NEXT: addsd %xmm0, %xmm1
; X86-NEXT: movsd %xmm1, (%esp)
; X86-NEXT: fldl (%esp)
; X86-NEXT: movl %ebp, %esp
; X86-NEXT: popl %ebp
; X86-NEXT: .cfi_def_cfa %esp, 4
; X86-NEXT: retl
;
; X64-LABEL: f2:
; X64: # %bb.0:
; X64-NEXT: addsd %xmm0, %xmm0
; X64-NEXT: movapd %xmm0, %xmm1
; X64-NEXT: #ARITH_FENCE
; X64-NEXT: addsd %xmm1, %xmm0
; X64-NEXT: retq
  %1 = fadd fast double %a, %a
  %t = call double @llvm.arithmetic.fence.f64(double %1)
  %2 = fadd fast double %a, %a
  %3 = fadd fast double %t, %2
  ret double %3
}
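
; f3: the f1 pattern on <2 x float>; fast math again folds the four-way sum
; into a single mulps by 4.0.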
define <2 x float> @f3(<2 x float> %a) {
; X86-LABEL: f3:
; X86: # %bb.0:
; X86-NEXT: mulps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-NEXT: retl
;
; X64-LABEL: f3:
; X64: # %bb.0:
; X64-NEXT: mulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; X64-NEXT: retq
  %1 = fadd fast <2 x float> %a, %a
  %2 = fadd fast <2 x float> %a, %a
  %3 = fadd fast <2 x float> %1, %2
  ret <2 x float> %3
}
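
; f4: the f2 pattern on <2 x float>; the fence keeps both addps instructions
; with #ARITH_FENCE between them.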
define <2 x float> @f4(<2 x float> %a) {
; X86-LABEL: f4:
; X86: # %bb.0:
; X86-NEXT: addps %xmm0, %xmm0
; X86-NEXT: movaps %xmm0, %xmm1
; X86-NEXT: #ARITH_FENCE
; X86-NEXT: addps %xmm1, %xmm0
; X86-NEXT: retl
;
; X64-LABEL: f4:
; X64: # %bb.0:
; X64-NEXT: addps %xmm0, %xmm0
; X64-NEXT: movaps %xmm0, %xmm1
; X64-NEXT: #ARITH_FENCE
; X64-NEXT: addps %xmm1, %xmm0
; X64-NEXT: retq
  %1 = fadd fast <2 x float> %a, %a
  %t = call <2 x float> @llvm.arithmetic.fence.v2f32(<2 x float> %1)
  %2 = fadd fast <2 x float> %a, %a
  %3 = fadd fast <2 x float> %t, %2
  ret <2 x float> %3
}
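
; f5: <8 x float> is legalized as two <4 x float> registers under SSE2, so the
; folded 4.0 multiply is applied once per 128-bit half.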
define <8 x float> @f5(<8 x float> %a) {
; X86-LABEL: f5:
; X86: # %bb.0:
; X86-NEXT: movaps {{.*#+}} xmm2 = [4.0E+0,4.0E+0,4.0E+0,4.0E+0]
; X86-NEXT: mulps %xmm2, %xmm0
; X86-NEXT: mulps %xmm2, %xmm1
; X86-NEXT: retl
;
; X64-LABEL: f5:
; X64: # %bb.0:
; X64-NEXT: movaps {{.*#+}} xmm2 = [4.0E+0,4.0E+0,4.0E+0,4.0E+0]
; X64-NEXT: mulps %xmm2, %xmm0
; X64-NEXT: mulps %xmm2, %xmm1
; X64-NEXT: retq
  %1 = fadd fast <8 x float> %a, %a
  %2 = fadd fast <8 x float> %a, %a
  %3 = fadd fast <8 x float> %1, %2
  ret <8 x float> %3
}
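
; f6: the fenced <8 x float> variant; each 128-bit half gets its own
; add, #ARITH_FENCE, add sequence.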
define <8 x float> @f6(<8 x float> %a) {
; X86-LABEL: f6:
; X86: # %bb.0:
; X86-NEXT: addps %xmm0, %xmm0
; X86-NEXT: addps %xmm1, %xmm1
; X86-NEXT: movaps %xmm1, %xmm2
; X86-NEXT: #ARITH_FENCE
; X86-NEXT: movaps %xmm0, %xmm3
; X86-NEXT: #ARITH_FENCE
; X86-NEXT: addps %xmm3, %xmm0
; X86-NEXT: addps %xmm2, %xmm1
; X86-NEXT: retl
;
; X64-LABEL: f6:
; X64: # %bb.0:
; X64-NEXT: addps %xmm0, %xmm0
; X64-NEXT: addps %xmm1, %xmm1
; X64-NEXT: movaps %xmm1, %xmm2
; X64-NEXT: #ARITH_FENCE
; X64-NEXT: movaps %xmm0, %xmm3
; X64-NEXT: #ARITH_FENCE
; X64-NEXT: addps %xmm3, %xmm0
; X64-NEXT: addps %xmm2, %xmm1
; X64-NEXT: retq
  %1 = fadd fast <8 x float> %a, %a
  %t = call <8 x float> @llvm.arithmetic.fence.v8f32(<8 x float> %1)
  %2 = fadd fast <8 x float> %a, %a
  %3 = fadd fast <8 x float> %t, %2
  ret <8 x float> %3
}
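
; f7: a bare fence on a half value; there is nothing to reassociate, so only
; the #ARITH_FENCE pseudo appears (plus an argument load on i686).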
define half @f7(half %a) nounwind {
; X86-LABEL: f7:
; X86: # %bb.0:
; X86-NEXT: pinsrw $0, {{[0-9]+}}(%esp), %xmm0
; X86-NEXT: #ARITH_FENCE
; X86-NEXT: retl
;
; X64-LABEL: f7:
; X64: # %bb.0:
; X64-NEXT: #ARITH_FENCE
; X64-NEXT: retq
  %b = call half @llvm.arithmetic.fence.f16(half %a)
  ret half %b
}
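
; f8: a bare fence on bfloat; the value is round-tripped through a GPR
; (a stack load on i686, pextrw on x86-64) around the fence.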
define bfloat @f8(bfloat %a) nounwind {
; X86-LABEL: f8:
; X86: # %bb.0:
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: #ARITH_FENCE
; X86-NEXT: pinsrw $0, %eax, %xmm0
; X86-NEXT: retl
;
; X64-LABEL: f8:
; X64: # %bb.0:
; X64-NEXT: pextrw $0, %xmm0, %eax
; X64-NEXT: #ARITH_FENCE
; X64-NEXT: pinsrw $0, %eax, %xmm0
; X64-NEXT: retq
  %b = call bfloat @llvm.arithmetic.fence.bf16(bfloat %a)
  ret bfloat %b
}
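
; f9: fencing <2 x half> fences each element: psrld $16 splits out the high
; element and punpcklwd reassembles the pair after the two fences.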
define <2 x half> @f9(<2 x half> %a) nounwind {
; X86-LABEL: f9:
; X86: # %bb.0:
; X86-NEXT: movdqa %xmm0, %xmm1
; X86-NEXT: psrld $16, %xmm1
; X86-NEXT: #ARITH_FENCE
; X86-NEXT: #ARITH_FENCE
; X86-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; X86-NEXT: retl
;
; X64-LABEL: f9:
; X64: # %bb.0:
; X64-NEXT: movdqa %xmm0, %xmm1
; X64-NEXT: psrld $16, %xmm1
; X64-NEXT: #ARITH_FENCE
; X64-NEXT: #ARITH_FENCE
; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; X64-NEXT: retq
  %b = call <2 x half> @llvm.arithmetic.fence.v2f16(<2 x half> %a)
  ret <2 x half> %b
}
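
; f10: fencing <3 x bfloat> scalarizes: each element is extracted with pextrw,
; fenced, and reassembled with pinsrw plus punpck shuffles.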
define <3 x bfloat> @f10(<3 x bfloat> %a) nounwind {
; X86-LABEL: f10:
; X86: # %bb.0:
; X86-NEXT: pextrw $0, %xmm0, %eax
; X86-NEXT: movdqa %xmm0, %xmm1
; X86-NEXT: psrld $16, %xmm1
; X86-NEXT: pextrw $0, %xmm1, %ecx
; X86-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
; X86-NEXT: pextrw $0, %xmm0, %edx
; X86-NEXT: #ARITH_FENCE
; X86-NEXT: #ARITH_FENCE
; X86-NEXT: #ARITH_FENCE
; X86-NEXT: pinsrw $0, %eax, %xmm0
; X86-NEXT: pinsrw $0, %ecx, %xmm1
; X86-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; X86-NEXT: pinsrw $0, %edx, %xmm1
; X86-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X86-NEXT: retl
;
; X64-LABEL: f10:
; X64: # %bb.0:
; X64-NEXT: pextrw $0, %xmm0, %eax
; X64-NEXT: movdqa %xmm0, %xmm1
; X64-NEXT: psrld $16, %xmm1
; X64-NEXT: pextrw $0, %xmm1, %ecx
; X64-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
; X64-NEXT: pextrw $0, %xmm0, %edx
; X64-NEXT: #ARITH_FENCE
; X64-NEXT: #ARITH_FENCE
; X64-NEXT: #ARITH_FENCE
; X64-NEXT: pinsrw $0, %eax, %xmm0
; X64-NEXT: pinsrw $0, %ecx, %xmm1
; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; X64-NEXT: pinsrw $0, %edx, %xmm1
; X64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X64-NEXT: retq
  %b = call <3 x bfloat> @llvm.arithmetic.fence.v3bf16(<3 x bfloat> %a)
  ret <3 x bfloat> %b
}
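
; f11: the <4 x bfloat> case; four scalarized fences, and i686 needs %esi as
; an extra scratch register for the extracted words.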
define <4 x bfloat> @f11(<4 x bfloat> %a) nounwind {
; X86-LABEL: f11:
; X86: # %bb.0:
; X86-NEXT: pushl %esi
; X86-NEXT: movdqa %xmm0, %xmm1
; X86-NEXT: psrlq $48, %xmm1
; X86-NEXT: pextrw $0, %xmm1, %eax
; X86-NEXT: movdqa %xmm0, %xmm1
; X86-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1]
; X86-NEXT: pextrw $0, %xmm1, %edx
; X86-NEXT: pextrw $0, %xmm0, %ecx
; X86-NEXT: psrld $16, %xmm0
; X86-NEXT: pextrw $0, %xmm0, %esi
; X86-NEXT: #ARITH_FENCE
; X86-NEXT: #ARITH_FENCE
; X86-NEXT: #ARITH_FENCE
; X86-NEXT: #ARITH_FENCE
; X86-NEXT: pinsrw $0, %eax, %xmm0
; X86-NEXT: pinsrw $0, %edx, %xmm1
; X86-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; X86-NEXT: pinsrw $0, %ecx, %xmm0
; X86-NEXT: pinsrw $0, %esi, %xmm2
; X86-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; X86-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X86-NEXT: popl %esi
; X86-NEXT: retl
;
; X64-LABEL: f11:
; X64: # %bb.0:
; X64-NEXT: movdqa %xmm0, %xmm1
; X64-NEXT: psrlq $48, %xmm1
; X64-NEXT: pextrw $0, %xmm1, %eax
; X64-NEXT: movdqa %xmm0, %xmm1
; X64-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1]
; X64-NEXT: pextrw $0, %xmm1, %ecx
; X64-NEXT: pextrw $0, %xmm0, %edx
; X64-NEXT: psrld $16, %xmm0
; X64-NEXT: pextrw $0, %xmm0, %esi
; X64-NEXT: #ARITH_FENCE
; X64-NEXT: #ARITH_FENCE
; X64-NEXT: #ARITH_FENCE
; X64-NEXT: #ARITH_FENCE
; X64-NEXT: pinsrw $0, %eax, %xmm0
; X64-NEXT: pinsrw $0, %ecx, %xmm1
; X64-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; X64-NEXT: pinsrw $0, %edx, %xmm0
; X64-NEXT: pinsrw $0, %esi, %xmm2
; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; X64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X64-NEXT: retq
  %b = call <4 x bfloat> @llvm.arithmetic.fence.v4bf16(<4 x bfloat> %a)
  ret <4 x bfloat> %b
}

declare half @llvm.arithmetic.fence.f16(half)
declare bfloat @llvm.arithmetic.fence.bf16(bfloat)
declare <2 x half> @llvm.arithmetic.fence.v2f16(<2 x half>)
declare <3 x bfloat> @llvm.arithmetic.fence.v3bf16(<3 x bfloat>)
declare <4 x bfloat> @llvm.arithmetic.fence.v4bf16(<4 x bfloat>)
declare float @llvm.arithmetic.fence.f32(float)
declare double @llvm.arithmetic.fence.f64(double)
declare <2 x float> @llvm.arithmetic.fence.v2f32(<2 x float>)
declare <8 x float> @llvm.arithmetic.fence.v8f32(<8 x float>)