1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=avx512f | FileCheck %s
; @eggs keeps six <8 x double> accumulators in a single loop (%bb15). Each
; iteration loads three vectors from %arg14 and two scalars from %arg13,
; splats each scalar across all eight lanes, and feeds every vector/splat
; pair through llvm.fmuladd into its own accumulator. When the counter
; reaches %arg12 the six results are stored through %arg, %arg1 .. %arg5.
; The assertions below expect six vfmadd231pd on zmm registers in the loop
; body, with the three vmovupd loads and two vbroadcastsd also inside LBB0_1.
4 define void @eggs(ptr %arg, ptr %arg1, ptr %arg2, ptr %arg3, ptr %arg4, ptr %arg5, i64 %arg6, i64 %arg7, i64 %arg8, i64 %arg9, i64 %arg10, i64 %arg11, i64 %arg12, ptr %arg13, ptr %arg14) nounwind {
6 ; CHECK: ## %bb.0: ## %bb
7 ; CHECK-NEXT: pushq %r15
8 ; CHECK-NEXT: pushq %r14
9 ; CHECK-NEXT: pushq %r13
10 ; CHECK-NEXT: pushq %r12
11 ; CHECK-NEXT: pushq %rbx
12 ; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rax
13 ; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %r10
14 ; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %r11
15 ; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rbx
16 ; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %r15
17 ; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %r14
18 ; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %r12
19 ; CHECK-NEXT: leaq (%r12,%r14,8), %r14
20 ; CHECK-NEXT: leaq (%r12,%r15,8), %r15
21 ; CHECK-NEXT: vxorpd %xmm0, %xmm0, %xmm0
22 ; CHECK-NEXT: xorl %r12d, %r12d
23 ; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %r13
24 ; CHECK-NEXT: addq %rbx, %r13
25 ; CHECK-NEXT: addq {{[0-9]+}}(%rsp), %rbx
26 ; CHECK-NEXT: vxorpd %xmm1, %xmm1, %xmm1
27 ; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2
28 ; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
29 ; CHECK-NEXT: vxorpd %xmm4, %xmm4, %xmm4
30 ; CHECK-NEXT: vxorpd %xmm5, %xmm5, %xmm5
31 ; CHECK-NEXT: .p2align 4, 0x90
32 ; CHECK-NEXT: LBB0_1: ## %bb15
33 ; CHECK-NEXT: ## =>This Inner Loop Header: Depth=1
34 ; CHECK-NEXT: vmovupd (%rax,%r11,8), %zmm6
35 ; CHECK-NEXT: vmovupd (%rax,%r13,8), %zmm7
36 ; CHECK-NEXT: vmovupd (%rax,%rbx,8), %zmm8
37 ; CHECK-NEXT: vbroadcastsd (%r15,%r12,8), %zmm9
38 ; CHECK-NEXT: vfmadd231pd {{.*#+}} zmm0 = (zmm6 * zmm9) + zmm0
39 ; CHECK-NEXT: vfmadd231pd {{.*#+}} zmm1 = (zmm7 * zmm9) + zmm1
40 ; CHECK-NEXT: vfmadd231pd {{.*#+}} zmm2 = (zmm8 * zmm9) + zmm2
41 ; CHECK-NEXT: vbroadcastsd (%r14,%r12,8), %zmm9
42 ; CHECK-NEXT: vfmadd231pd {{.*#+}} zmm3 = (zmm9 * zmm6) + zmm3
43 ; CHECK-NEXT: vfmadd231pd {{.*#+}} zmm4 = (zmm9 * zmm7) + zmm4
44 ; CHECK-NEXT: vfmadd231pd {{.*#+}} zmm5 = (zmm8 * zmm9) + zmm5
45 ; CHECK-NEXT: incq %r12
46 ; CHECK-NEXT: cmpq %r12, %r10
47 ; CHECK-NEXT: jne LBB0_1
48 ; CHECK-NEXT: ## %bb.2: ## %bb51
49 ; CHECK-NEXT: vmovapd %zmm0, (%rdi)
50 ; CHECK-NEXT: vmovapd %zmm1, (%rsi)
51 ; CHECK-NEXT: vmovapd %zmm2, (%rdx)
52 ; CHECK-NEXT: vmovapd %zmm3, (%rcx)
53 ; CHECK-NEXT: vmovapd %zmm4, (%r8)
54 ; CHECK-NEXT: vmovapd %zmm5, (%r9)
55 ; CHECK-NEXT: popq %rbx
56 ; CHECK-NEXT: popq %r12
57 ; CHECK-NEXT: popq %r13
58 ; CHECK-NEXT: popq %r14
59 ; CHECK-NEXT: popq %r15
60 ; CHECK-NEXT: vzeroupper
65 bb15: ; preds = %bb15, %bb
; Six accumulators: zero when entered from %bb, otherwise the fmuladd result
; produced by the previous iteration.
66 %tmp = phi <8 x double> [ zeroinitializer, %bb ], [ %tmp38, %bb15 ]
67 %tmp16 = phi <8 x double> [ zeroinitializer, %bb ], [ %tmp39, %bb15 ]
68 %tmp17 = phi <8 x double> [ zeroinitializer, %bb ], [ %tmp40, %bb15 ]
69 %tmp18 = phi <8 x double> [ zeroinitializer, %bb ], [ %tmp46, %bb15 ]
70 %tmp19 = phi <8 x double> [ zeroinitializer, %bb ], [ %tmp47, %bb15 ]
71 %tmp20 = phi <8 x double> [ zeroinitializer, %bb ], [ %tmp48, %bb15 ]
; Loop counter, starting at 0.
72 %tmp21 = phi i64 [ 0, %bb ], [ %tmp49, %bb15 ]
; Three vector operands read from %arg14 at element offsets %arg11,
; %arg10 + %arg6 and %arg10 + %arg7. None of these offsets involves the
; loop counter %tmp21.
73 %tmp22 = getelementptr inbounds double, ptr %arg14, i64 %arg11
74 %tmp24 = load <8 x double>, ptr %tmp22, align 8
75 %tmp25 = add i64 %arg10, %arg6
76 %tmp26 = getelementptr inbounds double, ptr %arg14, i64 %tmp25
77 %tmp28 = load <8 x double>, ptr %tmp26, align 8
78 %tmp29 = add i64 %arg10, %arg7
79 %tmp30 = getelementptr inbounds double, ptr %arg14, i64 %tmp29
80 %tmp32 = load <8 x double>, ptr %tmp30, align 8
; First scalar: %arg13[%tmp21 + %arg8], placed in lane 0 and then replicated
; to every lane by the all-zero shuffle mask.
81 %tmp33 = add i64 %tmp21, %arg8
82 %tmp34 = getelementptr inbounds double, ptr %arg13, i64 %tmp33
83 %tmp35 = load double, ptr %tmp34, align 8
84 %tmp36 = insertelement <8 x double> undef, double %tmp35, i32 0
85 %tmp37 = shufflevector <8 x double> %tmp36, <8 x double> undef, <8 x i32> zeroinitializer
; Accumulators 0-2: loaded vector times the first splat, plus the running value.
86 %tmp38 = call <8 x double> @llvm.fmuladd.v8f64(<8 x double> %tmp24, <8 x double> %tmp37, <8 x double> %tmp)
87 %tmp39 = call <8 x double> @llvm.fmuladd.v8f64(<8 x double> %tmp28, <8 x double> %tmp37, <8 x double> %tmp16)
88 %tmp40 = call <8 x double> @llvm.fmuladd.v8f64(<8 x double> %tmp32, <8 x double> %tmp37, <8 x double> %tmp17)
; Second scalar: %arg13[%tmp21 + %arg9], splatted the same way.
89 %tmp41 = add i64 %tmp21, %arg9
90 %tmp42 = getelementptr inbounds double, ptr %arg13, i64 %tmp41
91 %tmp43 = load double, ptr %tmp42, align 8
92 %tmp44 = insertelement <8 x double> undef, double %tmp43, i32 0
93 %tmp45 = shufflevector <8 x double> %tmp44, <8 x double> undef, <8 x i32> zeroinitializer
; Accumulators 3-5: the same three loaded vectors times the second splat.
94 %tmp46 = call <8 x double> @llvm.fmuladd.v8f64(<8 x double> %tmp24, <8 x double> %tmp45, <8 x double> %tmp18)
95 %tmp47 = call <8 x double> @llvm.fmuladd.v8f64(<8 x double> %tmp28, <8 x double> %tmp45, <8 x double> %tmp19)
96 %tmp48 = call <8 x double> @llvm.fmuladd.v8f64(<8 x double> %tmp32, <8 x double> %tmp45, <8 x double> %tmp20)
; Bump the counter; exit to %bb51 once it equals %arg12, otherwise loop again.
97 %tmp49 = add nuw nsw i64 %tmp21, 1
98 %tmp50 = icmp eq i64 %tmp49, %arg12
99 br i1 %tmp50, label %bb51, label %bb15
101 bb51: ; preds = %bb15
; Store the six final accumulator values through the first six pointer
; arguments. The stores carry no explicit alignment; the expected output
; uses vmovapd for all of them.
102 store <8 x double> %tmp38, ptr %arg
103 store <8 x double> %tmp39, ptr %arg1
104 store <8 x double> %tmp40, ptr %arg2
105 store <8 x double> %tmp46, ptr %arg3
106 store <8 x double> %tmp47, ptr %arg4
107 store <8 x double> %tmp48, ptr %arg5
; Multiply-add intrinsic on <8 x double>, called six times per loop iteration
; in @eggs; the assertions in this file expect each call to be emitted as a
; vfmadd231pd on zmm registers.
111 declare <8 x double> @llvm.fmuladd.v8f64(<8 x double>, <8 x double>, <8 x double>)