1 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
2 ; RUN: opt -passes=lower-matrix-intrinsics,instcombine -fuse-matrix-use-loops=false -fuse-matrix-tile-size=2 -matrix-allow-contract -force-fuse-matrix -verify-dom-info %s -S | FileCheck %s
4 ; REQUIRES: aarch64-registered-target
6 target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
7 target triple = "aarch64-apple-ios"
9 ; Test tiling without generating explicit loops.
11 define void @multiply(ptr %A, ptr %B, ptr %C) {
12 ; CHECK-LABEL: @multiply(
14 ; CHECK-NEXT: [[STORE_BEGIN:%.*]] = ptrtoint ptr [[C:%.*]] to i64
15 ; CHECK-NEXT: [[STORE_END:%.*]] = add nuw nsw i64 [[STORE_BEGIN]], 128
16 ; CHECK-NEXT: [[LOAD_BEGIN:%.*]] = ptrtoint ptr [[A:%.*]] to i64
17 ; CHECK-NEXT: [[TMP0:%.*]] = icmp ugt i64 [[STORE_END]], [[LOAD_BEGIN]]
18 ; CHECK-NEXT: br i1 [[TMP0]], label [[ALIAS_CONT:%.*]], label [[NO_ALIAS:%.*]]
20 ; CHECK-NEXT: [[LOAD_END:%.*]] = add nuw nsw i64 [[LOAD_BEGIN]], 128
21 ; CHECK-NEXT: [[TMP1:%.*]] = icmp ugt i64 [[LOAD_END]], [[STORE_BEGIN]]
22 ; CHECK-NEXT: br i1 [[TMP1]], label [[COPY:%.*]], label [[NO_ALIAS]]
24 ; CHECK-NEXT: [[TMP2:%.*]] = alloca [16 x double], align 8
25 ; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(128) [[TMP2]], ptr noundef nonnull align 8 dereferenceable(128) [[A]], i64 128, i1 false)
26 ; CHECK-NEXT: br label [[NO_ALIAS]]
28 ; CHECK-NEXT: [[TMP3:%.*]] = phi ptr [ [[A]], [[ENTRY:%.*]] ], [ [[A]], [[ALIAS_CONT]] ], [ [[TMP2]], [[COPY]] ]
29 ; CHECK-NEXT: [[STORE_BEGIN4:%.*]] = ptrtoint ptr [[C]] to i64
30 ; CHECK-NEXT: [[STORE_END5:%.*]] = add nuw nsw i64 [[STORE_BEGIN4]], 128
31 ; CHECK-NEXT: [[LOAD_BEGIN6:%.*]] = ptrtoint ptr [[B:%.*]] to i64
32 ; CHECK-NEXT: [[TMP4:%.*]] = icmp ugt i64 [[STORE_END5]], [[LOAD_BEGIN6]]
33 ; CHECK-NEXT: br i1 [[TMP4]], label [[ALIAS_CONT1:%.*]], label [[NO_ALIAS3:%.*]]
35 ; CHECK-NEXT: [[LOAD_END7:%.*]] = add nuw nsw i64 [[LOAD_BEGIN6]], 128
36 ; CHECK-NEXT: [[TMP5:%.*]] = icmp ugt i64 [[LOAD_END7]], [[STORE_BEGIN4]]
37 ; CHECK-NEXT: br i1 [[TMP5]], label [[COPY2:%.*]], label [[NO_ALIAS3]]
39 ; CHECK-NEXT: [[TMP6:%.*]] = alloca [16 x double], align 8
40 ; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(128) [[TMP6]], ptr noundef nonnull align 8 dereferenceable(128) [[B]], i64 128, i1 false)
41 ; CHECK-NEXT: br label [[NO_ALIAS3]]
43 ; CHECK-NEXT: [[TMP7:%.*]] = phi ptr [ [[B]], [[NO_ALIAS]] ], [ [[B]], [[ALIAS_CONT1]] ], [ [[TMP6]], [[COPY2]] ]
44 ; CHECK-NEXT: [[COL_LOAD:%.*]] = load <2 x double>, ptr [[TMP3]], align 8
45 ; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr double, ptr [[TMP3]], i64 4
46 ; CHECK-NEXT: [[COL_LOAD8:%.*]] = load <2 x double>, ptr [[VEC_GEP]], align 8
47 ; CHECK-NEXT: [[COL_LOAD9:%.*]] = load <2 x double>, ptr [[TMP7]], align 8
48 ; CHECK-NEXT: [[VEC_GEP10:%.*]] = getelementptr double, ptr [[TMP7]], i64 4
49 ; CHECK-NEXT: [[COL_LOAD11:%.*]] = load <2 x double>, ptr [[VEC_GEP10]], align 8
50 ; CHECK-NEXT: [[SPLAT_SPLAT:%.*]] = shufflevector <2 x double> [[COL_LOAD9]], <2 x double> poison, <2 x i32> zeroinitializer
51 ; CHECK-NEXT: [[TMP8:%.*]] = fmul contract <2 x double> [[COL_LOAD]], [[SPLAT_SPLAT]]
52 ; CHECK-NEXT: [[SPLAT_SPLAT14:%.*]] = shufflevector <2 x double> [[COL_LOAD9]], <2 x double> undef, <2 x i32> <i32 1, i32 1>
53 ; CHECK-NEXT: [[TMP9:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD8]], <2 x double> [[SPLAT_SPLAT14]], <2 x double> [[TMP8]])
54 ; CHECK-NEXT: [[SPLAT_SPLAT17:%.*]] = shufflevector <2 x double> [[COL_LOAD11]], <2 x double> poison, <2 x i32> zeroinitializer
55 ; CHECK-NEXT: [[TMP10:%.*]] = fmul contract <2 x double> [[COL_LOAD]], [[SPLAT_SPLAT17]]
56 ; CHECK-NEXT: [[SPLAT_SPLAT20:%.*]] = shufflevector <2 x double> [[COL_LOAD11]], <2 x double> undef, <2 x i32> <i32 1, i32 1>
57 ; CHECK-NEXT: [[TMP11:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD8]], <2 x double> [[SPLAT_SPLAT20]], <2 x double> [[TMP10]])
58 ; CHECK-NEXT: [[TMP12:%.*]] = getelementptr double, ptr [[TMP3]], i64 8
59 ; CHECK-NEXT: [[COL_LOAD21:%.*]] = load <2 x double>, ptr [[TMP12]], align 8
60 ; CHECK-NEXT: [[VEC_GEP22:%.*]] = getelementptr double, ptr [[TMP3]], i64 12
61 ; CHECK-NEXT: [[COL_LOAD23:%.*]] = load <2 x double>, ptr [[VEC_GEP22]], align 8
62 ; CHECK-NEXT: [[TMP13:%.*]] = getelementptr double, ptr [[TMP7]], i64 2
63 ; CHECK-NEXT: [[COL_LOAD24:%.*]] = load <2 x double>, ptr [[TMP13]], align 8
64 ; CHECK-NEXT: [[VEC_GEP25:%.*]] = getelementptr double, ptr [[TMP7]], i64 6
65 ; CHECK-NEXT: [[COL_LOAD26:%.*]] = load <2 x double>, ptr [[VEC_GEP25]], align 8
66 ; CHECK-NEXT: [[SPLAT_SPLAT30:%.*]] = shufflevector <2 x double> [[COL_LOAD24]], <2 x double> poison, <2 x i32> zeroinitializer
67 ; CHECK-NEXT: [[TMP14:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD21]], <2 x double> [[SPLAT_SPLAT30]], <2 x double> [[TMP9]])
68 ; CHECK-NEXT: [[SPLAT_SPLAT33:%.*]] = shufflevector <2 x double> [[COL_LOAD24]], <2 x double> undef, <2 x i32> <i32 1, i32 1>
69 ; CHECK-NEXT: [[TMP15:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD23]], <2 x double> [[SPLAT_SPLAT33]], <2 x double> [[TMP14]])
70 ; CHECK-NEXT: [[SPLAT_SPLAT37:%.*]] = shufflevector <2 x double> [[COL_LOAD26]], <2 x double> poison, <2 x i32> zeroinitializer
71 ; CHECK-NEXT: [[TMP16:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD21]], <2 x double> [[SPLAT_SPLAT37]], <2 x double> [[TMP11]])
72 ; CHECK-NEXT: [[SPLAT_SPLAT40:%.*]] = shufflevector <2 x double> [[COL_LOAD26]], <2 x double> undef, <2 x i32> <i32 1, i32 1>
73 ; CHECK-NEXT: [[TMP17:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD23]], <2 x double> [[SPLAT_SPLAT40]], <2 x double> [[TMP16]])
74 ; CHECK-NEXT: store <2 x double> [[TMP15]], ptr [[C]], align 8
75 ; CHECK-NEXT: [[VEC_GEP41:%.*]] = getelementptr double, ptr [[C]], i64 4
76 ; CHECK-NEXT: store <2 x double> [[TMP17]], ptr [[VEC_GEP41]], align 8
77 ; CHECK-NEXT: [[TMP18:%.*]] = getelementptr double, ptr [[TMP3]], i64 2
78 ; CHECK-NEXT: [[COL_LOAD42:%.*]] = load <2 x double>, ptr [[TMP18]], align 8
79 ; CHECK-NEXT: [[VEC_GEP43:%.*]] = getelementptr double, ptr [[TMP3]], i64 6
80 ; CHECK-NEXT: [[COL_LOAD44:%.*]] = load <2 x double>, ptr [[VEC_GEP43]], align 8
81 ; CHECK-NEXT: [[COL_LOAD45:%.*]] = load <2 x double>, ptr [[TMP7]], align 8
82 ; CHECK-NEXT: [[VEC_GEP46:%.*]] = getelementptr double, ptr [[TMP7]], i64 4
83 ; CHECK-NEXT: [[COL_LOAD47:%.*]] = load <2 x double>, ptr [[VEC_GEP46]], align 8
84 ; CHECK-NEXT: [[SPLAT_SPLAT50:%.*]] = shufflevector <2 x double> [[COL_LOAD45]], <2 x double> poison, <2 x i32> zeroinitializer
85 ; CHECK-NEXT: [[TMP19:%.*]] = fmul contract <2 x double> [[COL_LOAD42]], [[SPLAT_SPLAT50]]
86 ; CHECK-NEXT: [[SPLAT_SPLAT53:%.*]] = shufflevector <2 x double> [[COL_LOAD45]], <2 x double> undef, <2 x i32> <i32 1, i32 1>
87 ; CHECK-NEXT: [[TMP20:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD44]], <2 x double> [[SPLAT_SPLAT53]], <2 x double> [[TMP19]])
88 ; CHECK-NEXT: [[SPLAT_SPLAT56:%.*]] = shufflevector <2 x double> [[COL_LOAD47]], <2 x double> poison, <2 x i32> zeroinitializer
89 ; CHECK-NEXT: [[TMP21:%.*]] = fmul contract <2 x double> [[COL_LOAD42]], [[SPLAT_SPLAT56]]
90 ; CHECK-NEXT: [[SPLAT_SPLAT59:%.*]] = shufflevector <2 x double> [[COL_LOAD47]], <2 x double> undef, <2 x i32> <i32 1, i32 1>
91 ; CHECK-NEXT: [[TMP22:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD44]], <2 x double> [[SPLAT_SPLAT59]], <2 x double> [[TMP21]])
92 ; CHECK-NEXT: [[TMP23:%.*]] = getelementptr double, ptr [[TMP3]], i64 10
93 ; CHECK-NEXT: [[COL_LOAD60:%.*]] = load <2 x double>, ptr [[TMP23]], align 8
94 ; CHECK-NEXT: [[VEC_GEP61:%.*]] = getelementptr double, ptr [[TMP3]], i64 14
95 ; CHECK-NEXT: [[COL_LOAD62:%.*]] = load <2 x double>, ptr [[VEC_GEP61]], align 8
96 ; CHECK-NEXT: [[TMP24:%.*]] = getelementptr double, ptr [[TMP7]], i64 2
97 ; CHECK-NEXT: [[COL_LOAD63:%.*]] = load <2 x double>, ptr [[TMP24]], align 8
98 ; CHECK-NEXT: [[VEC_GEP64:%.*]] = getelementptr double, ptr [[TMP7]], i64 6
99 ; CHECK-NEXT: [[COL_LOAD65:%.*]] = load <2 x double>, ptr [[VEC_GEP64]], align 8
100 ; CHECK-NEXT: [[SPLAT_SPLAT69:%.*]] = shufflevector <2 x double> [[COL_LOAD63]], <2 x double> poison, <2 x i32> zeroinitializer
101 ; CHECK-NEXT: [[TMP25:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD60]], <2 x double> [[SPLAT_SPLAT69]], <2 x double> [[TMP20]])
102 ; CHECK-NEXT: [[SPLAT_SPLAT72:%.*]] = shufflevector <2 x double> [[COL_LOAD63]], <2 x double> undef, <2 x i32> <i32 1, i32 1>
103 ; CHECK-NEXT: [[TMP26:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD62]], <2 x double> [[SPLAT_SPLAT72]], <2 x double> [[TMP25]])
104 ; CHECK-NEXT: [[SPLAT_SPLAT76:%.*]] = shufflevector <2 x double> [[COL_LOAD65]], <2 x double> poison, <2 x i32> zeroinitializer
105 ; CHECK-NEXT: [[TMP27:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD60]], <2 x double> [[SPLAT_SPLAT76]], <2 x double> [[TMP22]])
106 ; CHECK-NEXT: [[SPLAT_SPLAT79:%.*]] = shufflevector <2 x double> [[COL_LOAD65]], <2 x double> undef, <2 x i32> <i32 1, i32 1>
107 ; CHECK-NEXT: [[TMP28:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD62]], <2 x double> [[SPLAT_SPLAT79]], <2 x double> [[TMP27]])
108 ; CHECK-NEXT: [[TMP29:%.*]] = getelementptr double, ptr [[C]], i64 2
109 ; CHECK-NEXT: store <2 x double> [[TMP26]], ptr [[TMP29]], align 8
110 ; CHECK-NEXT: [[VEC_GEP80:%.*]] = getelementptr double, ptr [[C]], i64 6
111 ; CHECK-NEXT: store <2 x double> [[TMP28]], ptr [[VEC_GEP80]], align 8
112 ; CHECK-NEXT: [[COL_LOAD81:%.*]] = load <2 x double>, ptr [[TMP3]], align 8
113 ; CHECK-NEXT: [[VEC_GEP82:%.*]] = getelementptr double, ptr [[TMP3]], i64 4
114 ; CHECK-NEXT: [[COL_LOAD83:%.*]] = load <2 x double>, ptr [[VEC_GEP82]], align 8
115 ; CHECK-NEXT: [[TMP30:%.*]] = getelementptr double, ptr [[TMP7]], i64 8
116 ; CHECK-NEXT: [[COL_LOAD84:%.*]] = load <2 x double>, ptr [[TMP30]], align 8
117 ; CHECK-NEXT: [[VEC_GEP85:%.*]] = getelementptr double, ptr [[TMP7]], i64 12
118 ; CHECK-NEXT: [[COL_LOAD86:%.*]] = load <2 x double>, ptr [[VEC_GEP85]], align 8
119 ; CHECK-NEXT: [[SPLAT_SPLAT89:%.*]] = shufflevector <2 x double> [[COL_LOAD84]], <2 x double> poison, <2 x i32> zeroinitializer
120 ; CHECK-NEXT: [[TMP31:%.*]] = fmul contract <2 x double> [[COL_LOAD81]], [[SPLAT_SPLAT89]]
121 ; CHECK-NEXT: [[SPLAT_SPLAT92:%.*]] = shufflevector <2 x double> [[COL_LOAD84]], <2 x double> undef, <2 x i32> <i32 1, i32 1>
122 ; CHECK-NEXT: [[TMP32:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD83]], <2 x double> [[SPLAT_SPLAT92]], <2 x double> [[TMP31]])
123 ; CHECK-NEXT: [[SPLAT_SPLAT95:%.*]] = shufflevector <2 x double> [[COL_LOAD86]], <2 x double> poison, <2 x i32> zeroinitializer
124 ; CHECK-NEXT: [[TMP33:%.*]] = fmul contract <2 x double> [[COL_LOAD81]], [[SPLAT_SPLAT95]]
125 ; CHECK-NEXT: [[SPLAT_SPLAT98:%.*]] = shufflevector <2 x double> [[COL_LOAD86]], <2 x double> undef, <2 x i32> <i32 1, i32 1>
126 ; CHECK-NEXT: [[TMP34:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD83]], <2 x double> [[SPLAT_SPLAT98]], <2 x double> [[TMP33]])
127 ; CHECK-NEXT: [[TMP35:%.*]] = getelementptr double, ptr [[TMP3]], i64 8
128 ; CHECK-NEXT: [[COL_LOAD99:%.*]] = load <2 x double>, ptr [[TMP35]], align 8
129 ; CHECK-NEXT: [[VEC_GEP100:%.*]] = getelementptr double, ptr [[TMP3]], i64 12
130 ; CHECK-NEXT: [[COL_LOAD101:%.*]] = load <2 x double>, ptr [[VEC_GEP100]], align 8
131 ; CHECK-NEXT: [[TMP36:%.*]] = getelementptr double, ptr [[TMP7]], i64 10
132 ; CHECK-NEXT: [[COL_LOAD102:%.*]] = load <2 x double>, ptr [[TMP36]], align 8
133 ; CHECK-NEXT: [[VEC_GEP103:%.*]] = getelementptr double, ptr [[TMP7]], i64 14
134 ; CHECK-NEXT: [[COL_LOAD104:%.*]] = load <2 x double>, ptr [[VEC_GEP103]], align 8
135 ; CHECK-NEXT: [[SPLAT_SPLAT108:%.*]] = shufflevector <2 x double> [[COL_LOAD102]], <2 x double> poison, <2 x i32> zeroinitializer
136 ; CHECK-NEXT: [[TMP37:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD99]], <2 x double> [[SPLAT_SPLAT108]], <2 x double> [[TMP32]])
137 ; CHECK-NEXT: [[SPLAT_SPLAT111:%.*]] = shufflevector <2 x double> [[COL_LOAD102]], <2 x double> undef, <2 x i32> <i32 1, i32 1>
138 ; CHECK-NEXT: [[TMP38:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD101]], <2 x double> [[SPLAT_SPLAT111]], <2 x double> [[TMP37]])
139 ; CHECK-NEXT: [[SPLAT_SPLAT115:%.*]] = shufflevector <2 x double> [[COL_LOAD104]], <2 x double> poison, <2 x i32> zeroinitializer
140 ; CHECK-NEXT: [[TMP39:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD99]], <2 x double> [[SPLAT_SPLAT115]], <2 x double> [[TMP34]])
141 ; CHECK-NEXT: [[SPLAT_SPLAT118:%.*]] = shufflevector <2 x double> [[COL_LOAD104]], <2 x double> undef, <2 x i32> <i32 1, i32 1>
142 ; CHECK-NEXT: [[TMP40:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD101]], <2 x double> [[SPLAT_SPLAT118]], <2 x double> [[TMP39]])
143 ; CHECK-NEXT: [[TMP41:%.*]] = getelementptr double, ptr [[C]], i64 8
144 ; CHECK-NEXT: store <2 x double> [[TMP38]], ptr [[TMP41]], align 8
145 ; CHECK-NEXT: [[VEC_GEP119:%.*]] = getelementptr double, ptr [[C]], i64 12
146 ; CHECK-NEXT: store <2 x double> [[TMP40]], ptr [[VEC_GEP119]], align 8
147 ; CHECK-NEXT: [[TMP42:%.*]] = getelementptr double, ptr [[TMP3]], i64 2
148 ; CHECK-NEXT: [[COL_LOAD120:%.*]] = load <2 x double>, ptr [[TMP42]], align 8
149 ; CHECK-NEXT: [[VEC_GEP121:%.*]] = getelementptr double, ptr [[TMP3]], i64 6
150 ; CHECK-NEXT: [[COL_LOAD122:%.*]] = load <2 x double>, ptr [[VEC_GEP121]], align 8
151 ; CHECK-NEXT: [[TMP43:%.*]] = getelementptr double, ptr [[TMP7]], i64 8
152 ; CHECK-NEXT: [[COL_LOAD123:%.*]] = load <2 x double>, ptr [[TMP43]], align 8
153 ; CHECK-NEXT: [[VEC_GEP124:%.*]] = getelementptr double, ptr [[TMP7]], i64 12
154 ; CHECK-NEXT: [[COL_LOAD125:%.*]] = load <2 x double>, ptr [[VEC_GEP124]], align 8
155 ; CHECK-NEXT: [[SPLAT_SPLAT128:%.*]] = shufflevector <2 x double> [[COL_LOAD123]], <2 x double> poison, <2 x i32> zeroinitializer
156 ; CHECK-NEXT: [[TMP44:%.*]] = fmul contract <2 x double> [[COL_LOAD120]], [[SPLAT_SPLAT128]]
157 ; CHECK-NEXT: [[SPLAT_SPLAT131:%.*]] = shufflevector <2 x double> [[COL_LOAD123]], <2 x double> undef, <2 x i32> <i32 1, i32 1>
158 ; CHECK-NEXT: [[TMP45:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD122]], <2 x double> [[SPLAT_SPLAT131]], <2 x double> [[TMP44]])
159 ; CHECK-NEXT: [[SPLAT_SPLAT134:%.*]] = shufflevector <2 x double> [[COL_LOAD125]], <2 x double> poison, <2 x i32> zeroinitializer
160 ; CHECK-NEXT: [[TMP46:%.*]] = fmul contract <2 x double> [[COL_LOAD120]], [[SPLAT_SPLAT134]]
161 ; CHECK-NEXT: [[SPLAT_SPLAT137:%.*]] = shufflevector <2 x double> [[COL_LOAD125]], <2 x double> undef, <2 x i32> <i32 1, i32 1>
162 ; CHECK-NEXT: [[TMP47:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD122]], <2 x double> [[SPLAT_SPLAT137]], <2 x double> [[TMP46]])
163 ; CHECK-NEXT: [[TMP48:%.*]] = getelementptr double, ptr [[TMP3]], i64 10
164 ; CHECK-NEXT: [[COL_LOAD138:%.*]] = load <2 x double>, ptr [[TMP48]], align 8
165 ; CHECK-NEXT: [[VEC_GEP139:%.*]] = getelementptr double, ptr [[TMP3]], i64 14
166 ; CHECK-NEXT: [[COL_LOAD140:%.*]] = load <2 x double>, ptr [[VEC_GEP139]], align 8
167 ; CHECK-NEXT: [[TMP49:%.*]] = getelementptr double, ptr [[TMP7]], i64 10
168 ; CHECK-NEXT: [[COL_LOAD141:%.*]] = load <2 x double>, ptr [[TMP49]], align 8
169 ; CHECK-NEXT: [[VEC_GEP142:%.*]] = getelementptr double, ptr [[TMP7]], i64 14
170 ; CHECK-NEXT: [[COL_LOAD143:%.*]] = load <2 x double>, ptr [[VEC_GEP142]], align 8
171 ; CHECK-NEXT: [[SPLAT_SPLAT147:%.*]] = shufflevector <2 x double> [[COL_LOAD141]], <2 x double> poison, <2 x i32> zeroinitializer
172 ; CHECK-NEXT: [[TMP50:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD138]], <2 x double> [[SPLAT_SPLAT147]], <2 x double> [[TMP45]])
173 ; CHECK-NEXT: [[SPLAT_SPLAT150:%.*]] = shufflevector <2 x double> [[COL_LOAD141]], <2 x double> undef, <2 x i32> <i32 1, i32 1>
174 ; CHECK-NEXT: [[TMP51:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD140]], <2 x double> [[SPLAT_SPLAT150]], <2 x double> [[TMP50]])
175 ; CHECK-NEXT: [[SPLAT_SPLAT154:%.*]] = shufflevector <2 x double> [[COL_LOAD143]], <2 x double> poison, <2 x i32> zeroinitializer
176 ; CHECK-NEXT: [[TMP52:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD138]], <2 x double> [[SPLAT_SPLAT154]], <2 x double> [[TMP47]])
177 ; CHECK-NEXT: [[SPLAT_SPLAT157:%.*]] = shufflevector <2 x double> [[COL_LOAD143]], <2 x double> undef, <2 x i32> <i32 1, i32 1>
178 ; CHECK-NEXT: [[TMP53:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD140]], <2 x double> [[SPLAT_SPLAT157]], <2 x double> [[TMP52]])
179 ; CHECK-NEXT: [[TMP54:%.*]] = getelementptr double, ptr [[C]], i64 10
180 ; CHECK-NEXT: store <2 x double> [[TMP51]], ptr [[TMP54]], align 8
181 ; CHECK-NEXT: [[VEC_GEP158:%.*]] = getelementptr double, ptr [[C]], i64 14
182 ; CHECK-NEXT: store <2 x double> [[TMP53]], ptr [[VEC_GEP158]], align 8
183 ; CHECK-NEXT: ret void
187 ;; np.dot(a[0:2, 0:2], b[0:2, 0:2])
190 ;; + np.dot(a[0:2, 2:4], b[2:4, 0:2])
196 ;; np.dot(a[2:4, 0:2], b[0:2, 0:2])
199 ;; + np.dot(a[2:4, 2:4], b[2:4, 0:2])
205 ;; np.dot(a[0:2, 0:2], b[0:2, 2:4])
208 ;; + np.dot(a[0:2, 2:4], b[2:4, 2:4])
214 ;; np.dot(a[2:4, 0:2], b[0:2, 2:4])
217 ;; + np.dot(a[2:4, 2:4], b[2:4, 2:4])
223 %a = load <16 x double>, ptr %A, align 8
224 %b = load <16 x double>, ptr %B, align 8
226 %c = call <16 x double> @llvm.matrix.multiply(<16 x double> %a, <16 x double> %b, i32 4, i32 4, i32 4)
228 store <16 x double> %c, ptr %C, align 8
232 ; The same load is used for both operands of the multiply.
233 define void @multiply_reuse_load(ptr noalias %A, ptr noalias %B, ptr noalias %C) {
234 ; CHECK-LABEL: @multiply_reuse_load(
236 ; CHECK-NEXT: [[COL_LOAD:%.*]] = load <2 x double>, ptr [[A:%.*]], align 8
237 ; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr double, ptr [[A]], i64 4
238 ; CHECK-NEXT: [[COL_LOAD1:%.*]] = load <2 x double>, ptr [[VEC_GEP]], align 8
239 ; CHECK-NEXT: [[SPLAT_SPLAT:%.*]] = shufflevector <2 x double> [[COL_LOAD]], <2 x double> poison, <2 x i32> zeroinitializer
240 ; CHECK-NEXT: [[TMP0:%.*]] = fmul contract <2 x double> [[COL_LOAD]], [[SPLAT_SPLAT]]
241 ; CHECK-NEXT: [[SPLAT_SPLAT7:%.*]] = shufflevector <2 x double> [[COL_LOAD]], <2 x double> undef, <2 x i32> <i32 1, i32 1>
242 ; CHECK-NEXT: [[TMP1:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD1]], <2 x double> [[SPLAT_SPLAT7]], <2 x double> [[TMP0]])
243 ; CHECK-NEXT: [[SPLAT_SPLAT10:%.*]] = shufflevector <2 x double> [[COL_LOAD1]], <2 x double> poison, <2 x i32> zeroinitializer
244 ; CHECK-NEXT: [[TMP2:%.*]] = fmul contract <2 x double> [[COL_LOAD]], [[SPLAT_SPLAT10]]
245 ; CHECK-NEXT: [[SPLAT_SPLAT13:%.*]] = shufflevector <2 x double> [[COL_LOAD1]], <2 x double> undef, <2 x i32> <i32 1, i32 1>
246 ; CHECK-NEXT: [[TMP3:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD1]], <2 x double> [[SPLAT_SPLAT13]], <2 x double> [[TMP2]])
247 ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr double, ptr [[A]], i64 8
248 ; CHECK-NEXT: [[COL_LOAD14:%.*]] = load <2 x double>, ptr [[TMP4]], align 8
249 ; CHECK-NEXT: [[VEC_GEP15:%.*]] = getelementptr double, ptr [[A]], i64 12
250 ; CHECK-NEXT: [[COL_LOAD16:%.*]] = load <2 x double>, ptr [[VEC_GEP15]], align 8
251 ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr double, ptr [[A]], i64 2
252 ; CHECK-NEXT: [[COL_LOAD17:%.*]] = load <2 x double>, ptr [[TMP5]], align 8
253 ; CHECK-NEXT: [[VEC_GEP18:%.*]] = getelementptr double, ptr [[A]], i64 6
254 ; CHECK-NEXT: [[COL_LOAD19:%.*]] = load <2 x double>, ptr [[VEC_GEP18]], align 8
255 ; CHECK-NEXT: [[SPLAT_SPLAT23:%.*]] = shufflevector <2 x double> [[COL_LOAD17]], <2 x double> poison, <2 x i32> zeroinitializer
256 ; CHECK-NEXT: [[TMP6:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD14]], <2 x double> [[SPLAT_SPLAT23]], <2 x double> [[TMP1]])
257 ; CHECK-NEXT: [[SPLAT_SPLAT26:%.*]] = shufflevector <2 x double> [[COL_LOAD17]], <2 x double> undef, <2 x i32> <i32 1, i32 1>
258 ; CHECK-NEXT: [[TMP7:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD16]], <2 x double> [[SPLAT_SPLAT26]], <2 x double> [[TMP6]])
259 ; CHECK-NEXT: [[SPLAT_SPLAT30:%.*]] = shufflevector <2 x double> [[COL_LOAD19]], <2 x double> poison, <2 x i32> zeroinitializer
260 ; CHECK-NEXT: [[TMP8:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD14]], <2 x double> [[SPLAT_SPLAT30]], <2 x double> [[TMP3]])
261 ; CHECK-NEXT: [[SPLAT_SPLAT33:%.*]] = shufflevector <2 x double> [[COL_LOAD19]], <2 x double> undef, <2 x i32> <i32 1, i32 1>
262 ; CHECK-NEXT: [[TMP9:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD16]], <2 x double> [[SPLAT_SPLAT33]], <2 x double> [[TMP8]])
263 ; CHECK-NEXT: store <2 x double> [[TMP7]], ptr [[C:%.*]], align 8
264 ; CHECK-NEXT: [[VEC_GEP34:%.*]] = getelementptr double, ptr [[C]], i64 4
265 ; CHECK-NEXT: store <2 x double> [[TMP9]], ptr [[VEC_GEP34]], align 8
266 ; CHECK-NEXT: [[TMP10:%.*]] = getelementptr double, ptr [[A]], i64 2
267 ; CHECK-NEXT: [[COL_LOAD35:%.*]] = load <2 x double>, ptr [[TMP10]], align 8
268 ; CHECK-NEXT: [[VEC_GEP36:%.*]] = getelementptr double, ptr [[A]], i64 6
269 ; CHECK-NEXT: [[COL_LOAD37:%.*]] = load <2 x double>, ptr [[VEC_GEP36]], align 8
270 ; CHECK-NEXT: [[COL_LOAD38:%.*]] = load <2 x double>, ptr [[A]], align 8
271 ; CHECK-NEXT: [[VEC_GEP39:%.*]] = getelementptr double, ptr [[A]], i64 4
272 ; CHECK-NEXT: [[COL_LOAD40:%.*]] = load <2 x double>, ptr [[VEC_GEP39]], align 8
273 ; CHECK-NEXT: [[SPLAT_SPLAT43:%.*]] = shufflevector <2 x double> [[COL_LOAD38]], <2 x double> poison, <2 x i32> zeroinitializer
274 ; CHECK-NEXT: [[TMP11:%.*]] = fmul contract <2 x double> [[COL_LOAD35]], [[SPLAT_SPLAT43]]
275 ; CHECK-NEXT: [[SPLAT_SPLAT46:%.*]] = shufflevector <2 x double> [[COL_LOAD38]], <2 x double> undef, <2 x i32> <i32 1, i32 1>
276 ; CHECK-NEXT: [[TMP12:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD37]], <2 x double> [[SPLAT_SPLAT46]], <2 x double> [[TMP11]])
277 ; CHECK-NEXT: [[SPLAT_SPLAT49:%.*]] = shufflevector <2 x double> [[COL_LOAD40]], <2 x double> poison, <2 x i32> zeroinitializer
278 ; CHECK-NEXT: [[TMP13:%.*]] = fmul contract <2 x double> [[COL_LOAD35]], [[SPLAT_SPLAT49]]
279 ; CHECK-NEXT: [[SPLAT_SPLAT52:%.*]] = shufflevector <2 x double> [[COL_LOAD40]], <2 x double> undef, <2 x i32> <i32 1, i32 1>
280 ; CHECK-NEXT: [[TMP14:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD37]], <2 x double> [[SPLAT_SPLAT52]], <2 x double> [[TMP13]])
281 ; CHECK-NEXT: [[TMP15:%.*]] = getelementptr double, ptr [[A]], i64 10
282 ; CHECK-NEXT: [[COL_LOAD53:%.*]] = load <2 x double>, ptr [[TMP15]], align 8
283 ; CHECK-NEXT: [[VEC_GEP54:%.*]] = getelementptr double, ptr [[A]], i64 14
284 ; CHECK-NEXT: [[COL_LOAD55:%.*]] = load <2 x double>, ptr [[VEC_GEP54]], align 8
285 ; CHECK-NEXT: [[TMP16:%.*]] = getelementptr double, ptr [[A]], i64 2
286 ; CHECK-NEXT: [[COL_LOAD56:%.*]] = load <2 x double>, ptr [[TMP16]], align 8
287 ; CHECK-NEXT: [[VEC_GEP57:%.*]] = getelementptr double, ptr [[A]], i64 6
288 ; CHECK-NEXT: [[COL_LOAD58:%.*]] = load <2 x double>, ptr [[VEC_GEP57]], align 8
289 ; CHECK-NEXT: [[SPLAT_SPLAT62:%.*]] = shufflevector <2 x double> [[COL_LOAD56]], <2 x double> poison, <2 x i32> zeroinitializer
290 ; CHECK-NEXT: [[TMP17:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD53]], <2 x double> [[SPLAT_SPLAT62]], <2 x double> [[TMP12]])
291 ; CHECK-NEXT: [[SPLAT_SPLAT65:%.*]] = shufflevector <2 x double> [[COL_LOAD56]], <2 x double> undef, <2 x i32> <i32 1, i32 1>
292 ; CHECK-NEXT: [[TMP18:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD55]], <2 x double> [[SPLAT_SPLAT65]], <2 x double> [[TMP17]])
293 ; CHECK-NEXT: [[SPLAT_SPLAT69:%.*]] = shufflevector <2 x double> [[COL_LOAD58]], <2 x double> poison, <2 x i32> zeroinitializer
294 ; CHECK-NEXT: [[TMP19:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD53]], <2 x double> [[SPLAT_SPLAT69]], <2 x double> [[TMP14]])
295 ; CHECK-NEXT: [[SPLAT_SPLAT72:%.*]] = shufflevector <2 x double> [[COL_LOAD58]], <2 x double> undef, <2 x i32> <i32 1, i32 1>
296 ; CHECK-NEXT: [[TMP20:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD55]], <2 x double> [[SPLAT_SPLAT72]], <2 x double> [[TMP19]])
297 ; CHECK-NEXT: [[TMP21:%.*]] = getelementptr double, ptr [[C]], i64 2
298 ; CHECK-NEXT: store <2 x double> [[TMP18]], ptr [[TMP21]], align 8
299 ; CHECK-NEXT: [[VEC_GEP73:%.*]] = getelementptr double, ptr [[C]], i64 6
300 ; CHECK-NEXT: store <2 x double> [[TMP20]], ptr [[VEC_GEP73]], align 8
301 ; CHECK-NEXT: [[COL_LOAD74:%.*]] = load <2 x double>, ptr [[A]], align 8
302 ; CHECK-NEXT: [[VEC_GEP75:%.*]] = getelementptr double, ptr [[A]], i64 4
303 ; CHECK-NEXT: [[COL_LOAD76:%.*]] = load <2 x double>, ptr [[VEC_GEP75]], align 8
304 ; CHECK-NEXT: [[TMP22:%.*]] = getelementptr double, ptr [[A]], i64 8
305 ; CHECK-NEXT: [[COL_LOAD77:%.*]] = load <2 x double>, ptr [[TMP22]], align 8
306 ; CHECK-NEXT: [[VEC_GEP78:%.*]] = getelementptr double, ptr [[A]], i64 12
307 ; CHECK-NEXT: [[COL_LOAD79:%.*]] = load <2 x double>, ptr [[VEC_GEP78]], align 8
308 ; CHECK-NEXT: [[SPLAT_SPLAT82:%.*]] = shufflevector <2 x double> [[COL_LOAD77]], <2 x double> poison, <2 x i32> zeroinitializer
309 ; CHECK-NEXT: [[TMP23:%.*]] = fmul contract <2 x double> [[COL_LOAD74]], [[SPLAT_SPLAT82]]
310 ; CHECK-NEXT: [[SPLAT_SPLAT85:%.*]] = shufflevector <2 x double> [[COL_LOAD77]], <2 x double> undef, <2 x i32> <i32 1, i32 1>
311 ; CHECK-NEXT: [[TMP24:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD76]], <2 x double> [[SPLAT_SPLAT85]], <2 x double> [[TMP23]])
312 ; CHECK-NEXT: [[SPLAT_SPLAT88:%.*]] = shufflevector <2 x double> [[COL_LOAD79]], <2 x double> poison, <2 x i32> zeroinitializer
313 ; CHECK-NEXT: [[TMP25:%.*]] = fmul contract <2 x double> [[COL_LOAD74]], [[SPLAT_SPLAT88]]
314 ; CHECK-NEXT: [[SPLAT_SPLAT91:%.*]] = shufflevector <2 x double> [[COL_LOAD79]], <2 x double> undef, <2 x i32> <i32 1, i32 1>
315 ; CHECK-NEXT: [[TMP26:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD76]], <2 x double> [[SPLAT_SPLAT91]], <2 x double> [[TMP25]])
316 ; CHECK-NEXT: [[TMP27:%.*]] = getelementptr double, ptr [[A]], i64 8
317 ; CHECK-NEXT: [[COL_LOAD92:%.*]] = load <2 x double>, ptr [[TMP27]], align 8
318 ; CHECK-NEXT: [[VEC_GEP93:%.*]] = getelementptr double, ptr [[A]], i64 12
319 ; CHECK-NEXT: [[COL_LOAD94:%.*]] = load <2 x double>, ptr [[VEC_GEP93]], align 8
320 ; CHECK-NEXT: [[TMP28:%.*]] = getelementptr double, ptr [[A]], i64 10
321 ; CHECK-NEXT: [[COL_LOAD95:%.*]] = load <2 x double>, ptr [[TMP28]], align 8
322 ; CHECK-NEXT: [[VEC_GEP96:%.*]] = getelementptr double, ptr [[A]], i64 14
323 ; CHECK-NEXT: [[COL_LOAD97:%.*]] = load <2 x double>, ptr [[VEC_GEP96]], align 8
324 ; CHECK-NEXT: [[SPLAT_SPLAT101:%.*]] = shufflevector <2 x double> [[COL_LOAD95]], <2 x double> poison, <2 x i32> zeroinitializer
325 ; CHECK-NEXT: [[TMP29:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD92]], <2 x double> [[SPLAT_SPLAT101]], <2 x double> [[TMP24]])
326 ; CHECK-NEXT: [[SPLAT_SPLAT104:%.*]] = shufflevector <2 x double> [[COL_LOAD95]], <2 x double> undef, <2 x i32> <i32 1, i32 1>
327 ; CHECK-NEXT: [[TMP30:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD94]], <2 x double> [[SPLAT_SPLAT104]], <2 x double> [[TMP29]])
328 ; CHECK-NEXT: [[SPLAT_SPLAT108:%.*]] = shufflevector <2 x double> [[COL_LOAD97]], <2 x double> poison, <2 x i32> zeroinitializer
329 ; CHECK-NEXT: [[TMP31:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD92]], <2 x double> [[SPLAT_SPLAT108]], <2 x double> [[TMP26]])
330 ; CHECK-NEXT: [[SPLAT_SPLAT111:%.*]] = shufflevector <2 x double> [[COL_LOAD97]], <2 x double> undef, <2 x i32> <i32 1, i32 1>
331 ; CHECK-NEXT: [[TMP32:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD94]], <2 x double> [[SPLAT_SPLAT111]], <2 x double> [[TMP31]])
332 ; CHECK-NEXT: [[TMP33:%.*]] = getelementptr double, ptr [[C]], i64 8
333 ; CHECK-NEXT: store <2 x double> [[TMP30]], ptr [[TMP33]], align 8
334 ; CHECK-NEXT: [[VEC_GEP112:%.*]] = getelementptr double, ptr [[C]], i64 12
335 ; CHECK-NEXT: store <2 x double> [[TMP32]], ptr [[VEC_GEP112]], align 8
336 ; CHECK-NEXT: [[TMP34:%.*]] = getelementptr double, ptr [[A]], i64 2
337 ; CHECK-NEXT: [[COL_LOAD113:%.*]] = load <2 x double>, ptr [[TMP34]], align 8
338 ; CHECK-NEXT: [[VEC_GEP114:%.*]] = getelementptr double, ptr [[A]], i64 6
339 ; CHECK-NEXT: [[COL_LOAD115:%.*]] = load <2 x double>, ptr [[VEC_GEP114]], align 8
340 ; CHECK-NEXT: [[TMP35:%.*]] = getelementptr double, ptr [[A]], i64 8
341 ; CHECK-NEXT: [[COL_LOAD116:%.*]] = load <2 x double>, ptr [[TMP35]], align 8
342 ; CHECK-NEXT: [[VEC_GEP117:%.*]] = getelementptr double, ptr [[A]], i64 12
343 ; CHECK-NEXT: [[COL_LOAD118:%.*]] = load <2 x double>, ptr [[VEC_GEP117]], align 8
344 ; CHECK-NEXT: [[SPLAT_SPLAT121:%.*]] = shufflevector <2 x double> [[COL_LOAD116]], <2 x double> poison, <2 x i32> zeroinitializer
345 ; CHECK-NEXT: [[TMP36:%.*]] = fmul contract <2 x double> [[COL_LOAD113]], [[SPLAT_SPLAT121]]
346 ; CHECK-NEXT: [[SPLAT_SPLAT124:%.*]] = shufflevector <2 x double> [[COL_LOAD116]], <2 x double> undef, <2 x i32> <i32 1, i32 1>
347 ; CHECK-NEXT: [[TMP37:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD115]], <2 x double> [[SPLAT_SPLAT124]], <2 x double> [[TMP36]])
348 ; CHECK-NEXT: [[SPLAT_SPLAT127:%.*]] = shufflevector <2 x double> [[COL_LOAD118]], <2 x double> poison, <2 x i32> zeroinitializer
349 ; CHECK-NEXT: [[TMP38:%.*]] = fmul contract <2 x double> [[COL_LOAD113]], [[SPLAT_SPLAT127]]
350 ; CHECK-NEXT: [[SPLAT_SPLAT130:%.*]] = shufflevector <2 x double> [[COL_LOAD118]], <2 x double> undef, <2 x i32> <i32 1, i32 1>
351 ; CHECK-NEXT: [[TMP39:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD115]], <2 x double> [[SPLAT_SPLAT130]], <2 x double> [[TMP38]])
352 ; CHECK-NEXT: [[TMP40:%.*]] = getelementptr double, ptr [[A]], i64 10
353 ; CHECK-NEXT: [[COL_LOAD131:%.*]] = load <2 x double>, ptr [[TMP40]], align 8
354 ; CHECK-NEXT: [[VEC_GEP132:%.*]] = getelementptr double, ptr [[A]], i64 14
355 ; CHECK-NEXT: [[COL_LOAD133:%.*]] = load <2 x double>, ptr [[VEC_GEP132]], align 8
356 ; CHECK-NEXT: [[SPLAT_SPLAT140:%.*]] = shufflevector <2 x double> [[COL_LOAD131]], <2 x double> poison, <2 x i32> zeroinitializer
357 ; CHECK-NEXT: [[TMP41:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD131]], <2 x double> [[SPLAT_SPLAT140]], <2 x double> [[TMP37]])
358 ; CHECK-NEXT: [[SPLAT_SPLAT143:%.*]] = shufflevector <2 x double> [[COL_LOAD131]], <2 x double> undef, <2 x i32> <i32 1, i32 1>
359 ; CHECK-NEXT: [[TMP42:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD133]], <2 x double> [[SPLAT_SPLAT143]], <2 x double> [[TMP41]])
360 ; CHECK-NEXT: [[SPLAT_SPLAT147:%.*]] = shufflevector <2 x double> [[COL_LOAD133]], <2 x double> poison, <2 x i32> zeroinitializer
361 ; CHECK-NEXT: [[TMP43:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD131]], <2 x double> [[SPLAT_SPLAT147]], <2 x double> [[TMP39]])
362 ; CHECK-NEXT: [[SPLAT_SPLAT150:%.*]] = shufflevector <2 x double> [[COL_LOAD133]], <2 x double> undef, <2 x i32> <i32 1, i32 1>
363 ; CHECK-NEXT: [[TMP44:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD133]], <2 x double> [[SPLAT_SPLAT150]], <2 x double> [[TMP43]])
364 ; CHECK-NEXT: [[TMP45:%.*]] = getelementptr double, ptr [[C]], i64 10
365 ; CHECK-NEXT: store <2 x double> [[TMP42]], ptr [[TMP45]], align 8
366 ; CHECK-NEXT: [[VEC_GEP151:%.*]] = getelementptr double, ptr [[C]], i64 14
367 ; CHECK-NEXT: store <2 x double> [[TMP44]], ptr [[VEC_GEP151]], align 8
368 ; CHECK-NEXT: ret void
371 %a = load <16 x double>, ptr %A, align 8
372 %c = call <16 x double> @llvm.matrix.multiply(<16 x double> %a, <16 x double> %a, i32 4, i32 4, i32 4)
373 store <16 x double> %c, ptr %C, align 8
377 declare <16 x double> @llvm.matrix.multiply(<16 x double>, <16 x double>, i32, i32, i32)