1 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
2 ; RUN: opt -lower-matrix-intrinsics -fuse-matrix-use-loops=false -fuse-matrix-tile-size=1 -matrix-allow-contract -force-fuse-matrix -instcombine -verify-dom-info %s -S | FileCheck %s
3 ; RUN: opt -passes=lower-matrix-intrinsics,instcombine -fuse-matrix-use-loops=false -fuse-matrix-tile-size=1 -matrix-allow-contract -force-fuse-matrix -verify-dom-info %s -S | FileCheck %s
5 ; REQUIRES: aarch64-registered-target
; Module-wide data layout and target triple used by every function in this
; test. The `n` component lists the native integer widths: 8, 16, 32 and 64.
7 target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
8 target triple = "aarch64-apple-ios"
; The store address %c.cast is a bitcast of the argument %C and is defined
; after the multiply. The CHECK lines expect the fused lowering: a runtime
; overlap check between the 32-byte store range at %C and the 32-byte load
; range at %B (with %B copied to an alloca when the ranges overlap), followed
; by single-element loads, fmul/fmuladd and stores addressed directly off %C.
; %A is marked noalias and gets no such check in the expected output.
10 define void @multiply_can_hoist_cast(<4 x double>* noalias %A, <4 x double> * %B, [4 x double]* %C) {
11 ; CHECK-LABEL: @multiply_can_hoist_cast(
13 ; CHECK-NEXT: [[STORE_BEGIN:%.*]] = ptrtoint [4 x double]* [[C:%.*]] to i64
14 ; CHECK-NEXT: [[STORE_END:%.*]] = add nuw nsw i64 [[STORE_BEGIN]], 32
15 ; CHECK-NEXT: [[LOAD_BEGIN:%.*]] = ptrtoint <4 x double>* [[B:%.*]] to i64
16 ; CHECK-NEXT: [[TMP0:%.*]] = icmp ugt i64 [[STORE_END]], [[LOAD_BEGIN]]
17 ; CHECK-NEXT: br i1 [[TMP0]], label [[ALIAS_CONT:%.*]], label [[NO_ALIAS:%.*]]
19 ; CHECK-NEXT: [[LOAD_END:%.*]] = add nuw nsw i64 [[LOAD_BEGIN]], 32
20 ; CHECK-NEXT: [[TMP1:%.*]] = icmp ugt i64 [[LOAD_END]], [[STORE_BEGIN]]
21 ; CHECK-NEXT: br i1 [[TMP1]], label [[COPY:%.*]], label [[NO_ALIAS]]
23 ; CHECK-NEXT: [[TMP2:%.*]] = alloca <4 x double>, align 32
24 ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x double>* [[TMP2]] to i8*
25 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x double>* [[B]] to i8*
26 ; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* noundef nonnull align 32 dereferenceable(32) [[TMP3]], i8* noundef nonnull align 8 dereferenceable(32) [[TMP4]], i64 32, i1 false)
27 ; CHECK-NEXT: br label [[NO_ALIAS]]
29 ; CHECK-NEXT: [[TMP5:%.*]] = phi <4 x double>* [ [[B]], [[ENTRY:%.*]] ], [ [[B]], [[ALIAS_CONT]] ], [ [[TMP2]], [[COPY]] ]
30 ; CHECK-NEXT: [[VEC_CAST:%.*]] = bitcast <4 x double>* [[A:%.*]] to <1 x double>*
31 ; CHECK-NEXT: [[COL_LOAD:%.*]] = load <1 x double>, <1 x double>* [[VEC_CAST]], align 8
32 ; CHECK-NEXT: [[VEC_CAST2:%.*]] = bitcast <4 x double>* [[TMP5]] to <1 x double>*
33 ; CHECK-NEXT: [[COL_LOAD3:%.*]] = load <1 x double>, <1 x double>* [[VEC_CAST2]], align 8
34 ; CHECK-NEXT: [[TMP6:%.*]] = fmul contract <1 x double> [[COL_LOAD]], [[COL_LOAD3]]
35 ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr <4 x double>, <4 x double>* [[A]], i64 0, i64 2
36 ; CHECK-NEXT: [[VEC_CAST5:%.*]] = bitcast double* [[TMP7]] to <1 x double>*
37 ; CHECK-NEXT: [[COL_LOAD6:%.*]] = load <1 x double>, <1 x double>* [[VEC_CAST5]], align 8
38 ; CHECK-NEXT: [[TMP8:%.*]] = getelementptr <4 x double>, <4 x double>* [[TMP5]], i64 0, i64 1
39 ; CHECK-NEXT: [[VEC_CAST8:%.*]] = bitcast double* [[TMP8]] to <1 x double>*
40 ; CHECK-NEXT: [[COL_LOAD9:%.*]] = load <1 x double>, <1 x double>* [[VEC_CAST8]], align 8
41 ; CHECK-NEXT: [[TMP9:%.*]] = call contract <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[COL_LOAD6]], <1 x double> [[COL_LOAD9]], <1 x double> [[TMP6]])
42 ; CHECK-NEXT: [[VEC_CAST15:%.*]] = bitcast [4 x double]* [[C]] to <1 x double>*
43 ; CHECK-NEXT: store <1 x double> [[TMP9]], <1 x double>* [[VEC_CAST15]], align 8
44 ; CHECK-NEXT: [[TMP10:%.*]] = getelementptr <4 x double>, <4 x double>* [[A]], i64 0, i64 1
45 ; CHECK-NEXT: [[VEC_CAST17:%.*]] = bitcast double* [[TMP10]] to <1 x double>*
46 ; CHECK-NEXT: [[COL_LOAD18:%.*]] = load <1 x double>, <1 x double>* [[VEC_CAST17]], align 8
47 ; CHECK-NEXT: [[VEC_CAST20:%.*]] = bitcast <4 x double>* [[TMP5]] to <1 x double>*
48 ; CHECK-NEXT: [[COL_LOAD21:%.*]] = load <1 x double>, <1 x double>* [[VEC_CAST20]], align 8
49 ; CHECK-NEXT: [[TMP11:%.*]] = fmul contract <1 x double> [[COL_LOAD18]], [[COL_LOAD21]]
50 ; CHECK-NEXT: [[TMP12:%.*]] = getelementptr <4 x double>, <4 x double>* [[A]], i64 0, i64 3
51 ; CHECK-NEXT: [[VEC_CAST26:%.*]] = bitcast double* [[TMP12]] to <1 x double>*
52 ; CHECK-NEXT: [[COL_LOAD27:%.*]] = load <1 x double>, <1 x double>* [[VEC_CAST26]], align 8
53 ; CHECK-NEXT: [[TMP13:%.*]] = getelementptr <4 x double>, <4 x double>* [[TMP5]], i64 0, i64 1
54 ; CHECK-NEXT: [[VEC_CAST29:%.*]] = bitcast double* [[TMP13]] to <1 x double>*
55 ; CHECK-NEXT: [[COL_LOAD30:%.*]] = load <1 x double>, <1 x double>* [[VEC_CAST29]], align 8
56 ; CHECK-NEXT: [[TMP14:%.*]] = call contract <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[COL_LOAD27]], <1 x double> [[COL_LOAD30]], <1 x double> [[TMP11]])
57 ; CHECK-NEXT: [[TMP15:%.*]] = getelementptr [4 x double], [4 x double]* [[C]], i64 0, i64 1
58 ; CHECK-NEXT: [[VEC_CAST36:%.*]] = bitcast double* [[TMP15]] to <1 x double>*
59 ; CHECK-NEXT: store <1 x double> [[TMP14]], <1 x double>* [[VEC_CAST36]], align 8
60 ; CHECK-NEXT: [[VEC_CAST38:%.*]] = bitcast <4 x double>* [[A]] to <1 x double>*
61 ; CHECK-NEXT: [[COL_LOAD39:%.*]] = load <1 x double>, <1 x double>* [[VEC_CAST38]], align 8
62 ; CHECK-NEXT: [[TMP16:%.*]] = getelementptr <4 x double>, <4 x double>* [[TMP5]], i64 0, i64 2
63 ; CHECK-NEXT: [[VEC_CAST41:%.*]] = bitcast double* [[TMP16]] to <1 x double>*
64 ; CHECK-NEXT: [[COL_LOAD42:%.*]] = load <1 x double>, <1 x double>* [[VEC_CAST41]], align 8
65 ; CHECK-NEXT: [[TMP17:%.*]] = fmul contract <1 x double> [[COL_LOAD39]], [[COL_LOAD42]]
66 ; CHECK-NEXT: [[TMP18:%.*]] = getelementptr <4 x double>, <4 x double>* [[A]], i64 0, i64 2
67 ; CHECK-NEXT: [[VEC_CAST47:%.*]] = bitcast double* [[TMP18]] to <1 x double>*
68 ; CHECK-NEXT: [[COL_LOAD48:%.*]] = load <1 x double>, <1 x double>* [[VEC_CAST47]], align 8
69 ; CHECK-NEXT: [[TMP19:%.*]] = getelementptr <4 x double>, <4 x double>* [[TMP5]], i64 0, i64 3
70 ; CHECK-NEXT: [[VEC_CAST50:%.*]] = bitcast double* [[TMP19]] to <1 x double>*
71 ; CHECK-NEXT: [[COL_LOAD51:%.*]] = load <1 x double>, <1 x double>* [[VEC_CAST50]], align 8
72 ; CHECK-NEXT: [[TMP20:%.*]] = call contract <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[COL_LOAD48]], <1 x double> [[COL_LOAD51]], <1 x double> [[TMP17]])
73 ; CHECK-NEXT: [[TMP21:%.*]] = getelementptr [4 x double], [4 x double]* [[C]], i64 0, i64 2
74 ; CHECK-NEXT: [[VEC_CAST57:%.*]] = bitcast double* [[TMP21]] to <1 x double>*
75 ; CHECK-NEXT: store <1 x double> [[TMP20]], <1 x double>* [[VEC_CAST57]], align 8
76 ; CHECK-NEXT: [[TMP22:%.*]] = getelementptr <4 x double>, <4 x double>* [[A]], i64 0, i64 1
77 ; CHECK-NEXT: [[VEC_CAST59:%.*]] = bitcast double* [[TMP22]] to <1 x double>*
78 ; CHECK-NEXT: [[COL_LOAD60:%.*]] = load <1 x double>, <1 x double>* [[VEC_CAST59]], align 8
79 ; CHECK-NEXT: [[TMP23:%.*]] = getelementptr <4 x double>, <4 x double>* [[TMP5]], i64 0, i64 2
80 ; CHECK-NEXT: [[VEC_CAST62:%.*]] = bitcast double* [[TMP23]] to <1 x double>*
81 ; CHECK-NEXT: [[COL_LOAD63:%.*]] = load <1 x double>, <1 x double>* [[VEC_CAST62]], align 8
82 ; CHECK-NEXT: [[TMP24:%.*]] = fmul contract <1 x double> [[COL_LOAD60]], [[COL_LOAD63]]
83 ; CHECK-NEXT: [[TMP25:%.*]] = getelementptr <4 x double>, <4 x double>* [[A]], i64 0, i64 3
84 ; CHECK-NEXT: [[VEC_CAST68:%.*]] = bitcast double* [[TMP25]] to <1 x double>*
85 ; CHECK-NEXT: [[COL_LOAD69:%.*]] = load <1 x double>, <1 x double>* [[VEC_CAST68]], align 8
86 ; CHECK-NEXT: [[TMP26:%.*]] = getelementptr <4 x double>, <4 x double>* [[TMP5]], i64 0, i64 3
87 ; CHECK-NEXT: [[VEC_CAST71:%.*]] = bitcast double* [[TMP26]] to <1 x double>*
88 ; CHECK-NEXT: [[COL_LOAD72:%.*]] = load <1 x double>, <1 x double>* [[VEC_CAST71]], align 8
89 ; CHECK-NEXT: [[TMP27:%.*]] = call contract <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[COL_LOAD69]], <1 x double> [[COL_LOAD72]], <1 x double> [[TMP24]])
90 ; CHECK-NEXT: [[TMP28:%.*]] = getelementptr [4 x double], [4 x double]* [[C]], i64 0, i64 3
91 ; CHECK-NEXT: [[VEC_CAST78:%.*]] = bitcast double* [[TMP28]] to <1 x double>*
92 ; CHECK-NEXT: store <1 x double> [[TMP27]], <1 x double>* [[VEC_CAST78]], align 8
93 ; CHECK-NEXT: ret void
96 %a = load <4 x double>, <4 x double>* %A, align 8
97 %b = load <4 x double>, <4 x double>* %B, align 8
98 %c = call <4 x double> @llvm.matrix.multiply(<4 x double> %a, <4 x double> %b, i32 2, i32 2, i32 2)
99 %c.cast = bitcast [4 x double]* %C to <4 x double>*
100 store <4 x double> %c, <4 x double>* %c.cast, align 8
; Here the store address is formed by two instructions placed after the
; multiply: a getelementptr on %C (index 2) followed by a bitcast. The CHECK
; lines expect the getelementptr to appear ahead of the runtime overlap check
; against %B, and the check to use it as the start of the store range.
104 define void @multiply_can_hoist_multiple_insts(<4 x double>* noalias %A, <4 x double> * %B, [4 x double]* %C) {
105 ; CHECK-LABEL: @multiply_can_hoist_multiple_insts(
107 ; CHECK-NEXT: [[GEP:%.*]] = getelementptr [4 x double], [4 x double]* [[C:%.*]], i64 2
108 ; CHECK-NEXT: [[STORE_BEGIN:%.*]] = ptrtoint [4 x double]* [[GEP]] to i64
109 ; CHECK-NEXT: [[STORE_END:%.*]] = add nuw nsw i64 [[STORE_BEGIN]], 32
110 ; CHECK-NEXT: [[LOAD_BEGIN:%.*]] = ptrtoint <4 x double>* [[B:%.*]] to i64
111 ; CHECK-NEXT: [[TMP0:%.*]] = icmp ugt i64 [[STORE_END]], [[LOAD_BEGIN]]
112 ; CHECK-NEXT: br i1 [[TMP0]], label [[ALIAS_CONT:%.*]], label [[NO_ALIAS:%.*]]
114 ; CHECK-NEXT: [[LOAD_END:%.*]] = add nuw nsw i64 [[LOAD_BEGIN]], 32
115 ; CHECK-NEXT: [[TMP1:%.*]] = icmp ugt i64 [[LOAD_END]], [[STORE_BEGIN]]
116 ; CHECK-NEXT: br i1 [[TMP1]], label [[COPY:%.*]], label [[NO_ALIAS]]
118 ; CHECK-NEXT: [[TMP2:%.*]] = alloca <4 x double>, align 32
119 ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x double>* [[TMP2]] to i8*
120 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x double>* [[B]] to i8*
121 ; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* noundef nonnull align 32 dereferenceable(32) [[TMP3]], i8* noundef nonnull align 8 dereferenceable(32) [[TMP4]], i64 32, i1 false)
122 ; CHECK-NEXT: br label [[NO_ALIAS]]
124 ; CHECK-NEXT: [[TMP5:%.*]] = phi <4 x double>* [ [[B]], [[ENTRY:%.*]] ], [ [[B]], [[ALIAS_CONT]] ], [ [[TMP2]], [[COPY]] ]
125 ; CHECK-NEXT: [[VEC_CAST:%.*]] = bitcast <4 x double>* [[A:%.*]] to <1 x double>*
126 ; CHECK-NEXT: [[COL_LOAD:%.*]] = load <1 x double>, <1 x double>* [[VEC_CAST]], align 8
127 ; CHECK-NEXT: [[VEC_CAST2:%.*]] = bitcast <4 x double>* [[TMP5]] to <1 x double>*
128 ; CHECK-NEXT: [[COL_LOAD3:%.*]] = load <1 x double>, <1 x double>* [[VEC_CAST2]], align 8
129 ; CHECK-NEXT: [[TMP6:%.*]] = fmul contract <1 x double> [[COL_LOAD]], [[COL_LOAD3]]
130 ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr <4 x double>, <4 x double>* [[A]], i64 0, i64 2
131 ; CHECK-NEXT: [[VEC_CAST5:%.*]] = bitcast double* [[TMP7]] to <1 x double>*
132 ; CHECK-NEXT: [[COL_LOAD6:%.*]] = load <1 x double>, <1 x double>* [[VEC_CAST5]], align 8
133 ; CHECK-NEXT: [[TMP8:%.*]] = getelementptr <4 x double>, <4 x double>* [[TMP5]], i64 0, i64 1
134 ; CHECK-NEXT: [[VEC_CAST8:%.*]] = bitcast double* [[TMP8]] to <1 x double>*
135 ; CHECK-NEXT: [[COL_LOAD9:%.*]] = load <1 x double>, <1 x double>* [[VEC_CAST8]], align 8
136 ; CHECK-NEXT: [[TMP9:%.*]] = call contract <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[COL_LOAD6]], <1 x double> [[COL_LOAD9]], <1 x double> [[TMP6]])
137 ; CHECK-NEXT: [[VEC_CAST15:%.*]] = bitcast [4 x double]* [[GEP]] to <1 x double>*
138 ; CHECK-NEXT: store <1 x double> [[TMP9]], <1 x double>* [[VEC_CAST15]], align 8
139 ; CHECK-NEXT: [[TMP10:%.*]] = getelementptr <4 x double>, <4 x double>* [[A]], i64 0, i64 1
140 ; CHECK-NEXT: [[VEC_CAST17:%.*]] = bitcast double* [[TMP10]] to <1 x double>*
141 ; CHECK-NEXT: [[COL_LOAD18:%.*]] = load <1 x double>, <1 x double>* [[VEC_CAST17]], align 8
142 ; CHECK-NEXT: [[VEC_CAST20:%.*]] = bitcast <4 x double>* [[TMP5]] to <1 x double>*
143 ; CHECK-NEXT: [[COL_LOAD21:%.*]] = load <1 x double>, <1 x double>* [[VEC_CAST20]], align 8
144 ; CHECK-NEXT: [[TMP11:%.*]] = fmul contract <1 x double> [[COL_LOAD18]], [[COL_LOAD21]]
145 ; CHECK-NEXT: [[TMP12:%.*]] = getelementptr <4 x double>, <4 x double>* [[A]], i64 0, i64 3
146 ; CHECK-NEXT: [[VEC_CAST26:%.*]] = bitcast double* [[TMP12]] to <1 x double>*
147 ; CHECK-NEXT: [[COL_LOAD27:%.*]] = load <1 x double>, <1 x double>* [[VEC_CAST26]], align 8
148 ; CHECK-NEXT: [[TMP13:%.*]] = getelementptr <4 x double>, <4 x double>* [[TMP5]], i64 0, i64 1
149 ; CHECK-NEXT: [[VEC_CAST29:%.*]] = bitcast double* [[TMP13]] to <1 x double>*
150 ; CHECK-NEXT: [[COL_LOAD30:%.*]] = load <1 x double>, <1 x double>* [[VEC_CAST29]], align 8
151 ; CHECK-NEXT: [[TMP14:%.*]] = call contract <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[COL_LOAD27]], <1 x double> [[COL_LOAD30]], <1 x double> [[TMP11]])
152 ; CHECK-NEXT: [[TMP15:%.*]] = getelementptr [4 x double], [4 x double]* [[C]], i64 2, i64 1
153 ; CHECK-NEXT: [[VEC_CAST36:%.*]] = bitcast double* [[TMP15]] to <1 x double>*
154 ; CHECK-NEXT: store <1 x double> [[TMP14]], <1 x double>* [[VEC_CAST36]], align 8
155 ; CHECK-NEXT: [[VEC_CAST38:%.*]] = bitcast <4 x double>* [[A]] to <1 x double>*
156 ; CHECK-NEXT: [[COL_LOAD39:%.*]] = load <1 x double>, <1 x double>* [[VEC_CAST38]], align 8
157 ; CHECK-NEXT: [[TMP16:%.*]] = getelementptr <4 x double>, <4 x double>* [[TMP5]], i64 0, i64 2
158 ; CHECK-NEXT: [[VEC_CAST41:%.*]] = bitcast double* [[TMP16]] to <1 x double>*
159 ; CHECK-NEXT: [[COL_LOAD42:%.*]] = load <1 x double>, <1 x double>* [[VEC_CAST41]], align 8
160 ; CHECK-NEXT: [[TMP17:%.*]] = fmul contract <1 x double> [[COL_LOAD39]], [[COL_LOAD42]]
161 ; CHECK-NEXT: [[TMP18:%.*]] = getelementptr <4 x double>, <4 x double>* [[A]], i64 0, i64 2
162 ; CHECK-NEXT: [[VEC_CAST47:%.*]] = bitcast double* [[TMP18]] to <1 x double>*
163 ; CHECK-NEXT: [[COL_LOAD48:%.*]] = load <1 x double>, <1 x double>* [[VEC_CAST47]], align 8
164 ; CHECK-NEXT: [[TMP19:%.*]] = getelementptr <4 x double>, <4 x double>* [[TMP5]], i64 0, i64 3
165 ; CHECK-NEXT: [[VEC_CAST50:%.*]] = bitcast double* [[TMP19]] to <1 x double>*
166 ; CHECK-NEXT: [[COL_LOAD51:%.*]] = load <1 x double>, <1 x double>* [[VEC_CAST50]], align 8
167 ; CHECK-NEXT: [[TMP20:%.*]] = call contract <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[COL_LOAD48]], <1 x double> [[COL_LOAD51]], <1 x double> [[TMP17]])
168 ; CHECK-NEXT: [[TMP21:%.*]] = getelementptr [4 x double], [4 x double]* [[C]], i64 2, i64 2
169 ; CHECK-NEXT: [[VEC_CAST57:%.*]] = bitcast double* [[TMP21]] to <1 x double>*
170 ; CHECK-NEXT: store <1 x double> [[TMP20]], <1 x double>* [[VEC_CAST57]], align 8
171 ; CHECK-NEXT: [[TMP22:%.*]] = getelementptr <4 x double>, <4 x double>* [[A]], i64 0, i64 1
172 ; CHECK-NEXT: [[VEC_CAST59:%.*]] = bitcast double* [[TMP22]] to <1 x double>*
173 ; CHECK-NEXT: [[COL_LOAD60:%.*]] = load <1 x double>, <1 x double>* [[VEC_CAST59]], align 8
174 ; CHECK-NEXT: [[TMP23:%.*]] = getelementptr <4 x double>, <4 x double>* [[TMP5]], i64 0, i64 2
175 ; CHECK-NEXT: [[VEC_CAST62:%.*]] = bitcast double* [[TMP23]] to <1 x double>*
176 ; CHECK-NEXT: [[COL_LOAD63:%.*]] = load <1 x double>, <1 x double>* [[VEC_CAST62]], align 8
177 ; CHECK-NEXT: [[TMP24:%.*]] = fmul contract <1 x double> [[COL_LOAD60]], [[COL_LOAD63]]
178 ; CHECK-NEXT: [[TMP25:%.*]] = getelementptr <4 x double>, <4 x double>* [[A]], i64 0, i64 3
179 ; CHECK-NEXT: [[VEC_CAST68:%.*]] = bitcast double* [[TMP25]] to <1 x double>*
180 ; CHECK-NEXT: [[COL_LOAD69:%.*]] = load <1 x double>, <1 x double>* [[VEC_CAST68]], align 8
181 ; CHECK-NEXT: [[TMP26:%.*]] = getelementptr <4 x double>, <4 x double>* [[TMP5]], i64 0, i64 3
182 ; CHECK-NEXT: [[VEC_CAST71:%.*]] = bitcast double* [[TMP26]] to <1 x double>*
183 ; CHECK-NEXT: [[COL_LOAD72:%.*]] = load <1 x double>, <1 x double>* [[VEC_CAST71]], align 8
184 ; CHECK-NEXT: [[TMP27:%.*]] = call contract <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[COL_LOAD69]], <1 x double> [[COL_LOAD72]], <1 x double> [[TMP24]])
185 ; CHECK-NEXT: [[TMP28:%.*]] = getelementptr [4 x double], [4 x double]* [[C]], i64 2, i64 3
186 ; CHECK-NEXT: [[VEC_CAST78:%.*]] = bitcast double* [[TMP28]] to <1 x double>*
187 ; CHECK-NEXT: store <1 x double> [[TMP27]], <1 x double>* [[VEC_CAST78]], align 8
188 ; CHECK-NEXT: ret void
191 %a = load <4 x double>, <4 x double>* %A, align 8
192 %b = load <4 x double>, <4 x double>* %B, align 8
193 %c = call <4 x double> @llvm.matrix.multiply(<4 x double> %a, <4 x double> %b, i32 2, i32 2, i32 2)
194 %gep = getelementptr [4 x double], [4 x double]* %C, i32 2
195 %c.cast = bitcast [4 x double]* %gep to <4 x double>*
196 store <4 x double> %c, <4 x double>* %c.cast, align 8
200 ; Make sure the correct instruction order is preserved when hoisting.
; The store address depends on a chain of adds with constant operands
; (10 + 10, then + 2, then the sum of both results: 42) that feeds a
; getelementptr on the bitcast of %C. The CHECK lines expect a single
; getelementptr with constant index 42 ahead of the runtime overlap check,
; and the later stores to be addressed off %C with the same index.
201 define void @multiply_can_hoist_multiple_insts2(<4 x double>* noalias %A, <4 x double> * %B, [4 x double]* %C) {
202 ; CHECK-LABEL: @multiply_can_hoist_multiple_insts2(
204 ; CHECK-NEXT: [[GEP_179:%.*]] = getelementptr [4 x double], [4 x double]* [[C:%.*]], i64 42
205 ; CHECK-NEXT: [[STORE_BEGIN:%.*]] = ptrtoint [4 x double]* [[GEP_179]] to i64
206 ; CHECK-NEXT: [[STORE_END:%.*]] = add nuw nsw i64 [[STORE_BEGIN]], 32
207 ; CHECK-NEXT: [[LOAD_BEGIN:%.*]] = ptrtoint <4 x double>* [[B:%.*]] to i64
208 ; CHECK-NEXT: [[TMP0:%.*]] = icmp ugt i64 [[STORE_END]], [[LOAD_BEGIN]]
209 ; CHECK-NEXT: br i1 [[TMP0]], label [[ALIAS_CONT:%.*]], label [[NO_ALIAS:%.*]]
211 ; CHECK-NEXT: [[LOAD_END:%.*]] = add nuw nsw i64 [[LOAD_BEGIN]], 32
212 ; CHECK-NEXT: [[TMP1:%.*]] = icmp ugt i64 [[LOAD_END]], [[STORE_BEGIN]]
213 ; CHECK-NEXT: br i1 [[TMP1]], label [[COPY:%.*]], label [[NO_ALIAS]]
215 ; CHECK-NEXT: [[TMP2:%.*]] = alloca <4 x double>, align 32
216 ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x double>* [[TMP2]] to i8*
217 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x double>* [[B]] to i8*
218 ; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* noundef nonnull align 32 dereferenceable(32) [[TMP3]], i8* noundef nonnull align 8 dereferenceable(32) [[TMP4]], i64 32, i1 false)
219 ; CHECK-NEXT: br label [[NO_ALIAS]]
221 ; CHECK-NEXT: [[TMP5:%.*]] = phi <4 x double>* [ [[B]], [[ENTRY:%.*]] ], [ [[B]], [[ALIAS_CONT]] ], [ [[TMP2]], [[COPY]] ]
222 ; CHECK-NEXT: [[VEC_CAST:%.*]] = bitcast <4 x double>* [[A:%.*]] to <1 x double>*
223 ; CHECK-NEXT: [[COL_LOAD:%.*]] = load <1 x double>, <1 x double>* [[VEC_CAST]], align 8
224 ; CHECK-NEXT: [[VEC_CAST2:%.*]] = bitcast <4 x double>* [[TMP5]] to <1 x double>*
225 ; CHECK-NEXT: [[COL_LOAD3:%.*]] = load <1 x double>, <1 x double>* [[VEC_CAST2]], align 8
226 ; CHECK-NEXT: [[TMP6:%.*]] = fmul contract <1 x double> [[COL_LOAD]], [[COL_LOAD3]]
227 ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr <4 x double>, <4 x double>* [[A]], i64 0, i64 2
228 ; CHECK-NEXT: [[VEC_CAST5:%.*]] = bitcast double* [[TMP7]] to <1 x double>*
229 ; CHECK-NEXT: [[COL_LOAD6:%.*]] = load <1 x double>, <1 x double>* [[VEC_CAST5]], align 8
230 ; CHECK-NEXT: [[TMP8:%.*]] = getelementptr <4 x double>, <4 x double>* [[TMP5]], i64 0, i64 1
231 ; CHECK-NEXT: [[VEC_CAST8:%.*]] = bitcast double* [[TMP8]] to <1 x double>*
232 ; CHECK-NEXT: [[COL_LOAD9:%.*]] = load <1 x double>, <1 x double>* [[VEC_CAST8]], align 8
233 ; CHECK-NEXT: [[TMP9:%.*]] = call contract <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[COL_LOAD6]], <1 x double> [[COL_LOAD9]], <1 x double> [[TMP6]])
234 ; CHECK-NEXT: [[VEC_CAST15:%.*]] = bitcast [4 x double]* [[GEP_179]] to <1 x double>*
235 ; CHECK-NEXT: store <1 x double> [[TMP9]], <1 x double>* [[VEC_CAST15]], align 8
236 ; CHECK-NEXT: [[TMP10:%.*]] = getelementptr <4 x double>, <4 x double>* [[A]], i64 0, i64 1
237 ; CHECK-NEXT: [[VEC_CAST17:%.*]] = bitcast double* [[TMP10]] to <1 x double>*
238 ; CHECK-NEXT: [[COL_LOAD18:%.*]] = load <1 x double>, <1 x double>* [[VEC_CAST17]], align 8
239 ; CHECK-NEXT: [[VEC_CAST20:%.*]] = bitcast <4 x double>* [[TMP5]] to <1 x double>*
240 ; CHECK-NEXT: [[COL_LOAD21:%.*]] = load <1 x double>, <1 x double>* [[VEC_CAST20]], align 8
241 ; CHECK-NEXT: [[TMP11:%.*]] = fmul contract <1 x double> [[COL_LOAD18]], [[COL_LOAD21]]
242 ; CHECK-NEXT: [[TMP12:%.*]] = getelementptr <4 x double>, <4 x double>* [[A]], i64 0, i64 3
243 ; CHECK-NEXT: [[VEC_CAST26:%.*]] = bitcast double* [[TMP12]] to <1 x double>*
244 ; CHECK-NEXT: [[COL_LOAD27:%.*]] = load <1 x double>, <1 x double>* [[VEC_CAST26]], align 8
245 ; CHECK-NEXT: [[TMP13:%.*]] = getelementptr <4 x double>, <4 x double>* [[TMP5]], i64 0, i64 1
246 ; CHECK-NEXT: [[VEC_CAST29:%.*]] = bitcast double* [[TMP13]] to <1 x double>*
247 ; CHECK-NEXT: [[COL_LOAD30:%.*]] = load <1 x double>, <1 x double>* [[VEC_CAST29]], align 8
248 ; CHECK-NEXT: [[TMP14:%.*]] = call contract <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[COL_LOAD27]], <1 x double> [[COL_LOAD30]], <1 x double> [[TMP11]])
249 ; CHECK-NEXT: [[TMP15:%.*]] = getelementptr [4 x double], [4 x double]* [[C]], i64 42, i64 1
250 ; CHECK-NEXT: [[VEC_CAST36:%.*]] = bitcast double* [[TMP15]] to <1 x double>*
251 ; CHECK-NEXT: store <1 x double> [[TMP14]], <1 x double>* [[VEC_CAST36]], align 8
252 ; CHECK-NEXT: [[VEC_CAST38:%.*]] = bitcast <4 x double>* [[A]] to <1 x double>*
253 ; CHECK-NEXT: [[COL_LOAD39:%.*]] = load <1 x double>, <1 x double>* [[VEC_CAST38]], align 8
254 ; CHECK-NEXT: [[TMP16:%.*]] = getelementptr <4 x double>, <4 x double>* [[TMP5]], i64 0, i64 2
255 ; CHECK-NEXT: [[VEC_CAST41:%.*]] = bitcast double* [[TMP16]] to <1 x double>*
256 ; CHECK-NEXT: [[COL_LOAD42:%.*]] = load <1 x double>, <1 x double>* [[VEC_CAST41]], align 8
257 ; CHECK-NEXT: [[TMP17:%.*]] = fmul contract <1 x double> [[COL_LOAD39]], [[COL_LOAD42]]
258 ; CHECK-NEXT: [[TMP18:%.*]] = getelementptr <4 x double>, <4 x double>* [[A]], i64 0, i64 2
259 ; CHECK-NEXT: [[VEC_CAST47:%.*]] = bitcast double* [[TMP18]] to <1 x double>*
260 ; CHECK-NEXT: [[COL_LOAD48:%.*]] = load <1 x double>, <1 x double>* [[VEC_CAST47]], align 8
261 ; CHECK-NEXT: [[TMP19:%.*]] = getelementptr <4 x double>, <4 x double>* [[TMP5]], i64 0, i64 3
262 ; CHECK-NEXT: [[VEC_CAST50:%.*]] = bitcast double* [[TMP19]] to <1 x double>*
263 ; CHECK-NEXT: [[COL_LOAD51:%.*]] = load <1 x double>, <1 x double>* [[VEC_CAST50]], align 8
264 ; CHECK-NEXT: [[TMP20:%.*]] = call contract <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[COL_LOAD48]], <1 x double> [[COL_LOAD51]], <1 x double> [[TMP17]])
265 ; CHECK-NEXT: [[TMP21:%.*]] = getelementptr [4 x double], [4 x double]* [[C]], i64 42, i64 2
266 ; CHECK-NEXT: [[VEC_CAST57:%.*]] = bitcast double* [[TMP21]] to <1 x double>*
267 ; CHECK-NEXT: store <1 x double> [[TMP20]], <1 x double>* [[VEC_CAST57]], align 8
268 ; CHECK-NEXT: [[TMP22:%.*]] = getelementptr <4 x double>, <4 x double>* [[A]], i64 0, i64 1
269 ; CHECK-NEXT: [[VEC_CAST59:%.*]] = bitcast double* [[TMP22]] to <1 x double>*
270 ; CHECK-NEXT: [[COL_LOAD60:%.*]] = load <1 x double>, <1 x double>* [[VEC_CAST59]], align 8
271 ; CHECK-NEXT: [[TMP23:%.*]] = getelementptr <4 x double>, <4 x double>* [[TMP5]], i64 0, i64 2
272 ; CHECK-NEXT: [[VEC_CAST62:%.*]] = bitcast double* [[TMP23]] to <1 x double>*
273 ; CHECK-NEXT: [[COL_LOAD63:%.*]] = load <1 x double>, <1 x double>* [[VEC_CAST62]], align 8
274 ; CHECK-NEXT: [[TMP24:%.*]] = fmul contract <1 x double> [[COL_LOAD60]], [[COL_LOAD63]]
275 ; CHECK-NEXT: [[TMP25:%.*]] = getelementptr <4 x double>, <4 x double>* [[A]], i64 0, i64 3
276 ; CHECK-NEXT: [[VEC_CAST68:%.*]] = bitcast double* [[TMP25]] to <1 x double>*
277 ; CHECK-NEXT: [[COL_LOAD69:%.*]] = load <1 x double>, <1 x double>* [[VEC_CAST68]], align 8
278 ; CHECK-NEXT: [[TMP26:%.*]] = getelementptr <4 x double>, <4 x double>* [[TMP5]], i64 0, i64 3
279 ; CHECK-NEXT: [[VEC_CAST71:%.*]] = bitcast double* [[TMP26]] to <1 x double>*
280 ; CHECK-NEXT: [[COL_LOAD72:%.*]] = load <1 x double>, <1 x double>* [[VEC_CAST71]], align 8
281 ; CHECK-NEXT: [[TMP27:%.*]] = call contract <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[COL_LOAD69]], <1 x double> [[COL_LOAD72]], <1 x double> [[TMP24]])
282 ; CHECK-NEXT: [[TMP28:%.*]] = getelementptr [4 x double], [4 x double]* [[C]], i64 42, i64 3
283 ; CHECK-NEXT: [[VEC_CAST78:%.*]] = bitcast double* [[TMP28]] to <1 x double>*
284 ; CHECK-NEXT: store <1 x double> [[TMP27]], <1 x double>* [[VEC_CAST78]], align 8
285 ; CHECK-NEXT: ret void
288 %a = load <4 x double>, <4 x double>* %A, align 8
289 %b = load <4 x double>, <4 x double>* %B, align 8
290 %c = call <4 x double> @llvm.matrix.multiply(<4 x double> %a, <4 x double> %b, i32 2, i32 2, i32 2)
291 %c.cast = bitcast [4 x double]* %C to <4 x double>*
292 %off.0 = add i32 10, 10
293 %off.1 = add i32 %off.0, 2
294 %off.2 = add i32 %off.0, %off.1
295 %gep.1 = getelementptr <4 x double>, <4 x double>* %c.cast, i32 %off.2
296 store <4 x double> %c, <4 x double>* %gep.1, align 8
; The store offset is computed from a phi (%p) and, as the function name
; indicates, is not hoisted. The CHECK lines expect the unfused lowering: no
; runtime overlap check and no copy of %B, both operands loaded as two
; <2 x double> columns before the branch, and the result written with two
; <2 x double> stores at index 26 of %C (10 + 2 = 12, 12 + 2 = 14, 12 + 14 = 26).
300 define void @multiply_dont_hoist_phi(<4 x double>* noalias %A, <4 x double> * %B, [4 x double]* %C) {
301 ; CHECK-LABEL: @multiply_dont_hoist_phi(
303 ; CHECK-NEXT: [[VEC_CAST:%.*]] = bitcast <4 x double>* [[A:%.*]] to <2 x double>*
304 ; CHECK-NEXT: [[COL_LOAD:%.*]] = load <2 x double>, <2 x double>* [[VEC_CAST]], align 8
305 ; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr <4 x double>, <4 x double>* [[A]], i64 0, i64 2
306 ; CHECK-NEXT: [[VEC_CAST1:%.*]] = bitcast double* [[VEC_GEP]] to <2 x double>*
307 ; CHECK-NEXT: [[COL_LOAD2:%.*]] = load <2 x double>, <2 x double>* [[VEC_CAST1]], align 8
308 ; CHECK-NEXT: [[VEC_CAST3:%.*]] = bitcast <4 x double>* [[B:%.*]] to <2 x double>*
309 ; CHECK-NEXT: [[COL_LOAD4:%.*]] = load <2 x double>, <2 x double>* [[VEC_CAST3]], align 8
310 ; CHECK-NEXT: [[VEC_GEP5:%.*]] = getelementptr <4 x double>, <4 x double>* [[B]], i64 0, i64 2
311 ; CHECK-NEXT: [[VEC_CAST6:%.*]] = bitcast double* [[VEC_GEP5]] to <2 x double>*
312 ; CHECK-NEXT: [[COL_LOAD7:%.*]] = load <2 x double>, <2 x double>* [[VEC_CAST6]], align 8
313 ; CHECK-NEXT: br label [[NEXT:%.*]]
315 ; CHECK-NEXT: [[SPLAT_SPLAT16:%.*]] = shufflevector <2 x double> [[COL_LOAD7]], <2 x double> undef, <2 x i32> <i32 1, i32 1>
316 ; CHECK-NEXT: [[SPLAT_SPLAT13:%.*]] = shufflevector <2 x double> [[COL_LOAD7]], <2 x double> poison, <2 x i32> zeroinitializer
317 ; CHECK-NEXT: [[TMP0:%.*]] = fmul contract <2 x double> [[COL_LOAD]], [[SPLAT_SPLAT13]]
318 ; CHECK-NEXT: [[TMP1:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD2]], <2 x double> [[SPLAT_SPLAT16]], <2 x double> [[TMP0]])
319 ; CHECK-NEXT: [[SPLAT_SPLAT10:%.*]] = shufflevector <2 x double> [[COL_LOAD4]], <2 x double> undef, <2 x i32> <i32 1, i32 1>
320 ; CHECK-NEXT: [[SPLAT_SPLAT:%.*]] = shufflevector <2 x double> [[COL_LOAD4]], <2 x double> poison, <2 x i32> zeroinitializer
321 ; CHECK-NEXT: [[TMP2:%.*]] = fmul contract <2 x double> [[COL_LOAD]], [[SPLAT_SPLAT]]
322 ; CHECK-NEXT: [[TMP3:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD2]], <2 x double> [[SPLAT_SPLAT10]], <2 x double> [[TMP2]])
323 ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr [4 x double], [4 x double]* [[C:%.*]], i64 26, i64 0
324 ; CHECK-NEXT: [[VEC_CAST17:%.*]] = bitcast double* [[TMP4]] to <2 x double>*
325 ; CHECK-NEXT: store <2 x double> [[TMP3]], <2 x double>* [[VEC_CAST17]], align 8
326 ; CHECK-NEXT: [[VEC_GEP18:%.*]] = getelementptr [4 x double], [4 x double]* [[C]], i64 26, i64 2
327 ; CHECK-NEXT: [[VEC_CAST19:%.*]] = bitcast double* [[VEC_GEP18]] to <2 x double>*
328 ; CHECK-NEXT: store <2 x double> [[TMP1]], <2 x double>* [[VEC_CAST19]], align 8
329 ; CHECK-NEXT: ret void
332 %a = load <4 x double>, <4 x double>* %A, align 8
333 %b = load <4 x double>, <4 x double>* %B, align 8
334 %c = call <4 x double> @llvm.matrix.multiply(<4 x double> %a, <4 x double> %b, i32 2, i32 2, i32 2)
335 %c.cast = bitcast [4 x double]* %C to <4 x double>*
339 %p = phi i32 [ 2, %entry ]
340 %off.0 = add i32 10, %p
341 %off.1 = add i32 %off.0, 2
342 %off.2 = add i32 %off.0, %off.1
343 %gep.1 = getelementptr <4 x double>, <4 x double>* %c.cast, i32 %off.2
344 store <4 x double> %c, <4 x double>* %gep.1, align 8
348 ; The address load may alias, so avoid moving it for now.
; %C is loaded from %C.ptr after the multiply and then bitcast to form the
; store address. The CHECK lines expect the unfused lowering (whole
; <2 x double> column loads, no runtime overlap check), with the load from
; %C.ptr still placed after the multiply code and just before the stores.
349 define void @multiply_dont_hoist_cast_due_to_operand(<4 x double>* noalias %A, <4 x double> * %B, [4 x double]** %C.ptr) {
350 ; CHECK-LABEL: @multiply_dont_hoist_cast_due_to_operand(
352 ; CHECK-NEXT: [[VEC_CAST:%.*]] = bitcast <4 x double>* [[A:%.*]] to <2 x double>*
353 ; CHECK-NEXT: [[COL_LOAD:%.*]] = load <2 x double>, <2 x double>* [[VEC_CAST]], align 8
354 ; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr <4 x double>, <4 x double>* [[A]], i64 0, i64 2
355 ; CHECK-NEXT: [[VEC_CAST1:%.*]] = bitcast double* [[VEC_GEP]] to <2 x double>*
356 ; CHECK-NEXT: [[COL_LOAD2:%.*]] = load <2 x double>, <2 x double>* [[VEC_CAST1]], align 8
357 ; CHECK-NEXT: [[VEC_CAST3:%.*]] = bitcast <4 x double>* [[B:%.*]] to <2 x double>*
358 ; CHECK-NEXT: [[COL_LOAD4:%.*]] = load <2 x double>, <2 x double>* [[VEC_CAST3]], align 8
359 ; CHECK-NEXT: [[VEC_GEP5:%.*]] = getelementptr <4 x double>, <4 x double>* [[B]], i64 0, i64 2
360 ; CHECK-NEXT: [[VEC_CAST6:%.*]] = bitcast double* [[VEC_GEP5]] to <2 x double>*
361 ; CHECK-NEXT: [[COL_LOAD7:%.*]] = load <2 x double>, <2 x double>* [[VEC_CAST6]], align 8
362 ; CHECK-NEXT: [[SPLAT_SPLAT:%.*]] = shufflevector <2 x double> [[COL_LOAD4]], <2 x double> poison, <2 x i32> zeroinitializer
363 ; CHECK-NEXT: [[TMP0:%.*]] = fmul contract <2 x double> [[COL_LOAD]], [[SPLAT_SPLAT]]
364 ; CHECK-NEXT: [[SPLAT_SPLAT10:%.*]] = shufflevector <2 x double> [[COL_LOAD4]], <2 x double> undef, <2 x i32> <i32 1, i32 1>
365 ; CHECK-NEXT: [[TMP1:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD2]], <2 x double> [[SPLAT_SPLAT10]], <2 x double> [[TMP0]])
366 ; CHECK-NEXT: [[SPLAT_SPLAT13:%.*]] = shufflevector <2 x double> [[COL_LOAD7]], <2 x double> poison, <2 x i32> zeroinitializer
367 ; CHECK-NEXT: [[TMP2:%.*]] = fmul contract <2 x double> [[COL_LOAD]], [[SPLAT_SPLAT13]]
368 ; CHECK-NEXT: [[SPLAT_SPLAT16:%.*]] = shufflevector <2 x double> [[COL_LOAD7]], <2 x double> undef, <2 x i32> <i32 1, i32 1>
369 ; CHECK-NEXT: [[TMP3:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD2]], <2 x double> [[SPLAT_SPLAT16]], <2 x double> [[TMP2]])
370 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast [4 x double]** [[C_PTR:%.*]] to double**
371 ; CHECK-NEXT: [[C2021:%.*]] = load double*, double** [[TMP4]], align 8
372 ; CHECK-NEXT: [[VEC_CAST17:%.*]] = bitcast double* [[C2021]] to <2 x double>*
373 ; CHECK-NEXT: store <2 x double> [[TMP1]], <2 x double>* [[VEC_CAST17]], align 8
374 ; CHECK-NEXT: [[VEC_GEP18:%.*]] = getelementptr double, double* [[C2021]], i64 2
375 ; CHECK-NEXT: [[VEC_CAST19:%.*]] = bitcast double* [[VEC_GEP18]] to <2 x double>*
376 ; CHECK-NEXT: store <2 x double> [[TMP3]], <2 x double>* [[VEC_CAST19]], align 8
377 ; CHECK-NEXT: ret void
380 %a = load <4 x double>, <4 x double>* %A, align 8
381 %b = load <4 x double>, <4 x double>* %B, align 8
382 %c = call <4 x double> @llvm.matrix.multiply(<4 x double> %a, <4 x double> %b, i32 2, i32 2, i32 2)
383 %C = load [4 x double]*, [4 x double]** %C.ptr
384 %c.cast = bitcast [4 x double]* %C to <4 x double>*
385 store <4 x double> %c, <4 x double>* %c.cast, align 8
389 ; The address load may alias, so avoid moving it for now.
; %C is loaded from %C.ptr after the multiply and used directly as the store
; address (no intermediate cast). The CHECK lines expect the unfused lowering
; (whole <2 x double> column loads, no runtime overlap check), with the load
; from %C.ptr still placed after the multiply code and just before the stores.
390 define void @multiply_dont_hoist_load(<4 x double>* noalias %A, <4 x double> * %B, <4 x double>** %C.ptr) {
391 ; CHECK-LABEL: @multiply_dont_hoist_load(
393 ; CHECK-NEXT: [[VEC_CAST:%.*]] = bitcast <4 x double>* [[A:%.*]] to <2 x double>*
394 ; CHECK-NEXT: [[COL_LOAD:%.*]] = load <2 x double>, <2 x double>* [[VEC_CAST]], align 8
395 ; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr <4 x double>, <4 x double>* [[A]], i64 0, i64 2
396 ; CHECK-NEXT: [[VEC_CAST1:%.*]] = bitcast double* [[VEC_GEP]] to <2 x double>*
397 ; CHECK-NEXT: [[COL_LOAD2:%.*]] = load <2 x double>, <2 x double>* [[VEC_CAST1]], align 8
398 ; CHECK-NEXT: [[VEC_CAST3:%.*]] = bitcast <4 x double>* [[B:%.*]] to <2 x double>*
399 ; CHECK-NEXT: [[COL_LOAD4:%.*]] = load <2 x double>, <2 x double>* [[VEC_CAST3]], align 8
400 ; CHECK-NEXT: [[VEC_GEP5:%.*]] = getelementptr <4 x double>, <4 x double>* [[B]], i64 0, i64 2
401 ; CHECK-NEXT: [[VEC_CAST6:%.*]] = bitcast double* [[VEC_GEP5]] to <2 x double>*
402 ; CHECK-NEXT: [[COL_LOAD7:%.*]] = load <2 x double>, <2 x double>* [[VEC_CAST6]], align 8
403 ; CHECK-NEXT: [[SPLAT_SPLAT:%.*]] = shufflevector <2 x double> [[COL_LOAD4]], <2 x double> poison, <2 x i32> zeroinitializer
404 ; CHECK-NEXT: [[TMP0:%.*]] = fmul contract <2 x double> [[COL_LOAD]], [[SPLAT_SPLAT]]
405 ; CHECK-NEXT: [[SPLAT_SPLAT10:%.*]] = shufflevector <2 x double> [[COL_LOAD4]], <2 x double> undef, <2 x i32> <i32 1, i32 1>
406 ; CHECK-NEXT: [[TMP1:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD2]], <2 x double> [[SPLAT_SPLAT10]], <2 x double> [[TMP0]])
407 ; CHECK-NEXT: [[SPLAT_SPLAT13:%.*]] = shufflevector <2 x double> [[COL_LOAD7]], <2 x double> poison, <2 x i32> zeroinitializer
408 ; CHECK-NEXT: [[TMP2:%.*]] = fmul contract <2 x double> [[COL_LOAD]], [[SPLAT_SPLAT13]]
409 ; CHECK-NEXT: [[SPLAT_SPLAT16:%.*]] = shufflevector <2 x double> [[COL_LOAD7]], <2 x double> undef, <2 x i32> <i32 1, i32 1>
410 ; CHECK-NEXT: [[TMP3:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD2]], <2 x double> [[SPLAT_SPLAT16]], <2 x double> [[TMP2]])
411 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x double>** [[C_PTR:%.*]] to double**
412 ; CHECK-NEXT: [[C20:%.*]] = load double*, double** [[TMP4]], align 8
413 ; CHECK-NEXT: [[VEC_CAST17:%.*]] = bitcast double* [[C20]] to <2 x double>*
414 ; CHECK-NEXT: store <2 x double> [[TMP1]], <2 x double>* [[VEC_CAST17]], align 8
415 ; CHECK-NEXT: [[VEC_GEP18:%.*]] = getelementptr double, double* [[C20]], i64 2
416 ; CHECK-NEXT: [[VEC_CAST19:%.*]] = bitcast double* [[VEC_GEP18]] to <2 x double>*
417 ; CHECK-NEXT: store <2 x double> [[TMP3]], <2 x double>* [[VEC_CAST19]], align 8
418 ; CHECK-NEXT: ret void
421 %a = load <4 x double>, <4 x double>* %A, align 8
422 %b = load <4 x double>, <4 x double>* %B, align 8
423 %c = call <4 x double> @llvm.matrix.multiply(<4 x double> %a, <4 x double> %b, i32 2, i32 2, i32 2)
424 %C = load <4 x double>*, <4 x double>** %C.ptr
425 store <4 x double> %c, <4 x double>* %C, align 8
429 ; The call to @get_address may clobber memory, so avoid moving it for now.
;
; In this test the store destination %C is the result of a call to the external
; function @get_address, made after the operand loads and the multiply in the
; input IR. The expected output below keeps that call after the multiply
; arithmetic and directly in front of the two column stores. Operands are read
; as whole <2 x double> columns, and the visible expected output contains no
; runtime overlap test or copy of %B, unlike @multiply_can_hoist_cast at the
; top of this file.
430 define void @multiply_dont_hoist_call(<4 x double>* noalias %A, <4 x double> * %B) {
431 ; CHECK-LABEL: @multiply_dont_hoist_call(
433 ; CHECK-NEXT: [[VEC_CAST:%.*]] = bitcast <4 x double>* [[A:%.*]] to <2 x double>*
434 ; CHECK-NEXT: [[COL_LOAD:%.*]] = load <2 x double>, <2 x double>* [[VEC_CAST]], align 8
435 ; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr <4 x double>, <4 x double>* [[A]], i64 0, i64 2
436 ; CHECK-NEXT: [[VEC_CAST1:%.*]] = bitcast double* [[VEC_GEP]] to <2 x double>*
437 ; CHECK-NEXT: [[COL_LOAD2:%.*]] = load <2 x double>, <2 x double>* [[VEC_CAST1]], align 8
438 ; CHECK-NEXT: [[VEC_CAST3:%.*]] = bitcast <4 x double>* [[B:%.*]] to <2 x double>*
439 ; CHECK-NEXT: [[COL_LOAD4:%.*]] = load <2 x double>, <2 x double>* [[VEC_CAST3]], align 8
440 ; CHECK-NEXT: [[VEC_GEP5:%.*]] = getelementptr <4 x double>, <4 x double>* [[B]], i64 0, i64 2
441 ; CHECK-NEXT: [[VEC_CAST6:%.*]] = bitcast double* [[VEC_GEP5]] to <2 x double>*
442 ; CHECK-NEXT: [[COL_LOAD7:%.*]] = load <2 x double>, <2 x double>* [[VEC_CAST6]], align 8
443 ; CHECK-NEXT: [[SPLAT_SPLAT:%.*]] = shufflevector <2 x double> [[COL_LOAD4]], <2 x double> poison, <2 x i32> zeroinitializer
444 ; CHECK-NEXT: [[TMP0:%.*]] = fmul contract <2 x double> [[COL_LOAD]], [[SPLAT_SPLAT]]
445 ; CHECK-NEXT: [[SPLAT_SPLAT10:%.*]] = shufflevector <2 x double> [[COL_LOAD4]], <2 x double> undef, <2 x i32> <i32 1, i32 1>
446 ; CHECK-NEXT: [[TMP1:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD2]], <2 x double> [[SPLAT_SPLAT10]], <2 x double> [[TMP0]])
447 ; CHECK-NEXT: [[SPLAT_SPLAT13:%.*]] = shufflevector <2 x double> [[COL_LOAD7]], <2 x double> poison, <2 x i32> zeroinitializer
448 ; CHECK-NEXT: [[TMP2:%.*]] = fmul contract <2 x double> [[COL_LOAD]], [[SPLAT_SPLAT13]]
449 ; CHECK-NEXT: [[SPLAT_SPLAT16:%.*]] = shufflevector <2 x double> [[COL_LOAD7]], <2 x double> undef, <2 x i32> <i32 1, i32 1>
450 ; CHECK-NEXT: [[TMP3:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD2]], <2 x double> [[SPLAT_SPLAT16]], <2 x double> [[TMP2]])
451 ; CHECK-NEXT: [[C:%.*]] = call <4 x double>* @get_address()
452 ; CHECK-NEXT: [[VEC_CAST17:%.*]] = bitcast <4 x double>* [[C]] to <2 x double>*
453 ; CHECK-NEXT: store <2 x double> [[TMP1]], <2 x double>* [[VEC_CAST17]], align 8
454 ; CHECK-NEXT: [[VEC_GEP18:%.*]] = getelementptr <4 x double>, <4 x double>* [[C]], i64 0, i64 2
455 ; CHECK-NEXT: [[VEC_CAST19:%.*]] = bitcast double* [[VEC_GEP18]] to <2 x double>*
456 ; CHECK-NEXT: store <2 x double> [[TMP3]], <2 x double>* [[VEC_CAST19]], align 8
457 ; CHECK-NEXT: ret void
; Input IR: multiply the <4 x double> values loaded from %A and %B as 2x2
; matrices (the three i32 2 arguments), then obtain the destination pointer
; from @get_address and store the product through it.
460 %a = load <4 x double>, <4 x double>* %A, align 8
461 %b = load <4 x double>, <4 x double>* %B, align 8
462 %c = call <4 x double> @llvm.matrix.multiply(<4 x double> %a, <4 x double> %b, i32 2, i32 2, i32 2)
463 %C = call <4 x double>* @get_address()
464 store <4 x double> %c, <4 x double>* %C, align 8
; External function with no visible body; @multiply_dont_hoist_call uses its
; result as the store destination.
468 declare <4 x double>* @get_address()
; Matrix multiply intrinsic called by the tests above: two flattened
; <4 x double> operands followed by three i32 dimension arguments.
471 declare <4 x double> @llvm.matrix.multiply(<4 x double>, <4 x double>, i32, i32, i32)