; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt --codegen-opt-level=2 -mtriple=x86_64 -lower-amx-type %s -S | FileCheck %s

define void @combine_amx_cast_inside_bb() {
; CHECK-LABEL: @combine_amx_cast_inside_bb(
; CHECK-NEXT: wrapper_entry:
; CHECK-NEXT: [[TMP0:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 11, i16 40, ptr undef, i64 undef)
; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 11, i16 40, ptr undef, i64 undef, x86_amx [[TMP0]])
; CHECK-NEXT: ret void
;
wrapper_entry:
  %0 = call x86_amx @llvm.x86.tileloadd64.internal(i16 11, i16 40, ptr undef, i64 undef)
  %tmp = call <110 x i32> @llvm.x86.cast.tile.to.vector.v110i32(x86_amx %0)
  %1 = call x86_amx @llvm.x86.cast.vector.to.tile.v110i32(<110 x i32> %tmp)
  call void @llvm.x86.tilestored64.internal(i16 11, i16 40, ptr undef, i64 undef, x86_amx %1)
  ret void
}

; Cases where the amxcast pair can be combined across basic blocks.
; %5 and %6 are combined because every incoming value of %goodphi is a phi or an amxcast.
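;
; A minimal sketch of the fold (an informal reading of the CHECK lines below,
; not an extra test): the vector phi and its surrounding casts
;   %5       = call <110 x i32> @llvm.x86.cast.tile.to.vector.v110i32(x86_amx %4)
;   %goodphi = phi <110 x i32> [ %tmp, %wrapper_entry ], [ %5, %for.body.i.lr.ph.i ]
;   %6       = call x86_amx @llvm.x86.cast.vector.to.tile.v110i32(<110 x i32> %goodphi)
; collapse into a single x86_amx phi of %0 and %4, which then feeds the
; tilestore directly.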
define void @combine_amx_cast_and_phi(i1 %arg) {
; CHECK-LABEL: @combine_amx_cast_and_phi(
; CHECK-NEXT: wrapper_entry:
; CHECK-NEXT: [[TMP0:%.*]] = alloca <560 x i8>, align 64
; CHECK-NEXT: [[TMP1:%.*]] = alloca <616 x i8>, align 64
; CHECK-NEXT: [[TMP2:%.*]] = alloca <110 x i32>, align 64
; CHECK-NEXT: [[TMP3:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 11, i16 40, ptr undef, i64 undef)
; CHECK-NEXT: br i1 [[ARG:%.*]], label [[FOR_COND_CLEANUP_I_I:%.*]], label [[FOR_BODY_I_LR_PH_I:%.*]]
; CHECK: for.body.i.lr.ph.i:
; CHECK-NEXT: store <110 x i32> undef, ptr [[TMP2]], align 512
; CHECK-NEXT: [[TMP5:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 11, i16 40, ptr [[TMP2]], i64 40)
; CHECK-NEXT: store <616 x i8> undef, ptr [[TMP1]], align 1024
; CHECK-NEXT: [[TMP7:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 11, i16 56, ptr [[TMP1]], i64 56)
; CHECK-NEXT: store <560 x i8> undef, ptr [[TMP0]], align 1024
; CHECK-NEXT: [[TMP9:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 14, i16 40, ptr [[TMP0]], i64 40)
; CHECK-NEXT: [[TMP10:%.*]] = call x86_amx @llvm.x86.tdpbssd.internal(i16 11, i16 40, i16 56, x86_amx [[TMP5]], x86_amx [[TMP7]], x86_amx [[TMP9]])
; CHECK-NEXT: br label [[FOR_COND_CLEANUP_I_I]]
; CHECK: for.cond.cleanup.i.i:
; CHECK-NEXT: [[TMP11:%.*]] = phi x86_amx [ [[TMP3]], [[WRAPPER_ENTRY:%.*]] ], [ [[TMP10]], [[FOR_BODY_I_LR_PH_I]] ]
; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 11, i16 40, ptr undef, i64 undef, x86_amx [[TMP11]])
; CHECK-NEXT: ret void
;
wrapper_entry:
  %0 = call x86_amx @llvm.x86.tileloadd64.internal(i16 11, i16 40, ptr undef, i64 undef)
  %tmp = call <110 x i32> @llvm.x86.cast.tile.to.vector.v110i32(x86_amx %0)
  br i1 %arg, label %for.cond.cleanup.i.i, label %for.body.i.lr.ph.i

for.body.i.lr.ph.i: ; preds = %wrapper_entry
  %1 = call x86_amx @llvm.x86.cast.vector.to.tile.v110i32(<110 x i32> undef)
  %2 = call x86_amx @llvm.x86.cast.vector.to.tile.v616i8(<616 x i8> undef)
  %3 = call x86_amx @llvm.x86.cast.vector.to.tile.v560i8(<560 x i8> undef)
  %4 = call x86_amx @llvm.x86.tdpbssd.internal(i16 11, i16 40, i16 56, x86_amx %1, x86_amx %2, x86_amx %3)
  %5 = call <110 x i32> @llvm.x86.cast.tile.to.vector.v110i32(x86_amx %4)
  br label %for.cond.cleanup.i.i

for.cond.cleanup.i.i: ; preds = %for.body.i.lr.ph.i, %wrapper_entry
  %goodphi = phi <110 x i32> [ %tmp, %wrapper_entry ], [ %5, %for.body.i.lr.ph.i ]
  %6 = call x86_amx @llvm.x86.cast.vector.to.tile.v110i32(<110 x i32> %goodphi)
  call void @llvm.x86.tilestored64.internal(i16 11, i16 40, ptr undef, i64 undef, x86_amx %6)
  ret void
}

; Cases where the amxcast pair can't be combined across basic blocks.
; %5 and %6 are not combined because one of %evilphi's incoming values is neither a phi nor an amxcast.
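;
; A sketch of the blocking shape (mirrors the body below, not an extra test):
;   %0       = add <110 x i32> %tmp, %tmp   ; plain vector math, neither phi nor amxcast
;   %evilphi = phi <110 x i32> [ %0, %wrapper_entry ], [ %5, %for.body.i.lr.ph.i ]
;   %6       = call x86_amx @llvm.x86.cast.vector.to.tile.v110i32(<110 x i32> %evilphi)
; Since %0 cannot be rewritten as an x86_amx value, the cast of %evilphi is
; instead lowered through a stack slot (a <110 x i32> store followed by
; tileloadd64), as the CHECK lines below verify.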
define void @fail_to_combine_amx_cast_and_phi(<110 x i32> %tmp, i1 %arg) {
; CHECK-LABEL: @fail_to_combine_amx_cast_and_phi(
; CHECK-NEXT: wrapper_entry:
; CHECK-NEXT: [[TMP0:%.*]] = alloca <110 x i32>, align 64
; CHECK-NEXT: [[TMP1:%.*]] = alloca <110 x i32>, align 64
; CHECK-NEXT: [[TMP2:%.*]] = alloca <560 x i8>, align 64
; CHECK-NEXT: [[TMP3:%.*]] = alloca <616 x i8>, align 64
; CHECK-NEXT: [[TMP4:%.*]] = alloca <110 x i32>, align 64
; CHECK-NEXT: [[TMP5:%.*]] = add <110 x i32> [[TMP:%.*]], [[TMP]]
; CHECK-NEXT: br i1 [[ARG:%.*]], label [[FOR_COND_CLEANUP_I_I:%.*]], label [[FOR_BODY_I_LR_PH_I:%.*]]
; CHECK: for.body.i.lr.ph.i:
; CHECK-NEXT: store <110 x i32> undef, ptr [[TMP4]], align 512
; CHECK-NEXT: [[TMP7:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 11, i16 40, ptr [[TMP4]], i64 40)
; CHECK-NEXT: store <616 x i8> undef, ptr [[TMP3]], align 1024
; CHECK-NEXT: [[TMP9:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 11, i16 56, ptr [[TMP3]], i64 56)
; CHECK-NEXT: store <560 x i8> undef, ptr [[TMP2]], align 1024
; CHECK-NEXT: [[TMP11:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 14, i16 40, ptr [[TMP2]], i64 40)
; CHECK-NEXT: [[TMP12:%.*]] = call x86_amx @llvm.x86.tdpbssd.internal(i16 11, i16 40, i16 56, x86_amx [[TMP7]], x86_amx [[TMP9]], x86_amx [[TMP11]])
; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 11, i16 40, ptr [[TMP1]], i64 40, x86_amx [[TMP12]])
; CHECK-NEXT: [[TMP14:%.*]] = load <110 x i32>, ptr [[TMP1]], align 512
; CHECK-NEXT: br label [[FOR_COND_CLEANUP_I_I]]
; CHECK: for.cond.cleanup.i.i:
; CHECK-NEXT: [[EVILPHI:%.*]] = phi <110 x i32> [ [[TMP5]], [[WRAPPER_ENTRY:%.*]] ], [ [[TMP14]], [[FOR_BODY_I_LR_PH_I]] ]
; CHECK-NEXT: store <110 x i32> [[EVILPHI]], ptr [[TMP0]], align 512
; CHECK-NEXT: [[TMP16:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 11, i16 40, ptr [[TMP0]], i64 40)
; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 11, i16 40, ptr undef, i64 undef, x86_amx [[TMP16]])
; CHECK-NEXT: ret void
;
wrapper_entry:
  %0 = add <110 x i32> %tmp, %tmp
  br i1 %arg, label %for.cond.cleanup.i.i, label %for.body.i.lr.ph.i

for.body.i.lr.ph.i: ; preds = %wrapper_entry
  %1 = call x86_amx @llvm.x86.cast.vector.to.tile.v110i32(<110 x i32> undef)
  %2 = call x86_amx @llvm.x86.cast.vector.to.tile.v616i8(<616 x i8> undef)
  %3 = call x86_amx @llvm.x86.cast.vector.to.tile.v560i8(<560 x i8> undef)
  %4 = call x86_amx @llvm.x86.tdpbssd.internal(i16 11, i16 40, i16 56, x86_amx %1, x86_amx %2, x86_amx %3)
  %5 = call <110 x i32> @llvm.x86.cast.tile.to.vector.v110i32(x86_amx %4)
  br label %for.cond.cleanup.i.i

for.cond.cleanup.i.i: ; preds = %for.body.i.lr.ph.i, %wrapper_entry
  %evilphi = phi <110 x i32> [ %0, %wrapper_entry ], [ %5, %for.body.i.lr.ph.i ]
  %6 = call x86_amx @llvm.x86.cast.vector.to.tile.v110i32(<110 x i32> %evilphi)
  call void @llvm.x86.tilestored64.internal(i16 11, i16 40, ptr undef, i64 undef, x86_amx %6)
  ret void
}

; Cases where the amxcast pair can't be combined across basic blocks.
; %5 and %6 are not combined because %goodphi's user, %evilphi2, is not inside the phi web.
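;
; A sketch of why the web is rejected (mirrors the body below, not an extra test):
;   %goodphi  = phi <110 x i32> [ %tmp, %wrapper_entry ], [ %5, %for.body.i.lr.ph.i ]
;   %evilphi2 = phi <110 x i32> [ %goodphi, %for.cond.cleanup.i.i ], [ %5, %for.body.i.lr.ph.i ]
;   store <110 x i32> %evilphi2, ptr undef, align 512
; %evilphi2 escapes into a plain vector store rather than an amxcast, so the
; phi web is not rewritten to x86_amx and %goodphi round-trips through a stack
; slot before the tilestore, as the CHECK lines below verify.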
define void @fail_to_combine_amx_cast_and_phi2(i1 %arg) {
; CHECK-LABEL: @fail_to_combine_amx_cast_and_phi2(
; CHECK-NEXT: wrapper_entry:
; CHECK-NEXT: [[TMP0:%.*]] = alloca <110 x i32>, align 64
; CHECK-NEXT: [[TMP1:%.*]] = alloca <110 x i32>, align 64
; CHECK-NEXT: [[TMP2:%.*]] = alloca <560 x i8>, align 64
; CHECK-NEXT: [[TMP3:%.*]] = alloca <616 x i8>, align 64
; CHECK-NEXT: [[TMP4:%.*]] = alloca <110 x i32>, align 64
; CHECK-NEXT: [[TMP5:%.*]] = alloca <110 x i32>, align 64
; CHECK-NEXT: [[TMP6:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 11, i16 40, ptr undef, i64 undef)
; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 11, i16 40, ptr [[TMP5]], i64 40, x86_amx [[TMP6]])
; CHECK-NEXT: [[TMP8:%.*]] = load <110 x i32>, ptr [[TMP5]], align 512
; CHECK-NEXT: br i1 [[ARG:%.*]], label [[FOR_COND_CLEANUP_I_I:%.*]], label [[FOR_BODY_I_LR_PH_I:%.*]]
; CHECK: for.body.i.lr.ph.i:
; CHECK-NEXT: store <110 x i32> undef, ptr [[TMP4]], align 512
; CHECK-NEXT: [[TMP10:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 11, i16 40, ptr [[TMP4]], i64 40)
; CHECK-NEXT: store <616 x i8> undef, ptr [[TMP3]], align 1024
; CHECK-NEXT: [[TMP12:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 11, i16 56, ptr [[TMP3]], i64 56)
; CHECK-NEXT: store <560 x i8> undef, ptr [[TMP2]], align 1024
; CHECK-NEXT: [[TMP14:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 14, i16 40, ptr [[TMP2]], i64 40)
; CHECK-NEXT: [[TMP15:%.*]] = call x86_amx @llvm.x86.tdpbssd.internal(i16 11, i16 40, i16 56, x86_amx [[TMP10]], x86_amx [[TMP12]], x86_amx [[TMP14]])
; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 11, i16 40, ptr [[TMP1]], i64 40, x86_amx [[TMP15]])
; CHECK-NEXT: [[TMP17:%.*]] = load <110 x i32>, ptr [[TMP1]], align 512
; CHECK-NEXT: br i1 [[ARG]], label [[FOR_COND_CLEANUP_I_I]], label [[EXIT:%.*]]
; CHECK: for.cond.cleanup.i.i:
; CHECK-NEXT: [[GOODPHI:%.*]] = phi <110 x i32> [ [[TMP8]], [[WRAPPER_ENTRY:%.*]] ], [ [[TMP17]], [[FOR_BODY_I_LR_PH_I]] ]
; CHECK-NEXT: store <110 x i32> [[GOODPHI]], ptr [[TMP0]], align 512
; CHECK-NEXT: [[TMP19:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 11, i16 40, ptr [[TMP0]], i64 40)
; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 11, i16 40, ptr undef, i64 undef, x86_amx [[TMP19]])
; CHECK-NEXT: br i1 [[ARG]], label [[EXIT]], label [[FOR_BODY_I_LR_PH_I]]
; CHECK: exit:
; CHECK-NEXT: [[EVILPHI2:%.*]] = phi <110 x i32> [ [[GOODPHI]], [[FOR_COND_CLEANUP_I_I]] ], [ [[TMP17]], [[FOR_BODY_I_LR_PH_I]] ]
; CHECK-NEXT: store <110 x i32> [[EVILPHI2]], ptr undef, align 512
; CHECK-NEXT: ret void
;
wrapper_entry:
  %0 = call x86_amx @llvm.x86.tileloadd64.internal(i16 11, i16 40, ptr undef, i64 undef)
  %tmp = call <110 x i32> @llvm.x86.cast.tile.to.vector.v110i32(x86_amx %0)
  br i1 %arg, label %for.cond.cleanup.i.i, label %for.body.i.lr.ph.i

for.body.i.lr.ph.i: ; preds = %wrapper_entry
  %1 = call x86_amx @llvm.x86.cast.vector.to.tile.v110i32(<110 x i32> undef)
  %2 = call x86_amx @llvm.x86.cast.vector.to.tile.v616i8(<616 x i8> undef)
  %3 = call x86_amx @llvm.x86.cast.vector.to.tile.v560i8(<560 x i8> undef)
  %4 = call x86_amx @llvm.x86.tdpbssd.internal(i16 11, i16 40, i16 56, x86_amx %1, x86_amx %2, x86_amx %3)
  %5 = call <110 x i32> @llvm.x86.cast.tile.to.vector.v110i32(x86_amx %4)
  br i1 %arg, label %for.cond.cleanup.i.i, label %exit

for.cond.cleanup.i.i: ; preds = %for.body.i.lr.ph.i, %wrapper_entry
  %goodphi = phi <110 x i32> [ %tmp, %wrapper_entry ], [ %5, %for.body.i.lr.ph.i ]
  %6 = call x86_amx @llvm.x86.cast.vector.to.tile.v110i32(<110 x i32> %goodphi)
  call void @llvm.x86.tilestored64.internal(i16 11, i16 40, ptr undef, i64 undef, x86_amx %6)
  br i1 %arg, label %exit, label %for.body.i.lr.ph.i

exit: ; preds = %for.cond.cleanup.i.i, %for.body.i.lr.ph.i
  %evilphi2 = phi <110 x i32> [ %goodphi, %for.cond.cleanup.i.i ], [ %5, %for.body.i.lr.ph.i ]
  store <110 x i32> %evilphi2, ptr undef, align 512
  ret void
}

define void @fail_to_combine_amx_cast_and_phi_due_to_const_value(i1 %arg) {
; CHECK-LABEL: @fail_to_combine_amx_cast_and_phi_due_to_const_value(
; CHECK-NEXT: wrapper_entry:
; CHECK-NEXT: [[TMP0:%.*]] = alloca <560 x i8>, align 64
; CHECK-NEXT: [[TMP1:%.*]] = alloca <616 x i8>, align 64
; CHECK-NEXT: [[TMP2:%.*]] = alloca <110 x i32>, align 64
; CHECK-NEXT: [[TMP3:%.*]] = call x86_amx @llvm.x86.tilezero.internal(i16 11, i16 40)
; CHECK-NEXT: br i1 [[ARG:%.*]], label [[FOR_COND_CLEANUP_I_I:%.*]], label [[FOR_BODY_I_LR_PH_I:%.*]]
; CHECK: for.body.i.lr.ph.i:
; CHECK-NEXT: store <110 x i32> undef, ptr [[TMP2]], align 512
; CHECK-NEXT: [[TMP5:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 11, i16 40, ptr [[TMP2]], i64 40)
; CHECK-NEXT: store <616 x i8> undef, ptr [[TMP1]], align 1024
; CHECK-NEXT: [[TMP7:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 11, i16 56, ptr [[TMP1]], i64 56)
; CHECK-NEXT: store <560 x i8> undef, ptr [[TMP0]], align 1024
; CHECK-NEXT: [[TMP9:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 14, i16 40, ptr [[TMP0]], i64 40)
; CHECK-NEXT: [[TMP10:%.*]] = call x86_amx @llvm.x86.tdpbssd.internal(i16 11, i16 40, i16 56, x86_amx [[TMP5]], x86_amx [[TMP7]], x86_amx [[TMP9]])
; CHECK-NEXT: br label [[FOR_COND_CLEANUP_I_I]]
; CHECK: for.cond.cleanup.i.i:
; CHECK-NEXT: [[TMP11:%.*]] = phi x86_amx [ [[TMP3]], [[WRAPPER_ENTRY:%.*]] ], [ [[TMP10]], [[FOR_BODY_I_LR_PH_I]] ]
; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 11, i16 40, ptr undef, i64 undef, x86_amx [[TMP11]])
; CHECK-NEXT: ret void
;
wrapper_entry:
  br i1 %arg, label %for.cond.cleanup.i.i, label %for.body.i.lr.ph.i

for.body.i.lr.ph.i: ; preds = %wrapper_entry
  %0 = call x86_amx @llvm.x86.cast.vector.to.tile.v110i32(<110 x i32> undef)
  %1 = call x86_amx @llvm.x86.cast.vector.to.tile.v616i8(<616 x i8> undef)
  %2 = call x86_amx @llvm.x86.cast.vector.to.tile.v560i8(<560 x i8> undef)
  %3 = call x86_amx @llvm.x86.tdpbssd.internal(i16 11, i16 40, i16 56, x86_amx %0, x86_amx %1, x86_amx %2)
  %4 = call <110 x i32> @llvm.x86.cast.tile.to.vector.v110i32(x86_amx %3)
  br label %for.cond.cleanup.i.i

for.cond.cleanup.i.i: ; preds = %for.body.i.lr.ph.i, %wrapper_entry
  %evilphi = phi <110 x i32> [ undef, %wrapper_entry ], [ %4, %for.body.i.lr.ph.i ]
  %5 = call x86_amx @llvm.x86.cast.vector.to.tile.v110i32(<110 x i32> %evilphi)
  call void @llvm.x86.tilestored64.internal(i16 11, i16 40, ptr undef, i64 undef, x86_amx %5)
  ret void
}

; Cases where the amxcast pair can be combined across basic blocks.
; When optimizeAMXCastFromPhi processes %6 and %goodphi, %evilphi2 is outside the phi web, so the optimization stops.
; When optimizeAMXCastFromPhi later processes %7 and %evilphi2, the optimization continues.
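;
; A rough sketch of the two-step traversal (an informal reading of the CHECK
; lines below, not an extra test): visiting
;   %6 = call x86_amx @llvm.x86.cast.vector.to.tile.v110i32(<110 x i32> %goodphi)
; alone stops at %evilphi2, but visiting
;   %7 = call x86_amx @llvm.x86.cast.vector.to.tile.v110i32(<110 x i32> %evilphi2)
; covers the remaining phi, so both %goodphi and %evilphi2 end up as x86_amx phis.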
define void @combine_amx_cast_and_multiple_phi(i1 %arg) {
; CHECK-LABEL: @combine_amx_cast_and_multiple_phi(
; CHECK-NEXT: wrapper_entry:
; CHECK-NEXT: [[TMP0:%.*]] = alloca <560 x i8>, align 64
; CHECK-NEXT: [[TMP1:%.*]] = alloca <616 x i8>, align 64
; CHECK-NEXT: [[TMP2:%.*]] = alloca <110 x i32>, align 64
; CHECK-NEXT: [[TMP3:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 11, i16 40, ptr undef, i64 undef)
; CHECK-NEXT: br i1 [[ARG:%.*]], label [[FOR_COND_CLEANUP_I_I:%.*]], label [[FOR_BODY_I_LR_PH_I:%.*]]
; CHECK: for.body.i.lr.ph.i:
; CHECK-NEXT: store <110 x i32> undef, ptr [[TMP2]], align 512
; CHECK-NEXT: [[TMP5:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 11, i16 40, ptr [[TMP2]], i64 40)
; CHECK-NEXT: store <616 x i8> undef, ptr [[TMP1]], align 1024
; CHECK-NEXT: [[TMP7:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 11, i16 56, ptr [[TMP1]], i64 56)
; CHECK-NEXT: store <560 x i8> undef, ptr [[TMP0]], align 1024
; CHECK-NEXT: [[TMP9:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 14, i16 40, ptr [[TMP0]], i64 40)
; CHECK-NEXT: [[TMP10:%.*]] = call x86_amx @llvm.x86.tdpbssd.internal(i16 11, i16 40, i16 56, x86_amx [[TMP5]], x86_amx [[TMP7]], x86_amx [[TMP9]])
; CHECK-NEXT: br i1 [[ARG]], label [[FOR_COND_CLEANUP_I_I]], label [[EXIT:%.*]]
; CHECK: for.cond.cleanup.i.i:
; CHECK-NEXT: [[TMP11:%.*]] = phi x86_amx [ [[TMP3]], [[WRAPPER_ENTRY:%.*]] ], [ [[TMP10]], [[FOR_BODY_I_LR_PH_I]] ]
; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 11, i16 40, ptr undef, i64 undef, x86_amx [[TMP11]])
; CHECK-NEXT: br i1 [[ARG]], label [[EXIT]], label [[FOR_BODY_I_LR_PH_I]]
; CHECK: exit:
; CHECK-NEXT: [[TMP12:%.*]] = phi x86_amx [ [[TMP11]], [[FOR_COND_CLEANUP_I_I]] ], [ [[TMP10]], [[FOR_BODY_I_LR_PH_I]] ]
; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 11, i16 40, ptr undef, i64 undef, x86_amx [[TMP12]])
; CHECK-NEXT: ret void
;
wrapper_entry:
  %0 = call x86_amx @llvm.x86.tileloadd64.internal(i16 11, i16 40, ptr undef, i64 undef)
  %tmp = call <110 x i32> @llvm.x86.cast.tile.to.vector.v110i32(x86_amx %0)
  br i1 %arg, label %for.cond.cleanup.i.i, label %for.body.i.lr.ph.i

for.body.i.lr.ph.i: ; preds = %wrapper_entry
  %1 = call x86_amx @llvm.x86.cast.vector.to.tile.v110i32(<110 x i32> undef)
  %2 = call x86_amx @llvm.x86.cast.vector.to.tile.v616i8(<616 x i8> undef)
  %3 = call x86_amx @llvm.x86.cast.vector.to.tile.v560i8(<560 x i8> undef)
  %4 = call x86_amx @llvm.x86.tdpbssd.internal(i16 11, i16 40, i16 56, x86_amx %1, x86_amx %2, x86_amx %3)
  %5 = call <110 x i32> @llvm.x86.cast.tile.to.vector.v110i32(x86_amx %4)
  br i1 %arg, label %for.cond.cleanup.i.i, label %exit

for.cond.cleanup.i.i: ; preds = %for.body.i.lr.ph.i, %wrapper_entry
  %goodphi = phi <110 x i32> [ %tmp, %wrapper_entry ], [ %5, %for.body.i.lr.ph.i ]
  %6 = call x86_amx @llvm.x86.cast.vector.to.tile.v110i32(<110 x i32> %goodphi)
  call void @llvm.x86.tilestored64.internal(i16 11, i16 40, ptr undef, i64 undef, x86_amx %6)
  br i1 %arg, label %exit, label %for.body.i.lr.ph.i

exit: ; preds = %for.cond.cleanup.i.i, %for.body.i.lr.ph.i
  %evilphi2 = phi <110 x i32> [ %goodphi, %for.cond.cleanup.i.i ], [ %5, %for.body.i.lr.ph.i ]
  %7 = call x86_amx @llvm.x86.cast.vector.to.tile.v110i32(<110 x i32> %evilphi2)
  call void @llvm.x86.tilestored64.internal(i16 11, i16 40, ptr undef, i64 undef, x86_amx %7)
  ret void
}

; Currently we are not able to delete the dead phi cycle; we will handle it later.
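;
; A sketch of the leftover (an informal reading of the CHECK lines below, not an
; extra test): the casts are rewritten to x86_amx phis, but the original vector
; phis
;   %goodphi  = phi <110 x i32> [ %evilphi2, %bb3 ], [ %5, %bb1 ]
;   %evilphi2 = phi <110 x i32> [ %goodphi, %bb2 ], [ %5, %bb1 ]
; then only feed each other, forming a dead cycle that the pass does not yet
; erase.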
define void @combine_amx_cast_and_phi_in_a_circle(i1 %arg) {
; CHECK-LABEL: @combine_amx_cast_and_phi_in_a_circle(
; CHECK-NEXT: wrapper_entry:
; CHECK-NEXT: [[TMP0:%.*]] = alloca <110 x i32>, align 64
; CHECK-NEXT: [[TMP1:%.*]] = alloca <560 x i8>, align 64
; CHECK-NEXT: [[TMP2:%.*]] = alloca <616 x i8>, align 64
; CHECK-NEXT: [[TMP3:%.*]] = alloca <110 x i32>, align 64
; CHECK-NEXT: [[TMP4:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 11, i16 40, ptr undef, i64 undef)
; CHECK-NEXT: br label [[BB1:%.*]]
; CHECK: bb1:
; CHECK-NEXT: store <110 x i32> undef, ptr [[TMP3]], align 512
; CHECK-NEXT: [[TMP6:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 11, i16 40, ptr [[TMP3]], i64 40)
; CHECK-NEXT: store <616 x i8> undef, ptr [[TMP2]], align 1024
; CHECK-NEXT: [[TMP8:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 11, i16 56, ptr [[TMP2]], i64 56)
; CHECK-NEXT: store <560 x i8> undef, ptr [[TMP1]], align 1024
; CHECK-NEXT: [[TMP10:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 14, i16 40, ptr [[TMP1]], i64 40)
; CHECK-NEXT: [[TMP11:%.*]] = call x86_amx @llvm.x86.tdpbssd.internal(i16 11, i16 40, i16 56, x86_amx [[TMP6]], x86_amx [[TMP8]], x86_amx [[TMP10]])
; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 11, i16 40, ptr [[TMP0]], i64 40, x86_amx [[TMP11]])
; CHECK-NEXT: [[TMP13:%.*]] = load <110 x i32>, ptr [[TMP0]], align 512
; CHECK-NEXT: br i1 [[ARG:%.*]], label [[BB2:%.*]], label [[BB3:%.*]]
; CHECK: bb2:
; CHECK-NEXT: [[TMP14:%.*]] = phi x86_amx [ [[TMP15:%.*]], [[BB3]] ], [ [[TMP11]], [[BB1]] ]
; CHECK-NEXT: [[GOODPHI:%.*]] = phi <110 x i32> [ [[EVILPHI2:%.*]], [[BB3]] ], [ [[TMP13]], [[BB1]] ]
; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 11, i16 40, ptr undef, i64 undef, x86_amx [[TMP14]])
; CHECK-NEXT: br label [[BB3]]
; CHECK: bb3:
; CHECK-NEXT: [[TMP15]] = phi x86_amx [ [[TMP14]], [[BB2]] ], [ [[TMP11]], [[BB1]] ]
; CHECK-NEXT: [[EVILPHI2]] = phi <110 x i32> [ [[GOODPHI]], [[BB2]] ], [ [[TMP13]], [[BB1]] ]
; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 11, i16 40, ptr undef, i64 undef, x86_amx [[TMP15]])
; CHECK-NEXT: br i1 [[ARG]], label [[BB2]], label [[EXIT:%.*]]
; CHECK: exit:
; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 11, i16 40, ptr undef, i64 undef, x86_amx [[TMP15]])
; CHECK-NEXT: ret void
;
wrapper_entry:
  %0 = call x86_amx @llvm.x86.tileloadd64.internal(i16 11, i16 40, ptr undef, i64 undef)
  %tmp = call <110 x i32> @llvm.x86.cast.tile.to.vector.v110i32(x86_amx %0)
  br label %bb1

bb1: ; preds = %wrapper_entry
  %1 = call x86_amx @llvm.x86.cast.vector.to.tile.v110i32(<110 x i32> undef)
  %2 = call x86_amx @llvm.x86.cast.vector.to.tile.v616i8(<616 x i8> undef)
  %3 = call x86_amx @llvm.x86.cast.vector.to.tile.v560i8(<560 x i8> undef)
  %4 = call x86_amx @llvm.x86.tdpbssd.internal(i16 11, i16 40, i16 56, x86_amx %1, x86_amx %2, x86_amx %3)
  %5 = call <110 x i32> @llvm.x86.cast.tile.to.vector.v110i32(x86_amx %4)
  br i1 %arg, label %bb2, label %bb3

bb2: ; preds = %bb1, %bb3
  %goodphi = phi <110 x i32> [ %evilphi2, %bb3 ], [ %5, %bb1 ]
  %6 = call x86_amx @llvm.x86.cast.vector.to.tile.v110i32(<110 x i32> %goodphi)
  call void @llvm.x86.tilestored64.internal(i16 11, i16 40, ptr undef, i64 undef, x86_amx %6)
  br label %bb3

bb3: ; preds = %bb1, %bb2
  %evilphi2 = phi <110 x i32> [ %goodphi, %bb2 ], [ %5, %bb1 ]
  %7 = call x86_amx @llvm.x86.cast.vector.to.tile.v110i32(<110 x i32> %evilphi2)
  call void @llvm.x86.tilestored64.internal(i16 11, i16 40, ptr undef, i64 undef, x86_amx %7)
  br i1 %arg, label %bb2, label %exit

exit: ; preds = %bb3
  %8 = call x86_amx @llvm.x86.cast.vector.to.tile.v110i32(<110 x i32> %evilphi2)
  call void @llvm.x86.tilestored64.internal(i16 11, i16 40, ptr undef, i64 undef, x86_amx %8)
  ret void
}

define void @eliminate_unused_phi_and_cast(i1 %arg) {
; CHECK-LABEL: @eliminate_unused_phi_and_cast(
; CHECK-NEXT: wrapper_entry:
; CHECK-NEXT: [[TMP0:%.*]] = alloca <560 x i8>, align 64
; CHECK-NEXT: [[TMP1:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 11, i16 40, ptr undef, i64 undef)
; CHECK-NEXT: br i1 [[ARG:%.*]], label [[FOR_COND_CLEANUP_I_I:%.*]], label [[FOR_BODY_I_LR_PH_I:%.*]]
; CHECK: for.body.i.lr.ph.i:
; CHECK-NEXT: [[TMP2:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 11, i16 56, ptr undef, i64 undef)
; CHECK-NEXT: [[TMP3:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 14, i16 40, ptr undef, i64 undef)
; CHECK-NEXT: store <560 x i8> undef, ptr [[TMP0]], align 1024
; CHECK-NEXT: [[TMP5:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 14, i16 40, ptr [[TMP0]], i64 40)
; CHECK-NEXT: [[TMP6:%.*]] = call x86_amx @llvm.x86.tdpbssd.internal(i16 11, i16 40, i16 56, x86_amx [[TMP2]], x86_amx [[TMP3]], x86_amx [[TMP5]])
; CHECK-NEXT: br label [[FOR_COND_CLEANUP_I_I]]
; CHECK: for.cond.cleanup.i.i:
; CHECK-NEXT: [[TMP7:%.*]] = phi x86_amx [ [[TMP1]], [[WRAPPER_ENTRY:%.*]] ], [ [[TMP6]], [[FOR_BODY_I_LR_PH_I]] ]
; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 11, i16 40, ptr undef, i64 undef, x86_amx [[TMP7]])
; CHECK-NEXT: ret void
;
wrapper_entry:
  %0 = call x86_amx @llvm.x86.tileloadd64.internal(i16 11, i16 40, ptr undef, i64 undef)
  %tmp = call <110 x i32> @llvm.x86.cast.tile.to.vector.v110i32(x86_amx %0)
  br i1 %arg, label %for.cond.cleanup.i.i, label %for.body.i.lr.ph.i

for.body.i.lr.ph.i: ; preds = %wrapper_entry
  %1 = call x86_amx @llvm.x86.tileloadd64.internal(i16 11, i16 56, ptr undef, i64 undef)
  %v1 = call <110 x i32> @llvm.x86.cast.tile.to.vector.v110i32(x86_amx %1)
  %2 = call x86_amx @llvm.x86.tileloadd64.internal(i16 14, i16 40, ptr undef, i64 undef)
  %v2 = call <616 x i8> @llvm.x86.cast.tile.to.vector.v616i8(x86_amx %2)
  %3 = call x86_amx @llvm.x86.cast.vector.to.tile.v110i32(<110 x i32> %v1)
  %4 = call x86_amx @llvm.x86.cast.vector.to.tile.v616i8(<616 x i8> %v2)
  %5 = call x86_amx @llvm.x86.cast.vector.to.tile.v560i8(<560 x i8> undef)
  %6 = call x86_amx @llvm.x86.tdpbssd.internal(i16 11, i16 40, i16 56, x86_amx %3, x86_amx %4, x86_amx %5)
  %7 = call <110 x i32> @llvm.x86.cast.tile.to.vector.v110i32(x86_amx %6)
  br label %for.cond.cleanup.i.i

for.cond.cleanup.i.i: ; preds = %for.body.i.lr.ph.i, %wrapper_entry
  %goodphi = phi <110 x i32> [ %tmp, %wrapper_entry ], [ %7, %for.body.i.lr.ph.i ]
  %8 = call x86_amx @llvm.x86.cast.vector.to.tile.v110i32(<110 x i32> %goodphi)
  call void @llvm.x86.tilestored64.internal(i16 11, i16 40, ptr undef, i64 undef, x86_amx %8)
  ret void
}

declare x86_amx @llvm.x86.tileloadd64.internal(i16, i16, ptr, i64)
declare <110 x i32> @llvm.x86.cast.tile.to.vector.v110i32(x86_amx)
declare <616 x i8> @llvm.x86.cast.tile.to.vector.v616i8(x86_amx)
declare x86_amx @llvm.x86.cast.vector.to.tile.v110i32(<110 x i32>)
declare void @llvm.x86.tilestored64.internal(i16, i16, ptr, i64, x86_amx)
declare x86_amx @llvm.x86.cast.vector.to.tile.v616i8(<616 x i8>)
declare x86_amx @llvm.x86.cast.vector.to.tile.v560i8(<560 x i8>)
declare x86_amx @llvm.x86.tdpbssd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)