1 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
2 ; RUN: opt -mtriple=x86_64 -lower-amx-intrinsics -enable-x86-scalar-amx=true %s -S | FileCheck %s
; Exercises -lower-amx-intrinsics on AMX intrinsic calls that consume and
; produce x86_amx values directly (the input IR below contains no
; <256 x i32> bitcasts).
; The assertions expect:
;   * each tileloadd64 call to become a rows/cols loop nest (4 x 4 iterations)
;     that loads i32 elements from memory index row*4+col and inserts them
;     into a <256 x i32> at index row*16+col (C_mem, then A_mem, then B_mem);
;   * the tdpbssd call to become a rows/cols/inner loop nest that sign-extends
;     the <4 x i8> lanes of one A element and one B element, multiplies them,
;     add-reduces the product and accumulates it into the C vector;
;   * the tilestored64 call to become a loop nest that stores the elements
;     back through the same C_MEM capture bound by the first load loop, so a
;     store through any other pointer fails the match.
; Every basic-block label is matched explicitly: the NEXT-style assertions
; that follow a label only match strictly adjacent output lines.
4 define dso_local void @test_no_bitcast(ptr %A_mem, ptr %B_mem, ptr %C_mem) local_unnamed_addr #0 {
5 ; CHECK-LABEL: @test_no_bitcast(
6 ; CHECK-NEXT: entry:
7 ; CHECK-NEXT: br label [[TILELOAD_SCALARIZE_ROWS_HEADER:%.*]]
8 ; CHECK: tileload.scalarize.rows.header:
9 ; CHECK-NEXT: [[TILELOAD_SCALARIZE_ROWS_IV:%.*]] = phi i16 [ 0, [[ENTRY:%.*]] ], [ [[TILELOAD_SCALARIZE_ROWS_STEP:%.*]], [[TILELOAD_SCALARIZE_ROWS_LATCH:%.*]] ]
10 ; CHECK-NEXT: [[VEC_PHI_ROW:%.*]] = phi <256 x i32> [ zeroinitializer, [[ENTRY]] ], [ [[TMP10:%.*]], [[TILELOAD_SCALARIZE_ROWS_LATCH]] ]
11 ; CHECK-NEXT: br label [[TILELOAD_SCALARIZE_ROWS_BODY:%.*]]
12 ; CHECK: tileload.scalarize.rows.body:
13 ; CHECK-NEXT: br label [[TILELOAD_SCALARIZE_COLS_HEADER:%.*]]
14 ; CHECK: tileload.scalarize.cols.header:
15 ; CHECK-NEXT: [[TILELOAD_SCALARIZE_COLS_IV:%.*]] = phi i16 [ 0, [[TILELOAD_SCALARIZE_ROWS_BODY]] ], [ [[TILELOAD_SCALARIZE_COLS_STEP:%.*]], [[TILELOAD_SCALARIZE_COLS_LATCH:%.*]] ]
16 ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <256 x i32> [ [[VEC_PHI_ROW]], [[TILELOAD_SCALARIZE_ROWS_BODY]] ], [ [[TMP10]], [[TILELOAD_SCALARIZE_COLS_LATCH]] ]
17 ; CHECK-NEXT: br label [[TILELOAD_SCALARIZE_COLS_BODY:%.*]]
18 ; CHECK: tileload.scalarize.cols.body:
19 ; CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[TILELOAD_SCALARIZE_ROWS_IV]] to i64
20 ; CHECK-NEXT: [[TMP2:%.*]] = zext i16 [[TILELOAD_SCALARIZE_COLS_IV]] to i64
21 ; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP1]], 4
22 ; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[TMP3]], [[TMP2]]
23 ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i32, ptr [[C_MEM:%.*]], i64 [[TMP4]]
24 ; CHECK-NEXT: [[TMP7:%.*]] = mul i16 [[TILELOAD_SCALARIZE_ROWS_IV]], 16
25 ; CHECK-NEXT: [[TMP8:%.*]] = add i16 [[TMP7]], [[TILELOAD_SCALARIZE_COLS_IV]]
26 ; CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr [[TMP6]], align 4
27 ; CHECK-NEXT: [[TMP10]] = insertelement <256 x i32> [[VEC_PHI]], i32 [[TMP9]], i16 [[TMP8]]
28 ; CHECK-NEXT: br label [[TILELOAD_SCALARIZE_COLS_LATCH]]
29 ; CHECK: tileload.scalarize.cols.latch:
30 ; CHECK-NEXT: [[TILELOAD_SCALARIZE_COLS_STEP]] = add i16 [[TILELOAD_SCALARIZE_COLS_IV]], 1
31 ; CHECK-NEXT: [[TILELOAD_SCALARIZE_COLS_COND:%.*]] = icmp ne i16 [[TILELOAD_SCALARIZE_COLS_STEP]], 4
32 ; CHECK-NEXT: br i1 [[TILELOAD_SCALARIZE_COLS_COND]], label [[TILELOAD_SCALARIZE_COLS_HEADER]], label [[TILELOAD_SCALARIZE_ROWS_LATCH]]
33 ; CHECK: tileload.scalarize.rows.latch:
34 ; CHECK-NEXT: [[TILELOAD_SCALARIZE_ROWS_STEP]] = add i16 [[TILELOAD_SCALARIZE_ROWS_IV]], 1
35 ; CHECK-NEXT: [[TILELOAD_SCALARIZE_ROWS_COND:%.*]] = icmp ne i16 [[TILELOAD_SCALARIZE_ROWS_STEP]], 4
36 ; CHECK-NEXT: br i1 [[TILELOAD_SCALARIZE_ROWS_COND]], label [[TILELOAD_SCALARIZE_ROWS_HEADER]], label [[CONTINUE:%.*]]
37 ; CHECK: continue:
38 ; CHECK-NEXT: [[TMP11:%.*]] = bitcast <256 x i32> [[TMP10]] to x86_amx
39 ; CHECK-NEXT: br label [[TILELOAD_SCALARIZE_ROWS_HEADER2:%.*]]
40 ; CHECK: tileload.scalarize.rows.header2:
41 ; CHECK-NEXT: [[TILELOAD_SCALARIZE_ROWS_IV5:%.*]] = phi i16 [ 0, [[CONTINUE]] ], [ [[TILELOAD_SCALARIZE_ROWS_STEP6:%.*]], [[TILELOAD_SCALARIZE_ROWS_LATCH4:%.*]] ]
42 ; CHECK-NEXT: [[VEC_PHI_ROW14:%.*]] = phi <256 x i32> [ zeroinitializer, [[CONTINUE]] ], [ [[TMP22:%.*]], [[TILELOAD_SCALARIZE_ROWS_LATCH4]] ]
43 ; CHECK-NEXT: br label [[TILELOAD_SCALARIZE_ROWS_BODY3:%.*]]
44 ; CHECK: tileload.scalarize.rows.body3:
45 ; CHECK-NEXT: br label [[TILELOAD_SCALARIZE_COLS_HEADER8:%.*]]
46 ; CHECK: tileload.scalarize.cols.header8:
47 ; CHECK-NEXT: [[TILELOAD_SCALARIZE_COLS_IV11:%.*]] = phi i16 [ 0, [[TILELOAD_SCALARIZE_ROWS_BODY3]] ], [ [[TILELOAD_SCALARIZE_COLS_STEP12:%.*]], [[TILELOAD_SCALARIZE_COLS_LATCH10:%.*]] ]
48 ; CHECK-NEXT: [[VEC_PHI15:%.*]] = phi <256 x i32> [ [[VEC_PHI_ROW14]], [[TILELOAD_SCALARIZE_ROWS_BODY3]] ], [ [[TMP22]], [[TILELOAD_SCALARIZE_COLS_LATCH10]] ]
49 ; CHECK-NEXT: br label [[TILELOAD_SCALARIZE_COLS_BODY9:%.*]]
50 ; CHECK: tileload.scalarize.cols.body9:
51 ; CHECK-NEXT: [[TMP13:%.*]] = zext i16 [[TILELOAD_SCALARIZE_ROWS_IV5]] to i64
52 ; CHECK-NEXT: [[TMP14:%.*]] = zext i16 [[TILELOAD_SCALARIZE_COLS_IV11]] to i64
53 ; CHECK-NEXT: [[TMP15:%.*]] = mul i64 [[TMP13]], 4
54 ; CHECK-NEXT: [[TMP16:%.*]] = add i64 [[TMP15]], [[TMP14]]
55 ; CHECK-NEXT: [[TMP18:%.*]] = getelementptr i32, ptr [[A_MEM:%.*]], i64 [[TMP16]]
56 ; CHECK-NEXT: [[TMP19:%.*]] = mul i16 [[TILELOAD_SCALARIZE_ROWS_IV5]], 16
57 ; CHECK-NEXT: [[TMP20:%.*]] = add i16 [[TMP19]], [[TILELOAD_SCALARIZE_COLS_IV11]]
58 ; CHECK-NEXT: [[TMP21:%.*]] = load i32, ptr [[TMP18]], align 4
59 ; CHECK-NEXT: [[TMP22]] = insertelement <256 x i32> [[VEC_PHI15]], i32 [[TMP21]], i16 [[TMP20]]
60 ; CHECK-NEXT: br label [[TILELOAD_SCALARIZE_COLS_LATCH10]]
61 ; CHECK: tileload.scalarize.cols.latch10:
62 ; CHECK-NEXT: [[TILELOAD_SCALARIZE_COLS_STEP12]] = add i16 [[TILELOAD_SCALARIZE_COLS_IV11]], 1
63 ; CHECK-NEXT: [[TILELOAD_SCALARIZE_COLS_COND13:%.*]] = icmp ne i16 [[TILELOAD_SCALARIZE_COLS_STEP12]], 4
64 ; CHECK-NEXT: br i1 [[TILELOAD_SCALARIZE_COLS_COND13]], label [[TILELOAD_SCALARIZE_COLS_HEADER8]], label [[TILELOAD_SCALARIZE_ROWS_LATCH4]]
65 ; CHECK: tileload.scalarize.rows.latch4:
66 ; CHECK-NEXT: [[TILELOAD_SCALARIZE_ROWS_STEP6]] = add i16 [[TILELOAD_SCALARIZE_ROWS_IV5]], 1
67 ; CHECK-NEXT: [[TILELOAD_SCALARIZE_ROWS_COND7:%.*]] = icmp ne i16 [[TILELOAD_SCALARIZE_ROWS_STEP6]], 4
68 ; CHECK-NEXT: br i1 [[TILELOAD_SCALARIZE_ROWS_COND7]], label [[TILELOAD_SCALARIZE_ROWS_HEADER2]], label [[CONTINUE1:%.*]]
69 ; CHECK: continue1:
70 ; CHECK-NEXT: [[TMP23:%.*]] = bitcast <256 x i32> [[TMP22]] to x86_amx
71 ; CHECK-NEXT: br label [[TILELOAD_SCALARIZE_ROWS_HEADER17:%.*]]
72 ; CHECK: tileload.scalarize.rows.header17:
73 ; CHECK-NEXT: [[TILELOAD_SCALARIZE_ROWS_IV20:%.*]] = phi i16 [ 0, [[CONTINUE1]] ], [ [[TILELOAD_SCALARIZE_ROWS_STEP21:%.*]], [[TILELOAD_SCALARIZE_ROWS_LATCH19:%.*]] ]
74 ; CHECK-NEXT: [[VEC_PHI_ROW29:%.*]] = phi <256 x i32> [ zeroinitializer, [[CONTINUE1]] ], [ [[TMP34:%.*]], [[TILELOAD_SCALARIZE_ROWS_LATCH19]] ]
75 ; CHECK-NEXT: br label [[TILELOAD_SCALARIZE_ROWS_BODY18:%.*]]
76 ; CHECK: tileload.scalarize.rows.body18:
77 ; CHECK-NEXT: br label [[TILELOAD_SCALARIZE_COLS_HEADER23:%.*]]
78 ; CHECK: tileload.scalarize.cols.header23:
79 ; CHECK-NEXT: [[TILELOAD_SCALARIZE_COLS_IV26:%.*]] = phi i16 [ 0, [[TILELOAD_SCALARIZE_ROWS_BODY18]] ], [ [[TILELOAD_SCALARIZE_COLS_STEP27:%.*]], [[TILELOAD_SCALARIZE_COLS_LATCH25:%.*]] ]
80 ; CHECK-NEXT: [[VEC_PHI30:%.*]] = phi <256 x i32> [ [[VEC_PHI_ROW29]], [[TILELOAD_SCALARIZE_ROWS_BODY18]] ], [ [[TMP34]], [[TILELOAD_SCALARIZE_COLS_LATCH25]] ]
81 ; CHECK-NEXT: br label [[TILELOAD_SCALARIZE_COLS_BODY24:%.*]]
82 ; CHECK: tileload.scalarize.cols.body24:
83 ; CHECK-NEXT: [[TMP25:%.*]] = zext i16 [[TILELOAD_SCALARIZE_ROWS_IV20]] to i64
84 ; CHECK-NEXT: [[TMP26:%.*]] = zext i16 [[TILELOAD_SCALARIZE_COLS_IV26]] to i64
85 ; CHECK-NEXT: [[TMP27:%.*]] = mul i64 [[TMP25]], 4
86 ; CHECK-NEXT: [[TMP28:%.*]] = add i64 [[TMP27]], [[TMP26]]
87 ; CHECK-NEXT: [[TMP30:%.*]] = getelementptr i32, ptr [[B_MEM:%.*]], i64 [[TMP28]]
88 ; CHECK-NEXT: [[TMP31:%.*]] = mul i16 [[TILELOAD_SCALARIZE_ROWS_IV20]], 16
89 ; CHECK-NEXT: [[TMP32:%.*]] = add i16 [[TMP31]], [[TILELOAD_SCALARIZE_COLS_IV26]]
90 ; CHECK-NEXT: [[TMP33:%.*]] = load i32, ptr [[TMP30]], align 4
91 ; CHECK-NEXT: [[TMP34]] = insertelement <256 x i32> [[VEC_PHI30]], i32 [[TMP33]], i16 [[TMP32]]
92 ; CHECK-NEXT: br label [[TILELOAD_SCALARIZE_COLS_LATCH25]]
93 ; CHECK: tileload.scalarize.cols.latch25:
94 ; CHECK-NEXT: [[TILELOAD_SCALARIZE_COLS_STEP27]] = add i16 [[TILELOAD_SCALARIZE_COLS_IV26]], 1
95 ; CHECK-NEXT: [[TILELOAD_SCALARIZE_COLS_COND28:%.*]] = icmp ne i16 [[TILELOAD_SCALARIZE_COLS_STEP27]], 4
96 ; CHECK-NEXT: br i1 [[TILELOAD_SCALARIZE_COLS_COND28]], label [[TILELOAD_SCALARIZE_COLS_HEADER23]], label [[TILELOAD_SCALARIZE_ROWS_LATCH19]]
97 ; CHECK: tileload.scalarize.rows.latch19:
98 ; CHECK-NEXT: [[TILELOAD_SCALARIZE_ROWS_STEP21]] = add i16 [[TILELOAD_SCALARIZE_ROWS_IV20]], 1
99 ; CHECK-NEXT: [[TILELOAD_SCALARIZE_ROWS_COND22:%.*]] = icmp ne i16 [[TILELOAD_SCALARIZE_ROWS_STEP21]], 4
100 ; CHECK-NEXT: br i1 [[TILELOAD_SCALARIZE_ROWS_COND22]], label [[TILELOAD_SCALARIZE_ROWS_HEADER17]], label [[CONTINUE16:%.*]]
101 ; CHECK: continue16:
102 ; CHECK-NEXT: [[TMP35:%.*]] = bitcast <256 x i32> [[TMP34]] to x86_amx
103 ; CHECK-NEXT: br label [[TILEDPBSSD_SCALARIZE_ROWS_HEADER:%.*]]
104 ; CHECK: tiledpbssd.scalarize.rows.header:
105 ; CHECK-NEXT: [[TILEDPBSSD_SCALARIZE_ROWS_IV:%.*]] = phi i16 [ 0, [[CONTINUE16]] ], [ [[TILEDPBSSD_SCALARIZE_ROWS_STEP:%.*]], [[TILEDPBSSD_SCALARIZE_ROWS_LATCH:%.*]] ]
106 ; CHECK-NEXT: [[VEC_C_PHI_ROW:%.*]] = phi <256 x i32> [ [[TMP10]], [[CONTINUE16]] ], [ [[TMP52:%.*]], [[TILEDPBSSD_SCALARIZE_ROWS_LATCH]] ]
107 ; CHECK-NEXT: [[VEC_D_PHI_ROW:%.*]] = phi <256 x i32> [ zeroinitializer, [[CONTINUE16]] ], [ [[TMP54:%.*]], [[TILEDPBSSD_SCALARIZE_ROWS_LATCH]] ]
108 ; CHECK-NEXT: br label [[TILEDPBSSD_SCALARIZE_ROWS_BODY:%.*]]
109 ; CHECK: tiledpbssd.scalarize.rows.body:
110 ; CHECK-NEXT: br label [[TILEDPBSSD_SCALARIZE_COLS_HEADER:%.*]]
111 ; CHECK: tiledpbssd.scalarize.cols.header:
112 ; CHECK-NEXT: [[TILEDPBSSD_SCALARIZE_COLS_IV:%.*]] = phi i16 [ 0, [[TILEDPBSSD_SCALARIZE_ROWS_BODY]] ], [ [[TILEDPBSSD_SCALARIZE_COLS_STEP:%.*]], [[TILEDPBSSD_SCALARIZE_COLS_LATCH:%.*]] ]
113 ; CHECK-NEXT: [[VEC_C_PHI_COL:%.*]] = phi <256 x i32> [ [[VEC_C_PHI_ROW]], [[TILEDPBSSD_SCALARIZE_ROWS_BODY]] ], [ [[TMP52]], [[TILEDPBSSD_SCALARIZE_COLS_LATCH]] ]
114 ; CHECK-NEXT: [[VEC_D_PHI_COL:%.*]] = phi <256 x i32> [ [[VEC_D_PHI_ROW]], [[TILEDPBSSD_SCALARIZE_ROWS_BODY]] ], [ [[TMP54]], [[TILEDPBSSD_SCALARIZE_COLS_LATCH]] ]
115 ; CHECK-NEXT: [[TMP36:%.*]] = mul i16 [[TILEDPBSSD_SCALARIZE_ROWS_IV]], 16
116 ; CHECK-NEXT: [[TMP37:%.*]] = add i16 [[TMP36]], [[TILEDPBSSD_SCALARIZE_COLS_IV]]
117 ; CHECK-NEXT: br label [[TILEDPBSSD_SCALARIZE_COLS_BODY:%.*]]
118 ; CHECK: tiledpbssd.scalarize.cols.body:
119 ; CHECK-NEXT: br label [[TILEDPBSSD_SCALARIZE_INNER_HEADER:%.*]]
120 ; CHECK: tiledpbssd.scalarize.inner.header:
121 ; CHECK-NEXT: [[TILEDPBSSD_SCALARIZE_INNER_IV:%.*]] = phi i16 [ 0, [[TILEDPBSSD_SCALARIZE_COLS_BODY]] ], [ [[TILEDPBSSD_SCALARIZE_INNER_STEP:%.*]], [[TILEDPBSSD_SCALARIZE_INNER_LATCH:%.*]] ]
122 ; CHECK-NEXT: [[VEC_C_INNER_PHI:%.*]] = phi <256 x i32> [ [[VEC_C_PHI_COL]], [[TILEDPBSSD_SCALARIZE_COLS_BODY]] ], [ [[TMP52]], [[TILEDPBSSD_SCALARIZE_INNER_LATCH]] ]
123 ; CHECK-NEXT: br label [[TILEDPBSSD_SCALARIZE_INNER_BODY:%.*]]
124 ; CHECK: tiledpbssd.scalarize.inner.body:
125 ; CHECK-NEXT: [[TMP38:%.*]] = mul i16 [[TILEDPBSSD_SCALARIZE_ROWS_IV]], 16
126 ; CHECK-NEXT: [[TMP39:%.*]] = add i16 [[TMP38]], [[TILEDPBSSD_SCALARIZE_INNER_IV]]
127 ; CHECK-NEXT: [[TMP40:%.*]] = mul i16 [[TILEDPBSSD_SCALARIZE_INNER_IV]], 16
128 ; CHECK-NEXT: [[TMP41:%.*]] = add i16 [[TMP40]], [[TILEDPBSSD_SCALARIZE_COLS_IV]]
129 ; CHECK-NEXT: [[TMP42:%.*]] = extractelement <256 x i32> [[VEC_C_INNER_PHI]], i16 [[TMP37]]
130 ; CHECK-NEXT: [[TMP43:%.*]] = extractelement <256 x i32> [[TMP22]], i16 [[TMP39]]
131 ; CHECK-NEXT: [[TMP44:%.*]] = bitcast i32 [[TMP43]] to <4 x i8>
132 ; CHECK-NEXT: [[TMP45:%.*]] = extractelement <256 x i32> [[TMP34]], i16 [[TMP41]]
133 ; CHECK-NEXT: [[TMP46:%.*]] = bitcast i32 [[TMP45]] to <4 x i8>
134 ; CHECK-NEXT: [[TMP47:%.*]] = sext <4 x i8> [[TMP46]] to <4 x i32>
135 ; CHECK-NEXT: [[TMP48:%.*]] = sext <4 x i8> [[TMP44]] to <4 x i32>
136 ; CHECK-NEXT: [[TMP49:%.*]] = mul <4 x i32> [[TMP48]], [[TMP47]]
137 ; CHECK-NEXT: [[TMP50:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP49]])
138 ; CHECK-NEXT: [[TMP51:%.*]] = add i32 [[TMP42]], [[TMP50]]
139 ; CHECK-NEXT: [[TMP52]] = insertelement <256 x i32> [[VEC_C_INNER_PHI]], i32 [[TMP51]], i16 [[TMP37]]
140 ; CHECK-NEXT: br label [[TILEDPBSSD_SCALARIZE_INNER_LATCH]]
141 ; CHECK: tiledpbssd.scalarize.inner.latch:
142 ; CHECK-NEXT: [[TILEDPBSSD_SCALARIZE_INNER_STEP]] = add i16 [[TILEDPBSSD_SCALARIZE_INNER_IV]], 1
143 ; CHECK-NEXT: [[TILEDPBSSD_SCALARIZE_INNER_COND:%.*]] = icmp ne i16 [[TILEDPBSSD_SCALARIZE_INNER_STEP]], 4
144 ; CHECK-NEXT: br i1 [[TILEDPBSSD_SCALARIZE_INNER_COND]], label [[TILEDPBSSD_SCALARIZE_INNER_HEADER]], label [[TILEDPBSSD_SCALARIZE_COLS_LATCH]]
145 ; CHECK: tiledpbssd.scalarize.cols.latch:
146 ; CHECK-NEXT: [[TILEDPBSSD_SCALARIZE_COLS_STEP]] = add i16 [[TILEDPBSSD_SCALARIZE_COLS_IV]], 1
147 ; CHECK-NEXT: [[TILEDPBSSD_SCALARIZE_COLS_COND:%.*]] = icmp ne i16 [[TILEDPBSSD_SCALARIZE_COLS_STEP]], 4
148 ; CHECK-NEXT: [[TMP53:%.*]] = extractelement <256 x i32> [[TMP52]], i16 [[TMP37]]
149 ; CHECK-NEXT: [[TMP54]] = insertelement <256 x i32> [[VEC_D_PHI_COL]], i32 [[TMP53]], i16 [[TMP37]]
150 ; CHECK-NEXT: br i1 [[TILEDPBSSD_SCALARIZE_COLS_COND]], label [[TILEDPBSSD_SCALARIZE_COLS_HEADER]], label [[TILEDPBSSD_SCALARIZE_ROWS_LATCH]]
151 ; CHECK: tiledpbssd.scalarize.rows.latch:
152 ; CHECK-NEXT: [[TILEDPBSSD_SCALARIZE_ROWS_STEP]] = add i16 [[TILEDPBSSD_SCALARIZE_ROWS_IV]], 1
153 ; CHECK-NEXT: [[TILEDPBSSD_SCALARIZE_ROWS_COND:%.*]] = icmp ne i16 [[TILEDPBSSD_SCALARIZE_ROWS_STEP]], 4
154 ; CHECK-NEXT: br i1 [[TILEDPBSSD_SCALARIZE_ROWS_COND]], label [[TILEDPBSSD_SCALARIZE_ROWS_HEADER]], label [[CONTINUE31:%.*]]
155 ; CHECK: continue31:
156 ; CHECK-NEXT: [[TMP55:%.*]] = bitcast <256 x i32> [[TMP54]] to x86_amx
157 ; CHECK-NEXT: br label [[TILESTORE_SCALARIZE_ROWS_HEADER:%.*]]
158 ; CHECK: tilestore.scalarize.rows.header:
159 ; CHECK-NEXT: [[TILESTORE_SCALARIZE_ROWS_IV:%.*]] = phi i16 [ 0, [[CONTINUE31]] ], [ [[TILESTORE_SCALARIZE_ROWS_STEP:%.*]], [[TILESTORE_SCALARIZE_ROWS_LATCH:%.*]] ]
160 ; CHECK-NEXT: br label [[TILESTORE_SCALARIZE_ROWS_BODY:%.*]]
161 ; CHECK: tilestore.scalarize.rows.body:
162 ; CHECK-NEXT: br label [[TILESTORE_SCALARIZE_COLS_HEADER:%.*]]
163 ; CHECK: tilestore.scalarize.cols.header:
164 ; CHECK-NEXT: [[TILESTORE_SCALARIZE_COLS_IV:%.*]] = phi i16 [ 0, [[TILESTORE_SCALARIZE_ROWS_BODY]] ], [ [[TILESTORE_SCALARIZE_COLS_STEP:%.*]], [[TILESTORE_SCALARIZE_COLS_LATCH:%.*]] ]
165 ; CHECK-NEXT: br label [[TILESTORE_SCALARIZE_COLS_BODY:%.*]]
166 ; CHECK: tilestore.scalarize.cols.body:
167 ; CHECK-NEXT: [[TMP56:%.*]] = zext i16 [[TILESTORE_SCALARIZE_ROWS_IV]] to i64
168 ; CHECK-NEXT: [[TMP57:%.*]] = zext i16 [[TILESTORE_SCALARIZE_COLS_IV]] to i64
169 ; CHECK-NEXT: [[TMP58:%.*]] = mul i64 [[TMP56]], 4
170 ; CHECK-NEXT: [[TMP59:%.*]] = add i64 [[TMP58]], [[TMP57]]
171 ; CHECK-NEXT: [[TMP61:%.*]] = getelementptr i32, ptr [[C_MEM]], i64 [[TMP59]]
172 ; CHECK-NEXT: [[TMP62:%.*]] = mul i16 [[TILESTORE_SCALARIZE_ROWS_IV]], 16
173 ; CHECK-NEXT: [[TMP63:%.*]] = add i16 [[TMP62]], [[TILESTORE_SCALARIZE_COLS_IV]]
174 ; CHECK-NEXT: [[TMP64:%.*]] = extractelement <256 x i32> [[TMP54]], i16 [[TMP63]]
175 ; CHECK-NEXT: store i32 [[TMP64]], ptr [[TMP61]], align 4
176 ; CHECK-NEXT: br label [[TILESTORE_SCALARIZE_COLS_LATCH]]
177 ; CHECK: tilestore.scalarize.cols.latch:
178 ; CHECK-NEXT: [[TILESTORE_SCALARIZE_COLS_STEP]] = add i16 [[TILESTORE_SCALARIZE_COLS_IV]], 1
179 ; CHECK-NEXT: [[TILESTORE_SCALARIZE_COLS_COND:%.*]] = icmp ne i16 [[TILESTORE_SCALARIZE_COLS_STEP]], 4
180 ; CHECK-NEXT: br i1 [[TILESTORE_SCALARIZE_COLS_COND]], label [[TILESTORE_SCALARIZE_COLS_HEADER]], label [[TILESTORE_SCALARIZE_ROWS_LATCH]]
181 ; CHECK: tilestore.scalarize.rows.latch:
182 ; CHECK-NEXT: [[TILESTORE_SCALARIZE_ROWS_STEP]] = add i16 [[TILESTORE_SCALARIZE_ROWS_IV]], 1
183 ; CHECK-NEXT: [[TILESTORE_SCALARIZE_ROWS_COND:%.*]] = icmp ne i16 [[TILESTORE_SCALARIZE_ROWS_STEP]], 4
184 ; CHECK-NEXT: br i1 [[TILESTORE_SCALARIZE_ROWS_COND]], label [[TILESTORE_SCALARIZE_ROWS_HEADER]], label [[CONTINUE32:%.*]]
185 ; CHECK: continue32:
186 ; CHECK-NEXT: ret void
188 entry:
189 %0 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 4, i16 16, ptr %C_mem, i64 16)
190 %1 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 4, i16 16, ptr %A_mem, i64 16)
191 %2 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 4, i16 16, ptr %B_mem, i64 16)
192 %3 = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 4, i16 16, i16 16, x86_amx %0, x86_amx %1, x86_amx %2)
193 tail call void @llvm.x86.tilestored64.internal(i16 4, i16 16, ptr %C_mem, i64 16, x86_amx %3)
194 ret void
195 }
; AMX intrinsics called by @test_no_bitcast. The test only needs their
; prototypes; the pass under test replaces every call with scalar loops.
; tileloadd64.internal: (i16, i16, ptr, i64) -> x86_amx; called above with
; (4, 16, <pointer>, 16).
197 declare x86_amx @llvm.x86.tileloadd64.internal(i16, i16, ptr, i64)
; tdpbssd.internal: three i16 operands followed by three x86_amx operands;
; called above with (4, 16, 16, %0, %1, %2).
198 declare x86_amx @llvm.x86.tdpbssd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)
; tilestored64.internal: same leading operands as the load plus the x86_amx
; value to store; returns void.
199 declare void @llvm.x86.tilestored64.internal(i16, i16, ptr, i64, x86_amx)
; Attribute group #0 is the one referenced by the @test_no_bitcast definition.
201 attributes #0 = { noinline nounwind optnone }