1 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
2 ; RUN: opt -mtriple=x86_64 -lower-amx-intrinsics -enable-x86-scalar-amx=true %s -S | FileCheck %s
4 define dso_local void @test_amx_load_non_O0(i16 signext %row, i16 signext %col, i8 *%ptr, i64 %stride, <256 x i32>* %vptr) {
5 ; CHECK-LABEL: @test_amx_load_non_O0(
7 ; CHECK-NEXT: [[TMP0:%.*]] = lshr i16 [[COL:%.*]], 2
8 ; CHECK-NEXT: [[TMP1:%.*]] = lshr i64 [[STRIDE:%.*]], 2
9 ; CHECK-NEXT: br label [[TILELOAD_SCALARIZE_ROWS_HEADER:%.*]]
10 ; CHECK: tileload.scalarize.rows.header:
11 ; CHECK-NEXT: [[TILELOAD_SCALARIZE_ROWS_IV:%.*]] = phi i16 [ 0, [[ENTRY:%.*]] ], [ [[TILELOAD_SCALARIZE_ROWS_STEP:%.*]], [[TILELOAD_SCALARIZE_ROWS_LATCH:%.*]] ]
12 ; CHECK-NEXT: [[VEC_PHI_ROW:%.*]] = phi <256 x i32> [ zeroinitializer, [[ENTRY]] ], [ [[TMP11:%.*]], [[TILELOAD_SCALARIZE_ROWS_LATCH]] ]
13 ; CHECK-NEXT: br label [[TILELOAD_SCALARIZE_ROWS_BODY:%.*]]
14 ; CHECK: tileload.scalarize.rows.body:
15 ; CHECK-NEXT: br label [[TILELOAD_SCALARIZE_COLS_HEADER:%.*]]
16 ; CHECK: tileload.scalarize.cols.header:
17 ; CHECK-NEXT: [[TILELOAD_SCALARIZE_COLS_IV:%.*]] = phi i16 [ 0, [[TILELOAD_SCALARIZE_ROWS_BODY]] ], [ [[TILELOAD_SCALARIZE_COLS_STEP:%.*]], [[TILELOAD_SCALARIZE_COLS_LATCH:%.*]] ]
18 ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <256 x i32> [ [[VEC_PHI_ROW]], [[TILELOAD_SCALARIZE_ROWS_BODY]] ], [ [[TMP11]], [[TILELOAD_SCALARIZE_COLS_LATCH]] ]
19 ; CHECK-NEXT: br label [[TILELOAD_SCALARIZE_COLS_BODY:%.*]]
20 ; CHECK: tileload.scalarize.cols.body:
21 ; CHECK-NEXT: [[TMP2:%.*]] = zext i16 [[TILELOAD_SCALARIZE_ROWS_IV]] to i64
22 ; CHECK-NEXT: [[TMP3:%.*]] = zext i16 [[TILELOAD_SCALARIZE_COLS_IV]] to i64
23 ; CHECK-NEXT: [[TMP4:%.*]] = mul i64 [[TMP2]], [[TMP1]]
24 ; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[TMP4]], [[TMP3]]
25 ; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8* [[PTR:%.*]] to i32*
26 ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i32, i32* [[TMP6]], i64 [[TMP5]]
27 ; CHECK-NEXT: [[TMP8:%.*]] = mul i16 [[TILELOAD_SCALARIZE_ROWS_IV]], 16
28 ; CHECK-NEXT: [[TMP9:%.*]] = add i16 [[TMP8]], [[TILELOAD_SCALARIZE_COLS_IV]]
29 ; CHECK-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP7]], align 4
30 ; CHECK-NEXT: [[TMP11]] = insertelement <256 x i32> [[VEC_PHI]], i32 [[TMP10]], i16 [[TMP9]]
31 ; CHECK-NEXT: br label [[TILELOAD_SCALARIZE_COLS_LATCH]]
32 ; CHECK: tileload.scalarize.cols.latch:
33 ; CHECK-NEXT: [[TILELOAD_SCALARIZE_COLS_STEP]] = add i16 [[TILELOAD_SCALARIZE_COLS_IV]], 1
34 ; CHECK-NEXT: [[TILELOAD_SCALARIZE_COLS_COND:%.*]] = icmp ne i16 [[TILELOAD_SCALARIZE_COLS_STEP]], [[TMP0]]
35 ; CHECK-NEXT: br i1 [[TILELOAD_SCALARIZE_COLS_COND]], label [[TILELOAD_SCALARIZE_COLS_HEADER]], label [[TILELOAD_SCALARIZE_ROWS_LATCH]]
36 ; CHECK: tileload.scalarize.rows.latch:
37 ; CHECK-NEXT: [[TILELOAD_SCALARIZE_ROWS_STEP]] = add i16 [[TILELOAD_SCALARIZE_ROWS_IV]], 1
38 ; CHECK-NEXT: [[TILELOAD_SCALARIZE_ROWS_COND:%.*]] = icmp ne i16 [[TILELOAD_SCALARIZE_ROWS_STEP]], [[ROW:%.*]]
39 ; CHECK-NEXT: br i1 [[TILELOAD_SCALARIZE_ROWS_COND]], label [[TILELOAD_SCALARIZE_ROWS_HEADER]], label [[CONTINUE:%.*]]
41 ; CHECK-NEXT: [[TMP12:%.*]] = bitcast <256 x i32> [[TMP11]] to x86_amx
42 ; CHECK-NEXT: store <256 x i32> [[TMP11]], <256 x i32>* [[VPTR:%.*]], align 64
43 ; CHECK-NEXT: ret void
46 %amx = call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 %col, i8* %ptr, i64 %stride)
47 %vec = bitcast x86_amx %amx to <256 x i32>
48 store <256 x i32> %vec, <256 x i32>* %vptr, align 64
52 define dso_local void @test_amx_load(i16 signext %row, i16 signext %col, i8 *%ptr, i64 %stride, <256 x i32>* %vptr) #0 {
53 ; CHECK-LABEL: @test_amx_load(
55 ; CHECK-NEXT: [[TMP0:%.*]] = lshr i16 [[COL:%.*]], 2
56 ; CHECK-NEXT: [[TMP1:%.*]] = lshr i64 [[STRIDE:%.*]], 2
57 ; CHECK-NEXT: br label [[TILELOAD_SCALARIZE_ROWS_HEADER:%.*]]
58 ; CHECK: tileload.scalarize.rows.header:
59 ; CHECK-NEXT: [[TILELOAD_SCALARIZE_ROWS_IV:%.*]] = phi i16 [ 0, [[ENTRY:%.*]] ], [ [[TILELOAD_SCALARIZE_ROWS_STEP:%.*]], [[TILELOAD_SCALARIZE_ROWS_LATCH:%.*]] ]
60 ; CHECK-NEXT: [[VEC_PHI_ROW:%.*]] = phi <256 x i32> [ zeroinitializer, [[ENTRY]] ], [ [[TMP11:%.*]], [[TILELOAD_SCALARIZE_ROWS_LATCH]] ]
61 ; CHECK-NEXT: br label [[TILELOAD_SCALARIZE_ROWS_BODY:%.*]]
62 ; CHECK: tileload.scalarize.rows.body:
63 ; CHECK-NEXT: br label [[TILELOAD_SCALARIZE_COLS_HEADER:%.*]]
64 ; CHECK: tileload.scalarize.cols.header:
65 ; CHECK-NEXT: [[TILELOAD_SCALARIZE_COLS_IV:%.*]] = phi i16 [ 0, [[TILELOAD_SCALARIZE_ROWS_BODY]] ], [ [[TILELOAD_SCALARIZE_COLS_STEP:%.*]], [[TILELOAD_SCALARIZE_COLS_LATCH:%.*]] ]
66 ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <256 x i32> [ [[VEC_PHI_ROW]], [[TILELOAD_SCALARIZE_ROWS_BODY]] ], [ [[TMP11]], [[TILELOAD_SCALARIZE_COLS_LATCH]] ]
67 ; CHECK-NEXT: br label [[TILELOAD_SCALARIZE_COLS_BODY:%.*]]
68 ; CHECK: tileload.scalarize.cols.body:
69 ; CHECK-NEXT: [[TMP2:%.*]] = zext i16 [[TILELOAD_SCALARIZE_ROWS_IV]] to i64
70 ; CHECK-NEXT: [[TMP3:%.*]] = zext i16 [[TILELOAD_SCALARIZE_COLS_IV]] to i64
71 ; CHECK-NEXT: [[TMP4:%.*]] = mul i64 [[TMP2]], [[TMP1]]
72 ; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[TMP4]], [[TMP3]]
73 ; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8* [[PTR:%.*]] to i32*
74 ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i32, i32* [[TMP6]], i64 [[TMP5]]
75 ; CHECK-NEXT: [[TMP8:%.*]] = mul i16 [[TILELOAD_SCALARIZE_ROWS_IV]], 16
76 ; CHECK-NEXT: [[TMP9:%.*]] = add i16 [[TMP8]], [[TILELOAD_SCALARIZE_COLS_IV]]
77 ; CHECK-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP7]], align 4
78 ; CHECK-NEXT: [[TMP11]] = insertelement <256 x i32> [[VEC_PHI]], i32 [[TMP10]], i16 [[TMP9]]
79 ; CHECK-NEXT: br label [[TILELOAD_SCALARIZE_COLS_LATCH]]
80 ; CHECK: tileload.scalarize.cols.latch:
81 ; CHECK-NEXT: [[TILELOAD_SCALARIZE_COLS_STEP]] = add i16 [[TILELOAD_SCALARIZE_COLS_IV]], 1
82 ; CHECK-NEXT: [[TILELOAD_SCALARIZE_COLS_COND:%.*]] = icmp ne i16 [[TILELOAD_SCALARIZE_COLS_STEP]], [[TMP0]]
83 ; CHECK-NEXT: br i1 [[TILELOAD_SCALARIZE_COLS_COND]], label [[TILELOAD_SCALARIZE_COLS_HEADER]], label [[TILELOAD_SCALARIZE_ROWS_LATCH]]
84 ; CHECK: tileload.scalarize.rows.latch:
85 ; CHECK-NEXT: [[TILELOAD_SCALARIZE_ROWS_STEP]] = add i16 [[TILELOAD_SCALARIZE_ROWS_IV]], 1
86 ; CHECK-NEXT: [[TILELOAD_SCALARIZE_ROWS_COND:%.*]] = icmp ne i16 [[TILELOAD_SCALARIZE_ROWS_STEP]], [[ROW:%.*]]
87 ; CHECK-NEXT: br i1 [[TILELOAD_SCALARIZE_ROWS_COND]], label [[TILELOAD_SCALARIZE_ROWS_HEADER]], label [[CONTINUE:%.*]]
89 ; CHECK-NEXT: [[TMP12:%.*]] = bitcast <256 x i32> [[TMP11]] to x86_amx
90 ; CHECK-NEXT: store <256 x i32> [[TMP11]], <256 x i32>* [[VPTR:%.*]], align 64
91 ; CHECK-NEXT: ret void
94 %amx = call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 %col, i8* %ptr, i64 %stride)
95 %vec = bitcast x86_amx %amx to <256 x i32>
96 store <256 x i32> %vec, <256 x i32>* %vptr, align 64
100 define dso_local void @test_amx_dpbssd(i16 signext %row, i16 signext %col, i16 signext %k, <256 x i32> %c, <256 x i32> %a, <256 x i32> %b, <256 x i32>* %vptr) #0 {
101 ; CHECK-LABEL: @test_amx_dpbssd(
103 ; CHECK-NEXT: [[A_AMX:%.*]] = bitcast <256 x i32> [[A:%.*]] to x86_amx
104 ; CHECK-NEXT: [[B_AMX:%.*]] = bitcast <256 x i32> [[B:%.*]] to x86_amx
105 ; CHECK-NEXT: [[C_AMX:%.*]] = bitcast <256 x i32> [[C:%.*]] to x86_amx
106 ; CHECK-NEXT: [[TMP0:%.*]] = lshr i16 [[COL:%.*]], 2
107 ; CHECK-NEXT: [[TMP1:%.*]] = lshr i16 [[K:%.*]], 2
108 ; CHECK-NEXT: br label [[TILEDPBSSD_SCALARIZE_ROWS_HEADER:%.*]]
109 ; CHECK: tiledpbssd.scalarize.rows.header:
110 ; CHECK-NEXT: [[TILEDPBSSD_SCALARIZE_ROWS_IV:%.*]] = phi i16 [ 0, [[ENTRY:%.*]] ], [ [[TILEDPBSSD_SCALARIZE_ROWS_STEP:%.*]], [[TILEDPBSSD_SCALARIZE_ROWS_LATCH:%.*]] ]
111 ; CHECK-NEXT: [[VEC_C_PHI_ROW:%.*]] = phi <256 x i32> [ [[C]], [[ENTRY]] ], [ [[TMP18:%.*]], [[TILEDPBSSD_SCALARIZE_ROWS_LATCH]] ]
112 ; CHECK-NEXT: [[VEC_D_PHI_ROW:%.*]] = phi <256 x i32> [ zeroinitializer, [[ENTRY]] ], [ [[TMP20:%.*]], [[TILEDPBSSD_SCALARIZE_ROWS_LATCH]] ]
113 ; CHECK-NEXT: br label [[TILEDPBSSD_SCALARIZE_ROWS_BODY:%.*]]
114 ; CHECK: tiledpbssd.scalarize.rows.body:
115 ; CHECK-NEXT: br label [[TILEDPBSSD_SCALARIZE_COLS_HEADER:%.*]]
116 ; CHECK: tiledpbssd.scalarize.cols.header:
117 ; CHECK-NEXT: [[TILEDPBSSD_SCALARIZE_COLS_IV:%.*]] = phi i16 [ 0, [[TILEDPBSSD_SCALARIZE_ROWS_BODY]] ], [ [[TILEDPBSSD_SCALARIZE_COLS_STEP:%.*]], [[TILEDPBSSD_SCALARIZE_COLS_LATCH:%.*]] ]
118 ; CHECK-NEXT: [[VEC_C_PHI_COL:%.*]] = phi <256 x i32> [ [[VEC_C_PHI_ROW]], [[TILEDPBSSD_SCALARIZE_ROWS_BODY]] ], [ [[TMP18]], [[TILEDPBSSD_SCALARIZE_COLS_LATCH]] ]
119 ; CHECK-NEXT: [[VEC_D_PHI_COL:%.*]] = phi <256 x i32> [ [[VEC_D_PHI_ROW]], [[TILEDPBSSD_SCALARIZE_ROWS_BODY]] ], [ [[TMP20]], [[TILEDPBSSD_SCALARIZE_COLS_LATCH]] ]
120 ; CHECK-NEXT: [[TMP2:%.*]] = mul i16 [[TILEDPBSSD_SCALARIZE_ROWS_IV]], 16
121 ; CHECK-NEXT: [[TMP3:%.*]] = add i16 [[TMP2]], [[TILEDPBSSD_SCALARIZE_COLS_IV]]
122 ; CHECK-NEXT: br label [[TILEDPBSSD_SCALARIZE_COLS_BODY:%.*]]
123 ; CHECK: tiledpbssd.scalarize.cols.body:
124 ; CHECK-NEXT: br label [[TILEDPBSSD_SCALARIZE_INNER_HEADER:%.*]]
125 ; CHECK: tiledpbssd.scalarize.inner.header:
126 ; CHECK-NEXT: [[TILEDPBSSD_SCALARIZE_INNER_IV:%.*]] = phi i16 [ 0, [[TILEDPBSSD_SCALARIZE_COLS_BODY]] ], [ [[TILEDPBSSD_SCALARIZE_INNER_STEP:%.*]], [[TILEDPBSSD_SCALARIZE_INNER_LATCH:%.*]] ]
127 ; CHECK-NEXT: [[VEC_C_INNER_PHI:%.*]] = phi <256 x i32> [ [[VEC_C_PHI_COL]], [[TILEDPBSSD_SCALARIZE_COLS_BODY]] ], [ [[TMP18]], [[TILEDPBSSD_SCALARIZE_INNER_LATCH]] ]
128 ; CHECK-NEXT: br label [[TILEDPBSSD_SCALARIZE_INNER_BODY:%.*]]
129 ; CHECK: tiledpbssd.scalarize.inner.body:
130 ; CHECK-NEXT: [[TMP4:%.*]] = mul i16 [[TILEDPBSSD_SCALARIZE_ROWS_IV]], 16
131 ; CHECK-NEXT: [[TMP5:%.*]] = add i16 [[TMP4]], [[TILEDPBSSD_SCALARIZE_INNER_IV]]
132 ; CHECK-NEXT: [[TMP6:%.*]] = mul i16 [[TILEDPBSSD_SCALARIZE_INNER_IV]], 16
133 ; CHECK-NEXT: [[TMP7:%.*]] = add i16 [[TMP6]], [[TILEDPBSSD_SCALARIZE_COLS_IV]]
134 ; CHECK-NEXT: [[TMP8:%.*]] = extractelement <256 x i32> [[VEC_C_INNER_PHI]], i16 [[TMP3]]
135 ; CHECK-NEXT: [[TMP9:%.*]] = extractelement <256 x i32> [[A]], i16 [[TMP5]]
136 ; CHECK-NEXT: [[TMP10:%.*]] = bitcast i32 [[TMP9]] to <4 x i8>
137 ; CHECK-NEXT: [[TMP11:%.*]] = extractelement <256 x i32> [[B]], i16 [[TMP7]]
138 ; CHECK-NEXT: [[TMP12:%.*]] = bitcast i32 [[TMP11]] to <4 x i8>
139 ; CHECK-NEXT: [[TMP13:%.*]] = sext <4 x i8> [[TMP12]] to <4 x i32>
140 ; CHECK-NEXT: [[TMP14:%.*]] = sext <4 x i8> [[TMP10]] to <4 x i32>
141 ; CHECK-NEXT: [[TMP15:%.*]] = mul <4 x i32> [[TMP14]], [[TMP13]]
142 ; CHECK-NEXT: [[TMP16:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP15]])
143 ; CHECK-NEXT: [[TMP17:%.*]] = add i32 [[TMP8]], [[TMP16]]
144 ; CHECK-NEXT: [[TMP18]] = insertelement <256 x i32> [[VEC_C_INNER_PHI]], i32 [[TMP17]], i16 [[TMP3]]
145 ; CHECK-NEXT: br label [[TILEDPBSSD_SCALARIZE_INNER_LATCH]]
146 ; CHECK: tiledpbssd.scalarize.inner.latch:
147 ; CHECK-NEXT: [[TILEDPBSSD_SCALARIZE_INNER_STEP]] = add i16 [[TILEDPBSSD_SCALARIZE_INNER_IV]], 1
148 ; CHECK-NEXT: [[TILEDPBSSD_SCALARIZE_INNER_COND:%.*]] = icmp ne i16 [[TILEDPBSSD_SCALARIZE_INNER_STEP]], [[TMP1]]
149 ; CHECK-NEXT: br i1 [[TILEDPBSSD_SCALARIZE_INNER_COND]], label [[TILEDPBSSD_SCALARIZE_INNER_HEADER]], label [[TILEDPBSSD_SCALARIZE_COLS_LATCH]]
150 ; CHECK: tiledpbssd.scalarize.cols.latch:
151 ; CHECK-NEXT: [[TILEDPBSSD_SCALARIZE_COLS_STEP]] = add i16 [[TILEDPBSSD_SCALARIZE_COLS_IV]], 1
152 ; CHECK-NEXT: [[TILEDPBSSD_SCALARIZE_COLS_COND:%.*]] = icmp ne i16 [[TILEDPBSSD_SCALARIZE_COLS_STEP]], [[TMP0]]
153 ; CHECK-NEXT: [[TMP19:%.*]] = extractelement <256 x i32> [[TMP18]], i16 [[TMP3]]
154 ; CHECK-NEXT: [[TMP20]] = insertelement <256 x i32> [[VEC_D_PHI_COL]], i32 [[TMP19]], i16 [[TMP3]]
155 ; CHECK-NEXT: br i1 [[TILEDPBSSD_SCALARIZE_COLS_COND]], label [[TILEDPBSSD_SCALARIZE_COLS_HEADER]], label [[TILEDPBSSD_SCALARIZE_ROWS_LATCH]]
156 ; CHECK: tiledpbssd.scalarize.rows.latch:
157 ; CHECK-NEXT: [[TILEDPBSSD_SCALARIZE_ROWS_STEP]] = add i16 [[TILEDPBSSD_SCALARIZE_ROWS_IV]], 1
158 ; CHECK-NEXT: [[TILEDPBSSD_SCALARIZE_ROWS_COND:%.*]] = icmp ne i16 [[TILEDPBSSD_SCALARIZE_ROWS_STEP]], [[ROW:%.*]]
159 ; CHECK-NEXT: br i1 [[TILEDPBSSD_SCALARIZE_ROWS_COND]], label [[TILEDPBSSD_SCALARIZE_ROWS_HEADER]], label [[CONTINUE:%.*]]
161 ; CHECK-NEXT: [[TMP21:%.*]] = bitcast <256 x i32> [[TMP20]] to x86_amx
162 ; CHECK-NEXT: store <256 x i32> [[TMP20]], <256 x i32>* [[VPTR:%.*]], align 64
163 ; CHECK-NEXT: ret void
166 %a.amx = bitcast <256 x i32> %a to x86_amx
167 %b.amx = bitcast <256 x i32> %b to x86_amx
168 %c.amx = bitcast <256 x i32> %c to x86_amx
169 %acc = call x86_amx @llvm.x86.tdpbssd.internal(i16 %row, i16 %col, i16 %k, x86_amx %c.amx, x86_amx %a.amx, x86_amx %b.amx)
170 %vec = bitcast x86_amx %acc to <256 x i32>
171 store <256 x i32> %vec, <256 x i32>* %vptr, align 64
175 define dso_local void @test_amx_dpbsud(i16 signext %row, i16 signext %col, i16 signext %k, <256 x i32> %c, <256 x i32> %a, <256 x i32> %b, <256 x i32>* %vptr) #0 {
176 ; CHECK-LABEL: @test_amx_dpbsud(
178 ; CHECK-NEXT: [[A_AMX:%.*]] = bitcast <256 x i32> [[A:%.*]] to x86_amx
179 ; CHECK-NEXT: [[B_AMX:%.*]] = bitcast <256 x i32> [[B:%.*]] to x86_amx
180 ; CHECK-NEXT: [[C_AMX:%.*]] = bitcast <256 x i32> [[C:%.*]] to x86_amx
181 ; CHECK-NEXT: [[TMP0:%.*]] = lshr i16 [[COL:%.*]], 2
182 ; CHECK-NEXT: [[TMP1:%.*]] = lshr i16 [[K:%.*]], 2
183 ; CHECK-NEXT: br label [[TILEDPBSUD_SCALARIZE_ROWS_HEADER:%.*]]
184 ; CHECK: tiledpbsud.scalarize.rows.header:
185 ; CHECK-NEXT: [[TILEDPBSUD_SCALARIZE_ROWS_IV:%.*]] = phi i16 [ 0, [[ENTRY:%.*]] ], [ [[TILEDPBSUD_SCALARIZE_ROWS_STEP:%.*]], [[TILEDPBSUD_SCALARIZE_ROWS_LATCH:%.*]] ]
186 ; CHECK-NEXT: [[VEC_C_PHI_ROW:%.*]] = phi <256 x i32> [ [[C]], [[ENTRY]] ], [ [[TMP18:%.*]], [[TILEDPBSUD_SCALARIZE_ROWS_LATCH]] ]
187 ; CHECK-NEXT: [[VEC_D_PHI_ROW:%.*]] = phi <256 x i32> [ zeroinitializer, [[ENTRY]] ], [ [[TMP20:%.*]], [[TILEDPBSUD_SCALARIZE_ROWS_LATCH]] ]
188 ; CHECK-NEXT: br label [[TILEDPBSUD_SCALARIZE_ROWS_BODY:%.*]]
189 ; CHECK: tiledpbsud.scalarize.rows.body:
190 ; CHECK-NEXT: br label [[TILEDPBSUD_SCALARIZE_COLS_HEADER:%.*]]
191 ; CHECK: tiledpbsud.scalarize.cols.header:
192 ; CHECK-NEXT: [[TILEDPBSUD_SCALARIZE_COLS_IV:%.*]] = phi i16 [ 0, [[TILEDPBSUD_SCALARIZE_ROWS_BODY]] ], [ [[TILEDPBSUD_SCALARIZE_COLS_STEP:%.*]], [[TILEDPBSUD_SCALARIZE_COLS_LATCH:%.*]] ]
193 ; CHECK-NEXT: [[VEC_C_PHI_COL:%.*]] = phi <256 x i32> [ [[VEC_C_PHI_ROW]], [[TILEDPBSUD_SCALARIZE_ROWS_BODY]] ], [ [[TMP18]], [[TILEDPBSUD_SCALARIZE_COLS_LATCH]] ]
194 ; CHECK-NEXT: [[VEC_D_PHI_COL:%.*]] = phi <256 x i32> [ [[VEC_D_PHI_ROW]], [[TILEDPBSUD_SCALARIZE_ROWS_BODY]] ], [ [[TMP20]], [[TILEDPBSUD_SCALARIZE_COLS_LATCH]] ]
195 ; CHECK-NEXT: [[TMP2:%.*]] = mul i16 [[TILEDPBSUD_SCALARIZE_ROWS_IV]], 16
196 ; CHECK-NEXT: [[TMP3:%.*]] = add i16 [[TMP2]], [[TILEDPBSUD_SCALARIZE_COLS_IV]]
197 ; CHECK-NEXT: br label [[TILEDPBSUD_SCALARIZE_COLS_BODY:%.*]]
198 ; CHECK: tiledpbsud.scalarize.cols.body:
199 ; CHECK-NEXT: br label [[TILEDPBSUD_SCALARIZE_INNER_HEADER:%.*]]
200 ; CHECK: tiledpbsud.scalarize.inner.header:
201 ; CHECK-NEXT: [[TILEDPBSUD_SCALARIZE_INNER_IV:%.*]] = phi i16 [ 0, [[TILEDPBSUD_SCALARIZE_COLS_BODY]] ], [ [[TILEDPBSUD_SCALARIZE_INNER_STEP:%.*]], [[TILEDPBSUD_SCALARIZE_INNER_LATCH:%.*]] ]
202 ; CHECK-NEXT: [[VEC_C_INNER_PHI:%.*]] = phi <256 x i32> [ [[VEC_C_PHI_COL]], [[TILEDPBSUD_SCALARIZE_COLS_BODY]] ], [ [[TMP18]], [[TILEDPBSUD_SCALARIZE_INNER_LATCH]] ]
203 ; CHECK-NEXT: br label [[TILEDPBSUD_SCALARIZE_INNER_BODY:%.*]]
204 ; CHECK: tiledpbsud.scalarize.inner.body:
205 ; CHECK-NEXT: [[TMP4:%.*]] = mul i16 [[TILEDPBSUD_SCALARIZE_ROWS_IV]], 16
206 ; CHECK-NEXT: [[TMP5:%.*]] = add i16 [[TMP4]], [[TILEDPBSUD_SCALARIZE_INNER_IV]]
207 ; CHECK-NEXT: [[TMP6:%.*]] = mul i16 [[TILEDPBSUD_SCALARIZE_INNER_IV]], 16
208 ; CHECK-NEXT: [[TMP7:%.*]] = add i16 [[TMP6]], [[TILEDPBSUD_SCALARIZE_COLS_IV]]
209 ; CHECK-NEXT: [[TMP8:%.*]] = extractelement <256 x i32> [[VEC_C_INNER_PHI]], i16 [[TMP3]]
210 ; CHECK-NEXT: [[TMP9:%.*]] = extractelement <256 x i32> [[A]], i16 [[TMP5]]
211 ; CHECK-NEXT: [[TMP10:%.*]] = bitcast i32 [[TMP9]] to <4 x i8>
212 ; CHECK-NEXT: [[TMP11:%.*]] = extractelement <256 x i32> [[B]], i16 [[TMP7]]
213 ; CHECK-NEXT: [[TMP12:%.*]] = bitcast i32 [[TMP11]] to <4 x i8>
214 ; CHECK-NEXT: [[TMP13:%.*]] = zext <4 x i8> [[TMP12]] to <4 x i32>
215 ; CHECK-NEXT: [[TMP14:%.*]] = sext <4 x i8> [[TMP10]] to <4 x i32>
216 ; CHECK-NEXT: [[TMP15:%.*]] = mul <4 x i32> [[TMP14]], [[TMP13]]
217 ; CHECK-NEXT: [[TMP16:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP15]])
218 ; CHECK-NEXT: [[TMP17:%.*]] = add i32 [[TMP8]], [[TMP16]]
219 ; CHECK-NEXT: [[TMP18]] = insertelement <256 x i32> [[VEC_C_INNER_PHI]], i32 [[TMP17]], i16 [[TMP3]]
220 ; CHECK-NEXT: br label [[TILEDPBSUD_SCALARIZE_INNER_LATCH]]
221 ; CHECK: tiledpbsud.scalarize.inner.latch:
222 ; CHECK-NEXT: [[TILEDPBSUD_SCALARIZE_INNER_STEP]] = add i16 [[TILEDPBSUD_SCALARIZE_INNER_IV]], 1
223 ; CHECK-NEXT: [[TILEDPBSUD_SCALARIZE_INNER_COND:%.*]] = icmp ne i16 [[TILEDPBSUD_SCALARIZE_INNER_STEP]], [[TMP1]]
224 ; CHECK-NEXT: br i1 [[TILEDPBSUD_SCALARIZE_INNER_COND]], label [[TILEDPBSUD_SCALARIZE_INNER_HEADER]], label [[TILEDPBSUD_SCALARIZE_COLS_LATCH]]
225 ; CHECK: tiledpbsud.scalarize.cols.latch:
226 ; CHECK-NEXT: [[TILEDPBSUD_SCALARIZE_COLS_STEP]] = add i16 [[TILEDPBSUD_SCALARIZE_COLS_IV]], 1
227 ; CHECK-NEXT: [[TILEDPBSUD_SCALARIZE_COLS_COND:%.*]] = icmp ne i16 [[TILEDPBSUD_SCALARIZE_COLS_STEP]], [[TMP0]]
228 ; CHECK-NEXT: [[TMP19:%.*]] = extractelement <256 x i32> [[TMP18]], i16 [[TMP3]]
229 ; CHECK-NEXT: [[TMP20]] = insertelement <256 x i32> [[VEC_D_PHI_COL]], i32 [[TMP19]], i16 [[TMP3]]
230 ; CHECK-NEXT: br i1 [[TILEDPBSUD_SCALARIZE_COLS_COND]], label [[TILEDPBSUD_SCALARIZE_COLS_HEADER]], label [[TILEDPBSUD_SCALARIZE_ROWS_LATCH]]
231 ; CHECK: tiledpbsud.scalarize.rows.latch:
232 ; CHECK-NEXT: [[TILEDPBSUD_SCALARIZE_ROWS_STEP]] = add i16 [[TILEDPBSUD_SCALARIZE_ROWS_IV]], 1
233 ; CHECK-NEXT: [[TILEDPBSUD_SCALARIZE_ROWS_COND:%.*]] = icmp ne i16 [[TILEDPBSUD_SCALARIZE_ROWS_STEP]], [[ROW:%.*]]
234 ; CHECK-NEXT: br i1 [[TILEDPBSUD_SCALARIZE_ROWS_COND]], label [[TILEDPBSUD_SCALARIZE_ROWS_HEADER]], label [[CONTINUE:%.*]]
236 ; CHECK-NEXT: [[TMP21:%.*]] = bitcast <256 x i32> [[TMP20]] to x86_amx
237 ; CHECK-NEXT: store <256 x i32> [[TMP20]], <256 x i32>* [[VPTR:%.*]], align 64
238 ; CHECK-NEXT: ret void
241 %a.amx = bitcast <256 x i32> %a to x86_amx
242 %b.amx = bitcast <256 x i32> %b to x86_amx
243 %c.amx = bitcast <256 x i32> %c to x86_amx
244 %acc = call x86_amx @llvm.x86.tdpbsud.internal(i16 %row, i16 %col, i16 %k, x86_amx %c.amx, x86_amx %a.amx, x86_amx %b.amx)
245 %vec = bitcast x86_amx %acc to <256 x i32>
246 store <256 x i32> %vec, <256 x i32>* %vptr, align 64
250 define dso_local void @test_amx_dpbusd(i16 signext %row, i16 signext %col, i16 signext %k, <256 x i32> %c, <256 x i32> %a, <256 x i32> %b, <256 x i32>* %vptr) #0 {
251 ; CHECK-LABEL: @test_amx_dpbusd(
253 ; CHECK-NEXT: [[A_AMX:%.*]] = bitcast <256 x i32> [[A:%.*]] to x86_amx
254 ; CHECK-NEXT: [[B_AMX:%.*]] = bitcast <256 x i32> [[B:%.*]] to x86_amx
255 ; CHECK-NEXT: [[C_AMX:%.*]] = bitcast <256 x i32> [[C:%.*]] to x86_amx
256 ; CHECK-NEXT: [[TMP0:%.*]] = lshr i16 [[COL:%.*]], 2
257 ; CHECK-NEXT: [[TMP1:%.*]] = lshr i16 [[K:%.*]], 2
258 ; CHECK-NEXT: br label [[TILEDPBUSD_SCALARIZE_ROWS_HEADER:%.*]]
259 ; CHECK: tiledpbusd.scalarize.rows.header:
260 ; CHECK-NEXT: [[TILEDPBUSD_SCALARIZE_ROWS_IV:%.*]] = phi i16 [ 0, [[ENTRY:%.*]] ], [ [[TILEDPBUSD_SCALARIZE_ROWS_STEP:%.*]], [[TILEDPBUSD_SCALARIZE_ROWS_LATCH:%.*]] ]
261 ; CHECK-NEXT: [[VEC_C_PHI_ROW:%.*]] = phi <256 x i32> [ [[C]], [[ENTRY]] ], [ [[TMP18:%.*]], [[TILEDPBUSD_SCALARIZE_ROWS_LATCH]] ]
262 ; CHECK-NEXT: [[VEC_D_PHI_ROW:%.*]] = phi <256 x i32> [ zeroinitializer, [[ENTRY]] ], [ [[TMP20:%.*]], [[TILEDPBUSD_SCALARIZE_ROWS_LATCH]] ]
263 ; CHECK-NEXT: br label [[TILEDPBUSD_SCALARIZE_ROWS_BODY:%.*]]
264 ; CHECK: tiledpbusd.scalarize.rows.body:
265 ; CHECK-NEXT: br label [[TILEDPBUSD_SCALARIZE_COLS_HEADER:%.*]]
266 ; CHECK: tiledpbusd.scalarize.cols.header:
267 ; CHECK-NEXT: [[TILEDPBUSD_SCALARIZE_COLS_IV:%.*]] = phi i16 [ 0, [[TILEDPBUSD_SCALARIZE_ROWS_BODY]] ], [ [[TILEDPBUSD_SCALARIZE_COLS_STEP:%.*]], [[TILEDPBUSD_SCALARIZE_COLS_LATCH:%.*]] ]
268 ; CHECK-NEXT: [[VEC_C_PHI_COL:%.*]] = phi <256 x i32> [ [[VEC_C_PHI_ROW]], [[TILEDPBUSD_SCALARIZE_ROWS_BODY]] ], [ [[TMP18]], [[TILEDPBUSD_SCALARIZE_COLS_LATCH]] ]
269 ; CHECK-NEXT: [[VEC_D_PHI_COL:%.*]] = phi <256 x i32> [ [[VEC_D_PHI_ROW]], [[TILEDPBUSD_SCALARIZE_ROWS_BODY]] ], [ [[TMP20]], [[TILEDPBUSD_SCALARIZE_COLS_LATCH]] ]
270 ; CHECK-NEXT: [[TMP2:%.*]] = mul i16 [[TILEDPBUSD_SCALARIZE_ROWS_IV]], 16
271 ; CHECK-NEXT: [[TMP3:%.*]] = add i16 [[TMP2]], [[TILEDPBUSD_SCALARIZE_COLS_IV]]
272 ; CHECK-NEXT: br label [[TILEDPBUSD_SCALARIZE_COLS_BODY:%.*]]
273 ; CHECK: tiledpbusd.scalarize.cols.body:
274 ; CHECK-NEXT: br label [[TILEDPBUSD_SCALARIZE_INNER_HEADER:%.*]]
275 ; CHECK: tiledpbusd.scalarize.inner.header:
276 ; CHECK-NEXT: [[TILEDPBUSD_SCALARIZE_INNER_IV:%.*]] = phi i16 [ 0, [[TILEDPBUSD_SCALARIZE_COLS_BODY]] ], [ [[TILEDPBUSD_SCALARIZE_INNER_STEP:%.*]], [[TILEDPBUSD_SCALARIZE_INNER_LATCH:%.*]] ]
277 ; CHECK-NEXT: [[VEC_C_INNER_PHI:%.*]] = phi <256 x i32> [ [[VEC_C_PHI_COL]], [[TILEDPBUSD_SCALARIZE_COLS_BODY]] ], [ [[TMP18]], [[TILEDPBUSD_SCALARIZE_INNER_LATCH]] ]
278 ; CHECK-NEXT: br label [[TILEDPBUSD_SCALARIZE_INNER_BODY:%.*]]
279 ; CHECK: tiledpbusd.scalarize.inner.body:
280 ; CHECK-NEXT: [[TMP4:%.*]] = mul i16 [[TILEDPBUSD_SCALARIZE_ROWS_IV]], 16
281 ; CHECK-NEXT: [[TMP5:%.*]] = add i16 [[TMP4]], [[TILEDPBUSD_SCALARIZE_INNER_IV]]
282 ; CHECK-NEXT: [[TMP6:%.*]] = mul i16 [[TILEDPBUSD_SCALARIZE_INNER_IV]], 16
283 ; CHECK-NEXT: [[TMP7:%.*]] = add i16 [[TMP6]], [[TILEDPBUSD_SCALARIZE_COLS_IV]]
284 ; CHECK-NEXT: [[TMP8:%.*]] = extractelement <256 x i32> [[VEC_C_INNER_PHI]], i16 [[TMP3]]
285 ; CHECK-NEXT: [[TMP9:%.*]] = extractelement <256 x i32> [[A]], i16 [[TMP5]]
286 ; CHECK-NEXT: [[TMP10:%.*]] = bitcast i32 [[TMP9]] to <4 x i8>
287 ; CHECK-NEXT: [[TMP11:%.*]] = extractelement <256 x i32> [[B]], i16 [[TMP7]]
288 ; CHECK-NEXT: [[TMP12:%.*]] = bitcast i32 [[TMP11]] to <4 x i8>
289 ; CHECK-NEXT: [[TMP13:%.*]] = sext <4 x i8> [[TMP12]] to <4 x i32>
290 ; CHECK-NEXT: [[TMP14:%.*]] = zext <4 x i8> [[TMP10]] to <4 x i32>
291 ; CHECK-NEXT: [[TMP15:%.*]] = mul <4 x i32> [[TMP14]], [[TMP13]]
292 ; CHECK-NEXT: [[TMP16:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP15]])
293 ; CHECK-NEXT: [[TMP17:%.*]] = add i32 [[TMP8]], [[TMP16]]
294 ; CHECK-NEXT: [[TMP18]] = insertelement <256 x i32> [[VEC_C_INNER_PHI]], i32 [[TMP17]], i16 [[TMP3]]
295 ; CHECK-NEXT: br label [[TILEDPBUSD_SCALARIZE_INNER_LATCH]]
296 ; CHECK: tiledpbusd.scalarize.inner.latch:
297 ; CHECK-NEXT: [[TILEDPBUSD_SCALARIZE_INNER_STEP]] = add i16 [[TILEDPBUSD_SCALARIZE_INNER_IV]], 1
298 ; CHECK-NEXT: [[TILEDPBUSD_SCALARIZE_INNER_COND:%.*]] = icmp ne i16 [[TILEDPBUSD_SCALARIZE_INNER_STEP]], [[TMP1]]
299 ; CHECK-NEXT: br i1 [[TILEDPBUSD_SCALARIZE_INNER_COND]], label [[TILEDPBUSD_SCALARIZE_INNER_HEADER]], label [[TILEDPBUSD_SCALARIZE_COLS_LATCH]]
300 ; CHECK: tiledpbusd.scalarize.cols.latch:
301 ; CHECK-NEXT: [[TILEDPBUSD_SCALARIZE_COLS_STEP]] = add i16 [[TILEDPBUSD_SCALARIZE_COLS_IV]], 1
302 ; CHECK-NEXT: [[TILEDPBUSD_SCALARIZE_COLS_COND:%.*]] = icmp ne i16 [[TILEDPBUSD_SCALARIZE_COLS_STEP]], [[TMP0]]
303 ; CHECK-NEXT: [[TMP19:%.*]] = extractelement <256 x i32> [[TMP18]], i16 [[TMP3]]
304 ; CHECK-NEXT: [[TMP20]] = insertelement <256 x i32> [[VEC_D_PHI_COL]], i32 [[TMP19]], i16 [[TMP3]]
305 ; CHECK-NEXT: br i1 [[TILEDPBUSD_SCALARIZE_COLS_COND]], label [[TILEDPBUSD_SCALARIZE_COLS_HEADER]], label [[TILEDPBUSD_SCALARIZE_ROWS_LATCH]]
306 ; CHECK: tiledpbusd.scalarize.rows.latch:
307 ; CHECK-NEXT: [[TILEDPBUSD_SCALARIZE_ROWS_STEP]] = add i16 [[TILEDPBUSD_SCALARIZE_ROWS_IV]], 1
308 ; CHECK-NEXT: [[TILEDPBUSD_SCALARIZE_ROWS_COND:%.*]] = icmp ne i16 [[TILEDPBUSD_SCALARIZE_ROWS_STEP]], [[ROW:%.*]]
309 ; CHECK-NEXT: br i1 [[TILEDPBUSD_SCALARIZE_ROWS_COND]], label [[TILEDPBUSD_SCALARIZE_ROWS_HEADER]], label [[CONTINUE:%.*]]
311 ; CHECK-NEXT: [[TMP21:%.*]] = bitcast <256 x i32> [[TMP20]] to x86_amx
312 ; CHECK-NEXT: store <256 x i32> [[TMP20]], <256 x i32>* [[VPTR:%.*]], align 64
313 ; CHECK-NEXT: ret void
316 %a.amx = bitcast <256 x i32> %a to x86_amx
317 %b.amx = bitcast <256 x i32> %b to x86_amx
318 %c.amx = bitcast <256 x i32> %c to x86_amx
319 %acc = call x86_amx @llvm.x86.tdpbusd.internal(i16 %row, i16 %col, i16 %k, x86_amx %c.amx, x86_amx %a.amx, x86_amx %b.amx)
320 %vec = bitcast x86_amx %acc to <256 x i32>
321 store <256 x i32> %vec, <256 x i32>* %vptr, align 64
325 define dso_local void @test_amx_dpbuud(i16 signext %row, i16 signext %col, i16 signext %k, <256 x i32> %c, <256 x i32> %a, <256 x i32> %b, <256 x i32>* %vptr) #0 {
326 ; CHECK-LABEL: @test_amx_dpbuud(
328 ; CHECK-NEXT: [[A_AMX:%.*]] = bitcast <256 x i32> [[A:%.*]] to x86_amx
329 ; CHECK-NEXT: [[B_AMX:%.*]] = bitcast <256 x i32> [[B:%.*]] to x86_amx
330 ; CHECK-NEXT: [[C_AMX:%.*]] = bitcast <256 x i32> [[C:%.*]] to x86_amx
331 ; CHECK-NEXT: [[TMP0:%.*]] = lshr i16 [[COL:%.*]], 2
332 ; CHECK-NEXT: [[TMP1:%.*]] = lshr i16 [[K:%.*]], 2
333 ; CHECK-NEXT: br label [[TILEDPBUUD_SCALARIZE_ROWS_HEADER:%.*]]
334 ; CHECK: tiledpbuud.scalarize.rows.header:
335 ; CHECK-NEXT: [[TILEDPBUUD_SCALARIZE_ROWS_IV:%.*]] = phi i16 [ 0, [[ENTRY:%.*]] ], [ [[TILEDPBUUD_SCALARIZE_ROWS_STEP:%.*]], [[TILEDPBUUD_SCALARIZE_ROWS_LATCH:%.*]] ]
336 ; CHECK-NEXT: [[VEC_C_PHI_ROW:%.*]] = phi <256 x i32> [ [[C]], [[ENTRY]] ], [ [[TMP18:%.*]], [[TILEDPBUUD_SCALARIZE_ROWS_LATCH]] ]
337 ; CHECK-NEXT: [[VEC_D_PHI_ROW:%.*]] = phi <256 x i32> [ zeroinitializer, [[ENTRY]] ], [ [[TMP20:%.*]], [[TILEDPBUUD_SCALARIZE_ROWS_LATCH]] ]
338 ; CHECK-NEXT: br label [[TILEDPBUUD_SCALARIZE_ROWS_BODY:%.*]]
339 ; CHECK: tiledpbuud.scalarize.rows.body:
340 ; CHECK-NEXT: br label [[TILEDPBUUD_SCALARIZE_COLS_HEADER:%.*]]
341 ; CHECK: tiledpbuud.scalarize.cols.header:
342 ; CHECK-NEXT: [[TILEDPBUUD_SCALARIZE_COLS_IV:%.*]] = phi i16 [ 0, [[TILEDPBUUD_SCALARIZE_ROWS_BODY]] ], [ [[TILEDPBUUD_SCALARIZE_COLS_STEP:%.*]], [[TILEDPBUUD_SCALARIZE_COLS_LATCH:%.*]] ]
343 ; CHECK-NEXT: [[VEC_C_PHI_COL:%.*]] = phi <256 x i32> [ [[VEC_C_PHI_ROW]], [[TILEDPBUUD_SCALARIZE_ROWS_BODY]] ], [ [[TMP18]], [[TILEDPBUUD_SCALARIZE_COLS_LATCH]] ]
344 ; CHECK-NEXT: [[VEC_D_PHI_COL:%.*]] = phi <256 x i32> [ [[VEC_D_PHI_ROW]], [[TILEDPBUUD_SCALARIZE_ROWS_BODY]] ], [ [[TMP20]], [[TILEDPBUUD_SCALARIZE_COLS_LATCH]] ]
345 ; CHECK-NEXT: [[TMP2:%.*]] = mul i16 [[TILEDPBUUD_SCALARIZE_ROWS_IV]], 16
346 ; CHECK-NEXT: [[TMP3:%.*]] = add i16 [[TMP2]], [[TILEDPBUUD_SCALARIZE_COLS_IV]]
347 ; CHECK-NEXT: br label [[TILEDPBUUD_SCALARIZE_COLS_BODY:%.*]]
348 ; CHECK: tiledpbuud.scalarize.cols.body:
349 ; CHECK-NEXT: br label [[TILEDPBUUD_SCALARIZE_INNER_HEADER:%.*]]
350 ; CHECK: tiledpbuud.scalarize.inner.header:
351 ; CHECK-NEXT: [[TILEDPBUUD_SCALARIZE_INNER_IV:%.*]] = phi i16 [ 0, [[TILEDPBUUD_SCALARIZE_COLS_BODY]] ], [ [[TILEDPBUUD_SCALARIZE_INNER_STEP:%.*]], [[TILEDPBUUD_SCALARIZE_INNER_LATCH:%.*]] ]
352 ; CHECK-NEXT: [[VEC_C_INNER_PHI:%.*]] = phi <256 x i32> [ [[VEC_C_PHI_COL]], [[TILEDPBUUD_SCALARIZE_COLS_BODY]] ], [ [[TMP18]], [[TILEDPBUUD_SCALARIZE_INNER_LATCH]] ]
353 ; CHECK-NEXT: br label [[TILEDPBUUD_SCALARIZE_INNER_BODY:%.*]]
354 ; CHECK: tiledpbuud.scalarize.inner.body:
355 ; CHECK-NEXT: [[TMP4:%.*]] = mul i16 [[TILEDPBUUD_SCALARIZE_ROWS_IV]], 16
356 ; CHECK-NEXT: [[TMP5:%.*]] = add i16 [[TMP4]], [[TILEDPBUUD_SCALARIZE_INNER_IV]]
357 ; CHECK-NEXT: [[TMP6:%.*]] = mul i16 [[TILEDPBUUD_SCALARIZE_INNER_IV]], 16
358 ; CHECK-NEXT: [[TMP7:%.*]] = add i16 [[TMP6]], [[TILEDPBUUD_SCALARIZE_COLS_IV]]
359 ; CHECK-NEXT: [[TMP8:%.*]] = extractelement <256 x i32> [[VEC_C_INNER_PHI]], i16 [[TMP3]]
360 ; CHECK-NEXT: [[TMP9:%.*]] = extractelement <256 x i32> [[A]], i16 [[TMP5]]
361 ; CHECK-NEXT: [[TMP10:%.*]] = bitcast i32 [[TMP9]] to <4 x i8>
362 ; CHECK-NEXT: [[TMP11:%.*]] = extractelement <256 x i32> [[B]], i16 [[TMP7]]
363 ; CHECK-NEXT: [[TMP12:%.*]] = bitcast i32 [[TMP11]] to <4 x i8>
364 ; CHECK-NEXT: [[TMP13:%.*]] = zext <4 x i8> [[TMP12]] to <4 x i32>
365 ; CHECK-NEXT: [[TMP14:%.*]] = zext <4 x i8> [[TMP10]] to <4 x i32>
366 ; CHECK-NEXT: [[TMP15:%.*]] = mul <4 x i32> [[TMP14]], [[TMP13]]
367 ; CHECK-NEXT: [[TMP16:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP15]])
368 ; CHECK-NEXT: [[TMP17:%.*]] = add i32 [[TMP8]], [[TMP16]]
369 ; CHECK-NEXT: [[TMP18]] = insertelement <256 x i32> [[VEC_C_INNER_PHI]], i32 [[TMP17]], i16 [[TMP3]]
370 ; CHECK-NEXT: br label [[TILEDPBUUD_SCALARIZE_INNER_LATCH]]
371 ; CHECK: tiledpbuud.scalarize.inner.latch:
372 ; CHECK-NEXT: [[TILEDPBUUD_SCALARIZE_INNER_STEP]] = add i16 [[TILEDPBUUD_SCALARIZE_INNER_IV]], 1
373 ; CHECK-NEXT: [[TILEDPBUUD_SCALARIZE_INNER_COND:%.*]] = icmp ne i16 [[TILEDPBUUD_SCALARIZE_INNER_STEP]], [[TMP1]]
374 ; CHECK-NEXT: br i1 [[TILEDPBUUD_SCALARIZE_INNER_COND]], label [[TILEDPBUUD_SCALARIZE_INNER_HEADER]], label [[TILEDPBUUD_SCALARIZE_COLS_LATCH]]
375 ; CHECK: tiledpbuud.scalarize.cols.latch:
376 ; CHECK-NEXT: [[TILEDPBUUD_SCALARIZE_COLS_STEP]] = add i16 [[TILEDPBUUD_SCALARIZE_COLS_IV]], 1
377 ; CHECK-NEXT: [[TILEDPBUUD_SCALARIZE_COLS_COND:%.*]] = icmp ne i16 [[TILEDPBUUD_SCALARIZE_COLS_STEP]], [[TMP0]]
378 ; CHECK-NEXT: [[TMP19:%.*]] = extractelement <256 x i32> [[TMP18]], i16 [[TMP3]]
379 ; CHECK-NEXT: [[TMP20]] = insertelement <256 x i32> [[VEC_D_PHI_COL]], i32 [[TMP19]], i16 [[TMP3]]
380 ; CHECK-NEXT: br i1 [[TILEDPBUUD_SCALARIZE_COLS_COND]], label [[TILEDPBUUD_SCALARIZE_COLS_HEADER]], label [[TILEDPBUUD_SCALARIZE_ROWS_LATCH]]
381 ; CHECK: tiledpbuud.scalarize.rows.latch:
382 ; CHECK-NEXT: [[TILEDPBUUD_SCALARIZE_ROWS_STEP]] = add i16 [[TILEDPBUUD_SCALARIZE_ROWS_IV]], 1
383 ; CHECK-NEXT: [[TILEDPBUUD_SCALARIZE_ROWS_COND:%.*]] = icmp ne i16 [[TILEDPBUUD_SCALARIZE_ROWS_STEP]], [[ROW:%.*]]
384 ; CHECK-NEXT: br i1 [[TILEDPBUUD_SCALARIZE_ROWS_COND]], label [[TILEDPBUUD_SCALARIZE_ROWS_HEADER]], label [[CONTINUE:%.*]]
386 ; CHECK-NEXT: [[TMP21:%.*]] = bitcast <256 x i32> [[TMP20]] to x86_amx
387 ; CHECK-NEXT: store <256 x i32> [[TMP20]], <256 x i32>* [[VPTR:%.*]], align 64
388 ; CHECK-NEXT: ret void
391 %a.amx = bitcast <256 x i32> %a to x86_amx
392 %b.amx = bitcast <256 x i32> %b to x86_amx
393 %c.amx = bitcast <256 x i32> %c to x86_amx
394 %acc = call x86_amx @llvm.x86.tdpbuud.internal(i16 %row, i16 %col, i16 %k, x86_amx %c.amx, x86_amx %a.amx, x86_amx %b.amx)
395 %vec = bitcast x86_amx %acc to <256 x i32>
396 store <256 x i32> %vec, <256 x i32>* %vptr, align 64
400 define dso_local void @test_amx_dpbf16ps(i16 signext %row, i16 signext %col, i16 signext %k, <256 x i32> %c, <256 x i32> %a, <256 x i32> %b, <256 x i32>* %vptr) #0 {
401 ; CHECK-LABEL: @test_amx_dpbf16ps(
403 ; CHECK-NEXT: [[A_AMX:%.*]] = bitcast <256 x i32> [[A:%.*]] to x86_amx
404 ; CHECK-NEXT: [[B_AMX:%.*]] = bitcast <256 x i32> [[B:%.*]] to x86_amx
405 ; CHECK-NEXT: [[C_AMX:%.*]] = bitcast <256 x i32> [[C:%.*]] to x86_amx
406 ; CHECK-NEXT: [[TMP0:%.*]] = lshr i16 [[COL:%.*]], 2
407 ; CHECK-NEXT: [[TMP1:%.*]] = lshr i16 [[K:%.*]], 2
408 ; CHECK-NEXT: br label [[TILEDPBF16PS_SCALARIZE_ROWS_HEADER:%.*]]
409 ; CHECK: tiledpbf16ps.scalarize.rows.header:
410 ; CHECK-NEXT: [[TILEDPBF16PS_SCALARIZE_ROWS_IV:%.*]] = phi i16 [ 0, [[ENTRY:%.*]] ], [ [[TILEDPBF16PS_SCALARIZE_ROWS_STEP:%.*]], [[TILEDPBF16PS_SCALARIZE_ROWS_LATCH:%.*]] ]
411 ; CHECK-NEXT: [[VEC_C_PHI_ROW:%.*]] = phi <256 x i32> [ [[C]], [[ENTRY]] ], [ [[TMP21:%.*]], [[TILEDPBF16PS_SCALARIZE_ROWS_LATCH]] ]
412 ; CHECK-NEXT: [[VEC_D_PHI_ROW:%.*]] = phi <256 x i32> [ zeroinitializer, [[ENTRY]] ], [ [[TMP23:%.*]], [[TILEDPBF16PS_SCALARIZE_ROWS_LATCH]] ]
413 ; CHECK-NEXT: br label [[TILEDPBF16PS_SCALARIZE_ROWS_BODY:%.*]]
414 ; CHECK: tiledpbf16ps.scalarize.rows.body:
415 ; CHECK-NEXT: br label [[TILEDPBF16PS_SCALARIZE_COLS_HEADER:%.*]]
416 ; CHECK: tiledpbf16ps.scalarize.cols.header:
417 ; CHECK-NEXT: [[TILEDPBF16PS_SCALARIZE_COLS_IV:%.*]] = phi i16 [ 0, [[TILEDPBF16PS_SCALARIZE_ROWS_BODY]] ], [ [[TILEDPBF16PS_SCALARIZE_COLS_STEP:%.*]], [[TILEDPBF16PS_SCALARIZE_COLS_LATCH:%.*]] ]
418 ; CHECK-NEXT: [[VEC_C_PHI_COL:%.*]] = phi <256 x i32> [ [[VEC_C_PHI_ROW]], [[TILEDPBF16PS_SCALARIZE_ROWS_BODY]] ], [ [[TMP21]], [[TILEDPBF16PS_SCALARIZE_COLS_LATCH]] ]
419 ; CHECK-NEXT: [[VEC_D_PHI_COL:%.*]] = phi <256 x i32> [ [[VEC_D_PHI_ROW]], [[TILEDPBF16PS_SCALARIZE_ROWS_BODY]] ], [ [[TMP23]], [[TILEDPBF16PS_SCALARIZE_COLS_LATCH]] ]
420 ; CHECK-NEXT: [[TMP2:%.*]] = mul i16 [[TILEDPBF16PS_SCALARIZE_ROWS_IV]], 16
421 ; CHECK-NEXT: [[TMP3:%.*]] = add i16 [[TMP2]], [[TILEDPBF16PS_SCALARIZE_COLS_IV]]
422 ; CHECK-NEXT: br label [[TILEDPBF16PS_SCALARIZE_COLS_BODY:%.*]]
423 ; CHECK: tiledpbf16ps.scalarize.cols.body:
424 ; CHECK-NEXT: br label [[TILEDPBF16PS_SCALARIZE_INNER_HEADER:%.*]]
425 ; CHECK: tiledpbf16ps.scalarize.inner.header:
426 ; CHECK-NEXT: [[TILEDPBF16PS_SCALARIZE_INNER_IV:%.*]] = phi i16 [ 0, [[TILEDPBF16PS_SCALARIZE_COLS_BODY]] ], [ [[TILEDPBF16PS_SCALARIZE_INNER_STEP:%.*]], [[TILEDPBF16PS_SCALARIZE_INNER_LATCH:%.*]] ]
427 ; CHECK-NEXT: [[VEC_C_INNER_PHI:%.*]] = phi <256 x i32> [ [[VEC_C_PHI_COL]], [[TILEDPBF16PS_SCALARIZE_COLS_BODY]] ], [ [[TMP21]], [[TILEDPBF16PS_SCALARIZE_INNER_LATCH]] ]
428 ; CHECK-NEXT: br label [[TILEDPBF16PS_SCALARIZE_INNER_BODY:%.*]]
429 ; CHECK: tiledpbf16ps.scalarize.inner.body:
430 ; CHECK-NEXT: [[TMP4:%.*]] = mul i16 [[TILEDPBF16PS_SCALARIZE_ROWS_IV]], 16
431 ; CHECK-NEXT: [[TMP5:%.*]] = add i16 [[TMP4]], [[TILEDPBF16PS_SCALARIZE_INNER_IV]]
432 ; CHECK-NEXT: [[TMP6:%.*]] = mul i16 [[TILEDPBF16PS_SCALARIZE_INNER_IV]], 16
433 ; CHECK-NEXT: [[TMP7:%.*]] = add i16 [[TMP6]], [[TILEDPBF16PS_SCALARIZE_COLS_IV]]
434 ; CHECK-NEXT: [[TMP8:%.*]] = extractelement <256 x i32> [[VEC_C_INNER_PHI]], i16 [[TMP3]]
435 ; CHECK-NEXT: [[TMP9:%.*]] = bitcast i32 [[TMP8]] to float
436 ; CHECK-NEXT: [[TMP10:%.*]] = extractelement <256 x i32> [[A]], i16 [[TMP5]]
437 ; CHECK-NEXT: [[TMP11:%.*]] = bitcast i32 [[TMP10]] to <2 x i16>
438 ; CHECK-NEXT: [[TMP12:%.*]] = extractelement <256 x i32> [[B]], i16 [[TMP7]]
439 ; CHECK-NEXT: [[TMP13:%.*]] = bitcast i32 [[TMP12]] to <2 x i16>
440 ; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <2 x i16> [[TMP11]], <2 x i16> zeroinitializer, <4 x i32> <i32 2, i32 0, i32 3, i32 1>
441 ; CHECK-NEXT: [[TMP15:%.*]] = bitcast <4 x i16> [[TMP14]] to <2 x float>
442 ; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <2 x i16> [[TMP13]], <2 x i16> zeroinitializer, <4 x i32> <i32 2, i32 0, i32 3, i32 1>
443 ; CHECK-NEXT: [[TMP17:%.*]] = bitcast <4 x i16> [[TMP16]] to <2 x float>
444 ; CHECK-NEXT: [[TMP18:%.*]] = fmul <2 x float> [[TMP15]], [[TMP17]]
445 ; CHECK-NEXT: [[TMP19:%.*]] = call float @llvm.vector.reduce.fadd.v2f32(float [[TMP9]], <2 x float> [[TMP18]])
446 ; CHECK-NEXT: [[TMP20:%.*]] = bitcast float [[TMP19]] to i32
447 ; CHECK-NEXT: [[TMP21]] = insertelement <256 x i32> [[VEC_C_INNER_PHI]], i32 [[TMP20]], i16 [[TMP3]]
448 ; CHECK-NEXT: br label [[TILEDPBF16PS_SCALARIZE_INNER_LATCH]]
449 ; CHECK: tiledpbf16ps.scalarize.inner.latch:
450 ; CHECK-NEXT: [[TILEDPBF16PS_SCALARIZE_INNER_STEP]] = add i16 [[TILEDPBF16PS_SCALARIZE_INNER_IV]], 1
451 ; CHECK-NEXT: [[TILEDPBF16PS_SCALARIZE_INNER_COND:%.*]] = icmp ne i16 [[TILEDPBF16PS_SCALARIZE_INNER_STEP]], [[TMP1]]
452 ; CHECK-NEXT: br i1 [[TILEDPBF16PS_SCALARIZE_INNER_COND]], label [[TILEDPBF16PS_SCALARIZE_INNER_HEADER]], label [[TILEDPBF16PS_SCALARIZE_COLS_LATCH]]
453 ; CHECK: tiledpbf16ps.scalarize.cols.latch:
454 ; CHECK-NEXT: [[TILEDPBF16PS_SCALARIZE_COLS_STEP]] = add i16 [[TILEDPBF16PS_SCALARIZE_COLS_IV]], 1
455 ; CHECK-NEXT: [[TILEDPBF16PS_SCALARIZE_COLS_COND:%.*]] = icmp ne i16 [[TILEDPBF16PS_SCALARIZE_COLS_STEP]], [[TMP0]]
456 ; CHECK-NEXT: [[TMP22:%.*]] = extractelement <256 x i32> [[TMP21]], i16 [[TMP3]]
457 ; CHECK-NEXT: [[TMP23]] = insertelement <256 x i32> [[VEC_D_PHI_COL]], i32 [[TMP22]], i16 [[TMP3]]
458 ; CHECK-NEXT: br i1 [[TILEDPBF16PS_SCALARIZE_COLS_COND]], label [[TILEDPBF16PS_SCALARIZE_COLS_HEADER]], label [[TILEDPBF16PS_SCALARIZE_ROWS_LATCH]]
459 ; CHECK: tiledpbf16ps.scalarize.rows.latch:
460 ; CHECK-NEXT: [[TILEDPBF16PS_SCALARIZE_ROWS_STEP]] = add i16 [[TILEDPBF16PS_SCALARIZE_ROWS_IV]], 1
461 ; CHECK-NEXT: [[TILEDPBF16PS_SCALARIZE_ROWS_COND:%.*]] = icmp ne i16 [[TILEDPBF16PS_SCALARIZE_ROWS_STEP]], [[ROW:%.*]]
462 ; CHECK-NEXT: br i1 [[TILEDPBF16PS_SCALARIZE_ROWS_COND]], label [[TILEDPBF16PS_SCALARIZE_ROWS_HEADER]], label [[CONTINUE:%.*]]
464 ; CHECK-NEXT: [[TMP24:%.*]] = bitcast <256 x i32> [[TMP23]] to x86_amx
465 ; CHECK-NEXT: store <256 x i32> [[TMP23]], <256 x i32>* [[VPTR:%.*]], align 64
466 ; CHECK-NEXT: ret void
469 %a.amx = bitcast <256 x i32> %a to x86_amx
470 %b.amx = bitcast <256 x i32> %b to x86_amx
471 %c.amx = bitcast <256 x i32> %c to x86_amx
472 %acc = call x86_amx @llvm.x86.tdpbf16ps.internal(i16 %row, i16 %col, i16 %k, x86_amx %c.amx, x86_amx %a.amx, x86_amx %b.amx)
473 %vec = bitcast x86_amx %acc to <256 x i32>
474 store <256 x i32> %vec, <256 x i32>* %vptr, align 64
478 define dso_local void @test_amx_store(i16 signext %row, i16 signext %col, i8 *%ptr, i64 %stride, <256 x i32>* %vptr, <256 x i32> %vec) #0 {
479 ; CHECK-LABEL: @test_amx_store(
481 ; CHECK-NEXT: [[AMX:%.*]] = bitcast <256 x i32> [[VEC:%.*]] to x86_amx
482 ; CHECK-NEXT: [[TMP0:%.*]] = lshr i16 [[COL:%.*]], 2
483 ; CHECK-NEXT: [[TMP1:%.*]] = lshr i64 [[STRIDE:%.*]], 2
484 ; CHECK-NEXT: br label [[TILESTORE_SCALARIZE_ROWS_HEADER:%.*]]
485 ; CHECK: tilestore.scalarize.rows.header:
486 ; CHECK-NEXT: [[TILESTORE_SCALARIZE_ROWS_IV:%.*]] = phi i16 [ 0, [[ENTRY:%.*]] ], [ [[TILESTORE_SCALARIZE_ROWS_STEP:%.*]], [[TILESTORE_SCALARIZE_ROWS_LATCH:%.*]] ]
487 ; CHECK-NEXT: br label [[TILESTORE_SCALARIZE_ROWS_BODY:%.*]]
488 ; CHECK: tilestore.scalarize.rows.body:
489 ; CHECK-NEXT: br label [[TILESTORE_SCALARIZE_COLS_HEADER:%.*]]
490 ; CHECK: tilestore.scalarize.cols.header:
491 ; CHECK-NEXT: [[TILESTORE_SCALARIZE_COLS_IV:%.*]] = phi i16 [ 0, [[TILESTORE_SCALARIZE_ROWS_BODY]] ], [ [[TILESTORE_SCALARIZE_COLS_STEP:%.*]], [[TILESTORE_SCALARIZE_COLS_LATCH:%.*]] ]
492 ; CHECK-NEXT: br label [[TILESTORE_SCALARIZE_COLS_BODY:%.*]]
493 ; CHECK: tilestore.scalarize.cols.body:
494 ; CHECK-NEXT: [[TMP2:%.*]] = zext i16 [[TILESTORE_SCALARIZE_ROWS_IV]] to i64
495 ; CHECK-NEXT: [[TMP3:%.*]] = zext i16 [[TILESTORE_SCALARIZE_COLS_IV]] to i64
496 ; CHECK-NEXT: [[TMP4:%.*]] = mul i64 [[TMP2]], [[TMP1]]
497 ; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[TMP4]], [[TMP3]]
498 ; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8* [[PTR:%.*]] to i32*
499 ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i32, i32* [[TMP6]], i64 [[TMP5]]
500 ; CHECK-NEXT: [[TMP8:%.*]] = mul i16 [[TILESTORE_SCALARIZE_ROWS_IV]], 16
501 ; CHECK-NEXT: [[TMP9:%.*]] = add i16 [[TMP8]], [[TILESTORE_SCALARIZE_COLS_IV]]
502 ; CHECK-NEXT: [[TMP10:%.*]] = extractelement <256 x i32> [[VEC]], i16 [[TMP9]]
503 ; CHECK-NEXT: store i32 [[TMP10]], i32* [[TMP7]], align 4
504 ; CHECK-NEXT: br label [[TILESTORE_SCALARIZE_COLS_LATCH]]
505 ; CHECK: tilestore.scalarize.cols.latch:
506 ; CHECK-NEXT: [[TILESTORE_SCALARIZE_COLS_STEP]] = add i16 [[TILESTORE_SCALARIZE_COLS_IV]], 1
507 ; CHECK-NEXT: [[TILESTORE_SCALARIZE_COLS_COND:%.*]] = icmp ne i16 [[TILESTORE_SCALARIZE_COLS_STEP]], [[TMP0]]
508 ; CHECK-NEXT: br i1 [[TILESTORE_SCALARIZE_COLS_COND]], label [[TILESTORE_SCALARIZE_COLS_HEADER]], label [[TILESTORE_SCALARIZE_ROWS_LATCH]]
509 ; CHECK: tilestore.scalarize.rows.latch:
510 ; CHECK-NEXT: [[TILESTORE_SCALARIZE_ROWS_STEP]] = add i16 [[TILESTORE_SCALARIZE_ROWS_IV]], 1
511 ; CHECK-NEXT: [[TILESTORE_SCALARIZE_ROWS_COND:%.*]] = icmp ne i16 [[TILESTORE_SCALARIZE_ROWS_STEP]], [[ROW:%.*]]
512 ; CHECK-NEXT: br i1 [[TILESTORE_SCALARIZE_ROWS_COND]], label [[TILESTORE_SCALARIZE_ROWS_HEADER]], label [[CONTINUE:%.*]]
514 ; CHECK-NEXT: ret void
517 %amx = bitcast <256 x i32> %vec to x86_amx
518 call void @llvm.x86.tilestored64.internal(i16 %row, i16 %col, i8* %ptr, i64 %stride, x86_amx %amx)
522 define dso_local void @test_amx_zero(i16 signext %row, i16 signext %col, <256 x i32>* %vptr) #0 {
523 ; CHECK-LABEL: @test_amx_zero(
525 ; CHECK-NEXT: store <256 x i32> zeroinitializer, <256 x i32>* [[VPTR:%.*]], align 64
526 ; CHECK-NEXT: ret void
529 %amx = call x86_amx @llvm.x86.tilezero.internal(i16 %row, i16 %col)
530 %vec = bitcast x86_amx %amx to <256 x i32>
531 store <256 x i32> %vec, <256 x i32>* %vptr, align 64
535 declare x86_amx @llvm.x86.tilezero.internal(i16, i16)
536 declare x86_amx @llvm.x86.tileloadd64.internal(i16, i16, i8*, i64)
537 declare x86_amx @llvm.x86.tdpbssd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)
538 declare x86_amx @llvm.x86.tdpbsud.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)
539 declare x86_amx @llvm.x86.tdpbusd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)
540 declare x86_amx @llvm.x86.tdpbuud.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)
541 declare x86_amx @llvm.x86.tdpbf16ps.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)
542 declare void @llvm.x86.tilestored64.internal(i16, i16, i8*, i64, x86_amx)
544 attributes #0 = { noinline nounwind optnone }