1 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
2 ; RUN: opt --codegen-opt-level=2 -mtriple=x86_64 -lower-amx-type %s -S | FileCheck %s
; Aggregate used by the __tile_* functions in this file: they read the two i16
; fields (indices 0 and 1) and pass them as the leading i16 arguments of the AMX
; intrinsics; field 2 holds the <256 x i32> data that is loaded/stored.
4 %struct.__tile_str = type { i16, i16, <256 x i32> }
; NOTE(review): @buf and @buf2 are not referenced by any function visible in
; this listing (the functions take a %buf parameter instead).
6 @buf = dso_local global [1024 x i8] zeroinitializer, align 64
7 @buf2 = dso_local global [1024 x i8] zeroinitializer, align 64
9 ; test cast of x86_amx to <256 x i32> (llvm.x86.cast.tile.to.vector) whose result is unused
; Input: a tile is loaded with @llvm.x86.tileloadd64.internal and cast to
; <256 x i32>; no use of the cast result is visible in this function.
; Expected: the tile load is kept and is immediately followed by `ret void`,
; i.e. no cast call appears in the checked output.
10 define dso_local void @test_user_empty(i16 %m, i16 %n, ptr%buf, i64 %s) {
11 ; CHECK-LABEL: @test_user_empty(
13 ; CHECK-NEXT: [[T1:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[M:%.*]], i16 [[N:%.*]], ptr [[BUF:%.*]], i64 [[S:%.*]])
14 ; CHECK-NEXT: ret void
17 %t1 = call x86_amx @llvm.x86.tileloadd64.internal(i16 %m, i16 %n, ptr %buf, i64 %s)
18 %t2 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %t1)
22 ; test cast of <256 x i32> to x86_amx (llvm.x86.cast.vector.to.tile) whose result is unused
; Input: the <256 x i32> argument is cast to x86_amx and no use of the result
; is visible in this function.
; Expected: the visible checks expect only `ret void`, so the cast call is not
; expected in the output.
23 define dso_local void @test_user_empty2(<256 x i32> %in) {
24 ; CHECK-LABEL: @test_user_empty2(
26 ; CHECK-NEXT: ret void
29 %t = call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %in)
; Input: a <256 x i32> loaded from %in is cast to x86_amx and written out with
; @llvm.x86.tilestored64.internal using shape (%m, %n).
; Expected: the vector load is kept (the checks also return it); the loaded
; vector is stored to a stack slot and re-read as a tile with
; @llvm.x86.tileloadd64.internal, using sext(%n) as the stride, ahead of the
; original tile store.
33 define dso_local <256 x i32> @test_amx_load_bitcast_v256i32(ptr %in, i16 %m, i16 %n, ptr%buf, i64 %s) {
34 ; CHECK-LABEL: @test_amx_load_bitcast_v256i32(
36 ; CHECK-NEXT: [[TMP0:%.*]] = alloca <256 x i32>, align 64
37 ; CHECK-NEXT: [[T1:%.*]] = load <256 x i32>, ptr [[IN:%.*]], align 64
38 ; CHECK-NEXT: store <256 x i32> [[T1]], ptr [[TMP0]], align 1024
39 ; CHECK-NEXT: [[TMP1:%.*]] = sext i16 [[N:%.*]] to i64
40 ; CHECK-NEXT: [[TMP2:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[M:%.*]], i16 [[N]], ptr [[TMP0]], i64 [[TMP1]])
41 ; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 [[M]], i16 [[N]], ptr [[BUF:%.*]], i64 [[S:%.*]], x86_amx [[TMP2]])
42 ; CHECK-NEXT: ret <256 x i32> [[T1]]
45 %t1 = load <256 x i32>, ptr %in, align 64
46 %t2 = call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %t1)
47 call void @llvm.x86.tilestored64.internal(i16 %m, i16 %n, ptr %buf, i64 %s, x86_amx %t2)
; Input: a <225 x i32> (a vector type other than <256 x i32>) loaded from %in
; is cast to x86_amx via the v225i32 cast intrinsic and written out with
; @llvm.x86.tilestored64.internal using shape (%m, %n).
; Expected: the vector load is kept (the checks also return it); the loaded
; vector is stored to a <225 x i32> stack slot and re-read as a tile with
; @llvm.x86.tileloadd64.internal, using sext(%n) as the stride.
51 define dso_local <225 x i32> @test_amx_load_bitcast_v225i32(ptr %in, i16 %m, i16 %n, ptr%buf, i64 %s) {
52 ; CHECK-LABEL: @test_amx_load_bitcast_v225i32(
54 ; CHECK-NEXT: [[TMP0:%.*]] = alloca <225 x i32>, align 64
55 ; CHECK-NEXT: [[T1:%.*]] = load <225 x i32>, ptr [[IN:%.*]], align 64
56 ; CHECK-NEXT: store <225 x i32> [[T1]], ptr [[TMP0]], align 1024
57 ; CHECK-NEXT: [[TMP1:%.*]] = sext i16 [[N:%.*]] to i64
58 ; CHECK-NEXT: [[TMP2:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[M:%.*]], i16 [[N]], ptr [[TMP0]], i64 [[TMP1]])
59 ; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 [[M]], i16 [[N]], ptr [[BUF:%.*]], i64 [[S:%.*]], x86_amx [[TMP2]])
60 ; CHECK-NEXT: ret <225 x i32> [[T1]]
63 %t1 = load <225 x i32>, ptr %in, align 64
64 %t2 = call x86_amx @llvm.x86.cast.vector.to.tile.v225i32(<225 x i32> %t1)
65 call void @llvm.x86.tilestored64.internal(i16 %m, i16 %n, ptr %buf, i64 %s, x86_amx %t2)
; Input: a tile loaded with shape (%m, %m) is cast to <256 x i32> and that
; vector is stored to %out.
; Expected: the tile is written to a stack slot with tilestored64 and reloaded
; as a vector (the checked return value), and the vector store to %out is
; replaced by a tilestored64 of the original tile with sext(%m) as the stride.
69 define dso_local <256 x i32> @test_amx_bitcast_store(ptr %out, i16 %m, i16 %n, ptr%buf, i64 %s) {
70 ; CHECK-LABEL: @test_amx_bitcast_store(
72 ; CHECK-NEXT: [[TMP0:%.*]] = alloca <256 x i32>, align 64
73 ; CHECK-NEXT: [[T1:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[M:%.*]], i16 [[M]], ptr [[BUF:%.*]], i64 [[S:%.*]])
74 ; CHECK-NEXT: [[TMP1:%.*]] = sext i16 [[M]] to i64
75 ; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 [[M]], i16 [[M]], ptr [[TMP0]], i64 [[TMP1]], x86_amx [[T1]])
76 ; CHECK-NEXT: [[TMP2:%.*]] = load <256 x i32>, ptr [[TMP0]], align 1024
77 ; CHECK-NEXT: [[TMP3:%.*]] = sext i16 [[M]] to i64
78 ; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 [[M]], i16 [[M]], ptr [[OUT:%.*]], i64 [[TMP3]], x86_amx [[T1]])
79 ; CHECK-NEXT: ret <256 x i32> [[TMP2]]
82 %t1 = call x86_amx @llvm.x86.tileloadd64.internal(i16 %m, i16 %m, ptr %buf, i64 %s)
83 %t2 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %t1)
84 store <256 x i32> %t2, ptr %out
; Input: the source of the vector-to-tile cast is an `add` result rather than
; a load; the tile is then stored with shape (%r, %c).
; Expected: the add is kept, its result is stored to a stack slot, and the tile
; is produced by tileloadd64 from that slot with shape (%r, %c) and stride
; sext(%c).
88 define dso_local void @test_src_add(<256 x i32> %x, <256 x i32> %y, i16 %r, i16 %c, ptr %buf, i64 %s) {
89 ; CHECK-LABEL: @test_src_add(
91 ; CHECK-NEXT: [[TMP0:%.*]] = alloca <256 x i32>, align 64
92 ; CHECK-NEXT: [[ADD:%.*]] = add <256 x i32> [[Y:%.*]], [[X:%.*]]
93 ; CHECK-NEXT: store <256 x i32> [[ADD]], ptr [[TMP0]], align 1024
94 ; CHECK-NEXT: [[TMP1:%.*]] = sext i16 [[C:%.*]] to i64
95 ; CHECK-NEXT: [[TMP2:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[R:%.*]], i16 [[C]], ptr [[TMP0]], i64 [[TMP1]])
96 ; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 [[R]], i16 [[C]], ptr [[BUF:%.*]], i64 [[S:%.*]], x86_amx [[TMP2]])
97 ; CHECK-NEXT: ret void
100 %add = add <256 x i32> %y, %x
101 %t = call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %add)
102 call void @llvm.x86.tilestored64.internal(i16 %r, i16 %c, ptr %buf, i64 %s, x86_amx %t)
; Input: a loaded tile is cast to <256 x i32> and consumed by an `add`.
; Expected: the tile is stored to a stack slot with tilestored64 (stride
; sext(%c)) and the add operand becomes a vector load from that slot.
106 define dso_local void @test_src_add2(<256 x i32> %x, i16 %r, i16 %c, ptr %buf, i64 %s) {
107 ; CHECK-LABEL: @test_src_add2(
109 ; CHECK-NEXT: [[TMP0:%.*]] = alloca <256 x i32>, align 64
110 ; CHECK-NEXT: [[T1:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[R:%.*]], i16 [[C:%.*]], ptr [[BUF:%.*]], i64 [[S:%.*]])
111 ; CHECK-NEXT: [[TMP1:%.*]] = sext i16 [[C]] to i64
112 ; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 [[R]], i16 [[C]], ptr [[TMP0]], i64 [[TMP1]], x86_amx [[T1]])
113 ; CHECK-NEXT: [[TMP2:%.*]] = load <256 x i32>, ptr [[TMP0]], align 1024
114 ; CHECK-NEXT: [[ADD:%.*]] = add <256 x i32> [[TMP2]], [[X:%.*]]
115 ; CHECK-NEXT: ret void
118 %t1 = call x86_amx @llvm.x86.tileloadd64.internal(i16 %r, i16 %c, ptr %buf, i64 %s)
119 %t2 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %t1)
120 %add = add <256 x i32> %t2, %x
; Input: two i16 values are read from fields 0 and 1 of the %struct.__tile_str
; at %0 and used as the tile shape; a tile is loaded from %1, cast to
; <256 x i32>, and stored into field 2 of the struct.
; Expected: the cast and the vector store are replaced by a single tilestored64
; to the field-2 address, with sext of the field-1 value as the stride.
; NOTE(review): %7 is used by the `ashr` below, but its defining instruction is
; not visible in this listing; the checks expect a `shl i64 ..., 32` there.
124 define dso_local void @__tile_loadd(ptr nocapture %0, ptr %1, i64 %2) local_unnamed_addr {
125 ; CHECK-LABEL: @__tile_loadd(
126 ; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr [[TMP0:%.*]], align 64
127 ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TILE_STR:%.*]], ptr [[TMP0]], i64 0, i32 1
128 ; CHECK-NEXT: [[TMP6:%.*]] = load i16, ptr [[TMP5]], align 2
129 ; CHECK-NEXT: [[TMP7:%.*]] = shl i64 [[TMP2:%.*]], 32
130 ; CHECK-NEXT: [[TMP8:%.*]] = ashr exact i64 [[TMP7]], 32
131 ; CHECK-NEXT: [[TMP9:%.*]] = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 [[TMP4]], i16 [[TMP6]], ptr [[TMP1:%.*]], i64 [[TMP8]])
132 ; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[STRUCT___TILE_STR]], ptr [[TMP0]], i64 0, i32 2
133 ; CHECK-NEXT: [[TMP11:%.*]] = sext i16 [[TMP6]] to i64
134 ; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 [[TMP4]], i16 [[TMP6]], ptr [[TMP10]], i64 [[TMP11]], x86_amx [[TMP9]])
135 ; CHECK-NEXT: ret void
137 %4 = load i16, ptr %0, align 64
138 %5 = getelementptr inbounds %struct.__tile_str, ptr %0, i64 0, i32 1
139 %6 = load i16, ptr %5, align 2
141 %8 = ashr exact i64 %7, 32
142 %9 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %4, i16 %6, ptr %1, i64 %8)
143 %10 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %9)
144 %11 = getelementptr inbounds %struct.__tile_str, ptr %0, i64 0, i32 2
145 store <256 x i32> %10, ptr %11, align 64
; Input: three <256 x i32> values are loaded from field 2 of the structs at %0,
; %1 and %2, cast to x86_amx, passed to @llvm.x86.tdpbssd.internal, and the
; result is cast back to a vector and stored to field 2 of %0.
; Expected: each vector load + cast becomes a tileloadd64 from the same
; address, and the final cast + store becomes a tilestored64. Writing
; row = field 0 of %1, col = field 1 of %2 and k = field 1 of %1, the checked
; load shapes are (row, col) for %0, (row, k) for %1 and (k udiv 4, col) for
; %2; every stride is the sext of that load's column value.
149 define dso_local void @__tile_dpbssd(ptr nocapture %0, ptr nocapture readonly byval(%struct.__tile_str) align 64 %1, ptr nocapture readonly byval(%struct.__tile_str) align 64 %2) local_unnamed_addr {
150 ; CHECK-LABEL: @__tile_dpbssd(
151 ; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr [[TMP1:%.*]], align 64
152 ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TILE_STR:%.*]], ptr [[TMP2:%.*]], i64 0, i32 1
153 ; CHECK-NEXT: [[TMP6:%.*]] = load i16, ptr [[TMP5]], align 2
154 ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TILE_STR]], ptr [[TMP1]], i64 0, i32 1
155 ; CHECK-NEXT: [[TMP8:%.*]] = load i16, ptr [[TMP7]], align 2
156 ; CHECK-NEXT: [[TMP9:%.*]] = udiv i16 [[TMP8]], 4
157 ; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[STRUCT___TILE_STR]], ptr [[TMP0:%.*]], i64 0, i32 2
158 ; CHECK-NEXT: [[TMP11:%.*]] = sext i16 [[TMP6]] to i64
159 ; CHECK-NEXT: [[TMP12:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[TMP4]], i16 [[TMP6]], ptr [[TMP10]], i64 [[TMP11]])
160 ; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds [[STRUCT___TILE_STR]], ptr [[TMP1]], i64 0, i32 2
161 ; CHECK-NEXT: [[TMP14:%.*]] = sext i16 [[TMP8]] to i64
162 ; CHECK-NEXT: [[TMP15:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[TMP4]], i16 [[TMP8]], ptr [[TMP13]], i64 [[TMP14]])
163 ; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TILE_STR]], ptr [[TMP2]], i64 0, i32 2
164 ; CHECK-NEXT: [[TMP17:%.*]] = sext i16 [[TMP6]] to i64
165 ; CHECK-NEXT: [[TMP18:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[TMP9]], i16 [[TMP6]], ptr [[TMP16]], i64 [[TMP17]])
166 ; CHECK-NEXT: [[TMP19:%.*]] = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 [[TMP4]], i16 [[TMP6]], i16 [[TMP8]], x86_amx [[TMP12]], x86_amx [[TMP15]], x86_amx [[TMP18]])
167 ; CHECK-NEXT: [[TMP20:%.*]] = sext i16 [[TMP6]] to i64
168 ; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 [[TMP4]], i16 [[TMP6]], ptr [[TMP10]], i64 [[TMP20]], x86_amx [[TMP19]])
169 ; CHECK-NEXT: ret void
171 %4 = load i16, ptr %1, align 64
172 %5 = getelementptr inbounds %struct.__tile_str, ptr %2, i64 0, i32 1
173 %6 = load i16, ptr %5, align 2
174 %7 = getelementptr inbounds %struct.__tile_str, ptr %1, i64 0, i32 1
175 %8 = load i16, ptr %7, align 2
176 %9 = getelementptr inbounds %struct.__tile_str, ptr %0, i64 0, i32 2
177 %10 = load <256 x i32>, ptr %9, align 64
178 %11 = call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %10)
179 %12 = getelementptr inbounds %struct.__tile_str, ptr %1, i64 0, i32 2
180 %13 = load <256 x i32>, ptr %12, align 64
181 %14 = call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %13)
182 %15 = getelementptr inbounds %struct.__tile_str, ptr %2, i64 0, i32 2
183 %16 = load <256 x i32>, ptr %15, align 64
184 %17 = call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %16)
185 %18 = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 %4, i16 %6, i16 %8, x86_amx %11, x86_amx %14, x86_amx %17)
186 %19 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %18)
187 store <256 x i32> %19, ptr %9, align 64
; Input: vectors loaded from %pa, %pb and %pc are cast to x86_amx and passed to
; @llvm.x86.tdpbsud.internal(%m, %n, %k, c, a, b); the result is cast back to
; <256 x i32> and stored to %pc.
; Expected: the vector loads become tileloadd64 with shapes (%m, %k) for %pa,
; (%k udiv 4, %n) for %pb and (%m, %n) for %pc, and the result is written to
; %pc with tilestored64; each stride is the sext of that access's column value.
191 define dso_local void @__tile_dpbsud(i16 %m, i16 %n, i16 %k, ptr %pc, ptr %pa, ptr %pb) {
192 ; CHECK-LABEL: @__tile_dpbsud(
193 ; CHECK-NEXT: [[TMP1:%.*]] = udiv i16 [[K:%.*]], 4
194 ; CHECK-NEXT: [[TMP2:%.*]] = sext i16 [[K]] to i64
195 ; CHECK-NEXT: [[TMP3:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[M:%.*]], i16 [[K]], ptr [[PA:%.*]], i64 [[TMP2]])
196 ; CHECK-NEXT: [[TMP4:%.*]] = sext i16 [[N:%.*]] to i64
197 ; CHECK-NEXT: [[TMP5:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[TMP1]], i16 [[N]], ptr [[PB:%.*]], i64 [[TMP4]])
198 ; CHECK-NEXT: [[TMP6:%.*]] = sext i16 [[N]] to i64
199 ; CHECK-NEXT: [[TMP7:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[M]], i16 [[N]], ptr [[PC:%.*]], i64 [[TMP6]])
200 ; CHECK-NEXT: [[T6:%.*]] = tail call x86_amx @llvm.x86.tdpbsud.internal(i16 [[M]], i16 [[N]], i16 [[K]], x86_amx [[TMP7]], x86_amx [[TMP3]], x86_amx [[TMP5]])
201 ; CHECK-NEXT: [[TMP8:%.*]] = sext i16 [[N]] to i64
202 ; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 [[M]], i16 [[N]], ptr [[PC]], i64 [[TMP8]], x86_amx [[T6]])
203 ; CHECK-NEXT: ret void
205 %t0 = load <256 x i32>, ptr %pa, align 64
206 %t1 = call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %t0)
207 %t2 = load <256 x i32>, ptr %pb, align 64
208 %t3 = call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %t2)
209 %t4 = load <256 x i32>, ptr %pc, align 64
210 %t5 = call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %t4)
211 %t6 = tail call x86_amx @llvm.x86.tdpbsud.internal(i16 %m, i16 %n, i16 %k, x86_amx %t5, x86_amx %t1, x86_amx %t3)
212 %t7 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %t6)
213 store <256 x i32> %t7, ptr %pc, align 64
; Input: vectors loaded from %pa, %pb and %pc are cast to x86_amx and passed to
; @llvm.x86.tdpbusd.internal(%m, %n, %k, c, a, b); the result is cast back to
; <256 x i32> and stored to %pc.
; Expected: the vector loads become tileloadd64 with shapes (%m, %k) for %pa,
; (%k udiv 4, %n) for %pb and (%m, %n) for %pc, and the result is written to
; %pc with tilestored64; each stride is the sext of that access's column value.
217 define dso_local void @__tile_dpbusd(i16 %m, i16 %n, i16 %k, ptr %pc, ptr %pa, ptr %pb) {
218 ; CHECK-LABEL: @__tile_dpbusd(
219 ; CHECK-NEXT: [[TMP1:%.*]] = udiv i16 [[K:%.*]], 4
220 ; CHECK-NEXT: [[TMP2:%.*]] = sext i16 [[K]] to i64
221 ; CHECK-NEXT: [[TMP3:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[M:%.*]], i16 [[K]], ptr [[PA:%.*]], i64 [[TMP2]])
222 ; CHECK-NEXT: [[TMP4:%.*]] = sext i16 [[N:%.*]] to i64
223 ; CHECK-NEXT: [[TMP5:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[TMP1]], i16 [[N]], ptr [[PB:%.*]], i64 [[TMP4]])
224 ; CHECK-NEXT: [[TMP6:%.*]] = sext i16 [[N]] to i64
225 ; CHECK-NEXT: [[TMP7:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[M]], i16 [[N]], ptr [[PC:%.*]], i64 [[TMP6]])
226 ; CHECK-NEXT: [[T6:%.*]] = tail call x86_amx @llvm.x86.tdpbusd.internal(i16 [[M]], i16 [[N]], i16 [[K]], x86_amx [[TMP7]], x86_amx [[TMP3]], x86_amx [[TMP5]])
227 ; CHECK-NEXT: [[TMP8:%.*]] = sext i16 [[N]] to i64
228 ; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 [[M]], i16 [[N]], ptr [[PC]], i64 [[TMP8]], x86_amx [[T6]])
229 ; CHECK-NEXT: ret void
231 %t0 = load <256 x i32>, ptr %pa, align 64
232 %t1 = call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %t0)
233 %t2 = load <256 x i32>, ptr %pb, align 64
234 %t3 = call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %t2)
235 %t4 = load <256 x i32>, ptr %pc, align 64
236 %t5 = call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %t4)
237 %t6 = tail call x86_amx @llvm.x86.tdpbusd.internal(i16 %m, i16 %n, i16 %k, x86_amx %t5, x86_amx %t1, x86_amx %t3)
238 %t7 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %t6)
239 store <256 x i32> %t7, ptr %pc, align 64
; Input: vectors loaded from %pa, %pb and %pc are cast to x86_amx and passed to
; @llvm.x86.tdpbuud.internal(%m, %n, %k, c, a, b); the result is cast back to
; <256 x i32> and stored to %pc.
; Expected: the vector loads become tileloadd64 with shapes (%m, %k) for %pa,
; (%k udiv 4, %n) for %pb and (%m, %n) for %pc, and the result is written to
; %pc with tilestored64; each stride is the sext of that access's column value.
243 define dso_local void @__tile_dpbuud(i16 %m, i16 %n, i16 %k, ptr %pc, ptr %pa, ptr %pb) {
244 ; CHECK-LABEL: @__tile_dpbuud(
245 ; CHECK-NEXT: [[TMP1:%.*]] = udiv i16 [[K:%.*]], 4
246 ; CHECK-NEXT: [[TMP2:%.*]] = sext i16 [[K]] to i64
247 ; CHECK-NEXT: [[TMP3:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[M:%.*]], i16 [[K]], ptr [[PA:%.*]], i64 [[TMP2]])
248 ; CHECK-NEXT: [[TMP4:%.*]] = sext i16 [[N:%.*]] to i64
249 ; CHECK-NEXT: [[TMP5:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[TMP1]], i16 [[N]], ptr [[PB:%.*]], i64 [[TMP4]])
250 ; CHECK-NEXT: [[TMP6:%.*]] = sext i16 [[N]] to i64
251 ; CHECK-NEXT: [[TMP7:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[M]], i16 [[N]], ptr [[PC:%.*]], i64 [[TMP6]])
252 ; CHECK-NEXT: [[T6:%.*]] = tail call x86_amx @llvm.x86.tdpbuud.internal(i16 [[M]], i16 [[N]], i16 [[K]], x86_amx [[TMP7]], x86_amx [[TMP3]], x86_amx [[TMP5]])
253 ; CHECK-NEXT: [[TMP8:%.*]] = sext i16 [[N]] to i64
254 ; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 [[M]], i16 [[N]], ptr [[PC]], i64 [[TMP8]], x86_amx [[T6]])
255 ; CHECK-NEXT: ret void
257 %t0 = load <256 x i32>, ptr %pa, align 64
258 %t1 = call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %t0)
259 %t2 = load <256 x i32>, ptr %pb, align 64
260 %t3 = call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %t2)
261 %t4 = load <256 x i32>, ptr %pc, align 64
262 %t5 = call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %t4)
263 %t6 = tail call x86_amx @llvm.x86.tdpbuud.internal(i16 %m, i16 %n, i16 %k, x86_amx %t5, x86_amx %t1, x86_amx %t3)
264 %t7 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %t6)
265 store <256 x i32> %t7, ptr %pc, align 64
; Input: vectors loaded from %pa, %pb and %pc are cast to x86_amx and passed to
; @llvm.x86.tdpbf16ps.internal(%m, %n, %k, c, a, b); the result is cast back to
; <256 x i32> and stored to %pc.
; Expected: the vector loads become tileloadd64 with shapes (%m, %k) for %pa,
; (%k udiv 4, %n) for %pb and (%m, %n) for %pc, and the result is written to
; %pc with tilestored64; each stride is the sext of that access's column value.
269 define dso_local void @__tile_dpbf16ps(i16 %m, i16 %n, i16 %k, ptr %pc, ptr %pa, ptr %pb) {
270 ; CHECK-LABEL: @__tile_dpbf16ps(
271 ; CHECK-NEXT: [[TMP1:%.*]] = udiv i16 [[K:%.*]], 4
272 ; CHECK-NEXT: [[TMP2:%.*]] = sext i16 [[K]] to i64
273 ; CHECK-NEXT: [[TMP3:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[M:%.*]], i16 [[K]], ptr [[PA:%.*]], i64 [[TMP2]])
274 ; CHECK-NEXT: [[TMP4:%.*]] = sext i16 [[N:%.*]] to i64
275 ; CHECK-NEXT: [[TMP5:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[TMP1]], i16 [[N]], ptr [[PB:%.*]], i64 [[TMP4]])
276 ; CHECK-NEXT: [[TMP6:%.*]] = sext i16 [[N]] to i64
277 ; CHECK-NEXT: [[TMP7:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[M]], i16 [[N]], ptr [[PC:%.*]], i64 [[TMP6]])
278 ; CHECK-NEXT: [[T6:%.*]] = tail call x86_amx @llvm.x86.tdpbf16ps.internal(i16 [[M]], i16 [[N]], i16 [[K]], x86_amx [[TMP7]], x86_amx [[TMP3]], x86_amx [[TMP5]])
279 ; CHECK-NEXT: [[TMP8:%.*]] = sext i16 [[N]] to i64
280 ; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 [[M]], i16 [[N]], ptr [[PC]], i64 [[TMP8]], x86_amx [[T6]])
281 ; CHECK-NEXT: ret void
283 %t0 = load <256 x i32>, ptr %pa, align 64
284 %t1 = call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %t0)
285 %t2 = load <256 x i32>, ptr %pb, align 64
286 %t3 = call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %t2)
287 %t4 = load <256 x i32>, ptr %pc, align 64
288 %t5 = call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %t4)
289 %t6 = tail call x86_amx @llvm.x86.tdpbf16ps.internal(i16 %m, i16 %n, i16 %k, x86_amx %t5, x86_amx %t1, x86_amx %t3)
290 %t7 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %t6)
291 store <256 x i32> %t7, ptr %pc, align 64
; Input: a vector loaded from field 2 of the byval struct %2 is cast to x86_amx
; and stored to %0 with tilestored64, using the i16 values from fields 0 and 1
; of the struct as the shape.
; Expected: the vector load + cast becomes a tileloadd64 from the field-2
; address with sext of the field-1 value as the stride; the tile store is kept.
; NOTE(review): %10 is used by the `ashr` below, but its defining instruction
; is not visible in this listing; the checks expect a `shl i64 ..., 32` there.
295 define dso_local void @__tile_stored(ptr %0, i64 %1, ptr nocapture readonly byval(%struct.__tile_str) align 64 %2) local_unnamed_addr {
296 ; CHECK-LABEL: @__tile_stored(
297 ; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr [[TMP2:%.*]], align 64
298 ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TILE_STR:%.*]], ptr [[TMP2]], i64 0, i32 1
299 ; CHECK-NEXT: [[TMP6:%.*]] = load i16, ptr [[TMP5]], align 2
300 ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TILE_STR]], ptr [[TMP2]], i64 0, i32 2
301 ; CHECK-NEXT: [[TMP8:%.*]] = sext i16 [[TMP6]] to i64
302 ; CHECK-NEXT: [[TMP9:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[TMP4]], i16 [[TMP6]], ptr [[TMP7]], i64 [[TMP8]])
303 ; CHECK-NEXT: [[TMP10:%.*]] = shl i64 [[TMP1:%.*]], 32
304 ; CHECK-NEXT: [[TMP11:%.*]] = ashr exact i64 [[TMP10]], 32
305 ; CHECK-NEXT: tail call void @llvm.x86.tilestored64.internal(i16 [[TMP4]], i16 [[TMP6]], ptr [[TMP0:%.*]], i64 [[TMP11]], x86_amx [[TMP9]])
306 ; CHECK-NEXT: ret void
308 %4 = load i16, ptr %2, align 64
309 %5 = getelementptr inbounds %struct.__tile_str, ptr %2, i64 0, i32 1
310 %6 = load i16, ptr %5, align 2
311 %7 = getelementptr inbounds %struct.__tile_str, ptr %2, i64 0, i32 2
312 %8 = load <256 x i32>, ptr %7, align 64
313 %9 = call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %8)
315 %11 = ashr exact i64 %10, 32
316 tail call void @llvm.x86.tilestored64.internal(i16 %4, i16 %6, ptr %0, i64 %11, x86_amx %9)
; Input: a tilezero result is cast to a vector in one block, merged with undef
; through a phi, cast to x86_amx and straight back to <256 x i32>, then stored
; to %buf.
; Expected: the tilezero result is stored to a stack slot (shape 8 x 32,
; stride 32) and reloaded as a vector for the phi; the phi value is stored to
; %buf directly, with neither of the two back-to-back casts in the checked
; output.
; NOTE(review): the blocks named by the branches and the phi (entry, l1, l2,
; exit) have no visible label lines in this listing.
320 define void @dead_code(ptr%buf, i1 %arg) {
321 ; CHECK-LABEL: @dead_code(
323 ; CHECK-NEXT: [[TMP0:%.*]] = alloca <256 x i32>, align 64
324 ; CHECK-NEXT: br i1 [[ARG:%.*]], label [[L1:%.*]], label [[L2:%.*]]
326 ; CHECK-NEXT: [[T1:%.*]] = call x86_amx @llvm.x86.tilezero.internal(i16 8, i16 32)
327 ; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 8, i16 32, ptr [[TMP0]], i64 32, x86_amx [[T1]])
328 ; CHECK-NEXT: [[TMP1:%.*]] = load <256 x i32>, ptr [[TMP0]], align 1024
329 ; CHECK-NEXT: br i1 [[ARG]], label [[L2]], label [[EXIT:%.*]]
331 ; CHECK-NEXT: [[T3:%.*]] = phi <256 x i32> [ undef, [[ENTRY:%.*]] ], [ [[TMP1]], [[L1]] ]
332 ; CHECK-NEXT: store <256 x i32> [[T3]], ptr [[BUF:%.*]], align 1024
333 ; CHECK-NEXT: br label [[EXIT]]
335 ; CHECK-NEXT: ret void
338 br i1 %arg, label %l1, label %l2
341 %t1 = call x86_amx @llvm.x86.tilezero.internal(i16 8, i16 32)
342 %t2 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %t1)
343 br i1 %arg, label %l2, label %exit
346 %t3 = phi <256 x i32> [ undef, %entry ], [ %t2, %l1 ]
347 %t4 = call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %t3)
348 %t5 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %t4)
349 store <256 x i32> %t5, ptr %buf
; AMX intrinsics called by the functions in this file: tile zero, tile load,
; the tdp* dot-product variants, and tile store.
356 declare x86_amx @llvm.x86.tilezero.internal(i16, i16)
357 declare x86_amx @llvm.x86.tileloadd64.internal(i16, i16, ptr, i64)
358 declare x86_amx @llvm.x86.tdpbssd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)
359 declare x86_amx @llvm.x86.tdpbsud.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)
360 declare x86_amx @llvm.x86.tdpbusd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)
361 declare x86_amx @llvm.x86.tdpbuud.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)
362 declare x86_amx @llvm.x86.tdpbf16ps.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)
363 declare void @llvm.x86.tilestored64.internal(i16, i16, ptr, i64, x86_amx)
; Vector <-> x86_amx cast intrinsics for the <256 x i32> and <225 x i32> types.
; NOTE(review): no call to @llvm.x86.cast.tile.to.vector.v225i32 is visible in
; this listing; only its declaration appears.
365 declare x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32>)
366 declare x86_amx @llvm.x86.cast.vector.to.tile.v225i32(<225 x i32>)
367 declare <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx)
368 declare <225 x i32> @llvm.x86.cast.tile.to.vector.v225i32(x86_amx)