; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -mtriple=x86_64-unknown-unknown -mattr=+amx-int8 -mattr=+avx512f -pre-amx-config -S | FileCheck %s

@buf = dso_local global [1024 x i8] zeroinitializer, align 16
@buf2 = dso_local global [1024 x i8] zeroinitializer, align 16
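; The -pre-amx-config pass is expected to materialize a 64-byte tile
; configuration on the stack before each AMX intrinsic that needs one:
; it zeroes a <16 x i32> config area, stores palette byte 1 at offset 0,
; writes each tile's rows (offsets 48..51) and i16 column fields (offsets
; 16..22), and then calls @llvm.x86.ldtilecfg.internal on that buffer.
; The CHECK lines below spell this sequence out for every tile operation.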
; Function Attrs: nounwind uwtable
define dso_local void @test_api(i32 %cond, i16 signext %row, i16 signext %col) local_unnamed_addr {
; CHECK: %{{[0-9]+}} = alloca <16 x i32>, align 4
; CHECK-NEXT: %{{[0-9]+}} = alloca <16 x i32>, align 4
; CHECK-NEXT: %{{[0-9]+}} = alloca <16 x i32>, align 4
; CHECK-NEXT: %{{[0-9]+}} = alloca <16 x i32>, align 4
; CHECK-NEXT: %{{[0-9]+}} = alloca <16 x i32>, align 4
; CHECK-NEXT: %{{[0-9]+}} = alloca <16 x i32>, align 4
; CHECK-NEXT: %{{[0-9]+}} = alloca <16 x i32>, align 4
; CHECK-NEXT: %{{[0-9]+}} = alloca <16 x i32>, align 4
; CHECK-NEXT: %{{[0-9]+}} = alloca <256 x i32>, align 1024
; CHECK-NEXT: %{{[0-9]+}} = bitcast <256 x i32>* %{{[0-9]+}} to i8*
; CHECK-NEXT: %{{[0-9]+}} = alloca <256 x i32>, align 1024
; CHECK-NEXT: %{{[0-9]+}} = bitcast <256 x i32>* %{{[0-9]+}} to i8*
; CHECK-NEXT: %{{[0-9]+}} = alloca <256 x i32>, align 1024
; CHECK-NEXT: %{{[0-9]+}} = bitcast <256 x i32>* %{{[0-9]+}} to i8*
; CHECK-NEXT: %{{[0-9]+}} = alloca <256 x i32>, align 1024
; CHECK-NEXT: %{{[0-9]+}} = bitcast <256 x i32>* %{{[0-9]+}} to i8*
; CHECK-NEXT: %tobool.not = icmp eq i32 %cond, 0
; CHECK-NEXT: br i1 %tobool.not, label %if.else, label %if.then
; CHECK: if.then: ; preds = %entry
; CHECK-NEXT: %{{[0-9]+}} = bitcast <16 x i32>* %{{[0-9]+}} to i8*
; CHECK-NEXT: store <16 x i32> zeroinitializer, <16 x i32>* %{{[0-9]+}}, align 4
; CHECK-NEXT: %{{[0-9]+}} = getelementptr i8, i8* %{{[0-9]+}}, i64 0
; CHECK-NEXT: store i8 1, i8* %{{[0-9]+}}, align 1
; CHECK-NEXT: %amx.tmm.0.shape.row{{.*}} = getelementptr i8, i8* %{{[0-9]+}}, i64 48
; CHECK-NEXT: %{{[0-9]+}} = getelementptr i8, i8* %{{[0-9]+}}, i64 16
; CHECK-NEXT: %amx.tmm.0.shape.col{{.*}} = bitcast i8* %{{[0-9]+}} to i16*
; CHECK-NEXT: %{{[0-9]+}} = trunc i16 %row to i8
; CHECK-NEXT: store i8 %{{[0-9]+}}, i8* %amx.tmm.0.shape.row{{.*}}, align 1
; CHECK-NEXT: store i16 8, i16* %amx.tmm.0.shape.col{{.*}}, align 2
; CHECK-NEXT: call void @llvm.x86.ldtilecfg.internal(i8* %{{[0-9]+}})
; CHECK-NEXT: %{{[0-9]+}} = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 8, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf, i64 0, i64 0), i64 32)
; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 %row, i16 8, i8* %{{[0-9]+}}, i64 64, x86_amx %{{[0-9]+}})
; CHECK-NEXT: %{{[0-9]+}} = bitcast <16 x i32>* %{{[0-9]+}} to i8*
; CHECK-NEXT: store <16 x i32> zeroinitializer, <16 x i32>* %{{[0-9]+}}, align 4
; CHECK-NEXT: %{{[0-9]+}} = getelementptr i8, i8* %{{[0-9]+}}, i64 0
; CHECK-NEXT: store i8 1, i8* %{{[0-9]+}}, align 1
; CHECK-NEXT: %amx.tmm.0.shape.row{{.*}} = getelementptr i8, i8* %{{[0-9]+}}, i64 48
; CHECK-NEXT: %{{[0-9]+}} = getelementptr i8, i8* %{{[0-9]+}}, i64 16
; CHECK-NEXT: %amx.tmm.0.shape.col{{.*}} = bitcast i8* %{{[0-9]+}} to i16*
; CHECK-NEXT: %{{[0-9]+}} = trunc i16 8 to i8
; CHECK-NEXT: store i8 %{{[0-9]+}}, i8* %amx.tmm.0.shape.row{{.*}}, align 1
; CHECK-NEXT: store i16 %col, i16* %amx.tmm.0.shape.col{{.*}}, align 2
; CHECK-NEXT: call void @llvm.x86.ldtilecfg.internal(i8* %{{[0-9]+}})
; CHECK-NEXT: %{{[0-9]+}} = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 %col, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf, i64 0, i64 0), i64 32)
; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 8, i16 %col, i8* %{{[0-9]+}}, i64 64, x86_amx %{{[0-9]+}})
; CHECK-NEXT: %{{[0-9]+}} = bitcast <16 x i32>* %{{[0-9]+}} to i8*
; CHECK-NEXT: store <16 x i32> zeroinitializer, <16 x i32>* %{{[0-9]+}}, align 4
; CHECK-NEXT: %{{[0-9]+}} = getelementptr i8, i8* %{{[0-9]+}}, i64 0
; CHECK-NEXT: store i8 1, i8* %{{[0-9]+}}, align 1
; CHECK-NEXT: %amx.tmm.0.shape.row{{.*}} = getelementptr i8, i8* %{{[0-9]+}}, i64 48
; CHECK-NEXT: %{{[0-9]+}} = getelementptr i8, i8* %{{[0-9]+}}, i64 16
; CHECK-NEXT: %amx.tmm.0.shape.col{{.*}} = bitcast i8* %{{[0-9]+}} to i16*
; CHECK-NEXT: %{{[0-9]+}} = trunc i16 %row to i8
; CHECK-NEXT: store i8 %{{[0-9]+}}, i8* %amx.tmm.0.shape.row{{.*}}, align 1
; CHECK-NEXT: store i16 %col, i16* %amx.tmm.0.shape.col{{.*}}, align 2
; CHECK-NEXT: call void @llvm.x86.ldtilecfg.internal(i8* %{{[0-9]+}})
; CHECK-NEXT: %{{[0-9]+}} = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 %col, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf, i64 0, i64 0), i64 32)
; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 %row, i16 %col, i8* %{{[0-9]+}}, i64 64, x86_amx %{{[0-9]+}})
; CHECK-NEXT: br label %if.end
; CHECK: if.else: ; preds = %entry
; CHECK-NEXT: %{{[0-9]+}} = bitcast <16 x i32>* %{{[0-9]+}} to i8*
; CHECK-NEXT: store <16 x i32> zeroinitializer, <16 x i32>* %{{[0-9]+}}, align 4
; CHECK-NEXT: %{{[0-9]+}} = getelementptr i8, i8* %{{[0-9]+}}, i64 0
; CHECK-NEXT: store i8 1, i8* %{{[0-9]+}}, align 1
; CHECK-NEXT: %amx.tmm.0.shape.row{{.*}} = getelementptr i8, i8* %{{[0-9]+}}, i64 48
; CHECK-NEXT: %{{[0-9]+}} = getelementptr i8, i8* %{{[0-9]+}}, i64 16
; CHECK-NEXT: %amx.tmm.0.shape.col{{.*}} = bitcast i8* %{{[0-9]+}} to i16*
; CHECK-NEXT: %{{[0-9]+}} = trunc i16 %row to i8
; CHECK-NEXT: store i8 %{{[0-9]+}}, i8* %amx.tmm.0.shape.row{{.*}}, align 1
; CHECK-NEXT: store i16 8, i16* %amx.tmm.0.shape.col{{.*}}, align 2
; CHECK-NEXT: call void @llvm.x86.ldtilecfg.internal(i8* %{{[0-9]+}})
; CHECK-NEXT: %{{[0-9]+}} = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 8, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf2, i64 0, i64 0), i64 32)
; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 %row, i16 8, i8* %{{[0-9]+}}, i64 64, x86_amx %{{[0-9]+}})
; CHECK-NEXT: %{{[0-9]+}} = bitcast <16 x i32>* %{{[0-9]+}} to i8*
; CHECK-NEXT: store <16 x i32> zeroinitializer, <16 x i32>* %{{[0-9]+}}, align 4
; CHECK-NEXT: %{{[0-9]+}} = getelementptr i8, i8* %{{[0-9]+}}, i64 0
; CHECK-NEXT: store i8 1, i8* %{{[0-9]+}}, align 1
; CHECK-NEXT: %amx.tmm.0.shape.row{{.*}} = getelementptr i8, i8* %{{[0-9]+}}, i64 48
; CHECK-NEXT: %{{[0-9]+}} = getelementptr i8, i8* %{{[0-9]+}}, i64 16
; CHECK-NEXT: %amx.tmm.0.shape.col{{.*}} = bitcast i8* %{{[0-9]+}} to i16*
; CHECK-NEXT: %{{[0-9]+}} = trunc i16 8 to i8
; CHECK-NEXT: store i8 %{{[0-9]+}}, i8* %amx.tmm.0.shape.row{{.*}}, align 1
; CHECK-NEXT: store i16 %col, i16* %amx.tmm.0.shape.col{{.*}}, align 2
; CHECK-NEXT: call void @llvm.x86.ldtilecfg.internal(i8* %{{[0-9]+}})
; CHECK-NEXT: %{{[0-9]+}} = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 %col, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf2, i64 0, i64 0), i64 32)
; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 8, i16 %col, i8* %{{[0-9]+}}, i64 64, x86_amx %{{[0-9]+}})
; CHECK-NEXT: %{{[0-9]+}} = bitcast <16 x i32>* %{{[0-9]+}} to i8*
; CHECK-NEXT: store <16 x i32> zeroinitializer, <16 x i32>* %{{[0-9]+}}, align 4
; CHECK-NEXT: %{{[0-9]+}} = getelementptr i8, i8* %{{[0-9]+}}, i64 0
; CHECK-NEXT: store i8 1, i8* %{{[0-9]+}}, align 1
; CHECK-NEXT: %amx.tmm.0.shape.row{{.*}} = getelementptr i8, i8* %{{[0-9]+}}, i64 48
; CHECK-NEXT: %{{[0-9]+}} = getelementptr i8, i8* %{{[0-9]+}}, i64 16
; CHECK-NEXT: %amx.tmm.0.shape.col{{.*}} = bitcast i8* %{{[0-9]+}} to i16*
; CHECK-NEXT: %{{[0-9]+}} = trunc i16 %row to i8
; CHECK-NEXT: store i8 %{{[0-9]+}}, i8* %amx.tmm.0.shape.row{{.*}}, align 1
; CHECK-NEXT: store i16 %col, i16* %amx.tmm.0.shape.col{{.*}}, align 2
; CHECK-NEXT: call void @llvm.x86.ldtilecfg.internal(i8* %{{[0-9]+}})
; CHECK-NEXT: %{{[0-9]+}} = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 %col, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf2, i64 0, i64 0), i64 32)
; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 %row, i16 %col, i8* %{{[0-9]+}}, i64 64, x86_amx %{{[0-9]+}})
; CHECK-NEXT: br label %if.end
; CHECK: if.end: ; preds = %if.else, %if.then
; CHECK-NEXT: %{{[0-9]+}} = bitcast <16 x i32>* %{{[0-9]+}} to i8*
; CHECK-NEXT: store <16 x i32> zeroinitializer, <16 x i32>* %{{[0-9]+}}, align 4
; CHECK-NEXT: %{{[0-9]+}} = getelementptr i8, i8* %{{[0-9]+}}, i64 0
; CHECK-NEXT: store i8 1, i8* %{{[0-9]+}}, align 1
; CHECK-NEXT: %amx.tmm.0.shape.row{{.*}} = getelementptr i8, i8* %{{[0-9]+}}, i64 48
; CHECK-NEXT: %{{[0-9]+}} = getelementptr i8, i8* %{{[0-9]+}}, i64 16
; CHECK-NEXT: %amx.tmm.0.shape.col{{.*}} = bitcast i8* %{{[0-9]+}} to i16*
; CHECK-NEXT: %{{[0-9]+}} = trunc i16 %row to i8
; CHECK-NEXT: store i8 %{{[0-9]+}}, i8* %amx.tmm.0.shape.row{{.*}}, align 1
; CHECK-NEXT: store i16 %col, i16* %amx.tmm.0.shape.col{{.*}}, align 2
; CHECK-NEXT: %amx.tmm.1.shape.row{{.*}} = getelementptr i8, i8* %{{[0-9]+}}, i64 49
; CHECK-NEXT: %{{[0-9]+}} = getelementptr i8, i8* %{{[0-9]+}}, i64 18
; CHECK-NEXT: %amx.tmm.1.shape.col{{.*}} = bitcast i8* %{{[0-9]+}} to i16*
; CHECK-NEXT: %{{[0-9]+}} = trunc i16 %row to i8
; CHECK-NEXT: store i8 %{{[0-9]+}}, i8* %amx.tmm.1.shape.row{{.*}}, align 1
; CHECK-NEXT: store i16 8, i16* %amx.tmm.1.shape.col{{.*}}, align 2
; CHECK-NEXT: %amx.tmm.2.shape.row{{.*}} = getelementptr i8, i8* %{{[0-9]+}}, i64 50
; CHECK-NEXT: %{{[0-9]+}} = getelementptr i8, i8* %{{[0-9]+}}, i64 20
; CHECK-NEXT: %amx.tmm.2.shape.col{{.*}} = bitcast i8* %{{[0-9]+}} to i16*
; CHECK-NEXT: %{{[0-9]+}} = trunc i16 8 to i8
; CHECK-NEXT: store i8 %{{[0-9]+}}, i8* %amx.tmm.2.shape.row{{.*}}, align 1
; CHECK-NEXT: store i16 %col, i16* %amx.tmm.2.shape.col{{.*}}, align 2
; CHECK-NEXT: %amx.tmm.3.shape.row{{.*}} = getelementptr i8, i8* %{{[0-9]+}}, i64 51
; CHECK-NEXT: %{{[0-9]+}} = getelementptr i8, i8* %{{[0-9]+}}, i64 22
; CHECK-NEXT: %amx.tmm.3.shape.col{{.*}} = bitcast i8* %{{[0-9]+}} to i16*
; CHECK-NEXT: %{{[0-9]+}} = trunc i16 %row to i8
; CHECK-NEXT: store i8 %{{[0-9]+}}, i8* %amx.tmm.3.shape.row{{.*}}, align 1
; CHECK-NEXT: store i16 %col, i16* %amx.tmm.3.shape.col{{.*}}, align 2
; CHECK-NEXT: call void @llvm.x86.ldtilecfg.internal(i8* %{{[0-9]+}})
; CHECK-NEXT: %{{[0-9]+}} = call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 8, i8* %{{[0-9]+}}, i64 64)
; CHECK-NEXT: %{{[0-9]+}} = call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 %col, i8* %{{[0-9]+}}, i64 64)
; CHECK-NEXT: %{{[0-9]+}} = call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 %col, i8* %{{[0-9]+}}, i64 64)
; CHECK-NEXT: %{{[0-9]+}} = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 %row, i16 %col, i16 8, x86_amx %{{[0-9]+}}, x86_amx %{{[0-9]+}}, x86_amx %{{[0-9]+}})
; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 %row, i16 %col, i8* %{{[0-9]+}}, i64 64, x86_amx %{{[0-9]+}})
; CHECK-NEXT: %{{[0-9]+}} = bitcast <16 x i32>* %{{[0-9]+}} to i8*
; CHECK-NEXT: store <16 x i32> zeroinitializer, <16 x i32>* %{{[0-9]+}}, align 4
; CHECK-NEXT: %{{[0-9]+}} = getelementptr i8, i8* %{{[0-9]+}}, i64 0
; CHECK-NEXT: store i8 1, i8* %{{[0-9]+}}, align 1
; CHECK-NEXT: %amx.tmm.0.shape.row{{.*}} = getelementptr i8, i8* %{{[0-9]+}}, i64 48
; CHECK-NEXT: %{{[0-9]+}} = getelementptr i8, i8* %{{[0-9]+}}, i64 16
; CHECK-NEXT: %amx.tmm.0.shape.col{{.*}} = bitcast i8* %{{[0-9]+}} to i16*
; CHECK-NEXT: %{{[0-9]+}} = trunc i16 %row to i8
; CHECK-NEXT: store i8 %{{[0-9]+}}, i8* %amx.tmm.0.shape.row{{.*}}, align 1
; CHECK-NEXT: store i16 %col, i16* %amx.tmm.0.shape.col{{.*}}, align 2
; CHECK-NEXT: call void @llvm.x86.ldtilecfg.internal(i8* %{{[0-9]+}})
; CHECK-NEXT: %{{[0-9]+}} = call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 %col, i8* %{{[0-9]+}}, i64 64)
; CHECK-NEXT: tail call void @llvm.x86.tilestored64.internal(i16 %row, i16 %col, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf, i64 0, i64 0), i64 32, x86_amx %{{[0-9]+}})
; CHECK-NEXT: ret void
entry:
  %0 = alloca <256 x i32>, align 1024
  %1 = bitcast <256 x i32>* %0 to i8*
  %2 = alloca <256 x i32>, align 1024
  %3 = bitcast <256 x i32>* %2 to i8*
  %4 = alloca <256 x i32>, align 1024
  %5 = bitcast <256 x i32>* %4 to i8*
  %6 = alloca <256 x i32>, align 1024
  %7 = bitcast <256 x i32>* %6 to i8*
  %tobool.not = icmp eq i32 %cond, 0
  br i1 %tobool.not, label %if.else, label %if.then
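; The CHECK lines above expect each tileloadd64/tilestored64 pair in
; if.then and if.else to be preceded by its own single-tile (tmm0)
; shape configuration and ldtilecfg call.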
if.then: ; preds = %entry
  %8 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 8, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf, i64 0, i64 0), i64 32)
  call void @llvm.x86.tilestored64.internal(i16 %row, i16 8, i8* %5, i64 64, x86_amx %8)
  %9 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 %col, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf, i64 0, i64 0), i64 32)
  call void @llvm.x86.tilestored64.internal(i16 8, i16 %col, i8* %3, i64 64, x86_amx %9)
  %10 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 %col, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf, i64 0, i64 0), i64 32)
  call void @llvm.x86.tilestored64.internal(i16 %row, i16 %col, i8* %1, i64 64, x86_amx %10)
  br label %if.end
if.else: ; preds = %entry
  %11 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 8, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf2, i64 0, i64 0), i64 32)
  call void @llvm.x86.tilestored64.internal(i16 %row, i16 8, i8* %5, i64 64, x86_amx %11)
  %12 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 %col, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf2, i64 0, i64 0), i64 32)
  call void @llvm.x86.tilestored64.internal(i16 8, i16 %col, i8* %3, i64 64, x86_amx %12)
  %13 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 %col, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf2, i64 0, i64 0), i64 32)
  call void @llvm.x86.tilestored64.internal(i16 %row, i16 %col, i8* %1, i64 64, x86_amx %13)
  br label %if.end
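; The tdpbssd in if.end uses four tiles at once, so the CHECK lines above
; expect all four shapes (tmm0-tmm3) to be written into one configuration
; followed by a single ldtilecfg before the reloads and the dot-product;
; the final reload that is stored back to @buf gets its own configuration.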
if.end: ; preds = %if.else, %if.then
  %14 = call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 8, i8* %5, i64 64)
  %15 = call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 %col, i8* %3, i64 64)
  %16 = call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 %col, i8* %1, i64 64)
  %17 = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 %row, i16 %col, i16 8, x86_amx %16, x86_amx %14, x86_amx %15)
  call void @llvm.x86.tilestored64.internal(i16 %row, i16 %col, i8* %7, i64 64, x86_amx %17)
  %18 = call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 %col, i8* %7, i64 64)
  tail call void @llvm.x86.tilestored64.internal(i16 %row, i16 %col, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf, i64 0, i64 0), i64 32, x86_amx %18)
  ret void
}

; Function Attrs: nounwind
declare x86_amx @llvm.x86.tileloadd64.internal(i16, i16, i8*, i64)

; Function Attrs: nounwind
declare x86_amx @llvm.x86.tdpbssd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)

; Function Attrs: nounwind
declare void @llvm.x86.tilestored64.internal(i16, i16, i8*, i64, x86_amx)