1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+amx-bf16,+amx-fp16,+amx-complex,+amx-transpose | FileCheck %s
; test_amx: exercises the immediate-operand forms of the AMX transpose and
; complex intrinsics, where every tile is named by a literal i8 index.
;
; What the expected output below pins down:
;   * %addr1 (4th parameter) and %stride (2nd parameter) show up as the
;     (%rcx,%rsi) base/index pair of every t2rpntlvwz* load.
;   * For the t2rpntlvwz* loads, index 1 is printed as %tmm0 and index 2 as
;     %tmm2. NOTE(review): presumably these instructions name an even-aligned
;     tile pair -- confirm against the ISA reference.
;   * For the remaining intrinsics the tile operands are printed in the
;     reverse order of the intrinsic arguments, e.g. (1, 2, 3) is printed as
;     %tmm3, %tmm2, %tmm1.
; %rv32, %rvalue and %xmm are not referenced by any call visible here.
4 define void @test_amx(i32 %rv32, i64 %stride, i64 %rvalue, i8* %addr1, <4 x float> %xmm) #0 {
5 ; CHECK-LABEL: test_amx:
7 ; CHECK-NEXT: t2rpntlvwz0 (%rcx,%rsi), %tmm0
8 ; CHECK-NEXT: t2rpntlvwz0t1 (%rcx,%rsi), %tmm2
9 ; CHECK-NEXT: t2rpntlvwz1 (%rcx,%rsi), %tmm0
10 ; CHECK-NEXT: t2rpntlvwz1t1 (%rcx,%rsi), %tmm2
11 ; CHECK-NEXT: ttransposed %tmm3, %tmm1
12 ; CHECK-NEXT: ttdpbf16ps %tmm3, %tmm2, %tmm1
13 ; CHECK-NEXT: ttdpfp16ps %tmm6, %tmm5, %tmm4
14 ; CHECK-NEXT: ttcmmimfp16ps %tmm3, %tmm2, %tmm1
15 ; CHECK-NEXT: ttcmmrlfp16ps %tmm3, %tmm2, %tmm1
16 ; CHECK-NEXT: tconjtcmmimfp16ps %tmm3, %tmm2, %tmm1
17 ; CHECK-NEXT: tconjtfp16 %tmm2, %tmm1
; Tile loads from %addr1 using %stride.
19 call void @llvm.x86.t2rpntlvwz0(i8 1, i8* %addr1, i64 %stride)
20 call void @llvm.x86.t2rpntlvwz0t1(i8 2, i8* %addr1, i64 %stride)
21 call void @llvm.x86.t2rpntlvwz1(i8 1, i8* %addr1, i64 %stride)
22 call void @llvm.x86.t2rpntlvwz1t1(i8 2, i8* %addr1, i64 %stride)
; Tile-only operations: no memory operand, just tile indexes.
23 call void @llvm.x86.ttransposed(i8 1, i8 3)
24 call void @llvm.x86.ttdpbf16ps(i8 1, i8 2, i8 3)
25 call void @llvm.x86.ttdpfp16ps(i8 4, i8 5, i8 6)
26 call void @llvm.x86.ttcmmimfp16ps(i8 1, i8 2, i8 3)
27 call void @llvm.x86.ttcmmrlfp16ps(i8 1, i8 2, i8 3)
28 call void @llvm.x86.tconjtcmmimfp16ps(i8 1, i8 2, i8 3)
29 call void @llvm.x86.tconjtfp16(i8 1, i8 2)
; Immediate-operand intrinsic forms called by @test_amx. Every tile is passed
; as an i8 index and nothing is returned; the four load forms additionally
; take a base pointer and an i64 stride.
33 declare void @llvm.x86.t2rpntlvwz0(i8 %tile1, i8* %addr1, i64 %stride)
34 declare void @llvm.x86.t2rpntlvwz0t1(i8 %tile1, i8* %addr1, i64 %stride)
35 declare void @llvm.x86.t2rpntlvwz1(i8 %tile1, i8* %addr1, i64 %stride)
36 declare void @llvm.x86.t2rpntlvwz1t1(i8 %tile1, i8* %addr1, i64 %stride)
37 declare void @llvm.x86.ttransposed(i8 %tile0, i8 %tile1)
38 declare void @llvm.x86.ttdpbf16ps(i8 %tile0, i8 %tile1, i8 %tile2)
39 declare void @llvm.x86.ttdpfp16ps(i8 %tile0, i8 %tile1, i8 %tile2)
40 declare void @llvm.x86.ttcmmimfp16ps(i8 %A, i8 %B, i8 %C)
41 declare void @llvm.x86.ttcmmrlfp16ps(i8 %A, i8 %B, i8 %C)
42 declare void @llvm.x86.tconjtcmmimfp16ps(i8 %A, i8 %B, i8 %C)
43 declare void @llvm.x86.tconjtfp16(i8 %A, i8 %B)
; test_amx2: exercises the x86_amx-typed (".internal") forms of the
; accumulate and conjugate intrinsics in a single dependency chain.
;
; The IR loads tile %a from %base, creates two zero tiles (%b, %c), and feeds
; %c through ttdpbf16ps -> ttdpfp16ps -> ttcmmimfp16ps -> ttcmmrlfp16ps
; (%c1..%c4). %c4 is then used twice: as the accumulator operand of
; tconjtcmmimfp16ps (%c5, which feeds tconjtfp16) and as the value stored to
; %pointer.
;
; The expected output reflects that double use: before the tile instructions
; it zeroes a stack area through %zmm0, fills in byte/word fields and passes
; it to ldtilecfg; later %tmm2 is spilled, reloaded into %tmm3 for the
; tconjtcmmimfp16ps/tconjtfp16 pair, and %tmm2 is what the final tilestored
; writes.
;
; NOTE(review): %c6 has no visible use and the store takes %c4 rather than
; %c6; the spill/reload expectations depend on that, so it looks deliberate --
; confirm before changing.
; NOTE(review): @llvm.x86.tilezero.internal is called here but no declaration
; for it is visible in this part of the file -- confirm it is declared
; elsewhere or implicitly by the IR parser.
45 define void @test_amx2(i8* %pointer, i8* %base, i64 %stride) #0 {
46 ; CHECK-LABEL: test_amx2:
48 ; CHECK-NEXT: pushq %rbp
49 ; CHECK-NEXT: subq $2928, %rsp # imm = 0xB70
50 ; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0
51 ; CHECK-NEXT: vmovups %zmm0, {{[0-9]+}}(%rsp)
52 ; CHECK-NEXT: movb $1, {{[0-9]+}}(%rsp)
53 ; CHECK-NEXT: movb $8, {{[0-9]+}}(%rsp)
54 ; CHECK-NEXT: movw $8, {{[0-9]+}}(%rsp)
55 ; CHECK-NEXT: movb $8, {{[0-9]+}}(%rsp)
56 ; CHECK-NEXT: movw $8, {{[0-9]+}}(%rsp)
57 ; CHECK-NEXT: movb $8, {{[0-9]+}}(%rsp)
58 ; CHECK-NEXT: movw $8, {{[0-9]+}}(%rsp)
59 ; CHECK-NEXT: movb $8, {{[0-9]+}}(%rsp)
60 ; CHECK-NEXT: movw $8, {{[0-9]+}}(%rsp)
61 ; CHECK-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
62 ; CHECK-NEXT: movw $8, %ax
63 ; CHECK-NEXT: tileloadd (%rsi,%rdx), %tmm0
64 ; CHECK-NEXT: tilezero %tmm1
65 ; CHECK-NEXT: tilezero %tmm2
66 ; CHECK-NEXT: ttdpbf16ps %tmm1, %tmm0, %tmm2
67 ; CHECK-NEXT: ttdpfp16ps %tmm1, %tmm0, %tmm2
68 ; CHECK-NEXT: ttcmmimfp16ps %tmm1, %tmm0, %tmm2
69 ; CHECK-NEXT: ttcmmrlfp16ps %tmm1, %tmm0, %tmm2
70 ; CHECK-NEXT: movabsq $64, %rbp
71 ; CHECK-NEXT: tilestored %tmm2, 896(%rsp,%rbp) # 1024-byte Folded Spill
72 ; CHECK-NEXT: tileloadd 896(%rsp,%rbp), %tmm3 # 1024-byte Folded Reload
73 ; CHECK-NEXT: tconjtcmmimfp16ps %tmm1, %tmm0, %tmm3
74 ; CHECK-NEXT: tconjtfp16 %tmm3, %tmm0
75 ; CHECK-NEXT: tilestored %tmm2, (%rdi,%rdx)
76 ; CHECK-NEXT: addq $2928, %rsp # imm = 0xB70
77 ; CHECK-NEXT: popq %rbp
78 ; CHECK-NEXT: tilerelease
79 ; CHECK-NEXT: vzeroupper
; Inputs: one tile loaded from memory and two zero-initialized tiles.
82 %a = call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 8, i8* %base, i64 %stride)
83 %b = call x86_amx @llvm.x86.tilezero.internal(i16 8, i16 8)
84 %c = call x86_amx @llvm.x86.tilezero.internal(i16 8, i16 8)
; Accumulator chain: each result is the first tile operand of the next call.
85 %c1 = call x86_amx @llvm.x86.ttdpbf16ps.internal(i16 8, i16 8, i16 8, x86_amx %c, x86_amx %a, x86_amx %b)
86 %c2 = call x86_amx @llvm.x86.ttdpfp16ps.internal(i16 8, i16 8, i16 8, x86_amx %c1, x86_amx %a, x86_amx %b)
87 %c3 = call x86_amx @llvm.x86.ttcmmimfp16ps.internal(i16 8, i16 8, i16 8, x86_amx %c2, x86_amx %a, x86_amx %b)
88 %c4 = call x86_amx @llvm.x86.ttcmmrlfp16ps.internal(i16 8, i16 8, i16 8, x86_amx %c3, x86_amx %a, x86_amx %b)
89 %c5 = call x86_amx @llvm.x86.tconjtcmmimfp16ps.internal(i16 8, i16 8, i16 8, x86_amx %c4, x86_amx %a, x86_amx %b)
90 %c6 = call x86_amx @llvm.x86.tconjtfp16.internal(i16 8, i16 8, x86_amx %c5)
; Stores %c4 (not %c5/%c6), which keeps %c4 live across the two calls above.
92 call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %pointer, i64 %stride, x86_amx %c4)
; test_amx3: exercises the x86_amx-typed forms of the four t2rpntlvwz* loads
; followed by ttransposed.
;
; Each load is called with i16 arguments (8, 8, 0) and returns a
; { x86_amx, x86_amx } pair. Only element 0 of the last pair (%4) is consumed:
; it is transposed and the result is stored to %pointer. %1, %2 and %3 have no
; visible uses, yet the expected output still contains all four load
; instructions, each targeting %tmm4, followed by ttransposed into %tmm0.
;
; NOTE(review): the movb $0 / movw $0 configuration writes presumably
; correspond to the second tile of the pair, given that the third i16
; argument is 0 -- confirm.
96 define void @test_amx3(i8* %pointer, i8* %base, i64 %stride) #0 {
97 ; CHECK-LABEL: test_amx3:
99 ; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0
100 ; CHECK-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp)
101 ; CHECK-NEXT: movb $1, -{{[0-9]+}}(%rsp)
102 ; CHECK-NEXT: movb $8, -{{[0-9]+}}(%rsp)
103 ; CHECK-NEXT: movw $8, -{{[0-9]+}}(%rsp)
104 ; CHECK-NEXT: movb $8, -{{[0-9]+}}(%rsp)
105 ; CHECK-NEXT: movw $8, -{{[0-9]+}}(%rsp)
106 ; CHECK-NEXT: movb $0, -{{[0-9]+}}(%rsp)
107 ; CHECK-NEXT: movw $0, -{{[0-9]+}}(%rsp)
108 ; CHECK-NEXT: ldtilecfg -{{[0-9]+}}(%rsp)
109 ; CHECK-NEXT: xorl %eax, %eax
110 ; CHECK-NEXT: movw $8, %cx
111 ; CHECK-NEXT: t2rpntlvwz0 (%rsi,%rdx), %tmm4
112 ; CHECK-NEXT: t2rpntlvwz0t1 (%rsi,%rdx), %tmm4
113 ; CHECK-NEXT: t2rpntlvwz1 (%rsi,%rdx), %tmm4
114 ; CHECK-NEXT: t2rpntlvwz1t1 (%rsi,%rdx), %tmm4
115 ; CHECK-NEXT: ttransposed %tmm4, %tmm0
116 ; CHECK-NEXT: tilestored %tmm0, (%rdi,%rdx)
117 ; CHECK-NEXT: tilerelease
118 ; CHECK-NEXT: vzeroupper
; Four pair loads from the same address; only the last result is used below.
120 %1 = call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal(i16 8, i16 8, i16 0, i8* %base, i64 %stride)
121 %2 = call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0t1.internal(i16 8, i16 8, i16 0, i8* %base, i64 %stride)
122 %3 = call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz1.internal(i16 8, i16 8, i16 0, i8* %base, i64 %stride)
123 %4 = call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz1t1.internal(i16 8, i16 8, i16 0, i8* %base, i64 %stride)
; Take the first tile of the last pair, transpose it, and store the result.
124 %5 = extractvalue { x86_amx, x86_amx } %4, 0
125 %6 = call x86_amx @llvm.x86.ttransposed.internal(i16 8, i16 8, x86_amx %5)
126 call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %pointer, i64 %stride, x86_amx %6)
; test_amx_spill: checks spilling and reloading of the tile pairs produced by
; the t2rpntlvwz* loads.
;
; The IR loads one single tile (%a, which has no visible use) and five tile
; pairs (%b1..%b5), then extracts both halves of every pair and stores all ten
; tiles to %base.
;
; In the expected output:
;   * the first pair (%b1) is loaded into %tmm4/%tmm5 and stored from there;
;   * the pairs from %b2, %b3 and %b4 are each loaded into %tmm6/%tmm7,
;     spilled as two 1024-byte stores, and later reloaded into %tmm4/%tmm5
;     right before their tilestored instructions;
;   * the last pair (%b5) is loaded into %tmm6/%tmm7 and stored from there.
; The spill slots are addressed as offset(%rsp,%rcx) with %rcx set to 64.
130 define void @test_amx_spill(i8* %pointer, i8* %base, i64 %stride) #0 {
131 ; CHECK-LABEL: test_amx_spill:
133 ; CHECK-NEXT: subq $6088, %rsp # imm = 0x17C8
134 ; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0
135 ; CHECK-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp)
136 ; CHECK-NEXT: movb $1, -{{[0-9]+}}(%rsp)
137 ; CHECK-NEXT: movb $8, -{{[0-9]+}}(%rsp)
138 ; CHECK-NEXT: movw $8, -{{[0-9]+}}(%rsp)
139 ; CHECK-NEXT: movb $8, -{{[0-9]+}}(%rsp)
140 ; CHECK-NEXT: movw $8, -{{[0-9]+}}(%rsp)
141 ; CHECK-NEXT: movb $8, -{{[0-9]+}}(%rsp)
142 ; CHECK-NEXT: movw $8, -{{[0-9]+}}(%rsp)
143 ; CHECK-NEXT: movb $8, -{{[0-9]+}}(%rsp)
144 ; CHECK-NEXT: movw $8, -{{[0-9]+}}(%rsp)
145 ; CHECK-NEXT: movb $8, -{{[0-9]+}}(%rsp)
146 ; CHECK-NEXT: movw $8, -{{[0-9]+}}(%rsp)
147 ; CHECK-NEXT: ldtilecfg -{{[0-9]+}}(%rsp)
148 ; CHECK-NEXT: movw $8, %ax
149 ; CHECK-NEXT: tileloadd (%rsi,%rdx), %tmm0
150 ; CHECK-NEXT: t2rpntlvwz0 (%rsi,%rdx), %tmm4
151 ; CHECK-NEXT: t2rpntlvwz0t1 (%rsi,%rdx), %tmm6
152 ; CHECK-NEXT: movabsq $64, %rcx
153 ; CHECK-NEXT: tilestored %tmm6, 4032(%rsp,%rcx) # 1024-byte Folded Spill
154 ; CHECK-NEXT: tilestored %tmm7, 5056(%rsp,%rcx) # 1024-byte Folded Spill
155 ; CHECK-NEXT: t2rpntlvwz1 (%rsi,%rdx), %tmm6
156 ; CHECK-NEXT: tilestored %tmm6, 1984(%rsp,%rcx) # 1024-byte Folded Spill
157 ; CHECK-NEXT: tilestored %tmm7, 3008(%rsp,%rcx) # 1024-byte Folded Spill
158 ; CHECK-NEXT: t2rpntlvwz1t1 (%rsi,%rdx), %tmm6
159 ; CHECK-NEXT: tilestored %tmm6, -64(%rsp,%rcx) # 1024-byte Folded Spill
160 ; CHECK-NEXT: tilestored %tmm7, 960(%rsp,%rcx) # 1024-byte Folded Spill
161 ; CHECK-NEXT: t2rpntlvwz0 (%rsi,%rdx), %tmm6
162 ; CHECK-NEXT: tilestored %tmm4, (%rsi,%rdx)
163 ; CHECK-NEXT: tilestored %tmm5, (%rsi,%rdx)
164 ; CHECK-NEXT: tileloadd 4032(%rsp,%rcx), %tmm4 # 1024-byte Folded Reload
165 ; CHECK-NEXT: tileloadd 5056(%rsp,%rcx), %tmm5 # 1024-byte Folded Reload
166 ; CHECK-NEXT: tilestored %tmm4, (%rsi,%rdx)
167 ; CHECK-NEXT: tilestored %tmm5, (%rsi,%rdx)
168 ; CHECK-NEXT: tileloadd 1984(%rsp,%rcx), %tmm4 # 1024-byte Folded Reload
169 ; CHECK-NEXT: tileloadd 3008(%rsp,%rcx), %tmm5 # 1024-byte Folded Reload
170 ; CHECK-NEXT: tilestored %tmm4, (%rsi,%rdx)
171 ; CHECK-NEXT: tilestored %tmm5, (%rsi,%rdx)
172 ; CHECK-NEXT: tileloadd -64(%rsp,%rcx), %tmm4 # 1024-byte Folded Reload
173 ; CHECK-NEXT: tileloadd 960(%rsp,%rcx), %tmm5 # 1024-byte Folded Reload
174 ; CHECK-NEXT: tilestored %tmm4, (%rsi,%rdx)
175 ; CHECK-NEXT: tilestored %tmm5, (%rsi,%rdx)
176 ; CHECK-NEXT: tilestored %tmm6, (%rsi,%rdx)
177 ; CHECK-NEXT: tilestored %tmm7, (%rsi,%rdx)
178 ; CHECK-NEXT: addq $6088, %rsp # imm = 0x17C8
179 ; CHECK-NEXT: tilerelease
180 ; CHECK-NEXT: vzeroupper
; One single-tile load plus five pair loads, all issued before any store so
; that every result is live at the same time.
182 %a = call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 8, i8* %base, i64 %stride)
183 %b1 = call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal(i16 8, i16 8, i16 8, i8* %base, i64 %stride)
184 %b2 = call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0t1.internal(i16 8, i16 8, i16 8, i8* %base, i64 %stride)
185 %b3 = call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz1.internal(i16 8, i16 8, i16 8, i8* %base, i64 %stride)
186 %b4 = call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz1t1.internal(i16 8, i16 8, i16 8, i8* %base, i64 %stride)
187 %b5 = call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal(i16 8, i16 8, i16 8, i8* %base, i64 %stride)
; Split every pair into its two tiles (%eN1 = element 0, %eN2 = element 1).
188 %e11 = extractvalue { x86_amx, x86_amx } %b1, 0
189 %e12 = extractvalue { x86_amx, x86_amx } %b1, 1
190 %e21 = extractvalue { x86_amx, x86_amx } %b2, 0
191 %e22 = extractvalue { x86_amx, x86_amx } %b2, 1
192 %e31 = extractvalue { x86_amx, x86_amx } %b3, 0
193 %e32 = extractvalue { x86_amx, x86_amx } %b3, 1
194 %e41 = extractvalue { x86_amx, x86_amx } %b4, 0
195 %e42 = extractvalue { x86_amx, x86_amx } %b4, 1
196 %e51 = extractvalue { x86_amx, x86_amx } %b5, 0
197 %e52 = extractvalue { x86_amx, x86_amx } %b5, 1
; Store all ten tiles to %base, in pair order.
198 call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %base, i64 %stride, x86_amx %e11)
199 call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %base, i64 %stride, x86_amx %e12)
200 call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %base, i64 %stride, x86_amx %e21)
201 call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %base, i64 %stride, x86_amx %e22)
202 call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %base, i64 %stride, x86_amx %e31)
203 call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %base, i64 %stride, x86_amx %e32)
204 call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %base, i64 %stride, x86_amx %e41)
205 call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %base, i64 %stride, x86_amx %e42)
206 call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %base, i64 %stride, x86_amx %e51)
207 call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %base, i64 %stride, x86_amx %e52)
; x86_amx-typed (".internal") intrinsic forms called by @test_amx2,
; @test_amx3 and @test_amx_spill. Tiles are passed and returned as x86_amx
; values, and the t2rpntlvwz* loads return a { x86_amx, x86_amx } pair.
; NOTE(review): the leading i16 arguments are presumably the tile shape --
; confirm against the intrinsic definitions.
211 declare x86_amx @llvm.x86.tileloadd64.internal(i16, i16, i8*, i64)
212 declare void @llvm.x86.tilestored64.internal(i16, i16, i8*, i64, x86_amx)
213 declare { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal(i16, i16, i16, i8*, i64)
214 declare { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0t1.internal(i16, i16, i16, i8*, i64)
215 declare { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz1.internal(i16, i16, i16, i8*, i64)
216 declare { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz1t1.internal(i16, i16, i16, i8*, i64)
217 declare x86_amx @llvm.x86.ttransposed.internal(i16, i16, x86_amx)
218 declare x86_amx @llvm.x86.ttdpbf16ps.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)
219 declare x86_amx @llvm.x86.ttdpfp16ps.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)
220 declare x86_amx @llvm.x86.ttcmmimfp16ps.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)
221 declare x86_amx @llvm.x86.ttcmmrlfp16ps.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)
222 declare x86_amx @llvm.x86.tconjtcmmimfp16ps.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)
223 declare x86_amx @llvm.x86.tconjtfp16.internal(i16, i16, x86_amx)
; Attribute group #0: attached to each of the test functions defined above.
225 attributes #0 = { nounwind }