; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+amx-tile,+amx-int8,+amx-bf16,+avx512f -verify-machineinstrs | FileCheck %s

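; test_amx covers the basic AMX codegen path: llc materializes the tile
; configuration in a stack slot and loads it with ldtilecfg before the first
; tile instruction, lowers each int8/bf16 dot-product intrinsic to its tdpb*
; instruction, and releases the palette with tilerelease at the end.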
define void @test_amx(ptr %pointer, ptr %base, i64 %stride) {
; CHECK-LABEL: test_amx:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; CHECK-NEXT:    vmovups %zmm0, -{{[0-9]+}}(%rsp)
; CHECK-NEXT:    movb $1, -{{[0-9]+}}(%rsp)
; CHECK-NEXT:    movb $8, -{{[0-9]+}}(%rsp)
; CHECK-NEXT:    movw $8, -{{[0-9]+}}(%rsp)
; CHECK-NEXT:    movb $8, -{{[0-9]+}}(%rsp)
; CHECK-NEXT:    movw $8, -{{[0-9]+}}(%rsp)
; CHECK-NEXT:    movb $8, -{{[0-9]+}}(%rsp)
; CHECK-NEXT:    movw $8, -{{[0-9]+}}(%rsp)
; CHECK-NEXT:    ldtilecfg -{{[0-9]+}}(%rsp)
; CHECK-NEXT:    movw $8, %ax
; CHECK-NEXT:    tilezero %tmm0
; CHECK-NEXT:    tileloadd (%rsi,%rdx), %tmm1
; CHECK-NEXT:    tileloadd (%rsi,%rdx), %tmm2
; CHECK-NEXT:    tdpbssd %tmm2, %tmm1, %tmm0
; CHECK-NEXT:    tdpbsud %tmm2, %tmm1, %tmm0
; CHECK-NEXT:    tdpbusd %tmm2, %tmm1, %tmm0
; CHECK-NEXT:    tdpbuud %tmm2, %tmm1, %tmm0
; CHECK-NEXT:    tdpbf16ps %tmm2, %tmm1, %tmm0
; CHECK-NEXT:    tileloaddt1 (%rsi,%rdx), %tmm1
; CHECK-NEXT:    tilestored %tmm0, (%rdi,%rdx)
; CHECK-NEXT:    tilerelease
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %c = call x86_amx @llvm.x86.tilezero.internal(i16 8, i16 8)
  %a = call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 8, ptr %base, i64 %stride)
  %b = call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 8, ptr %base, i64 %stride)
  %d0 = call x86_amx @llvm.x86.tdpbssd.internal(i16 8, i16 8, i16 8, x86_amx %c, x86_amx %a, x86_amx %b)
  %d1 = call x86_amx @llvm.x86.tdpbsud.internal(i16 8, i16 8, i16 8, x86_amx %d0, x86_amx %a, x86_amx %b)
  %d2 = call x86_amx @llvm.x86.tdpbusd.internal(i16 8, i16 8, i16 8, x86_amx %d1, x86_amx %a, x86_amx %b)
  %d3 = call x86_amx @llvm.x86.tdpbuud.internal(i16 8, i16 8, i16 8, x86_amx %d2, x86_amx %a, x86_amx %b)
  %d4 = call x86_amx @llvm.x86.tdpbf16ps.internal(i16 8, i16 8, i16 8, x86_amx %d3, x86_amx %a, x86_amx %b)
  %e = call x86_amx @llvm.x86.tileloaddt164.internal(i16 8, i16 8, ptr %base, i64 %stride)
  call void @llvm.x86.tilestored64.internal(i16 8, i16 8, ptr %pointer, i64 %stride, x86_amx %d4)
  ret void
}

declare x86_amx @llvm.x86.tilezero.internal(i16, i16)
declare x86_amx @llvm.x86.tileloadd64.internal(i16, i16, ptr, i64)
declare x86_amx @llvm.x86.tileloaddt164.internal(i16, i16, ptr, i64)
declare x86_amx @llvm.x86.tdpbssd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)
declare x86_amx @llvm.x86.tdpbsud.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)
declare x86_amx @llvm.x86.tdpbusd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)
declare x86_amx @llvm.x86.tdpbuud.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)
declare x86_amx @llvm.x86.tdpbf16ps.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)
declare void @llvm.x86.tilestored64.internal(i16, i16, ptr, i64, x86_amx)

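; Regression test reduced from PR90954. The tile spill/reload in the inner
; loop must not clobber %rax, which holds the zero stride for the tilestored
; in .LBB1_4; note the 8-byte spill/reload of %rax bracketing the 1024-byte
; folded tile spill below.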
define void @PR90954(ptr %0, ptr %1, i32 %2) nounwind {
; CHECK-LABEL: PR90954:
; CHECK:       # %bb.0:
; CHECK-NEXT:    pushq %rbp
; CHECK-NEXT:    movq %rsp, %rbp
; CHECK-NEXT:    pushq %r15
; CHECK-NEXT:    pushq %r14
; CHECK-NEXT:    pushq %r13
; CHECK-NEXT:    pushq %r12
; CHECK-NEXT:    pushq %rbx
; CHECK-NEXT:    andq $-1024, %rsp # imm = 0xFC00
; CHECK-NEXT:    subq $5120, %rsp # imm = 0x1400
; CHECK-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; CHECK-NEXT:    vmovups %zmm0, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    movb $1, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    movb $16, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    movw $64, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    movb $16, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    movw $64, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    movb $16, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    movw $64, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    ldtilecfg {{[0-9]+}}(%rsp)
; CHECK-NEXT:    shll $4, %edx
; CHECK-NEXT:    xorl %eax, %eax
; CHECK-NEXT:    movw $64, %cx
; CHECK-NEXT:    movw $16, %di
; CHECK-NEXT:    movb $1, %r8b
; CHECK-NEXT:    movl $64, %r9d
; CHECK-NEXT:    leaq {{[0-9]+}}(%rsp), %r10
; CHECK-NEXT:    leaq {{[0-9]+}}(%rsp), %r11
; CHECK-NEXT:    xorl %ebx, %ebx
; CHECK-NEXT:    xorl %r14d, %r14d
; CHECK-NEXT:    jmp .LBB1_1
; CHECK-NEXT:  .p2align 4
; CHECK-NEXT:  .LBB1_5: # in Loop: Header=BB1_1 Depth=1
; CHECK-NEXT:    incq %r14
; CHECK-NEXT:    addl %edx, %ebx
; CHECK-NEXT:  .LBB1_1: # =>This Loop Header: Depth=1
; CHECK-NEXT:    # Child Loop BB1_2 Depth 2
; CHECK-NEXT:    movslq %ebx, %r15
; CHECK-NEXT:    leaq (%rsi,%r15,4), %r15
; CHECK-NEXT:    xorl %r12d, %r12d
; CHECK-NEXT:    xorl %r13d, %r13d
; CHECK-NEXT:    jmp .LBB1_2
; CHECK-NEXT:  .p2align 4
; CHECK-NEXT:  .LBB1_4: # in Loop: Header=BB1_2 Depth=2
; CHECK-NEXT:    tilestored %tmm1, (%r15,%rax)
; CHECK-NEXT:    incq %r13
; CHECK-NEXT:    addq $64, %r15
; CHECK-NEXT:    decq %r12
; CHECK-NEXT:    je .LBB1_5
; CHECK-NEXT:  .LBB1_2: # Parent Loop BB1_1 Depth=1
; CHECK-NEXT:    # => This Inner Loop Header: Depth=2
; CHECK-NEXT:    tilezero %tmm0
; CHECK-NEXT:    tilezero %tmm1
; CHECK-NEXT:    testb %r8b, %r8b
; CHECK-NEXT:    jne .LBB1_4
; CHECK-NEXT:  # %bb.3: # in Loop: Header=BB1_2 Depth=2
; CHECK-NEXT:    vmovaps %zmm0, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    vmovaps %zmm0, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    vmovaps %zmm0, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    vmovaps %zmm0, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    vmovaps %zmm0, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    vmovaps %zmm0, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    vmovaps %zmm0, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    vmovaps %zmm0, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    vmovaps %zmm0, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    vmovaps %zmm0, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    vmovaps %zmm0, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    vmovaps %zmm0, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    vmovaps %zmm0, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    vmovaps %zmm0, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    vmovaps %zmm0, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    vmovaps %zmm0, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    tileloadd (%r10,%r9), %tmm1
; CHECK-NEXT:    vmovaps %zmm0, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    vmovaps %zmm0, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    vmovaps %zmm0, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    vmovaps %zmm0, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    vmovaps %zmm0, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    vmovaps %zmm0, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    vmovaps %zmm0, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    vmovaps %zmm0, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    vmovaps %zmm0, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    vmovaps %zmm0, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    vmovaps %zmm0, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    vmovaps %zmm0, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    vmovaps %zmm0, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    vmovaps %zmm0, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    vmovaps %zmm0, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    vmovaps %zmm0, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    tileloadd (%r11,%r9), %tmm2
; CHECK-NEXT:    tdpbf16ps %tmm2, %tmm1, %tmm0
; CHECK-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; CHECK-NEXT:    movabsq $64, %rax
; CHECK-NEXT:    tilestored %tmm0, 3072(%rsp,%rax) # 1024-byte Folded Spill
; CHECK-NEXT:    tileloadd 3072(%rsp,%rax), %tmm1 # 1024-byte Folded Reload
; CHECK-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
; CHECK-NEXT:    jmp .LBB1_4
  %4 = shl i32 %2, 4
  %5 = icmp eq i64 0, 0
  br label %6

6:                                                ; preds = %31, %3
  %7 = phi i64 [ 0, %3 ], [ %32, %31 ]
  %8 = trunc nuw nsw i64 %7 to i32
  %9 = mul i32 %4, %8
  %10 = mul i32 %2, %8
  %11 = sext i32 %9 to i64
  %12 = getelementptr inbounds i32, ptr %1, i64 %11
  br label %13

13:                                               ; preds = %25, %6
  %14 = phi i64 [ %29, %25 ], [ 0, %6 ]
  %15 = tail call x86_amx @llvm.x86.tilezero.internal(i16 16, i16 64)
  %16 = tail call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %15)
  %17 = shl nsw i64 %14, 4
  %18 = getelementptr i32, ptr %0, i64 %17
  br i1 %5, label %25, label %19

19:                                               ; preds = %13
  %20 = tail call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %16)
  %21 = tail call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> zeroinitializer)
  %22 = tail call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> zeroinitializer)
  %23 = tail call x86_amx @llvm.x86.tdpbf16ps.internal(i16 16, i16 64, i16 64, x86_amx %20, x86_amx %21, x86_amx %22)
  %24 = tail call noundef <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %23)
  br label %25

25:                                               ; preds = %19, %13
  %26 = phi <256 x i32> [ undef, %13 ], [ %24, %19 ]
  %27 = getelementptr inbounds i32, ptr %12, i64 %17
  %28 = tail call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %26)
  tail call void @llvm.x86.tilestored64.internal(i16 16, i16 64, ptr %27, i64 0, x86_amx %28)
  %29 = add nuw nsw i64 %14, 1
  %30 = icmp eq i64 %29, 0
  br i1 %30, label %31, label %13

31:                                               ; preds = %25
  %32 = add nuw nsw i64 %7, 1
  br label %6
}

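; A single tilezero result feeds both tdpbf16ps calls, and each call clobbers
; its accumulator tile, so the allocator must keep a second copy alive: hence
; the 1024-byte folded spill of %tmm0 and its reload into %tmm1.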
define void @multi_use() nounwind {
; CHECK-LABEL: multi_use:
; CHECK:       # %bb.0:
; CHECK-NEXT:    pushq %rbp
; CHECK-NEXT:    subq $2928, %rsp # imm = 0xB70
; CHECK-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; CHECK-NEXT:    vmovups %zmm0, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    movb $1, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    movb $16, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    movw $64, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    movb $16, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    movw $64, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    movw $64, %ax
; CHECK-NEXT:    ldtilecfg {{[0-9]+}}(%rsp)
; CHECK-NEXT:    movw $16, %cx
; CHECK-NEXT:    tilezero %tmm0
; CHECK-NEXT:    movabsq $64, %rbp
; CHECK-NEXT:    tilestored %tmm0, 896(%rsp,%rbp) # 1024-byte Folded Spill
; CHECK-NEXT:    tileloadd 896(%rsp,%rbp), %tmm1 # 1024-byte Folded Reload
; CHECK-NEXT:    tdpbf16ps %tmm0, %tmm0, %tmm1
; CHECK-NEXT:    tdpbf16ps %tmm0, %tmm0, %tmm0
; CHECK-NEXT:    addq $2928, %rsp # imm = 0xB70
; CHECK-NEXT:    popq %rbp
; CHECK-NEXT:    tilerelease
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %1 = call x86_amx @llvm.x86.tilezero.internal(i16 16, i16 64)
  %2 = call x86_amx @llvm.x86.tdpbf16ps.internal(i16 16, i16 64, i16 64, x86_amx %1, x86_amx %1, x86_amx %1)
  %3 = call x86_amx @llvm.x86.tdpbf16ps.internal(i16 16, i16 64, i16 64, x86_amx %1, x86_amx %1, x86_amx %1)
  ret void
}

declare x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32>)
declare <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx)