1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+amx-int8 -mattr=+avx10.2-512 -mattr=+amx-avx512 -verify-machineinstrs | FileCheck %s
3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+amx-int8 -mattr=+avx10.2-512 -mattr=+amx-avx512 -verify-machineinstrs -enable-ipra | FileCheck -check-prefix=IPRA %s
4 ; RUN: llc < %s -O0 -mtriple=x86_64-unknown-unknown -mattr=+amx-int8 -mattr=+avx10.2-512 -mattr=+amx-avx512 -verify-machineinstrs | FileCheck -check-prefix=O0 %s
; 3072-byte, zero-initialized, 64-byte-aligned buffer. test_api loads its two
; tiles from byte offsets 0 and 1024 of this global.
6 @buf = dso_local global [3072 x i8] zeroinitializer, align 64
; Internal-linkage callee taking no arguments and returning void. The expected
; assembly for test_api contains `callq foo`, so this is the call that the
; tile registers in test_api are live across.
8 define internal void @foo() {
10 ; CHECK: # %bb.0: # %entry
14 ; IPRA: # %bb.0: # %entry
18 ; O0: # %bb.0: # %entry
; test_api loads two AMX tiles from @buf with stride 32: one from offset 0
; with shape arguments (%0, 8) and one from offset 1024 with shape arguments
; (8, %1). It then reads a <16 x i32> from each tile through
; llvm.x86.tilemovrow.internal with immediate 2 and adds the two vectors.
;
; What the autogenerated assertions below show:
;  * default run: both tiles are spilled with tilestored before `callq foo`,
;    ldtilecfg is executed again after the call, and both tiles are reloaded
;    before the tilemovrow instructions.
;  * -enable-ipra run: no tile spill or reload is expected around
;    `callq foo`, and ldtilecfg appears only once.
;  * -O0 run: every tileloadd is preceded by its own ldtilecfg, and the loaded
;    tiles are stored to the stack with tilestored before being loaded again
;    for tilemovrow.
24 define dso_local <16 x i32> @test_api(i16 signext %0, i16 signext %1) nounwind {
25 ; CHECK-LABEL: test_api:
27 ; CHECK-NEXT: pushq %rbp
28 ; CHECK-NEXT: pushq %r14
29 ; CHECK-NEXT: pushq %rbx
30 ; CHECK-NEXT: subq $2112, %rsp # imm = 0x840
31 ; CHECK-NEXT: movl %esi, %ebx
32 ; CHECK-NEXT: movl %edi, %ebp
33 ; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0
34 ; CHECK-NEXT: vmovups %zmm0, (%rsp)
35 ; CHECK-NEXT: movb $1, (%rsp)
36 ; CHECK-NEXT: movw $8, {{[0-9]+}}(%rsp)
37 ; CHECK-NEXT: movb $8, {{[0-9]+}}(%rsp)
38 ; CHECK-NEXT: movw %bx, {{[0-9]+}}(%rsp)
39 ; CHECK-NEXT: movb %bpl, {{[0-9]+}}(%rsp)
40 ; CHECK-NEXT: ldtilecfg (%rsp)
41 ; CHECK-NEXT: movl $buf, %eax
42 ; CHECK-NEXT: movl $32, %ecx
43 ; CHECK-NEXT: movw $8, %r14w
44 ; CHECK-NEXT: tileloadd (%rax,%rcx), %tmm0
45 ; CHECK-NEXT: movabsq $64, %rax
46 ; CHECK-NEXT: tilestored %tmm0, 1088(%rsp,%rax) # 1024-byte Folded Spill
47 ; CHECK-NEXT: movl $buf+1024, %eax
48 ; CHECK-NEXT: tileloadd (%rax,%rcx), %tmm1
49 ; CHECK-NEXT: movabsq $64, %rax
50 ; CHECK-NEXT: tilestored %tmm1, 64(%rsp,%rax) # 1024-byte Folded Spill
51 ; CHECK-NEXT: vzeroupper
52 ; CHECK-NEXT: callq foo
53 ; CHECK-NEXT: ldtilecfg (%rsp)
54 ; CHECK-NEXT: movabsq $64, %rax
55 ; CHECK-NEXT: tileloadd 64(%rsp,%rax), %tmm1 # 1024-byte Folded Reload
56 ; CHECK-NEXT: tilemovrow $2, %tmm1, %zmm0
57 ; CHECK-NEXT: tileloadd 1088(%rsp,%rax), %tmm0 # 1024-byte Folded Reload
58 ; CHECK-NEXT: tilemovrow $2, %tmm0, %zmm1
59 ; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0
60 ; CHECK-NEXT: addq $2112, %rsp # imm = 0x840
61 ; CHECK-NEXT: popq %rbx
62 ; CHECK-NEXT: popq %r14
63 ; CHECK-NEXT: popq %rbp
64 ; CHECK-NEXT: tilerelease
67 ; IPRA-LABEL: test_api:
69 ; IPRA-NEXT: subq $72, %rsp
70 ; IPRA-NEXT: vxorps %xmm0, %xmm0, %xmm0
71 ; IPRA-NEXT: vmovups %zmm0, {{[0-9]+}}(%rsp)
72 ; IPRA-NEXT: movb $1, {{[0-9]+}}(%rsp)
73 ; IPRA-NEXT: movw $8, {{[0-9]+}}(%rsp)
74 ; IPRA-NEXT: movb $8, {{[0-9]+}}(%rsp)
75 ; IPRA-NEXT: movw %si, {{[0-9]+}}(%rsp)
76 ; IPRA-NEXT: movb %dil, {{[0-9]+}}(%rsp)
77 ; IPRA-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
78 ; IPRA-NEXT: movl $buf, %eax
79 ; IPRA-NEXT: movl $32, %ecx
80 ; IPRA-NEXT: movw $8, %dx
81 ; IPRA-NEXT: tileloadd (%rax,%rcx), %tmm0
82 ; IPRA-NEXT: movl $buf+1024, %eax
83 ; IPRA-NEXT: tileloadd (%rax,%rcx), %tmm1
84 ; IPRA-NEXT: callq foo
85 ; IPRA-NEXT: tilemovrow $2, %tmm1, %zmm0
86 ; IPRA-NEXT: tilemovrow $2, %tmm0, %zmm1
87 ; IPRA-NEXT: vpaddd %zmm1, %zmm0, %zmm0
88 ; IPRA-NEXT: addq $72, %rsp
89 ; IPRA-NEXT: tilerelease
95 ; O0-NEXT: movq %rsp, %rbp
96 ; O0-NEXT: andq $-1024, %rsp # imm = 0xFC00
97 ; O0-NEXT: subq $4096, %rsp # imm = 0x1000
98 ; O0-NEXT: vpxor %xmm0, %xmm0, %xmm0
99 ; O0-NEXT: vmovups %zmm0, {{[0-9]+}}(%rsp)
100 ; O0-NEXT: movb $1, {{[0-9]+}}(%rsp)
101 ; O0-NEXT: movw %si, %cx
102 ; O0-NEXT: movw %cx, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
103 ; O0-NEXT: movw %di, %ax
104 ; O0-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
105 ; O0-NEXT: movl $buf, %esi
106 ; O0-NEXT: movl $32, %edi
107 ; O0-NEXT: movw $8, %dx
108 ; O0-NEXT: # implicit-def: $al
109 ; O0-NEXT: movb %al, {{[0-9]+}}(%rsp)
110 ; O0-NEXT: movw %dx, {{[0-9]+}}(%rsp)
111 ; O0-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
112 ; O0-NEXT: tileloadd (%rsi,%rdi), %tmm0
113 ; O0-NEXT: movl $64, %edi
114 ; O0-NEXT: leaq {{[0-9]+}}(%rsp), %rsi
115 ; O0-NEXT: movw $8, %dx
116 ; O0-NEXT: tilestored %tmm0, (%rsi,%rdi)
117 ; O0-NEXT: movl $32, %esi
118 ; O0-NEXT: movl $buf+1024, %edx
119 ; O0-NEXT: movw $8, %ax
120 ; O0-NEXT: # implicit-def: $al
121 ; O0-NEXT: movb %al, {{[0-9]+}}(%rsp)
122 ; O0-NEXT: movw %cx, {{[0-9]+}}(%rsp)
123 ; O0-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
124 ; O0-NEXT: tileloadd (%rdx,%rsi), %tmm0
125 ; O0-NEXT: movl $64, %esi
126 ; O0-NEXT: leaq {{[0-9]+}}(%rsp), %rdx
127 ; O0-NEXT: movw $8, %ax
128 ; O0-NEXT: tilestored %tmm0, (%rdx,%rsi)
129 ; O0-NEXT: vzeroupper
131 ; O0-NEXT: movw {{[-0-9]+}}(%r{{[sb]}}p), %dx # 2-byte Reload
132 ; O0-NEXT: movw {{[-0-9]+}}(%r{{[sb]}}p), %ax # 2-byte Reload
133 ; O0-NEXT: movl $64, %edi
134 ; O0-NEXT: leaq {{[0-9]+}}(%rsp), %rsi
135 ; O0-NEXT: movw $8, %cx
136 ; O0-NEXT: # implicit-def: $cl
137 ; O0-NEXT: movb %cl, {{[0-9]+}}(%rsp)
138 ; O0-NEXT: movw %dx, {{[0-9]+}}(%rsp)
139 ; O0-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
140 ; O0-NEXT: tileloadd (%rsi,%rdi), %tmm0
141 ; O0-NEXT: movw $8, %cx
142 ; O0-NEXT: tilemovrow $2, %tmm0, %zmm0
143 ; O0-NEXT: movl $64, %esi
144 ; O0-NEXT: leaq {{[0-9]+}}(%rsp), %rdx
145 ; O0-NEXT: movw $8, %cx
146 ; O0-NEXT: # implicit-def: $al
147 ; O0-NEXT: movb %al, {{[0-9]+}}(%rsp)
148 ; O0-NEXT: movw %cx, {{[0-9]+}}(%rsp)
149 ; O0-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
150 ; O0-NEXT: tileloadd (%rdx,%rsi), %tmm0
151 ; O0-NEXT: movw $8, %cx
152 ; O0-NEXT: tilemovrow $2, %tmm0, %zmm1
153 ; O0-NEXT: vpaddd %zmm1, %zmm0, %zmm0
154 ; O0-NEXT: movq %rbp, %rsp
156 ; O0-NEXT: tilerelease
; Tile loads, both with stride 32: %3 comes from the start of @buf with shape
; arguments (%0, 8); %4 comes from byte offset 1024 with shape arguments (8, %1).
158 %3 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %0, i16 8, ptr @buf, i64 32)
159 %4 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 %1, ptr getelementptr inbounds ([3072 x i8], ptr @buf, i64 0, i64 1024), i64 32)
; Each tilemovrow call passes the same shape arguments as the load that
; produced its tile. The trailing i32 2 is presumably the row index -- TODO
; confirm against the intrinsic definition.
161 %5 = call <16 x i32> @llvm.x86.tilemovrow.internal(i16 8, i16 %1, x86_amx %4, i32 2)
162 %6 = call <16 x i32> @llvm.x86.tilemovrow.internal(i16 %0, i16 8, x86_amx %3, i32 2)
; Element-wise sum of the two extracted vectors (the vpaddd in the assertions).
163 %7 = add <16 x i32> %5, %6
; Declarations of the internal AMX intrinsics. test_api calls
; llvm.x86.tileloadd64.internal and llvm.x86.tilemovrow.internal; the
; tdpbssd and tilestored64 declarations appear to be unreferenced by the IR
; in this test -- TODO confirm before removing them.
168 declare x86_amx @llvm.x86.tileloadd64.internal(i16, i16, ptr, i64)
169 declare x86_amx @llvm.x86.tdpbssd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)
170 declare void @llvm.x86.tilestored64.internal(i16, i16, ptr, i64, x86_amx)
171 declare <16 x i32> @llvm.x86.tilemovrow.internal(i16, i16, x86_amx, i32)