1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+amx-int8 -mattr=+avx512f | FileCheck %s --check-prefix=AVX512
3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+amx-int8 -mattr=+avx2 | FileCheck %s --check-prefix=AVX2
4 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+amx-int8 | FileCheck %s --check-prefix=SSE2
5 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+amx-int8 -mattr=+avx512f -O0 | FileCheck %s --check-prefix=AVX512-O0
6 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+amx-int8 -mattr=+avx2 -O0 | FileCheck %s --check-prefix=AVX2-O0
7 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+amx-int8 -O0 | FileCheck %s --check-prefix=SSE2-O0
; @foo obtains an x86_amx value from llvm.x86.tilezero.internal(i16 8, i16 32)
; and passes it, together with the same two i16 constants, %buf and i64 1024,
; to llvm.x86.tilestored64.internal. The autogenerated assertions that follow
; pin the llc output for each of the six run configurations listed at the top
; of the file (three feature sets, each at the default optimization level and
; at -O0). In the -O0 expectations the tile is additionally stored to a stack
; slot and loaded back (tilestored / ldtilecfg / tileloadd) before the final
; store through %rdi.
9 define void @foo(ptr %buf) nounwind {
11 ; AVX512: # %bb.0: # %entry
12 ; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
13 ; AVX512-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp)
14 ; AVX512-NEXT: movb $1, -{{[0-9]+}}(%rsp)
15 ; AVX512-NEXT: movb $8, -{{[0-9]+}}(%rsp)
16 ; AVX512-NEXT: movw $32, -{{[0-9]+}}(%rsp)
17 ; AVX512-NEXT: ldtilecfg -{{[0-9]+}}(%rsp)
18 ; AVX512-NEXT: movw $32, %ax
19 ; AVX512-NEXT: movw $8, %cx
20 ; AVX512-NEXT: tilezero %tmm0
21 ; AVX512-NEXT: movl $1024, %edx # imm = 0x400
22 ; AVX512-NEXT: tilestored %tmm0, (%rdi,%rdx)
23 ; AVX512-NEXT: tilerelease
24 ; AVX512-NEXT: vzeroupper
28 ; AVX2: # %bb.0: # %entry
29 ; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0
30 ; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
31 ; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
32 ; AVX2-NEXT: movb $1, -{{[0-9]+}}(%rsp)
33 ; AVX2-NEXT: movb $8, -{{[0-9]+}}(%rsp)
34 ; AVX2-NEXT: movw $32, -{{[0-9]+}}(%rsp)
35 ; AVX2-NEXT: ldtilecfg -{{[0-9]+}}(%rsp)
36 ; AVX2-NEXT: movw $32, %ax
37 ; AVX2-NEXT: movw $8, %cx
38 ; AVX2-NEXT: tilezero %tmm0
39 ; AVX2-NEXT: movl $1024, %edx # imm = 0x400
40 ; AVX2-NEXT: tilestored %tmm0, (%rdi,%rdx)
41 ; AVX2-NEXT: tilerelease
42 ; AVX2-NEXT: vzeroupper
46 ; SSE2: # %bb.0: # %entry
47 ; SSE2-NEXT: xorps %xmm0, %xmm0
48 ; SSE2-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp)
49 ; SSE2-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp)
50 ; SSE2-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp)
51 ; SSE2-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp)
52 ; SSE2-NEXT: movb $1, -{{[0-9]+}}(%rsp)
53 ; SSE2-NEXT: movb $8, -{{[0-9]+}}(%rsp)
54 ; SSE2-NEXT: movw $32, -{{[0-9]+}}(%rsp)
55 ; SSE2-NEXT: ldtilecfg -{{[0-9]+}}(%rsp)
56 ; SSE2-NEXT: movw $32, %ax
57 ; SSE2-NEXT: movw $8, %cx
58 ; SSE2-NEXT: tilezero %tmm0
59 ; SSE2-NEXT: movl $1024, %edx # imm = 0x400
60 ; SSE2-NEXT: tilestored %tmm0, (%rdi,%rdx)
61 ; SSE2-NEXT: tilerelease
64 ; AVX512-O0-LABEL: foo:
65 ; AVX512-O0: # %bb.0: # %entry
66 ; AVX512-O0-NEXT: pushq %rbp
67 ; AVX512-O0-NEXT: movq %rsp, %rbp
68 ; AVX512-O0-NEXT: andq $-1024, %rsp # imm = 0xFC00
69 ; AVX512-O0-NEXT: subq $3072, %rsp # imm = 0xC00
70 ; AVX512-O0-NEXT: vxorps %xmm0, %xmm0, %xmm0
71 ; AVX512-O0-NEXT: vmovups %zmm0, {{[0-9]+}}(%rsp)
72 ; AVX512-O0-NEXT: movb $1, {{[0-9]+}}(%rsp)
73 ; AVX512-O0-NEXT: movw $32, %cx
74 ; AVX512-O0-NEXT: movw $8, %ax
75 ; AVX512-O0-NEXT: # implicit-def: $al
76 ; AVX512-O0-NEXT: movb %al, {{[0-9]+}}(%rsp)
77 ; AVX512-O0-NEXT: movw %cx, {{[0-9]+}}(%rsp)
78 ; AVX512-O0-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
79 ; AVX512-O0-NEXT: tilezero %tmm0
80 ; AVX512-O0-NEXT: movl $64, %esi
81 ; AVX512-O0-NEXT: leaq {{[0-9]+}}(%rsp), %rdx
82 ; AVX512-O0-NEXT: movw $32, %cx
83 ; AVX512-O0-NEXT: movw $8, %ax
84 ; AVX512-O0-NEXT: tilestored %tmm0, (%rdx,%rsi)
85 ; AVX512-O0-NEXT: movl $64, %esi
86 ; AVX512-O0-NEXT: leaq {{[0-9]+}}(%rsp), %rdx
87 ; AVX512-O0-NEXT: movw $32, %cx
88 ; AVX512-O0-NEXT: movw $8, %ax
89 ; AVX512-O0-NEXT: # implicit-def: $al
90 ; AVX512-O0-NEXT: movb %al, {{[0-9]+}}(%rsp)
91 ; AVX512-O0-NEXT: movw %cx, {{[0-9]+}}(%rsp)
92 ; AVX512-O0-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
93 ; AVX512-O0-NEXT: tileloadd (%rdx,%rsi), %tmm0
94 ; AVX512-O0-NEXT: movl $1024, %edx # imm = 0x400
95 ; AVX512-O0-NEXT: movw $32, %cx
96 ; AVX512-O0-NEXT: movw $8, %ax
97 ; AVX512-O0-NEXT: tilestored %tmm0, (%rdi,%rdx)
98 ; AVX512-O0-NEXT: movq %rbp, %rsp
99 ; AVX512-O0-NEXT: popq %rbp
100 ; AVX512-O0-NEXT: tilerelease
101 ; AVX512-O0-NEXT: vzeroupper
102 ; AVX512-O0-NEXT: retq
104 ; AVX2-O0-LABEL: foo:
105 ; AVX2-O0: # %bb.0: # %entry
106 ; AVX2-O0-NEXT: pushq %rbp
107 ; AVX2-O0-NEXT: movq %rsp, %rbp
108 ; AVX2-O0-NEXT: andq $-1024, %rsp # imm = 0xFC00
109 ; AVX2-O0-NEXT: subq $3072, %rsp # imm = 0xC00
110 ; AVX2-O0-NEXT: vxorps %xmm0, %xmm0, %xmm0
111 ; AVX2-O0-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
112 ; AVX2-O0-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
113 ; AVX2-O0-NEXT: movb $1, {{[0-9]+}}(%rsp)
114 ; AVX2-O0-NEXT: movw $32, %cx
115 ; AVX2-O0-NEXT: movw $8, %ax
116 ; AVX2-O0-NEXT: # implicit-def: $al
117 ; AVX2-O0-NEXT: movb %al, {{[0-9]+}}(%rsp)
118 ; AVX2-O0-NEXT: movw %cx, {{[0-9]+}}(%rsp)
119 ; AVX2-O0-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
120 ; AVX2-O0-NEXT: tilezero %tmm0
121 ; AVX2-O0-NEXT: movl $64, %esi
122 ; AVX2-O0-NEXT: leaq {{[0-9]+}}(%rsp), %rdx
123 ; AVX2-O0-NEXT: movw $32, %cx
124 ; AVX2-O0-NEXT: movw $8, %ax
125 ; AVX2-O0-NEXT: tilestored %tmm0, (%rdx,%rsi)
126 ; AVX2-O0-NEXT: movl $64, %esi
127 ; AVX2-O0-NEXT: leaq {{[0-9]+}}(%rsp), %rdx
128 ; AVX2-O0-NEXT: movw $32, %cx
129 ; AVX2-O0-NEXT: movw $8, %ax
130 ; AVX2-O0-NEXT: # implicit-def: $al
131 ; AVX2-O0-NEXT: movb %al, {{[0-9]+}}(%rsp)
132 ; AVX2-O0-NEXT: movw %cx, {{[0-9]+}}(%rsp)
133 ; AVX2-O0-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
134 ; AVX2-O0-NEXT: tileloadd (%rdx,%rsi), %tmm0
135 ; AVX2-O0-NEXT: movl $1024, %edx # imm = 0x400
136 ; AVX2-O0-NEXT: movw $32, %cx
137 ; AVX2-O0-NEXT: movw $8, %ax
138 ; AVX2-O0-NEXT: tilestored %tmm0, (%rdi,%rdx)
139 ; AVX2-O0-NEXT: movq %rbp, %rsp
140 ; AVX2-O0-NEXT: popq %rbp
141 ; AVX2-O0-NEXT: tilerelease
142 ; AVX2-O0-NEXT: vzeroupper
145 ; SSE2-O0-LABEL: foo:
146 ; SSE2-O0: # %bb.0: # %entry
147 ; SSE2-O0-NEXT: pushq %rbp
148 ; SSE2-O0-NEXT: movq %rsp, %rbp
149 ; SSE2-O0-NEXT: andq $-1024, %rsp # imm = 0xFC00
150 ; SSE2-O0-NEXT: subq $3072, %rsp # imm = 0xC00
151 ; SSE2-O0-NEXT: xorps %xmm0, %xmm0
152 ; SSE2-O0-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
153 ; SSE2-O0-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
154 ; SSE2-O0-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
155 ; SSE2-O0-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
156 ; SSE2-O0-NEXT: movb $1, {{[0-9]+}}(%rsp)
157 ; SSE2-O0-NEXT: movw $32, %cx
158 ; SSE2-O0-NEXT: movw $8, %ax
159 ; SSE2-O0-NEXT: # implicit-def: $al
160 ; SSE2-O0-NEXT: movb %al, {{[0-9]+}}(%rsp)
161 ; SSE2-O0-NEXT: movw %cx, {{[0-9]+}}(%rsp)
162 ; SSE2-O0-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
163 ; SSE2-O0-NEXT: tilezero %tmm0
164 ; SSE2-O0-NEXT: movl $64, %esi
165 ; SSE2-O0-NEXT: leaq {{[0-9]+}}(%rsp), %rdx
166 ; SSE2-O0-NEXT: movw $32, %cx
167 ; SSE2-O0-NEXT: movw $8, %ax
168 ; SSE2-O0-NEXT: tilestored %tmm0, (%rdx,%rsi)
169 ; SSE2-O0-NEXT: movl $64, %esi
170 ; SSE2-O0-NEXT: leaq {{[0-9]+}}(%rsp), %rdx
171 ; SSE2-O0-NEXT: movw $32, %cx
172 ; SSE2-O0-NEXT: movw $8, %ax
173 ; SSE2-O0-NEXT: # implicit-def: $al
174 ; SSE2-O0-NEXT: movb %al, {{[0-9]+}}(%rsp)
175 ; SSE2-O0-NEXT: movw %cx, {{[0-9]+}}(%rsp)
176 ; SSE2-O0-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
177 ; SSE2-O0-NEXT: tileloadd (%rdx,%rsi), %tmm0
178 ; SSE2-O0-NEXT: movl $1024, %edx # imm = 0x400
179 ; SSE2-O0-NEXT: movw $32, %cx
180 ; SSE2-O0-NEXT: movw $8, %ax
181 ; SSE2-O0-NEXT: tilestored %tmm0, (%rdi,%rdx)
182 ; SSE2-O0-NEXT: movq %rbp, %rsp
183 ; SSE2-O0-NEXT: popq %rbp
184 ; SSE2-O0-NEXT: tilerelease
; IR under test. %t is the x86_amx result of the tilezero intrinsic; it is
; then handed to the tilestored64 intrinsic along with %buf and i64 1024.
187 %t = call x86_amx @llvm.x86.tilezero.internal(i16 8, i16 32)
188 call void @llvm.x86.tilestored64.internal(i16 8, i16 32, ptr %buf, i64 1024, x86_amx %t)
; Declarations of the two AMX intrinsics called by @foo. tilezero takes two
; i16 operands and returns an x86_amx value; tilestored64 takes two i16
; operands, a pointer, an i64 and an x86_amx value, and returns nothing.
192 declare x86_amx @llvm.x86.tilezero.internal(i16, i16)
193 declare void @llvm.x86.tilestored64.internal(i16, i16, ptr, i64, x86_amx)