1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+amx-int8,+avx512f -verify-machineinstrs | FileCheck %s --check-prefix=AVX512
3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+amx-int8,+avx2 -verify-machineinstrs | FileCheck %s --check-prefix=AVX2
4 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+amx-int8,+avx -verify-machineinstrs | FileCheck %s --check-prefix=AVX1
5 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+amx-int8 -verify-machineinstrs | FileCheck %s --check-prefix=SSE2
; Two zero-initialised, 64-byte-aligned 1024-byte buffers. test_api loads its
; tiles from @buf or @buf2 and stores the result tile back into @buf.
7 @buf = dso_local global [1024 x i8] zeroinitializer, align 64
8 @buf2 = dso_local global [1024 x i8] zeroinitializer, align 64
; test_api: loads three AMX tiles from @buf or @buf2 (chosen by whether %0 is
; zero), combines them with tdpbssd and stores the result tile to @buf.
;
; The four check blocks below come from the same IR compiled with different
; vector feature sets. What differs between them is how the stack area later
; handed to ldtilecfg is cleared (one zmm store, two ymm stores, or four xmm
; stores) and whether vzeroupper is emitted after tilerelease.
; NOTE(review): the stack offsets are matched by regex, so these comments
; assume the cleared area is the one ldtilecfg reads - confirm against the
; actual llc output.
10 ; Function Attrs: nounwind uwtable
11 define <4 x i32> @test_api(i32 %0, i16 signext %1, i16 signext %2, <4 x i32> %xmm0) {
; +avx512f configuration: a single 512-bit (zmm) store clears the area.
12 ; AVX512-LABEL: test_api:
14 ; AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1
15 ; AVX512-NEXT: vmovups %zmm1, -{{[0-9]+}}(%rsp)
16 ; AVX512-NEXT: movb $1, -{{[0-9]+}}(%rsp)
17 ; AVX512-NEXT: movw %dx, -{{[0-9]+}}(%rsp)
18 ; AVX512-NEXT: movw %dx, -{{[0-9]+}}(%rsp)
19 ; AVX512-NEXT: movw %si, -{{[0-9]+}}(%rsp)
20 ; AVX512-NEXT: testl %edi, %edi
21 ; AVX512-NEXT: movsbl %sil, %eax
22 ; AVX512-NEXT: movb %al, -{{[0-9]+}}(%rsp)
23 ; AVX512-NEXT: movb %al, -{{[0-9]+}}(%rsp)
24 ; AVX512-NEXT: movb %al, -{{[0-9]+}}(%rsp)
25 ; AVX512-NEXT: ldtilecfg -{{[0-9]+}}(%rsp)
26 ; AVX512-NEXT: je .LBB0_2
27 ; AVX512-NEXT: # %bb.1:
28 ; AVX512-NEXT: movl $buf, %ecx
29 ; AVX512-NEXT: jmp .LBB0_3
30 ; AVX512-NEXT: .LBB0_2:
31 ; AVX512-NEXT: movl $buf2, %ecx
32 ; AVX512-NEXT: .LBB0_3:
33 ; AVX512-NEXT: movl $32, %edi
34 ; AVX512-NEXT: tileloadd (%rcx,%rdi), %tmm0
35 ; AVX512-NEXT: tileloadd (%rcx,%rdi), %tmm2
36 ; AVX512-NEXT: tileloadd (%rcx,%rdi), %tmm1
37 ; AVX512-NEXT: tdpbssd %tmm2, %tmm0, %tmm1
38 ; AVX512-NEXT: movl $buf, %ecx
39 ; AVX512-NEXT: movl $32, %esi
40 ; AVX512-NEXT: tilestored %tmm1, (%rcx,%rsi)
41 ; AVX512-NEXT: tilerelease
42 ; AVX512-NEXT: vzeroupper
; +avx2 configuration: two 256-bit (ymm) stores clear the area.
45 ; AVX2-LABEL: test_api:
47 ; AVX2-NEXT: vxorps %xmm1, %xmm1, %xmm1
48 ; AVX2-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
49 ; AVX2-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
50 ; AVX2-NEXT: movb $1, -{{[0-9]+}}(%rsp)
51 ; AVX2-NEXT: movw %dx, -{{[0-9]+}}(%rsp)
52 ; AVX2-NEXT: movw %dx, -{{[0-9]+}}(%rsp)
53 ; AVX2-NEXT: movw %si, -{{[0-9]+}}(%rsp)
54 ; AVX2-NEXT: testl %edi, %edi
55 ; AVX2-NEXT: movsbl %sil, %eax
56 ; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp)
57 ; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp)
58 ; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp)
59 ; AVX2-NEXT: ldtilecfg -{{[0-9]+}}(%rsp)
60 ; AVX2-NEXT: je .LBB0_2
62 ; AVX2-NEXT: movl $buf, %ecx
63 ; AVX2-NEXT: jmp .LBB0_3
65 ; AVX2-NEXT: movl $buf2, %ecx
67 ; AVX2-NEXT: movl $32, %edi
68 ; AVX2-NEXT: tileloadd (%rcx,%rdi), %tmm0
69 ; AVX2-NEXT: tileloadd (%rcx,%rdi), %tmm2
70 ; AVX2-NEXT: tileloadd (%rcx,%rdi), %tmm1
71 ; AVX2-NEXT: tdpbssd %tmm2, %tmm0, %tmm1
72 ; AVX2-NEXT: movl $buf, %ecx
73 ; AVX2-NEXT: movl $32, %esi
74 ; AVX2-NEXT: tilestored %tmm1, (%rcx,%rsi)
75 ; AVX2-NEXT: tilerelease
76 ; AVX2-NEXT: vzeroupper
; +avx configuration: four 128-bit (xmm) VEX-encoded stores clear the area;
; no vzeroupper is expected after tilerelease.
79 ; AVX1-LABEL: test_api:
81 ; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1
82 ; AVX1-NEXT: vmovups %xmm1, -{{[0-9]+}}(%rsp)
83 ; AVX1-NEXT: vmovups %xmm1, -{{[0-9]+}}(%rsp)
84 ; AVX1-NEXT: vmovups %xmm1, -{{[0-9]+}}(%rsp)
85 ; AVX1-NEXT: vmovups %xmm1, -{{[0-9]+}}(%rsp)
86 ; AVX1-NEXT: movb $1, -{{[0-9]+}}(%rsp)
87 ; AVX1-NEXT: movw %dx, -{{[0-9]+}}(%rsp)
88 ; AVX1-NEXT: movw %dx, -{{[0-9]+}}(%rsp)
89 ; AVX1-NEXT: movw %si, -{{[0-9]+}}(%rsp)
90 ; AVX1-NEXT: testl %edi, %edi
91 ; AVX1-NEXT: movsbl %sil, %eax
92 ; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp)
93 ; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp)
94 ; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp)
95 ; AVX1-NEXT: ldtilecfg -{{[0-9]+}}(%rsp)
96 ; AVX1-NEXT: je .LBB0_2
98 ; AVX1-NEXT: movl $buf, %ecx
99 ; AVX1-NEXT: jmp .LBB0_3
100 ; AVX1-NEXT: .LBB0_2:
101 ; AVX1-NEXT: movl $buf2, %ecx
102 ; AVX1-NEXT: .LBB0_3:
103 ; AVX1-NEXT: movl $32, %edi
104 ; AVX1-NEXT: tileloadd (%rcx,%rdi), %tmm0
105 ; AVX1-NEXT: tileloadd (%rcx,%rdi), %tmm2
106 ; AVX1-NEXT: tileloadd (%rcx,%rdi), %tmm1
107 ; AVX1-NEXT: tdpbssd %tmm2, %tmm0, %tmm1
108 ; AVX1-NEXT: movl $buf, %ecx
109 ; AVX1-NEXT: movl $32, %esi
110 ; AVX1-NEXT: tilestored %tmm1, (%rcx,%rsi)
111 ; AVX1-NEXT: tilerelease
; No AVX feature enabled: four legacy-encoded xorps/movups xmm stores clear
; the area; no vzeroupper is expected after tilerelease.
114 ; SSE2-LABEL: test_api:
116 ; SSE2-NEXT: xorps %xmm1, %xmm1
117 ; SSE2-NEXT: movups %xmm1, -{{[0-9]+}}(%rsp)
118 ; SSE2-NEXT: movups %xmm1, -{{[0-9]+}}(%rsp)
119 ; SSE2-NEXT: movups %xmm1, -{{[0-9]+}}(%rsp)
120 ; SSE2-NEXT: movups %xmm1, -{{[0-9]+}}(%rsp)
121 ; SSE2-NEXT: movb $1, -{{[0-9]+}}(%rsp)
122 ; SSE2-NEXT: movw %dx, -{{[0-9]+}}(%rsp)
123 ; SSE2-NEXT: movw %dx, -{{[0-9]+}}(%rsp)
124 ; SSE2-NEXT: movw %si, -{{[0-9]+}}(%rsp)
125 ; SSE2-NEXT: testl %edi, %edi
126 ; SSE2-NEXT: movsbl %sil, %eax
127 ; SSE2-NEXT: movb %al, -{{[0-9]+}}(%rsp)
128 ; SSE2-NEXT: movb %al, -{{[0-9]+}}(%rsp)
129 ; SSE2-NEXT: movb %al, -{{[0-9]+}}(%rsp)
130 ; SSE2-NEXT: ldtilecfg -{{[0-9]+}}(%rsp)
131 ; SSE2-NEXT: je .LBB0_2
132 ; SSE2-NEXT: # %bb.1:
133 ; SSE2-NEXT: movl $buf, %ecx
134 ; SSE2-NEXT: jmp .LBB0_3
135 ; SSE2-NEXT: .LBB0_2:
136 ; SSE2-NEXT: movl $buf2, %ecx
137 ; SSE2-NEXT: .LBB0_3:
138 ; SSE2-NEXT: movl $32, %edi
139 ; SSE2-NEXT: tileloadd (%rcx,%rdi), %tmm0
140 ; SSE2-NEXT: tileloadd (%rcx,%rdi), %tmm2
141 ; SSE2-NEXT: tileloadd (%rcx,%rdi), %tmm1
142 ; SSE2-NEXT: tdpbssd %tmm2, %tmm0, %tmm1
143 ; SSE2-NEXT: movl $buf, %ecx
144 ; SSE2-NEXT: movl $32, %esi
145 ; SSE2-NEXT: tilestored %tmm1, (%rcx,%rsi)
146 ; SSE2-NEXT: tilerelease
; %4 selects the source buffer; %6 is passed as the first i16 operand of every
; AMX intrinsic call below.
148 %4 = icmp eq i32 %0, 0
150 %6 = ashr exact i16 %5, 8
; %0 == 0 goes to %11 (loads from @buf2); otherwise to %7 (loads from @buf).
151 br i1 %4, label %11, label %7
; Three tile loads from @buf, each with i64 32 as the last operand.
154 %8 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %6, i16 %1, ptr @buf, i64 32)
155 %9 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %6, i16 %2, ptr @buf, i64 32)
156 %10 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %6, i16 %2, ptr @buf, i64 32)
; The same three loads, but from @buf2.
160 %12 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %6, i16 %1, ptr @buf2, i64 32)
161 %13 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %6, i16 %2, ptr @buf2, i64 32)
162 %14 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %6, i16 %2, ptr @buf2, i64 32)
; Join block: each phi picks the tile loaded on whichever path was taken.
165 15: ; preds = %11, %7
166 %16 = phi x86_amx [ %12, %11 ], [ %8, %7 ]
167 %17 = phi x86_amx [ %13, %11 ], [ %9, %7 ]
168 %18 = phi x86_amx [ %14, %11 ], [ %10, %7 ]
; %18 is the first tile operand of tdpbssd, followed by %16 and %17; the
; result %19 is stored to @buf regardless of which buffer was loaded.
169 %19 = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 %6, i16 %2, i16 %1, x86_amx %18, x86_amx %16, x86_amx %17)
170 tail call void @llvm.x86.tilestored64.internal(i16 %6, i16 %2, ptr @buf, i64 32, x86_amx %19)
; Declarations of the AMX intrinsics called by test_api: tile load, tile
; dot-product (tdpbssd) and tile store.
174 declare x86_amx @llvm.x86.tileloadd64.internal(i16, i16, ptr, i64)
175 declare x86_amx @llvm.x86.tdpbssd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)
176 declare void @llvm.x86.tilestored64.internal(i16, i16, ptr, i64, x86_amx)