1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+amx-int8 -mattr=+avx512f -verify-machineinstrs | FileCheck %s --check-prefix=AVX512
3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+amx-int8 -mattr=+avx2 -verify-machineinstrs | FileCheck %s --check-prefix=AVX2
4 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+amx-int8 -verify-machineinstrs | FileCheck %s --check-prefix=SSE2
; Two 1024-byte, 64-byte-aligned byte buffers serving as the in-memory
; backing for the AMX tile loads/stores exercised by @test_api below
; (@buf or @buf2 is selected at runtime based on the %0 argument).
6 @buf = dso_local global [1024 x i8] zeroinitializer, align 64
7 @buf2 = dso_local global [1024 x i8] zeroinitializer, align 64
9 ; Function Attrs: nounwind uwtable
10 define dso_local void @test_api(i32 %0, i16 signext %1, i16 signext %2) {
11 ; AVX512-LABEL: test_api:
13 ; AVX512-NEXT: vpxord %zmm0, %zmm0, %zmm0
14 ; AVX512-NEXT: vmovdqu64 %zmm0, -{{[0-9]+}}(%rsp)
15 ; AVX512-NEXT: movb $1, -{{[0-9]+}}(%rsp)
16 ; AVX512-NEXT: movw %dx, -{{[0-9]+}}(%rsp)
17 ; AVX512-NEXT: movw %dx, -{{[0-9]+}}(%rsp)
18 ; AVX512-NEXT: movw %si, -{{[0-9]+}}(%rsp)
19 ; AVX512-NEXT: testl %edi, %edi
20 ; AVX512-NEXT: movsbl %sil, %eax
21 ; AVX512-NEXT: movb %al, -{{[0-9]+}}(%rsp)
22 ; AVX512-NEXT: movb %al, -{{[0-9]+}}(%rsp)
23 ; AVX512-NEXT: movb %al, -{{[0-9]+}}(%rsp)
24 ; AVX512-NEXT: ldtilecfg -{{[0-9]+}}(%rsp)
25 ; AVX512-NEXT: je .LBB0_2
26 ; AVX512-NEXT: # %bb.1:
27 ; AVX512-NEXT: movl $buf, %ecx
28 ; AVX512-NEXT: jmp .LBB0_3
29 ; AVX512-NEXT: .LBB0_2:
30 ; AVX512-NEXT: movl $buf2, %ecx
31 ; AVX512-NEXT: .LBB0_3:
32 ; AVX512-NEXT: movl $32, %edi
33 ; AVX512-NEXT: tileloadd (%rcx,%rdi), %tmm0
34 ; AVX512-NEXT: tileloadd (%rcx,%rdi), %tmm2
35 ; AVX512-NEXT: tileloadd (%rcx,%rdi), %tmm1
36 ; AVX512-NEXT: tdpbssd %tmm2, %tmm0, %tmm1
37 ; AVX512-NEXT: movl $buf, %ecx
38 ; AVX512-NEXT: movl $32, %esi
39 ; AVX512-NEXT: tilestored %tmm1, (%rcx,%rsi)
40 ; AVX512-NEXT: tilerelease
41 ; AVX512-NEXT: vzeroupper
44 ; AVX2-LABEL: test_api:
46 ; AVX2-NEXT: vxorps %ymm0, %ymm0, %ymm0
47 ; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
48 ; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
49 ; AVX2-NEXT: movb $1, -{{[0-9]+}}(%rsp)
50 ; AVX2-NEXT: movw %dx, -{{[0-9]+}}(%rsp)
51 ; AVX2-NEXT: movw %dx, -{{[0-9]+}}(%rsp)
52 ; AVX2-NEXT: movw %si, -{{[0-9]+}}(%rsp)
53 ; AVX2-NEXT: testl %edi, %edi
54 ; AVX2-NEXT: movsbl %sil, %eax
55 ; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp)
56 ; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp)
57 ; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp)
58 ; AVX2-NEXT: ldtilecfg -{{[0-9]+}}(%rsp)
59 ; AVX2-NEXT: je .LBB0_2
61 ; AVX2-NEXT: movl $buf, %ecx
62 ; AVX2-NEXT: jmp .LBB0_3
64 ; AVX2-NEXT: movl $buf2, %ecx
66 ; AVX2-NEXT: movl $32, %edi
67 ; AVX2-NEXT: tileloadd (%rcx,%rdi), %tmm0
68 ; AVX2-NEXT: tileloadd (%rcx,%rdi), %tmm2
69 ; AVX2-NEXT: tileloadd (%rcx,%rdi), %tmm1
70 ; AVX2-NEXT: tdpbssd %tmm2, %tmm0, %tmm1
71 ; AVX2-NEXT: movl $buf, %ecx
72 ; AVX2-NEXT: movl $32, %esi
73 ; AVX2-NEXT: tilestored %tmm1, (%rcx,%rsi)
74 ; AVX2-NEXT: tilerelease
75 ; AVX2-NEXT: vzeroupper
78 ; SSE2-LABEL: test_api:
80 ; SSE2-NEXT: xorps %xmm0, %xmm0
81 ; SSE2-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp)
82 ; SSE2-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp)
83 ; SSE2-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp)
84 ; SSE2-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp)
85 ; SSE2-NEXT: movb $1, -{{[0-9]+}}(%rsp)
86 ; SSE2-NEXT: movw %dx, -{{[0-9]+}}(%rsp)
87 ; SSE2-NEXT: movw %dx, -{{[0-9]+}}(%rsp)
88 ; SSE2-NEXT: movw %si, -{{[0-9]+}}(%rsp)
89 ; SSE2-NEXT: testl %edi, %edi
90 ; SSE2-NEXT: movsbl %sil, %eax
91 ; SSE2-NEXT: movb %al, -{{[0-9]+}}(%rsp)
92 ; SSE2-NEXT: movb %al, -{{[0-9]+}}(%rsp)
93 ; SSE2-NEXT: movb %al, -{{[0-9]+}}(%rsp)
94 ; SSE2-NEXT: ldtilecfg -{{[0-9]+}}(%rsp)
95 ; SSE2-NEXT: je .LBB0_2
97 ; SSE2-NEXT: movl $buf, %ecx
98 ; SSE2-NEXT: jmp .LBB0_3
100 ; SSE2-NEXT: movl $buf2, %ecx
101 ; SSE2-NEXT: .LBB0_3:
102 ; SSE2-NEXT: movl $32, %edi
103 ; SSE2-NEXT: tileloadd (%rcx,%rdi), %tmm0
104 ; SSE2-NEXT: tileloadd (%rcx,%rdi), %tmm2
105 ; SSE2-NEXT: tileloadd (%rcx,%rdi), %tmm1
106 ; SSE2-NEXT: tdpbssd %tmm2, %tmm0, %tmm1
107 ; SSE2-NEXT: movl $buf, %ecx
108 ; SSE2-NEXT: movl $32, %esi
109 ; SSE2-NEXT: tilestored %tmm1, (%rcx,%rsi)
110 ; SSE2-NEXT: tilerelease
112 %4 = icmp eq i32 %0, 0
114 %6 = ashr exact i16 %5, 8
115 br i1 %4, label %11, label %7
118 %8 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %6, i16 %1, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf, i64 0, i64 0), i64 32)
119 %9 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %6, i16 %2, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf, i64 0, i64 0), i64 32)
120 %10 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %6, i16 %2, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf, i64 0, i64 0), i64 32)
124 %12 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %6, i16 %1, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf2, i64 0, i64 0), i64 32)
125 %13 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %6, i16 %2, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf2, i64 0, i64 0), i64 32)
126 %14 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %6, i16 %2, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf2, i64 0, i64 0), i64 32)
129 15: ; preds = %11, %7
130 %16 = phi x86_amx [ %12, %11 ], [ %8, %7 ]
131 %17 = phi x86_amx [ %13, %11 ], [ %9, %7 ]
132 %18 = phi x86_amx [ %14, %11 ], [ %10, %7 ]
133 %19 = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 %6, i16 %2, i16 %1, x86_amx %18, x86_amx %16, x86_amx %17)
134 tail call void @llvm.x86.tilestored64.internal(i16 %6, i16 %2, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf, i64 0, i64 0), i64 32, x86_amx %19)
; AMX intrinsic declarations used by @test_api. Operand convention, as seen
; at the call sites above: (i16 rows, i16 cols, i8* base, i64 stride) for the
; load/store intrinsics, with the x86_amx value carrying the tile register
; contents. These lower to the tileloadd / tdpbssd / tilestored instructions
; matched by the CHECK lines above.
138 declare x86_amx @llvm.x86.tileloadd64.internal(i16, i16, i8*, i64)
; tdpbssd: signed-byte x signed-byte dot-product accumulate;
; operands are (i16, i16, i16, acc tile, src1 tile, src2 tile).
140 declare x86_amx @llvm.x86.tdpbssd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)
; Stores the final x86_amx operand back to memory at (base, stride).
142 declare void @llvm.x86.tilestored64.internal(i16, i16, i8*, i64, x86_amx)