1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512vnni --show-mc-encoding | FileCheck %s --check-prefixes=CHECK,X86
3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vnni --show-mc-encoding | FileCheck %s --check-prefixes=CHECK,X64
5 declare <16 x i32> @llvm.x86.avx512.vpdpbusd.512(<16 x i32>, <16 x i32>, <16 x i32>)
7 define <16 x i32>@test_int_x86_avx512_mask_vpdpbusd_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32>* %x2p, <16 x i32> %x4, i16 %x3) {
8 ; X86-LABEL: test_int_x86_avx512_mask_vpdpbusd_512:
10 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
11 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x08]
12 ; X86-NEXT: vmovaps %zmm0, %zmm3 # encoding: [0x62,0xf1,0x7c,0x48,0x28,0xd8]
13 ; X86-NEXT: vpdpbusd (%eax), %zmm1, %zmm3 {%k1} # encoding: [0x62,0xf2,0x75,0x49,0x50,0x18]
14 ; X86-NEXT: vmovaps %zmm0, %zmm4 # encoding: [0x62,0xf1,0x7c,0x48,0x28,0xe0]
15 ; X86-NEXT: vpdpbusd %zmm2, %zmm1, %zmm4 # encoding: [0x62,0xf2,0x75,0x48,0x50,0xe2]
16 ; X86-NEXT: vpdpbusd %zmm2, %zmm1, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xc9,0x50,0xc2]
17 ; X86-NEXT: vpaddd %zmm0, %zmm4, %zmm0 # encoding: [0x62,0xf1,0x5d,0x48,0xfe,0xc0]
18 ; X86-NEXT: vpaddd %zmm0, %zmm3, %zmm0 # encoding: [0x62,0xf1,0x65,0x48,0xfe,0xc0]
19 ; X86-NEXT: retl # encoding: [0xc3]
21 ; X64-LABEL: test_int_x86_avx512_mask_vpdpbusd_512:
23 ; X64-NEXT: kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce]
24 ; X64-NEXT: vmovaps %zmm0, %zmm3 # encoding: [0x62,0xf1,0x7c,0x48,0x28,0xd8]
25 ; X64-NEXT: vpdpbusd (%rdi), %zmm1, %zmm3 {%k1} # encoding: [0x62,0xf2,0x75,0x49,0x50,0x1f]
26 ; X64-NEXT: vmovaps %zmm0, %zmm4 # encoding: [0x62,0xf1,0x7c,0x48,0x28,0xe0]
27 ; X64-NEXT: vpdpbusd %zmm2, %zmm1, %zmm4 # encoding: [0x62,0xf2,0x75,0x48,0x50,0xe2]
28 ; X64-NEXT: vpdpbusd %zmm2, %zmm1, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xc9,0x50,0xc2]
29 ; X64-NEXT: vpaddd %zmm0, %zmm4, %zmm0 # encoding: [0x62,0xf1,0x5d,0x48,0xfe,0xc0]
30 ; X64-NEXT: vpaddd %zmm0, %zmm3, %zmm0 # encoding: [0x62,0xf1,0x65,0x48,0xfe,0xc0]
31 ; X64-NEXT: retq # encoding: [0xc3]
32 %x2 = load <16 x i32>, <16 x i32>* %x2p
33 %1 = call <16 x i32> @llvm.x86.avx512.vpdpbusd.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2)
34 %2 = bitcast i16 %x3 to <16 x i1>
35 %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> %x0
36 %4 = call <16 x i32> @llvm.x86.avx512.vpdpbusd.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x4)
37 %5 = call <16 x i32> @llvm.x86.avx512.vpdpbusd.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x4)
38 %6 = bitcast i16 %x3 to <16 x i1>
39 %7 = select <16 x i1> %6, <16 x i32> %5, <16 x i32> zeroinitializer
40 %res3 = add <16 x i32> %3, %4
41 %res4 = add <16 x i32> %7, %res3
45 declare <16 x i32> @llvm.x86.avx512.vpdpbusds.512(<16 x i32>, <16 x i32>, <16 x i32>)
47 define <16 x i32>@test_int_x86_avx512_mask_vpdpbusds_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32>* %x2p, <16 x i32> %x4, i16 %x3) {
48 ; X86-LABEL: test_int_x86_avx512_mask_vpdpbusds_512:
50 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
51 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x08]
52 ; X86-NEXT: vmovaps %zmm0, %zmm3 # encoding: [0x62,0xf1,0x7c,0x48,0x28,0xd8]
53 ; X86-NEXT: vpdpbusds (%eax), %zmm1, %zmm3 {%k1} # encoding: [0x62,0xf2,0x75,0x49,0x51,0x18]
54 ; X86-NEXT: vmovaps %zmm0, %zmm4 # encoding: [0x62,0xf1,0x7c,0x48,0x28,0xe0]
55 ; X86-NEXT: vpdpbusds %zmm2, %zmm1, %zmm4 # encoding: [0x62,0xf2,0x75,0x48,0x51,0xe2]
56 ; X86-NEXT: vpdpbusds %zmm2, %zmm1, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xc9,0x51,0xc2]
57 ; X86-NEXT: vpaddd %zmm0, %zmm4, %zmm0 # encoding: [0x62,0xf1,0x5d,0x48,0xfe,0xc0]
58 ; X86-NEXT: vpaddd %zmm0, %zmm3, %zmm0 # encoding: [0x62,0xf1,0x65,0x48,0xfe,0xc0]
59 ; X86-NEXT: retl # encoding: [0xc3]
61 ; X64-LABEL: test_int_x86_avx512_mask_vpdpbusds_512:
63 ; X64-NEXT: kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce]
64 ; X64-NEXT: vmovaps %zmm0, %zmm3 # encoding: [0x62,0xf1,0x7c,0x48,0x28,0xd8]
65 ; X64-NEXT: vpdpbusds (%rdi), %zmm1, %zmm3 {%k1} # encoding: [0x62,0xf2,0x75,0x49,0x51,0x1f]
66 ; X64-NEXT: vmovaps %zmm0, %zmm4 # encoding: [0x62,0xf1,0x7c,0x48,0x28,0xe0]
67 ; X64-NEXT: vpdpbusds %zmm2, %zmm1, %zmm4 # encoding: [0x62,0xf2,0x75,0x48,0x51,0xe2]
68 ; X64-NEXT: vpdpbusds %zmm2, %zmm1, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xc9,0x51,0xc2]
69 ; X64-NEXT: vpaddd %zmm0, %zmm4, %zmm0 # encoding: [0x62,0xf1,0x5d,0x48,0xfe,0xc0]
70 ; X64-NEXT: vpaddd %zmm0, %zmm3, %zmm0 # encoding: [0x62,0xf1,0x65,0x48,0xfe,0xc0]
71 ; X64-NEXT: retq # encoding: [0xc3]
72 %x2 = load <16 x i32>, <16 x i32>* %x2p
73 %1 = call <16 x i32> @llvm.x86.avx512.vpdpbusds.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2)
74 %2 = bitcast i16 %x3 to <16 x i1>
75 %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> %x0
76 %4 = call <16 x i32> @llvm.x86.avx512.vpdpbusds.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x4)
77 %5 = call <16 x i32> @llvm.x86.avx512.vpdpbusds.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x4)
78 %6 = bitcast i16 %x3 to <16 x i1>
79 %7 = select <16 x i1> %6, <16 x i32> %5, <16 x i32> zeroinitializer
80 %res3 = add <16 x i32> %3, %4
81 %res4 = add <16 x i32> %7, %res3
85 declare <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32>, <16 x i32>, <16 x i32>)
87 define <16 x i32>@test_int_x86_avx512_mask_vpdpwssd_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32>* %x2p, <16 x i32> %x4, i16 %x3) {
88 ; X86-LABEL: test_int_x86_avx512_mask_vpdpwssd_512:
90 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
91 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x08]
92 ; X86-NEXT: vmovaps %zmm0, %zmm3 # encoding: [0x62,0xf1,0x7c,0x48,0x28,0xd8]
93 ; X86-NEXT: vpdpwssd (%eax), %zmm1, %zmm3 {%k1} # encoding: [0x62,0xf2,0x75,0x49,0x52,0x18]
94 ; X86-NEXT: vmovaps %zmm0, %zmm4 # encoding: [0x62,0xf1,0x7c,0x48,0x28,0xe0]
95 ; X86-NEXT: vpdpwssd %zmm2, %zmm1, %zmm4 # encoding: [0x62,0xf2,0x75,0x48,0x52,0xe2]
96 ; X86-NEXT: vpdpwssd %zmm2, %zmm1, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xc9,0x52,0xc2]
97 ; X86-NEXT: vpaddd %zmm0, %zmm4, %zmm0 # encoding: [0x62,0xf1,0x5d,0x48,0xfe,0xc0]
98 ; X86-NEXT: vpaddd %zmm0, %zmm3, %zmm0 # encoding: [0x62,0xf1,0x65,0x48,0xfe,0xc0]
99 ; X86-NEXT: retl # encoding: [0xc3]
101 ; X64-LABEL: test_int_x86_avx512_mask_vpdpwssd_512:
103 ; X64-NEXT: kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce]
104 ; X64-NEXT: vmovaps %zmm0, %zmm3 # encoding: [0x62,0xf1,0x7c,0x48,0x28,0xd8]
105 ; X64-NEXT: vpdpwssd (%rdi), %zmm1, %zmm3 {%k1} # encoding: [0x62,0xf2,0x75,0x49,0x52,0x1f]
106 ; X64-NEXT: vmovaps %zmm0, %zmm4 # encoding: [0x62,0xf1,0x7c,0x48,0x28,0xe0]
107 ; X64-NEXT: vpdpwssd %zmm2, %zmm1, %zmm4 # encoding: [0x62,0xf2,0x75,0x48,0x52,0xe2]
108 ; X64-NEXT: vpdpwssd %zmm2, %zmm1, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xc9,0x52,0xc2]
109 ; X64-NEXT: vpaddd %zmm0, %zmm4, %zmm0 # encoding: [0x62,0xf1,0x5d,0x48,0xfe,0xc0]
110 ; X64-NEXT: vpaddd %zmm0, %zmm3, %zmm0 # encoding: [0x62,0xf1,0x65,0x48,0xfe,0xc0]
111 ; X64-NEXT: retq # encoding: [0xc3]
112 %x2 = load <16 x i32>, <16 x i32>* %x2p
113 %1 = call <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2)
114 %2 = bitcast i16 %x3 to <16 x i1>
115 %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> %x0
116 %4 = call <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x4)
117 %5 = call <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x4)
118 %6 = bitcast i16 %x3 to <16 x i1>
119 %7 = select <16 x i1> %6, <16 x i32> %5, <16 x i32> zeroinitializer
120 %res3 = add <16 x i32> %3, %4
121 %res4 = add <16 x i32> %7, %res3
125 declare <16 x i32> @llvm.x86.avx512.vpdpwssds.512(<16 x i32>, <16 x i32>, <16 x i32>)
127 define <16 x i32>@test_int_x86_avx512_mask_vpdpwssds_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32>* %x2p, <16 x i32> %x4, i16 %x3) {
128 ; X86-LABEL: test_int_x86_avx512_mask_vpdpwssds_512:
130 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
131 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x08]
132 ; X86-NEXT: vmovaps %zmm0, %zmm3 # encoding: [0x62,0xf1,0x7c,0x48,0x28,0xd8]
133 ; X86-NEXT: vpdpwssds (%eax), %zmm1, %zmm3 {%k1} # encoding: [0x62,0xf2,0x75,0x49,0x53,0x18]
134 ; X86-NEXT: vmovaps %zmm0, %zmm4 # encoding: [0x62,0xf1,0x7c,0x48,0x28,0xe0]
135 ; X86-NEXT: vpdpwssds %zmm2, %zmm1, %zmm4 # encoding: [0x62,0xf2,0x75,0x48,0x53,0xe2]
136 ; X86-NEXT: vpdpwssds %zmm2, %zmm1, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xc9,0x53,0xc2]
137 ; X86-NEXT: vpaddd %zmm0, %zmm4, %zmm0 # encoding: [0x62,0xf1,0x5d,0x48,0xfe,0xc0]
138 ; X86-NEXT: vpaddd %zmm0, %zmm3, %zmm0 # encoding: [0x62,0xf1,0x65,0x48,0xfe,0xc0]
139 ; X86-NEXT: retl # encoding: [0xc3]
141 ; X64-LABEL: test_int_x86_avx512_mask_vpdpwssds_512:
143 ; X64-NEXT: kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce]
144 ; X64-NEXT: vmovaps %zmm0, %zmm3 # encoding: [0x62,0xf1,0x7c,0x48,0x28,0xd8]
145 ; X64-NEXT: vpdpwssds (%rdi), %zmm1, %zmm3 {%k1} # encoding: [0x62,0xf2,0x75,0x49,0x53,0x1f]
146 ; X64-NEXT: vmovaps %zmm0, %zmm4 # encoding: [0x62,0xf1,0x7c,0x48,0x28,0xe0]
147 ; X64-NEXT: vpdpwssds %zmm2, %zmm1, %zmm4 # encoding: [0x62,0xf2,0x75,0x48,0x53,0xe2]
148 ; X64-NEXT: vpdpwssds %zmm2, %zmm1, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xc9,0x53,0xc2]
149 ; X64-NEXT: vpaddd %zmm0, %zmm4, %zmm0 # encoding: [0x62,0xf1,0x5d,0x48,0xfe,0xc0]
150 ; X64-NEXT: vpaddd %zmm0, %zmm3, %zmm0 # encoding: [0x62,0xf1,0x65,0x48,0xfe,0xc0]
151 ; X64-NEXT: retq # encoding: [0xc3]
152 %x2 = load <16 x i32>, <16 x i32>* %x2p
153 %1 = call <16 x i32> @llvm.x86.avx512.vpdpwssds.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2)
154 %2 = bitcast i16 %x3 to <16 x i1>
155 %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> %x0
156 %4 = call <16 x i32> @llvm.x86.avx512.vpdpwssds.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x4)
157 %5 = call <16 x i32> @llvm.x86.avx512.vpdpwssds.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x4)
158 %6 = bitcast i16 %x3 to <16 x i1>
159 %7 = select <16 x i1> %6, <16 x i32> %5, <16 x i32> zeroinitializer
160 %res3 = add <16 x i32> %3, %4
161 %res4 = add <16 x i32> %7, %res3