; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avxvnni | FileCheck %s --check-prefixes=AVXVNNI
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vnni | FileCheck %s --check-prefixes=AVX512,AVX512VNNI
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vnni -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VLVNNI

define i32 @no_dpbusd(ptr%a, ptr%b, i32 %c, i32 %n) {
; AVXVNNI-LABEL: no_dpbusd:
; AVXVNNI:       # %bb.0: # %entry
; AVXVNNI-NEXT:    vpmovzxbw {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
; AVXVNNI-NEXT:    vpmovzxbw {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
; AVXVNNI-NEXT:    vpmaddwd %ymm0, %ymm1, %ymm0
; AVXVNNI-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVXVNNI-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVXVNNI-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVXVNNI-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVXVNNI-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVXVNNI-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVXVNNI-NEXT:    vmovd %xmm0, %eax
; AVXVNNI-NEXT:    addl %edx, %eax
; AVXVNNI-NEXT:    vzeroupper
; AVXVNNI-NEXT:    retq
;
; AVX512-LABEL: no_dpbusd:
; AVX512:       # %bb.0: # %entry
; AVX512-NEXT:    vpmovzxbw {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
; AVX512-NEXT:    vpmovzxbw {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
; AVX512-NEXT:    vpmaddwd %ymm0, %ymm1, %ymm0
; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmovd %xmm0, %eax
; AVX512-NEXT:    addl %edx, %eax
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
entry:
  %0 = load <16 x i8>, ptr %a, align 16
  %1 = zext <16 x i8> %0 to <16 x i32>
  %2 = load <16 x i8>, ptr %b, align 16
  %3 = zext <16 x i8> %2 to <16 x i32>
  %4 = mul nsw <16 x i32> %3, %1
  %5 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %4)
  %op.extra = add nsw i32 %5, %c
  ret i32 %op.extra
}

define i32 @vpdpbusd_mutate(ptr%a, ptr%b, i32 %c, i32 %n) {
; AVXVNNI-LABEL: vpdpbusd_mutate:
; AVXVNNI:       # %bb.0: # %entry
; AVXVNNI-NEXT:    vmovdqa (%rsi), %xmm0
; AVXVNNI-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVXVNNI-NEXT:    {vex} vpdpbusd (%rdi), %xmm0, %xmm1
; AVXVNNI-NEXT:    vpshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; AVXVNNI-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
; AVXVNNI-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVXVNNI-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVXVNNI-NEXT:    vmovd %xmm0, %eax
; AVXVNNI-NEXT:    addl %edx, %eax
; AVXVNNI-NEXT:    retq
;
; AVX512VNNI-LABEL: vpdpbusd_mutate:
; AVX512VNNI:       # %bb.0: # %entry
; AVX512VNNI-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512VNNI-NEXT:    vmovdqa (%rsi), %xmm1
; AVX512VNNI-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX512VNNI-NEXT:    vpdpbusd %zmm0, %zmm1, %zmm2
; AVX512VNNI-NEXT:    vpshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
; AVX512VNNI-NEXT:    vpaddd %xmm0, %xmm2, %xmm0
; AVX512VNNI-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX512VNNI-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX512VNNI-NEXT:    vmovd %xmm0, %eax
; AVX512VNNI-NEXT:    addl %edx, %eax
; AVX512VNNI-NEXT:    vzeroupper
; AVX512VNNI-NEXT:    retq
;
; AVX512VLVNNI-LABEL: vpdpbusd_mutate:
; AVX512VLVNNI:       # %bb.0: # %entry
; AVX512VLVNNI-NEXT:    vmovdqa (%rsi), %xmm0
; AVX512VLVNNI-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512VLVNNI-NEXT:    vpdpbusd (%rdi), %xmm0, %xmm1
; AVX512VLVNNI-NEXT:    vpshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; AVX512VLVNNI-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
; AVX512VLVNNI-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX512VLVNNI-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX512VLVNNI-NEXT:    vmovd %xmm0, %eax
; AVX512VLVNNI-NEXT:    addl %edx, %eax
; AVX512VLVNNI-NEXT:    retq
entry:
  %0 = load <16 x i8>, ptr %a, align 16
  %1 = sext <16 x i8> %0 to <16 x i32>
  %2 = load <16 x i8>, ptr %b, align 16
  %3 = zext <16 x i8> %2 to <16 x i32>
  %4 = mul nsw <16 x i32> %3, %1
  %5 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %4)
  %op.extra = add nsw i32 %5, %c
  ret i32 %op.extra
}

define i32 @mul_zext(ptr%a, ptr%b, i32 %c, i32 %n) {
; AVXVNNI-LABEL: mul_zext:
; AVXVNNI:       # %bb.0: # %entry
; AVXVNNI-NEXT:    vpmovzxbw {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
; AVXVNNI-NEXT:    vpmovsxbw (%rsi), %ymm1
; AVXVNNI-NEXT:    vpmullw %ymm0, %ymm1, %ymm0
; AVXVNNI-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVXVNNI-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVXVNNI-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVXVNNI-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
; AVXVNNI-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVXVNNI-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVXVNNI-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVXVNNI-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVXVNNI-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVXVNNI-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVXVNNI-NEXT:    vmovd %xmm0, %eax
; AVXVNNI-NEXT:    addl %edx, %eax
; AVXVNNI-NEXT:    vzeroupper
; AVXVNNI-NEXT:    retq
;
; AVX512-LABEL: mul_zext:
; AVX512:       # %bb.0: # %entry
; AVX512-NEXT:    vpmovzxbw {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
; AVX512-NEXT:    vpmovsxbw (%rsi), %ymm1
; AVX512-NEXT:    vpmullw %ymm0, %ymm1, %ymm0
; AVX512-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmovd %xmm0, %eax
; AVX512-NEXT:    addl %edx, %eax
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
entry:
  %0 = load <16 x i8>, ptr %a, align 16
  %1 = zext <16 x i8> %0 to <16 x i16>
  %2 = load <16 x i8>, ptr %b, align 16
  %3 = sext <16 x i8> %2 to <16 x i16>
  %4 = mul nsw <16 x i16> %3, %1
; We can't combine to vpdpbusd for zext, because each of the 4 multiplies
; done by vpdpbusd compute a signed 16-bit product that will be sign extended
; before adding into the accumulator.
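; For example (illustrative values only, not part of the checked output): with
; a[0] = 255 (zero extended) and b[0] = -1 (sign extended), the i16 product is
; -255; the IR below zero extends it to 65281, whereas vpdpbusd would add the
; sign-extended value -255, so the two results differ.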
  %5 = zext <16 x i16> %4 to <16 x i32>
  %6 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %5)
  %op.extra = add nsw i32 %6, %c
  ret i32 %op.extra
}

define i32 @mul_sext(ptr%a, ptr%b, i32 %c, i32 %n) {
; AVXVNNI-LABEL: mul_sext:
; AVXVNNI:       # %bb.0: # %entry
; AVXVNNI-NEXT:    vpmovzxbw {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
; AVXVNNI-NEXT:    vpmovsxbw (%rsi), %ymm1
; AVXVNNI-NEXT:    vpmullw %ymm0, %ymm1, %ymm0
; AVXVNNI-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVXVNNI-NEXT:    vpmovsxwd %xmm1, %ymm1
; AVXVNNI-NEXT:    vpmovsxwd %xmm0, %ymm0
; AVXVNNI-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
; AVXVNNI-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVXVNNI-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVXVNNI-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVXVNNI-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVXVNNI-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVXVNNI-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVXVNNI-NEXT:    vmovd %xmm0, %eax
; AVXVNNI-NEXT:    addl %edx, %eax
; AVXVNNI-NEXT:    vzeroupper
; AVXVNNI-NEXT:    retq
;
; AVX512-LABEL: mul_sext:
; AVX512:       # %bb.0: # %entry
; AVX512-NEXT:    vpmovzxbw {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
; AVX512-NEXT:    vpmovsxbw (%rsi), %ymm1
; AVX512-NEXT:    vpmullw %ymm0, %ymm1, %ymm0
; AVX512-NEXT:    vpmovsxwd %ymm0, %zmm0
; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmovd %xmm0, %eax
; AVX512-NEXT:    addl %edx, %eax
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
entry:
  %0 = load <16 x i8>, ptr %a, align 16
  %1 = zext <16 x i8> %0 to <16 x i16>
  %2 = load <16 x i8>, ptr %b, align 16
  %3 = sext <16 x i8> %2 to <16 x i16>
  %4 = mul nsw <16 x i16> %3, %1
; We also need to verify that the multiply has at least 2x the number of bits
; of the input. We shouldn't match
; (sign_extend (mul (vXi9 (zext (vXi8 X))), (vXi9 (zext (vXi8 Y)))).
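; For example (illustrative only): a 9-bit multiply of two zero-extended i8
; values can wrap (255 * 255 = 65025 wraps to 1 in i9), so sign extending
; that i9 product would not give the true product.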
  %5 = sext <16 x i16> %4 to <16 x i32>
  %6 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %5)
  %op.extra = add nsw i32 %6, %c
  ret i32 %op.extra
}

define i32 @vpdpbusd_512(ptr%a, ptr%b, i32 %c, i32 %n) {
; AVXVNNI-LABEL: vpdpbusd_512:
; AVXVNNI:       # %bb.0: # %entry
; AVXVNNI-NEXT:    vmovdqa (%rdi), %xmm0
; AVXVNNI-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVXVNNI-NEXT:    {vex} vpdpbusd (%rsi), %xmm0, %xmm1
; AVXVNNI-NEXT:    vpshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; AVXVNNI-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
; AVXVNNI-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVXVNNI-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVXVNNI-NEXT:    vmovd %xmm0, %eax
; AVXVNNI-NEXT:    addl %edx, %eax
; AVXVNNI-NEXT:    retq
;
; AVX512VNNI-LABEL: vpdpbusd_512:
; AVX512VNNI:       # %bb.0: # %entry
; AVX512VNNI-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512VNNI-NEXT:    vmovdqa (%rsi), %xmm1
; AVX512VNNI-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX512VNNI-NEXT:    vpdpbusd %zmm1, %zmm0, %zmm2
; AVX512VNNI-NEXT:    vpshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
; AVX512VNNI-NEXT:    vpaddd %xmm0, %xmm2, %xmm0
; AVX512VNNI-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX512VNNI-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX512VNNI-NEXT:    vmovd %xmm0, %eax
; AVX512VNNI-NEXT:    addl %edx, %eax
; AVX512VNNI-NEXT:    vzeroupper
; AVX512VNNI-NEXT:    retq
;
; AVX512VLVNNI-LABEL: vpdpbusd_512:
; AVX512VLVNNI:       # %bb.0: # %entry
; AVX512VLVNNI-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512VLVNNI-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512VLVNNI-NEXT:    vpdpbusd (%rsi), %xmm0, %xmm1
; AVX512VLVNNI-NEXT:    vpshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; AVX512VLVNNI-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
; AVX512VLVNNI-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX512VLVNNI-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX512VLVNNI-NEXT:    vmovd %xmm0, %eax
; AVX512VLVNNI-NEXT:    addl %edx, %eax
; AVX512VLVNNI-NEXT:    retq
entry:
  %0 = load <16 x i8>, ptr %a, align 16
  %1 = zext <16 x i8> %0 to <16 x i32>
  %2 = load <16 x i8>, ptr %b, align 16
  %3 = sext <16 x i8> %2 to <16 x i32>
  %4 = mul nsw <16 x i32> %3, %1
  %5 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %4)
  %op.extra = add nsw i32 %5, %c
  ret i32 %op.extra
}

declare i32 @llvm.vector.reduce.add.v16i32(<16 x i32>)

define i32 @vpdpbusd_256(ptr%a, ptr%b, i32 %c, i32 %n) {
; AVXVNNI-LABEL: vpdpbusd_256:
; AVXVNNI:       # %bb.0: # %entry
; AVXVNNI-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
; AVXVNNI-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
; AVXVNNI-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVXVNNI-NEXT:    {vex} vpdpbusd %xmm0, %xmm1, %xmm2
; AVXVNNI-NEXT:    vpshufd {{.*#+}} xmm0 = xmm2[1,1,1,1]
; AVXVNNI-NEXT:    vpaddd %xmm0, %xmm2, %xmm0
; AVXVNNI-NEXT:    vmovd %xmm0, %eax
; AVXVNNI-NEXT:    addl %edx, %eax
; AVXVNNI-NEXT:    retq
;
; AVX512VNNI-LABEL: vpdpbusd_256:
; AVX512VNNI:       # %bb.0: # %entry
; AVX512VNNI-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX512VNNI-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX512VNNI-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX512VNNI-NEXT:    vpdpbusd %zmm0, %zmm1, %zmm2
; AVX512VNNI-NEXT:    vpshufd {{.*#+}} xmm0 = xmm2[1,1,1,1]
; AVX512VNNI-NEXT:    vpaddd %xmm0, %xmm2, %xmm0
; AVX512VNNI-NEXT:    vmovd %xmm0, %eax
; AVX512VNNI-NEXT:    addl %edx, %eax
; AVX512VNNI-NEXT:    vzeroupper
; AVX512VNNI-NEXT:    retq
;
; AVX512VLVNNI-LABEL: vpdpbusd_256:
; AVX512VLVNNI:       # %bb.0: # %entry
; AVX512VLVNNI-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX512VLVNNI-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX512VLVNNI-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX512VLVNNI-NEXT:    vpdpbusd %xmm0, %xmm1, %xmm2
; AVX512VLVNNI-NEXT:    vpshufd {{.*#+}} xmm0 = xmm2[1,1,1,1]
; AVX512VLVNNI-NEXT:    vpaddd %xmm0, %xmm2, %xmm0
; AVX512VLVNNI-NEXT:    vmovd %xmm0, %eax
; AVX512VLVNNI-NEXT:    addl %edx, %eax
; AVX512VLVNNI-NEXT:    retq
entry:
  %0 = load <8 x i8>, ptr %a, align 8
  %1 = zext <8 x i8> %0 to <8 x i32>
  %2 = load <8 x i8>, ptr %b, align 8
  %3 = sext <8 x i8> %2 to <8 x i32>
  %4 = mul nsw <8 x i32> %3, %1
  %5 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %4)
  %op.extra = add nsw i32 %5, %c
  ret i32 %op.extra
}

declare i32 @llvm.vector.reduce.add.v8i32(<8 x i32>)

define i32 @vpdpbusd_128(ptr%a, ptr%b, i32 %c, i32 %n) {
; AVXVNNI-LABEL: vpdpbusd_128:
; AVXVNNI:       # %bb.0: # %entry
; AVXVNNI-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
; AVXVNNI-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
; AVXVNNI-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVXVNNI-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5,6,7]
; AVXVNNI-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3,4,5,6,7]
; AVXVNNI-NEXT:    {vex} vpdpbusd %xmm1, %xmm0, %xmm2
; AVXVNNI-NEXT:    vmovd %xmm2, %eax
; AVXVNNI-NEXT:    addl %edx, %eax
; AVXVNNI-NEXT:    retq
;
; AVX512VNNI-LABEL: vpdpbusd_128:
; AVX512VNNI:       # %bb.0: # %entry
; AVX512VNNI-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX512VNNI-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512VNNI-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
; AVX512VNNI-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX512VNNI-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3,4,5,6,7]
; AVX512VNNI-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX512VNNI-NEXT:    vpdpbusd %zmm0, %zmm1, %zmm2
; AVX512VNNI-NEXT:    vmovd %xmm2, %eax
; AVX512VNNI-NEXT:    addl %edx, %eax
; AVX512VNNI-NEXT:    vzeroupper
; AVX512VNNI-NEXT:    retq
;
; AVX512VLVNNI-LABEL: vpdpbusd_128:
; AVX512VLVNNI:       # %bb.0: # %entry
; AVX512VLVNNI-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX512VLVNNI-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX512VLVNNI-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX512VLVNNI-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5,6,7]
; AVX512VLVNNI-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3,4,5,6,7]
; AVX512VLVNNI-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX512VLVNNI-NEXT:    vpdpbusd %xmm1, %xmm0, %xmm2
; AVX512VLVNNI-NEXT:    vmovd %xmm2, %eax
; AVX512VLVNNI-NEXT:    addl %edx, %eax
; AVX512VLVNNI-NEXT:    retq
entry:
  %0 = load <4 x i8>, ptr %a, align 8
  %1 = zext <4 x i8> %0 to <4 x i32>
  %2 = load <4 x i8>, ptr %b, align 8
  %3 = sext <4 x i8> %2 to <4 x i32>
  %4 = mul nsw <4 x i32> %3, %1
  %5 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %4)
  %op.extra = add nsw i32 %5, %c
  ret i32 %op.extra
}

declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)

define i32 @vpdpbusd_2xi32(ptr%a, ptr%b, i32 %c, i32 %n) {
; AVXVNNI-LABEL: vpdpbusd_2xi32:
; AVXVNNI:       # %bb.0: # %entry
; AVXVNNI-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
; AVXVNNI-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
; AVXVNNI-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVXVNNI-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4,5,6,7]
; AVXVNNI-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3,4,5,6,7]
; AVXVNNI-NEXT:    {vex} vpdpbusd %xmm1, %xmm0, %xmm2
; AVXVNNI-NEXT:    vmovd %xmm2, %eax
; AVXVNNI-NEXT:    addl %edx, %eax
; AVXVNNI-NEXT:    retq
;
; AVX512VNNI-LABEL: vpdpbusd_2xi32:
; AVX512VNNI:       # %bb.0: # %entry
; AVX512VNNI-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX512VNNI-NEXT:    vmovdqa {{.*#+}} xmm1 = [65535,0,0,0]
; AVX512VNNI-NEXT:    vpandq %zmm1, %zmm0, %zmm0
; AVX512VNNI-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX512VNNI-NEXT:    vpandq %zmm1, %zmm2, %zmm1
; AVX512VNNI-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX512VNNI-NEXT:    vpdpbusd %zmm0, %zmm1, %zmm2
; AVX512VNNI-NEXT:    vmovd %xmm2, %eax
; AVX512VNNI-NEXT:    addl %edx, %eax
; AVX512VNNI-NEXT:    vzeroupper
; AVX512VNNI-NEXT:    retq
;
; AVX512VLVNNI-LABEL: vpdpbusd_2xi32:
; AVX512VLVNNI:       # %bb.0: # %entry
; AVX512VLVNNI-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX512VLVNNI-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX512VLVNNI-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX512VLVNNI-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4,5,6,7]
; AVX512VLVNNI-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3,4,5,6,7]
; AVX512VLVNNI-NEXT:    vpdpbusd %xmm1, %xmm0, %xmm2
; AVX512VLVNNI-NEXT:    vmovd %xmm2, %eax
; AVX512VLVNNI-NEXT:    addl %edx, %eax
; AVX512VLVNNI-NEXT:    retq
entry:
  %0 = load <2 x i8>, ptr %a, align 8
  %1 = zext <2 x i8> %0 to <2 x i32>
  %2 = load <2 x i8>, ptr %b, align 8
  %3 = sext <2 x i8> %2 to <2 x i32>
  %4 = mul nsw <2 x i32> %3, %1
  %5 = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %4)
  %op.extra = add nsw i32 %5, %c
  ret i32 %op.extra
}

declare i32 @llvm.vector.reduce.add.v2i32(<2 x i32>)

define i32 @vpdpbusd_32xi32(ptr%a, ptr%b, i32 %c, i32 %n) {
; AVXVNNI-LABEL: vpdpbusd_32xi32:
; AVXVNNI:       # %bb.0: # %entry
; AVXVNNI-NEXT:    vmovdqu (%rdi), %ymm0
; AVXVNNI-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVXVNNI-NEXT:    {vex} vpdpbusd (%rsi), %ymm0, %ymm1
; AVXVNNI-NEXT:    vextracti128 $1, %ymm1, %xmm0
; AVXVNNI-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
; AVXVNNI-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVXVNNI-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVXVNNI-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVXVNNI-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVXVNNI-NEXT:    vmovd %xmm0, %eax
; AVXVNNI-NEXT:    addl %edx, %eax
; AVXVNNI-NEXT:    vzeroupper
; AVXVNNI-NEXT:    retq
;
; AVX512VNNI-LABEL: vpdpbusd_32xi32:
; AVX512VNNI:       # %bb.0: # %entry
; AVX512VNNI-NEXT:    vmovdqu (%rdi), %ymm0
; AVX512VNNI-NEXT:    vmovdqu (%rsi), %ymm1
; AVX512VNNI-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX512VNNI-NEXT:    vpdpbusd %zmm1, %zmm0, %zmm2
; AVX512VNNI-NEXT:    vextracti128 $1, %ymm2, %xmm0
; AVX512VNNI-NEXT:    vpaddd %xmm0, %xmm2, %xmm0
; AVX512VNNI-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512VNNI-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX512VNNI-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX512VNNI-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX512VNNI-NEXT:    vmovd %xmm0, %eax
; AVX512VNNI-NEXT:    addl %edx, %eax
; AVX512VNNI-NEXT:    vzeroupper
; AVX512VNNI-NEXT:    retq
;
; AVX512VLVNNI-LABEL: vpdpbusd_32xi32:
; AVX512VLVNNI:       # %bb.0: # %entry
; AVX512VLVNNI-NEXT:    vmovdqu (%rdi), %ymm0
; AVX512VLVNNI-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512VLVNNI-NEXT:    vpdpbusd (%rsi), %ymm0, %ymm1
; AVX512VLVNNI-NEXT:    vextracti128 $1, %ymm1, %xmm0
; AVX512VLVNNI-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
; AVX512VLVNNI-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512VLVNNI-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX512VLVNNI-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX512VLVNNI-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX512VLVNNI-NEXT:    vmovd %xmm0, %eax
; AVX512VLVNNI-NEXT:    addl %edx, %eax
; AVX512VLVNNI-NEXT:    vzeroupper
; AVX512VLVNNI-NEXT:    retq
entry:
  %0 = load <32 x i8>, ptr %a, align 16
  %1 = zext <32 x i8> %0 to <32 x i32>
  %2 = load <32 x i8>, ptr %b, align 16
  %3 = sext <32 x i8> %2 to <32 x i32>
  %4 = mul nsw <32 x i32> %3, %1
  %5 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> %4)
  %op.extra = add nsw i32 %5, %c
  ret i32 %op.extra
}

declare i32 @llvm.vector.reduce.add.v32i32(<32 x i32>)

define i32 @vpdpbusd_64xi32(ptr%a, ptr%b, i32 %c, i32 %n) {
; AVXVNNI-LABEL: vpdpbusd_64xi32:
; AVXVNNI:       # %bb.0: # %entry
; AVXVNNI-NEXT:    vmovdqu (%rdi), %ymm0
; AVXVNNI-NEXT:    vmovdqu 32(%rdi), %ymm1
; AVXVNNI-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVXVNNI-NEXT:    vpxor %xmm3, %xmm3, %xmm3
; AVXVNNI-NEXT:    {vex} vpdpbusd 32(%rsi), %ymm1, %ymm3
; AVXVNNI-NEXT:    {vex} vpdpbusd (%rsi), %ymm0, %ymm2
; AVXVNNI-NEXT:    vpaddd %ymm3, %ymm2, %ymm0
; AVXVNNI-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVXVNNI-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVXVNNI-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVXVNNI-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVXVNNI-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVXVNNI-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVXVNNI-NEXT:    vmovd %xmm0, %eax
; AVXVNNI-NEXT:    addl %edx, %eax
; AVXVNNI-NEXT:    vzeroupper
; AVXVNNI-NEXT:    retq
;
; AVX512-LABEL: vpdpbusd_64xi32:
; AVX512:       # %bb.0: # %entry
; AVX512-NEXT:    vmovdqu64 (%rdi), %zmm0
; AVX512-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512-NEXT:    vpdpbusd (%rsi), %zmm0, %zmm1
; AVX512-NEXT:    vextracti64x4 $1, %zmm1, %ymm0
; AVX512-NEXT:    vpaddd %ymm0, %ymm1, %ymm0
; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmovd %xmm0, %eax
; AVX512-NEXT:    addl %edx, %eax
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
entry:
  %0 = load <64 x i8>, ptr %a, align 16
  %1 = zext <64 x i8> %0 to <64 x i32>
  %2 = load <64 x i8>, ptr %b, align 16
  %3 = sext <64 x i8> %2 to <64 x i32>
  %4 = mul nsw <64 x i32> %3, %1
  %5 = call i32 @llvm.vector.reduce.add.v64i32(<64 x i32> %4)
  %op.extra = add nsw i32 %5, %c
  ret i32 %op.extra
}

declare i32 @llvm.vector.reduce.add.v64i32(<64 x i32>)