1 /*===-------- avxvnniint8intrin.h - AVXVNNIINT8 intrinsics -----------===
3 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 * See https://llvm.org/LICENSE.txt for license information.
5 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 *===-----------------------------------------------------------------------===
11 "Never use <avxvnniint8intrin.h> directly; include <immintrin.h> instead."
14 #ifndef __AVXVNNIINT8INTRIN_H
15 #define __AVXVNNIINT8INTRIN_H
17 /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
18 /// corresponding signed 8-bit integers in \a __B, producing 4 intermediate
19 /// signed 16-bit results. Sum these 4 results with the corresponding
20 /// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
22 /// \headerfile <x86intrin.h>
25 /// _mm_dpbssd_epi32(__m128i __W, __m128i __A, __m128i __B);
28 /// This intrinsic corresponds to the \c VPDPBSSD instruction.
31 /// A 128-bit vector of [16 x char].
33 /// A 128-bit vector of [16 x char].
35 /// A 128-bit vector of [4 x int].
39 /// tmp1.word := SignExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j])
40 /// tmp2.word := SignExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1])
41 /// tmp3.word := SignExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2])
42 /// tmp4.word := SignExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3])
43 /// dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
47 #define _mm_dpbssd_epi32(__W, __A, __B) \
48 ((__m128i)__builtin_ia32_vpdpbssd128((__v4si)(__W), (__v4si)(__A), \
51 /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
52 /// corresponding signed 8-bit integers in \a __B, producing 4 intermediate
53 /// signed 16-bit results. Sum these 4 results with the corresponding
54 /// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
56 /// \headerfile <x86intrin.h>
59 /// _mm256_dpbssd_epi32(__m256i __W, __m256i __A, __m256i __B);
62 /// This intrinsic corresponds to the \c VPDPBSSD instruction.
65 /// A 256-bit vector of [32 x char].
67 /// A 256-bit vector of [32 x char].
69 /// A 256-bit vector of [8 x int].
73 /// tmp1.word := SignExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j])
74 /// tmp2.word := SignExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1])
75 /// tmp3.word := SignExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2])
76 /// tmp4.word := SignExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3])
77 /// dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
81 #define _mm256_dpbssd_epi32(__W, __A, __B) \
82 ((__m256i)__builtin_ia32_vpdpbssd256((__v8si)(__W), (__v8si)(__A), \
85 /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
86 /// corresponding signed 8-bit integers in \a __B, producing 4 intermediate
87 /// signed 16-bit results. Sum these 4 results with the corresponding
88 /// 32-bit integer in \a __W with signed saturation, and store the packed
89 /// 32-bit results in \a dst.
91 /// \headerfile <x86intrin.h>
94 /// _mm_dpbssds_epi32(__m128i __W, __m128i __A, __m128i __B);
97 /// This intrinsic corresponds to the \c VPDPBSSDS instruction.
100 /// A 128-bit vector of [16 x char].
102 /// A 128-bit vector of [16 x char].
104 /// A 128-bit vector of [4 x int].
106 /// \code{.operation}
108 /// tmp1.word := SignExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j])
109 /// tmp2.word := SignExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1])
110 /// tmp3.word := SignExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2])
111 /// tmp4.word := SignExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3])
112 /// dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
114 /// dst[MAX:128] := 0
116 #define _mm_dpbssds_epi32(__W, __A, __B) \
117 ((__m128i)__builtin_ia32_vpdpbssds128((__v4si)(__W), (__v4si)(__A), \
120 /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
121 /// corresponding signed 8-bit integers in \a __B, producing 4 intermediate
122 /// signed 16-bit results. Sum these 4 results with the corresponding
123 /// 32-bit integer in \a __W with signed saturation, and store the packed
124 /// 32-bit results in \a dst.
126 /// \headerfile <x86intrin.h>
129 /// _mm256_dpbssds_epi32(__m256i __W, __m256i __A, __m256i __B);
132 /// This intrinsic corresponds to the \c VPDPBSSDS instruction.
135 /// A 256-bit vector of [32 x char].
137 /// A 256-bit vector of [32 x char].
139 /// A 256-bit vector of [8 x int].
141 /// \code{.operation}
143 /// tmp1.word := SignExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j])
144 /// tmp2.word := SignExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1])
145 /// tmp3.word := SignExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2])
146 /// tmp4.word := SignExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3])
147 /// dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
149 /// dst[MAX:256] := 0
151 #define _mm256_dpbssds_epi32(__W, __A, __B) \
152 ((__m256i)__builtin_ia32_vpdpbssds256((__v8si)(__W), (__v8si)(__A), \
155 /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
156 /// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
157 /// signed 16-bit results. Sum these 4 results with the corresponding
158 /// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
160 /// \headerfile <x86intrin.h>
163 /// _mm_dpbsud_epi32(__m128i __W, __m128i __A, __m128i __B);
166 /// This intrinsic corresponds to the \c VPDPBSUD instruction.
169 /// A 128-bit vector of [16 x char].
171 /// A 128-bit vector of [16 x unsigned char].
173 /// A 128-bit vector of [4 x int].
175 /// \code{.operation}
177 /// tmp1.word := Signed(SignExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j]))
178 /// tmp2.word := Signed(SignExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1]))
179 /// tmp3.word := Signed(SignExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2]))
180 /// tmp4.word := Signed(SignExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3]))
181 /// dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
183 /// dst[MAX:128] := 0
185 #define _mm_dpbsud_epi32(__W, __A, __B) \
186 ((__m128i)__builtin_ia32_vpdpbsud128((__v4si)(__W), (__v4si)(__A), \
189 /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
190 /// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
191 /// signed 16-bit results. Sum these 4 results with the corresponding
192 /// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
194 /// \headerfile <x86intrin.h>
197 /// _mm256_dpbsud_epi32(__m256i __W, __m256i __A, __m256i __B);
200 /// This intrinsic corresponds to the \c VPDPBSUD instruction.
203 /// A 256-bit vector of [32 x char].
205 /// A 256-bit vector of [32 x unsigned char].
207 /// A 256-bit vector of [8 x int].
209 /// \code{.operation}
211 /// tmp1.word := Signed(SignExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j]))
212 /// tmp2.word := Signed(SignExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1]))
213 /// tmp3.word := Signed(SignExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2]))
214 /// tmp4.word := Signed(SignExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3]))
215 /// dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
217 /// dst[MAX:256] := 0
219 #define _mm256_dpbsud_epi32(__W, __A, __B) \
220 ((__m256i)__builtin_ia32_vpdpbsud256((__v8si)(__W), (__v8si)(__A), \
223 /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
224 /// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
225 /// signed 16-bit results. Sum these 4 results with the corresponding
226 /// 32-bit integer in \a __W with signed saturation, and store the packed
227 /// 32-bit results in \a dst.
229 /// \headerfile <x86intrin.h>
232 /// _mm_dpbsuds_epi32(__m128i __W, __m128i __A, __m128i __B);
235 /// This intrinsic corresponds to the \c VPDPBSUDS instruction.
238 /// A 128-bit vector of [16 x char].
240 /// A 128-bit vector of [16 x unsigned char].
242 /// A 128-bit vector of [4 x int].
244 /// \code{.operation}
246 /// tmp1.word := Signed(SignExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j]))
247 /// tmp2.word := Signed(SignExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1]))
248 /// tmp3.word := Signed(SignExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2]))
249 /// tmp4.word := Signed(SignExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3]))
250 /// dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
252 /// dst[MAX:128] := 0
254 #define _mm_dpbsuds_epi32(__W, __A, __B) \
255 ((__m128i)__builtin_ia32_vpdpbsuds128((__v4si)(__W), (__v4si)(__A), \
258 /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
259 /// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
260 /// signed 16-bit results. Sum these 4 results with the corresponding
261 /// 32-bit integer in \a __W with signed saturation, and store the packed
262 /// 32-bit results in \a dst.
264 /// \headerfile <x86intrin.h>
267 /// _mm256_dpbsuds_epi32(__m256i __W, __m256i __A, __m256i __B);
270 /// This intrinsic corresponds to the \c VPDPBSUDS instruction.
273 /// A 256-bit vector of [32 x char].
275 /// A 256-bit vector of [32 x unsigned char].
277 /// A 256-bit vector of [8 x int].
279 /// \code{.operation}
281 /// tmp1.word := Signed(SignExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j]))
282 /// tmp2.word := Signed(SignExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1]))
283 /// tmp3.word := Signed(SignExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2]))
284 /// tmp4.word := Signed(SignExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3]))
285 /// dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
287 /// dst[MAX:256] := 0
289 #define _mm256_dpbsuds_epi32(__W, __A, __B) \
290 ((__m256i)__builtin_ia32_vpdpbsuds256((__v8si)(__W), (__v8si)(__A), \
293 /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with
294 /// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
295 /// unsigned 16-bit results. Sum these 4 results with the corresponding
296 /// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
298 /// \headerfile <x86intrin.h>
301 /// _mm_dpbuud_epi32(__m128i __W, __m128i __A, __m128i __B);
304 /// This intrinsic corresponds to the \c VPDPBUUD instruction.
307 /// A 128-bit vector of [16 x unsigned char].
309 /// A 128-bit vector of [16 x unsigned char].
311 /// A 128-bit vector of [4 x int].
313 /// \code{.operation}
315 /// tmp1.word := ZeroExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j])
316 /// tmp2.word := ZeroExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1])
317 /// tmp3.word := ZeroExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2])
318 /// tmp4.word := ZeroExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3])
319 /// dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
321 /// dst[MAX:128] := 0
323 #define _mm_dpbuud_epi32(__W, __A, __B) \
324 ((__m128i)__builtin_ia32_vpdpbuud128((__v4si)(__W), (__v4si)(__A), \
327 /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with
328 /// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
329 /// unsigned 16-bit results. Sum these 4 results with the corresponding
330 /// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
332 /// \headerfile <x86intrin.h>
335 /// _mm256_dpbuud_epi32(__m256i __W, __m256i __A, __m256i __B);
338 /// This intrinsic corresponds to the \c VPDPBUUD instruction.
341 /// A 256-bit vector of [32 x unsigned char].
343 /// A 256-bit vector of [32 x unsigned char].
345 /// A 256-bit vector of [8 x int].
347 /// \code{.operation}
349 /// tmp1.word := ZeroExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j])
350 /// tmp2.word := ZeroExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1])
351 /// tmp3.word := ZeroExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2])
352 /// tmp4.word := ZeroExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3])
353 /// dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
355 /// dst[MAX:256] := 0
357 #define _mm256_dpbuud_epi32(__W, __A, __B) \
358 ((__m256i)__builtin_ia32_vpdpbuud256((__v8si)(__W), (__v8si)(__A), \
361 /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with
362 /// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
363 /// unsigned 16-bit results. Sum these 4 results with the corresponding
364 /// 32-bit integer in \a __W with unsigned saturation, and store the packed
365 /// 32-bit results in \a dst.
367 /// \headerfile <x86intrin.h>
370 /// _mm_dpbuuds_epi32(__m128i __W, __m128i __A, __m128i __B);
373 /// This intrinsic corresponds to the \c VPDPBUUDS instruction.
376 /// A 128-bit vector of [16 x unsigned char].
378 /// A 128-bit vector of [16 x unsigned char].
380 /// A 128-bit vector of [4 x int].
382 /// \code{.operation}
384 /// tmp1.word := ZeroExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j])
385 /// tmp2.word := ZeroExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1])
386 /// tmp3.word := ZeroExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2])
387 /// tmp4.word := ZeroExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3])
388 /// dst.dword[j] := UNSIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
390 /// dst[MAX:128] := 0
392 #define _mm_dpbuuds_epi32(__W, __A, __B) \
393 ((__m128i)__builtin_ia32_vpdpbuuds128((__v4si)(__W), (__v4si)(__A), \
396 /// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
397 /// unsigned 16-bit results. Sum these 4 results with the corresponding
398 /// 32-bit integer in \a __W with unsigned saturation, and store the packed
399 /// 32-bit results in \a dst.
401 /// \headerfile <x86intrin.h>
404 /// _mm256_dpbuuds_epi32(__m256i __W, __m256i __A, __m256i __B);
407 /// This intrinsic corresponds to the \c VPDPBUUDS instruction.
410 /// A 256-bit vector of [32 x unsigned char].
412 /// A 256-bit vector of [32 x unsigned char].
414 /// A 256-bit vector of [8 x int].
416 /// \code{.operation}
418 /// tmp1.word := ZeroExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j])
419 /// tmp2.word := ZeroExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1])
420 /// tmp3.word := ZeroExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2])
421 /// tmp4.word := ZeroExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3])
422 /// dst.dword[j] := UNSIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
424 /// dst[MAX:256] := 0
426 #define _mm256_dpbuuds_epi32(__W, __A, __B) \
427 ((__m256i)__builtin_ia32_vpdpbuuds256((__v8si)(__W), (__v8si)(__A), \
430 #endif // __AVXVNNIINT8INTRIN_H