1 /*===-------- avxvnniint8intrin.h - AVXVNNIINT8 intrinsics -----------===
3 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 * See https://llvm.org/LICENSE.txt for license information.
5 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 *===-----------------------------------------------------------------------===
11 "Never use <avxvnniint8intrin.h> directly; include <immintrin.h> instead."
14 #ifndef __AVXVNNIINT8INTRIN_H
15 #define __AVXVNNIINT8INTRIN_H
17 /* Define the default attributes for the functions in this file. */
18 #define __DEFAULT_FN_ATTRS256 \
19 __attribute__((__always_inline__, __nodebug__, __target__("avxvnniint8"), \
20 __min_vector_width__(256)))
21 #define __DEFAULT_FN_ATTRS128 \
22 __attribute__((__always_inline__, __nodebug__, __target__("avxvnniint8"), \
23 __min_vector_width__(128)))
25 /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
26 /// corresponding signed 8-bit integers in \a __B, producing 4 intermediate
27 /// signed 16-bit results. Sum these 4 results with the corresponding
28 /// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
30 /// \headerfile <x86intrin.h>
33 /// _mm_dpbssd_epi32(__m128i __W, __m128i __A, __m128i __B);
36 /// This intrinsic corresponds to the \c VPDPBSSD instruction.
39 /// A 128-bit vector of [16 x char].
41 /// A 128-bit vector of [16 x char].
43 /// A 128-bit vector of [4 x int].
47 /// tmp1.word := SignExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j])
48 /// tmp2.word := SignExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1])
49 /// tmp3.word := SignExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2])
50 /// tmp4.word := SignExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3])
51 /// dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
55 static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_dpbssd_epi32(__m128i __W
,
58 return (__m128i
)__builtin_ia32_vpdpbssd128((__v4si
)__W
, (__v4si
)__A
,
62 /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
63 /// corresponding signed 8-bit integers in \a __B, producing 4 intermediate
64 /// signed 16-bit results. Sum these 4 results with the corresponding
65 /// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
67 /// \headerfile <x86intrin.h>
70 /// _mm256_dpbssd_epi32(__m256i __W, __m256i __A, __m256i __B);
73 /// This intrinsic corresponds to the \c VPDPBSSD instruction.
76 /// A 256-bit vector of [32 x char].
78 /// A 256-bit vector of [32 x char].
80 /// A 256-bit vector of [8 x int].
84 /// tmp1.word := SignExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j])
85 /// tmp2.word := SignExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1])
86 /// tmp3.word := SignExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2])
87 /// tmp4.word := SignExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3])
88 /// dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
92 static __inline__ __m256i __DEFAULT_FN_ATTRS256
93 _mm256_dpbssd_epi32(__m256i __W
, __m256i __A
, __m256i __B
) {
94 return (__m256i
)__builtin_ia32_vpdpbssd256((__v8si
)__W
, (__v8si
)__A
,
98 /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
99 /// corresponding signed 8-bit integers in \a __B, producing 4 intermediate
100 /// signed 16-bit results. Sum these 4 results with the corresponding
101 /// 32-bit integer in \a __W with signed saturation, and store the packed
102 /// 32-bit results in \a dst.
104 /// \headerfile <x86intrin.h>
107 /// _mm_dpbssds_epi32(__m128i __W, __m128i __A, __m128i __B);
110 /// This intrinsic corresponds to the \c VPDPBSSDS instruction.
113 /// A 128-bit vector of [16 x char].
115 /// A 128-bit vector of [16 x char].
117 /// A 128-bit vector of [4 x int].
119 /// \code{.operation}
121 /// tmp1.word := SignExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j])
122 /// tmp2.word := SignExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1])
123 /// tmp3.word := SignExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2])
124 /// tmp4.word := SignExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3])
125 /// dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
127 /// dst[MAX:128] := 0
129 static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_dpbssds_epi32(__m128i __W
,
132 return (__m128i
)__builtin_ia32_vpdpbssds128((__v4si
)__W
, (__v4si
)__A
,
136 /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
137 /// corresponding signed 8-bit integers in \a __B, producing 4 intermediate
138 /// signed 16-bit results. Sum these 4 results with the corresponding
139 /// 32-bit integer in \a __W with signed saturation, and store the packed
140 /// 32-bit results in \a dst.
142 /// \headerfile <x86intrin.h>
145 /// _mm256_dpbssds_epi32(__m256i __W, __m256i __A, __m256i __B);
148 /// This intrinsic corresponds to the \c VPDPBSSDS instruction.
151 /// A 256-bit vector of [32 x char].
153 /// A 256-bit vector of [32 x char].
155 /// A 256-bit vector of [8 x int].
157 /// \code{.operation}
159 /// tmp1.word := SignExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j])
160 /// tmp2.word := SignExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1])
161 /// tmp3.word := SignExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2])
162 /// tmp4.word := SignExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3])
163 /// dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
165 /// dst[MAX:256] := 0
167 static __inline__ __m256i __DEFAULT_FN_ATTRS256
168 _mm256_dpbssds_epi32(__m256i __W
, __m256i __A
, __m256i __B
) {
169 return (__m256i
)__builtin_ia32_vpdpbssds256((__v8si
)__W
, (__v8si
)__A
,
173 /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
174 /// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
175 /// signed 16-bit results. Sum these 4 results with the corresponding
176 /// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
178 /// \headerfile <x86intrin.h>
181 /// _mm_dpbsud_epi32(__m128i __W, __m128i __A, __m128i __B);
184 /// This intrinsic corresponds to the \c VPDPBSUD instruction.
187 /// A 128-bit vector of [16 x char].
189 /// A 128-bit vector of [16 x unsigned char].
191 /// A 128-bit vector of [4 x int].
193 /// \code{.operation}
195 /// tmp1.word := Signed(SignExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j]))
196 /// tmp2.word := Signed(SignExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1]))
197 /// tmp3.word := Signed(SignExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2]))
198 /// tmp4.word := Signed(SignExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3]))
199 /// dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
201 /// dst[MAX:128] := 0
203 static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_dpbsud_epi32(__m128i __W
,
206 return (__m128i
)__builtin_ia32_vpdpbsud128((__v4si
)__W
, (__v4si
)__A
,
210 /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
211 /// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
212 /// signed 16-bit results. Sum these 4 results with the corresponding
213 /// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
215 /// \headerfile <x86intrin.h>
218 /// _mm256_dpbsud_epi32(__m256i __W, __m256i __A, __m256i __B);
221 /// This intrinsic corresponds to the \c VPDPBSUD instruction.
224 /// A 256-bit vector of [32 x char].
226 /// A 256-bit vector of [32 x unsigned char].
228 /// A 256-bit vector of [8 x int].
230 /// \code{.operation}
232 /// tmp1.word := Signed(SignExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j]))
233 /// tmp2.word := Signed(SignExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1]))
234 /// tmp3.word := Signed(SignExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2]))
235 /// tmp4.word := Signed(SignExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3]))
236 /// dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
238 /// dst[MAX:256] := 0
240 static __inline__ __m256i __DEFAULT_FN_ATTRS256
241 _mm256_dpbsud_epi32(__m256i __W
, __m256i __A
, __m256i __B
) {
242 return (__m256i
)__builtin_ia32_vpdpbsud256((__v8si
)__W
, (__v8si
)__A
,
246 /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
247 /// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
248 /// signed 16-bit results. Sum these 4 results with the corresponding
249 /// 32-bit integer in \a __W with signed saturation, and store the packed
250 /// 32-bit results in \a dst.
252 /// \headerfile <x86intrin.h>
255 /// _mm_dpbsuds_epi32(__m128i __W, __m128i __A, __m128i __B);
258 /// This intrinsic corresponds to the \c VPDPBSUDS instruction.
261 /// A 128-bit vector of [16 x char].
263 /// A 128-bit vector of [16 x unsigned char].
265 /// A 128-bit vector of [4 x int].
267 /// \code{.operation}
269 /// tmp1.word := Signed(SignExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j]))
270 /// tmp2.word := Signed(SignExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1]))
271 /// tmp3.word := Signed(SignExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2]))
272 /// tmp4.word := Signed(SignExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3]))
273 /// dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
275 /// dst[MAX:128] := 0
277 static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_dpbsuds_epi32(__m128i __W
,
280 return (__m128i
)__builtin_ia32_vpdpbsuds128((__v4si
)__W
, (__v4si
)__A
,
284 /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
285 /// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
286 /// signed 16-bit results. Sum these 4 results with the corresponding
287 /// 32-bit integer in \a __W with signed saturation, and store the packed
288 /// 32-bit results in \a dst.
290 /// \headerfile <x86intrin.h>
293 /// _mm256_dpbsuds_epi32(__m256i __W, __m256i __A, __m256i __B);
296 /// This intrinsic corresponds to the \c VPDPBSUDS instruction.
299 /// A 256-bit vector of [32 x char].
301 /// A 256-bit vector of [32 x unsigned char].
303 /// A 256-bit vector of [8 x int].
305 /// \code{.operation}
307 /// tmp1.word := Signed(SignExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j]))
308 /// tmp2.word := Signed(SignExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1]))
309 /// tmp3.word := Signed(SignExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2]))
310 /// tmp4.word := Signed(SignExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3]))
311 /// dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
313 /// dst[MAX:256] := 0
315 static __inline__ __m256i __DEFAULT_FN_ATTRS256
316 _mm256_dpbsuds_epi32(__m256i __W
, __m256i __A
, __m256i __B
) {
317 return (__m256i
)__builtin_ia32_vpdpbsuds256((__v8si
)__W
, (__v8si
)__A
,
321 /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with
322 /// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
323 /// unsigned 16-bit results. Sum these 4 results with the corresponding
324 /// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
326 /// \headerfile <x86intrin.h>
329 /// _mm_dpbuud_epi32(__m128i __W, __m128i __A, __m128i __B);
332 /// This intrinsic corresponds to the \c VPDPBUUD instruction.
335 /// A 128-bit vector of [16 x unsigned char].
337 /// A 128-bit vector of [16 x unsigned char].
339 /// A 128-bit vector of [4 x int].
341 /// \code{.operation}
343 /// tmp1.word := ZeroExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j])
344 /// tmp2.word := ZeroExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1])
345 /// tmp3.word := ZeroExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2])
346 /// tmp4.word := ZeroExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3])
347 /// dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
349 /// dst[MAX:128] := 0
351 static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_dpbuud_epi32(__m128i __W
,
354 return (__m128i
)__builtin_ia32_vpdpbuud128((__v4si
)__W
, (__v4si
)__A
,
358 /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with
359 /// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
360 /// unsigned 16-bit results. Sum these 4 results with the corresponding
361 /// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
363 /// \headerfile <x86intrin.h>
366 /// _mm256_dpbuud_epi32(__m256i __W, __m256i __A, __m256i __B);
369 /// This intrinsic corresponds to the \c VPDPBUUD instruction.
372 /// A 256-bit vector of [32 x unsigned char].
374 /// A 256-bit vector of [32 x unsigned char].
376 /// A 256-bit vector of [8 x int].
378 /// \code{.operation}
380 /// tmp1.word := ZeroExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j])
381 /// tmp2.word := ZeroExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1])
382 /// tmp3.word := ZeroExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2])
383 /// tmp4.word := ZeroExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3])
384 /// dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
386 /// dst[MAX:256] := 0
388 static __inline__ __m256i __DEFAULT_FN_ATTRS256
389 _mm256_dpbuud_epi32(__m256i __W
, __m256i __A
, __m256i __B
) {
390 return (__m256i
)__builtin_ia32_vpdpbuud256((__v8si
)__W
, (__v8si
)__A
,
394 /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with
395 /// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
396 /// unsigned 16-bit results. Sum these 4 results with the corresponding
397 /// 32-bit integer in \a __W with unsigned saturation, and store the packed
398 /// 32-bit results in \a dst.
400 /// \headerfile <x86intrin.h>
403 /// _mm_dpbuuds_epi32(__m128i __W, __m128i __A, __m128i __B);
406 /// This intrinsic corresponds to the \c VPDPBUUDS instruction.
409 /// A 128-bit vector of [16 x unsigned char].
411 /// A 128-bit vector of [16 x unsigned char].
413 /// A 128-bit vector of [4 x int].
415 /// \code{.operation}
417 /// tmp1.word := ZeroExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j])
418 /// tmp2.word := ZeroExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1])
419 /// tmp3.word := ZeroExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2])
420 /// tmp4.word := ZeroExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3])
421 /// dst.dword[j] := UNSIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
423 /// dst[MAX:128] := 0
425 static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_dpbuuds_epi32(__m128i __W
,
428 return (__m128i
)__builtin_ia32_vpdpbuuds128((__v4si
)__W
, (__v4si
)__A
,
432 /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with
433 /// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
434 /// unsigned 16-bit results. Sum these 4 results with the corresponding
435 /// 32-bit integer in \a __W with unsigned saturation, and store the packed
436 /// 32-bit results in \a dst.
438 /// \headerfile <x86intrin.h>
441 /// _mm256_dpbuuds_epi32(__m256i __W, __m256i __A, __m256i __B);
444 /// This intrinsic corresponds to the \c VPDPBUUDS instruction.
447 /// A 256-bit vector of [32 x unsigned char].
449 /// A 256-bit vector of [32 x unsigned char].
451 /// A 256-bit vector of [8 x int].
453 /// \code{.operation}
455 /// tmp1.word := ZeroExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j])
456 /// tmp2.word := ZeroExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1])
457 /// tmp3.word := ZeroExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2])
458 /// tmp4.word := ZeroExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3])
459 /// dst.dword[j] := UNSIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
461 /// dst[MAX:256] := 0
463 static __inline__ __m256i __DEFAULT_FN_ATTRS256
464 _mm256_dpbuuds_epi32(__m256i __W
, __m256i __A
, __m256i __B
) {
465 return (__m256i
)__builtin_ia32_vpdpbuuds256((__v8si
)__W
, (__v8si
)__A
,
468 #undef __DEFAULT_FN_ATTRS128
469 #undef __DEFAULT_FN_ATTRS256
471 #endif // __AVXVNNIINT8INTRIN_H