/*===----------- avxvnniint16intrin.h - AVXVNNIINT16 intrinsics-------------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */
12 "Never use <avxvnniint16intrin.h> directly; include <immintrin.h> instead."
13 #endif // __IMMINTRIN_H
15 #ifndef __AVXVNNIINT16INTRIN_H
16 #define __AVXVNNIINT16INTRIN_H
/* Define the default attributes for the functions in this file.
 *
 * Both variants force inlining, suppress debug info for the wrapper, and
 * enable the "avxvnniint16" target feature on a per-function basis; they
 * differ only in the minimum vector width they request (128 vs. 256 bits).
 * Both macros are #undef'd at the end of this header. */
#define __DEFAULT_FN_ATTRS128                                                  \
  __attribute__((__always_inline__, __nodebug__, __target__("avxvnniint16"),   \
                 __min_vector_width__(128)))
#define __DEFAULT_FN_ATTRS256                                                  \
  __attribute__((__always_inline__, __nodebug__, __target__("avxvnniint16"),   \
                 __min_vector_width__(256)))
26 /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with
27 /// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
28 /// signed 16-bit results. Sum these 2 results with the corresponding
29 /// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
31 /// \headerfile <immintrin.h>
34 /// __m128i _mm_dpwsud_epi32(__m128i __W, __m128i __A, __m128i __B)
37 /// This intrinsic corresponds to the \c VPDPWSUD instruction.
40 /// A 128-bit vector of [4 x int].
42 /// A 128-bit vector of [8 x short].
44 /// A 128-bit vector of [8 x unsigned short].
46 /// A 128-bit vector of [4 x int].
50 /// tmp1.dword := SignExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j])
51 /// tmp2.dword := SignExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1])
52 /// dst.dword[j] := __W.dword[j] + tmp1 + tmp2
56 static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_dpwsud_epi32(__m128i __W
,
59 return (__m128i
)__builtin_ia32_vpdpwsud128((__v4si
)__W
, (__v4si
)__A
,
63 /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with
64 /// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
65 /// signed 16-bit results. Sum these 2 results with the corresponding
66 /// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
68 /// \headerfile <immintrin.h>
71 /// __m256i _mm256_dpwsud_epi32(__m256i __W, __m256i __A, __m256i __B)
74 /// This intrinsic corresponds to the \c VPDPWSUD instruction.
77 /// A 256-bit vector of [8 x int].
79 /// A 256-bit vector of [16 x short].
81 /// A 256-bit vector of [16 x unsigned short].
83 /// A 256-bit vector of [8 x int].
87 /// tmp1.dword := SignExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j])
88 /// tmp2.dword := SignExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1])
89 /// dst.dword[j] := __W.dword[j] + tmp1 + tmp2
93 static __inline__ __m256i __DEFAULT_FN_ATTRS256
94 _mm256_dpwsud_epi32(__m256i __W
, __m256i __A
, __m256i __B
) {
95 return (__m256i
)__builtin_ia32_vpdpwsud256((__v8si
)__W
, (__v8si
)__A
,
99 /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with
100 /// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
101 /// signed 16-bit results. Sum these 2 results with the corresponding
102 /// 32-bit integer in \a __W with signed saturation, and store the packed
103 /// 32-bit results in \a dst.
105 /// \headerfile <immintrin.h>
108 /// __m128i _mm_dpwsuds_epi32(__m128i __W, __m128i __A, __m128i __B)
111 /// This intrinsic corresponds to the \c VPDPWSUDS instruction.
114 /// A 128-bit vector of [4 x int].
116 /// A 128-bit vector of [8 x short].
118 /// A 128-bit vector of [8 x unsigned short].
120 /// A 128-bit vector of [4 x int].
122 /// \code{.operation}
124 /// tmp1.dword := SignExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j])
125 /// tmp2.dword := SignExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1])
126 /// dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2)
128 /// dst[MAX:128] := 0
130 static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_dpwsuds_epi32(__m128i __W
,
133 return (__m128i
)__builtin_ia32_vpdpwsuds128((__v4si
)__W
, (__v4si
)__A
,
137 /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with
138 /// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
139 /// signed 16-bit results. Sum these 2 results with the corresponding
140 /// 32-bit integer in \a __W with signed saturation, and store the packed
141 /// 32-bit results in \a dst.
143 /// \headerfile <immintrin.h>
146 /// __m256i _mm256_dpwsuds_epi32(__m256i __W, __m256i __A, __m256i __B)
149 /// This intrinsic corresponds to the \c VPDPWSUDS instruction.
152 /// A 256-bit vector of [8 x int].
154 /// A 256-bit vector of [16 x short].
156 /// A 256-bit vector of [16 x unsigned short].
158 /// A 256-bit vector of [8 x int].
160 /// \code{.operation}
162 /// tmp1.dword := SignExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j])
163 /// tmp2.dword := SignExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1])
164 /// dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2)
166 /// dst[MAX:256] := 0
168 static __inline__ __m256i __DEFAULT_FN_ATTRS256
169 _mm256_dpwsuds_epi32(__m256i __W
, __m256i __A
, __m256i __B
) {
170 return (__m256i
)__builtin_ia32_vpdpwsuds256((__v8si
)__W
, (__v8si
)__A
,
174 /// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
175 /// corresponding signed 16-bit integers in \a __B, producing 2 intermediate
176 /// signed 16-bit results. Sum these 2 results with the corresponding
177 /// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
179 /// \headerfile <immintrin.h>
182 /// __m128i _mm_dpbusd_epi32(__m128i __W, __m128i __A, __m128i __B)
185 /// This intrinsic corresponds to the \c VPDPWUSD instruction.
188 /// A 128-bit vector of [4 x int].
190 /// A 128-bit vector of [8 x unsigned short].
192 /// A 128-bit vector of [8 x short].
194 /// A 128-bit vector of [4 x int].
196 /// \code{.operation}
198 /// tmp1.dword := ZeroExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j])
199 /// tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1])
200 /// dst.dword[j] := __W.dword[j] + tmp1 + tmp2
202 /// dst[MAX:128] := 0
204 static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_dpwusd_epi32(__m128i __W
,
207 return (__m128i
)__builtin_ia32_vpdpwusd128((__v4si
)__W
, (__v4si
)__A
,
211 /// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
212 /// corresponding signed 16-bit integers in \a __B, producing 2 intermediate
213 /// signed 16-bit results. Sum these 2 results with the corresponding
214 /// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
216 /// \headerfile <immintrin.h>
219 /// __m256i _mm256_dpwusd_epi32(__m256i __W, __m256i __A, __m256i __B)
222 /// This intrinsic corresponds to the \c VPDPWUSD instruction.
225 /// A 256-bit vector of [8 x int].
227 /// A 256-bit vector of [16 x unsigned short].
229 /// A 256-bit vector of [16 x short].
231 /// A 256-bit vector of [8 x int].
233 /// \code{.operation}
235 /// tmp1.dword := ZeroExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j])
236 /// tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1])
237 /// dst.dword[j] := __W.dword[j] + tmp1 + tmp2
239 /// dst[MAX:256] := 0
241 static __inline__ __m256i __DEFAULT_FN_ATTRS256
242 _mm256_dpwusd_epi32(__m256i __W
, __m256i __A
, __m256i __B
) {
243 return (__m256i
)__builtin_ia32_vpdpwusd256((__v8si
)__W
, (__v8si
)__A
,
247 /// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
248 /// corresponding signed 16-bit integers in \a __B, producing 2 intermediate
249 /// signed 16-bit results. Sum these 2 results with the corresponding
250 /// 32-bit integer in \a __W with signed saturation, and store the packed
251 /// 32-bit results in \a dst.
253 /// \headerfile <immintrin.h>
256 /// __m128i _mm_dpwusds_epi32(__m128i __W, __m128i __A, __m128i __B)
259 /// This intrinsic corresponds to the \c VPDPWSUDS instruction.
262 /// A 128-bit vector of [4 x int].
264 /// A 128-bit vector of [8 x unsigned short].
266 /// A 128-bit vector of [8 x short].
268 /// A 128-bit vector of [4 x int].
270 /// \code{.operation}
272 /// tmp1.dword := ZeroExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j])
273 /// tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1])
274 /// dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2)
276 /// dst[MAX:128] := 0
278 static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_dpwusds_epi32(__m128i __W
,
281 return (__m128i
)__builtin_ia32_vpdpwusds128((__v4si
)__W
, (__v4si
)__A
,
285 /// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
286 /// corresponding signed 16-bit integers in \a __B, producing 2 intermediate
287 /// signed 16-bit results. Sum these 2 results with the corresponding
288 /// 32-bit integer in \a __W with signed saturation, and store the packed
289 /// 32-bit results in \a dst.
291 /// \headerfile <immintrin.h>
294 /// __m256i _mm256_dpwsuds_epi32(__m256i __W, __m256i __A, __m256i __B)
297 /// This intrinsic corresponds to the \c VPDPWSUDS instruction.
300 /// A 256-bit vector of [8 x int].
302 /// A 256-bit vector of [16 x unsigned short].
304 /// A 256-bit vector of [16 x short].
306 /// A 256-bit vector of [8 x int].
308 /// \code{.operation}
310 /// tmp1.dword := ZeroExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j])
311 /// tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1])
312 /// dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2)
314 /// dst[MAX:256] := 0
316 static __inline__ __m256i __DEFAULT_FN_ATTRS256
317 _mm256_dpwusds_epi32(__m256i __W
, __m256i __A
, __m256i __B
) {
318 return (__m256i
)__builtin_ia32_vpdpwusds256((__v8si
)__W
, (__v8si
)__A
,
322 /// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
323 /// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
324 /// signed 16-bit results. Sum these 2 results with the corresponding
325 /// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
327 /// \headerfile <immintrin.h>
330 /// __m128i _mm_dpwuud_epi32(__m128i __W, __m128i __A, __m128i __B)
333 /// This intrinsic corresponds to the \c VPDPWUUD instruction.
336 /// A 128-bit vector of [4 x unsigned int].
338 /// A 128-bit vector of [8 x unsigned short].
340 /// A 128-bit vector of [8 x unsigned short].
342 /// A 128-bit vector of [4 x unsigned int].
344 /// \code{.operation}
346 /// tmp1.dword := ZeroExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j])
347 /// tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1])
348 /// dst.dword[j] := __W.dword[j] + tmp1 + tmp2
350 /// dst[MAX:128] := 0
352 static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_dpwuud_epi32(__m128i __W
,
355 return (__m128i
)__builtin_ia32_vpdpwuud128((__v4si
)__W
, (__v4si
)__A
,
359 /// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
360 /// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
361 /// signed 16-bit results. Sum these 2 results with the corresponding
362 /// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
364 /// \headerfile <immintrin.h>
367 /// __m256i _mm256_dpwuud_epi32(__m256i __W, __m256i __A, __m256i __B)
370 /// This intrinsic corresponds to the \c VPDPWUUD instruction.
373 /// A 256-bit vector of [8 x unsigned int].
375 /// A 256-bit vector of [16 x unsigned short].
377 /// A 256-bit vector of [16 x unsigned short].
379 /// A 256-bit vector of [8 x unsigned int].
381 /// \code{.operation}
383 /// tmp1.dword := ZeroExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j])
384 /// tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1])
385 /// dst.dword[j] := __W.dword[j] + tmp1 + tmp2
387 /// dst[MAX:256] := 0
389 static __inline__ __m256i __DEFAULT_FN_ATTRS256
390 _mm256_dpwuud_epi32(__m256i __W
, __m256i __A
, __m256i __B
) {
391 return (__m256i
)__builtin_ia32_vpdpwuud256((__v8si
)__W
, (__v8si
)__A
,
395 /// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
396 /// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
397 /// signed 16-bit results. Sum these 2 results with the corresponding
398 /// 32-bit integer in \a __W with signed saturation, and store the packed
399 /// 32-bit results in \a dst.
401 /// \headerfile <immintrin.h>
404 /// __m128i _mm_dpwsuds_epi32(__m128i __W, __m128i __A, __m128i __B)
407 /// This intrinsic corresponds to the \c VPDPWSUDS instruction.
410 /// A 128-bit vector of [4 x unsigned int].
412 /// A 128-bit vector of [8 x unsigned short].
414 /// A 128-bit vector of [8 x unsigned short].
416 /// A 128-bit vector of [4 x unsigned int].
418 /// \code{.operation}
420 /// tmp1.dword := ZeroExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j])
421 /// tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1])
422 /// dst.dword[j] := UNSIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2)
424 /// dst[MAX:128] := 0
426 static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_dpwuuds_epi32(__m128i __W
,
429 return (__m128i
)__builtin_ia32_vpdpwuuds128((__v4si
)__W
, (__v4si
)__A
,
433 /// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
434 /// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
435 /// signed 16-bit results. Sum these 2 results with the corresponding
436 /// 32-bit integer in \a __W with signed saturation, and store the packed
437 /// 32-bit results in \a dst.
439 /// \headerfile <immintrin.h>
442 /// __m256i _mm256_dpwuuds_epi32(__m256i __W, __m256i __A, __m256i __B)
445 /// This intrinsic corresponds to the \c VPDPWSUDS instruction.
448 /// A 256-bit vector of [8 x unsigned int].
450 /// A 256-bit vector of [16 x unsigned short].
452 /// A 256-bit vector of [16 x unsigned short].
454 /// A 256-bit vector of [8 x unsigned int].
456 /// \code{.operation}
458 /// tmp1.dword := ZeroExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j])
459 /// tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1])
460 /// dst.dword[j] := UNSIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2)
462 /// dst[MAX:256] := 0
464 static __inline__ __m256i __DEFAULT_FN_ATTRS256
465 _mm256_dpwuuds_epi32(__m256i __W
, __m256i __A
, __m256i __B
) {
466 return (__m256i
)__builtin_ia32_vpdpwuuds256((__v8si
)__W
, (__v8si
)__A
,
470 #undef __DEFAULT_FN_ATTRS128
471 #undef __DEFAULT_FN_ATTRS256
473 #endif // __AVXVNNIINT16INTRIN_H