1 /*===----------- avxvnniint16intrin.h - AVXVNNIINT16 intrinsics-------------===
3 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 * See https://llvm.org/LICENSE.txt for license information.
5 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 *===-----------------------------------------------------------------------===
12 "Never use <avxvnniint16intrin.h> directly; include <immintrin.h> instead."
13 #endif // __IMMINTRIN_H
15 #ifndef __AVXVNNIINT16INTRIN_H
16 #define __AVXVNNIINT16INTRIN_H
18 /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with
19 /// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
20 /// signed 32-bit results. Sum these 2 results with the corresponding
21 /// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
23 /// \headerfile <immintrin.h>
26 /// __m128i _mm_dpwsud_epi32(__m128i __W, __m128i __A, __m128i __B)
29 /// This intrinsic corresponds to the \c VPDPWSUD instruction.
32 /// A 128-bit vector of [4 x int].
34 /// A 128-bit vector of [8 x short].
36 /// A 128-bit vector of [8 x unsigned short].
38 /// A 128-bit vector of [4 x int].
42 /// tmp1.dword := SignExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j])
43 /// tmp2.dword := SignExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1])
44 /// dst.dword[j] := __W.dword[j] + tmp1 + tmp2
48 #define _mm_dpwsud_epi32(__W, __A, __B) \
49 ((__m128i)__builtin_ia32_vpdpwsud128((__v4si)(__W), (__v4si)(__A), \
52 /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with
53 /// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
54 /// signed 32-bit results. Sum these 2 results with the corresponding
55 /// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
57 /// \headerfile <immintrin.h>
60 /// __m256i _mm256_dpwsud_epi32(__m256i __W, __m256i __A, __m256i __B)
63 /// This intrinsic corresponds to the \c VPDPWSUD instruction.
66 /// A 256-bit vector of [8 x int].
68 /// A 256-bit vector of [16 x short].
70 /// A 256-bit vector of [16 x unsigned short].
72 /// A 256-bit vector of [8 x int].
76 /// tmp1.dword := SignExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j])
77 /// tmp2.dword := SignExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1])
78 /// dst.dword[j] := __W.dword[j] + tmp1 + tmp2
82 #define _mm256_dpwsud_epi32(__W, __A, __B) \
83 ((__m256i)__builtin_ia32_vpdpwsud256((__v8si)(__W), (__v8si)(__A), \
86 /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with
87 /// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
88 /// signed 32-bit results. Sum these 2 results with the corresponding
89 /// 32-bit integer in \a __W with signed saturation, and store the packed
90 /// 32-bit results in \a dst.
92 /// \headerfile <immintrin.h>
95 /// __m128i _mm_dpwsuds_epi32(__m128i __W, __m128i __A, __m128i __B)
98 /// This intrinsic corresponds to the \c VPDPWSUDS instruction.
101 /// A 128-bit vector of [4 x int].
103 /// A 128-bit vector of [8 x short].
105 /// A 128-bit vector of [8 x unsigned short].
107 /// A 128-bit vector of [4 x int].
109 /// \code{.operation}
111 /// tmp1.dword := SignExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j])
112 /// tmp2.dword := SignExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1])
113 /// dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2)
115 /// dst[MAX:128] := 0
117 #define _mm_dpwsuds_epi32(__W, __A, __B) \
118 ((__m128i)__builtin_ia32_vpdpwsuds128((__v4si)(__W), (__v4si)(__A), \
121 /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with
122 /// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
123 /// signed 32-bit results. Sum these 2 results with the corresponding
124 /// 32-bit integer in \a __W with signed saturation, and store the packed
125 /// 32-bit results in \a dst.
127 /// \headerfile <immintrin.h>
130 /// __m256i _mm256_dpwsuds_epi32(__m256i __W, __m256i __A, __m256i __B)
133 /// This intrinsic corresponds to the \c VPDPWSUDS instruction.
136 /// A 256-bit vector of [8 x int].
138 /// A 256-bit vector of [16 x short].
140 /// A 256-bit vector of [16 x unsigned short].
142 /// A 256-bit vector of [8 x int].
144 /// \code{.operation}
146 /// tmp1.dword := SignExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j])
147 /// tmp2.dword := SignExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1])
148 /// dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2)
150 /// dst[MAX:256] := 0
152 #define _mm256_dpwsuds_epi32(__W, __A, __B) \
153 ((__m256i)__builtin_ia32_vpdpwsuds256((__v8si)(__W), (__v8si)(__A), \
156 /// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
157 /// corresponding signed 16-bit integers in \a __B, producing 2 intermediate
158 /// signed 32-bit results. Sum these 2 results with the corresponding
159 /// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
161 /// \headerfile <immintrin.h>
164 /// __m128i _mm_dpwusd_epi32(__m128i __W, __m128i __A, __m128i __B)
167 /// This intrinsic corresponds to the \c VPDPWUSD instruction.
170 /// A 128-bit vector of [4 x int].
172 /// A 128-bit vector of [8 x unsigned short].
174 /// A 128-bit vector of [8 x short].
176 /// A 128-bit vector of [4 x int].
178 /// \code{.operation}
180 /// tmp1.dword := ZeroExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j])
181 /// tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1])
182 /// dst.dword[j] := __W.dword[j] + tmp1 + tmp2
184 /// dst[MAX:128] := 0
186 #define _mm_dpwusd_epi32(__W, __A, __B) \
187 ((__m128i)__builtin_ia32_vpdpwusd128((__v4si)(__W), (__v4si)(__A), \
190 /// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
191 /// corresponding signed 16-bit integers in \a __B, producing 2 intermediate
192 /// signed 32-bit results. Sum these 2 results with the corresponding
193 /// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
195 /// \headerfile <immintrin.h>
198 /// __m256i _mm256_dpwusd_epi32(__m256i __W, __m256i __A, __m256i __B)
201 /// This intrinsic corresponds to the \c VPDPWUSD instruction.
204 /// A 256-bit vector of [8 x int].
206 /// A 256-bit vector of [16 x unsigned short].
208 /// A 256-bit vector of [16 x short].
210 /// A 256-bit vector of [8 x int].
212 /// \code{.operation}
214 /// tmp1.dword := ZeroExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j])
215 /// tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1])
216 /// dst.dword[j] := __W.dword[j] + tmp1 + tmp2
218 /// dst[MAX:256] := 0
220 #define _mm256_dpwusd_epi32(__W, __A, __B) \
221 ((__m256i)__builtin_ia32_vpdpwusd256((__v8si)(__W), (__v8si)(__A), \
224 /// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
225 /// corresponding signed 16-bit integers in \a __B, producing 2 intermediate
226 /// signed 32-bit results. Sum these 2 results with the corresponding
227 /// 32-bit integer in \a __W with signed saturation, and store the packed
228 /// 32-bit results in \a dst.
230 /// \headerfile <immintrin.h>
233 /// __m128i _mm_dpwusds_epi32(__m128i __W, __m128i __A, __m128i __B)
236 /// This intrinsic corresponds to the \c VPDPWUSDS instruction.
239 /// A 128-bit vector of [4 x int].
241 /// A 128-bit vector of [8 x unsigned short].
243 /// A 128-bit vector of [8 x short].
245 /// A 128-bit vector of [4 x int].
247 /// \code{.operation}
249 /// tmp1.dword := ZeroExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j])
250 /// tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1])
251 /// dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2)
253 /// dst[MAX:128] := 0
255 #define _mm_dpwusds_epi32(__W, __A, __B) \
256 ((__m128i)__builtin_ia32_vpdpwusds128((__v4si)(__W), (__v4si)(__A), \
259 /// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
260 /// corresponding signed 16-bit integers in \a __B, producing 2 intermediate
261 /// signed 32-bit results. Sum these 2 results with the corresponding
262 /// 32-bit integer in \a __W with signed saturation, and store the packed
263 /// 32-bit results in \a dst.
265 /// \headerfile <immintrin.h>
268 /// __m256i _mm256_dpwusds_epi32(__m256i __W, __m256i __A, __m256i __B)
271 /// This intrinsic corresponds to the \c VPDPWUSDS instruction.
274 /// A 256-bit vector of [8 x int].
276 /// A 256-bit vector of [16 x unsigned short].
278 /// A 256-bit vector of [16 x short].
280 /// A 256-bit vector of [8 x int].
282 /// \code{.operation}
284 /// tmp1.dword := ZeroExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j])
285 /// tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1])
286 /// dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2)
288 /// dst[MAX:256] := 0
290 #define _mm256_dpwusds_epi32(__W, __A, __B) \
291 ((__m256i)__builtin_ia32_vpdpwusds256((__v8si)(__W), (__v8si)(__A), \
294 /// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
295 /// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
296 /// unsigned 32-bit results. Sum these 2 results with the corresponding
297 /// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
299 /// \headerfile <immintrin.h>
302 /// __m128i _mm_dpwuud_epi32(__m128i __W, __m128i __A, __m128i __B)
305 /// This intrinsic corresponds to the \c VPDPWUUD instruction.
308 /// A 128-bit vector of [4 x unsigned int].
310 /// A 128-bit vector of [8 x unsigned short].
312 /// A 128-bit vector of [8 x unsigned short].
314 /// A 128-bit vector of [4 x unsigned int].
316 /// \code{.operation}
318 /// tmp1.dword := ZeroExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j])
319 /// tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1])
320 /// dst.dword[j] := __W.dword[j] + tmp1 + tmp2
322 /// dst[MAX:128] := 0
324 #define _mm_dpwuud_epi32(__W, __A, __B) \
325 ((__m128i)__builtin_ia32_vpdpwuud128((__v4si)(__W), (__v4si)(__A), \
328 /// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
329 /// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
330 /// unsigned 32-bit results. Sum these 2 results with the corresponding
331 /// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
333 /// \headerfile <immintrin.h>
336 /// __m256i _mm256_dpwuud_epi32(__m256i __W, __m256i __A, __m256i __B)
339 /// This intrinsic corresponds to the \c VPDPWUUD instruction.
342 /// A 256-bit vector of [8 x unsigned int].
344 /// A 256-bit vector of [16 x unsigned short].
346 /// A 256-bit vector of [16 x unsigned short].
348 /// A 256-bit vector of [8 x unsigned int].
350 /// \code{.operation}
352 /// tmp1.dword := ZeroExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j])
353 /// tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1])
354 /// dst.dword[j] := __W.dword[j] + tmp1 + tmp2
356 /// dst[MAX:256] := 0
358 #define _mm256_dpwuud_epi32(__W, __A, __B) \
359 ((__m256i)__builtin_ia32_vpdpwuud256((__v8si)(__W), (__v8si)(__A), \
362 /// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
363 /// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
364 /// unsigned 32-bit results. Sum these 2 results with the corresponding
365 /// 32-bit integer in \a __W with unsigned saturation, and store the packed
366 /// 32-bit results in \a dst.
368 /// \headerfile <immintrin.h>
371 /// __m128i _mm_dpwuuds_epi32(__m128i __W, __m128i __A, __m128i __B)
374 /// This intrinsic corresponds to the \c VPDPWUUDS instruction.
377 /// A 128-bit vector of [4 x unsigned int].
379 /// A 128-bit vector of [8 x unsigned short].
381 /// A 128-bit vector of [8 x unsigned short].
383 /// A 128-bit vector of [4 x unsigned int].
385 /// \code{.operation}
387 /// tmp1.dword := ZeroExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j])
388 /// tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1])
389 /// dst.dword[j] := UNSIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2)
391 /// dst[MAX:128] := 0
393 #define _mm_dpwuuds_epi32(__W, __A, __B) \
394 ((__m128i)__builtin_ia32_vpdpwuuds128((__v4si)(__W), (__v4si)(__A), \
397 /// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
398 /// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
399 /// unsigned 32-bit results. Sum these 2 results with the corresponding
400 /// 32-bit integer in \a __W with unsigned saturation, and store the packed
401 /// 32-bit results in \a dst.
403 /// \headerfile <immintrin.h>
406 /// __m256i _mm256_dpwuuds_epi32(__m256i __W, __m256i __A, __m256i __B)
409 /// This intrinsic corresponds to the \c VPDPWUUDS instruction.
412 /// A 256-bit vector of [8 x unsigned int].
414 /// A 256-bit vector of [16 x unsigned short].
416 /// A 256-bit vector of [16 x unsigned short].
418 /// A 256-bit vector of [8 x unsigned int].
420 /// \code{.operation}
422 /// tmp1.dword := ZeroExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j])
423 /// tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1])
424 /// dst.dword[j] := UNSIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2)
426 /// dst[MAX:256] := 0
428 #define _mm256_dpwuuds_epi32(__W, __A, __B) \
429 ((__m256i)__builtin_ia32_vpdpwuuds256((__v8si)(__W), (__v8si)(__A), \
432 #endif // __AVXVNNIINT16INTRIN_H