[InstCombine][NFC] Precommit a test for folding a binary op of reductions. (#121568)
[llvm-project.git] / clang / lib / Headers / avxvnniint16intrin.h
blob805d249911c176ca6324f09017199a02778d5a50
1 /*===----------- avxvnniint16intrin.h - AVXVNNIINT16 intrinsics-------------===
3 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 * See https://llvm.org/LICENSE.txt for license information.
5 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 *===-----------------------------------------------------------------------===
8 */
/* Reject direct inclusion: unless __IMMINTRIN_H is already defined (presumably
 * by <immintrin.h> -- confirm there), compilation stops with the error below.
 */
10 #ifndef __IMMINTRIN_H
11 #error \
12 "Never use <avxvnniint16intrin.h> directly; include <immintrin.h> instead."
13 #endif // __IMMINTRIN_H
15 #ifndef __AVXVNNIINT16INTRIN_H
16 #define __AVXVNNIINT16INTRIN_H
18 /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with
19 /// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
20 /// signed 32-bit results. Sum these 2 results with the corresponding
21 /// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
22 ///
23 /// \headerfile <immintrin.h>
24 ///
25 /// \code
26 /// __m128i _mm_dpwsud_epi32(__m128i __W, __m128i __A, __m128i __B)
27 /// \endcode
28 ///
29 /// This intrinsic corresponds to the \c VPDPWSUD instruction.
30 ///
31 /// \param __W
32 ///    A 128-bit vector of [4 x int].
33 /// \param __A
34 ///    A 128-bit vector of [8 x short].
35 /// \param __B
36 ///    A 128-bit vector of [8 x unsigned short].
37 /// \returns
38 ///    A 128-bit vector of [4 x int].
39 ///
40 /// \code{.operation}
41 /// FOR j := 0 to 3
42 ///   tmp1.dword := SignExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j])
43 ///   tmp2.dword := SignExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1])
44 ///   dst.dword[j] := __W.dword[j] + tmp1 + tmp2
45 /// ENDFOR
46 /// dst[MAX:128] := 0
47 /// \endcode
48 #define _mm_dpwsud_epi32(__W, __A, __B) \
49 ((__m128i)__builtin_ia32_vpdpwsud128((__v4si)(__W), (__v4si)(__A), \
50 (__v4si)(__B)))
52 /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with
53 /// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
54 /// signed 32-bit results. Sum these 2 results with the corresponding
55 /// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
56 ///
57 /// \headerfile <immintrin.h>
58 ///
59 /// \code
60 /// __m256i _mm256_dpwsud_epi32(__m256i __W, __m256i __A, __m256i __B)
61 /// \endcode
62 ///
63 /// This intrinsic corresponds to the \c VPDPWSUD instruction.
64 ///
65 /// \param __W
66 ///    A 256-bit vector of [8 x int].
67 /// \param __A
68 ///    A 256-bit vector of [16 x short].
69 /// \param __B
70 ///    A 256-bit vector of [16 x unsigned short].
71 /// \returns
72 ///    A 256-bit vector of [8 x int].
73 ///
74 /// \code{.operation}
75 /// FOR j := 0 to 7
76 ///   tmp1.dword := SignExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j])
77 ///   tmp2.dword := SignExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1])
78 ///   dst.dword[j] := __W.dword[j] + tmp1 + tmp2
79 /// ENDFOR
80 /// dst[MAX:256] := 0
81 /// \endcode
82 #define _mm256_dpwsud_epi32(__W, __A, __B) \
83 ((__m256i)__builtin_ia32_vpdpwsud256((__v8si)(__W), (__v8si)(__A), \
84 (__v8si)(__B)))
86 /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with
87 /// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
88 /// signed 32-bit results. Sum these 2 results with the corresponding
89 /// 32-bit integer in \a __W with signed saturation, and store the packed
90 /// 32-bit results in \a dst.
91 ///
92 /// \headerfile <immintrin.h>
93 ///
94 /// \code
95 /// __m128i _mm_dpwsuds_epi32(__m128i __W, __m128i __A, __m128i __B)
96 /// \endcode
97 ///
98 /// This intrinsic corresponds to the \c VPDPWSUDS instruction.
99 ///
100 /// \param __W
101 ///    A 128-bit vector of [4 x int].
102 /// \param __A
103 ///    A 128-bit vector of [8 x short].
104 /// \param __B
105 ///    A 128-bit vector of [8 x unsigned short].
106 /// \returns
107 ///    A 128-bit vector of [4 x int].
109 /// \code{.operation}
110 /// FOR j := 0 to 3
111 ///   tmp1.dword := SignExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j])
112 ///   tmp2.dword := SignExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1])
113 ///   dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2)
114 /// ENDFOR
115 /// dst[MAX:128] := 0
116 /// \endcode
117 #define _mm_dpwsuds_epi32(__W, __A, __B) \
118 ((__m128i)__builtin_ia32_vpdpwsuds128((__v4si)(__W), (__v4si)(__A), \
119 (__v4si)(__B)))
121 /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with
122 /// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
123 /// signed 32-bit results. Sum these 2 results with the corresponding
124 /// 32-bit integer in \a __W with signed saturation, and store the packed
125 /// 32-bit results in \a dst.
127 /// \headerfile <immintrin.h>
129 /// \code
130 /// __m256i _mm256_dpwsuds_epi32(__m256i __W, __m256i __A, __m256i __B)
131 /// \endcode
133 /// This intrinsic corresponds to the \c VPDPWSUDS instruction.
135 /// \param __W
136 ///    A 256-bit vector of [8 x int].
137 /// \param __A
138 ///    A 256-bit vector of [16 x short].
139 /// \param __B
140 ///    A 256-bit vector of [16 x unsigned short].
141 /// \returns
142 ///    A 256-bit vector of [8 x int].
144 /// \code{.operation}
145 /// FOR j := 0 to 7
146 ///   tmp1.dword := SignExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j])
147 ///   tmp2.dword := SignExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1])
148 ///   dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2)
149 /// ENDFOR
150 /// dst[MAX:256] := 0
151 /// \endcode
152 #define _mm256_dpwsuds_epi32(__W, __A, __B) \
153 ((__m256i)__builtin_ia32_vpdpwsuds256((__v8si)(__W), (__v8si)(__A), \
154 (__v8si)(__B)))
156 /// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
157 /// corresponding signed 16-bit integers in \a __B, producing 2 intermediate
158 /// signed 32-bit results. Sum these 2 results with the corresponding
159 /// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
161 /// \headerfile <immintrin.h>
163 /// \code
164 /// __m128i _mm_dpwusd_epi32(__m128i __W, __m128i __A, __m128i __B)
165 /// \endcode
167 /// This intrinsic corresponds to the \c VPDPWUSD instruction.
169 /// \param __W
170 ///    A 128-bit vector of [4 x int].
171 /// \param __A
172 ///    A 128-bit vector of [8 x unsigned short].
173 /// \param __B
174 ///    A 128-bit vector of [8 x short].
175 /// \returns
176 ///    A 128-bit vector of [4 x int].
178 /// \code{.operation}
179 /// FOR j := 0 to 3
180 ///   tmp1.dword := ZeroExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j])
181 ///   tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1])
182 ///   dst.dword[j] := __W.dword[j] + tmp1 + tmp2
183 /// ENDFOR
184 /// dst[MAX:128] := 0
185 /// \endcode
186 #define _mm_dpwusd_epi32(__W, __A, __B) \
187 ((__m128i)__builtin_ia32_vpdpwusd128((__v4si)(__W), (__v4si)(__A), \
188 (__v4si)(__B)))
190 /// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
191 /// corresponding signed 16-bit integers in \a __B, producing 2 intermediate
192 /// signed 32-bit results. Sum these 2 results with the corresponding
193 /// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
195 /// \headerfile <immintrin.h>
197 /// \code
198 /// __m256i _mm256_dpwusd_epi32(__m256i __W, __m256i __A, __m256i __B)
199 /// \endcode
201 /// This intrinsic corresponds to the \c VPDPWUSD instruction.
203 /// \param __W
204 ///    A 256-bit vector of [8 x int].
205 /// \param __A
206 ///    A 256-bit vector of [16 x unsigned short].
207 /// \param __B
208 ///    A 256-bit vector of [16 x short].
209 /// \returns
210 ///    A 256-bit vector of [8 x int].
212 /// \code{.operation}
213 /// FOR j := 0 to 7
214 ///   tmp1.dword := ZeroExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j])
215 ///   tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1])
216 ///   dst.dword[j] := __W.dword[j] + tmp1 + tmp2
217 /// ENDFOR
218 /// dst[MAX:256] := 0
219 /// \endcode
220 #define _mm256_dpwusd_epi32(__W, __A, __B) \
221 ((__m256i)__builtin_ia32_vpdpwusd256((__v8si)(__W), (__v8si)(__A), \
222 (__v8si)(__B)))
224 /// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
225 /// corresponding signed 16-bit integers in \a __B, producing 2 intermediate
226 /// signed 32-bit results. Sum these 2 results with the corresponding
227 /// 32-bit integer in \a __W with signed saturation, and store the packed
228 /// 32-bit results in \a dst.
230 /// \headerfile <immintrin.h>
232 /// \code
233 /// __m128i _mm_dpwusds_epi32(__m128i __W, __m128i __A, __m128i __B)
234 /// \endcode
236 /// This intrinsic corresponds to the \c VPDPWUSDS instruction.
238 /// \param __W
239 ///    A 128-bit vector of [4 x int].
240 /// \param __A
241 ///    A 128-bit vector of [8 x unsigned short].
242 /// \param __B
243 ///    A 128-bit vector of [8 x short].
244 /// \returns
245 ///    A 128-bit vector of [4 x int].
247 /// \code{.operation}
248 /// FOR j := 0 to 3
249 ///   tmp1.dword := ZeroExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j])
250 ///   tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1])
251 ///   dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2)
252 /// ENDFOR
253 /// dst[MAX:128] := 0
254 /// \endcode
255 #define _mm_dpwusds_epi32(__W, __A, __B) \
256 ((__m128i)__builtin_ia32_vpdpwusds128((__v4si)(__W), (__v4si)(__A), \
257 (__v4si)(__B)))
259 /// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
260 /// corresponding signed 16-bit integers in \a __B, producing 2 intermediate
261 /// signed 32-bit results. Sum these 2 results with the corresponding
262 /// 32-bit integer in \a __W with signed saturation, and store the packed
263 /// 32-bit results in \a dst.
265 /// \headerfile <immintrin.h>
267 /// \code
268 /// __m256i _mm256_dpwusds_epi32(__m256i __W, __m256i __A, __m256i __B)
269 /// \endcode
271 /// This intrinsic corresponds to the \c VPDPWUSDS instruction.
273 /// \param __W
274 ///    A 256-bit vector of [8 x int].
275 /// \param __A
276 ///    A 256-bit vector of [16 x unsigned short].
277 /// \param __B
278 ///    A 256-bit vector of [16 x short].
279 /// \returns
280 ///    A 256-bit vector of [8 x int].
282 /// \code{.operation}
283 /// FOR j := 0 to 7
284 ///   tmp1.dword := ZeroExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j])
285 ///   tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1])
286 ///   dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2)
287 /// ENDFOR
288 /// dst[MAX:256] := 0
289 /// \endcode
290 #define _mm256_dpwusds_epi32(__W, __A, __B) \
291 ((__m256i)__builtin_ia32_vpdpwusds256((__v8si)(__W), (__v8si)(__A), \
292 (__v8si)(__B)))
294 /// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
295 /// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
296 /// unsigned 32-bit results. Sum these 2 results with the corresponding
297 /// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
299 /// \headerfile <immintrin.h>
301 /// \code
302 /// __m128i _mm_dpwuud_epi32(__m128i __W, __m128i __A, __m128i __B)
303 /// \endcode
305 /// This intrinsic corresponds to the \c VPDPWUUD instruction.
307 /// \param __W
308 ///    A 128-bit vector of [4 x unsigned int].
309 /// \param __A
310 ///    A 128-bit vector of [8 x unsigned short].
311 /// \param __B
312 ///    A 128-bit vector of [8 x unsigned short].
313 /// \returns
314 ///    A 128-bit vector of [4 x unsigned int].
316 /// \code{.operation}
317 /// FOR j := 0 to 3
318 ///   tmp1.dword := ZeroExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j])
319 ///   tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1])
320 ///   dst.dword[j] := __W.dword[j] + tmp1 + tmp2
321 /// ENDFOR
322 /// dst[MAX:128] := 0
323 /// \endcode
324 #define _mm_dpwuud_epi32(__W, __A, __B) \
325 ((__m128i)__builtin_ia32_vpdpwuud128((__v4si)(__W), (__v4si)(__A), \
326 (__v4si)(__B)))
328 /// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
329 /// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
330 /// unsigned 32-bit results. Sum these 2 results with the corresponding
331 /// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
333 /// \headerfile <immintrin.h>
335 /// \code
336 /// __m256i _mm256_dpwuud_epi32(__m256i __W, __m256i __A, __m256i __B)
337 /// \endcode
339 /// This intrinsic corresponds to the \c VPDPWUUD instruction.
341 /// \param __W
342 ///    A 256-bit vector of [8 x unsigned int].
343 /// \param __A
344 ///    A 256-bit vector of [16 x unsigned short].
345 /// \param __B
346 ///    A 256-bit vector of [16 x unsigned short].
347 /// \returns
348 ///    A 256-bit vector of [8 x unsigned int].
350 /// \code{.operation}
351 /// FOR j := 0 to 7
352 ///   tmp1.dword := ZeroExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j])
353 ///   tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1])
354 ///   dst.dword[j] := __W.dword[j] + tmp1 + tmp2
355 /// ENDFOR
356 /// dst[MAX:256] := 0
357 /// \endcode
358 #define _mm256_dpwuud_epi32(__W, __A, __B) \
359 ((__m256i)__builtin_ia32_vpdpwuud256((__v8si)(__W), (__v8si)(__A), \
360 (__v8si)(__B)))
362 /// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
363 /// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
364 /// unsigned 32-bit results. Sum these 2 results with the corresponding
365 /// 32-bit integer in \a __W with unsigned saturation, and store the packed
366 /// 32-bit results in \a dst.
368 /// \headerfile <immintrin.h>
370 /// \code
371 /// __m128i _mm_dpwuuds_epi32(__m128i __W, __m128i __A, __m128i __B)
372 /// \endcode
374 /// This intrinsic corresponds to the \c VPDPWUUDS instruction.
376 /// \param __W
377 ///    A 128-bit vector of [4 x unsigned int].
378 /// \param __A
379 ///    A 128-bit vector of [8 x unsigned short].
380 /// \param __B
381 ///    A 128-bit vector of [8 x unsigned short].
382 /// \returns
383 ///    A 128-bit vector of [4 x unsigned int].
385 /// \code{.operation}
386 /// FOR j := 0 to 3
387 ///   tmp1.dword := ZeroExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j])
388 ///   tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1])
389 ///   dst.dword[j] := UNSIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2)
390 /// ENDFOR
391 /// dst[MAX:128] := 0
392 /// \endcode
393 #define _mm_dpwuuds_epi32(__W, __A, __B) \
394 ((__m128i)__builtin_ia32_vpdpwuuds128((__v4si)(__W), (__v4si)(__A), \
395 (__v4si)(__B)))
397 /// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
398 /// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
399 /// unsigned 32-bit results. Sum these 2 results with the corresponding
400 /// 32-bit integer in \a __W with unsigned saturation, and store the packed
401 /// 32-bit results in \a dst.
403 /// \headerfile <immintrin.h>
405 /// \code
406 /// __m256i _mm256_dpwuuds_epi32(__m256i __W, __m256i __A, __m256i __B)
407 /// \endcode
409 /// This intrinsic corresponds to the \c VPDPWUUDS instruction.
411 /// \param __W
412 ///    A 256-bit vector of [8 x unsigned int].
413 /// \param __A
414 ///    A 256-bit vector of [16 x unsigned short].
415 /// \param __B
416 ///    A 256-bit vector of [16 x unsigned short].
417 /// \returns
418 ///    A 256-bit vector of [8 x unsigned int].
420 /// \code{.operation}
421 /// FOR j := 0 to 7
422 ///   tmp1.dword := ZeroExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j])
423 ///   tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1])
424 ///   dst.dword[j] := UNSIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2)
425 /// ENDFOR
426 /// dst[MAX:256] := 0
427 /// \endcode
428 #define _mm256_dpwuuds_epi32(__W, __A, __B) \
429 ((__m256i)__builtin_ia32_vpdpwuuds256((__v8si)(__W), (__v8si)(__A), \
430 (__v8si)(__B)))
432 #endif // __AVXVNNIINT16INTRIN_H