[RISCV] Add Qualcomm uC Xqcics(Conditional Select) extension (#119504)
[llvm-project.git] / clang / lib / Headers / avxvnniint8intrin.h
blobc211620c68f076050c6fd013ef1c7517ae4295b8
1 /*===-------- avxvnniint8intrin.h - AVXVNNIINT8 intrinsics -----------===
3 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 * See https://llvm.org/LICENSE.txt for license information.
5 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 *===-----------------------------------------------------------------------===
8 */
9 #ifndef __IMMINTRIN_H
10 #error \
11 "Never use <avxvnniint8intrin.h> directly; include <immintrin.h> instead."
12 #endif
14 #ifndef __AVXVNNIINT8INTRIN_H
15 #define __AVXVNNIINT8INTRIN_H
17 /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
18 /// corresponding signed 8-bit integers in \a __B, producing 4 intermediate
19 /// signed 16-bit results. Sum these 4 results with the corresponding
20 /// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
21 ///
22 /// \headerfile <x86intrin.h>
23 ///
24 /// \code
25 /// _mm_dpbssd_epi32(__m128i __W, __m128i __A, __m128i __B);
26 /// \endcode
27 ///
28 /// This intrinsic corresponds to the \c VPDPBSSD instruction.
29 ///
/// \param __W
/// A 128-bit vector of [4 x int] used as the accumulator input.
30 /// \param __A
31 /// A 128-bit vector of [16 x char].
32 /// \param __B
33 /// A 128-bit vector of [16 x char].
34 /// \returns
35 /// A 128-bit vector of [4 x int].
36 ///
37 /// \code{.operation}
38 /// FOR j := 0 to 3
39 /// tmp1.word := SignExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j])
40 /// tmp2.word := SignExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1])
41 /// tmp3.word := SignExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2])
42 /// tmp4.word := SignExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3])
43 /// dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
44 /// ENDFOR
45 /// dst[MAX:128] := 0
46 /// \endcode
47 #define _mm_dpbssd_epi32(__W, __A, __B) \
48 ((__m128i)__builtin_ia32_vpdpbssd128((__v4si)(__W), (__v4si)(__A), \
49 (__v4si)(__B)))
51 /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
52 /// corresponding signed 8-bit integers in \a __B, producing 4 intermediate
53 /// signed 16-bit results. Sum these 4 results with the corresponding
54 /// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
55 ///
56 /// \headerfile <x86intrin.h>
57 ///
58 /// \code
59 /// _mm256_dpbssd_epi32(__m256i __W, __m256i __A, __m256i __B);
60 /// \endcode
61 ///
62 /// This intrinsic corresponds to the \c VPDPBSSD instruction.
63 ///
/// \param __W
/// A 256-bit vector of [8 x int] used as the accumulator input.
64 /// \param __A
65 /// A 256-bit vector of [32 x char].
66 /// \param __B
67 /// A 256-bit vector of [32 x char].
68 /// \returns
69 /// A 256-bit vector of [8 x int].
70 ///
71 /// \code{.operation}
72 /// FOR j := 0 to 7
73 /// tmp1.word := SignExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j])
74 /// tmp2.word := SignExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1])
75 /// tmp3.word := SignExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2])
76 /// tmp4.word := SignExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3])
77 /// dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
78 /// ENDFOR
79 /// dst[MAX:256] := 0
80 /// \endcode
81 #define _mm256_dpbssd_epi32(__W, __A, __B) \
82 ((__m256i)__builtin_ia32_vpdpbssd256((__v8si)(__W), (__v8si)(__A), \
83 (__v8si)(__B)))
85 /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
86 /// corresponding signed 8-bit integers in \a __B, producing 4 intermediate
87 /// signed 16-bit results. Sum these 4 results with the corresponding
88 /// 32-bit integer in \a __W with signed saturation, and store the packed
89 /// 32-bit results in \a dst.
90 ///
91 /// \headerfile <x86intrin.h>
92 ///
93 /// \code
94 /// _mm_dpbssds_epi32(__m128i __W, __m128i __A, __m128i __B);
95 /// \endcode
96 ///
97 /// This intrinsic corresponds to the \c VPDPBSSDS instruction.
98 ///
/// \param __W
/// A 128-bit vector of [4 x int] used as the accumulator input.
99 /// \param __A
100 /// A 128-bit vector of [16 x char].
101 /// \param __B
102 /// A 128-bit vector of [16 x char].
103 /// \returns
104 /// A 128-bit vector of [4 x int].
106 /// \code{.operation}
107 /// FOR j := 0 to 3
108 /// tmp1.word := SignExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j])
109 /// tmp2.word := SignExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1])
110 /// tmp3.word := SignExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2])
111 /// tmp4.word := SignExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3])
112 /// dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
113 /// ENDFOR
114 /// dst[MAX:128] := 0
115 /// \endcode
116 #define _mm_dpbssds_epi32(__W, __A, __B) \
117 ((__m128i)__builtin_ia32_vpdpbssds128((__v4si)(__W), (__v4si)(__A), \
118 (__v4si)(__B)))
120 /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
121 /// corresponding signed 8-bit integers in \a __B, producing 4 intermediate
122 /// signed 16-bit results. Sum these 4 results with the corresponding
123 /// 32-bit integer in \a __W with signed saturation, and store the packed
124 /// 32-bit results in \a dst.
126 /// \headerfile <x86intrin.h>
128 /// \code
129 /// _mm256_dpbssds_epi32(__m256i __W, __m256i __A, __m256i __B);
130 /// \endcode
132 /// This intrinsic corresponds to the \c VPDPBSSDS instruction.
/// \param __W
/// A 256-bit vector of [8 x int] used as the accumulator input.
134 /// \param __A
135 /// A 256-bit vector of [32 x char].
136 /// \param __B
137 /// A 256-bit vector of [32 x char].
138 /// \returns
139 /// A 256-bit vector of [8 x int].
141 /// \code{.operation}
142 /// FOR j := 0 to 7
143 /// tmp1.word := SignExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j])
144 /// tmp2.word := SignExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1])
145 /// tmp3.word := SignExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2])
146 /// tmp4.word := SignExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3])
147 /// dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
148 /// ENDFOR
149 /// dst[MAX:256] := 0
150 /// \endcode
151 #define _mm256_dpbssds_epi32(__W, __A, __B) \
152 ((__m256i)__builtin_ia32_vpdpbssds256((__v8si)(__W), (__v8si)(__A), \
153 (__v8si)(__B)))
155 /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
156 /// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
157 /// signed 16-bit results. Sum these 4 results with the corresponding
158 /// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
160 /// \headerfile <x86intrin.h>
162 /// \code
163 /// _mm_dpbsud_epi32(__m128i __W, __m128i __A, __m128i __B);
164 /// \endcode
166 /// This intrinsic corresponds to the \c VPDPBSUD instruction.
/// \param __W
/// A 128-bit vector of [4 x int] used as the accumulator input.
168 /// \param __A
169 /// A 128-bit vector of [16 x char].
170 /// \param __B
171 /// A 128-bit vector of [16 x unsigned char].
172 /// \returns
173 /// A 128-bit vector of [4 x int].
175 /// \code{.operation}
176 /// FOR j := 0 to 3
177 /// tmp1.word := Signed(SignExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j]))
178 /// tmp2.word := Signed(SignExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1]))
179 /// tmp3.word := Signed(SignExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2]))
180 /// tmp4.word := Signed(SignExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3]))
181 /// dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
182 /// ENDFOR
183 /// dst[MAX:128] := 0
184 /// \endcode
185 #define _mm_dpbsud_epi32(__W, __A, __B) \
186 ((__m128i)__builtin_ia32_vpdpbsud128((__v4si)(__W), (__v4si)(__A), \
187 (__v4si)(__B)))
189 /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
190 /// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
191 /// signed 16-bit results. Sum these 4 results with the corresponding
192 /// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
194 /// \headerfile <x86intrin.h>
196 /// \code
197 /// _mm256_dpbsud_epi32(__m256i __W, __m256i __A, __m256i __B);
198 /// \endcode
200 /// This intrinsic corresponds to the \c VPDPBSUD instruction.
/// \param __W
/// A 256-bit vector of [8 x int] used as the accumulator input.
202 /// \param __A
203 /// A 256-bit vector of [32 x char].
204 /// \param __B
205 /// A 256-bit vector of [32 x unsigned char].
206 /// \returns
207 /// A 256-bit vector of [8 x int].
209 /// \code{.operation}
210 /// FOR j := 0 to 7
211 /// tmp1.word := Signed(SignExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j]))
212 /// tmp2.word := Signed(SignExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1]))
213 /// tmp3.word := Signed(SignExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2]))
214 /// tmp4.word := Signed(SignExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3]))
215 /// dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
216 /// ENDFOR
217 /// dst[MAX:256] := 0
218 /// \endcode
219 #define _mm256_dpbsud_epi32(__W, __A, __B) \
220 ((__m256i)__builtin_ia32_vpdpbsud256((__v8si)(__W), (__v8si)(__A), \
221 (__v8si)(__B)))
223 /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
224 /// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
225 /// signed 16-bit results. Sum these 4 results with the corresponding
226 /// 32-bit integer in \a __W with signed saturation, and store the packed
227 /// 32-bit results in \a dst.
229 /// \headerfile <x86intrin.h>
231 /// \code
232 /// _mm_dpbsuds_epi32(__m128i __W, __m128i __A, __m128i __B);
233 /// \endcode
235 /// This intrinsic corresponds to the \c VPDPBSUDS instruction.
/// \param __W
/// A 128-bit vector of [4 x int] used as the accumulator input.
237 /// \param __A
238 /// A 128-bit vector of [16 x char].
239 /// \param __B
240 /// A 128-bit vector of [16 x unsigned char].
241 /// \returns
242 /// A 128-bit vector of [4 x int].
244 /// \code{.operation}
245 /// FOR j := 0 to 3
246 /// tmp1.word := Signed(SignExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j]))
247 /// tmp2.word := Signed(SignExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1]))
248 /// tmp3.word := Signed(SignExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2]))
249 /// tmp4.word := Signed(SignExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3]))
250 /// dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
251 /// ENDFOR
252 /// dst[MAX:128] := 0
253 /// \endcode
254 #define _mm_dpbsuds_epi32(__W, __A, __B) \
255 ((__m128i)__builtin_ia32_vpdpbsuds128((__v4si)(__W), (__v4si)(__A), \
256 (__v4si)(__B)))
258 /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
259 /// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
260 /// signed 16-bit results. Sum these 4 results with the corresponding
261 /// 32-bit integer in \a __W with signed saturation, and store the packed
262 /// 32-bit results in \a dst.
264 /// \headerfile <x86intrin.h>
266 /// \code
267 /// _mm256_dpbsuds_epi32(__m256i __W, __m256i __A, __m256i __B);
268 /// \endcode
270 /// This intrinsic corresponds to the \c VPDPBSUDS instruction.
/// \param __W
/// A 256-bit vector of [8 x int] used as the accumulator input.
272 /// \param __A
273 /// A 256-bit vector of [32 x char].
274 /// \param __B
275 /// A 256-bit vector of [32 x unsigned char].
276 /// \returns
277 /// A 256-bit vector of [8 x int].
279 /// \code{.operation}
280 /// FOR j := 0 to 7
281 /// tmp1.word := Signed(SignExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j]))
282 /// tmp2.word := Signed(SignExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1]))
283 /// tmp3.word := Signed(SignExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2]))
284 /// tmp4.word := Signed(SignExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3]))
285 /// dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
286 /// ENDFOR
287 /// dst[MAX:256] := 0
288 /// \endcode
289 #define _mm256_dpbsuds_epi32(__W, __A, __B) \
290 ((__m256i)__builtin_ia32_vpdpbsuds256((__v8si)(__W), (__v8si)(__A), \
291 (__v8si)(__B)))
293 /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with
294 /// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
295 /// unsigned 16-bit results. Sum these 4 results with the corresponding
296 /// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
298 /// \headerfile <x86intrin.h>
300 /// \code
301 /// _mm_dpbuud_epi32(__m128i __W, __m128i __A, __m128i __B);
302 /// \endcode
304 /// This intrinsic corresponds to the \c VPDPBUUD instruction.
/// \param __W
/// A 128-bit vector of [4 x int] used as the accumulator input.
306 /// \param __A
307 /// A 128-bit vector of [16 x unsigned char].
308 /// \param __B
309 /// A 128-bit vector of [16 x unsigned char].
310 /// \returns
311 /// A 128-bit vector of [4 x int].
313 /// \code{.operation}
314 /// FOR j := 0 to 3
315 /// tmp1.word := ZeroExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j])
316 /// tmp2.word := ZeroExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1])
317 /// tmp3.word := ZeroExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2])
318 /// tmp4.word := ZeroExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3])
319 /// dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
320 /// ENDFOR
321 /// dst[MAX:128] := 0
322 /// \endcode
323 #define _mm_dpbuud_epi32(__W, __A, __B) \
324 ((__m128i)__builtin_ia32_vpdpbuud128((__v4si)(__W), (__v4si)(__A), \
325 (__v4si)(__B)))
327 /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with
328 /// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
329 /// unsigned 16-bit results. Sum these 4 results with the corresponding
330 /// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
332 /// \headerfile <x86intrin.h>
334 /// \code
335 /// _mm256_dpbuud_epi32(__m256i __W, __m256i __A, __m256i __B);
336 /// \endcode
338 /// This intrinsic corresponds to the \c VPDPBUUD instruction.
/// \param __W
/// A 256-bit vector of [8 x int] used as the accumulator input.
340 /// \param __A
341 /// A 256-bit vector of [32 x unsigned char].
342 /// \param __B
343 /// A 256-bit vector of [32 x unsigned char].
344 /// \returns
345 /// A 256-bit vector of [8 x int].
347 /// \code{.operation}
348 /// FOR j := 0 to 7
349 /// tmp1.word := ZeroExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j])
350 /// tmp2.word := ZeroExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1])
351 /// tmp3.word := ZeroExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2])
352 /// tmp4.word := ZeroExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3])
353 /// dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
354 /// ENDFOR
355 /// dst[MAX:256] := 0
356 /// \endcode
357 #define _mm256_dpbuud_epi32(__W, __A, __B) \
358 ((__m256i)__builtin_ia32_vpdpbuud256((__v8si)(__W), (__v8si)(__A), \
359 (__v8si)(__B)))
361 /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with
362 /// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
363 /// unsigned 16-bit results. Sum these 4 results with the corresponding
364 /// 32-bit integer in \a __W with unsigned saturation, and store the packed
365 /// 32-bit results in \a dst.
367 /// \headerfile <x86intrin.h>
369 /// \code
370 /// _mm_dpbuuds_epi32(__m128i __W, __m128i __A, __m128i __B);
371 /// \endcode
373 /// This intrinsic corresponds to the \c VPDPBUUDS instruction.
/// \param __W
/// A 128-bit vector of [4 x int] used as the accumulator input.
375 /// \param __A
376 /// A 128-bit vector of [16 x unsigned char].
377 /// \param __B
378 /// A 128-bit vector of [16 x unsigned char].
379 /// \returns
380 /// A 128-bit vector of [4 x int].
382 /// \code{.operation}
383 /// FOR j := 0 to 3
384 /// tmp1.word := ZeroExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j])
385 /// tmp2.word := ZeroExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1])
386 /// tmp3.word := ZeroExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2])
387 /// tmp4.word := ZeroExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3])
388 /// dst.dword[j] := UNSIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
389 /// ENDFOR
390 /// dst[MAX:128] := 0
391 /// \endcode
392 #define _mm_dpbuuds_epi32(__W, __A, __B) \
393 ((__m128i)__builtin_ia32_vpdpbuuds128((__v4si)(__W), (__v4si)(__A), \
394 (__v4si)(__B)))
/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with
396 /// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
397 /// unsigned 16-bit results. Sum these 4 results with the corresponding
398 /// 32-bit integer in \a __W with unsigned saturation, and store the packed
399 /// 32-bit results in \a dst.
401 /// \headerfile <x86intrin.h>
403 /// \code
404 /// _mm256_dpbuuds_epi32(__m256i __W, __m256i __A, __m256i __B);
405 /// \endcode
407 /// This intrinsic corresponds to the \c VPDPBUUDS instruction.
/// \param __W
/// A 256-bit vector of [8 x int] used as the accumulator input.
409 /// \param __A
410 /// A 256-bit vector of [32 x unsigned char].
411 /// \param __B
412 /// A 256-bit vector of [32 x unsigned char].
413 /// \returns
414 /// A 256-bit vector of [8 x int].
416 /// \code{.operation}
417 /// FOR j := 0 to 7
418 /// tmp1.word := ZeroExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j])
419 /// tmp2.word := ZeroExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1])
420 /// tmp3.word := ZeroExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2])
421 /// tmp4.word := ZeroExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3])
422 /// dst.dword[j] := UNSIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
423 /// ENDFOR
424 /// dst[MAX:256] := 0
425 /// \endcode
426 #define _mm256_dpbuuds_epi32(__W, __A, __B) \
427 ((__m256i)__builtin_ia32_vpdpbuuds256((__v8si)(__W), (__v8si)(__A), \
428 (__v8si)(__B)))
430 #endif // __AVXVNNIINT8INTRIN_H