[AMDGPU][AsmParser][NFC] Get rid of custom default operand handlers.
[llvm-project.git] / clang / lib / Headers / avxvnniint8intrin.h
blobb0b6cb853f713f40d9b126efcc13d60153f83ca7
1 /*===-------- avxvnniint8intrin.h - AVXVNNIINT8 intrinsics -----------===
3 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 * See https://llvm.org/LICENSE.txt for license information.
5 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 *===-----------------------------------------------------------------------===
8 */
9 #ifndef __IMMINTRIN_H
10 #error \
11 "Never use <avxvnniint8intrin.h> directly; include <immintrin.h> instead."
12 #endif
14 #ifndef __AVXVNNIINT8INTRIN_H
15 #define __AVXVNNIINT8INTRIN_H
17 /* Define the default attributes for the functions in this file. */
18 #define __DEFAULT_FN_ATTRS256 \
19 __attribute__((__always_inline__, __nodebug__, __target__("avxvnniint8"), \
20 __min_vector_width__(256)))
21 #define __DEFAULT_FN_ATTRS128 \
22 __attribute__((__always_inline__, __nodebug__, __target__("avxvnniint8"), \
23 __min_vector_width__(128)))
25 /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
26 /// corresponding signed 8-bit integers in \a __B, producing 4 intermediate
27 /// signed 16-bit results. Sum these 4 results with the corresponding
28 /// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
29 ///
30 /// \headerfile <x86intrin.h>
31 ///
32 /// \code
33 /// _mm_dpbssd_epi32(__m128i __W, __m128i __A, __m128i __B);
34 /// \endcode
35 ///
36 /// This intrinsic corresponds to the \c VPDPBSSD instruction.
37 ///
38 /// \param __A
39 /// A 128-bit vector of [16 x char].
40 /// \param __B
41 /// A 128-bit vector of [16 x char].
42 /// \returns
43 /// A 128-bit vector of [4 x int].
44 ///
45 /// \code{.operation}
46 /// FOR j := 0 to 3
47 /// tmp1.word := SignExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j])
48 /// tmp2.word := SignExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1])
49 /// tmp3.word := SignExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2])
50 /// tmp4.word := SignExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3])
51 /// dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
52 /// ENDFOR
53 /// dst[MAX:128] := 0
54 /// \endcode
55 static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbssd_epi32(__m128i __W,
56 __m128i __A,
57 __m128i __B) {
58 return (__m128i)__builtin_ia32_vpdpbssd128((__v4si)__W, (__v4si)__A,
59 (__v4si)__B);
62 /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
63 /// corresponding signed 8-bit integers in \a __B, producing 4 intermediate
64 /// signed 16-bit results. Sum these 4 results with the corresponding
65 /// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
66 ///
67 /// \headerfile <x86intrin.h>
68 ///
69 /// \code
70 /// _mm256_dpbssd_epi32(__m256i __W, __m256i __A, __m256i __B);
71 /// \endcode
72 ///
73 /// This intrinsic corresponds to the \c VPDPBSSD instruction.
74 ///
75 /// \param __A
76 /// A 256-bit vector of [32 x char].
77 /// \param __B
78 /// A 256-bit vector of [32 x char].
79 /// \returns
80 /// A 256-bit vector of [8 x int].
81 ///
82 /// \code{.operation}
83 /// FOR j := 0 to 7
84 /// tmp1.word := SignExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j])
85 /// tmp2.word := SignExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1])
86 /// tmp3.word := SignExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2])
87 /// tmp4.word := SignExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3])
88 /// dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
89 /// ENDFOR
90 /// dst[MAX:256] := 0
91 /// \endcode
92 static __inline__ __m256i __DEFAULT_FN_ATTRS256
93 _mm256_dpbssd_epi32(__m256i __W, __m256i __A, __m256i __B) {
94 return (__m256i)__builtin_ia32_vpdpbssd256((__v8si)__W, (__v8si)__A,
95 (__v8si)__B);
98 /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
99 /// corresponding signed 8-bit integers in \a __B, producing 4 intermediate
100 /// signed 16-bit results. Sum these 4 results with the corresponding
101 /// 32-bit integer in \a __W with signed saturation, and store the packed
102 /// 32-bit results in \a dst.
104 /// \headerfile <x86intrin.h>
106 /// \code
107 /// _mm_dpbssds_epi32( __m128i __W, __m128i __A, __m128i __B);
108 /// \endcode
110 /// This intrinsic corresponds to the \c VPDPBSSD instruction.
112 /// \param __A
113 /// A 128-bit vector of [16 x char].
114 /// \param __B
115 /// A 128-bit vector of [16 x char].
116 /// \returns
117 /// A 128-bit vector of [4 x int].
119 /// \code{.operation}
120 /// FOR j := 0 to 3
121 /// tmp1.word := SignExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j])
122 /// tmp2.word := SignExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1])
123 /// tmp3.word := SignExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2])
124 /// tmp4.word := SignExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3])
125 /// dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
126 /// ENDFOR
127 /// dst[MAX:128] := 0
128 /// \endcode
129 static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbssds_epi32(__m128i __W,
130 __m128i __A,
131 __m128i __B) {
132 return (__m128i)__builtin_ia32_vpdpbssds128((__v4si)__W, (__v4si)__A,
133 (__v4si)__B);
136 /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
137 /// corresponding signed 8-bit integers in \a __B, producing 4 intermediate
138 /// signed 16-bit results. Sum these 4 results with the corresponding
139 /// 32-bit integer in \a __W with signed saturation, and store the packed
140 /// 32-bit results in \a dst.
142 /// \headerfile <x86intrin.h>
144 /// \code
145 /// _mm256_dpbssds_epi32(__m256i __W, __m256i __A, __m256i __B);
146 /// \endcode
148 /// This intrinsic corresponds to the \c VPDPBSSD instruction.
150 /// \param __A
151 /// A 256-bit vector of [32 x char].
152 /// \param __B
153 /// A 256-bit vector of [32 x char].
154 /// \returns
155 /// A 256-bit vector of [8 x int].
157 /// \code{.operation}
158 /// FOR j := 0 to 7
159 /// tmp1.word := SignExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j])
160 /// tmp2.word := SignExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1])
161 /// tmp3.word := SignExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2])
162 /// tmp4.word := SignExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3])
163 /// dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
164 /// ENDFOR
165 /// dst[MAX:256] := 0
166 /// \endcode
167 static __inline__ __m256i __DEFAULT_FN_ATTRS256
168 _mm256_dpbssds_epi32(__m256i __W, __m256i __A, __m256i __B) {
169 return (__m256i)__builtin_ia32_vpdpbssds256((__v8si)__W, (__v8si)__A,
170 (__v8si)__B);
173 /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
174 /// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
175 /// signed 16-bit results. Sum these 4 results with the corresponding
176 /// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
178 /// \headerfile <x86intrin.h>
180 /// \code
181 /// _mm_dpbsud_epi32(__m128i __W, __m128i __A, __m128i __B);
182 /// \endcode
184 /// This intrinsic corresponds to the \c VPDPBSSD instruction.
186 /// \param __A
187 /// A 128-bit vector of [16 x char].
188 /// \param __B
189 /// A 128-bit vector of [16 x unsigned char].
190 /// \returns
191 /// A 128-bit vector of [4 x int].
193 /// \code{.operation}
194 /// FOR j := 0 to 3
195 /// tmp1.word := Signed(SignExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j]))
196 /// tmp2.word := Signed(SignExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1]))
197 /// tmp3.word := Signed(SignExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2]))
198 /// tmp4.word := Signed(SignExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3]))
199 /// dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
200 /// ENDFOR
201 /// dst[MAX:128] := 0
202 /// \endcode
203 static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbsud_epi32(__m128i __W,
204 __m128i __A,
205 __m128i __B) {
206 return (__m128i)__builtin_ia32_vpdpbsud128((__v4si)__W, (__v4si)__A,
207 (__v4si)__B);
210 /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
211 /// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
212 /// signed 16-bit results. Sum these 4 results with the corresponding
213 /// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
215 /// \headerfile <x86intrin.h>
217 /// \code
218 /// _mm256_dpbsud_epi32(__m256i __W, __m256i __A, __m256i __B);
219 /// \endcode
221 /// This intrinsic corresponds to the \c VPDPBSSD instruction.
223 /// \param __A
224 /// A 256-bit vector of [32 x char].
225 /// \param __B
226 /// A 256-bit vector of [32 x unsigned char].
227 /// \returns
228 /// A 256-bit vector of [8 x int].
230 /// \code{.operation}
231 /// FOR j := 0 to 7
232 /// tmp1.word := Signed(SignExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j]))
233 /// tmp2.word := Signed(SignExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1]))
234 /// tmp3.word := Signed(SignExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2]))
235 /// tmp4.word := Signed(SignExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3]))
236 /// dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
237 /// ENDFOR
238 /// dst[MAX:256] := 0
239 /// \endcode
240 static __inline__ __m256i __DEFAULT_FN_ATTRS256
241 _mm256_dpbsud_epi32(__m256i __W, __m256i __A, __m256i __B) {
242 return (__m256i)__builtin_ia32_vpdpbsud256((__v8si)__W, (__v8si)__A,
243 (__v8si)__B);
246 /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
247 /// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
248 /// signed 16-bit results. Sum these 4 results with the corresponding
249 /// 32-bit integer in \a __W with signed saturation, and store the packed
250 /// 32-bit results in \a dst.
252 /// \headerfile <x86intrin.h>
254 /// \code
255 /// _mm_dpbsuds_epi32( __m128i __W, __m128i __A, __m128i __B);
256 /// \endcode
258 /// This intrinsic corresponds to the \c VPDPBSSD instruction.
260 /// \param __A
261 /// A 128-bit vector of [16 x char].
262 /// \param __B
263 /// A 128-bit vector of [16 x unsigned char].
264 /// \returns
265 /// A 128-bit vector of [4 x int].
267 /// \code{.operation}
268 /// FOR j := 0 to 3
269 /// tmp1.word := Signed(SignExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j]))
270 /// tmp2.word := Signed(SignExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1]))
271 /// tmp3.word := Signed(SignExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2]))
272 /// tmp4.word := Signed(SignExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3]))
273 /// dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
274 /// ENDFOR
275 /// dst[MAX:128] := 0
276 /// \endcode
277 static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbsuds_epi32(__m128i __W,
278 __m128i __A,
279 __m128i __B) {
280 return (__m128i)__builtin_ia32_vpdpbsuds128((__v4si)__W, (__v4si)__A,
281 (__v4si)__B);
284 /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
285 /// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
286 /// signed 16-bit results. Sum these 4 results with the corresponding
287 /// 32-bit integer in \a __W with signed saturation, and store the packed
288 /// 32-bit results in \a dst.
290 /// \headerfile <x86intrin.h>
292 /// \code
293 /// _mm256_dpbsuds_epi32(__m256i __W, __m256i __A, __m256i __B);
294 /// \endcode
296 /// This intrinsic corresponds to the \c VPDPBSSD instruction.
298 /// \param __A
299 /// A 256-bit vector of [32 x char].
300 /// \param __B
301 /// A 256-bit vector of [32 x unsigned char].
302 /// \returns
303 /// A 256-bit vector of [8 x int].
305 /// \code{.operation}
306 /// FOR j := 0 to 7
307 /// tmp1.word := Signed(SignExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j]))
308 /// tmp2.word := Signed(SignExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1]))
309 /// tmp3.word := Signed(SignExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2]))
310 /// tmp4.word := Signed(SignExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3]))
311 /// dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
312 /// ENDFOR
313 /// dst[MAX:256] := 0
314 /// \endcode
315 static __inline__ __m256i __DEFAULT_FN_ATTRS256
316 _mm256_dpbsuds_epi32(__m256i __W, __m256i __A, __m256i __B) {
317 return (__m256i)__builtin_ia32_vpdpbsuds256((__v8si)__W, (__v8si)__A,
318 (__v8si)__B);
321 /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with
322 /// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
323 /// signed 16-bit results. Sum these 4 results with the corresponding
324 /// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
326 /// \headerfile <x86intrin.h>
328 /// \code
329 /// _mm_dpbuud_epi32(__m128i __W, __m128i __A, __m128i __B);
330 /// \endcode
332 /// This intrinsic corresponds to the \c VPDPBSSD instruction.
334 /// \param __A
335 /// A 128-bit vector of [16 x unsigned char].
336 /// \param __B
337 /// A 128-bit vector of [16 x unsigned char].
338 /// \returns
339 /// A 128-bit vector of [4 x int].
341 /// \code{.operation}
342 /// FOR j := 0 to 3
343 /// tmp1.word := ZeroExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j])
344 /// tmp2.word := ZeroExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1])
345 /// tmp3.word := ZeroExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2])
346 /// tmp4.word := ZeroExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3])
347 /// dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
348 /// ENDFOR
349 /// dst[MAX:128] := 0
350 /// \endcode
351 static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbuud_epi32(__m128i __W,
352 __m128i __A,
353 __m128i __B) {
354 return (__m128i)__builtin_ia32_vpdpbuud128((__v4si)__W, (__v4si)__A,
355 (__v4si)__B);
358 /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with
359 /// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
360 /// signed 16-bit results. Sum these 4 results with the corresponding
361 /// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
363 /// \headerfile <x86intrin.h>
365 /// \code
366 /// _mm256_dpbuud_epi32(__m256i __W, __m256i __A, __m256i __B);
367 /// \endcode
369 /// This intrinsic corresponds to the \c VPDPBSSD instruction.
371 /// \param __A
372 /// A 256-bit vector of [32 x unsigned char].
373 /// \param __B
374 /// A 256-bit vector of [32 x unsigned char].
375 /// \returns
376 /// A 256-bit vector of [8 x int].
378 /// \code{.operation}
379 /// FOR j := 0 to 7
380 /// tmp1.word := ZeroExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j])
381 /// tmp2.word := ZeroExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1])
382 /// tmp3.word := ZeroExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2])
383 /// tmp4.word := ZeroExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3])
384 /// dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
385 /// ENDFOR
386 /// dst[MAX:256] := 0
387 /// \endcode
388 static __inline__ __m256i __DEFAULT_FN_ATTRS256
389 _mm256_dpbuud_epi32(__m256i __W, __m256i __A, __m256i __B) {
390 return (__m256i)__builtin_ia32_vpdpbuud256((__v8si)__W, (__v8si)__A,
391 (__v8si)__B);
394 /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with
395 /// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
396 /// signed 16-bit results. Sum these 4 results with the corresponding
397 /// 32-bit integer in \a __W with signed saturation, and store the packed
398 /// 32-bit results in \a dst.
400 /// \headerfile <x86intrin.h>
402 /// \code
403 /// _mm_dpbuuds_epi32( __m128i __W, __m128i __A, __m128i __B);
404 /// \endcode
406 /// This intrinsic corresponds to the \c VPDPBUUDS instruction.
408 /// \param __A
409 /// A 128-bit vector of [16 x unsigned char].
410 /// \param __B
411 /// A 128-bit vector of [16 x unsigned char].
412 /// \returns
413 /// A 128-bit vector of [4 x int].
415 /// \code{.operation}
416 /// FOR j := 0 to 3
417 /// tmp1.word := ZeroExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j])
418 /// tmp2.word := ZeroExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1])
419 /// tmp3.word := ZeroExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2])
420 /// tmp4.word := ZeroExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3])
421 /// dst.dword[j] := UNSIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
422 /// ENDFOR
423 /// dst[MAX:128] := 0
424 /// \endcode
425 static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbuuds_epi32(__m128i __W,
426 __m128i __A,
427 __m128i __B) {
428 return (__m128i)__builtin_ia32_vpdpbuuds128((__v4si)__W, (__v4si)__A,
429 (__v4si)__B);
432 /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
433 /// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
434 /// signed 16-bit results. Sum these 4 results with the corresponding
435 /// 32-bit integer in \a __W with signed saturation, and store the packed
436 /// 32-bit results in \a dst.
438 /// \headerfile <x86intrin.h>
440 /// \code
441 /// _mm256_dpbuuds_epi32(__m256i __W, __m256i __A, __m256i __B);
442 /// \endcode
444 /// This intrinsic corresponds to the \c VPDPBUUDS instruction.
446 /// \param __A
447 /// A 256-bit vector of [32 x unsigned char].
448 /// \param __B
449 /// A 256-bit vector of [32 x unsigned char].
450 /// \returns
451 /// A 256-bit vector of [8 x int].
453 /// \code{.operation}
454 /// FOR j := 0 to 7
455 /// tmp1.word := ZeroExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j])
456 /// tmp2.word := ZeroExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1])
457 /// tmp3.word := ZeroExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2])
458 /// tmp4.word := ZeroExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3])
459 /// dst.dword[j] := UNSIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
460 /// ENDFOR
461 /// dst[MAX:256] := 0
462 /// \endcode
463 static __inline__ __m256i __DEFAULT_FN_ATTRS256
464 _mm256_dpbuuds_epi32(__m256i __W, __m256i __A, __m256i __B) {
465 return (__m256i)__builtin_ia32_vpdpbuuds256((__v8si)__W, (__v8si)__A,
466 (__v8si)__B);
468 #undef __DEFAULT_FN_ATTRS128
469 #undef __DEFAULT_FN_ATTRS256
471 #endif // __AVXVNNIINT8INTRIN_H