// clang/lib/Headers/avxvnniint8intrin.h

/*===-------- avxvnniint8intrin.h - AVXVNNIINT8 intrinsics -----------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */
   9 #ifndef __IMMINTRIN_H
  10 #error                                                                         \
  11     "Never use <avxvnniint8intrin.h> directly; include <immintrin.h> instead."
  12 #endif
  13
  14 #ifndef __AVXVNNIINT8INTRIN_H
  15 #define __AVXVNNIINT8INTRIN_H
  16
  17 /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
  18 ///    corresponding signed 8-bit integers in \a __B, producing 4 intermediate
  19 ///    signed 16-bit results. Sum these 4 results with the corresponding
  20 ///    32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
  21 ///
  22 /// \headerfile <x86intrin.h>
  23 ///
  24 /// \code
  25 /// _mm_dpbssd_epi32(__m128i __W, __m128i __A, __m128i __B);
  26 /// \endcode
  27 ///
  28 /// This intrinsic corresponds to the \c VPDPBSSD instruction.
  29 ///
  30 /// \param __A
  31 ///    A 128-bit vector of [16 x char].
  32 /// \param __B
  33 ///    A 128-bit vector of [16 x char].
  34 /// \returns
  35 ///    A 128-bit vector of [4 x int].
  36 ///
  37 /// \code{.operation}
  38 /// FOR j := 0 to 3
  39 ///     tmp1.word := SignExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j])
  40 ///     tmp2.word := SignExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1])
  41 ///     tmp3.word := SignExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2])
  42 ///     tmp4.word := SignExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3])
  43 ///     dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
  44 /// ENDFOR
  45 /// dst[MAX:128] := 0
  46 /// \endcode
  47 #define _mm_dpbssd_epi32(__W, __A, __B)                                        \
  48   ((__m128i)__builtin_ia32_vpdpbssd128((__v4si)(__W), (__v4si)(__A),           \
  49                                        (__v4si)(__B)))
  50
  51 /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
  52 ///    corresponding signed 8-bit integers in \a __B, producing 4 intermediate
  53 ///    signed 16-bit results. Sum these 4 results with the corresponding
  54 ///    32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
  55 ///
  56 /// \headerfile <x86intrin.h>
  57 ///
  58 /// \code
  59 /// _mm256_dpbssd_epi32(__m256i __W, __m256i __A, __m256i __B);
  60 /// \endcode
  61 ///
  62 /// This intrinsic corresponds to the \c VPDPBSSD instruction.
  63 ///
  64 /// \param __A
  65 ///    A 256-bit vector of [32 x char].
  66 /// \param __B
  67 ///    A 256-bit vector of [32 x char].
  68 /// \returns
  69 ///    A 256-bit vector of [8 x int].
  70 ///
  71 /// \code{.operation}
  72 /// FOR j := 0 to 7
  73 ///     tmp1.word := SignExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j])
  74 ///     tmp2.word := SignExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1])
  75 ///     tmp3.word := SignExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2])
  76 ///     tmp4.word := SignExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3])
  77 ///     dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
  78 /// ENDFOR
  79 /// dst[MAX:256] := 0
  80 /// \endcode
  81 #define _mm256_dpbssd_epi32(__W, __A, __B)                                     \
  82   ((__m256i)__builtin_ia32_vpdpbssd256((__v8si)(__W), (__v8si)(__A),           \
  83                                        (__v8si)(__B)))
  84
  85 /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
  86 ///    corresponding signed 8-bit integers in \a __B, producing 4 intermediate
  87 ///    signed 16-bit results. Sum these 4 results with the corresponding
  88 ///    32-bit integer in \a __W with signed saturation, and store the packed
  89 ///    32-bit results in \a dst.
  90 ///
  91 /// \headerfile <x86intrin.h>
  92 ///
  93 /// \code
  94 /// _mm_dpbssds_epi32( __m128i __W, __m128i __A, __m128i __B);
  95 /// \endcode
  96 ///
  97 /// This intrinsic corresponds to the \c VPDPBSSD instruction.
  98 ///
  99 /// \param __A
 100 ///    A 128-bit vector of [16 x char].
 101 /// \param __B
 102 ///    A 128-bit vector of [16 x char].
 103 /// \returns
 104 ///    A 128-bit vector of [4 x int].
 105 ///
 106 /// \code{.operation}
 107 /// FOR j := 0 to 3
 108 ///     tmp1.word := SignExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j])
 109 ///     tmp2.word := SignExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1])
 110 ///     tmp3.word := SignExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2])
 111 ///     tmp4.word := SignExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3])
 112 ///     dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
 113 /// ENDFOR
 114 /// dst[MAX:128] := 0
 115 /// \endcode
 116 #define _mm_dpbssds_epi32(__W, __A, __B)                                       \
 117   ((__m128i)__builtin_ia32_vpdpbssds128((__v4si)(__W), (__v4si)(__A),          \
 118                                         (__v4si)(__B)))
 119
 120 /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
 121 ///    corresponding signed 8-bit integers in \a __B, producing 4 intermediate
 122 ///    signed 16-bit results. Sum these 4 results with the corresponding
 123 ///    32-bit integer in \a __W with signed saturation, and store the packed
 124 ///    32-bit results in \a dst.
 125 ///
 126 /// \headerfile <x86intrin.h>
 127 ///
 128 /// \code
 129 /// _mm256_dpbssds_epi32(__m256i __W, __m256i __A, __m256i __B);
 130 /// \endcode
 131 ///
 132 /// This intrinsic corresponds to the \c VPDPBSSD instruction.
 133 ///
 134 /// \param __A
 135 ///    A 256-bit vector of [32 x char].
 136 /// \param __B
 137 ///    A 256-bit vector of [32 x char].
 138 /// \returns
 139 ///    A 256-bit vector of [8 x int].
 140 ///
 141 /// \code{.operation}
 142 /// FOR j := 0 to 7
 143 ///     tmp1.word := SignExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j])
 144 ///     tmp2.word := SignExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1])
 145 ///     tmp3.word := SignExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2])
 146 ///     tmp4.word := SignExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3])
 147 ///     dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
 148 /// ENDFOR
 149 /// dst[MAX:256] := 0
 150 /// \endcode
 151 #define _mm256_dpbssds_epi32(__W, __A, __B)                                    \
 152   ((__m256i)__builtin_ia32_vpdpbssds256((__v8si)(__W), (__v8si)(__A),          \
 153                                         (__v8si)(__B)))
 154
 155 /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
 156 ///    corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
 157 ///    signed 16-bit results. Sum these 4 results with the corresponding
 158 ///    32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
 159 ///
 160 /// \headerfile <x86intrin.h>
 161 ///
 162 /// \code
 163 /// _mm_dpbsud_epi32(__m128i __W, __m128i __A, __m128i __B);
 164 /// \endcode
 165 ///
 166 /// This intrinsic corresponds to the \c VPDPBSSD instruction.
 167 ///
 168 /// \param __A
 169 ///    A 128-bit vector of [16 x char].
 170 /// \param __B
 171 ///    A 128-bit vector of [16 x unsigned char].
 172 /// \returns
 173 ///    A 128-bit vector of [4 x int].
 174 ///
 175 /// \code{.operation}
 176 /// FOR j := 0 to 3
 177 ///     tmp1.word := Signed(SignExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j]))
 178 ///     tmp2.word := Signed(SignExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1]))
 179 ///     tmp3.word := Signed(SignExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2]))
 180 ///     tmp4.word := Signed(SignExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3]))
 181 ///     dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
 182 /// ENDFOR
 183 /// dst[MAX:128] := 0
 184 /// \endcode
 185 #define _mm_dpbsud_epi32(__W, __A, __B)                                        \
 186   ((__m128i)__builtin_ia32_vpdpbsud128((__v4si)(__W), (__v4si)(__A),           \
 187                                        (__v4si)(__B)))
 188
 189 /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
 190 ///    corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
 191 ///    signed 16-bit results. Sum these 4 results with the corresponding
 192 ///    32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
 193 ///
 194 /// \headerfile <x86intrin.h>
 195 ///
 196 /// \code
 197 /// _mm256_dpbsud_epi32(__m256i __W, __m256i __A, __m256i __B);
 198 /// \endcode
 199 ///
 200 /// This intrinsic corresponds to the \c VPDPBSSD instruction.
 201 ///
 202 /// \param __A
 203 ///    A 256-bit vector of [32 x char].
 204 /// \param __B
 205 ///    A 256-bit vector of [32 x unsigned char].
 206 /// \returns
 207 ///    A 256-bit vector of [8 x int].
 208 ///
 209 /// \code{.operation}
 210 /// FOR j := 0 to 7
 211 ///     tmp1.word := Signed(SignExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j]))
 212 ///     tmp2.word := Signed(SignExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1]))
 213 ///     tmp3.word := Signed(SignExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2]))
 214 ///     tmp4.word := Signed(SignExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3]))
 215 ///     dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
 216 /// ENDFOR
 217 /// dst[MAX:256] := 0
 218 /// \endcode
 219 #define _mm256_dpbsud_epi32(__W, __A, __B)                                     \
 220   ((__m256i)__builtin_ia32_vpdpbsud256((__v8si)(__W), (__v8si)(__A),           \
 221                                        (__v8si)(__B)))
 222
 223 /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
 224 ///    corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
 225 ///    signed 16-bit results. Sum these 4 results with the corresponding
 226 ///    32-bit integer in \a __W with signed saturation, and store the packed
 227 ///    32-bit results in \a dst.
 228 ///
 229 /// \headerfile <x86intrin.h>
 230 ///
 231 /// \code
 232 /// _mm_dpbsuds_epi32( __m128i __W, __m128i __A, __m128i __B);
 233 /// \endcode
 234 ///
 235 /// This intrinsic corresponds to the \c VPDPBSSD instruction.
 236 ///
 237 /// \param __A
 238 ///    A 128-bit vector of [16 x char].
 239 /// \param __B
 240 ///    A 128-bit vector of [16 x unsigned char].
 241 /// \returns
 242 ///    A 128-bit vector of [4 x int].
 243 ///
 244 /// \code{.operation}
 245 /// FOR j := 0 to 3
 246 ///     tmp1.word := Signed(SignExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j]))
 247 ///     tmp2.word := Signed(SignExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1]))
 248 ///     tmp3.word := Signed(SignExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2]))
 249 ///     tmp4.word := Signed(SignExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3]))
 250 ///     dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
 251 /// ENDFOR
 252 /// dst[MAX:128] := 0
 253 /// \endcode
 254 #define _mm_dpbsuds_epi32(__W, __A, __B)                                       \
 255   ((__m128i)__builtin_ia32_vpdpbsuds128((__v4si)(__W), (__v4si)(__A),          \
 256                                         (__v4si)(__B)))
 257
 258 /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
 259 ///    corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
 260 ///    signed 16-bit results. Sum these 4 results with the corresponding
 261 ///    32-bit integer in \a __W with signed saturation, and store the packed
 262 ///    32-bit results in \a dst.
 263 ///
 264 /// \headerfile <x86intrin.h>
 265 ///
 266 /// \code
 267 /// _mm256_dpbsuds_epi32(__m256i __W, __m256i __A, __m256i __B);
 268 /// \endcode
 269 ///
 270 /// This intrinsic corresponds to the \c VPDPBSSD instruction.
 271 ///
 272 /// \param __A
 273 ///    A 256-bit vector of [32 x char].
 274 /// \param __B
 275 ///    A 256-bit vector of [32 x unsigned char].
 276 /// \returns
 277 ///    A 256-bit vector of [8 x int].
 278 ///
 279 /// \code{.operation}
 280 /// FOR j := 0 to 7
 281 ///     tmp1.word := Signed(SignExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j]))
 282 ///     tmp2.word := Signed(SignExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1]))
 283 ///     tmp3.word := Signed(SignExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2]))
 284 ///     tmp4.word := Signed(SignExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3]))
 285 ///     dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
 286 /// ENDFOR
 287 /// dst[MAX:256] := 0
 288 /// \endcode
 289 #define _mm256_dpbsuds_epi32(__W, __A, __B)                                    \
 290   ((__m256i)__builtin_ia32_vpdpbsuds256((__v8si)(__W), (__v8si)(__A),          \
 291                                         (__v8si)(__B)))
 292
 293 /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with
 294 ///    corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
 295 ///    signed 16-bit results. Sum these 4 results with the corresponding
 296 ///    32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
 297 ///
 298 /// \headerfile <x86intrin.h>
 299 ///
 300 /// \code
 301 /// _mm_dpbuud_epi32(__m128i __W, __m128i __A, __m128i __B);
 302 /// \endcode
 303 ///
 304 /// This intrinsic corresponds to the \c VPDPBSSD instruction.
 305 ///
 306 /// \param __A
 307 ///    A 128-bit vector of [16 x unsigned char].
 308 /// \param __B
 309 ///    A 128-bit vector of [16 x unsigned char].
 310 /// \returns
 311 ///    A 128-bit vector of [4 x int].
 312 ///
 313 /// \code{.operation}
 314 /// FOR j := 0 to 3
 315 ///     tmp1.word := ZeroExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j])
 316 ///     tmp2.word := ZeroExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1])
 317 ///     tmp3.word := ZeroExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2])
 318 ///     tmp4.word := ZeroExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3])
 319 ///     dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
 320 /// ENDFOR
 321 /// dst[MAX:128] := 0
 322 /// \endcode
 323 #define _mm_dpbuud_epi32(__W, __A, __B)                                        \
 324   ((__m128i)__builtin_ia32_vpdpbuud128((__v4si)(__W), (__v4si)(__A),           \
 325                                        (__v4si)(__B)))
 326
 327 /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with
 328 ///    corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
 329 ///    signed 16-bit results. Sum these 4 results with the corresponding
 330 ///    32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
 331 ///
 332 /// \headerfile <x86intrin.h>
 333 ///
 334 /// \code
 335 /// _mm256_dpbuud_epi32(__m256i __W, __m256i __A, __m256i __B);
 336 /// \endcode
 337 ///
 338 /// This intrinsic corresponds to the \c VPDPBSSD instruction.
 339 ///
 340 /// \param __A
 341 ///    A 256-bit vector of [32 x unsigned char].
 342 /// \param __B
 343 ///    A 256-bit vector of [32 x unsigned char].
 344 /// \returns
 345 ///    A 256-bit vector of [8 x int].
 346 ///
 347 /// \code{.operation}
 348 /// FOR j := 0 to 7
 349 ///     tmp1.word := ZeroExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j])
 350 ///     tmp2.word := ZeroExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1])
 351 ///     tmp3.word := ZeroExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2])
 352 ///     tmp4.word := ZeroExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3])
 353 ///     dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
 354 /// ENDFOR
 355 /// dst[MAX:256] := 0
 356 /// \endcode
 357 #define _mm256_dpbuud_epi32(__W, __A, __B)                                     \
 358   ((__m256i)__builtin_ia32_vpdpbuud256((__v8si)(__W), (__v8si)(__A),           \
 359                                        (__v8si)(__B)))
 360
 361 /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with
 362 ///    corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
 363 ///    signed 16-bit results. Sum these 4 results with the corresponding
 364 ///    32-bit integer in \a __W with signed saturation, and store the packed
 365 ///    32-bit results in \a dst.
 366 ///
 367 /// \headerfile <x86intrin.h>
 368 ///
 369 /// \code
 370 /// _mm_dpbuuds_epi32( __m128i __W, __m128i __A, __m128i __B);
 371 /// \endcode
 372 ///
 373 /// This intrinsic corresponds to the \c VPDPBUUDS instruction.
 374 ///
 375 /// \param __A
 376 ///    A 128-bit vector of [16 x unsigned char].
 377 /// \param __B
 378 ///    A 128-bit vector of [16 x unsigned char].
 379 /// \returns
 380 ///    A 128-bit vector of [4 x int].
 381 ///
 382 /// \code{.operation}
 383 /// FOR j := 0 to 3
 384 ///     tmp1.word := ZeroExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j])
 385 ///     tmp2.word := ZeroExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1])
 386 ///     tmp3.word := ZeroExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2])
 387 ///     tmp4.word := ZeroExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3])
 388 ///     dst.dword[j] := UNSIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
 389 /// ENDFOR
 390 /// dst[MAX:128] := 0
 391 /// \endcode
 392 #define _mm_dpbuuds_epi32(__W, __A, __B)                                       \
 393   ((__m128i)__builtin_ia32_vpdpbuuds128((__v4si)(__W), (__v4si)(__A),          \
 394                                         (__v4si)(__B)))
 395
 396 ///    corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
 397 ///    signed 16-bit results. Sum these 4 results with the corresponding
 398 ///    32-bit integer in \a __W with signed saturation, and store the packed
 399 ///    32-bit results in \a dst.
 400 ///
 401 /// \headerfile <x86intrin.h>
 402 ///
 403 /// \code
 404 /// _mm256_dpbuuds_epi32(__m256i __W, __m256i __A, __m256i __B);
 405 /// \endcode
 406 ///
 407 /// This intrinsic corresponds to the \c VPDPBUUDS instruction.
 408 ///
 409 /// \param __A
 410 ///    A 256-bit vector of [32 x unsigned char].
 411 /// \param __B
 412 ///    A 256-bit vector of [32 x unsigned char].
 413 /// \returns
 414 ///    A 256-bit vector of [8 x int].
 415 ///
 416 /// \code{.operation}
 417 /// FOR j := 0 to 7
 418 ///     tmp1.word := ZeroExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j])
 419 ///     tmp2.word := ZeroExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1])
 420 ///     tmp3.word := ZeroExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2])
 421 ///     tmp4.word := ZeroExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3])
 422 ///     dst.dword[j] := UNSIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
 423 /// ENDFOR
 424 /// dst[MAX:256] := 0
 425 /// \endcode
 426 #define _mm256_dpbuuds_epi32(__W, __A, __B)                                    \
 427   ((__m256i)__builtin_ia32_vpdpbuuds256((__v8si)(__W), (__v8si)(__A),          \
 428                                         (__v8si)(__B)))
 429
 430 #endif // __AVXVNNIINT8INTRIN_H