clang/lib/Headers/avxvnniint8intrin.h

   1 /*===-------- avxvnniint8intrin.h - AVXVNNIINT8 intrinsics -----------===
   2  *
   3  * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
   4  * See https://llvm.org/LICENSE.txt for license information.
   5  * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
   6  *
   7  *===-----------------------------------------------------------------------===
   8  */
   9 #ifndef __IMMINTRIN_H
  10 #error                                                                         \
  11     "Never use <avxvnniint8intrin.h> directly; include <immintrin.h> instead."
  12 #endif
  13
  14 #ifndef __AVXVNNIINT8INTRIN_H
  15 #define __AVXVNNIINT8INTRIN_H
  16
  17 /* Define the default attributes for the functions in this file. */
  18 #define __DEFAULT_FN_ATTRS256                                                  \
  19   __attribute__((__always_inline__, __nodebug__, __target__("avxvnniint8"),    \
  20                  __min_vector_width__(256)))
  21 #define __DEFAULT_FN_ATTRS128                                                  \
  22   __attribute__((__always_inline__, __nodebug__, __target__("avxvnniint8"),    \
  23                  __min_vector_width__(128)))
  24
  25 /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
  26 ///    corresponding signed 8-bit integers in \a __B, producing 4 intermediate
  27 ///    signed 16-bit results. Sum these 4 results with the corresponding
  28 ///    32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
  29 ///
  30 /// \headerfile <x86intrin.h>
  31 ///
  32 /// \code
  33 /// _mm_dpbssd_epi32(__m128i __W, __m128i __A, __m128i __B);
  34 /// \endcode
  35 ///
  36 /// This intrinsic corresponds to the \c VPDPBSSD instruction.
  37 ///
  38 /// \param __A
  39 ///    A 128-bit vector of [16 x char].
  40 /// \param __B
  41 ///    A 128-bit vector of [16 x char].
  42 /// \returns
  43 ///    A 128-bit vector of [4 x int].
  44 ///
  45 /// \code{.operation}
  46 /// FOR j := 0 to 3
  47 ///     tmp1.word := SignExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j])
  48 ///     tmp2.word := SignExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1])
  49 ///     tmp3.word := SignExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2])
  50 ///     tmp4.word := SignExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3])
  51 ///     dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
  52 /// ENDFOR
  53 /// dst[MAX:128] := 0
  54 /// \endcode
  55 static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbssd_epi32(__m128i __W,
  56                                                                  __m128i __A,
  57                                                                  __m128i __B) {
  58   return (__m128i)__builtin_ia32_vpdpbssd128((__v4si)__W, (__v4si)__A,
  59                                              (__v4si)__B);
  60 }
  61
  62 /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
  63 ///    corresponding signed 8-bit integers in \a __B, producing 4 intermediate
  64 ///    signed 16-bit results. Sum these 4 results with the corresponding
  65 ///    32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
  66 ///
  67 /// \headerfile <x86intrin.h>
  68 ///
  69 /// \code
  70 /// _mm256_dpbssd_epi32(__m256i __W, __m256i __A, __m256i __B);
  71 /// \endcode
  72 ///
  73 /// This intrinsic corresponds to the \c VPDPBSSD instruction.
  74 ///
  75 /// \param __A
  76 ///    A 256-bit vector of [32 x char].
  77 /// \param __B
  78 ///    A 256-bit vector of [32 x char].
  79 /// \returns
  80 ///    A 256-bit vector of [8 x int].
  81 ///
  82 /// \code{.operation}
  83 /// FOR j := 0 to 7
  84 ///     tmp1.word := SignExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j])
  85 ///     tmp2.word := SignExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1])
  86 ///     tmp3.word := SignExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2])
  87 ///     tmp4.word := SignExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3])
  88 ///     dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
  89 /// ENDFOR
  90 /// dst[MAX:256] := 0
  91 /// \endcode
  92 static __inline__ __m256i __DEFAULT_FN_ATTRS256
  93 _mm256_dpbssd_epi32(__m256i __W, __m256i __A, __m256i __B) {
  94   return (__m256i)__builtin_ia32_vpdpbssd256((__v8si)__W, (__v8si)__A,
  95                                              (__v8si)__B);
  96 }
  97
  98 /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
  99 ///    corresponding signed 8-bit integers in \a __B, producing 4 intermediate
 100 ///    signed 16-bit results. Sum these 4 results with the corresponding
 101 ///    32-bit integer in \a __W with signed saturation, and store the packed
 102 ///    32-bit results in \a dst.
 103 ///
 104 /// \headerfile <x86intrin.h>
 105 ///
 106 /// \code
 107 /// _mm_dpbssds_epi32( __m128i __W, __m128i __A, __m128i __B);
 108 /// \endcode
 109 ///
 110 /// This intrinsic corresponds to the \c VPDPBSSD instruction.
 111 ///
 112 /// \param __A
 113 ///    A 128-bit vector of [16 x char].
 114 /// \param __B
 115 ///    A 128-bit vector of [16 x char].
 116 /// \returns
 117 ///    A 128-bit vector of [4 x int].
 118 ///
 119 /// \code{.operation}
 120 /// FOR j := 0 to 3
 121 ///     tmp1.word := SignExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j])
 122 ///     tmp2.word := SignExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1])
 123 ///     tmp3.word := SignExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2])
 124 ///     tmp4.word := SignExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3])
 125 ///     dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
 126 /// ENDFOR
 127 /// dst[MAX:128] := 0
 128 /// \endcode
 129 static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbssds_epi32(__m128i __W,
 130                                                                   __m128i __A,
 131                                                                   __m128i __B) {
 132   return (__m128i)__builtin_ia32_vpdpbssds128((__v4si)__W, (__v4si)__A,
 133                                               (__v4si)__B);
 134 }
 135
 136 /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
 137 ///    corresponding signed 8-bit integers in \a __B, producing 4 intermediate
 138 ///    signed 16-bit results. Sum these 4 results with the corresponding
 139 ///    32-bit integer in \a __W with signed saturation, and store the packed
 140 ///    32-bit results in \a dst.
 141 ///
 142 /// \headerfile <x86intrin.h>
 143 ///
 144 /// \code
 145 /// _mm256_dpbssds_epi32(__m256i __W, __m256i __A, __m256i __B);
 146 /// \endcode
 147 ///
 148 /// This intrinsic corresponds to the \c VPDPBSSD instruction.
 149 ///
 150 /// \param __A
 151 ///    A 256-bit vector of [32 x char].
 152 /// \param __B
 153 ///    A 256-bit vector of [32 x char].
 154 /// \returns
 155 ///    A 256-bit vector of [8 x int].
 156 ///
 157 /// \code{.operation}
 158 /// FOR j := 0 to 7
 159 ///     tmp1.word := SignExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j])
 160 ///     tmp2.word := SignExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1])
 161 ///     tmp3.word := SignExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2])
 162 ///     tmp4.word := SignExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3])
 163 ///     dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
 164 /// ENDFOR
 165 /// dst[MAX:256] := 0
 166 /// \endcode
 167 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 168 _mm256_dpbssds_epi32(__m256i __W, __m256i __A, __m256i __B) {
 169   return (__m256i)__builtin_ia32_vpdpbssds256((__v8si)__W, (__v8si)__A,
 170                                               (__v8si)__B);
 171 }
 172
 173 /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
 174 ///    corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
 175 ///    signed 16-bit results. Sum these 4 results with the corresponding
 176 ///    32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
 177 ///
 178 /// \headerfile <x86intrin.h>
 179 ///
 180 /// \code
 181 /// _mm_dpbsud_epi32(__m128i __W, __m128i __A, __m128i __B);
 182 /// \endcode
 183 ///
 184 /// This intrinsic corresponds to the \c VPDPBSSD instruction.
 185 ///
 186 /// \param __A
 187 ///    A 128-bit vector of [16 x char].
 188 /// \param __B
 189 ///    A 128-bit vector of [16 x unsigned char].
 190 /// \returns
 191 ///    A 128-bit vector of [4 x int].
 192 ///
 193 /// \code{.operation}
 194 /// FOR j := 0 to 3
 195 ///     tmp1.word := Signed(SignExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j]))
 196 ///     tmp2.word := Signed(SignExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1]))
 197 ///     tmp3.word := Signed(SignExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2]))
 198 ///     tmp4.word := Signed(SignExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3]))
 199 ///     dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
 200 /// ENDFOR
 201 /// dst[MAX:128] := 0
 202 /// \endcode
 203 static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbsud_epi32(__m128i __W,
 204                                                                  __m128i __A,
 205                                                                  __m128i __B) {
 206   return (__m128i)__builtin_ia32_vpdpbsud128((__v4si)__W, (__v4si)__A,
 207                                              (__v4si)__B);
 208 }
 209
 210 /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
 211 ///    corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
 212 ///    signed 16-bit results. Sum these 4 results with the corresponding
 213 ///    32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
 214 ///
 215 /// \headerfile <x86intrin.h>
 216 ///
 217 /// \code
 218 /// _mm256_dpbsud_epi32(__m256i __W, __m256i __A, __m256i __B);
 219 /// \endcode
 220 ///
 221 /// This intrinsic corresponds to the \c VPDPBSSD instruction.
 222 ///
 223 /// \param __A
 224 ///    A 256-bit vector of [32 x char].
 225 /// \param __B
 226 ///    A 256-bit vector of [32 x unsigned char].
 227 /// \returns
 228 ///    A 256-bit vector of [8 x int].
 229 ///
 230 /// \code{.operation}
 231 /// FOR j := 0 to 7
 232 ///     tmp1.word := Signed(SignExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j]))
 233 ///     tmp2.word := Signed(SignExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1]))
 234 ///     tmp3.word := Signed(SignExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2]))
 235 ///     tmp4.word := Signed(SignExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3]))
 236 ///     dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
 237 /// ENDFOR
 238 /// dst[MAX:256] := 0
 239 /// \endcode
 240 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 241 _mm256_dpbsud_epi32(__m256i __W, __m256i __A, __m256i __B) {
 242   return (__m256i)__builtin_ia32_vpdpbsud256((__v8si)__W, (__v8si)__A,
 243                                              (__v8si)__B);
 244 }
 245
 246 /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
 247 ///    corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
 248 ///    signed 16-bit results. Sum these 4 results with the corresponding
 249 ///    32-bit integer in \a __W with signed saturation, and store the packed
 250 ///    32-bit results in \a dst.
 251 ///
 252 /// \headerfile <x86intrin.h>
 253 ///
 254 /// \code
 255 /// _mm_dpbsuds_epi32( __m128i __W, __m128i __A, __m128i __B);
 256 /// \endcode
 257 ///
 258 /// This intrinsic corresponds to the \c VPDPBSSD instruction.
 259 ///
 260 /// \param __A
 261 ///    A 128-bit vector of [16 x char].
 262 /// \param __B
 263 ///    A 128-bit vector of [16 x unsigned char].
 264 /// \returns
 265 ///    A 128-bit vector of [4 x int].
 266 ///
 267 /// \code{.operation}
 268 /// FOR j := 0 to 3
 269 ///     tmp1.word := Signed(SignExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j]))
 270 ///     tmp2.word := Signed(SignExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1]))
 271 ///     tmp3.word := Signed(SignExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2]))
 272 ///     tmp4.word := Signed(SignExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3]))
 273 ///     dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
 274 /// ENDFOR
 275 /// dst[MAX:128] := 0
 276 /// \endcode
 277 static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbsuds_epi32(__m128i __W,
 278                                                                   __m128i __A,
 279                                                                   __m128i __B) {
 280   return (__m128i)__builtin_ia32_vpdpbsuds128((__v4si)__W, (__v4si)__A,
 281                                               (__v4si)__B);
 282 }
 283
 284 /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
 285 ///    corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
 286 ///    signed 16-bit results. Sum these 4 results with the corresponding
 287 ///    32-bit integer in \a __W with signed saturation, and store the packed
 288 ///    32-bit results in \a dst.
 289 ///
 290 /// \headerfile <x86intrin.h>
 291 ///
 292 /// \code
 293 /// _mm256_dpbsuds_epi32(__m256i __W, __m256i __A, __m256i __B);
 294 /// \endcode
 295 ///
 296 /// This intrinsic corresponds to the \c VPDPBSSD instruction.
 297 ///
 298 /// \param __A
 299 ///    A 256-bit vector of [32 x char].
 300 /// \param __B
 301 ///    A 256-bit vector of [32 x unsigned char].
 302 /// \returns
 303 ///    A 256-bit vector of [8 x int].
 304 ///
 305 /// \code{.operation}
 306 /// FOR j := 0 to 7
 307 ///     tmp1.word := Signed(SignExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j]))
 308 ///     tmp2.word := Signed(SignExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1]))
 309 ///     tmp3.word := Signed(SignExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2]))
 310 ///     tmp4.word := Signed(SignExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3]))
 311 ///     dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
 312 /// ENDFOR
 313 /// dst[MAX:256] := 0
 314 /// \endcode
 315 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 316 _mm256_dpbsuds_epi32(__m256i __W, __m256i __A, __m256i __B) {
 317   return (__m256i)__builtin_ia32_vpdpbsuds256((__v8si)__W, (__v8si)__A,
 318                                               (__v8si)__B);
 319 }
 320
 321 /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with
 322 ///    corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
 323 ///    signed 16-bit results. Sum these 4 results with the corresponding
 324 ///    32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
 325 ///
 326 /// \headerfile <x86intrin.h>
 327 ///
 328 /// \code
 329 /// _mm_dpbuud_epi32(__m128i __W, __m128i __A, __m128i __B);
 330 /// \endcode
 331 ///
 332 /// This intrinsic corresponds to the \c VPDPBSSD instruction.
 333 ///
 334 /// \param __A
 335 ///    A 128-bit vector of [16 x unsigned char].
 336 /// \param __B
 337 ///    A 128-bit vector of [16 x unsigned char].
 338 /// \returns
 339 ///    A 128-bit vector of [4 x int].
 340 ///
 341 /// \code{.operation}
 342 /// FOR j := 0 to 3
 343 ///     tmp1.word := ZeroExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j])
 344 ///     tmp2.word := ZeroExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1])
 345 ///     tmp3.word := ZeroExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2])
 346 ///     tmp4.word := ZeroExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3])
 347 ///     dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
 348 /// ENDFOR
 349 /// dst[MAX:128] := 0
 350 /// \endcode
 351 static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbuud_epi32(__m128i __W,
 352                                                                  __m128i __A,
 353                                                                  __m128i __B) {
 354   return (__m128i)__builtin_ia32_vpdpbuud128((__v4si)__W, (__v4si)__A,
 355                                              (__v4si)__B);
 356 }
 357
 358 /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with
 359 ///    corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
 360 ///    signed 16-bit results. Sum these 4 results with the corresponding
 361 ///    32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
 362 ///
 363 /// \headerfile <x86intrin.h>
 364 ///
 365 /// \code
 366 /// _mm256_dpbuud_epi32(__m256i __W, __m256i __A, __m256i __B);
 367 /// \endcode
 368 ///
 369 /// This intrinsic corresponds to the \c VPDPBSSD instruction.
 370 ///
 371 /// \param __A
 372 ///    A 256-bit vector of [32 x unsigned char].
 373 /// \param __B
 374 ///    A 256-bit vector of [32 x unsigned char].
 375 /// \returns
 376 ///    A 256-bit vector of [8 x int].
 377 ///
 378 /// \code{.operation}
 379 /// FOR j := 0 to 7
 380 ///     tmp1.word := ZeroExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j])
 381 ///     tmp2.word := ZeroExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1])
 382 ///     tmp3.word := ZeroExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2])
 383 ///     tmp4.word := ZeroExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3])
 384 ///     dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
 385 /// ENDFOR
 386 /// dst[MAX:256] := 0
 387 /// \endcode
 388 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 389 _mm256_dpbuud_epi32(__m256i __W, __m256i __A, __m256i __B) {
 390   return (__m256i)__builtin_ia32_vpdpbuud256((__v8si)__W, (__v8si)__A,
 391                                              (__v8si)__B);
 392 }
 393
 394 /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with
 395 ///    corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
 396 ///    signed 16-bit results. Sum these 4 results with the corresponding
 397 ///    32-bit integer in \a __W with signed saturation, and store the packed
 398 ///    32-bit results in \a dst.
 399 ///
 400 /// \headerfile <x86intrin.h>
 401 ///
 402 /// \code
 403 /// _mm_dpbuuds_epi32( __m128i __W, __m128i __A, __m128i __B);
 404 /// \endcode
 405 ///
 406 /// This intrinsic corresponds to the \c VPDPBUUDS instruction.
 407 ///
 408 /// \param __A
 409 ///    A 128-bit vector of [16 x unsigned char].
 410 /// \param __B
 411 ///    A 128-bit vector of [16 x unsigned char].
 412 /// \returns
 413 ///    A 128-bit vector of [4 x int].
 414 ///
 415 /// \code{.operation}
 416 /// FOR j := 0 to 3
 417 ///     tmp1.word := ZeroExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j])
 418 ///     tmp2.word := ZeroExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1])
 419 ///     tmp3.word := ZeroExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2])
 420 ///     tmp4.word := ZeroExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3])
 421 ///     dst.dword[j] := UNSIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
 422 /// ENDFOR
 423 /// dst[MAX:128] := 0
 424 /// \endcode
 425 static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbuuds_epi32(__m128i __W,
 426                                                                   __m128i __A,
 427                                                                   __m128i __B) {
 428   return (__m128i)__builtin_ia32_vpdpbuuds128((__v4si)__W, (__v4si)__A,
 429                                               (__v4si)__B);
 430 }
 431
 432 /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
 433 ///    corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
 434 ///    signed 16-bit results. Sum these 4 results with the corresponding
 435 ///    32-bit integer in \a __W with signed saturation, and store the packed
 436 ///    32-bit results in \a dst.
 437 ///
 438 /// \headerfile <x86intrin.h>
 439 ///
 440 /// \code
 441 /// _mm256_dpbuuds_epi32(__m256i __W, __m256i __A, __m256i __B);
 442 /// \endcode
 443 ///
 444 /// This intrinsic corresponds to the \c VPDPBUUDS instruction.
 445 ///
 446 /// \param __A
 447 ///    A 256-bit vector of [32 x unsigned char].
 448 /// \param __B
 449 ///    A 256-bit vector of [32 x unsigned char].
 450 /// \returns
 451 ///    A 256-bit vector of [8 x int].
 452 ///
 453 /// \code{.operation}
 454 /// FOR j := 0 to 7
 455 ///     tmp1.word := ZeroExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j])
 456 ///     tmp2.word := ZeroExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1])
 457 ///     tmp3.word := ZeroExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2])
 458 ///     tmp4.word := ZeroExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3])
 459 ///     dst.dword[j] := UNSIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
 460 /// ENDFOR
 461 /// dst[MAX:256] := 0
 462 /// \endcode
 463 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 464 _mm256_dpbuuds_epi32(__m256i __W, __m256i __A, __m256i __B) {
 465   return (__m256i)__builtin_ia32_vpdpbuuds256((__v8si)__W, (__v8si)__A,
 466                                               (__v8si)__B);
 467 }
 468 #undef __DEFAULT_FN_ATTRS128
 469 #undef __DEFAULT_FN_ATTRS256
 470
 471 #endif // __AVXVNNIINT8INTRIN_H