clang/lib/Headers/avxvnniint16intrin.h

   1 /*===----------- avxvnniint16intrin.h - AVXVNNIINT16 intrinsics-------------===
   2  *
   3  * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
   4  * See https://llvm.org/LICENSE.txt for license information.
   5  * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
   6  *
   7  *===-----------------------------------------------------------------------===
   8  */
   9
  10 #ifndef __IMMINTRIN_H
  11 #error                                                                         \
  12     "Never use <avxvnniint16intrin.h> directly; include <immintrin.h> instead."
  13 #endif // __IMMINTRIN_H
  14
  15 #ifndef __AVXVNNIINT16INTRIN_H
  16 #define __AVXVNNIINT16INTRIN_H
  17
  18 /* Default attributes for the intrinsics below; suffix = min vector width. */
  19 #define __DEFAULT_FN_ATTRS128                                                  \
  20   __attribute__((__always_inline__, __nodebug__, __target__("avxvnniint16"),   \
  21                  __min_vector_width__(128)))
  22 #define __DEFAULT_FN_ATTRS256                                                  \
  23   __attribute__((__always_inline__, __nodebug__, __target__("avxvnniint16"),   \
  24                  __min_vector_width__(256)))
  25
  26 /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with
  27 ///    corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
  28 ///    signed 32-bit results. Sum these 2 results with the corresponding
  29 ///    32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
  30 ///
  31 /// \headerfile <immintrin.h>
  32 ///
  33 /// \code
  34 /// __m128i _mm_dpwsud_epi32(__m128i __W, __m128i __A, __m128i __B)
  35 /// \endcode
  36 ///
  37 /// This intrinsic corresponds to the \c VPDPWSUD instruction.
  38 ///
  39 /// \param __W
  40 ///    A 128-bit vector of [4 x int].
  41 /// \param __A
  42 ///    A 128-bit vector of [8 x short].
  43 /// \param __B
  44 ///    A 128-bit vector of [8 x unsigned short].
  45 /// \returns
  46 ///    A 128-bit vector of [4 x int].
  47 ///
  48 /// \code{.operation}
  49 /// FOR j := 0 to 3
  50 ///     tmp1.dword := SignExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j])
  51 ///     tmp2.dword := SignExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1])
  52 ///     dst.dword[j] := __W.dword[j] + tmp1 + tmp2
  53 /// ENDFOR
  54 /// dst[MAX:128] := 0
  55 /// \endcode
  56 static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpwsud_epi32(__m128i __W,
  57                                                                  __m128i __A,
  58                                                                  __m128i __B) {
  59   return (__m128i)__builtin_ia32_vpdpwsud128((__v4si)__W, (__v4si)__A,
  60                                              (__v4si)__B);
  61 }
  62
  63 /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with
  64 ///    corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
  65 ///    signed 32-bit results. Sum these 2 results with the corresponding
  66 ///    32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
  67 ///
  68 /// \headerfile <immintrin.h>
  69 ///
  70 /// \code
  71 /// __m256i _mm256_dpwsud_epi32(__m256i __W, __m256i __A, __m256i __B)
  72 /// \endcode
  73 ///
  74 /// This intrinsic corresponds to the \c VPDPWSUD instruction.
  75 ///
  76 /// \param __W
  77 ///    A 256-bit vector of [8 x int].
  78 /// \param __A
  79 ///    A 256-bit vector of [16 x short].
  80 /// \param __B
  81 ///    A 256-bit vector of [16 x unsigned short].
  82 /// \returns
  83 ///    A 256-bit vector of [8 x int].
  84 ///
  85 /// \code{.operation}
  86 /// FOR j := 0 to 7
  87 ///     tmp1.dword := SignExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j])
  88 ///     tmp2.dword := SignExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1])
  89 ///     dst.dword[j] := __W.dword[j] + tmp1 + tmp2
  90 /// ENDFOR
  91 /// dst[MAX:256] := 0
  92 /// \endcode
  93 static __inline__ __m256i __DEFAULT_FN_ATTRS256
  94 _mm256_dpwsud_epi32(__m256i __W, __m256i __A, __m256i __B) {
  95   return (__m256i)__builtin_ia32_vpdpwsud256((__v8si)__W, (__v8si)__A,
  96                                              (__v8si)__B);
  97 }
  98
  99 /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with
 100 ///    corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
 101 ///    signed 32-bit results. Sum these 2 results with the corresponding
 102 ///    32-bit integer in \a __W with signed saturation, and store the packed
 103 ///    32-bit results in \a dst.
 104 ///
 105 /// \headerfile <immintrin.h>
 106 ///
 107 /// \code
 108 /// __m128i _mm_dpwsuds_epi32(__m128i __W, __m128i __A, __m128i __B)
 109 /// \endcode
 110 ///
 111 /// This intrinsic corresponds to the \c VPDPWSUDS instruction.
 112 ///
 113 /// \param __W
 114 ///    A 128-bit vector of [4 x int].
 115 /// \param __A
 116 ///    A 128-bit vector of [8 x short].
 117 /// \param __B
 118 ///    A 128-bit vector of [8 x unsigned short].
 119 /// \returns
 120 ///    A 128-bit vector of [4 x int].
 121 ///
 122 /// \code{.operation}
 123 /// FOR j := 0 to 3
 124 ///     tmp1.dword := SignExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j])
 125 ///     tmp2.dword := SignExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1])
 126 ///     dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2)
 127 /// ENDFOR
 128 /// dst[MAX:128] := 0
 129 /// \endcode
 130 static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpwsuds_epi32(__m128i __W,
 131                                                                   __m128i __A,
 132                                                                   __m128i __B) {
 133   return (__m128i)__builtin_ia32_vpdpwsuds128((__v4si)__W, (__v4si)__A,
 134                                               (__v4si)__B);
 135 }
 136
 137 /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with
 138 ///    corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
 139 ///    signed 32-bit results. Sum these 2 results with the corresponding
 140 ///    32-bit integer in \a __W with signed saturation, and store the packed
 141 ///    32-bit results in \a dst.
 142 ///
 143 /// \headerfile <immintrin.h>
 144 ///
 145 /// \code
 146 /// __m256i _mm256_dpwsuds_epi32(__m256i __W, __m256i __A, __m256i __B)
 147 /// \endcode
 148 ///
 149 /// This intrinsic corresponds to the \c VPDPWSUDS instruction.
 150 ///
 151 /// \param __W
 152 ///    A 256-bit vector of [8 x int].
 153 /// \param __A
 154 ///    A 256-bit vector of [16 x short].
 155 /// \param __B
 156 ///    A 256-bit vector of [16 x unsigned short].
 157 /// \returns
 158 ///    A 256-bit vector of [8 x int].
 159 ///
 160 /// \code{.operation}
 161 /// FOR j := 0 to 7
 162 ///     tmp1.dword := SignExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j])
 163 ///     tmp2.dword := SignExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1])
 164 ///     dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2)
 165 /// ENDFOR
 166 /// dst[MAX:256] := 0
 167 /// \endcode
 168 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 169 _mm256_dpwsuds_epi32(__m256i __W, __m256i __A, __m256i __B) {
 170   return (__m256i)__builtin_ia32_vpdpwsuds256((__v8si)__W, (__v8si)__A,
 171                                               (__v8si)__B);
 172 }
 173
 174 /// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
 175 ///    corresponding signed 16-bit integers in \a __B, producing 2 intermediate
 176 ///    signed 32-bit results. Sum these 2 results with the corresponding
 177 ///    32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
 178 ///
 179 /// \headerfile <immintrin.h>
 180 ///
 181 /// \code
 182 /// __m128i _mm_dpwusd_epi32(__m128i __W, __m128i __A, __m128i __B)
 183 /// \endcode
 184 ///
 185 /// This intrinsic corresponds to the \c VPDPWUSD instruction.
 186 ///
 187 /// \param __W
 188 ///    A 128-bit vector of [4 x int].
 189 /// \param __A
 190 ///    A 128-bit vector of [8 x unsigned short].
 191 /// \param __B
 192 ///    A 128-bit vector of [8 x short].
 193 /// \returns
 194 ///    A 128-bit vector of [4 x int].
 195 ///
 196 /// \code{.operation}
 197 /// FOR j := 0 to 3
 198 ///     tmp1.dword := ZeroExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j])
 199 ///     tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1])
 200 ///     dst.dword[j] := __W.dword[j] + tmp1 + tmp2
 201 /// ENDFOR
 202 /// dst[MAX:128] := 0
 203 /// \endcode
 204 static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpwusd_epi32(__m128i __W,
 205                                                                  __m128i __A,
 206                                                                  __m128i __B) {
 207   return (__m128i)__builtin_ia32_vpdpwusd128((__v4si)__W, (__v4si)__A,
 208                                              (__v4si)__B);
 209 }
 210
 211 /// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
 212 ///    corresponding signed 16-bit integers in \a __B, producing 2 intermediate
 213 ///    signed 32-bit results. Sum these 2 results with the corresponding
 214 ///    32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
 215 ///
 216 /// \headerfile <immintrin.h>
 217 ///
 218 /// \code
 219 /// __m256i _mm256_dpwusd_epi32(__m256i __W, __m256i __A, __m256i __B)
 220 /// \endcode
 221 ///
 222 /// This intrinsic corresponds to the \c VPDPWUSD instruction.
 223 ///
 224 /// \param __W
 225 ///    A 256-bit vector of [8 x int].
 226 /// \param __A
 227 ///    A 256-bit vector of [16 x unsigned short].
 228 /// \param __B
 229 ///    A 256-bit vector of [16 x short].
 230 /// \returns
 231 ///    A 256-bit vector of [8 x int].
 232 ///
 233 /// \code{.operation}
 234 /// FOR j := 0 to 7
 235 ///     tmp1.dword := ZeroExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j])
 236 ///     tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1])
 237 ///     dst.dword[j] := __W.dword[j] + tmp1 + tmp2
 238 /// ENDFOR
 239 /// dst[MAX:256] := 0
 240 /// \endcode
 241 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 242 _mm256_dpwusd_epi32(__m256i __W, __m256i __A, __m256i __B) {
 243   return (__m256i)__builtin_ia32_vpdpwusd256((__v8si)__W, (__v8si)__A,
 244                                              (__v8si)__B);
 245 }
 246
 247 /// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
 248 ///    corresponding signed 16-bit integers in \a __B, producing 2 intermediate
 249 ///    signed 32-bit results. Sum these 2 results with the corresponding
 250 ///    32-bit integer in \a __W with signed saturation, and store the packed
 251 ///    32-bit results in \a dst.
 252 ///
 253 /// \headerfile <immintrin.h>
 254 ///
 255 /// \code
 256 /// __m128i _mm_dpwusds_epi32(__m128i __W, __m128i __A, __m128i __B)
 257 /// \endcode
 258 ///
 259 /// This intrinsic corresponds to the \c VPDPWUSDS instruction.
 260 ///
 261 /// \param __W
 262 ///    A 128-bit vector of [4 x int].
 263 /// \param __A
 264 ///    A 128-bit vector of [8 x unsigned short].
 265 /// \param __B
 266 ///    A 128-bit vector of [8 x short].
 267 /// \returns
 268 ///    A 128-bit vector of [4 x int].
 269 ///
 270 /// \code{.operation}
 271 /// FOR j := 0 to 3
 272 ///     tmp1.dword := ZeroExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j])
 273 ///     tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1])
 274 ///     dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2)
 275 /// ENDFOR
 276 /// dst[MAX:128] := 0
 277 /// \endcode
 278 static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpwusds_epi32(__m128i __W,
 279                                                                   __m128i __A,
 280                                                                   __m128i __B) {
 281   return (__m128i)__builtin_ia32_vpdpwusds128((__v4si)__W, (__v4si)__A,
 282                                               (__v4si)__B);
 283 }
 284
 285 /// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
 286 ///    corresponding signed 16-bit integers in \a __B, producing 2 intermediate
 287 ///    signed 32-bit results. Sum these 2 results with the corresponding
 288 ///    32-bit integer in \a __W with signed saturation, and store the packed
 289 ///    32-bit results in \a dst.
 290 ///
 291 /// \headerfile <immintrin.h>
 292 ///
 293 /// \code
 294 /// __m256i _mm256_dpwusds_epi32(__m256i __W, __m256i __A, __m256i __B)
 295 /// \endcode
 296 ///
 297 /// This intrinsic corresponds to the \c VPDPWUSDS instruction.
 298 ///
 299 /// \param __W
 300 ///    A 256-bit vector of [8 x int].
 301 /// \param __A
 302 ///    A 256-bit vector of [16 x unsigned short].
 303 /// \param __B
 304 ///    A 256-bit vector of [16 x short].
 305 /// \returns
 306 ///    A 256-bit vector of [8 x int].
 307 ///
 308 /// \code{.operation}
 309 /// FOR j := 0 to 7
 310 ///     tmp1.dword := ZeroExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j])
 311 ///     tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1])
 312 ///     dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2)
 313 /// ENDFOR
 314 /// dst[MAX:256] := 0
 315 /// \endcode
 316 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 317 _mm256_dpwusds_epi32(__m256i __W, __m256i __A, __m256i __B) {
 318   return (__m256i)__builtin_ia32_vpdpwusds256((__v8si)__W, (__v8si)__A,
 319                                               (__v8si)__B);
 320 }
 321
 322 /// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
 323 ///    corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
 324 ///    unsigned 32-bit results. Sum these 2 results with the corresponding
 325 ///    32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
 326 ///
 327 /// \headerfile <immintrin.h>
 328 ///
 329 /// \code
 330 /// __m128i _mm_dpwuud_epi32(__m128i __W, __m128i __A, __m128i __B)
 331 /// \endcode
 332 ///
 333 /// This intrinsic corresponds to the \c VPDPWUUD instruction.
 334 ///
 335 /// \param __W
 336 ///    A 128-bit vector of [4 x unsigned int].
 337 /// \param __A
 338 ///    A 128-bit vector of [8 x unsigned short].
 339 /// \param __B
 340 ///    A 128-bit vector of [8 x unsigned short].
 341 /// \returns
 342 ///    A 128-bit vector of [4 x unsigned int].
 343 ///
 344 /// \code{.operation}
 345 /// FOR j := 0 to 3
 346 ///     tmp1.dword := ZeroExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j])
 347 ///     tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1])
 348 ///     dst.dword[j] := __W.dword[j] + tmp1 + tmp2
 349 /// ENDFOR
 350 /// dst[MAX:128] := 0
 351 /// \endcode
 352 static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpwuud_epi32(__m128i __W,
 353                                                                  __m128i __A,
 354                                                                  __m128i __B) {
 355   return (__m128i)__builtin_ia32_vpdpwuud128((__v4si)__W, (__v4si)__A,
 356                                              (__v4si)__B);
 357 }
 358
 359 /// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
 360 ///    corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
 361 ///    unsigned 32-bit results. Sum these 2 results with the corresponding
 362 ///    32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
 363 ///
 364 /// \headerfile <immintrin.h>
 365 ///
 366 /// \code
 367 /// __m256i _mm256_dpwuud_epi32(__m256i __W, __m256i __A, __m256i __B)
 368 /// \endcode
 369 ///
 370 /// This intrinsic corresponds to the \c VPDPWUUD instruction.
 371 ///
 372 /// \param __W
 373 ///    A 256-bit vector of [8 x unsigned int].
 374 /// \param __A
 375 ///    A 256-bit vector of [16 x unsigned short].
 376 /// \param __B
 377 ///    A 256-bit vector of [16 x unsigned short].
 378 /// \returns
 379 ///    A 256-bit vector of [8 x unsigned int].
 380 ///
 381 /// \code{.operation}
 382 /// FOR j := 0 to 7
 383 ///     tmp1.dword := ZeroExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j])
 384 ///     tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1])
 385 ///     dst.dword[j] := __W.dword[j] + tmp1 + tmp2
 386 /// ENDFOR
 387 /// dst[MAX:256] := 0
 388 /// \endcode
 389 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 390 _mm256_dpwuud_epi32(__m256i __W, __m256i __A, __m256i __B) {
 391   return (__m256i)__builtin_ia32_vpdpwuud256((__v8si)__W, (__v8si)__A,
 392                                              (__v8si)__B);
 393 }
 394
 395 /// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
 396 ///    corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
 397 ///    unsigned 32-bit results. Sum these 2 results with the corresponding
 398 ///    32-bit integer in \a __W with unsigned saturation, and store the packed
 399 ///    32-bit results in \a dst.
 400 ///
 401 /// \headerfile <immintrin.h>
 402 ///
 403 /// \code
 404 /// __m128i _mm_dpwuuds_epi32(__m128i __W, __m128i __A, __m128i __B)
 405 /// \endcode
 406 ///
 407 /// This intrinsic corresponds to the \c VPDPWUUDS instruction.
 408 ///
 409 /// \param __W
 410 ///    A 128-bit vector of [4 x unsigned int].
 411 /// \param __A
 412 ///    A 128-bit vector of [8 x unsigned short].
 413 /// \param __B
 414 ///    A 128-bit vector of [8 x unsigned short].
 415 /// \returns
 416 ///    A 128-bit vector of [4 x unsigned int].
 417 ///
 418 /// \code{.operation}
 419 /// FOR j := 0 to 3
 420 ///     tmp1.dword := ZeroExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j])
 421 ///     tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1])
 422 ///     dst.dword[j] := UNSIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2)
 423 /// ENDFOR
 424 /// dst[MAX:128] := 0
 425 /// \endcode
 426 static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpwuuds_epi32(__m128i __W,
 427                                                                   __m128i __A,
 428                                                                   __m128i __B) {
 429   return (__m128i)__builtin_ia32_vpdpwuuds128((__v4si)__W, (__v4si)__A,
 430                                               (__v4si)__B);
 431 }
 432
 433 /// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
 434 ///    corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
 435 ///    unsigned 32-bit results. Sum these 2 results with the corresponding
 436 ///    32-bit integer in \a __W with unsigned saturation, and store the packed
 437 ///    32-bit results in \a dst.
 438 ///
 439 /// \headerfile <immintrin.h>
 440 ///
 441 /// \code
 442 /// __m256i _mm256_dpwuuds_epi32(__m256i __W, __m256i __A, __m256i __B)
 443 /// \endcode
 444 ///
 445 /// This intrinsic corresponds to the \c VPDPWUUDS instruction.
 446 ///
 447 /// \param __W
 448 ///    A 256-bit vector of [8 x unsigned int].
 449 /// \param __A
 450 ///    A 256-bit vector of [16 x unsigned short].
 451 /// \param __B
 452 ///    A 256-bit vector of [16 x unsigned short].
 453 /// \returns
 454 ///    A 256-bit vector of [8 x unsigned int].
 455 ///
 456 /// \code{.operation}
 457 /// FOR j := 0 to 7
 458 ///     tmp1.dword := ZeroExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j])
 459 ///     tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1])
 460 ///     dst.dword[j] := UNSIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2)
 461 /// ENDFOR
 462 /// dst[MAX:256] := 0
 463 /// \endcode
 464 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 465 _mm256_dpwuuds_epi32(__m256i __W, __m256i __A, __m256i __B) {
 466   return (__m256i)__builtin_ia32_vpdpwuuds256((__v8si)__W, (__v8si)__A,
 467                                               (__v8si)__B);
 468 }
 469
 470 #undef __DEFAULT_FN_ATTRS128
 471 #undef __DEFAULT_FN_ATTRS256
 472
 473 #endif // __AVXVNNIINT16INTRIN_H