clang/lib/Headers/avxvnniint16intrin.h

   1 /*===----------- avxvnniint16intrin.h - AVXVNNIINT16 intrinsics-------------===
   2  *
   3  * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
   4  * See https://llvm.org/LICENSE.txt for license information.
   5  * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
   6  *
   7  *===-----------------------------------------------------------------------===
   8  */
   9
  10 #ifndef __IMMINTRIN_H
  11 #error                                                                         \
  12     "Never use <avxvnniint16intrin.h> directly; include <immintrin.h> instead."
  13 #endif // __IMMINTRIN_H
  14
  15 #ifndef __AVXVNNIINT16INTRIN_H
  16 #define __AVXVNNIINT16INTRIN_H
  17
  18 /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with
  19 ///    corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
  20 ///    signed 32-bit results. Sum these 2 results with the corresponding
  21 ///    32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
  22 ///
  23 /// \headerfile <immintrin.h>
  24 ///
  25 /// \code
  26 /// __m128i _mm_dpwsud_epi32(__m128i __W, __m128i __A, __m128i __B)
  27 /// \endcode
  28 ///
  29 /// This intrinsic corresponds to the \c VPDPWSUD instruction.
  30 ///
  31 /// \param __W
  32 ///    A 128-bit vector of [4 x int].
  33 /// \param __A
  34 ///    A 128-bit vector of [8 x short].
  35 /// \param __B
  36 ///    A 128-bit vector of [8 x unsigned short].
  37 /// \returns
  38 ///    A 128-bit vector of [4 x int].
  39 ///
  40 /// \code{.operation}
  41 /// FOR j := 0 to 3
  42 ///     tmp1.dword := SignExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j])
  43 ///     tmp2.dword := SignExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1])
  44 ///     dst.dword[j] := __W.dword[j] + tmp1 + tmp2
  45 /// ENDFOR
  46 /// dst[MAX:128] := 0
  47 /// \endcode
  48 #define _mm_dpwsud_epi32(__W, __A, __B)                                        \
  49   ((__m128i)__builtin_ia32_vpdpwsud128((__v4si)(__W), (__v4si)(__A),           \
  50                                        (__v4si)(__B)))
  51
  52 /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with
  53 ///    corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
  54 ///    signed 32-bit results. Sum these 2 results with the corresponding
  55 ///    32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
  56 ///
  57 /// \headerfile <immintrin.h>
  58 ///
  59 /// \code
  60 /// __m256i _mm256_dpwsud_epi32(__m256i __W, __m256i __A, __m256i __B)
  61 /// \endcode
  62 ///
  63 /// This intrinsic corresponds to the \c VPDPWSUD instruction.
  64 ///
  65 /// \param __W
  66 ///    A 256-bit vector of [8 x int].
  67 /// \param __A
  68 ///    A 256-bit vector of [16 x short].
  69 /// \param __B
  70 ///    A 256-bit vector of [16 x unsigned short].
  71 /// \returns
  72 ///    A 256-bit vector of [8 x int].
  73 ///
  74 /// \code{.operation}
  75 /// FOR j := 0 to 7
  76 ///     tmp1.dword := SignExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j])
  77 ///     tmp2.dword := SignExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1])
  78 ///     dst.dword[j] := __W.dword[j] + tmp1 + tmp2
  79 /// ENDFOR
  80 /// dst[MAX:256] := 0
  81 /// \endcode
  82 #define _mm256_dpwsud_epi32(__W, __A, __B)                                     \
  83   ((__m256i)__builtin_ia32_vpdpwsud256((__v8si)(__W), (__v8si)(__A),           \
  84                                        (__v8si)(__B)))
  85
  86 /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with
  87 ///    corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
  88 ///    signed 32-bit results. Sum these 2 results with the corresponding
  89 ///    32-bit integer in \a __W with signed saturation, and store the packed
  90 ///    32-bit results in \a dst.
  91 ///
  92 /// \headerfile <immintrin.h>
  93 ///
  94 /// \code
  95 /// __m128i _mm_dpwsuds_epi32(__m128i __W, __m128i __A, __m128i __B)
  96 /// \endcode
  97 ///
  98 /// This intrinsic corresponds to the \c VPDPWSUDS instruction.
  99 ///
 100 /// \param __W
 101 ///    A 128-bit vector of [4 x int].
 102 /// \param __A
 103 ///    A 128-bit vector of [8 x short].
 104 /// \param __B
 105 ///    A 128-bit vector of [8 x unsigned short].
 106 /// \returns
 107 ///    A 128-bit vector of [4 x int].
 108 ///
 109 /// \code{.operation}
 110 /// FOR j := 0 to 3
 111 ///     tmp1.dword := SignExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j])
 112 ///     tmp2.dword := SignExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1])
 113 ///     dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2)
 114 /// ENDFOR
 115 /// dst[MAX:128] := 0
 116 /// \endcode
 117 #define _mm_dpwsuds_epi32(__W, __A, __B)                                       \
 118   ((__m128i)__builtin_ia32_vpdpwsuds128((__v4si)(__W), (__v4si)(__A),          \
 119                                         (__v4si)(__B)))
 120
 121 /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with
 122 ///    corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
 123 ///    signed 32-bit results. Sum these 2 results with the corresponding
 124 ///    32-bit integer in \a __W with signed saturation, and store the packed
 125 ///    32-bit results in \a dst.
 126 ///
 127 /// \headerfile <immintrin.h>
 128 ///
 129 /// \code
 130 /// __m256i _mm256_dpwsuds_epi32(__m256i __W, __m256i __A, __m256i __B)
 131 /// \endcode
 132 ///
 133 /// This intrinsic corresponds to the \c VPDPWSUDS instruction.
 134 ///
 135 /// \param __W
 136 ///    A 256-bit vector of [8 x int].
 137 /// \param __A
 138 ///    A 256-bit vector of [16 x short].
 139 /// \param __B
 140 ///    A 256-bit vector of [16 x unsigned short].
 141 /// \returns
 142 ///    A 256-bit vector of [8 x int].
 143 ///
 144 /// \code{.operation}
 145 /// FOR j := 0 to 7
 146 ///     tmp1.dword := SignExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j])
 147 ///     tmp2.dword := SignExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1])
 148 ///     dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2)
 149 /// ENDFOR
 150 /// dst[MAX:256] := 0
 151 /// \endcode
 152 #define _mm256_dpwsuds_epi32(__W, __A, __B)                                    \
 153   ((__m256i)__builtin_ia32_vpdpwsuds256((__v8si)(__W), (__v8si)(__A),          \
 154                                         (__v8si)(__B)))
 155
 156 /// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
 157 ///    corresponding signed 16-bit integers in \a __B, producing 2 intermediate
 158 ///    signed 32-bit results. Sum these 2 results with the corresponding
 159 ///    32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
 160 ///
 161 /// \headerfile <immintrin.h>
 162 ///
 163 /// \code
 164 /// __m128i _mm_dpwusd_epi32(__m128i __W, __m128i __A, __m128i __B)
 165 /// \endcode
 166 ///
 167 /// This intrinsic corresponds to the \c VPDPWUSD instruction.
 168 ///
 169 /// \param __W
 170 ///    A 128-bit vector of [4 x int].
 171 /// \param __A
 172 ///    A 128-bit vector of [8 x unsigned short].
 173 /// \param __B
 174 ///    A 128-bit vector of [8 x short].
 175 /// \returns
 176 ///    A 128-bit vector of [4 x int].
 177 ///
 178 /// \code{.operation}
 179 /// FOR j := 0 to 3
 180 ///     tmp1.dword := ZeroExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j])
 181 ///     tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1])
 182 ///     dst.dword[j] := __W.dword[j] + tmp1 + tmp2
 183 /// ENDFOR
 184 /// dst[MAX:128] := 0
 185 /// \endcode
 186 #define _mm_dpwusd_epi32(__W, __A, __B)                                        \
 187   ((__m128i)__builtin_ia32_vpdpwusd128((__v4si)(__W), (__v4si)(__A),           \
 188                                        (__v4si)(__B)))
 189
 190 /// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
 191 ///    corresponding signed 16-bit integers in \a __B, producing 2 intermediate
 192 ///    signed 32-bit results. Sum these 2 results with the corresponding
 193 ///    32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
 194 ///
 195 /// \headerfile <immintrin.h>
 196 ///
 197 /// \code
 198 /// __m256i _mm256_dpwusd_epi32(__m256i __W, __m256i __A, __m256i __B)
 199 /// \endcode
 200 ///
 201 /// This intrinsic corresponds to the \c VPDPWUSD instruction.
 202 ///
 203 /// \param __W
 204 ///    A 256-bit vector of [8 x int].
 205 /// \param __A
 206 ///    A 256-bit vector of [16 x unsigned short].
 207 /// \param __B
 208 ///    A 256-bit vector of [16 x short].
 209 /// \returns
 210 ///    A 256-bit vector of [8 x int].
 211 ///
 212 /// \code{.operation}
 213 /// FOR j := 0 to 7
 214 ///     tmp1.dword := ZeroExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j])
 215 ///     tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1])
 216 ///     dst.dword[j] := __W.dword[j] + tmp1 + tmp2
 217 /// ENDFOR
 218 /// dst[MAX:256] := 0
 219 /// \endcode
 220 #define _mm256_dpwusd_epi32(__W, __A, __B)                                     \
 221   ((__m256i)__builtin_ia32_vpdpwusd256((__v8si)(__W), (__v8si)(__A),           \
 222                                        (__v8si)(__B)))
 223
 224 /// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
 225 ///    corresponding signed 16-bit integers in \a __B, producing 2 intermediate
 226 ///    signed 32-bit results. Sum these 2 results with the corresponding
 227 ///    32-bit integer in \a __W with signed saturation, and store the packed
 228 ///    32-bit results in \a dst.
 229 ///
 230 /// \headerfile <immintrin.h>
 231 ///
 232 /// \code
 233 /// __m128i _mm_dpwusds_epi32(__m128i __W, __m128i __A, __m128i __B)
 234 /// \endcode
 235 ///
 236 /// This intrinsic corresponds to the \c VPDPWUSDS instruction.
 237 ///
 238 /// \param __W
 239 ///    A 128-bit vector of [4 x int].
 240 /// \param __A
 241 ///    A 128-bit vector of [8 x unsigned short].
 242 /// \param __B
 243 ///    A 128-bit vector of [8 x short].
 244 /// \returns
 245 ///    A 128-bit vector of [4 x int].
 246 ///
 247 /// \code{.operation}
 248 /// FOR j := 0 to 3
 249 ///     tmp1.dword := ZeroExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j])
 250 ///     tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1])
 251 ///     dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2)
 252 /// ENDFOR
 253 /// dst[MAX:128] := 0
 254 /// \endcode
 255 #define _mm_dpwusds_epi32(__W, __A, __B)                                       \
 256   ((__m128i)__builtin_ia32_vpdpwusds128((__v4si)(__W), (__v4si)(__A),          \
 257                                         (__v4si)(__B)))
 258
 259 /// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
 260 ///    corresponding signed 16-bit integers in \a __B, producing 2 intermediate
 261 ///    signed 32-bit results. Sum these 2 results with the corresponding
 262 ///    32-bit integer in \a __W with signed saturation, and store the packed
 263 ///    32-bit results in \a dst.
 264 ///
 265 /// \headerfile <immintrin.h>
 266 ///
 267 /// \code
 268 /// __m256i _mm256_dpwusds_epi32(__m256i __W, __m256i __A, __m256i __B)
 269 /// \endcode
 270 ///
 271 /// This intrinsic corresponds to the \c VPDPWUSDS instruction.
 272 ///
 273 /// \param __W
 274 ///    A 256-bit vector of [8 x int].
 275 /// \param __A
 276 ///    A 256-bit vector of [16 x unsigned short].
 277 /// \param __B
 278 ///    A 256-bit vector of [16 x short].
 279 /// \returns
 280 ///    A 256-bit vector of [8 x int].
 281 ///
 282 /// \code{.operation}
 283 /// FOR j := 0 to 7
 284 ///     tmp1.dword := ZeroExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j])
 285 ///     tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1])
 286 ///     dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2)
 287 /// ENDFOR
 288 /// dst[MAX:256] := 0
 289 /// \endcode
 290 #define _mm256_dpwusds_epi32(__W, __A, __B)                                    \
 291   ((__m256i)__builtin_ia32_vpdpwusds256((__v8si)(__W), (__v8si)(__A),          \
 292                                         (__v8si)(__B)))
 293
 294 /// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
 295 ///    corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
 296 ///    unsigned 32-bit results. Sum these 2 results with the corresponding
 297 ///    32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
 298 ///
 299 /// \headerfile <immintrin.h>
 300 ///
 301 /// \code
 302 /// __m128i _mm_dpwuud_epi32(__m128i __W, __m128i __A, __m128i __B)
 303 /// \endcode
 304 ///
 305 /// This intrinsic corresponds to the \c VPDPWUUD instruction.
 306 ///
 307 /// \param __W
 308 ///    A 128-bit vector of [4 x unsigned int].
 309 /// \param __A
 310 ///    A 128-bit vector of [8 x unsigned short].
 311 /// \param __B
 312 ///    A 128-bit vector of [8 x unsigned short].
 313 /// \returns
 314 ///    A 128-bit vector of [4 x unsigned int].
 315 ///
 316 /// \code{.operation}
 317 /// FOR j := 0 to 3
 318 ///     tmp1.dword := ZeroExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j])
 319 ///     tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1])
 320 ///     dst.dword[j] := __W.dword[j] + tmp1 + tmp2
 321 /// ENDFOR
 322 /// dst[MAX:128] := 0
 323 /// \endcode
 324 #define _mm_dpwuud_epi32(__W, __A, __B)                                        \
 325   ((__m128i)__builtin_ia32_vpdpwuud128((__v4si)(__W), (__v4si)(__A),           \
 326                                        (__v4si)(__B)))
 327
 328 /// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
 329 ///    corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
 330 ///    unsigned 32-bit results. Sum these 2 results with the corresponding
 331 ///    32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
 332 ///
 333 /// \headerfile <immintrin.h>
 334 ///
 335 /// \code
 336 /// __m256i _mm256_dpwuud_epi32(__m256i __W, __m256i __A, __m256i __B)
 337 /// \endcode
 338 ///
 339 /// This intrinsic corresponds to the \c VPDPWUUD instruction.
 340 ///
 341 /// \param __W
 342 ///    A 256-bit vector of [8 x unsigned int].
 343 /// \param __A
 344 ///    A 256-bit vector of [16 x unsigned short].
 345 /// \param __B
 346 ///    A 256-bit vector of [16 x unsigned short].
 347 /// \returns
 348 ///    A 256-bit vector of [8 x unsigned int].
 349 ///
 350 /// \code{.operation}
 351 /// FOR j := 0 to 7
 352 ///     tmp1.dword := ZeroExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j])
 353 ///     tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1])
 354 ///     dst.dword[j] := __W.dword[j] + tmp1 + tmp2
 355 /// ENDFOR
 356 /// dst[MAX:256] := 0
 357 /// \endcode
 358 #define _mm256_dpwuud_epi32(__W, __A, __B)                                     \
 359   ((__m256i)__builtin_ia32_vpdpwuud256((__v8si)(__W), (__v8si)(__A),           \
 360                                        (__v8si)(__B)))
 361
 362 /// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
 363 ///    corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
 364 ///    unsigned 32-bit results. Sum these 2 results with the corresponding
 365 ///    32-bit integer in \a __W with unsigned saturation, and store the packed
 366 ///    32-bit results in \a dst.
 367 ///
 368 /// \headerfile <immintrin.h>
 369 ///
 370 /// \code
 371 /// __m128i _mm_dpwuuds_epi32(__m128i __W, __m128i __A, __m128i __B)
 372 /// \endcode
 373 ///
 374 /// This intrinsic corresponds to the \c VPDPWUUDS instruction.
 375 ///
 376 /// \param __W
 377 ///    A 128-bit vector of [4 x unsigned int].
 378 /// \param __A
 379 ///    A 128-bit vector of [8 x unsigned short].
 380 /// \param __B
 381 ///    A 128-bit vector of [8 x unsigned short].
 382 /// \returns
 383 ///    A 128-bit vector of [4 x unsigned int].
 384 ///
 385 /// \code{.operation}
 386 /// FOR j := 0 to 3
 387 ///     tmp1.dword := ZeroExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j])
 388 ///     tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1])
 389 ///     dst.dword[j] := UNSIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2)
 390 /// ENDFOR
 391 /// dst[MAX:128] := 0
 392 /// \endcode
 393 #define _mm_dpwuuds_epi32(__W, __A, __B)                                       \
 394   ((__m128i)__builtin_ia32_vpdpwuuds128((__v4si)(__W), (__v4si)(__A),          \
 395                                         (__v4si)(__B)))
 396
 397 /// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
 398 ///    corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
 399 ///    unsigned 32-bit results. Sum these 2 results with the corresponding
 400 ///    32-bit integer in \a __W with unsigned saturation, and store the packed
 401 ///    32-bit results in \a dst.
 402 ///
 403 /// \headerfile <immintrin.h>
 404 ///
 405 /// \code
 406 /// __m256i _mm256_dpwuuds_epi32(__m256i __W, __m256i __A, __m256i __B)
 407 /// \endcode
 408 ///
 409 /// This intrinsic corresponds to the \c VPDPWUUDS instruction.
 410 ///
 411 /// \param __W
 412 ///    A 256-bit vector of [8 x unsigned int].
 413 /// \param __A
 414 ///    A 256-bit vector of [16 x unsigned short].
 415 /// \param __B
 416 ///    A 256-bit vector of [16 x unsigned short].
 417 /// \returns
 418 ///    A 256-bit vector of [8 x unsigned int].
 419 ///
 420 /// \code{.operation}
 421 /// FOR j := 0 to 7
 422 ///     tmp1.dword := ZeroExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j])
 423 ///     tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1])
 424 ///     dst.dword[j] := UNSIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2)
 425 /// ENDFOR
 426 /// dst[MAX:256] := 0
 427 /// \endcode
 428 #define _mm256_dpwuuds_epi32(__W, __A, __B)                                    \
 429   ((__m256i)__builtin_ia32_vpdpwuuds256((__v8si)(__W), (__v8si)(__A),          \
 430                                         (__v8si)(__B)))
 431
 432 #endif // __AVXVNNIINT16INTRIN_H