clang/lib/Headers/avxneconvertintrin.h

   1 /*===-------------- avxneconvertintrin.h - AVXNECONVERT --------------------===
   2  *
   3  * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
   4  * See https://llvm.org/LICENSE.txt for license information.
   5  * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
   6  *
   7  *===-----------------------------------------------------------------------===
   8  */
   9
  10 #ifndef __IMMINTRIN_H
  11 #error                                                                         \
  12     "Never use <avxneconvertintrin.h> directly; include <immintrin.h> instead."
  13 #endif // __IMMINTRIN_H
  14
  15 #ifdef __SSE2__
  16
  17 #ifndef __AVXNECONVERTINTRIN_H
  18 #define __AVXNECONVERTINTRIN_H
  19
/* Define the default attributes for the functions in this file.
 *
 * Both macros request forced inlining (__always_inline__), mark the function
 * __nodebug__, and enable the "avxneconvert" target feature for the function
 * they are applied to. They differ only in the __min_vector_width__ value:
 * 128 for the intrinsics working on 128-bit vectors and 256 for those taking
 * or returning 256-bit vectors. Both macros are #undef'd again at the end of
 * this header. */
#define __DEFAULT_FN_ATTRS128                                                  \
  __attribute__((__always_inline__, __nodebug__, __target__("avxneconvert"),   \
                 __min_vector_width__(128)))
#define __DEFAULT_FN_ATTRS256                                                  \
  __attribute__((__always_inline__, __nodebug__, __target__("avxneconvert"),   \
                 __min_vector_width__(256)))
  27
  28 /// Convert scalar BF16 (16-bit) floating-point element
  29 /// stored at memory locations starting at location \a __A to a
  30 /// single-precision (32-bit) floating-point, broadcast it to packed
  31 /// single-precision (32-bit) floating-point elements, and store the results in
  32 /// \a dst.
  33 ///
  34 /// \headerfile <x86intrin.h>
  35 ///
  36 /// \code
  37 /// _mm_bcstnebf16_ps(const void *__A);
  38 /// \endcode
  39 ///
  40 /// This intrinsic corresponds to the \c VBCSTNEBF162PS instruction.
  41 ///
  42 /// \param __A
  43 ///    A pointer to a 16-bit memory location. The address of the memory
  44 ///    location does not have to be aligned.
  45 /// \returns
  46 ///    A 128-bit vector of [4 x float].
  47 ///
  48 /// \code{.operation}
  49 /// b := Convert_BF16_To_FP32(MEM[__A+15:__A])
  50 /// FOR j := 0 to 3
  51 ///   m := j*32
  52 ///   dst[m+31:m] := b
  53 /// ENDFOR
  54 /// dst[MAX:128] := 0
  55 /// \endcode
  56 static __inline__ __m128 __DEFAULT_FN_ATTRS128
  57 _mm_bcstnebf16_ps(const void *__A) {
  58   return (__m128)__builtin_ia32_vbcstnebf162ps128((const __bf16 *)__A);
  59 }
  60
  61 /// Convert scalar BF16 (16-bit) floating-point element
  62 /// stored at memory locations starting at location \a __A to a
  63 /// single-precision (32-bit) floating-point, broadcast it to packed
  64 /// single-precision (32-bit) floating-point elements, and store the results in
  65 /// \a dst.
  66 ///
  67 /// \headerfile <x86intrin.h>
  68 ///
  69 /// \code
  70 /// _mm256_bcstnebf16_ps(const void *__A);
  71 /// \endcode
  72 ///
  73 /// This intrinsic corresponds to the \c VBCSTNEBF162PS instruction.
  74 ///
  75 /// \param __A
  76 ///    A pointer to a 16-bit memory location. The address of the memory
  77 ///    location does not have to be aligned.
  78 /// \returns
  79 ///    A 256-bit vector of [8 x float].
  80 ///
  81 /// \code{.operation}
  82 /// b := Convert_BF16_To_FP32(MEM[__A+15:__A])
  83 /// FOR j := 0 to 7
  84 ///   m := j*32
  85 ///   dst[m+31:m] := b
  86 /// ENDFOR
  87 /// dst[MAX:256] := 0
  88 /// \endcode
  89 static __inline__ __m256 __DEFAULT_FN_ATTRS256
  90 _mm256_bcstnebf16_ps(const void *__A) {
  91   return (__m256)__builtin_ia32_vbcstnebf162ps256((const __bf16 *)__A);
  92 }
  93
  94 /// Convert scalar half-precision (16-bit) floating-point element
  95 /// stored at memory locations starting at location \a __A to a
  96 /// single-precision (32-bit) floating-point, broadcast it to packed
  97 /// single-precision (32-bit) floating-point elements, and store the results in
  98 /// \a dst.
  99 ///
 100 /// \headerfile <x86intrin.h>
 101 ///
 102 /// \code
 103 /// _mm_bcstnesh_ps(const void *__A);
 104 /// \endcode
 105 ///
 106 /// This intrinsic corresponds to the \c VBCSTNESH2PS instruction.
 107 ///
 108 /// \param __A
 109 ///    A pointer to a 16-bit memory location. The address of the memory
 110 ///    location does not have to be aligned.
 111 /// \returns
 112 ///    A 128-bit vector of [4 x float].
 113 ///
 114 /// \code{.operation}
 115 /// b := Convert_FP16_To_FP32(MEM[__A+15:__A])
 116 /// FOR j := 0 to 3
 117 ///   m := j*32
 118 ///   dst[m+31:m] := b
 119 /// ENDFOR
 120 /// dst[MAX:128] := 0
 121 /// \endcode
 122 static __inline__ __m128 __DEFAULT_FN_ATTRS128
 123 _mm_bcstnesh_ps(const void *__A) {
 124   return (__m128)__builtin_ia32_vbcstnesh2ps128((const _Float16 *)__A);
 125 }
 126
 127 /// Convert scalar half-precision (16-bit) floating-point element
 128 /// stored at memory locations starting at location \a __A to a
 129 /// single-precision (32-bit) floating-point, broadcast it to packed
 130 /// single-precision (32-bit) floating-point elements, and store the results in
 131 /// \a dst.
 132 ///
 133 /// \headerfile <x86intrin.h>
 134 ///
 135 /// \code
 136 /// _mm256_bcstnesh_ps(const void *__A);
 137 /// \endcode
 138 ///
 139 /// This intrinsic corresponds to the \c VBCSTNESH2PS instruction.
 140 ///
 141 /// \param __A
 142 ///    A pointer to a 16-bit memory location. The address of the memory
 143 ///    location does not have to be aligned.
 144 /// \returns
 145 ///    A 256-bit vector of [8 x float].
 146 ///
 147 /// \code{.operation}
 148 /// b := Convert_FP16_To_FP32(MEM[__A+15:__A])
 149 /// FOR j := 0 to 7
 150 ///   m := j*32
 151 ///   dst[m+31:m] := b
 152 /// ENDFOR
 153 /// dst[MAX:256] := 0
 154 /// \endcode
 155 static __inline__ __m256 __DEFAULT_FN_ATTRS256
 156 _mm256_bcstnesh_ps(const void *__A) {
 157   return (__m256)__builtin_ia32_vbcstnesh2ps256((const _Float16 *)__A);
 158 }
 159
 160 /// Convert packed BF16 (16-bit) floating-point even-indexed elements
 161 /// stored at memory locations starting at location \a __A to packed
 162 /// single-precision (32-bit) floating-point elements, and store the results in
 163 /// \a dst.
 164 ///
 165 /// \headerfile <x86intrin.h>
 166 ///
 167 /// \code
 168 /// _mm_cvtneebf16_ps(const __m128bh *__A);
 169 /// \endcode
 170 ///
 171 /// This intrinsic corresponds to the \c VCVTNEEBF162PS instruction.
 172 ///
 173 /// \param __A
 174 ///    A pointer to a 128-bit memory location containing 8 consecutive
 175 ///    BF16 (16-bit) floating-point values.
 176 /// \returns
 177 ///    A 128-bit vector of [4 x float].
 178 ///
 179 /// \code{.operation}
 180 /// FOR j := 0 to 3
 181 ///     k := j*2
 182 ///     i := k*16
 183 ///     m := j*32
 184 ///     dst[m+31:m] := Convert_BF16_To_FP32(MEM[__A+i+15:__A+i])
 185 /// ENDFOR
 186 /// dst[MAX:128] := 0
 187 /// \endcode
 188 static __inline__ __m128 __DEFAULT_FN_ATTRS128
 189 _mm_cvtneebf16_ps(const __m128bh *__A) {
 190   return (__m128)__builtin_ia32_vcvtneebf162ps128((const __v8bf *)__A);
 191 }
 192
 193 /// Convert packed BF16 (16-bit) floating-point even-indexed elements
 194 /// stored at memory locations starting at location \a __A to packed
 195 /// single-precision (32-bit) floating-point elements, and store the results in
 196 /// \a dst.
 197 ///
 198 /// \headerfile <x86intrin.h>
 199 ///
 200 /// \code
 201 /// _mm256_cvtneebf16_ps(const __m256bh *__A);
 202 /// \endcode
 203 ///
 204 /// This intrinsic corresponds to the \c VCVTNEEBF162PS instruction.
 205 ///
 206 /// \param __A
 207 ///    A pointer to a 256-bit memory location containing 16 consecutive
 208 ///    BF16 (16-bit) floating-point values.
 209 /// \returns
 210 ///    A 256-bit vector of [8 x float].
 211 ///
 212 /// \code{.operation}
 213 /// FOR j := 0 to 7
 214 ///     k := j*2
 215 ///     i := k*16
 216 ///     m := j*32
 217 ///     dst[m+31:m] := Convert_BF16_To_FP32(MEM[__A+i+15:__A+i])
 218 /// ENDFOR
 219 /// dst[MAX:256] := 0
 220 /// \endcode
 221 static __inline__ __m256 __DEFAULT_FN_ATTRS256
 222 _mm256_cvtneebf16_ps(const __m256bh *__A) {
 223   return (__m256)__builtin_ia32_vcvtneebf162ps256((const __v16bf *)__A);
 224 }
 225
 226 /// Convert packed half-precision (16-bit) floating-point even-indexed elements
 227 /// stored at memory locations starting at location \a __A to packed
 228 /// single-precision (32-bit) floating-point elements, and store the results in
 229 /// \a dst.
 230 ///
 231 /// \headerfile <x86intrin.h>
 232 ///
 233 /// \code
 234 /// _mm_cvtneeph_ps(const __m128h *__A);
 235 /// \endcode
 236 ///
 237 /// This intrinsic corresponds to the \c VCVTNEEPH2PS instruction.
 238 ///
 239 /// \param __A
 240 ///    A pointer to a 128-bit memory location containing 8 consecutive
 241 ///    half-precision (16-bit) floating-point values.
 242 /// \returns
 243 ///    A 128-bit vector of [4 x float].
 244 ///
 245 /// \code{.operation}
 246 /// FOR j := 0 to 3
 247 ///     k := j*2
 248 ///     i := k*16
 249 ///     m := j*32
 250 ///     dst[m+31:m] := Convert_FP16_To_FP32(MEM[__A+i+15:__A+i])
 251 /// ENDFOR
 252 /// dst[MAX:128] := 0
 253 /// \endcode
 254 static __inline__ __m128 __DEFAULT_FN_ATTRS128
 255 _mm_cvtneeph_ps(const __m128h *__A) {
 256   return (__m128)__builtin_ia32_vcvtneeph2ps128((const __v8hf *)__A);
 257 }
 258
 259 /// Convert packed half-precision (16-bit) floating-point even-indexed elements
 260 /// stored at memory locations starting at location \a __A to packed
 261 /// single-precision (32-bit) floating-point elements, and store the results in
 262 /// \a dst.
 263 ///
 264 /// \headerfile <x86intrin.h>
 265 ///
 266 /// \code
 267 /// _mm256_cvtneeph_ps(const __m256h *__A);
 268 /// \endcode
 269 ///
 270 /// This intrinsic corresponds to the \c VCVTNEEPH2PS instruction.
 271 ///
 272 /// \param __A
 273 ///    A pointer to a 256-bit memory location containing 16 consecutive
 274 ///    half-precision (16-bit) floating-point values.
 275 /// \returns
 276 ///    A 256-bit vector of [8 x float].
 277 ///
 278 /// \code{.operation}
 279 /// FOR j := 0 to 7
 280 ///     k := j*2
 281 ///     i := k*16
 282 ///     m := j*32
 283 ///     dst[m+31:m] := Convert_FP16_To_FP32(MEM[__A+i+15:__A+i])
 284 /// ENDFOR
 285 /// dst[MAX:256] := 0
 286 /// \endcode
 287 static __inline__ __m256 __DEFAULT_FN_ATTRS256
 288 _mm256_cvtneeph_ps(const __m256h *__A) {
 289   return (__m256)__builtin_ia32_vcvtneeph2ps256((const __v16hf *)__A);
 290 }
 291
 292 /// Convert packed BF16 (16-bit) floating-point odd-indexed elements
 293 /// stored at memory locations starting at location \a __A to packed
 294 /// single-precision (32-bit) floating-point elements, and store the results in
 295 /// \a dst.
 296 ///
 297 /// \headerfile <x86intrin.h>
 298 ///
 299 /// \code
 300 /// _mm_cvtneobf16_ps(const __m128bh *__A);
 301 /// \endcode
 302 ///
 303 /// This intrinsic corresponds to the \c VCVTNEOBF162PS instruction.
 304 ///
 305 /// \param __A
 306 ///    A pointer to a 128-bit memory location containing 8 consecutive
 307 ///    BF16 (16-bit) floating-point values.
 308 /// \returns
 309 ///    A 128-bit vector of [4 x float].
 310 ///
 311 /// \code{.operation}
 312 /// FOR j := 0 to 3
 313 ///     k := j*2+1
 314 ///     i := k*16
 315 ///     m := j*32
 316 ///     dst[m+31:m] := Convert_BF16_To_FP32(MEM[__A+i+15:__A+i])
 317 /// ENDFOR
 318 /// dst[MAX:128] := 0
 319 /// \endcode
 320 static __inline__ __m128 __DEFAULT_FN_ATTRS128
 321 _mm_cvtneobf16_ps(const __m128bh *__A) {
 322   return (__m128)__builtin_ia32_vcvtneobf162ps128((const __v8bf *)__A);
 323 }
 324
 325 /// Convert packed BF16 (16-bit) floating-point odd-indexed elements
 326 /// stored at memory locations starting at location \a __A to packed
 327 /// single-precision (32-bit) floating-point elements, and store the results in
 328 /// \a dst.
 329 ///
 330 /// \headerfile <x86intrin.h>
 331 ///
 332 /// \code
 333 /// _mm256_cvtneobf16_ps(const __m256bh *__A);
 334 /// \endcode
 335 ///
 336 /// This intrinsic corresponds to the \c VCVTNEOBF162PS instruction.
 337 ///
 338 /// \param __A
 339 ///    A pointer to a 256-bit memory location containing 16 consecutive
 340 ///    BF16 (16-bit) floating-point values.
 341 /// \returns
 342 ///    A 256-bit vector of [8 x float].
 343 ///
 344 /// \code{.operation}
 345 /// FOR j := 0 to 7
 346 ///     k := j*2+1
 347 ///     i := k*16
 348 ///     m := j*32
 349 ///     dst[m+31:m] := Convert_BF16_To_FP32(MEM[__A+i+15:__A+i])
 350 /// ENDFOR
 351 /// dst[MAX:256] := 0
 352 /// \endcode
 353 static __inline__ __m256 __DEFAULT_FN_ATTRS256
 354 _mm256_cvtneobf16_ps(const __m256bh *__A) {
 355   return (__m256)__builtin_ia32_vcvtneobf162ps256((const __v16bf *)__A);
 356 }
 357
 358 /// Convert packed half-precision (16-bit) floating-point odd-indexed elements
 359 /// stored at memory locations starting at location \a __A to packed
 360 /// single-precision (32-bit) floating-point elements, and store the results in
 361 /// \a dst.
 362 ///
 363 /// \headerfile <x86intrin.h>
 364 ///
 365 /// \code
 366 /// _mm_cvtneoph_ps(const __m128h *__A);
 367 /// \endcode
 368 ///
 369 /// This intrinsic corresponds to the \c VCVTNEOPH2PS instruction.
 370 ///
 371 /// \param __A
 372 ///    A pointer to a 128-bit memory location containing 8 consecutive
 373 ///    half-precision (16-bit) floating-point values.
 374 /// \returns
 375 ///    A 128-bit vector of [4 x float].
 376 ///
 377 /// \code{.operation}
 378 /// FOR j := 0 to 3
 379 ///     k := j*2+1
 380 ///     i := k*16
 381 ///     m := j*32
 382 ///     dst[m+31:m] := Convert_FP16_To_FP32(MEM[__A+i+15:__A+i])
 383 /// ENDFOR
 384 /// dst[MAX:128] := 0
 385 /// \endcode
 386 static __inline__ __m128 __DEFAULT_FN_ATTRS128
 387 _mm_cvtneoph_ps(const __m128h *__A) {
 388   return (__m128)__builtin_ia32_vcvtneoph2ps128((const __v8hf *)__A);
 389 }
 390
 391 /// Convert packed half-precision (16-bit) floating-point odd-indexed elements
 392 /// stored at memory locations starting at location \a __A to packed
 393 /// single-precision (32-bit) floating-point elements, and store the results in
 394 /// \a dst.
 395 ///
 396 /// \headerfile <x86intrin.h>
 397 ///
 398 /// \code
 399 /// _mm256_cvtneoph_ps(const __m256h *__A);
 400 /// \endcode
 401 ///
 402 /// This intrinsic corresponds to the \c VCVTNEOPH2PS instruction.
 403 ///
 404 /// \param __A
 405 ///    A pointer to a 256-bit memory location containing 16 consecutive
 406 ///    half-precision (16-bit) floating-point values.
 407 /// \returns
 408 ///    A 256-bit vector of [8 x float].
 409 ///
 410 /// \code{.operation}
 411 /// FOR j := 0 to 7
 412 ///     k := j*2+1
 413 ///     i := k*16
 414 ///     m := j*32
 415 ///     dst[m+31:m] := Convert_FP16_To_FP32(MEM[__A+i+15:__A+i])
 416 /// ENDFOR
 417 /// dst[MAX:256] := 0
 418 /// \endcode
 419 static __inline__ __m256 __DEFAULT_FN_ATTRS256
 420 _mm256_cvtneoph_ps(const __m256h *__A) {
 421   return (__m256)__builtin_ia32_vcvtneoph2ps256((const __v16hf *)__A);
 422 }
 423
 424 /// Convert packed single-precision (32-bit) floating-point elements in \a __A
 425 /// to packed BF16 (16-bit) floating-point elements, and store the results in \a
 426 /// dst.
 427 ///
 428 /// \headerfile <x86intrin.h>
 429 ///
 430 /// \code
 431 /// _mm_cvtneps_avx_pbh(__m128 __A);
 432 /// \endcode
 433 ///
 434 /// This intrinsic corresponds to the \c VCVTNEPS2BF16 instruction.
 435 ///
 436 /// \param __A
 437 ///    A 128-bit vector of [4 x float].
 438 /// \returns
 439 ///    A 128-bit vector of [8 x bfloat].
 440 ///
 441 /// \code{.operation}
 442 /// FOR j := 0 to 3
 443 ///     dst.word[j] := Convert_FP32_To_BF16(__A.fp32[j])
 444 /// ENDFOR
 445 /// dst[MAX:128] := 0
 446 /// \endcode
 447 static __inline__ __m128bh __DEFAULT_FN_ATTRS128
 448 _mm_cvtneps_avx_pbh(__m128 __A) {
 449   return (__m128bh)__builtin_ia32_vcvtneps2bf16128((__v4sf)__A);
 450 }
 451
 452 /// Convert packed single-precision (32-bit) floating-point elements in \a __A
 453 /// to packed BF16 (16-bit) floating-point elements, and store the results in \a
 454 /// dst.
 455 ///
 456 /// \headerfile <x86intrin.h>
 457 ///
 458 /// \code
 459 /// _mm256_cvtneps_avx_pbh(__m256 __A);
 460 /// \endcode
 461 ///
 462 /// This intrinsic corresponds to the \c VCVTNEPS2BF16 instruction.
 463 ///
 464 /// \param __A
 465 ///    A 256-bit vector of [8 x float].
 466 /// \returns
 467 ///    A 128-bit vector of [8 x bfloat].
 468 ///
 469 /// \code{.operation}
 470 /// FOR j := 0 to 7
 471 ///     dst.word[j] := Convert_FP32_To_BF16(a.fp32[j])
 472 /// ENDFOR
 473 /// dst[MAX:128] := 0
 474 /// \endcode
 475 static __inline__ __m128bh __DEFAULT_FN_ATTRS256
 476 _mm256_cvtneps_avx_pbh(__m256 __A) {
 477   return (__m128bh)__builtin_ia32_vcvtneps2bf16256((__v8sf)__A);
 478 }
 479
 480 #undef __DEFAULT_FN_ATTRS128
 481 #undef __DEFAULT_FN_ATTRS256
 482
 483 #endif // __AVXNECONVERTINTRIN_H
 484 #endif // __SSE2__