clang/lib/Headers/avx512bf16intrin.h

   1 /*===------------ avx512bf16intrin.h - AVX512_BF16 intrinsics --------------===
   2  *
   3  * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
   4  * See https://llvm.org/LICENSE.txt for license information.
   5  * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
   6  *
   7  *===-----------------------------------------------------------------------===
   8  */
   9 #ifndef __IMMINTRIN_H
  10 #error "Never use <avx512bf16intrin.h> directly; include <immintrin.h> instead."
  11 #endif
  12
  13 #ifdef __SSE2__
  14
  15 #ifndef __AVX512BF16INTRIN_H
  16 #define __AVX512BF16INTRIN_H
  17
  18 typedef __bf16 __v32bf __attribute__((__vector_size__(64), __aligned__(64)));
  19 typedef __bf16 __m512bh __attribute__((__vector_size__(64), __aligned__(64)));
  20 typedef __bf16 __bfloat16 __attribute__((deprecated("use __bf16 instead")));
  21
  22 #define __DEFAULT_FN_ATTRS512 \
  23   __attribute__((__always_inline__, __nodebug__, __target__("avx512bf16,evex512"), \
  24                  __min_vector_width__(512)))
  25 #define __DEFAULT_FN_ATTRS                                                     \
  26   __attribute__((__always_inline__, __nodebug__,                               \
  27                  __target__("avx512bf16,no-evex512")))
  28
  29 /// Convert One BF16 Data to One Single Float Data.
  30 ///
  31 /// \headerfile <x86intrin.h>
  32 ///
  33 /// This intrinsic does not correspond to a specific instruction.
  34 ///
  35 /// \param __A
  36 ///    A bfloat data.
  37 /// \returns A float data whose sign field and exponent field keep unchanged,
  38 ///    and fraction field is extended to 23 bits.
  39 static __inline__ float __DEFAULT_FN_ATTRS _mm_cvtsbh_ss(__bf16 __A) {
  40   return __builtin_ia32_cvtsbf162ss_32(__A);
  41 }
  42
  43 /// Convert Two Packed Single Data to One Packed BF16 Data.
  44 ///
  45 /// \headerfile <x86intrin.h>
  46 ///
  47 /// This intrinsic corresponds to the <c> VCVTNE2PS2BF16 </c> instructions.
  48 ///
  49 /// \param __A
  50 ///    A 512-bit vector of [16 x float].
  51 /// \param __B
  52 ///    A 512-bit vector of [16 x float].
  53 /// \returns A 512-bit vector of [32 x bfloat] whose lower 256 bits come from
  54 ///    conversion of __B, and higher 256 bits come from conversion of __A.
  55 static __inline__ __m512bh __DEFAULT_FN_ATTRS512
  56 _mm512_cvtne2ps_pbh(__m512 __A, __m512 __B) {
  57   return (__m512bh)__builtin_ia32_cvtne2ps2bf16_512((__v16sf) __A,
  58                                                     (__v16sf) __B);
  59 }
  60
  61 /// Convert Two Packed Single Data to One Packed BF16 Data.
  62 ///
  63 /// \headerfile <x86intrin.h>
  64 ///
  65 /// This intrinsic corresponds to the <c> VCVTNE2PS2BF16 </c> instructions.
  66 ///
  67 /// \param __A
  68 ///    A 512-bit vector of [16 x float].
  69 /// \param __B
  70 ///    A 512-bit vector of [16 x float].
  71 /// \param __W
  72 ///    A 512-bit vector of [32 x bfloat].
  73 /// \param __U
  74 ///    A 32-bit mask value specifying what is chosen for each element.
  75 ///    A 1 means conversion of __A or __B. A 0 means element from __W.
  76 /// \returns A 512-bit vector of [32 x bfloat] whose lower 256 bits come from
  77 ///    conversion of __B, and higher 256 bits come from conversion of __A.
  78 static __inline__ __m512bh __DEFAULT_FN_ATTRS512
  79 _mm512_mask_cvtne2ps_pbh(__m512bh __W, __mmask32 __U, __m512 __A, __m512 __B) {
  80   return (__m512bh)__builtin_ia32_selectpbf_512((__mmask32)__U,
  81                                         (__v32bf)_mm512_cvtne2ps_pbh(__A, __B),
  82                                         (__v32bf)__W);
  83 }
  84
  85 /// Convert Two Packed Single Data to One Packed BF16 Data.
  86 ///
  87 /// \headerfile <x86intrin.h>
  88 ///
  89 /// This intrinsic corresponds to the <c> VCVTNE2PS2BF16 </c> instructions.
  90 ///
  91 /// \param __A
  92 ///    A 512-bit vector of [16 x float].
  93 /// \param __B
  94 ///    A 512-bit vector of [16 x float].
  95 /// \param __U
  96 ///    A 32-bit mask value specifying what is chosen for each element.
  97 ///    A 1 means conversion of __A or __B. A 0 means element is zero.
  98 /// \returns A 512-bit vector of [32 x bfloat] whose lower 256 bits come from
  99 ///    conversion of __B, and higher 256 bits come from conversion of __A.
 100 static __inline__ __m512bh __DEFAULT_FN_ATTRS512
 101 _mm512_maskz_cvtne2ps_pbh(__mmask32 __U, __m512 __A, __m512 __B) {
 102   return (__m512bh)__builtin_ia32_selectpbf_512((__mmask32)__U,
 103                                         (__v32bf)_mm512_cvtne2ps_pbh(__A, __B),
 104                                         (__v32bf)_mm512_setzero_si512());
 105 }
 106
 107 /// Convert Packed Single Data to Packed BF16 Data.
 108 ///
 109 /// \headerfile <x86intrin.h>
 110 ///
 111 /// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions.
 112 ///
 113 /// \param __A
 114 ///    A 512-bit vector of [16 x float].
 115 /// \returns A 256-bit vector of [16 x bfloat] come from conversion of __A.
 116 static __inline__ __m256bh __DEFAULT_FN_ATTRS512
 117 _mm512_cvtneps_pbh(__m512 __A) {
 118   return (__m256bh)__builtin_ia32_cvtneps2bf16_512_mask((__v16sf)__A,
 119                                               (__v16bf)_mm256_undefined_si256(),
 120                                               (__mmask16)-1);
 121 }
 122
 123 /// Convert Packed Single Data to Packed BF16 Data.
 124 ///
 125 /// \headerfile <x86intrin.h>
 126 ///
 127 /// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions.
 128 ///
 129 /// \param __A
 130 ///    A 512-bit vector of [16 x float].
 131 /// \param __W
 132 ///    A 256-bit vector of [16 x bfloat].
 133 /// \param __U
 134 ///    A 16-bit mask value specifying what is chosen for each element.
 135 ///    A 1 means conversion of __A. A 0 means element from __W.
 136 /// \returns A 256-bit vector of [16 x bfloat] come from conversion of __A.
 137 static __inline__ __m256bh __DEFAULT_FN_ATTRS512
 138 _mm512_mask_cvtneps_pbh(__m256bh __W, __mmask16 __U, __m512 __A) {
 139   return (__m256bh)__builtin_ia32_cvtneps2bf16_512_mask((__v16sf)__A,
 140                                                         (__v16bf)__W,
 141                                                         (__mmask16)__U);
 142 }
 143
 144 /// Convert Packed Single Data to Packed BF16 Data.
 145 ///
 146 /// \headerfile <x86intrin.h>
 147 ///
 148 /// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions.
 149 ///
 150 /// \param __A
 151 ///    A 512-bit vector of [16 x float].
 152 /// \param __U
 153 ///    A 16-bit mask value specifying what is chosen for each element.
 154 ///    A 1 means conversion of __A. A 0 means element is zero.
 155 /// \returns A 256-bit vector of [16 x bfloat] come from conversion of __A.
 156 static __inline__ __m256bh __DEFAULT_FN_ATTRS512
 157 _mm512_maskz_cvtneps_pbh(__mmask16 __U, __m512 __A) {
 158   return (__m256bh)__builtin_ia32_cvtneps2bf16_512_mask((__v16sf)__A,
 159                                                 (__v16bf)_mm256_setzero_si256(),
 160                                                 (__mmask16)__U);
 161 }
 162
 163 /// Dot Product of BF16 Pairs Accumulated into Packed Single Precision.
 164 ///
 165 /// \headerfile <x86intrin.h>
 166 ///
 167 /// This intrinsic corresponds to the <c> VDPBF16PS </c> instructions.
 168 ///
 169 /// \param __A
 170 ///    A 512-bit vector of [32 x bfloat].
 171 /// \param __B
 172 ///    A 512-bit vector of [32 x bfloat].
 173 /// \param __D
 174 ///    A 512-bit vector of [16 x float].
 175 /// \returns A 512-bit vector of [16 x float] comes from  Dot Product of
 176 ///  __A, __B and __D
 177 static __inline__ __m512 __DEFAULT_FN_ATTRS512
 178 _mm512_dpbf16_ps(__m512 __D, __m512bh __A, __m512bh __B) {
 179   return (__m512)__builtin_ia32_dpbf16ps_512((__v16sf) __D,
 180                                              (__v32bf) __A,
 181                                              (__v32bf) __B);
 182 }
 183
 184 /// Dot Product of BF16 Pairs Accumulated into Packed Single Precision.
 185 ///
 186 /// \headerfile <x86intrin.h>
 187 ///
 188 /// This intrinsic corresponds to the <c> VDPBF16PS </c> instructions.
 189 ///
 190 /// \param __A
 191 ///    A 512-bit vector of [32 x bfloat].
 192 /// \param __B
 193 ///    A 512-bit vector of [32 x bfloat].
 194 /// \param __D
 195 ///    A 512-bit vector of [16 x float].
 196 /// \param __U
 197 ///    A 16-bit mask value specifying what is chosen for each element.
 198 ///    A 1 means __A and __B's dot product accumulated with __D. A 0 means __D.
 199 /// \returns A 512-bit vector of [16 x float] comes from  Dot Product of
 200 ///  __A, __B and __D
 201 static __inline__ __m512 __DEFAULT_FN_ATTRS512
 202 _mm512_mask_dpbf16_ps(__m512 __D, __mmask16 __U, __m512bh __A, __m512bh __B) {
 203   return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
 204                                        (__v16sf)_mm512_dpbf16_ps(__D, __A, __B),
 205                                        (__v16sf)__D);
 206 }
 207
 208 /// Dot Product of BF16 Pairs Accumulated into Packed Single Precision.
 209 ///
 210 /// \headerfile <x86intrin.h>
 211 ///
 212 /// This intrinsic corresponds to the <c> VDPBF16PS </c> instructions.
 213 ///
 214 /// \param __A
 215 ///    A 512-bit vector of [32 x bfloat].
 216 /// \param __B
 217 ///    A 512-bit vector of [32 x bfloat].
 218 /// \param __D
 219 ///    A 512-bit vector of [16 x float].
 220 /// \param __U
 221 ///    A 16-bit mask value specifying what is chosen for each element.
 222 ///    A 1 means __A and __B's dot product accumulated with __D. A 0 means 0.
 223 /// \returns A 512-bit vector of [16 x float] comes from  Dot Product of
 224 ///  __A, __B and __D
 225 static __inline__ __m512 __DEFAULT_FN_ATTRS512
 226 _mm512_maskz_dpbf16_ps(__mmask16 __U, __m512 __D, __m512bh __A, __m512bh __B) {
 227   return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
 228                                        (__v16sf)_mm512_dpbf16_ps(__D, __A, __B),
 229                                        (__v16sf)_mm512_setzero_si512());
 230 }
 231
 232 /// Convert Packed BF16 Data to Packed float Data.
 233 ///
 234 /// \headerfile <x86intrin.h>
 235 ///
 236 /// \param __A
 237 ///    A 256-bit vector of [16 x bfloat].
 238 /// \returns A 512-bit vector of [16 x float] come from conversion of __A
 239 static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_cvtpbh_ps(__m256bh __A) {
 240   return _mm512_castsi512_ps((__m512i)_mm512_slli_epi32(
 241       (__m512i)_mm512_cvtepi16_epi32((__m256i)__A), 16));
 242 }
 243
 244 /// Convert Packed BF16 Data to Packed float Data using zeroing mask.
 245 ///
 246 /// \headerfile <x86intrin.h>
 247 ///
 248 /// \param __U
 249 ///    A 16-bit mask. Elements are zeroed out when the corresponding mask
 250 ///    bit is not set.
 251 /// \param __A
 252 ///    A 256-bit vector of [16 x bfloat].
 253 /// \returns A 512-bit vector of [16 x float] come from conversion of __A
 254 static __inline__ __m512 __DEFAULT_FN_ATTRS512
 255 _mm512_maskz_cvtpbh_ps(__mmask16 __U, __m256bh __A) {
 256   return _mm512_castsi512_ps((__m512i)_mm512_slli_epi32(
 257       (__m512i)_mm512_maskz_cvtepi16_epi32((__mmask16)__U, (__m256i)__A), 16));
 258 }
 259
 260 /// Convert Packed BF16 Data to Packed float Data using merging mask.
 261 ///
 262 /// \headerfile <x86intrin.h>
 263 ///
 264 /// \param __S
 265 ///    A 512-bit vector of [16 x float]. Elements are copied from __S when
 266 ///     the corresponding mask bit is not set.
 267 /// \param __U
 268 ///    A 16-bit mask.
 269 /// \param __A
 270 ///    A 256-bit vector of [16 x bfloat].
 271 /// \returns A 512-bit vector of [16 x float] come from conversion of __A
 272 static __inline__ __m512 __DEFAULT_FN_ATTRS512
 273 _mm512_mask_cvtpbh_ps(__m512 __S, __mmask16 __U, __m256bh __A) {
 274   return _mm512_castsi512_ps((__m512i)_mm512_mask_slli_epi32(
 275       (__m512i)__S, (__mmask16)__U,
 276       (__m512i)_mm512_cvtepi16_epi32((__m256i)__A), 16));
 277 }
 278
 279 #undef __DEFAULT_FN_ATTRS
 280 #undef __DEFAULT_FN_ATTRS512
 281
 282 #endif
 283 #endif