llvm/test/CodeGen/X86/avx512bf16-vl-intrinsics.ll

   1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
   2 ; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512bf16 -mattr=+avx512vl --show-mc-encoding | FileCheck %s --check-prefixes=CHECK,X86
   3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bf16 -mattr=+avx512vl --show-mc-encoding | FileCheck %s --check-prefixes=CHECK,X64
   4
   5 declare <8 x bfloat> @llvm.x86.avx512bf16.cvtne2ps2bf16.128(<4 x float>, <4 x float>) #1
   6
   ; Unmasked 128-bit case. Calls llvm.x86.avx512bf16.cvtne2ps2bf16.128 on %A and
   ; %B and returns the <8 x bfloat> result reinterpreted as <2 x i64>. Both RUN
   ; configurations share the same checks and expect a single vcvtne2ps2bf16 on
   ; xmm registers followed by the return.
   7 define <2 x i64> @test_mm_cvtne2ps2bf16_128(<4 x float> %A, <4 x float> %B) local_unnamed_addr #0 {
   8 ; CHECK-LABEL: test_mm_cvtne2ps2bf16_128:
   9 ; CHECK:       # %bb.0: # %entry
  10 ; CHECK-NEXT:    vcvtne2ps2bf16 %xmm1, %xmm0, %xmm0 # encoding: [0x62,0xf2,0x7f,0x08,0x72,0xc1]
  11 ; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
  12 entry:
  13   %0 = tail call <8 x bfloat> @llvm.x86.avx512bf16.cvtne2ps2bf16.128(<4 x float> %A, <4 x float> %B) #2
  14   %1 = bitcast <8 x bfloat> %0 to <2 x i64>
  15   ret <2 x i64> %1
  16 }
  17
  ; Zero-masking 128-bit case. The intrinsic result is selected against
  ; zeroinitializer under the i8 mask %U (bitcast to <8 x i1>). The checks expect
  ; the select to be folded into the instruction as a {%k1} {z} operand. On i686
  ; the mask is loaded from the stack; on x86_64 it is moved from %edi.
  18 define <2 x i64> @test_mm_maskz_cvtne2ps2bf16_128(<4 x float> %A, <4 x float> %B, i8 zeroext %U) local_unnamed_addr #0 {
  19 ; X86-LABEL: test_mm_maskz_cvtne2ps2bf16_128:
  20 ; X86:       # %bb.0: # %entry
  21 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
  22 ; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
  23 ; X86-NEXT:    vcvtne2ps2bf16 %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7f,0x89,0x72,0xc1]
  24 ; X86-NEXT:    retl # encoding: [0xc3]
  25 ;
  26 ; X64-LABEL: test_mm_maskz_cvtne2ps2bf16_128:
  27 ; X64:       # %bb.0: # %entry
  28 ; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
  29 ; X64-NEXT:    vcvtne2ps2bf16 %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7f,0x89,0x72,0xc1]
  30 ; X64-NEXT:    retq # encoding: [0xc3]
  31 entry:
  32   %0 = tail call <8 x bfloat> @llvm.x86.avx512bf16.cvtne2ps2bf16.128(<4 x float> %A, <4 x float> %B) #2
  33   %1 = bitcast i8 %U to <8 x i1>
  34   %2 = select <8 x i1> %1, <8 x bfloat> %0, <8 x bfloat> zeroinitializer
  35   %3 = bitcast <8 x bfloat> %2 to <2 x i64>
  36   ret <2 x i64> %3
  37 }
  38
  ; Merge-masking 128-bit case. The intrinsic result is selected against %C
  ; (bitcast to <8 x bfloat>) under the i8 mask %U. The checks expect the select
  ; to be folded into a {%k1} merge-masked vcvtne2ps2bf16 writing %xmm0, with no
  ; separate blend instruction.
  39 define <2 x i64> @test_mm_mask_cvtne2ps2bf16_128(<2 x i64> %C, i8 zeroext %U, <4 x float> %A, <4 x float> %B) local_unnamed_addr #0 {
  40 ; X86-LABEL: test_mm_mask_cvtne2ps2bf16_128:
  41 ; X86:       # %bb.0: # %entry
  42 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
  43 ; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
  44 ; X86-NEXT:    vcvtne2ps2bf16 %xmm2, %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf2,0x77,0x09,0x72,0xc2]
  45 ; X86-NEXT:    retl # encoding: [0xc3]
  46 ;
  47 ; X64-LABEL: test_mm_mask_cvtne2ps2bf16_128:
  48 ; X64:       # %bb.0: # %entry
  49 ; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
  50 ; X64-NEXT:    vcvtne2ps2bf16 %xmm2, %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf2,0x77,0x09,0x72,0xc2]
  51 ; X64-NEXT:    retq # encoding: [0xc3]
  52 entry:
  53   %0 = tail call <8 x bfloat> @llvm.x86.avx512bf16.cvtne2ps2bf16.128(<4 x float> %A, <4 x float> %B) #2
  54   %1 = bitcast <2 x i64> %C to <8 x bfloat>
  55   %2 = bitcast i8 %U to <8 x i1>
  56   %3 = select <8 x i1> %2, <8 x bfloat> %0, <8 x bfloat> %1
  57   %4 = bitcast <8 x bfloat> %3 to <2 x i64>
  58   ret <2 x i64> %4
  59 }
  60
  61 declare <16 x bfloat> @llvm.x86.avx512bf16.cvtne2ps2bf16.256(<8 x float>, <8 x float>) #3
  62
  ; Unmasked 256-bit case. Calls llvm.x86.avx512bf16.cvtne2ps2bf16.256 on two
  ; <8 x float> inputs and returns the <16 x bfloat> result as <4 x i64>. The
  ; shared checks expect a single vcvtne2ps2bf16 on ymm registers.
  63 define <4 x i64> @test_mm256_cvtne2ps2bf16_256(<8 x float> %A, <8 x float> %B) local_unnamed_addr #1 {
  64 ; CHECK-LABEL: test_mm256_cvtne2ps2bf16_256:
  65 ; CHECK:       # %bb.0: # %entry
  66 ; CHECK-NEXT:    vcvtne2ps2bf16 %ymm1, %ymm0, %ymm0 # encoding: [0x62,0xf2,0x7f,0x28,0x72,0xc1]
  67 ; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
  68 entry:
  69   %0 = tail call <16 x bfloat> @llvm.x86.avx512bf16.cvtne2ps2bf16.256(<8 x float> %A, <8 x float> %B) #4
  70   %1 = bitcast <16 x bfloat> %0 to <4 x i64>
  71   ret <4 x i64> %1
  72 }
  73
  ; Zero-masking 256-bit case. The mask is an i16 (bitcast to <16 x i1>) and the
  ; intrinsic result is selected against zeroinitializer. The checks expect a
  ; {%k1} {z} vcvtne2ps2bf16 on ymm registers. On i686 the mask is loaded into
  ; %k1 directly from the stack with kmovw; on x86_64 it is moved from %edi.
  74 define <4 x i64> @test_mm256_maskz_cvtne2ps2bf16_256(<8 x float> %A, <8 x float> %B, i16 zeroext %U) local_unnamed_addr #1 {
  75 ; X86-LABEL: test_mm256_maskz_cvtne2ps2bf16_256:
  76 ; X86:       # %bb.0: # %entry
  77 ; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
  78 ; X86-NEXT:    vcvtne2ps2bf16 %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x7f,0xa9,0x72,0xc1]
  79 ; X86-NEXT:    retl # encoding: [0xc3]
  80 ;
  81 ; X64-LABEL: test_mm256_maskz_cvtne2ps2bf16_256:
  82 ; X64:       # %bb.0: # %entry
  83 ; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
  84 ; X64-NEXT:    vcvtne2ps2bf16 %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x7f,0xa9,0x72,0xc1]
  85 ; X64-NEXT:    retq # encoding: [0xc3]
  86 entry:
  87   %0 = tail call <16 x bfloat> @llvm.x86.avx512bf16.cvtne2ps2bf16.256(<8 x float> %A, <8 x float> %B) #4
  88   %1 = bitcast i16 %U to <16 x i1>
  89   %2 = select <16 x i1> %1, <16 x bfloat> %0, <16 x bfloat> zeroinitializer
  90   %3 = bitcast <16 x bfloat> %2 to <4 x i64>
  91   ret <4 x i64> %3
  92 }
  93
  ; Merge-masking 256-bit case. The intrinsic result is selected against %C
  ; (bitcast to <16 x bfloat>) under the i16 mask %U. The checks expect a {%k1}
  ; merge-masked vcvtne2ps2bf16 writing %ymm0, with no separate blend.
  94 define <4 x i64> @test_mm256_mask_cvtne2ps2bf16_256(<4 x i64> %C, i16 zeroext %U, <8 x float> %A, <8 x float> %B) local_unnamed_addr #1 {
  95 ; X86-LABEL: test_mm256_mask_cvtne2ps2bf16_256:
  96 ; X86:       # %bb.0: # %entry
  97 ; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
  98 ; X86-NEXT:    vcvtne2ps2bf16 %ymm2, %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf2,0x77,0x29,0x72,0xc2]
  99 ; X86-NEXT:    retl # encoding: [0xc3]
 100 ;
 101 ; X64-LABEL: test_mm256_mask_cvtne2ps2bf16_256:
 102 ; X64:       # %bb.0: # %entry
 103 ; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
 104 ; X64-NEXT:    vcvtne2ps2bf16 %ymm2, %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf2,0x77,0x29,0x72,0xc2]
 105 ; X64-NEXT:    retq # encoding: [0xc3]
 106 entry:
 107   %0 = tail call <16 x bfloat> @llvm.x86.avx512bf16.cvtne2ps2bf16.256(<8 x float> %A, <8 x float> %B) #4
 108   %1 = bitcast <4 x i64> %C to <16 x bfloat>
 109   %2 = bitcast i16 %U to <16 x i1>
 110   %3 = select <16 x i1> %2, <16 x bfloat> %0, <16 x bfloat> %1
 111   %4 = bitcast <16 x bfloat> %3 to <4 x i64>
 112   ret <4 x i64> %4
 113 }
 114
 115 declare <8 x bfloat> @llvm.x86.avx512bf16.cvtneps2bf16.256(<8 x float>) #3
 116
 ; Unmasked single-source conversion. llvm.x86.avx512bf16.cvtneps2bf16.256 takes
 ; one <8 x float> and yields <8 x bfloat>, returned as <2 x i64>. The shared
 ; checks expect vcvtneps2bf16 from %ymm0 into %xmm0, then a vzeroupper before
 ; the return.
 117 define <2 x i64> @test_mm256_cvtneps2bf16_256(<8 x float> %A) local_unnamed_addr #2 {
 118 ; CHECK-LABEL: test_mm256_cvtneps2bf16_256:
 119 ; CHECK:       # %bb.0: # %entry
 120 ; CHECK-NEXT:    vcvtneps2bf16 %ymm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x28,0x72,0xc0]
 121 ; CHECK-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
 122 ; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
 123 entry:
 124   %0 = tail call <8 x bfloat> @llvm.x86.avx512bf16.cvtneps2bf16.256(<8 x float> %A) #4
 125   %1 = bitcast <8 x bfloat> %0 to <2 x i64>
 126   ret <2 x i64> %1
 127 }
 128
 ; Zero-masking form of the 256-bit single-source conversion. The result is
 ; selected against zeroinitializer under the i8 mask %U (bitcast to <8 x i1>).
 ; The checks expect the select folded into a {%k1} {z} vcvtneps2bf16, followed
 ; by vzeroupper.
 129 define <2 x i64> @test_mm256_maskz_cvtneps2bf16_256(<8 x float> %A, i8 zeroext %U) local_unnamed_addr #2 {
 130 ; X86-LABEL: test_mm256_maskz_cvtneps2bf16_256:
 131 ; X86:       # %bb.0: # %entry
 132 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 133 ; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
 134 ; X86-NEXT:    vcvtneps2bf16 %ymm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x72,0xc0]
 135 ; X86-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
 136 ; X86-NEXT:    retl # encoding: [0xc3]
 137 ;
 138 ; X64-LABEL: test_mm256_maskz_cvtneps2bf16_256:
 139 ; X64:       # %bb.0: # %entry
 140 ; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
 141 ; X64-NEXT:    vcvtneps2bf16 %ymm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x72,0xc0]
 142 ; X64-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
 143 ; X64-NEXT:    retq # encoding: [0xc3]
 144 entry:
 145   %0 = tail call <8 x bfloat> @llvm.x86.avx512bf16.cvtneps2bf16.256(<8 x float> %A) #4
 146   %1 = bitcast i8 %U to <8 x i1>
 147   %2 = select <8 x i1> %1, <8 x bfloat> %0, <8 x bfloat> zeroinitializer
 148   %3 = bitcast <8 x bfloat> %2 to <2 x i64>
 149   ret <2 x i64> %3
 150 }
 151
 ; Merge-masking form of the 256-bit single-source conversion. The result is
 ; selected against %C (bitcast to <8 x bfloat>) under the i8 mask %U. The checks
 ; expect a {%k1} merge-masked vcvtneps2bf16 from %ymm1 into %xmm0, followed by
 ; vzeroupper.
 152 define <2 x i64> @test_mm256_mask_cvtneps2bf16_256(<2 x i64> %C, i8 zeroext %U, <8 x float> %A) local_unnamed_addr #2 {
 153 ; X86-LABEL: test_mm256_mask_cvtneps2bf16_256:
 154 ; X86:       # %bb.0: # %entry
 155 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 156 ; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
 157 ; X86-NEXT:    vcvtneps2bf16 %ymm1, %xmm0 {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x72,0xc1]
 158 ; X86-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
 159 ; X86-NEXT:    retl # encoding: [0xc3]
 160 ;
 161 ; X64-LABEL: test_mm256_mask_cvtneps2bf16_256:
 162 ; X64:       # %bb.0: # %entry
 163 ; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
 164 ; X64-NEXT:    vcvtneps2bf16 %ymm1, %xmm0 {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x72,0xc1]
 165 ; X64-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
 166 ; X64-NEXT:    retq # encoding: [0xc3]
 167 entry:
 168   %0 = tail call <8 x bfloat> @llvm.x86.avx512bf16.cvtneps2bf16.256(<8 x float> %A) #4
 169   %1 = bitcast <2 x i64> %C to <8 x bfloat>
 170   %2 = bitcast i8 %U to <8 x i1>
 171   %3 = select <8 x i1> %2, <8 x bfloat> %0, <8 x bfloat> %1
 172   %4 = bitcast <8 x bfloat> %3 to <2 x i64>
 173   ret <2 x i64> %4
 174 }
 175
 176 declare <8 x bfloat> @llvm.x86.avx512bf16.mask.cvtneps2bf16.128(<4 x float>, <8 x bfloat>, <4 x i1>) #3
 177
 ; Unmasked use of the 128-bit conversion. Unlike the 256-bit intrinsic, the
 ; 128-bit one takes a passthrough vector and a <4 x i1> mask as operands; here
 ; the passthrough is undef and the mask is all-true. The shared checks expect a
 ; plain vcvtneps2bf16 on xmm registers with no mask operand.
 178 define <2 x i64> @test_mm128_cvtneps2bf16_128(<4 x float> %A) local_unnamed_addr #2 {
 179 ; CHECK-LABEL: test_mm128_cvtneps2bf16_128:
 180 ; CHECK:       # %bb.0: # %entry
 181 ; CHECK-NEXT:    vcvtneps2bf16 %xmm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x08,0x72,0xc0]
 182 ; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
 183 entry:
 184   %0 = tail call <8 x bfloat> @llvm.x86.avx512bf16.mask.cvtneps2bf16.128(<4 x float> %A, <8 x bfloat> undef, <4 x i1> <i1 true, i1 true, i1 true, i1 true>) #4
 185   %1 = bitcast <8 x bfloat> %0 to <2 x i64>
 186   ret <2 x i64> %1
 187 }
 188
 ; Zero-masking use of the 128-bit conversion. The i8 mask %U is bitcast to
 ; <8 x i1> and its first four elements are extracted with a shufflevector to
 ; form the intrinsic's <4 x i1> mask; the passthrough is zeroinitializer. The
 ; checks expect a {%k1} {z} vcvtneps2bf16 on xmm registers.
 189 define <2 x i64> @test_mm128_maskz_cvtneps2bf16_128(<4 x float> %A, i8 zeroext %U) local_unnamed_addr #2 {
 190 ; X86-LABEL: test_mm128_maskz_cvtneps2bf16_128:
 191 ; X86:       # %bb.0: # %entry
 192 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 193 ; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
 194 ; X86-NEXT:    vcvtneps2bf16 %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0x89,0x72,0xc0]
 195 ; X86-NEXT:    retl # encoding: [0xc3]
 196 ;
 197 ; X64-LABEL: test_mm128_maskz_cvtneps2bf16_128:
 198 ; X64:       # %bb.0: # %entry
 199 ; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
 200 ; X64-NEXT:    vcvtneps2bf16 %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0x89,0x72,0xc0]
 201 ; X64-NEXT:    retq # encoding: [0xc3]
 202 entry:
 203   %0 = bitcast i8 %U to <8 x i1>
 204   %1 = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 205   %2 = tail call <8 x bfloat> @llvm.x86.avx512bf16.mask.cvtneps2bf16.128(<4 x float> %A, <8 x bfloat> zeroinitializer, <4 x i1> %1) #4
 206   %3 = bitcast <8 x bfloat> %2 to <2 x i64>
 207   ret <2 x i64> %3
 208 }
 209
 ; Merge-masking use of the 128-bit conversion. The first four elements of the
 ; i8 mask %U form the intrinsic's <4 x i1> mask, and %C (bitcast to
 ; <8 x bfloat>) is passed as the passthrough operand. The checks expect a {%k1}
 ; merge-masked vcvtneps2bf16 from %xmm1 into %xmm0.
 210 define <2 x i64> @test_mm128_mask_cvtneps2bf16_128(<2 x i64> %C, i8 zeroext %U, <4 x float> %A) local_unnamed_addr #2 {
 211 ; X86-LABEL: test_mm128_mask_cvtneps2bf16_128:
 212 ; X86:       # %bb.0: # %entry
 213 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 214 ; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
 215 ; X86-NEXT:    vcvtneps2bf16 %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x72,0xc1]
 216 ; X86-NEXT:    retl # encoding: [0xc3]
 217 ;
 218 ; X64-LABEL: test_mm128_mask_cvtneps2bf16_128:
 219 ; X64:       # %bb.0: # %entry
 220 ; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
 221 ; X64-NEXT:    vcvtneps2bf16 %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x72,0xc1]
 222 ; X64-NEXT:    retq # encoding: [0xc3]
 223 entry:
 224   %0 = bitcast i8 %U to <8 x i1>
 225   %1 = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 226   %2 = bitcast <2 x i64> %C to <8 x bfloat>
 227   %3 = tail call <8 x bfloat> @llvm.x86.avx512bf16.mask.cvtneps2bf16.128(<4 x float> %A, <8 x bfloat> %2, <4 x i1> %1) #4
 228   %4 = bitcast <8 x bfloat> %3 to <2 x i64>
 229   ret <2 x i64> %4
 230 }
 231
  232 ; Make sure we don't fold a select into the 128 bit form of cvtneps2bf16. It
  233 ; always writes zeros to bits 127:64 regardless of mask.
  ; Here the intrinsic is called with an all-true mask and the 8-element select
  ; against %C is done separately in IR. The checks therefore expect an unmasked
  ; vcvtneps2bf16 followed by a distinct {%k1}-masked vmovdqu16 into %xmm0.
  234 define <2 x i64> @test_mm128_cvtneps2bf16_128_select(<2 x i64> %C, i8 zeroext %U, <4 x float> %A) local_unnamed_addr #2 {
  235 ; X86-LABEL: test_mm128_cvtneps2bf16_128_select:
  236 ; X86:       # %bb.0: # %entry
  237 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
  238 ; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
  239 ; X86-NEXT:    vcvtneps2bf16 %xmm1, %xmm1 # encoding: [0x62,0xf2,0x7e,0x08,0x72,0xc9]
  240 ; X86-NEXT:    vmovdqu16 %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf1,0xff,0x09,0x6f,0xc1]
  241 ; X86-NEXT:    retl # encoding: [0xc3]
  242 ;
  243 ; X64-LABEL: test_mm128_cvtneps2bf16_128_select:
  244 ; X64:       # %bb.0: # %entry
  245 ; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
  246 ; X64-NEXT:    vcvtneps2bf16 %xmm1, %xmm1 # encoding: [0x62,0xf2,0x7e,0x08,0x72,0xc9]
  247 ; X64-NEXT:    vmovdqu16 %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf1,0xff,0x09,0x6f,0xc1]
  248 ; X64-NEXT:    retq # encoding: [0xc3]
  249 entry:
  250   %0 = bitcast i8 %U to <8 x i1>
  251   %1 = bitcast <2 x i64> %C to <8 x bfloat>
  252   %2 = tail call <8 x bfloat> @llvm.x86.avx512bf16.mask.cvtneps2bf16.128(<4 x float> %A, <8 x bfloat> undef, <4 x i1> <i1 true, i1 true, i1 true, i1 true>) #4
  253   %3 = select <8 x i1> %0, <8 x bfloat> %2, <8 x bfloat> %1
  254   %4 = bitcast <8 x bfloat> %3 to <2 x i64>
  255   ret <2 x i64> %4
  256 }
 257
 258 declare <8 x float> @llvm.x86.avx512bf16.dpbf16ps.256(<8 x float>, <16 x bfloat>, <16 x bfloat>) #3
 259
 ; Unmasked 256-bit case. Calls llvm.x86.avx512bf16.dpbf16ps.256 with an
 ; <8 x float> operand %E and two <16 x bfloat> operands, returning <8 x float>.
 ; The shared checks expect a single vdpbf16ps on ymm registers writing %ymm0.
 260 define <8 x float> @test_mm256_dpbf16ps_256(<8 x float> %E, <16 x bfloat> %A, <16 x bfloat> %B) local_unnamed_addr #2 {
 261 ; CHECK-LABEL: test_mm256_dpbf16ps_256:
 262 ; CHECK:       # %bb.0: # %entry
 263 ; CHECK-NEXT:    vdpbf16ps %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0x76,0x28,0x52,0xc2]
 264 ; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
 265 entry:
 266   %0 = tail call <8 x float> @llvm.x86.avx512bf16.dpbf16ps.256(<8 x float> %E, <16 x bfloat> %A, <16 x bfloat> %B) #4
 267   ret <8 x float> %0
 268 }
 269
 ; Zero-masking 256-bit dpbf16ps. The <8 x float> result is selected against
 ; zeroinitializer under the i8 mask %U (bitcast to <8 x i1>). The checks expect
 ; the select folded into a {%k1} {z} vdpbf16ps on ymm registers.
 270 define <8 x float> @test_mm256_maskz_dpbf16ps_256(<8 x float> %E, <16 x bfloat> %A, <16 x bfloat> %B, i8 zeroext %U) local_unnamed_addr #2 {
 271 ; X86-LABEL: test_mm256_maskz_dpbf16ps_256:
 272 ; X86:       # %bb.0: # %entry
 273 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 274 ; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
 275 ; X86-NEXT:    vdpbf16ps %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x76,0xa9,0x52,0xc2]
 276 ; X86-NEXT:    retl # encoding: [0xc3]
 277 ;
 278 ; X64-LABEL: test_mm256_maskz_dpbf16ps_256:
 279 ; X64:       # %bb.0: # %entry
 280 ; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
 281 ; X64-NEXT:    vdpbf16ps %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x76,0xa9,0x52,0xc2]
 282 ; X64-NEXT:    retq # encoding: [0xc3]
 283 entry:
 284   %0 = tail call <8 x float> @llvm.x86.avx512bf16.dpbf16ps.256(<8 x float> %E, <16 x bfloat> %A, <16 x bfloat> %B) #4
 285   %1 = bitcast i8 %U to <8 x i1>
 286   %2 = select <8 x i1> %1, <8 x float> %0, <8 x float> zeroinitializer
 287   ret <8 x float> %2
 288 }
 ; Merge-masking 256-bit dpbf16ps. The result is selected against the intrinsic's
 ; own first operand %E under the i8 mask %U. The checks expect a {%k1}
 ; merge-masked vdpbf16ps writing %ymm0, with no separate blend.
 289 define <8 x float> @test_mm256_mask_dpbf16ps_256(i8 zeroext %U, <8 x float> %E, <16 x bfloat> %A, <16 x bfloat> %B) local_unnamed_addr #2 {
 290 ; X86-LABEL: test_mm256_mask_dpbf16ps_256:
 291 ; X86:       # %bb.0: # %entry
 292 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 293 ; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
 294 ; X86-NEXT:    vdpbf16ps %ymm2, %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf2,0x76,0x29,0x52,0xc2]
 295 ; X86-NEXT:    retl # encoding: [0xc3]
 296 ;
 297 ; X64-LABEL: test_mm256_mask_dpbf16ps_256:
 298 ; X64:       # %bb.0: # %entry
 299 ; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
 300 ; X64-NEXT:    vdpbf16ps %ymm2, %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf2,0x76,0x29,0x52,0xc2]
 301 ; X64-NEXT:    retq # encoding: [0xc3]
 302 entry:
 303   %0 = tail call <8 x float> @llvm.x86.avx512bf16.dpbf16ps.256(<8 x float> %E, <16 x bfloat> %A, <16 x bfloat> %B) #4
 304   %1 = bitcast i8 %U to <8 x i1>
 305   %2 = select <8 x i1> %1, <8 x float> %0, <8 x float> %E
 306   ret <8 x float> %2
 307 }
 308
 309 declare <4 x float> @llvm.x86.avx512bf16.dpbf16ps.128(<4 x float>, <8 x bfloat>, <8 x bfloat>) #3
 310
 ; Unmasked 128-bit case. Calls llvm.x86.avx512bf16.dpbf16ps.128 with a
 ; <4 x float> operand %E and two <8 x bfloat> operands, returning <4 x float>.
 ; The shared checks expect a single vdpbf16ps on xmm registers writing %xmm0.
 311 define <4 x float> @test_mm128_dpbf16ps_128(<4 x float> %E, <8 x bfloat> %A, <8 x bfloat> %B) local_unnamed_addr #2 {
 312 ; CHECK-LABEL: test_mm128_dpbf16ps_128:
 313 ; CHECK:       # %bb.0: # %entry
 314 ; CHECK-NEXT:    vdpbf16ps %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0x76,0x08,0x52,0xc2]
 315 ; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
 316 entry:
 317   %0 = tail call <4 x float> @llvm.x86.avx512bf16.dpbf16ps.128(<4 x float> %E, <8 x bfloat> %A, <8 x bfloat> %B) #4
 318   ret <4 x float> %0
 319 }
 320
 ; Zero-masking 128-bit dpbf16ps. The mask is passed as an i4 and bitcast to
 ; <4 x i1>; the result is selected against zeroinitializer. The checks expect a
 ; {%k1} {z} vdpbf16ps on xmm registers, with the mask loaded via movzbl from
 ; the stack on i686 and moved from %edi on x86_64.
 321 define <4 x float> @test_mm128_maskz_dpbf16ps_128(<4 x float> %E, <8 x bfloat> %A, <8 x bfloat> %B, i4 zeroext %U) local_unnamed_addr #2 {
 322 ; X86-LABEL: test_mm128_maskz_dpbf16ps_128:
 323 ; X86:       # %bb.0: # %entry
 324 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 325 ; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
 326 ; X86-NEXT:    vdpbf16ps %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x76,0x89,0x52,0xc2]
 327 ; X86-NEXT:    retl # encoding: [0xc3]
 328 ;
 329 ; X64-LABEL: test_mm128_maskz_dpbf16ps_128:
 330 ; X64:       # %bb.0: # %entry
 331 ; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
 332 ; X64-NEXT:    vdpbf16ps %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x76,0x89,0x52,0xc2]
 333 ; X64-NEXT:    retq # encoding: [0xc3]
 334 entry:
 335   %0 = tail call <4 x float> @llvm.x86.avx512bf16.dpbf16ps.128(<4 x float> %E, <8 x bfloat> %A, <8 x bfloat> %B) #4
 336   %1 = bitcast i4 %U to <4 x i1>
 337   %2 = select <4 x i1> %1, <4 x float> %0, <4 x float> zeroinitializer
 338   ret <4 x float> %2
 339 }
 ; Merge-masking 128-bit dpbf16ps. The result is selected against the intrinsic's
 ; own first operand %E under the i4 mask %U (bitcast to <4 x i1>). The checks
 ; expect a {%k1} merge-masked vdpbf16ps writing %xmm0, with no separate blend.
 340 define <4 x float> @test_mm128_mask_dpbf16ps_128(i4 zeroext %U, <4 x float> %E, <8 x bfloat> %A, <8 x bfloat> %B) local_unnamed_addr #2 {
 341 ; X86-LABEL: test_mm128_mask_dpbf16ps_128:
 342 ; X86:       # %bb.0: # %entry
 343 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 344 ; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
 345 ; X86-NEXT:    vdpbf16ps %xmm2, %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf2,0x76,0x09,0x52,0xc2]
 346 ; X86-NEXT:    retl # encoding: [0xc3]
 347 ;
 348 ; X64-LABEL: test_mm128_mask_dpbf16ps_128:
 349 ; X64:       # %bb.0: # %entry
 350 ; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
 351 ; X64-NEXT:    vdpbf16ps %xmm2, %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf2,0x76,0x09,0x52,0xc2]
 352 ; X64-NEXT:    retq # encoding: [0xc3]
 353 entry:
 354   %0 = tail call <4 x float> @llvm.x86.avx512bf16.dpbf16ps.128(<4 x float> %E, <8 x bfloat> %A, <8 x bfloat> %B) #4
 355   %1 = bitcast i4 %U to <4 x i1>
 356   %2 = select <4 x i1> %1, <4 x float> %0, <4 x float> %E
 357   ret <4 x float> %2
 358 }
 359
 ; Splat of a conversion result, integer flavour. The 128-bit conversion is
 ; called on a poison input with an all-true mask; the result is bitcast to
 ; <8 x i16> and element 0 is broadcast to all 16 lanes of a <16 x i16>. The
 ; shared checks expect the vcvtneps2bf16 to remain as its own instruction,
 ; followed by a register-to-register vpbroadcastw into %ymm0.
 ; NOTE(review): the name suggests this guards against an unwanted broadcast
 ; selection or fold; confirm the exact intent against the originating change.
 360 define <16 x i16> @test_no_vbroadcast1() {
 361 ; CHECK-LABEL: test_no_vbroadcast1:
 362 ; CHECK:       # %bb.0: # %entry
 363 ; CHECK-NEXT:    vcvtneps2bf16 %xmm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x08,0x72,0xc0]
 364 ; CHECK-NEXT:    vpbroadcastw %xmm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x79,0xc0]
 365 ; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
 366 entry:
 367   %0 = tail call <8 x bfloat> @llvm.x86.avx512bf16.mask.cvtneps2bf16.128(<4 x float> poison, <8 x bfloat> zeroinitializer, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
 368   %1 = bitcast <8 x bfloat> %0 to <8 x i16>
 369   %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <16 x i32> zeroinitializer
 370   ret <16 x i16> %2
 371 }
 372
 ; Splat of a conversion result, bfloat flavour. Same shape as
 ; test_no_vbroadcast1, but the broadcast of element 0 is done directly on the
 ; <8 x bfloat> value with no intermediate bitcast to i16, and the function is
 ; marked nounwind. The shared checks expect the same two-instruction sequence
 ; of vcvtneps2bf16 followed by vpbroadcastw into %ymm0.
 ; NOTE(review): the name suggests this guards against an unwanted broadcast
 ; selection or fold; confirm the exact intent against the originating change.
 373 define <16 x bfloat> @test_no_vbroadcast2() nounwind {
 374 ; CHECK-LABEL: test_no_vbroadcast2:
 375 ; CHECK:       # %bb.0: # %entry
 376 ; CHECK-NEXT:    vcvtneps2bf16 %xmm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x08,0x72,0xc0]
 377 ; CHECK-NEXT:    vpbroadcastw %xmm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x79,0xc0]
 378 ; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
 379 entry:
 380   %0 = tail call <8 x bfloat> @llvm.x86.avx512bf16.mask.cvtneps2bf16.128(<4 x float> poison, <8 x bfloat> zeroinitializer, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
 381   %1 = shufflevector <8 x bfloat> %0, <8 x bfloat> undef, <16 x i32> zeroinitializer
 382   ret <16 x bfloat> %1
 383 }
 384
 ; Conversion of a constant vector followed by a 128-bit subvector repeat. The
 ; 256-bit conversion is called on the constants 1.0 through 8.0, the result is
 ; bitcast to <4 x i32>, and those four elements are repeated four times to
 ; fill a <16 x i32>. The checks expect vcvtneps2bf16y to read its source from
 ; a constant-pool symbol (absolute on i686, %rip-relative on x86_64), followed
 ; by a vshufi64x2 with immediate 0 on %zmm0. The function has no named entry
 ; block, so the block header checks carry no "# %entry" suffix.
 ; NOTE(review): presumably a regression test for the issue numbered 83358;
 ; confirm against the bug tracker.
 385 define <16 x i32> @pr83358() {
 386 ; X86-LABEL: pr83358:
 387 ; X86:       # %bb.0:
 388 ; X86-NEXT:    vcvtneps2bf16y {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # encoding: [0x62,0xf2,0x7e,0x28,0x72,0x05,A,A,A,A]
 389 ; X86-NEXT:    # fixup A - offset: 6, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4
 390 ; X86-NEXT:    vshufi64x2 $0, %zmm0, %zmm0, %zmm0 # encoding: [0x62,0xf3,0xfd,0x48,0x43,0xc0,0x00]
 391 ; X86-NEXT:    # zmm0 = zmm0[0,1,0,1,0,1,0,1]
 392 ; X86-NEXT:    retl # encoding: [0xc3]
 393 ;
 394 ; X64-LABEL: pr83358:
 395 ; X64:       # %bb.0:
 396 ; X64-NEXT:    vcvtneps2bf16y {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # encoding: [0x62,0xf2,0x7e,0x28,0x72,0x05,A,A,A,A]
 397 ; X64-NEXT:    # fixup A - offset: 6, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte
 398 ; X64-NEXT:    vshufi64x2 $0, %zmm0, %zmm0, %zmm0 # encoding: [0x62,0xf3,0xfd,0x48,0x43,0xc0,0x00]
 399 ; X64-NEXT:    # zmm0 = zmm0[0,1,0,1,0,1,0,1]
 400 ; X64-NEXT:    retq # encoding: [0xc3]
 401   %1 = call <8 x bfloat> @llvm.x86.avx512bf16.cvtneps2bf16.256(<8 x float> <float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 4.000000e+00, float 5.000000e+00, float 6.000000e+00, float 7.000000e+00, float 8.000000e+00>)
 402   %2 = bitcast <8 x bfloat> %1 to <4 x i32>
 403   %3 = shufflevector <4 x i32> %2, <4 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
 404   ret <16 x i32> %3
 405 }