1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -verify-machineinstrs -mtriple=x86_64-unknown-unknown --show-mc-encoding -mattr=+avx10.2-256 | FileCheck %s --check-prefixes=CHECK,X64
3 ; RUN: llc < %s -verify-machineinstrs -mtriple=i686-unknown-unknown --show-mc-encoding -mattr=+avx10.2-256 | FileCheck %s --check-prefixes=CHECK,X86
5 declare <16 x bfloat> @llvm.x86.avx10.vminpbf16256(<16 x bfloat>, <16 x bfloat>)
define <16 x bfloat> @test_int_x86_avx10_min_nepbf16_256(<16 x bfloat> %x1, <16 x bfloat> %x2) {
; CHECK-LABEL: test_int_x86_avx10_min_nepbf16_256:
; CHECK: # %bb.0:
; CHECK-NEXT: vminpbf16 %ymm1, %ymm0, %ymm0 # encoding: [0x62,0xf5,0x7d,0x28,0x5d,0xc1]
; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
  %res0 = call <16 x bfloat> @llvm.x86.avx10.vminpbf16256(<16 x bfloat> %x1, <16 x bfloat> %x2)
  ret <16 x bfloat> %res0
}
define <16 x bfloat> @test_int_x86_avx10_maskz_min_nepbf16_256(<16 x bfloat> %x1, <16 x bfloat> %x2, i16 %msk) {
; X64-LABEL: test_int_x86_avx10_maskz_min_nepbf16_256:
; X64: # %bb.0:
; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
; X64-NEXT: vminpbf16 %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf5,0x7d,0xa9,0x5d,0xc1]
; X64-NEXT: retq # encoding: [0xc3]
;
; X86-LABEL: test_int_x86_avx10_maskz_min_nepbf16_256:
; X86: # %bb.0:
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
; X86-NEXT: vminpbf16 %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf5,0x7d,0xa9,0x5d,0xc1]
; X86-NEXT: retl # encoding: [0xc3]
  %mask = bitcast i16 %msk to <16 x i1>
  %res0 = call <16 x bfloat> @llvm.x86.avx10.vminpbf16256(<16 x bfloat> %x1, <16 x bfloat> %x2)
  %res1 = select <16 x i1> %mask, <16 x bfloat> %res0, <16 x bfloat> zeroinitializer
  ret <16 x bfloat> %res1
}
34 declare <8 x bfloat> @llvm.x86.avx10.vminpbf16128(<8 x bfloat>, <8 x bfloat>)
define <8 x bfloat> @test_int_x86_avx10_min_nepbf16_128(<8 x bfloat> %x1, <8 x bfloat> %x2) {
; CHECK-LABEL: test_int_x86_avx10_min_nepbf16_128:
; CHECK: # %bb.0:
; CHECK-NEXT: vminpbf16 %xmm1, %xmm0, %xmm0 # encoding: [0x62,0xf5,0x7d,0x08,0x5d,0xc1]
; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
  %res0 = call <8 x bfloat> @llvm.x86.avx10.vminpbf16128(<8 x bfloat> %x1, <8 x bfloat> %x2)
  ret <8 x bfloat> %res0
}
define <8 x bfloat> @test_int_x86_avx10_maskz_min_nepbf16_128(<8 x bfloat> %x1, <8 x bfloat> %x2, i8 %msk) {
; X64-LABEL: test_int_x86_avx10_maskz_min_nepbf16_128:
; X64: # %bb.0:
; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
; X64-NEXT: vminpbf16 %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf5,0x7d,0x89,0x5d,0xc1]
; X64-NEXT: retq # encoding: [0xc3]
;
; X86-LABEL: test_int_x86_avx10_maskz_min_nepbf16_128:
; X86: # %bb.0:
; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04]
; X86-NEXT: vminpbf16 %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf5,0x7d,0x89,0x5d,0xc1]
; X86-NEXT: retl # encoding: [0xc3]
  %mask = bitcast i8 %msk to <8 x i1>
  %res0 = call <8 x bfloat> @llvm.x86.avx10.vminpbf16128(<8 x bfloat> %x1, <8 x bfloat> %x2)
  %res1 = select <8 x i1> %mask, <8 x bfloat> %res0, <8 x bfloat> zeroinitializer
  ret <8 x bfloat> %res1
}
63 declare <16 x bfloat> @llvm.x86.avx10.vmaxpbf16256(<16 x bfloat>, <16 x bfloat>)
define <16 x bfloat> @test_int_x86_avx10_max_nepbf16_256(<16 x bfloat> %x1, <16 x bfloat> %x2) {
; CHECK-LABEL: test_int_x86_avx10_max_nepbf16_256:
; CHECK: # %bb.0:
; CHECK-NEXT: vmaxpbf16 %ymm1, %ymm0, %ymm0 # encoding: [0x62,0xf5,0x7d,0x28,0x5f,0xc1]
; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
  %res0 = call <16 x bfloat> @llvm.x86.avx10.vmaxpbf16256(<16 x bfloat> %x1, <16 x bfloat> %x2)
  ret <16 x bfloat> %res0
}
define <16 x bfloat> @test_int_x86_avx10_maskz_max_nepbf16_256(<16 x bfloat> %x1, <16 x bfloat> %x2, i16 %msk) {
; X64-LABEL: test_int_x86_avx10_maskz_max_nepbf16_256:
; X64: # %bb.0:
; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
; X64-NEXT: vmaxpbf16 %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf5,0x7d,0xa9,0x5f,0xc1]
; X64-NEXT: retq # encoding: [0xc3]
;
; X86-LABEL: test_int_x86_avx10_maskz_max_nepbf16_256:
; X86: # %bb.0:
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
; X86-NEXT: vmaxpbf16 %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf5,0x7d,0xa9,0x5f,0xc1]
; X86-NEXT: retl # encoding: [0xc3]
  %mask = bitcast i16 %msk to <16 x i1>
  %res0 = call <16 x bfloat> @llvm.x86.avx10.vmaxpbf16256(<16 x bfloat> %x1, <16 x bfloat> %x2)
  %res1 = select <16 x i1> %mask, <16 x bfloat> %res0, <16 x bfloat> zeroinitializer
  ret <16 x bfloat> %res1
}
92 declare <8 x bfloat> @llvm.x86.avx10.vmaxpbf16128(<8 x bfloat>, <8 x bfloat>)
define <8 x bfloat> @test_int_x86_avx10_max_nepbf16_128(<8 x bfloat> %x1, <8 x bfloat> %x2) {
; CHECK-LABEL: test_int_x86_avx10_max_nepbf16_128:
; CHECK: # %bb.0:
; CHECK-NEXT: vmaxpbf16 %xmm1, %xmm0, %xmm0 # encoding: [0x62,0xf5,0x7d,0x08,0x5f,0xc1]
; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
  %res0 = call <8 x bfloat> @llvm.x86.avx10.vmaxpbf16128(<8 x bfloat> %x1, <8 x bfloat> %x2)
  ret <8 x bfloat> %res0
}
define <8 x bfloat> @test_int_x86_avx10_maskz_max_nepbf16_128(<8 x bfloat> %x1, <8 x bfloat> %x2, i8 %msk) {
; X64-LABEL: test_int_x86_avx10_maskz_max_nepbf16_128:
; X64: # %bb.0:
; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
; X64-NEXT: vmaxpbf16 %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf5,0x7d,0x89,0x5f,0xc1]
; X64-NEXT: retq # encoding: [0xc3]
;
; X86-LABEL: test_int_x86_avx10_maskz_max_nepbf16_128:
; X86: # %bb.0:
; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04]
; X86-NEXT: vmaxpbf16 %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf5,0x7d,0x89,0x5f,0xc1]
; X86-NEXT: retl # encoding: [0xc3]
  %mask = bitcast i8 %msk to <8 x i1>
  %res0 = call <8 x bfloat> @llvm.x86.avx10.vmaxpbf16128(<8 x bfloat> %x1, <8 x bfloat> %x2)
  %res1 = select <8 x i1> %mask, <8 x bfloat> %res0, <8 x bfloat> zeroinitializer
  ret <8 x bfloat> %res1
}
121 declare i32 @llvm.x86.avx10.vcomsbf16eq(<8 x bfloat>, <8 x bfloat>)
122 declare i32 @llvm.x86.avx10.vcomsbf16lt(<8 x bfloat>, <8 x bfloat>)
123 declare i32 @llvm.x86.avx10.vcomsbf16le(<8 x bfloat>, <8 x bfloat>)
124 declare i32 @llvm.x86.avx10.vcomsbf16gt(<8 x bfloat>, <8 x bfloat>)
125 declare i32 @llvm.x86.avx10.vcomsbf16ge(<8 x bfloat>, <8 x bfloat>)
126 declare i32 @llvm.x86.avx10.vcomsbf16neq(<8 x bfloat>, <8 x bfloat>)
define i32 @test_x86_avx10_com_nesbf16_eq(<8 x bfloat> %a0, <8 x bfloat> %a1) {
; CHECK-LABEL: test_x86_avx10_com_nesbf16_eq:
; CHECK: # %bb.0:
; CHECK-NEXT: vcomsbf16 %xmm1, %xmm0 # encoding: [0x62,0xf5,0x7d,0x08,0x2f,0xc1]
; CHECK-NEXT: setnp %al # encoding: [0x0f,0x9b,0xc0]
; CHECK-NEXT: sete %cl # encoding: [0x0f,0x94,0xc1]
; CHECK-NEXT: andb %al, %cl # encoding: [0x20,0xc1]
; CHECK-NEXT: movzbl %cl, %eax # encoding: [0x0f,0xb6,0xc1]
; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
  %res = call i32 @llvm.x86.avx10.vcomsbf16eq(<8 x bfloat> %a0, <8 x bfloat> %a1)
  ret i32 %res
}
define i32 @test_x86_avx10_com_nesbf16_lt(<8 x bfloat> %a0, <8 x bfloat> %a1) {
; CHECK-LABEL: test_x86_avx10_com_nesbf16_lt:
; CHECK: # %bb.0:
; CHECK-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0]
; CHECK-NEXT: vcomsbf16 %xmm0, %xmm1 # encoding: [0x62,0xf5,0x7d,0x08,0x2f,0xc8]
; CHECK-NEXT: seta %al # encoding: [0x0f,0x97,0xc0]
; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
  %res = call i32 @llvm.x86.avx10.vcomsbf16lt(<8 x bfloat> %a0, <8 x bfloat> %a1)
  ret i32 %res
}
define i32 @test_x86_avx10_com_nesbf16_le(<8 x bfloat> %a0, <8 x bfloat> %a1) {
; CHECK-LABEL: test_x86_avx10_com_nesbf16_le:
; CHECK: # %bb.0:
; CHECK-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0]
; CHECK-NEXT: vcomsbf16 %xmm0, %xmm1 # encoding: [0x62,0xf5,0x7d,0x08,0x2f,0xc8]
; CHECK-NEXT: setae %al # encoding: [0x0f,0x93,0xc0]
; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
  %res = call i32 @llvm.x86.avx10.vcomsbf16le(<8 x bfloat> %a0, <8 x bfloat> %a1)
  ret i32 %res
}
define i32 @test_x86_avx10_com_nesbf16_gt(<8 x bfloat> %a0, <8 x bfloat> %a1) {
; NOTE(review): this "gt"-named test invokes @llvm.x86.avx10.vcomsbf16ge, and the
; CHECK lines (setae, non-swapped operands) match ge semantics; the declared
; @llvm.x86.avx10.vcomsbf16gt intrinsic is never exercised in this file — confirm intended.
; CHECK-LABEL: test_x86_avx10_com_nesbf16_gt:
; CHECK: # %bb.0:
; CHECK-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0]
; CHECK-NEXT: vcomsbf16 %xmm1, %xmm0 # encoding: [0x62,0xf5,0x7d,0x08,0x2f,0xc1]
; CHECK-NEXT: setae %al # encoding: [0x0f,0x93,0xc0]
; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
  %res = call i32 @llvm.x86.avx10.vcomsbf16ge(<8 x bfloat> %a0, <8 x bfloat> %a1)
  ret i32 %res
}
define i32 @test_x86_avx10_com_nesbf16_neq(<8 x bfloat> %a0, <8 x bfloat> %a1) {
; CHECK-LABEL: test_x86_avx10_com_nesbf16_neq:
; CHECK: # %bb.0:
; CHECK-NEXT: vcomsbf16 %xmm1, %xmm0 # encoding: [0x62,0xf5,0x7d,0x08,0x2f,0xc1]
; CHECK-NEXT: setp %al # encoding: [0x0f,0x9a,0xc0]
; CHECK-NEXT: setne %cl # encoding: [0x0f,0x95,0xc1]
; CHECK-NEXT: orb %al, %cl # encoding: [0x08,0xc1]
; CHECK-NEXT: movzbl %cl, %eax # encoding: [0x0f,0xb6,0xc1]
; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
  %res = call i32 @llvm.x86.avx10.vcomsbf16neq(<8 x bfloat> %a0, <8 x bfloat> %a1)
  ret i32 %res
}
187 declare <8 x bfloat> @llvm.x86.avx10.mask.rsqrt.nepbf16.128(<8 x bfloat>, <8 x bfloat>, i8)
188 declare <16 x bfloat> @llvm.x86.avx10.mask.rsqrt.nepbf16.256(<16 x bfloat>, <16 x bfloat>, i16)
define <8 x bfloat> @test_rsqrt_nepbf16_128(<8 x bfloat> %a0) {
; CHECK-LABEL: test_rsqrt_nepbf16_128:
; CHECK: # %bb.0:
; CHECK-NEXT: vrsqrtpbf16 %xmm0, %xmm0 # encoding: [0x62,0xf6,0x7c,0x08,0x4e,0xc0]
; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
  %res = call <8 x bfloat> @llvm.x86.avx10.mask.rsqrt.nepbf16.128(<8 x bfloat> %a0, <8 x bfloat> zeroinitializer, i8 -1)
  ret <8 x bfloat> %res
}
define <16 x bfloat> @test_rsqrt_nepbf16_256(<16 x bfloat> %a0) {
; CHECK-LABEL: test_rsqrt_nepbf16_256:
; CHECK: # %bb.0:
; CHECK-NEXT: vrsqrtpbf16 %ymm0, %ymm0 # encoding: [0x62,0xf6,0x7c,0x28,0x4e,0xc0]
; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
  %res = call <16 x bfloat> @llvm.x86.avx10.mask.rsqrt.nepbf16.256(<16 x bfloat> %a0, <16 x bfloat> zeroinitializer, i16 -1)
  ret <16 x bfloat> %res
}
208 declare <8 x bfloat> @llvm.x86.avx10.mask.rcp.nepbf16.128(<8 x bfloat>, <8 x bfloat>, i8)
209 declare <16 x bfloat> @llvm.x86.avx10.mask.rcp.nepbf16.256(<16 x bfloat>, <16 x bfloat>, i16)
define <8 x bfloat> @test_rcp_nepbf16_128(<8 x bfloat> %a0, <8 x bfloat> %a1, i8 %mask) {
; X64-LABEL: test_rcp_nepbf16_128:
; X64: # %bb.0:
; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
; X64-NEXT: vrcppbf16 %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf6,0x7c,0x09,0x4c,0xc8]
; X64-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1]
; X64-NEXT: retq # encoding: [0xc3]
;
; X86-LABEL: test_rcp_nepbf16_128:
; X86: # %bb.0:
; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04]
; X86-NEXT: vrcppbf16 %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf6,0x7c,0x09,0x4c,0xc8]
; X86-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1]
; X86-NEXT: retl # encoding: [0xc3]
  %res = call <8 x bfloat> @llvm.x86.avx10.mask.rcp.nepbf16.128(<8 x bfloat> %a0, <8 x bfloat> %a1, i8 %mask)
  ret <8 x bfloat> %res
}
define <16 x bfloat> @test_rcp_nepbf16_256(<16 x bfloat> %a0, <16 x bfloat> %a1, i16 %mask) {
; X64-LABEL: test_rcp_nepbf16_256:
; X64: # %bb.0:
; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
; X64-NEXT: vrcppbf16 %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf6,0x7c,0x29,0x4c,0xc8]
; X64-NEXT: vmovaps %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc1]
; X64-NEXT: retq # encoding: [0xc3]
;
; X86-LABEL: test_rcp_nepbf16_256:
; X86: # %bb.0:
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
; X86-NEXT: vrcppbf16 %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf6,0x7c,0x29,0x4c,0xc8]
; X86-NEXT: vmovaps %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc1]
; X86-NEXT: retl # encoding: [0xc3]
  %res = call <16 x bfloat> @llvm.x86.avx10.mask.rcp.nepbf16.256(<16 x bfloat> %a0, <16 x bfloat> %a1, i16 %mask)
  ret <16 x bfloat> %res
}
247 declare <8 x bfloat> @llvm.x86.avx10.mask.reduce.nepbf16.128(<8 x bfloat>, i32, <8 x bfloat>, i8)
248 declare <16 x bfloat> @llvm.x86.avx10.mask.reduce.nepbf16.256(<16 x bfloat>, i32, <16 x bfloat>, i16)
define <8 x bfloat>@test_int_x86_avx512_mask_reduce_nepbf16_128(<8 x bfloat> %x0, <8 x bfloat> %x2, i8 %x3) {
; X64-LABEL: test_int_x86_avx512_mask_reduce_nepbf16_128:
; X64: # %bb.0:
; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
; X64-NEXT: vreducenepbf16 $8, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf3,0x7f,0x09,0x56,0xc8,0x08]
; X64-NEXT: vreducenepbf16 $4, %xmm0, %xmm0 # encoding: [0x62,0xf3,0x7f,0x08,0x56,0xc0,0x04]
; X64-NEXT: vaddnepbf16 %xmm0, %xmm1, %xmm0 # encoding: [0x62,0xf5,0x75,0x08,0x58,0xc0]
; X64-NEXT: retq # encoding: [0xc3]
;
; X86-LABEL: test_int_x86_avx512_mask_reduce_nepbf16_128:
; X86: # %bb.0:
; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04]
; X86-NEXT: vreducenepbf16 $8, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf3,0x7f,0x09,0x56,0xc8,0x08]
; X86-NEXT: vreducenepbf16 $4, %xmm0, %xmm0 # encoding: [0x62,0xf3,0x7f,0x08,0x56,0xc0,0x04]
; X86-NEXT: vaddnepbf16 %xmm0, %xmm1, %xmm0 # encoding: [0x62,0xf5,0x75,0x08,0x58,0xc0]
; X86-NEXT: retl # encoding: [0xc3]
  %res = call <8 x bfloat> @llvm.x86.avx10.mask.reduce.nepbf16.128(<8 x bfloat> %x0, i32 8, <8 x bfloat> %x2, i8 %x3)
  %res1 = call <8 x bfloat> @llvm.x86.avx10.mask.reduce.nepbf16.128(<8 x bfloat> %x0, i32 4, <8 x bfloat> %x2, i8 -1)
  %res2 = fadd <8 x bfloat> %res, %res1
  ret <8 x bfloat> %res2
}
define <16 x bfloat>@test_int_x86_avx512_mask_reduce_nepbf16_256(<16 x bfloat> %x0, <16 x bfloat> %x2, i16 %x3) {
; X64-LABEL: test_int_x86_avx512_mask_reduce_nepbf16_256:
; X64: # %bb.0:
; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
; X64-NEXT: vreducenepbf16 $8, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf3,0x7f,0x29,0x56,0xc8,0x08]
; X64-NEXT: vreducenepbf16 $4, %ymm0, %ymm0 # encoding: [0x62,0xf3,0x7f,0x28,0x56,0xc0,0x04]
; X64-NEXT: vaddnepbf16 %ymm0, %ymm1, %ymm0 # encoding: [0x62,0xf5,0x75,0x28,0x58,0xc0]
; X64-NEXT: retq # encoding: [0xc3]
;
; X86-LABEL: test_int_x86_avx512_mask_reduce_nepbf16_256:
; X86: # %bb.0:
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
; X86-NEXT: vreducenepbf16 $8, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf3,0x7f,0x29,0x56,0xc8,0x08]
; X86-NEXT: vreducenepbf16 $4, %ymm0, %ymm0 # encoding: [0x62,0xf3,0x7f,0x28,0x56,0xc0,0x04]
; X86-NEXT: vaddnepbf16 %ymm0, %ymm1, %ymm0 # encoding: [0x62,0xf5,0x75,0x28,0x58,0xc0]
; X86-NEXT: retl # encoding: [0xc3]
  %res = call <16 x bfloat> @llvm.x86.avx10.mask.reduce.nepbf16.256(<16 x bfloat> %x0, i32 8, <16 x bfloat> %x2, i16 %x3)
  %res1 = call <16 x bfloat> @llvm.x86.avx10.mask.reduce.nepbf16.256(<16 x bfloat> %x0, i32 4, <16 x bfloat> %x2, i16 -1)
  %res2 = fadd <16 x bfloat> %res, %res1
  ret <16 x bfloat> %res2
}
294 declare <8 x i1> @llvm.x86.avx10.fpclass.nepbf16.128(<8 x bfloat>, i32)
295 declare <16 x i1> @llvm.x86.avx10.fpclass.nepbf16.256(<16 x bfloat>, i32)
define i8 @test_int_x86_avx512_fpclass_nepbf16_128(<8 x bfloat> %x0) {
; CHECK-LABEL: test_int_x86_avx512_fpclass_nepbf16_128:
; CHECK: # %bb.0:
; CHECK-NEXT: vfpclasspbf16 $2, %xmm0, %k1 # encoding: [0x62,0xf3,0x7f,0x08,0x66,0xc8,0x02]
; CHECK-NEXT: # k1 = isPositiveZero(xmm0)
; CHECK-NEXT: vfpclasspbf16 $4, %xmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7f,0x09,0x66,0xc0,0x04]
; CHECK-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
; CHECK-NEXT: # kill: def $al killed $al killed $eax
; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
  %res = call <8 x i1> @llvm.x86.avx10.fpclass.nepbf16.128(<8 x bfloat> %x0, i32 4)
  %res1 = call <8 x i1> @llvm.x86.avx10.fpclass.nepbf16.128(<8 x bfloat> %x0, i32 2)
  %1 = and <8 x i1> %res1, %res
  %2 = bitcast <8 x i1> %1 to i8
  ret i8 %2
}
define i16 @test_int_x86_avx512_fpclass_nepbf16_256(<16 x bfloat> %x0) {
; CHECK-LABEL: test_int_x86_avx512_fpclass_nepbf16_256:
; CHECK: # %bb.0:
; CHECK-NEXT: vfpclasspbf16 $2, %ymm0, %k1 # encoding: [0x62,0xf3,0x7f,0x28,0x66,0xc8,0x02]
; CHECK-NEXT: # k1 = isPositiveZero(ymm0)
; CHECK-NEXT: vfpclasspbf16 $4, %ymm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7f,0x29,0x66,0xc0,0x04]
; CHECK-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
; CHECK-NEXT: # kill: def $ax killed $ax killed $eax
; CHECK-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
  %res = call <16 x i1> @llvm.x86.avx10.fpclass.nepbf16.256(<16 x bfloat> %x0, i32 4)
  %res1 = call <16 x i1> @llvm.x86.avx10.fpclass.nepbf16.256(<16 x bfloat> %x0, i32 2)
  %1 = and <16 x i1> %res1, %res
  %2 = bitcast <16 x i1> %1 to i16
  ret i16 %2
}
330 declare <8 x bfloat> @llvm.x86.avx10.mask.getexp.nepbf16.128(<8 x bfloat>, <8 x bfloat>, i8)
331 declare <16 x bfloat> @llvm.x86.avx10.mask.getexp.nepbf16.256(<16 x bfloat>, <16 x bfloat>, i16)
define <8 x bfloat>@test_int_x86_avx512_getexp_nepbf16_128(<8 x bfloat> %x0) {
; CHECK-LABEL: test_int_x86_avx512_getexp_nepbf16_128:
; CHECK: # %bb.0:
; CHECK-NEXT: vgetexppbf16 %xmm0, %xmm0 # encoding: [0x62,0xf5,0x7d,0x08,0x42,0xc0]
; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
  %res = call <8 x bfloat> @llvm.x86.avx10.mask.getexp.nepbf16.128(<8 x bfloat> %x0, <8 x bfloat> zeroinitializer, i8 -1)
  ret <8 x bfloat> %res
}
define <8 x bfloat>@test_int_x86_avx512_mask_getexp_nepbf16_128(<8 x bfloat> %x0, <8 x bfloat> %x1, i8 %x2) {
; X64-LABEL: test_int_x86_avx512_mask_getexp_nepbf16_128:
; X64: # %bb.0:
; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
; X64-NEXT: vgetexppbf16 %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf5,0x7d,0x09,0x42,0xc8]
; X64-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1]
; X64-NEXT: retq # encoding: [0xc3]
;
; X86-LABEL: test_int_x86_avx512_mask_getexp_nepbf16_128:
; X86: # %bb.0:
; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04]
; X86-NEXT: vgetexppbf16 %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf5,0x7d,0x09,0x42,0xc8]
; X86-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1]
; X86-NEXT: retl # encoding: [0xc3]
  %res = call <8 x bfloat> @llvm.x86.avx10.mask.getexp.nepbf16.128(<8 x bfloat> %x0, <8 x bfloat> %x1, i8 %x2)
  ret <8 x bfloat> %res
}
define <8 x bfloat>@test_int_x86_avx512_maskz_getexp_nepbf16_128(<8 x bfloat> %x0, i8 %x2) {
; X64-LABEL: test_int_x86_avx512_maskz_getexp_nepbf16_128:
; X64: # %bb.0:
; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
; X64-NEXT: vgetexppbf16 %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf5,0x7d,0x89,0x42,0xc0]
; X64-NEXT: retq # encoding: [0xc3]
;
; X86-LABEL: test_int_x86_avx512_maskz_getexp_nepbf16_128:
; X86: # %bb.0:
; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04]
; X86-NEXT: vgetexppbf16 %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf5,0x7d,0x89,0x42,0xc0]
; X86-NEXT: retl # encoding: [0xc3]
  %res = call <8 x bfloat> @llvm.x86.avx10.mask.getexp.nepbf16.128(<8 x bfloat> %x0, <8 x bfloat> zeroinitializer, i8 %x2)
  ret <8 x bfloat> %res
}
define <16 x bfloat>@test_int_x86_avx512_getexp_nepbf16_256(<16 x bfloat> %x0) {
; CHECK-LABEL: test_int_x86_avx512_getexp_nepbf16_256:
; CHECK: # %bb.0:
; CHECK-NEXT: vgetexppbf16 %ymm0, %ymm0 # encoding: [0x62,0xf5,0x7d,0x28,0x42,0xc0]
; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
  %res = call <16 x bfloat> @llvm.x86.avx10.mask.getexp.nepbf16.256(<16 x bfloat> %x0, <16 x bfloat> zeroinitializer, i16 -1)
  ret <16 x bfloat> %res
}
define <16 x bfloat>@test_int_x86_avx512_mask_getexp_nepbf16_256(<16 x bfloat> %x0, <16 x bfloat> %x1, i16 %x2) {
; X64-LABEL: test_int_x86_avx512_mask_getexp_nepbf16_256:
; X64: # %bb.0:
; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
; X64-NEXT: vgetexppbf16 %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf5,0x7d,0x29,0x42,0xc8]
; X64-NEXT: vmovaps %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc1]
; X64-NEXT: retq # encoding: [0xc3]
;
; X86-LABEL: test_int_x86_avx512_mask_getexp_nepbf16_256:
; X86: # %bb.0:
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
; X86-NEXT: vgetexppbf16 %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf5,0x7d,0x29,0x42,0xc8]
; X86-NEXT: vmovaps %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc1]
; X86-NEXT: retl # encoding: [0xc3]
  %res = call <16 x bfloat> @llvm.x86.avx10.mask.getexp.nepbf16.256(<16 x bfloat> %x0, <16 x bfloat> %x1, i16 %x2)
  ret <16 x bfloat> %res
}
define <16 x bfloat>@test_int_x86_avx512_maskz_getexp_nepbf16_256(<16 x bfloat> %x0, i16 %x2) {
; X64-LABEL: test_int_x86_avx512_maskz_getexp_nepbf16_256:
; X64: # %bb.0:
; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
; X64-NEXT: vgetexppbf16 %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf5,0x7d,0xa9,0x42,0xc0]
; X64-NEXT: retq # encoding: [0xc3]
;
; X86-LABEL: test_int_x86_avx512_maskz_getexp_nepbf16_256:
; X86: # %bb.0:
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
; X86-NEXT: vgetexppbf16 %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf5,0x7d,0xa9,0x42,0xc0]
; X86-NEXT: retl # encoding: [0xc3]
  %res = call <16 x bfloat> @llvm.x86.avx10.mask.getexp.nepbf16.256(<16 x bfloat> %x0, <16 x bfloat> zeroinitializer, i16 %x2)
  ret <16 x bfloat> %res
}
419 declare <8 x bfloat> @llvm.x86.avx10.mask.getmant.nepbf16.128(<8 x bfloat>, i32, <8 x bfloat>, i8)
420 declare <16 x bfloat> @llvm.x86.avx10.mask.getmant.nepbf16.256(<16 x bfloat>, i32, <16 x bfloat>, i16)
define <8 x bfloat>@test_int_x86_avx512_mask_getmant_nepbf16_128(<8 x bfloat> %x0, <8 x bfloat> %x2, i8 %x3) {
; X64-LABEL: test_int_x86_avx512_mask_getmant_nepbf16_128:
; X64: # %bb.0:
; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
; X64-NEXT: vgetmantpbf16 $8, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf3,0x7f,0x09,0x26,0xc8,0x08]
; X64-NEXT: vgetmantpbf16 $4, %xmm0, %xmm0 # encoding: [0x62,0xf3,0x7f,0x08,0x26,0xc0,0x04]
; X64-NEXT: vaddnepbf16 %xmm0, %xmm1, %xmm0 # encoding: [0x62,0xf5,0x75,0x08,0x58,0xc0]
; X64-NEXT: retq # encoding: [0xc3]
;
; X86-LABEL: test_int_x86_avx512_mask_getmant_nepbf16_128:
; X86: # %bb.0:
; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04]
; X86-NEXT: vgetmantpbf16 $8, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf3,0x7f,0x09,0x26,0xc8,0x08]
; X86-NEXT: vgetmantpbf16 $4, %xmm0, %xmm0 # encoding: [0x62,0xf3,0x7f,0x08,0x26,0xc0,0x04]
; X86-NEXT: vaddnepbf16 %xmm0, %xmm1, %xmm0 # encoding: [0x62,0xf5,0x75,0x08,0x58,0xc0]
; X86-NEXT: retl # encoding: [0xc3]
  %res = call <8 x bfloat> @llvm.x86.avx10.mask.getmant.nepbf16.128(<8 x bfloat> %x0, i32 8, <8 x bfloat> %x2, i8 %x3)
  %res1 = call <8 x bfloat> @llvm.x86.avx10.mask.getmant.nepbf16.128(<8 x bfloat> %x0, i32 4, <8 x bfloat> %x2, i8 -1)
  %res2 = fadd <8 x bfloat> %res, %res1
  ret <8 x bfloat> %res2
}
define <16 x bfloat>@test_int_x86_avx512_mask_getmant_nepbf16_256(<16 x bfloat> %x0, <16 x bfloat> %x2, i16 %x3) {
; X64-LABEL: test_int_x86_avx512_mask_getmant_nepbf16_256:
; X64: # %bb.0:
; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
; X64-NEXT: vgetmantpbf16 $8, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf3,0x7f,0x29,0x26,0xc8,0x08]
; X64-NEXT: vgetmantpbf16 $4, %ymm0, %ymm0 # encoding: [0x62,0xf3,0x7f,0x28,0x26,0xc0,0x04]
; X64-NEXT: vaddnepbf16 %ymm0, %ymm1, %ymm0 # encoding: [0x62,0xf5,0x75,0x28,0x58,0xc0]
; X64-NEXT: retq # encoding: [0xc3]
;
; X86-LABEL: test_int_x86_avx512_mask_getmant_nepbf16_256:
; X86: # %bb.0:
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
; X86-NEXT: vgetmantpbf16 $8, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf3,0x7f,0x29,0x26,0xc8,0x08]
; X86-NEXT: vgetmantpbf16 $4, %ymm0, %ymm0 # encoding: [0x62,0xf3,0x7f,0x28,0x26,0xc0,0x04]
; X86-NEXT: vaddnepbf16 %ymm0, %ymm1, %ymm0 # encoding: [0x62,0xf5,0x75,0x28,0x58,0xc0]
; X86-NEXT: retl # encoding: [0xc3]
  %res = call <16 x bfloat> @llvm.x86.avx10.mask.getmant.nepbf16.256(<16 x bfloat> %x0, i32 8, <16 x bfloat> %x2, i16 %x3)
  %res1 = call <16 x bfloat> @llvm.x86.avx10.mask.getmant.nepbf16.256(<16 x bfloat> %x0, i32 4, <16 x bfloat> %x2, i16 -1)
  %res2 = fadd <16 x bfloat> %res, %res1
  ret <16 x bfloat> %res2
}
466 declare <8 x bfloat> @llvm.x86.avx10.mask.rndscale.nepbf16.128(<8 x bfloat>, i32, <8 x bfloat>, i8)
467 declare <16 x bfloat> @llvm.x86.avx10.mask.rndscale.nepbf16.256(<16 x bfloat>, i32, <16 x bfloat>, i16)
define <8 x bfloat>@test_int_x86_avx512_mask_rndscale_nepbf16_128(<8 x bfloat> %x0, <8 x bfloat> %x2, i8 %x3) {
; X64-LABEL: test_int_x86_avx512_mask_rndscale_nepbf16_128:
; X64: # %bb.0:
; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
; X64-NEXT: vrndscalenepbf16 $8, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf3,0x7f,0x09,0x08,0xc8,0x08]
; X64-NEXT: vrndscalenepbf16 $4, %xmm0, %xmm0 # encoding: [0x62,0xf3,0x7f,0x08,0x08,0xc0,0x04]
; X64-NEXT: vaddnepbf16 %xmm0, %xmm1, %xmm0 # encoding: [0x62,0xf5,0x75,0x08,0x58,0xc0]
; X64-NEXT: retq # encoding: [0xc3]
;
; X86-LABEL: test_int_x86_avx512_mask_rndscale_nepbf16_128:
; X86: # %bb.0:
; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04]
; X86-NEXT: vrndscalenepbf16 $8, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf3,0x7f,0x09,0x08,0xc8,0x08]
; X86-NEXT: vrndscalenepbf16 $4, %xmm0, %xmm0 # encoding: [0x62,0xf3,0x7f,0x08,0x08,0xc0,0x04]
; X86-NEXT: vaddnepbf16 %xmm0, %xmm1, %xmm0 # encoding: [0x62,0xf5,0x75,0x08,0x58,0xc0]
; X86-NEXT: retl # encoding: [0xc3]
  %res = call <8 x bfloat> @llvm.x86.avx10.mask.rndscale.nepbf16.128(<8 x bfloat> %x0, i32 8, <8 x bfloat> %x2, i8 %x3)
  %res1 = call <8 x bfloat> @llvm.x86.avx10.mask.rndscale.nepbf16.128(<8 x bfloat> %x0, i32 4, <8 x bfloat> %x2, i8 -1)
  %res2 = fadd <8 x bfloat> %res, %res1
  ret <8 x bfloat> %res2
}
define <16 x bfloat>@test_int_x86_avx512_mask_rndscale_nepbf16_256(<16 x bfloat> %x0, <16 x bfloat> %x2, i16 %x3) {
; X64-LABEL: test_int_x86_avx512_mask_rndscale_nepbf16_256:
; X64: # %bb.0:
; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
; X64-NEXT: vrndscalenepbf16 $8, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf3,0x7f,0x29,0x08,0xc8,0x08]
; X64-NEXT: vrndscalenepbf16 $4, %ymm0, %ymm0 # encoding: [0x62,0xf3,0x7f,0x28,0x08,0xc0,0x04]
; X64-NEXT: vaddnepbf16 %ymm0, %ymm1, %ymm0 # encoding: [0x62,0xf5,0x75,0x28,0x58,0xc0]
; X64-NEXT: retq # encoding: [0xc3]
;
; X86-LABEL: test_int_x86_avx512_mask_rndscale_nepbf16_256:
; X86: # %bb.0:
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
; X86-NEXT: vrndscalenepbf16 $8, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf3,0x7f,0x29,0x08,0xc8,0x08]
; X86-NEXT: vrndscalenepbf16 $4, %ymm0, %ymm0 # encoding: [0x62,0xf3,0x7f,0x28,0x08,0xc0,0x04]
; X86-NEXT: vaddnepbf16 %ymm0, %ymm1, %ymm0 # encoding: [0x62,0xf5,0x75,0x28,0x58,0xc0]
; X86-NEXT: retl # encoding: [0xc3]
  %res = call <16 x bfloat> @llvm.x86.avx10.mask.rndscale.nepbf16.256(<16 x bfloat> %x0, i32 8, <16 x bfloat> %x2, i16 %x3)
  %res1 = call <16 x bfloat> @llvm.x86.avx10.mask.rndscale.nepbf16.256(<16 x bfloat> %x0, i32 4, <16 x bfloat> %x2, i16 -1)
  %res2 = fadd <16 x bfloat> %res, %res1
  ret <16 x bfloat> %res2
}
513 declare <8 x bfloat> @llvm.x86.avx10.mask.scalef.nepbf16.128(<8 x bfloat>, <8 x bfloat>, <8 x bfloat>, i8)
514 declare <16 x bfloat> @llvm.x86.avx10.mask.scalef.nepbf16.256(<16 x bfloat>, <16 x bfloat>, <16 x bfloat>, i16)
define <8 x bfloat>@test_int_x86_avx512_scalef_nepbf16_128(<8 x bfloat> %x0, <8 x bfloat> %x1) {
; CHECK-LABEL: test_int_x86_avx512_scalef_nepbf16_128:
; CHECK: # %bb.0:
; CHECK-NEXT: vscalefpbf16 %xmm1, %xmm0, %xmm0 # encoding: [0x62,0xf6,0x7c,0x08,0x2c,0xc1]
; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
  %res = call <8 x bfloat> @llvm.x86.avx10.mask.scalef.nepbf16.128(<8 x bfloat> %x0, <8 x bfloat> %x1, <8 x bfloat> zeroinitializer, i8 -1)
  ret <8 x bfloat> %res
}
define <8 x bfloat>@test_int_x86_avx512_mask_scalef_nepbf16_128(<8 x bfloat> %x0, <8 x bfloat> %x1, <8 x bfloat> %x2, i8 %x3) {
; X64-LABEL: test_int_x86_avx512_mask_scalef_nepbf16_128:
; X64: # %bb.0:
; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
; X64-NEXT: vscalefpbf16 %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf6,0x7c,0x09,0x2c,0xd1]
; X64-NEXT: vmovaps %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc2]
; X64-NEXT: retq # encoding: [0xc3]
;
; X86-LABEL: test_int_x86_avx512_mask_scalef_nepbf16_128:
; X86: # %bb.0:
; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04]
; X86-NEXT: vscalefpbf16 %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf6,0x7c,0x09,0x2c,0xd1]
; X86-NEXT: vmovaps %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc2]
; X86-NEXT: retl # encoding: [0xc3]
  %mask = bitcast i8 %x3 to <8 x i1>
  %res = call <8 x bfloat> @llvm.x86.avx10.mask.scalef.nepbf16.128(<8 x bfloat> %x0, <8 x bfloat> %x1, <8 x bfloat> %x2, i8 %x3)
  ret <8 x bfloat> %res
}
define <8 x bfloat>@test_int_x86_avx512_maskz_scalef_nepbf16_128(<8 x bfloat> %x0, <8 x bfloat> %x1, i8 %x3) {
; X64-LABEL: test_int_x86_avx512_maskz_scalef_nepbf16_128:
; X64: # %bb.0:
; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
; X64-NEXT: vscalefpbf16 %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf6,0x7c,0x89,0x2c,0xc1]
; X64-NEXT: retq # encoding: [0xc3]
;
; X86-LABEL: test_int_x86_avx512_maskz_scalef_nepbf16_128:
; X86: # %bb.0:
; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04]
; X86-NEXT: vscalefpbf16 %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf6,0x7c,0x89,0x2c,0xc1]
; X86-NEXT: retl # encoding: [0xc3]
  %mask = bitcast i8 %x3 to <8 x i1>
  %res = call <8 x bfloat> @llvm.x86.avx10.mask.scalef.nepbf16.128(<8 x bfloat> %x0, <8 x bfloat> %x1, <8 x bfloat> zeroinitializer, i8 %x3)
  ret <8 x bfloat> %res
}
define <16 x bfloat>@test_int_x86_avx512_scalef_nepbf16_256(<16 x bfloat> %x0, <16 x bfloat> %x1) {
; CHECK-LABEL: test_int_x86_avx512_scalef_nepbf16_256:
; CHECK: # %bb.0:
; CHECK-NEXT: vscalefpbf16 %ymm1, %ymm0, %ymm0 # encoding: [0x62,0xf6,0x7c,0x28,0x2c,0xc1]
; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
  %res = call <16 x bfloat> @llvm.x86.avx10.mask.scalef.nepbf16.256(<16 x bfloat> %x0, <16 x bfloat> %x1, <16 x bfloat> zeroinitializer, i16 -1)
  ret <16 x bfloat> %res
}
define <16 x bfloat>@test_int_x86_avx512_mask_scalef_nepbf16_256(<16 x bfloat> %x0, <16 x bfloat> %x1, <16 x bfloat> %x2, i16 %x3) {
; X64-LABEL: test_int_x86_avx512_mask_scalef_nepbf16_256:
; X64: # %bb.0:
; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
; X64-NEXT: vscalefpbf16 %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf6,0x7c,0x29,0x2c,0xd1]
; X64-NEXT: vmovaps %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc2]
; X64-NEXT: retq # encoding: [0xc3]
;
; X86-LABEL: test_int_x86_avx512_mask_scalef_nepbf16_256:
; X86: # %bb.0:
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
; X86-NEXT: vscalefpbf16 %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf6,0x7c,0x29,0x2c,0xd1]
; X86-NEXT: vmovaps %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc2]
; X86-NEXT: retl # encoding: [0xc3]
  %mask = bitcast i16 %x3 to <16 x i1>
  %res = call <16 x bfloat> @llvm.x86.avx10.mask.scalef.nepbf16.256(<16 x bfloat> %x0, <16 x bfloat> %x1, <16 x bfloat> %x2, i16 %x3)
  ret <16 x bfloat> %res
}
589 define <16 x bfloat>@test_int_x86_avx512_maskz_scalef_nepbf16_256(<16 x bfloat> %x0, <16 x bfloat> %x1, i16 %x3) {
590 ; X64-LABEL: test_int_x86_avx512_maskz_scalef_nepbf16_256:
592 ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
593 ; X64-NEXT: vscalefpbf16 %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf6,0x7c,0xa9,0x2c,0xc1]
594 ; X64-NEXT: retq # encoding: [0xc3]
596 ; X86-LABEL: test_int_x86_avx512_maskz_scalef_nepbf16_256:
598 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
599 ; X86-NEXT: vscalefpbf16 %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf6,0x7c,0xa9,0x2c,0xc1]
600 ; X86-NEXT: retl # encoding: [0xc3]
601 %mask = bitcast i16 %x3 to <16 x i1>
602 %res = call <16 x bfloat> @llvm.x86.avx10.mask.scalef.nepbf16.256(<16 x bfloat> %x0, <16 x bfloat> %x1, <16 x bfloat> zeroinitializer, i16 %x3)
603 ret <16 x bfloat> %res