llvm/test/CodeGen/X86/avx512vl-intrinsics-canonical.ll

   1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
   2 ; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512vl --show-mc-encoding | FileCheck %s --check-prefix=X86
   3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl --show-mc-encoding | FileCheck %s --check-prefix=X64
   4
   5 ; NOTE: This should use IR equivalent to what is generated by clang/test/CodeGen/avx512vl-builtins.c
   6
   7 define <2 x double> @test_mm_mask_fmadd_pd(<2 x double> %__A, i8 zeroext %__U, <2 x double> %__B, <2 x double> %__C) {
   8 ; X86-LABEL: test_mm_mask_fmadd_pd:
   9 ; X86:       # %bb.0: # %entry
  10 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
  11 ; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
  12 ; X86-NEXT:    vfmadd132pd %xmm1, %xmm2, %xmm0 {%k1} # encoding: [0x62,0xf2,0xed,0x09,0x98,0xc1]
  13 ; X86-NEXT:    # xmm0 {%k1} = (xmm0 * xmm1) + xmm2
  14 ; X86-NEXT:    retl # encoding: [0xc3]
  15 ;
  16 ; X64-LABEL: test_mm_mask_fmadd_pd:
  17 ; X64:       # %bb.0: # %entry
  18 ; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
  19 ; X64-NEXT:    vfmadd132pd %xmm1, %xmm2, %xmm0 {%k1} # encoding: [0x62,0xf2,0xed,0x09,0x98,0xc1]
  20 ; X64-NEXT:    # xmm0 {%k1} = (xmm0 * xmm1) + xmm2
  21 ; X64-NEXT:    retq # encoding: [0xc3]
  22 entry:
       ; Merge-masked FMA on <2 x double>: computes fma(%__A, %__B, %__C), then
       ; selects per lane using elements 0-1 of %__U bitcast to <8 x i1>.
       ; Lanes whose mask element is false keep the corresponding lane of %__A.
  23   %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %__C) #9
  24   %1 = bitcast i8 %__U to <8 x i1>
  25   %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  26   %2 = select <2 x i1> %extract.i, <2 x double> %0, <2 x double> %__A
  27   ret <2 x double> %2
  28 }
  29
  30 define <2 x double> @test_mm_mask_fmsub_pd(<2 x double> %__A, i8 zeroext %__U, <2 x double> %__B, <2 x double> %__C) {
  31 ; X86-LABEL: test_mm_mask_fmsub_pd:
  32 ; X86:       # %bb.0: # %entry
  33 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
  34 ; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
  35 ; X86-NEXT:    vfmsub132pd %xmm1, %xmm2, %xmm0 {%k1} # encoding: [0x62,0xf2,0xed,0x09,0x9a,0xc1]
  36 ; X86-NEXT:    # xmm0 {%k1} = (xmm0 * xmm1) - xmm2
  37 ; X86-NEXT:    retl # encoding: [0xc3]
  38 ;
  39 ; X64-LABEL: test_mm_mask_fmsub_pd:
  40 ; X64:       # %bb.0: # %entry
  41 ; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
  42 ; X64-NEXT:    vfmsub132pd %xmm1, %xmm2, %xmm0 {%k1} # encoding: [0x62,0xf2,0xed,0x09,0x9a,0xc1]
  43 ; X64-NEXT:    # xmm0 {%k1} = (xmm0 * xmm1) - xmm2
  44 ; X64-NEXT:    retq # encoding: [0xc3]
  45 entry:
       ; Merge-masked FMSUB on <2 x double>: %sub.i negates %__C (fsub from -0.0),
       ; so the fma call yields (%__A * %__B) - %__C. Elements 0-1 of %__U bitcast
       ; to <8 x i1> select that result; unselected lanes keep %__A.
  46   %sub.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__C
  47   %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %sub.i) #9
  48   %1 = bitcast i8 %__U to <8 x i1>
  49   %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  50   %2 = select <2 x i1> %extract.i, <2 x double> %0, <2 x double> %__A
  51   ret <2 x double> %2
  52 }
  53
  54 define <2 x double> @test_mm_mask3_fmadd_pd(<2 x double> %__A, <2 x double> %__B, <2 x double> %__C, i8 zeroext %__U) {
  55 ; X86-LABEL: test_mm_mask3_fmadd_pd:
  56 ; X86:       # %bb.0: # %entry
  57 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
  58 ; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
  59 ; X86-NEXT:    vfmadd231pd %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x09,0xb8,0xd1]
  60 ; X86-NEXT:    # xmm2 {%k1} = (xmm0 * xmm1) + xmm2
  61 ; X86-NEXT:    vmovapd %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xc2]
  62 ; X86-NEXT:    retl # encoding: [0xc3]
  63 ;
  64 ; X64-LABEL: test_mm_mask3_fmadd_pd:
  65 ; X64:       # %bb.0: # %entry
  66 ; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
  67 ; X64-NEXT:    vfmadd231pd %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x09,0xb8,0xd1]
  68 ; X64-NEXT:    # xmm2 {%k1} = (xmm0 * xmm1) + xmm2
  69 ; X64-NEXT:    vmovapd %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xc2]
  70 ; X64-NEXT:    retq # encoding: [0xc3]
  71 entry:
       ; "mask3" FMA on <2 x double>: computes fma(%__A, %__B, %__C), then selects
       ; per lane using elements 0-1 of %__U bitcast to <8 x i1>. Unlike the
       ; "mask" variant, unselected lanes keep the addend %__C rather than %__A.
  72   %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %__C) #9
  73   %1 = bitcast i8 %__U to <8 x i1>
  74   %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  75   %2 = select <2 x i1> %extract.i, <2 x double> %0, <2 x double> %__C
  76   ret <2 x double> %2
  77 }
  78
  79 define <2 x double> @test_mm_mask3_fnmadd_pd(<2 x double> %__A, <2 x double> %__B, <2 x double> %__C, i8 zeroext %__U) {
  80 ; X86-LABEL: test_mm_mask3_fnmadd_pd:
  81 ; X86:       # %bb.0: # %entry
  82 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
  83 ; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
  84 ; X86-NEXT:    vfnmadd231pd %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x09,0xbc,0xd1]
  85 ; X86-NEXT:    # xmm2 {%k1} = -(xmm0 * xmm1) + xmm2
  86 ; X86-NEXT:    vmovapd %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xc2]
  87 ; X86-NEXT:    retl # encoding: [0xc3]
  88 ;
  89 ; X64-LABEL: test_mm_mask3_fnmadd_pd:
  90 ; X64:       # %bb.0: # %entry
  91 ; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
  92 ; X64-NEXT:    vfnmadd231pd %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x09,0xbc,0xd1]
  93 ; X64-NEXT:    # xmm2 {%k1} = -(xmm0 * xmm1) + xmm2
  94 ; X64-NEXT:    vmovapd %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xc2]
  95 ; X64-NEXT:    retq # encoding: [0xc3]
  96 entry:
       ; "mask3" FNMADD on <2 x double>: %sub.i negates %__A (fsub from -0.0), so
       ; the fma call yields -(%__A * %__B) + %__C. Elements 0-1 of %__U bitcast
       ; to <8 x i1> select that result; unselected lanes keep %__C.
  97   %sub.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__A
  98   %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %sub.i, <2 x double> %__B, <2 x double> %__C) #9
  99   %1 = bitcast i8 %__U to <8 x i1>
 100   %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
 101   %2 = select <2 x i1> %extract.i, <2 x double> %0, <2 x double> %__C
 102   ret <2 x double> %2
 103 }
 104
 105 define <2 x double> @test_mm_maskz_fmadd_pd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B, <2 x double> %__C) {
 106 ; X86-LABEL: test_mm_maskz_fmadd_pd:
 107 ; X86:       # %bb.0: # %entry
 108 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 109 ; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 110 ; X86-NEXT:    vfmadd213pd %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0x89,0xa8,0xc2]
 111 ; X86-NEXT:    # xmm0 {%k1} {z} = (xmm1 * xmm0) + xmm2
 112 ; X86-NEXT:    retl # encoding: [0xc3]
 113 ;
 114 ; X64-LABEL: test_mm_maskz_fmadd_pd:
 115 ; X64:       # %bb.0: # %entry
 116 ; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
 117 ; X64-NEXT:    vfmadd213pd %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0x89,0xa8,0xc2]
 118 ; X64-NEXT:    # xmm0 {%k1} {z} = (xmm1 * xmm0) + xmm2
 119 ; X64-NEXT:    retq # encoding: [0xc3]
 120 entry:
       ; Zero-masked FMA on <2 x double>: computes fma(%__A, %__B, %__C), then
       ; selects per lane using elements 0-1 of %__U bitcast to <8 x i1>.
       ; Lanes whose mask element is false are set to zero (zeroinitializer).
 121   %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %__C) #9
 122   %1 = bitcast i8 %__U to <8 x i1>
 123   %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
 124   %2 = select <2 x i1> %extract.i, <2 x double> %0, <2 x double> zeroinitializer
 125   ret <2 x double> %2
 126 }
 127
 128 define <2 x double> @test_mm_maskz_fmsub_pd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B, <2 x double> %__C) {
 129 ; X86-LABEL: test_mm_maskz_fmsub_pd:
 130 ; X86:       # %bb.0: # %entry
 131 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 132 ; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 133 ; X86-NEXT:    vfmsub213pd %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0x89,0xaa,0xc2]
 134 ; X86-NEXT:    # xmm0 {%k1} {z} = (xmm1 * xmm0) - xmm2
 135 ; X86-NEXT:    retl # encoding: [0xc3]
 136 ;
 137 ; X64-LABEL: test_mm_maskz_fmsub_pd:
 138 ; X64:       # %bb.0: # %entry
 139 ; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
 140 ; X64-NEXT:    vfmsub213pd %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0x89,0xaa,0xc2]
 141 ; X64-NEXT:    # xmm0 {%k1} {z} = (xmm1 * xmm0) - xmm2
 142 ; X64-NEXT:    retq # encoding: [0xc3]
 143 entry:
       ; Zero-masked FMSUB on <2 x double>: %sub.i negates %__C (fsub from -0.0),
       ; so the fma call yields (%__A * %__B) - %__C. Elements 0-1 of %__U bitcast
       ; to <8 x i1> select that result; unselected lanes are set to zero.
 144   %sub.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__C
 145   %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %sub.i) #9
 146   %1 = bitcast i8 %__U to <8 x i1>
 147   %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
 148   %2 = select <2 x i1> %extract.i, <2 x double> %0, <2 x double> zeroinitializer
 149   ret <2 x double> %2
 150 }
 151
 152 define <2 x double> @test_mm_maskz_fnmadd_pd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B, <2 x double> %__C) {
 153 ; X86-LABEL: test_mm_maskz_fnmadd_pd:
 154 ; X86:       # %bb.0: # %entry
 155 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 156 ; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 157 ; X86-NEXT:    vfnmadd213pd %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0x89,0xac,0xc2]
 158 ; X86-NEXT:    # xmm0 {%k1} {z} = -(xmm1 * xmm0) + xmm2
 159 ; X86-NEXT:    retl # encoding: [0xc3]
 160 ;
 161 ; X64-LABEL: test_mm_maskz_fnmadd_pd:
 162 ; X64:       # %bb.0: # %entry
 163 ; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
 164 ; X64-NEXT:    vfnmadd213pd %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0x89,0xac,0xc2]
 165 ; X64-NEXT:    # xmm0 {%k1} {z} = -(xmm1 * xmm0) + xmm2
 166 ; X64-NEXT:    retq # encoding: [0xc3]
 167 entry:
       ; Zero-masked FNMADD on <2 x double>: %sub.i negates %__A (fsub from -0.0),
       ; so the fma call yields -(%__A * %__B) + %__C. Elements 0-1 of %__U bitcast
       ; to <8 x i1> select that result; unselected lanes are set to zero.
 168   %sub.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__A
 169   %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %sub.i, <2 x double> %__B, <2 x double> %__C) #9
 170   %1 = bitcast i8 %__U to <8 x i1>
 171   %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
 172   %2 = select <2 x i1> %extract.i, <2 x double> %0, <2 x double> zeroinitializer
 173   ret <2 x double> %2
 174 }
 175
 176 define <2 x double> @test_mm_maskz_fnmsub_pd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B, <2 x double> %__C) {
 177 ; X86-LABEL: test_mm_maskz_fnmsub_pd:
 178 ; X86:       # %bb.0: # %entry
 179 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 180 ; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 181 ; X86-NEXT:    vfnmsub213pd %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0x89,0xae,0xc2]
 182 ; X86-NEXT:    # xmm0 {%k1} {z} = -(xmm1 * xmm0) - xmm2
 183 ; X86-NEXT:    retl # encoding: [0xc3]
 184 ;
 185 ; X64-LABEL: test_mm_maskz_fnmsub_pd:
 186 ; X64:       # %bb.0: # %entry
 187 ; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
 188 ; X64-NEXT:    vfnmsub213pd %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0x89,0xae,0xc2]
 189 ; X64-NEXT:    # xmm0 {%k1} {z} = -(xmm1 * xmm0) - xmm2
 190 ; X64-NEXT:    retq # encoding: [0xc3]
 191 entry:
       ; Zero-masked FNMSUB on <2 x double>: %sub.i negates %__A and %sub1.i
       ; negates %__C (both fsub from -0.0), so the fma call yields
       ; -(%__A * %__B) - %__C. Elements 0-1 of %__U bitcast to <8 x i1> select
       ; that result; unselected lanes are set to zero.
 192   %sub.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__A
 193   %sub1.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__C
 194   %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %sub.i, <2 x double> %__B, <2 x double> %sub1.i) #9
 195   %1 = bitcast i8 %__U to <8 x i1>
 196   %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
 197   %2 = select <2 x i1> %extract.i, <2 x double> %0, <2 x double> zeroinitializer
 198   ret <2 x double> %2
 199 }
 200
 201 define <4 x double> @test_mm256_mask_fmadd_pd(<4 x double> %__A, i8 zeroext %__U, <4 x double> %__B, <4 x double> %__C) {
 202 ; X86-LABEL: test_mm256_mask_fmadd_pd:
 203 ; X86:       # %bb.0: # %entry
 204 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 205 ; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 206 ; X86-NEXT:    vfmadd132pd %ymm1, %ymm2, %ymm0 {%k1} # encoding: [0x62,0xf2,0xed,0x29,0x98,0xc1]
 207 ; X86-NEXT:    # ymm0 {%k1} = (ymm0 * ymm1) + ymm2
 208 ; X86-NEXT:    retl # encoding: [0xc3]
 209 ;
 210 ; X64-LABEL: test_mm256_mask_fmadd_pd:
 211 ; X64:       # %bb.0: # %entry
 212 ; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
 213 ; X64-NEXT:    vfmadd132pd %ymm1, %ymm2, %ymm0 {%k1} # encoding: [0x62,0xf2,0xed,0x29,0x98,0xc1]
 214 ; X64-NEXT:    # ymm0 {%k1} = (ymm0 * ymm1) + ymm2
 215 ; X64-NEXT:    retq # encoding: [0xc3]
 216 entry:
       ; Merge-masked FMA on <4 x double>: computes fma(%__A, %__B, %__C), then
       ; selects per lane using elements 0-3 of %__U bitcast to <8 x i1>.
       ; Lanes whose mask element is false keep the corresponding lane of %__A.
 217   %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %__C) #9
 218   %1 = bitcast i8 %__U to <8 x i1>
 219   %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 220   %2 = select <4 x i1> %extract.i, <4 x double> %0, <4 x double> %__A
 221   ret <4 x double> %2
 222 }
 223
 224 define <4 x double> @test_mm256_mask_fmsub_pd(<4 x double> %__A, i8 zeroext %__U, <4 x double> %__B, <4 x double> %__C) {
 225 ; X86-LABEL: test_mm256_mask_fmsub_pd:
 226 ; X86:       # %bb.0: # %entry
 227 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 228 ; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 229 ; X86-NEXT:    vfmsub132pd %ymm1, %ymm2, %ymm0 {%k1} # encoding: [0x62,0xf2,0xed,0x29,0x9a,0xc1]
 230 ; X86-NEXT:    # ymm0 {%k1} = (ymm0 * ymm1) - ymm2
 231 ; X86-NEXT:    retl # encoding: [0xc3]
 232 ;
 233 ; X64-LABEL: test_mm256_mask_fmsub_pd:
 234 ; X64:       # %bb.0: # %entry
 235 ; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
 236 ; X64-NEXT:    vfmsub132pd %ymm1, %ymm2, %ymm0 {%k1} # encoding: [0x62,0xf2,0xed,0x29,0x9a,0xc1]
 237 ; X64-NEXT:    # ymm0 {%k1} = (ymm0 * ymm1) - ymm2
 238 ; X64-NEXT:    retq # encoding: [0xc3]
 239 entry:
       ; Merge-masked FMSUB on <4 x double>: %sub.i negates %__C (fsub from -0.0),
       ; so the fma call yields (%__A * %__B) - %__C. Elements 0-3 of %__U bitcast
       ; to <8 x i1> select that result; unselected lanes keep %__A.
 240   %sub.i = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
 241   %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %sub.i) #9
 242   %1 = bitcast i8 %__U to <8 x i1>
 243   %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 244   %2 = select <4 x i1> %extract.i, <4 x double> %0, <4 x double> %__A
 245   ret <4 x double> %2
 246 }
 247
 248 define <4 x double> @test_mm256_mask3_fmadd_pd(<4 x double> %__A, <4 x double> %__B, <4 x double> %__C, i8 zeroext %__U) {
 249 ; X86-LABEL: test_mm256_mask3_fmadd_pd:
 250 ; X86:       # %bb.0: # %entry
 251 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 252 ; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 253 ; X86-NEXT:    vfmadd231pd %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x29,0xb8,0xd1]
 254 ; X86-NEXT:    # ymm2 {%k1} = (ymm0 * ymm1) + ymm2
 255 ; X86-NEXT:    vmovapd %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xc2]
 256 ; X86-NEXT:    retl # encoding: [0xc3]
 257 ;
 258 ; X64-LABEL: test_mm256_mask3_fmadd_pd:
 259 ; X64:       # %bb.0: # %entry
 260 ; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
 261 ; X64-NEXT:    vfmadd231pd %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x29,0xb8,0xd1]
 262 ; X64-NEXT:    # ymm2 {%k1} = (ymm0 * ymm1) + ymm2
 263 ; X64-NEXT:    vmovapd %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xc2]
 264 ; X64-NEXT:    retq # encoding: [0xc3]
 265 entry:
       ; "mask3" FMA on <4 x double>: computes fma(%__A, %__B, %__C), then selects
       ; per lane using elements 0-3 of %__U bitcast to <8 x i1>. Unselected
       ; lanes keep the addend %__C rather than %__A.
 266   %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %__C) #9
 267   %1 = bitcast i8 %__U to <8 x i1>
 268   %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 269   %2 = select <4 x i1> %extract.i, <4 x double> %0, <4 x double> %__C
 270   ret <4 x double> %2
 271 }
 272
 273 define <4 x double> @test_mm256_mask3_fnmadd_pd(<4 x double> %__A, <4 x double> %__B, <4 x double> %__C, i8 zeroext %__U) {
 274 ; X86-LABEL: test_mm256_mask3_fnmadd_pd:
 275 ; X86:       # %bb.0: # %entry
 276 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 277 ; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 278 ; X86-NEXT:    vfnmadd231pd %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x29,0xbc,0xd1]
 279 ; X86-NEXT:    # ymm2 {%k1} = -(ymm0 * ymm1) + ymm2
 280 ; X86-NEXT:    vmovapd %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xc2]
 281 ; X86-NEXT:    retl # encoding: [0xc3]
 282 ;
 283 ; X64-LABEL: test_mm256_mask3_fnmadd_pd:
 284 ; X64:       # %bb.0: # %entry
 285 ; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
 286 ; X64-NEXT:    vfnmadd231pd %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x29,0xbc,0xd1]
 287 ; X64-NEXT:    # ymm2 {%k1} = -(ymm0 * ymm1) + ymm2
 288 ; X64-NEXT:    vmovapd %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xc2]
 289 ; X64-NEXT:    retq # encoding: [0xc3]
 290 entry:
       ; "mask3" FNMADD on <4 x double>: %sub.i negates %__A (fsub from -0.0), so
       ; the fma call yields -(%__A * %__B) + %__C. Elements 0-3 of %__U bitcast
       ; to <8 x i1> select that result; unselected lanes keep %__C.
 291   %sub.i = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__A
 292   %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %sub.i, <4 x double> %__B, <4 x double> %__C) #9
 293   %1 = bitcast i8 %__U to <8 x i1>
 294   %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 295   %2 = select <4 x i1> %extract.i, <4 x double> %0, <4 x double> %__C
 296   ret <4 x double> %2
 297 }
 298
 299 define <4 x double> @test_mm256_maskz_fmadd_pd(i8 zeroext %__U, <4 x double> %__A, <4 x double> %__B, <4 x double> %__C) {
 300 ; X86-LABEL: test_mm256_maskz_fmadd_pd:
 301 ; X86:       # %bb.0: # %entry
 302 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 303 ; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 304 ; X86-NEXT:    vfmadd213pd %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0xa9,0xa8,0xc2]
 305 ; X86-NEXT:    # ymm0 {%k1} {z} = (ymm1 * ymm0) + ymm2
 306 ; X86-NEXT:    retl # encoding: [0xc3]
 307 ;
 308 ; X64-LABEL: test_mm256_maskz_fmadd_pd:
 309 ; X64:       # %bb.0: # %entry
 310 ; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
 311 ; X64-NEXT:    vfmadd213pd %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0xa9,0xa8,0xc2]
 312 ; X64-NEXT:    # ymm0 {%k1} {z} = (ymm1 * ymm0) + ymm2
 313 ; X64-NEXT:    retq # encoding: [0xc3]
 314 entry:
       ; Zero-masked FMA on <4 x double>: computes fma(%__A, %__B, %__C), then
       ; selects per lane using elements 0-3 of %__U bitcast to <8 x i1>.
       ; Lanes whose mask element is false are set to zero (zeroinitializer).
 315   %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %__C) #9
 316   %1 = bitcast i8 %__U to <8 x i1>
 317   %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 318   %2 = select <4 x i1> %extract.i, <4 x double> %0, <4 x double> zeroinitializer
 319   ret <4 x double> %2
 320 }
 321
 322 define <4 x double> @test_mm256_maskz_fmsub_pd(i8 zeroext %__U, <4 x double> %__A, <4 x double> %__B, <4 x double> %__C) {
 323 ; X86-LABEL: test_mm256_maskz_fmsub_pd:
 324 ; X86:       # %bb.0: # %entry
 325 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 326 ; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 327 ; X86-NEXT:    vfmsub213pd %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0xa9,0xaa,0xc2]
 328 ; X86-NEXT:    # ymm0 {%k1} {z} = (ymm1 * ymm0) - ymm2
 329 ; X86-NEXT:    retl # encoding: [0xc3]
 330 ;
 331 ; X64-LABEL: test_mm256_maskz_fmsub_pd:
 332 ; X64:       # %bb.0: # %entry
 333 ; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
 334 ; X64-NEXT:    vfmsub213pd %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0xa9,0xaa,0xc2]
 335 ; X64-NEXT:    # ymm0 {%k1} {z} = (ymm1 * ymm0) - ymm2
 336 ; X64-NEXT:    retq # encoding: [0xc3]
 337 entry:
       ; Zero-masked FMSUB on <4 x double>: %sub.i negates %__C (fsub from -0.0),
       ; so the fma call yields (%__A * %__B) - %__C. Elements 0-3 of %__U bitcast
       ; to <8 x i1> select that result; unselected lanes are set to zero.
 338   %sub.i = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
 339   %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %sub.i) #9
 340   %1 = bitcast i8 %__U to <8 x i1>
 341   %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 342   %2 = select <4 x i1> %extract.i, <4 x double> %0, <4 x double> zeroinitializer
 343   ret <4 x double> %2
 344 }
 345
 346 define <4 x double> @test_mm256_maskz_fnmadd_pd(i8 zeroext %__U, <4 x double> %__A, <4 x double> %__B, <4 x double> %__C) {
 347 ; X86-LABEL: test_mm256_maskz_fnmadd_pd:
 348 ; X86:       # %bb.0: # %entry
 349 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 350 ; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 351 ; X86-NEXT:    vfnmadd213pd %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0xa9,0xac,0xc2]
 352 ; X86-NEXT:    # ymm0 {%k1} {z} = -(ymm1 * ymm0) + ymm2
 353 ; X86-NEXT:    retl # encoding: [0xc3]
 354 ;
 355 ; X64-LABEL: test_mm256_maskz_fnmadd_pd:
 356 ; X64:       # %bb.0: # %entry
 357 ; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
 358 ; X64-NEXT:    vfnmadd213pd %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0xa9,0xac,0xc2]
 359 ; X64-NEXT:    # ymm0 {%k1} {z} = -(ymm1 * ymm0) + ymm2
 360 ; X64-NEXT:    retq # encoding: [0xc3]
 361 entry:
       ; Zero-masked FNMADD on <4 x double>: %sub.i negates %__A (fsub from -0.0),
       ; so the fma call yields -(%__A * %__B) + %__C. Elements 0-3 of %__U bitcast
       ; to <8 x i1> select that result; unselected lanes are set to zero.
 362   %sub.i = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__A
 363   %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %sub.i, <4 x double> %__B, <4 x double> %__C) #9
 364   %1 = bitcast i8 %__U to <8 x i1>
 365   %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 366   %2 = select <4 x i1> %extract.i, <4 x double> %0, <4 x double> zeroinitializer
 367   ret <4 x double> %2
 368 }
 369
 370 define <4 x double> @test_mm256_maskz_fnmsub_pd(i8 zeroext %__U, <4 x double> %__A, <4 x double> %__B, <4 x double> %__C) {
 371 ; X86-LABEL: test_mm256_maskz_fnmsub_pd:
 372 ; X86:       # %bb.0: # %entry
 373 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 374 ; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 375 ; X86-NEXT:    vfnmsub213pd %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0xa9,0xae,0xc2]
 376 ; X86-NEXT:    # ymm0 {%k1} {z} = -(ymm1 * ymm0) - ymm2
 377 ; X86-NEXT:    retl # encoding: [0xc3]
 378 ;
 379 ; X64-LABEL: test_mm256_maskz_fnmsub_pd:
 380 ; X64:       # %bb.0: # %entry
 381 ; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
 382 ; X64-NEXT:    vfnmsub213pd %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0xa9,0xae,0xc2]
 383 ; X64-NEXT:    # ymm0 {%k1} {z} = -(ymm1 * ymm0) - ymm2
 384 ; X64-NEXT:    retq # encoding: [0xc3]
 385 entry:
       ; Zero-masked FNMSUB on <4 x double>: %sub.i negates %__A and %sub1.i
       ; negates %__C (both fsub from -0.0), so the fma call yields
       ; -(%__A * %__B) - %__C. Elements 0-3 of %__U bitcast to <8 x i1> select
       ; that result; unselected lanes are set to zero.
 386   %sub.i = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__A
 387   %sub1.i = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
 388   %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %sub.i, <4 x double> %__B, <4 x double> %sub1.i) #9
 389   %1 = bitcast i8 %__U to <8 x i1>
 390   %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 391   %2 = select <4 x i1> %extract.i, <4 x double> %0, <4 x double> zeroinitializer
 392   ret <4 x double> %2
 393 }
 394
 395 define <4 x float> @test_mm_mask_fmadd_ps(<4 x float> %__A, i8 zeroext %__U, <4 x float> %__B, <4 x float> %__C) {
 396 ; X86-LABEL: test_mm_mask_fmadd_ps:
 397 ; X86:       # %bb.0: # %entry
 398 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 399 ; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 400 ; X86-NEXT:    vfmadd132ps %xmm1, %xmm2, %xmm0 {%k1} # encoding: [0x62,0xf2,0x6d,0x09,0x98,0xc1]
 401 ; X86-NEXT:    # xmm0 {%k1} = (xmm0 * xmm1) + xmm2
 402 ; X86-NEXT:    retl # encoding: [0xc3]
 403 ;
 404 ; X64-LABEL: test_mm_mask_fmadd_ps:
 405 ; X64:       # %bb.0: # %entry
 406 ; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
 407 ; X64-NEXT:    vfmadd132ps %xmm1, %xmm2, %xmm0 {%k1} # encoding: [0x62,0xf2,0x6d,0x09,0x98,0xc1]
 408 ; X64-NEXT:    # xmm0 {%k1} = (xmm0 * xmm1) + xmm2
 409 ; X64-NEXT:    retq # encoding: [0xc3]
 410 entry:
       ; Merge-masked FMA on <4 x float>: computes fma(%__A, %__B, %__C), then
       ; selects per lane using elements 0-3 of %__U bitcast to <8 x i1>.
       ; Lanes whose mask element is false keep the corresponding lane of %__A.
 411   %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %__C) #9
 412   %1 = bitcast i8 %__U to <8 x i1>
 413   %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 414   %2 = select <4 x i1> %extract.i, <4 x float> %0, <4 x float> %__A
 415   ret <4 x float> %2
 416 }
 417
 418 define <4 x float> @test_mm_mask_fmsub_ps(<4 x float> %__A, i8 zeroext %__U, <4 x float> %__B, <4 x float> %__C) {
 419 ; X86-LABEL: test_mm_mask_fmsub_ps:
 420 ; X86:       # %bb.0: # %entry
 421 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 422 ; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 423 ; X86-NEXT:    vfmsub132ps %xmm1, %xmm2, %xmm0 {%k1} # encoding: [0x62,0xf2,0x6d,0x09,0x9a,0xc1]
 424 ; X86-NEXT:    # xmm0 {%k1} = (xmm0 * xmm1) - xmm2
 425 ; X86-NEXT:    retl # encoding: [0xc3]
 426 ;
 427 ; X64-LABEL: test_mm_mask_fmsub_ps:
 428 ; X64:       # %bb.0: # %entry
 429 ; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
 430 ; X64-NEXT:    vfmsub132ps %xmm1, %xmm2, %xmm0 {%k1} # encoding: [0x62,0xf2,0x6d,0x09,0x9a,0xc1]
 431 ; X64-NEXT:    # xmm0 {%k1} = (xmm0 * xmm1) - xmm2
 432 ; X64-NEXT:    retq # encoding: [0xc3]
 433 entry:
       ; Merge-masked FMSUB on <4 x float>: %sub.i negates %__C (fsub from -0.0),
       ; so the fma call yields (%__A * %__B) - %__C. Elements 0-3 of %__U bitcast
       ; to <8 x i1> select that result; unselected lanes keep %__A.
 434   %sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
 435   %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %sub.i) #9
 436   %1 = bitcast i8 %__U to <8 x i1>
 437   %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 438   %2 = select <4 x i1> %extract.i, <4 x float> %0, <4 x float> %__A
 439   ret <4 x float> %2
 440 }
 441
 442 define <4 x float> @test_mm_mask3_fmadd_ps(<4 x float> %__A, <4 x float> %__B, <4 x float> %__C, i8 zeroext %__U) {
 443 ; X86-LABEL: test_mm_mask3_fmadd_ps:
 444 ; X86:       # %bb.0: # %entry
 445 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 446 ; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 447 ; X86-NEXT:    vfmadd231ps %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0xb8,0xd1]
 448 ; X86-NEXT:    # xmm2 {%k1} = (xmm0 * xmm1) + xmm2
 449 ; X86-NEXT:    vmovaps %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc2]
 450 ; X86-NEXT:    retl # encoding: [0xc3]
 451 ;
 452 ; X64-LABEL: test_mm_mask3_fmadd_ps:
 453 ; X64:       # %bb.0: # %entry
 454 ; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
 455 ; X64-NEXT:    vfmadd231ps %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0xb8,0xd1]
 456 ; X64-NEXT:    # xmm2 {%k1} = (xmm0 * xmm1) + xmm2
 457 ; X64-NEXT:    vmovaps %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc2]
 458 ; X64-NEXT:    retq # encoding: [0xc3]
 459 entry:
       ; "mask3" FMA on <4 x float>: computes fma(%__A, %__B, %__C), then selects
       ; per lane using elements 0-3 of %__U bitcast to <8 x i1>. Unselected
       ; lanes keep the addend %__C rather than %__A.
 460   %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %__C) #9
 461   %1 = bitcast i8 %__U to <8 x i1>
 462   %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 463   %2 = select <4 x i1> %extract.i, <4 x float> %0, <4 x float> %__C
 464   ret <4 x float> %2
 465 }
 466
 467 define <4 x float> @test_mm_mask3_fnmadd_ps(<4 x float> %__A, <4 x float> %__B, <4 x float> %__C, i8 zeroext %__U) {
 468 ; X86-LABEL: test_mm_mask3_fnmadd_ps:
 469 ; X86:       # %bb.0: # %entry
 470 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 471 ; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 472 ; X86-NEXT:    vfnmadd231ps %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0xbc,0xd1]
 473 ; X86-NEXT:    # xmm2 {%k1} = -(xmm0 * xmm1) + xmm2
 474 ; X86-NEXT:    vmovaps %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc2]
 475 ; X86-NEXT:    retl # encoding: [0xc3]
 476 ;
 477 ; X64-LABEL: test_mm_mask3_fnmadd_ps:
 478 ; X64:       # %bb.0: # %entry
 479 ; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
 480 ; X64-NEXT:    vfnmadd231ps %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0xbc,0xd1]
 481 ; X64-NEXT:    # xmm2 {%k1} = -(xmm0 * xmm1) + xmm2
 482 ; X64-NEXT:    vmovaps %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc2]
 483 ; X64-NEXT:    retq # encoding: [0xc3]
 484 entry:
       ; "mask3" FNMADD on <4 x float>: %sub.i negates %__A (fsub from -0.0), so
       ; the fma call yields -(%__A * %__B) + %__C. Elements 0-3 of %__U bitcast
       ; to <8 x i1> select that result; unselected lanes keep %__C.
 485   %sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A
 486   %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %sub.i, <4 x float> %__B, <4 x float> %__C) #9
 487   %1 = bitcast i8 %__U to <8 x i1>
 488   %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 489   %2 = select <4 x i1> %extract.i, <4 x float> %0, <4 x float> %__C
 490   ret <4 x float> %2
 491 }
 492
 493 define <4 x float> @test_mm_maskz_fmadd_ps(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B, <4 x float> %__C) {
 494 ; X86-LABEL: test_mm_maskz_fmadd_ps:
 495 ; X86:       # %bb.0: # %entry
 496 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 497 ; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 498 ; X86-NEXT:    vfmadd213ps %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0x89,0xa8,0xc2]
 499 ; X86-NEXT:    # xmm0 {%k1} {z} = (xmm1 * xmm0) + xmm2
 500 ; X86-NEXT:    retl # encoding: [0xc3]
 501 ;
 502 ; X64-LABEL: test_mm_maskz_fmadd_ps:
 503 ; X64:       # %bb.0: # %entry
 504 ; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
 505 ; X64-NEXT:    vfmadd213ps %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0x89,0xa8,0xc2]
 506 ; X64-NEXT:    # xmm0 {%k1} {z} = (xmm1 * xmm0) + xmm2
 507 ; X64-NEXT:    retq # encoding: [0xc3]
 508 entry:
       ; Zero-masked FMA on <4 x float>: computes fma(%__A, %__B, %__C), then
       ; selects per lane using elements 0-3 of %__U bitcast to <8 x i1>.
       ; Lanes whose mask element is false are set to zero (zeroinitializer).
 509   %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %__C) #9
 510   %1 = bitcast i8 %__U to <8 x i1>
 511   %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 512   %2 = select <4 x i1> %extract.i, <4 x float> %0, <4 x float> zeroinitializer
 513   ret <4 x float> %2
 514 }
 515
 516 define <4 x float> @test_mm_maskz_fmsub_ps(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B, <4 x float> %__C) {
 517 ; X86-LABEL: test_mm_maskz_fmsub_ps:
 518 ; X86:       # %bb.0: # %entry
 519 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 520 ; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 521 ; X86-NEXT:    vfmsub213ps %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0x89,0xaa,0xc2]
 522 ; X86-NEXT:    # xmm0 {%k1} {z} = (xmm1 * xmm0) - xmm2
 523 ; X86-NEXT:    retl # encoding: [0xc3]
 524 ;
 525 ; X64-LABEL: test_mm_maskz_fmsub_ps:
 526 ; X64:       # %bb.0: # %entry
 527 ; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
 528 ; X64-NEXT:    vfmsub213ps %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0x89,0xaa,0xc2]
 529 ; X64-NEXT:    # xmm0 {%k1} {z} = (xmm1 * xmm0) - xmm2
 530 ; X64-NEXT:    retq # encoding: [0xc3]
 531 entry:
       ; Zero-masked FMSUB on <4 x float>: %sub.i negates %__C (fsub from -0.0),
       ; so the fma call yields (%__A * %__B) - %__C. Elements 0-3 of %__U bitcast
       ; to <8 x i1> select that result; unselected lanes are set to zero.
 532   %sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
 533   %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %sub.i) #9
 534   %1 = bitcast i8 %__U to <8 x i1>
 535   %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 536   %2 = select <4 x i1> %extract.i, <4 x float> %0, <4 x float> zeroinitializer
 537   ret <4 x float> %2
 538 }
 539
 540 define <4 x float> @test_mm_maskz_fnmadd_ps(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B, <4 x float> %__C) {
     ; Zero-masked <4 x float> negated multiply-add, -(%__A * %__B) + %__C.
     ; Both targets are expected to emit a single vfnmadd213ps with {%k1} {z}.
 541 ; X86-LABEL: test_mm_maskz_fnmadd_ps:
 542 ; X86:       # %bb.0: # %entry
 543 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 544 ; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 545 ; X86-NEXT:    vfnmadd213ps %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0x89,0xac,0xc2]
 546 ; X86-NEXT:    # xmm0 {%k1} {z} = -(xmm1 * xmm0) + xmm2
 547 ; X86-NEXT:    retl # encoding: [0xc3]
 548 ;
 549 ; X64-LABEL: test_mm_maskz_fnmadd_ps:
 550 ; X64:       # %bb.0: # %entry
 551 ; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
 552 ; X64-NEXT:    vfnmadd213ps %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0x89,0xac,0xc2]
 553 ; X64-NEXT:    # xmm0 {%k1} {z} = -(xmm1 * xmm0) + xmm2
 554 ; X64-NEXT:    retq # encoding: [0xc3]
 555 entry:
       ; Negating the multiplicand %__A negates the product inside the fma.
 556   %sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A
 557   %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %sub.i, <4 x float> %__B, <4 x float> %__C) #9
       ; Only mask elements 0-3 feed the select; the upper four are dropped.
 558   %1 = bitcast i8 %__U to <8 x i1>
 559   %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
       ; Lanes whose mask element is false are zeroed.
 560   %2 = select <4 x i1> %extract.i, <4 x float> %0, <4 x float> zeroinitializer
 561   ret <4 x float> %2
 562 }
 563
 564 define <4 x float> @test_mm_maskz_fnmsub_ps(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B, <4 x float> %__C) {
     ; Zero-masked <4 x float> negated multiply-subtract, -(%__A * %__B) - %__C.
     ; Both targets are expected to emit a single vfnmsub213ps with {%k1} {z}.
 565 ; X86-LABEL: test_mm_maskz_fnmsub_ps:
 566 ; X86:       # %bb.0: # %entry
 567 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 568 ; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 569 ; X86-NEXT:    vfnmsub213ps %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0x89,0xae,0xc2]
 570 ; X86-NEXT:    # xmm0 {%k1} {z} = -(xmm1 * xmm0) - xmm2
 571 ; X86-NEXT:    retl # encoding: [0xc3]
 572 ;
 573 ; X64-LABEL: test_mm_maskz_fnmsub_ps:
 574 ; X64:       # %bb.0: # %entry
 575 ; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
 576 ; X64-NEXT:    vfnmsub213ps %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0x89,0xae,0xc2]
 577 ; X64-NEXT:    # xmm0 {%k1} {z} = -(xmm1 * xmm0) - xmm2
 578 ; X64-NEXT:    retq # encoding: [0xc3]
 579 entry:
       ; Both the multiplicand %__A and the addend %__C are negated before the fma.
 580   %sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A
 581   %sub1.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
 582   %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %sub.i, <4 x float> %__B, <4 x float> %sub1.i) #9
       ; Only mask elements 0-3 feed the select; the upper four are dropped.
 583   %1 = bitcast i8 %__U to <8 x i1>
 584   %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
       ; Lanes whose mask element is false are zeroed.
 585   %2 = select <4 x i1> %extract.i, <4 x float> %0, <4 x float> zeroinitializer
 586   ret <4 x float> %2
 587 }
 588
 589 define <8 x float> @test_mm256_mask_fmadd_ps(<8 x float> %__A, i8 zeroext %__U, <8 x float> %__B, <8 x float> %__C) {
     ; Merge-masked <8 x float> fused multiply-add, (%__A * %__B) + %__C, with %__A
     ; as the passthrough. Both targets are expected to emit one vfmadd132ps with {%k1}.
 590 ; X86-LABEL: test_mm256_mask_fmadd_ps:
 591 ; X86:       # %bb.0: # %entry
 592 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 593 ; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 594 ; X86-NEXT:    vfmadd132ps %ymm1, %ymm2, %ymm0 {%k1} # encoding: [0x62,0xf2,0x6d,0x29,0x98,0xc1]
 595 ; X86-NEXT:    # ymm0 {%k1} = (ymm0 * ymm1) + ymm2
 596 ; X86-NEXT:    retl # encoding: [0xc3]
 597 ;
 598 ; X64-LABEL: test_mm256_mask_fmadd_ps:
 599 ; X64:       # %bb.0: # %entry
 600 ; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
 601 ; X64-NEXT:    vfmadd132ps %ymm1, %ymm2, %ymm0 {%k1} # encoding: [0x62,0xf2,0x6d,0x29,0x98,0xc1]
 602 ; X64-NEXT:    # ymm0 {%k1} = (ymm0 * ymm1) + ymm2
 603 ; X64-NEXT:    retq # encoding: [0xc3]
 604 entry:
 605   %0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %__C) #9
       ; All eight mask elements are used directly, one per result lane.
 606   %1 = bitcast i8 %__U to <8 x i1>
       ; Lanes whose mask element is false keep the value of %__A.
 607   %2 = select <8 x i1> %1, <8 x float> %0, <8 x float> %__A
 608   ret <8 x float> %2
 609 }
 610
 611 define <8 x float> @test_mm256_mask_fmsub_ps(<8 x float> %__A, i8 zeroext %__U, <8 x float> %__B, <8 x float> %__C) {
     ; Merge-masked <8 x float> fused multiply-subtract, (%__A * %__B) - %__C, with %__A
     ; as the passthrough. Both targets are expected to emit one vfmsub132ps with {%k1}.
 612 ; X86-LABEL: test_mm256_mask_fmsub_ps:
 613 ; X86:       # %bb.0: # %entry
 614 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 615 ; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 616 ; X86-NEXT:    vfmsub132ps %ymm1, %ymm2, %ymm0 {%k1} # encoding: [0x62,0xf2,0x6d,0x29,0x9a,0xc1]
 617 ; X86-NEXT:    # ymm0 {%k1} = (ymm0 * ymm1) - ymm2
 618 ; X86-NEXT:    retl # encoding: [0xc3]
 619 ;
 620 ; X64-LABEL: test_mm256_mask_fmsub_ps:
 621 ; X64:       # %bb.0: # %entry
 622 ; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
 623 ; X64-NEXT:    vfmsub132ps %ymm1, %ymm2, %ymm0 {%k1} # encoding: [0x62,0xf2,0x6d,0x29,0x9a,0xc1]
 624 ; X64-NEXT:    # ymm0 {%k1} = (ymm0 * ymm1) - ymm2
 625 ; X64-NEXT:    retq # encoding: [0xc3]
 626 entry:
       ; Subtracting %__C from -0.0 negates it, turning the fma into a multiply-subtract.
 627   %sub.i = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
 628   %0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %sub.i) #9
       ; All eight mask elements are used directly, one per result lane.
 629   %1 = bitcast i8 %__U to <8 x i1>
       ; Lanes whose mask element is false keep the value of %__A.
 630   %2 = select <8 x i1> %1, <8 x float> %0, <8 x float> %__A
 631   ret <8 x float> %2
 632 }
 633
 634 define <8 x float> @test_mm256_mask3_fmadd_ps(<8 x float> %__A, <8 x float> %__B, <8 x float> %__C, i8 zeroext %__U) {
     ; Merge-masked <8 x float> fused multiply-add whose passthrough is the addend %__C.
     ; The assertions expect vfmadd231ps to accumulate into ymm2 under {%k1}, followed by
     ; a vmovaps that copies ymm2 into the return register ymm0.
 635 ; X86-LABEL: test_mm256_mask3_fmadd_ps:
 636 ; X86:       # %bb.0: # %entry
 637 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 638 ; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 639 ; X86-NEXT:    vfmadd231ps %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0xb8,0xd1]
 640 ; X86-NEXT:    # ymm2 {%k1} = (ymm0 * ymm1) + ymm2
 641 ; X86-NEXT:    vmovaps %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc2]
 642 ; X86-NEXT:    retl # encoding: [0xc3]
 643 ;
 644 ; X64-LABEL: test_mm256_mask3_fmadd_ps:
 645 ; X64:       # %bb.0: # %entry
 646 ; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
 647 ; X64-NEXT:    vfmadd231ps %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0xb8,0xd1]
 648 ; X64-NEXT:    # ymm2 {%k1} = (ymm0 * ymm1) + ymm2
 649 ; X64-NEXT:    vmovaps %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc2]
 650 ; X64-NEXT:    retq # encoding: [0xc3]
 651 entry:
 652   %0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %__C) #9
       ; All eight mask elements are used directly, one per result lane.
 653   %1 = bitcast i8 %__U to <8 x i1>
       ; Lanes whose mask element is false keep the value of %__C.
 654   %2 = select <8 x i1> %1, <8 x float> %0, <8 x float> %__C
 655   ret <8 x float> %2
 656 }
 657
 658 define <8 x float> @test_mm256_mask3_fnmadd_ps(<8 x float> %__A, <8 x float> %__B, <8 x float> %__C, i8 zeroext %__U) {
     ; Merge-masked <8 x float> negated multiply-add, -(%__A * %__B) + %__C, whose
     ; passthrough is the addend %__C. The assertions expect vfnmadd231ps to accumulate
     ; into ymm2 under {%k1}, followed by a vmovaps that copies ymm2 into ymm0.
 659 ; X86-LABEL: test_mm256_mask3_fnmadd_ps:
 660 ; X86:       # %bb.0: # %entry
 661 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 662 ; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 663 ; X86-NEXT:    vfnmadd231ps %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0xbc,0xd1]
 664 ; X86-NEXT:    # ymm2 {%k1} = -(ymm0 * ymm1) + ymm2
 665 ; X86-NEXT:    vmovaps %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc2]
 666 ; X86-NEXT:    retl # encoding: [0xc3]
 667 ;
 668 ; X64-LABEL: test_mm256_mask3_fnmadd_ps:
 669 ; X64:       # %bb.0: # %entry
 670 ; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
 671 ; X64-NEXT:    vfnmadd231ps %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0xbc,0xd1]
 672 ; X64-NEXT:    # ymm2 {%k1} = -(ymm0 * ymm1) + ymm2
 673 ; X64-NEXT:    vmovaps %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc2]
 674 ; X64-NEXT:    retq # encoding: [0xc3]
 675 entry:
       ; Negating the multiplicand %__A negates the product inside the fma.
 676   %sub.i = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A
 677   %0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %sub.i, <8 x float> %__B, <8 x float> %__C) #9
       ; All eight mask elements are used directly, one per result lane.
 678   %1 = bitcast i8 %__U to <8 x i1>
       ; Lanes whose mask element is false keep the value of %__C.
 679   %2 = select <8 x i1> %1, <8 x float> %0, <8 x float> %__C
 680   ret <8 x float> %2
 681 }
 682
 683 define <8 x float> @test_mm256_maskz_fmadd_ps(i8 zeroext %__U, <8 x float> %__A, <8 x float> %__B, <8 x float> %__C) {
     ; Zero-masked <8 x float> fused multiply-add, (%__A * %__B) + %__C.
     ; Both targets are expected to emit a single vfmadd213ps with {%k1} {z}.
 684 ; X86-LABEL: test_mm256_maskz_fmadd_ps:
 685 ; X86:       # %bb.0: # %entry
 686 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 687 ; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 688 ; X86-NEXT:    vfmadd213ps %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xa9,0xa8,0xc2]
 689 ; X86-NEXT:    # ymm0 {%k1} {z} = (ymm1 * ymm0) + ymm2
 690 ; X86-NEXT:    retl # encoding: [0xc3]
 691 ;
 692 ; X64-LABEL: test_mm256_maskz_fmadd_ps:
 693 ; X64:       # %bb.0: # %entry
 694 ; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
 695 ; X64-NEXT:    vfmadd213ps %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xa9,0xa8,0xc2]
 696 ; X64-NEXT:    # ymm0 {%k1} {z} = (ymm1 * ymm0) + ymm2
 697 ; X64-NEXT:    retq # encoding: [0xc3]
 698 entry:
 699   %0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %__C) #9
       ; All eight mask elements are used directly, one per result lane.
 700   %1 = bitcast i8 %__U to <8 x i1>
       ; Lanes whose mask element is false are zeroed.
 701   %2 = select <8 x i1> %1, <8 x float> %0, <8 x float> zeroinitializer
 702   ret <8 x float> %2
 703 }
 704
 705 define <8 x float> @test_mm256_maskz_fmsub_ps(i8 zeroext %__U, <8 x float> %__A, <8 x float> %__B, <8 x float> %__C) {
     ; Zero-masked <8 x float> fused multiply-subtract, (%__A * %__B) - %__C.
     ; Both targets are expected to emit a single vfmsub213ps with {%k1} {z}.
 706 ; X86-LABEL: test_mm256_maskz_fmsub_ps:
 707 ; X86:       # %bb.0: # %entry
 708 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 709 ; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 710 ; X86-NEXT:    vfmsub213ps %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xa9,0xaa,0xc2]
 711 ; X86-NEXT:    # ymm0 {%k1} {z} = (ymm1 * ymm0) - ymm2
 712 ; X86-NEXT:    retl # encoding: [0xc3]
 713 ;
 714 ; X64-LABEL: test_mm256_maskz_fmsub_ps:
 715 ; X64:       # %bb.0: # %entry
 716 ; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
 717 ; X64-NEXT:    vfmsub213ps %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xa9,0xaa,0xc2]
 718 ; X64-NEXT:    # ymm0 {%k1} {z} = (ymm1 * ymm0) - ymm2
 719 ; X64-NEXT:    retq # encoding: [0xc3]
 720 entry:
       ; Subtracting %__C from -0.0 negates it, turning the fma into a multiply-subtract.
 721   %sub.i = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
 722   %0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %sub.i) #9
       ; All eight mask elements are used directly, one per result lane.
 723   %1 = bitcast i8 %__U to <8 x i1>
       ; Lanes whose mask element is false are zeroed.
 724   %2 = select <8 x i1> %1, <8 x float> %0, <8 x float> zeroinitializer
 725   ret <8 x float> %2
 726 }
 727
 728 define <8 x float> @test_mm256_maskz_fnmadd_ps(i8 zeroext %__U, <8 x float> %__A, <8 x float> %__B, <8 x float> %__C) {
     ; Zero-masked <8 x float> negated multiply-add, -(%__A * %__B) + %__C.
     ; Both targets are expected to emit a single vfnmadd213ps with {%k1} {z}.
 729 ; X86-LABEL: test_mm256_maskz_fnmadd_ps:
 730 ; X86:       # %bb.0: # %entry
 731 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 732 ; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 733 ; X86-NEXT:    vfnmadd213ps %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xa9,0xac,0xc2]
 734 ; X86-NEXT:    # ymm0 {%k1} {z} = -(ymm1 * ymm0) + ymm2
 735 ; X86-NEXT:    retl # encoding: [0xc3]
 736 ;
 737 ; X64-LABEL: test_mm256_maskz_fnmadd_ps:
 738 ; X64:       # %bb.0: # %entry
 739 ; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
 740 ; X64-NEXT:    vfnmadd213ps %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xa9,0xac,0xc2]
 741 ; X64-NEXT:    # ymm0 {%k1} {z} = -(ymm1 * ymm0) + ymm2
 742 ; X64-NEXT:    retq # encoding: [0xc3]
 743 entry:
       ; Negating the multiplicand %__A negates the product inside the fma.
 744   %sub.i = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A
 745   %0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %sub.i, <8 x float> %__B, <8 x float> %__C) #9
       ; All eight mask elements are used directly, one per result lane.
 746   %1 = bitcast i8 %__U to <8 x i1>
       ; Lanes whose mask element is false are zeroed.
 747   %2 = select <8 x i1> %1, <8 x float> %0, <8 x float> zeroinitializer
 748   ret <8 x float> %2
 749 }
 750
 751 define <8 x float> @test_mm256_maskz_fnmsub_ps(i8 zeroext %__U, <8 x float> %__A, <8 x float> %__B, <8 x float> %__C) {
     ; Zero-masked <8 x float> negated multiply-subtract, -(%__A * %__B) - %__C.
     ; Both targets are expected to emit a single vfnmsub213ps with {%k1} {z}.
 752 ; X86-LABEL: test_mm256_maskz_fnmsub_ps:
 753 ; X86:       # %bb.0: # %entry
 754 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 755 ; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 756 ; X86-NEXT:    vfnmsub213ps %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xa9,0xae,0xc2]
 757 ; X86-NEXT:    # ymm0 {%k1} {z} = -(ymm1 * ymm0) - ymm2
 758 ; X86-NEXT:    retl # encoding: [0xc3]
 759 ;
 760 ; X64-LABEL: test_mm256_maskz_fnmsub_ps:
 761 ; X64:       # %bb.0: # %entry
 762 ; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
 763 ; X64-NEXT:    vfnmsub213ps %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xa9,0xae,0xc2]
 764 ; X64-NEXT:    # ymm0 {%k1} {z} = -(ymm1 * ymm0) - ymm2
 765 ; X64-NEXT:    retq # encoding: [0xc3]
 766 entry:
       ; Both the multiplicand %__A and the addend %__C are negated before the fma.
 767   %sub.i = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A
 768   %sub1.i = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
 769   %0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %sub.i, <8 x float> %__B, <8 x float> %sub1.i) #9
       ; All eight mask elements are used directly, one per result lane.
 770   %1 = bitcast i8 %__U to <8 x i1>
       ; Lanes whose mask element is false are zeroed.
 771   %2 = select <8 x i1> %1, <8 x float> %0, <8 x float> zeroinitializer
 772   ret <8 x float> %2
 773 }
 774
 775 define <2 x double> @test_mm_mask_fmaddsub_pd(<2 x double> %__A, i8 zeroext %__U, <2 x double> %__B, <2 x double> %__C) {
     ; Merge-masked <2 x double> alternating multiply-add/subtract with %__A as the
     ; passthrough. The IR builds it from two fma calls blended by a shufflevector; both
     ; targets are expected to fold that into one vfmaddsub132pd with {%k1}.
 776 ; X86-LABEL: test_mm_mask_fmaddsub_pd:
 777 ; X86:       # %bb.0: # %entry
 778 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 779 ; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 780 ; X86-NEXT:    vfmaddsub132pd %xmm1, %xmm2, %xmm0 {%k1} # encoding: [0x62,0xf2,0xed,0x09,0x96,0xc1]
 781 ; X86-NEXT:    # xmm0 {%k1} = (xmm0 * xmm1) +/- xmm2
 782 ; X86-NEXT:    retl # encoding: [0xc3]
 783 ;
 784 ; X64-LABEL: test_mm_mask_fmaddsub_pd:
 785 ; X64:       # %bb.0: # %entry
 786 ; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
 787 ; X64-NEXT:    vfmaddsub132pd %xmm1, %xmm2, %xmm0 {%k1} # encoding: [0x62,0xf2,0xed,0x09,0x96,0xc1]
 788 ; X64-NEXT:    # xmm0 {%k1} = (xmm0 * xmm1) +/- xmm2
 789 ; X64-NEXT:    retq # encoding: [0xc3]
 790 entry:
       ; %0 is the add form (A*B + C); %2 is the subtract form (A*B - C).
 791   %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %__C) #9
 792   %1 = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__C
 793   %2 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %1) #9
       ; Lane 0 comes from the subtract form, lane 1 (index 3) from the add form.
 794   %3 = shufflevector <2 x double> %2, <2 x double> %0, <2 x i32> <i32 0, i32 3>
       ; Only mask elements 0-1 feed the select; the upper six are dropped.
 795   %4 = bitcast i8 %__U to <8 x i1>
 796   %extract.i = shufflevector <8 x i1> %4, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
       ; Lanes whose mask element is false keep the value of %__A.
 797   %5 = select <2 x i1> %extract.i, <2 x double> %3, <2 x double> %__A
 798   ret <2 x double> %5
 799 }
 800
 801 define <2 x double> @test_mm_mask_fmsubadd_pd(<2 x double> %__A, i8 zeroext %__U, <2 x double> %__B, <2 x double> %__C) {
     ; Merge-masked <2 x double> alternating multiply-subtract/add with %__A as the
     ; passthrough. The IR builds it from two fma calls blended by a shufflevector; both
     ; targets are expected to fold that into one vfmsubadd132pd with {%k1}.
 802 ; X86-LABEL: test_mm_mask_fmsubadd_pd:
 803 ; X86:       # %bb.0: # %entry
 804 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 805 ; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 806 ; X86-NEXT:    vfmsubadd132pd %xmm1, %xmm2, %xmm0 {%k1} # encoding: [0x62,0xf2,0xed,0x09,0x97,0xc1]
 807 ; X86-NEXT:    # xmm0 {%k1} = (xmm0 * xmm1) -/+ xmm2
 808 ; X86-NEXT:    retl # encoding: [0xc3]
 809 ;
 810 ; X64-LABEL: test_mm_mask_fmsubadd_pd:
 811 ; X64:       # %bb.0: # %entry
 812 ; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
 813 ; X64-NEXT:    vfmsubadd132pd %xmm1, %xmm2, %xmm0 {%k1} # encoding: [0x62,0xf2,0xed,0x09,0x97,0xc1]
 814 ; X64-NEXT:    # xmm0 {%k1} = (xmm0 * xmm1) -/+ xmm2
 815 ; X64-NEXT:    retq # encoding: [0xc3]
 816 entry:
       ; %0 is the subtract form (A*B - C); %1 is the add form (A*B + C).
 817   %sub.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__C
 818   %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %sub.i) #9
 819   %1 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %__C) #9
       ; Lane 0 comes from the add form, lane 1 (index 3) from the subtract form.
 820   %2 = shufflevector <2 x double> %1, <2 x double> %0, <2 x i32> <i32 0, i32 3>
       ; Only mask elements 0-1 feed the select; the upper six are dropped.
 821   %3 = bitcast i8 %__U to <8 x i1>
 822   %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
       ; Lanes whose mask element is false keep the value of %__A.
 823   %4 = select <2 x i1> %extract.i, <2 x double> %2, <2 x double> %__A
 824   ret <2 x double> %4
 825 }
 826
 827 define <2 x double> @test_mm_mask3_fmaddsub_pd(<2 x double> %__A, <2 x double> %__B, <2 x double> %__C, i8 zeroext %__U) {
     ; Merge-masked <2 x double> alternating multiply-add/subtract whose passthrough is
     ; the addend %__C. The assertions expect vfmaddsub231pd to accumulate into xmm2 under
     ; {%k1}, followed by a vmovapd that copies xmm2 into the return register xmm0.
 828 ; X86-LABEL: test_mm_mask3_fmaddsub_pd:
 829 ; X86:       # %bb.0: # %entry
 830 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 831 ; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 832 ; X86-NEXT:    vfmaddsub231pd %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x09,0xb6,0xd1]
 833 ; X86-NEXT:    # xmm2 {%k1} = (xmm0 * xmm1) +/- xmm2
 834 ; X86-NEXT:    vmovapd %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xc2]
 835 ; X86-NEXT:    retl # encoding: [0xc3]
 836 ;
 837 ; X64-LABEL: test_mm_mask3_fmaddsub_pd:
 838 ; X64:       # %bb.0: # %entry
 839 ; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
 840 ; X64-NEXT:    vfmaddsub231pd %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x09,0xb6,0xd1]
 841 ; X64-NEXT:    # xmm2 {%k1} = (xmm0 * xmm1) +/- xmm2
 842 ; X64-NEXT:    vmovapd %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xc2]
 843 ; X64-NEXT:    retq # encoding: [0xc3]
 844 entry:
       ; %0 is the add form (A*B + C); %2 is the subtract form (A*B - C).
 845   %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %__C) #9
 846   %1 = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__C
 847   %2 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %1) #9
       ; Lane 0 comes from the subtract form, lane 1 (index 3) from the add form.
 848   %3 = shufflevector <2 x double> %2, <2 x double> %0, <2 x i32> <i32 0, i32 3>
       ; Only mask elements 0-1 feed the select; the upper six are dropped.
 849   %4 = bitcast i8 %__U to <8 x i1>
 850   %extract.i = shufflevector <8 x i1> %4, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
       ; Lanes whose mask element is false keep the value of %__C.
 851   %5 = select <2 x i1> %extract.i, <2 x double> %3, <2 x double> %__C
 852   ret <2 x double> %5
 853 }
 854
 855 define <2 x double> @test_mm_maskz_fmaddsub_pd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B, <2 x double> %__C) {
     ; Zero-masked <2 x double> alternating multiply-add/subtract. The IR builds it from
     ; two fma calls blended by a shufflevector; both targets are expected to fold that
     ; into one vfmaddsub213pd with {%k1} {z}.
 856 ; X86-LABEL: test_mm_maskz_fmaddsub_pd:
 857 ; X86:       # %bb.0: # %entry
 858 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 859 ; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 860 ; X86-NEXT:    vfmaddsub213pd %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0x89,0xa6,0xc2]
 861 ; X86-NEXT:    # xmm0 {%k1} {z} = (xmm1 * xmm0) +/- xmm2
 862 ; X86-NEXT:    retl # encoding: [0xc3]
 863 ;
 864 ; X64-LABEL: test_mm_maskz_fmaddsub_pd:
 865 ; X64:       # %bb.0: # %entry
 866 ; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
 867 ; X64-NEXT:    vfmaddsub213pd %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0x89,0xa6,0xc2]
 868 ; X64-NEXT:    # xmm0 {%k1} {z} = (xmm1 * xmm0) +/- xmm2
 869 ; X64-NEXT:    retq # encoding: [0xc3]
 870 entry:
       ; %0 is the add form (A*B + C); %2 is the subtract form (A*B - C).
 871   %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %__C) #9
 872   %1 = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__C
 873   %2 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %1) #9
       ; Lane 0 comes from the subtract form, lane 1 (index 3) from the add form.
 874   %3 = shufflevector <2 x double> %2, <2 x double> %0, <2 x i32> <i32 0, i32 3>
       ; Only mask elements 0-1 feed the select; the upper six are dropped.
 875   %4 = bitcast i8 %__U to <8 x i1>
 876   %extract.i = shufflevector <8 x i1> %4, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
       ; Lanes whose mask element is false are zeroed.
 877   %5 = select <2 x i1> %extract.i, <2 x double> %3, <2 x double> zeroinitializer
 878   ret <2 x double> %5
 879 }
 880
 881 define <2 x double> @test_mm_maskz_fmsubadd_pd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B, <2 x double> %__C) {
     ; Zero-masked <2 x double> alternating multiply-subtract/add. The IR builds it from
     ; two fma calls blended by a shufflevector; both targets are expected to fold that
     ; into one vfmsubadd213pd with {%k1} {z}.
 882 ; X86-LABEL: test_mm_maskz_fmsubadd_pd:
 883 ; X86:       # %bb.0: # %entry
 884 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 885 ; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 886 ; X86-NEXT:    vfmsubadd213pd %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0x89,0xa7,0xc2]
 887 ; X86-NEXT:    # xmm0 {%k1} {z} = (xmm1 * xmm0) -/+ xmm2
 888 ; X86-NEXT:    retl # encoding: [0xc3]
 889 ;
 890 ; X64-LABEL: test_mm_maskz_fmsubadd_pd:
 891 ; X64:       # %bb.0: # %entry
 892 ; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
 893 ; X64-NEXT:    vfmsubadd213pd %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0x89,0xa7,0xc2]
 894 ; X64-NEXT:    # xmm0 {%k1} {z} = (xmm1 * xmm0) -/+ xmm2
 895 ; X64-NEXT:    retq # encoding: [0xc3]
 896 entry:
       ; %0 is the subtract form (A*B - C); %1 is the add form (A*B + C).
 897   %sub.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__C
 898   %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %sub.i) #9
 899   %1 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %__C) #9
       ; Lane 0 comes from the add form, lane 1 (index 3) from the subtract form.
 900   %2 = shufflevector <2 x double> %1, <2 x double> %0, <2 x i32> <i32 0, i32 3>
       ; Only mask elements 0-1 feed the select; the upper six are dropped.
 901   %3 = bitcast i8 %__U to <8 x i1>
 902   %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
       ; Lanes whose mask element is false are zeroed.
 903   %4 = select <2 x i1> %extract.i, <2 x double> %2, <2 x double> zeroinitializer
 904   ret <2 x double> %4
 905 }
 906
 907 define <4 x double> @test_mm256_mask_fmaddsub_pd(<4 x double> %__A, i8 zeroext %__U, <4 x double> %__B, <4 x double> %__C) {
     ; Merge-masked <4 x double> alternating multiply-add/subtract with %__A as the
     ; passthrough. The IR builds it from two fma calls blended by a shufflevector; both
     ; targets are expected to fold that into one vfmaddsub132pd with {%k1}.
 908 ; X86-LABEL: test_mm256_mask_fmaddsub_pd:
 909 ; X86:       # %bb.0: # %entry
 910 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 911 ; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 912 ; X86-NEXT:    vfmaddsub132pd %ymm1, %ymm2, %ymm0 {%k1} # encoding: [0x62,0xf2,0xed,0x29,0x96,0xc1]
 913 ; X86-NEXT:    # ymm0 {%k1} = (ymm0 * ymm1) +/- ymm2
 914 ; X86-NEXT:    retl # encoding: [0xc3]
 915 ;
 916 ; X64-LABEL: test_mm256_mask_fmaddsub_pd:
 917 ; X64:       # %bb.0: # %entry
 918 ; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
 919 ; X64-NEXT:    vfmaddsub132pd %ymm1, %ymm2, %ymm0 {%k1} # encoding: [0x62,0xf2,0xed,0x29,0x96,0xc1]
 920 ; X64-NEXT:    # ymm0 {%k1} = (ymm0 * ymm1) +/- ymm2
 921 ; X64-NEXT:    retq # encoding: [0xc3]
 922 entry:
       ; %0 is the add form (A*B + C); %2 is the subtract form (A*B - C).
 923   %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %__C) #9
 924   %1 = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
 925   %2 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %1) #9
       ; Lanes 0 and 2 come from the subtract form, lanes 1 and 3 (indices 5, 7) from the add form.
 926   %3 = shufflevector <4 x double> %2, <4 x double> %0, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
       ; Only mask elements 0-3 feed the select; the upper four are dropped.
 927   %4 = bitcast i8 %__U to <8 x i1>
 928   %extract.i = shufflevector <8 x i1> %4, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
       ; Lanes whose mask element is false keep the value of %__A.
 929   %5 = select <4 x i1> %extract.i, <4 x double> %3, <4 x double> %__A
 930   ret <4 x double> %5
 931 }
 932
 933 define <4 x double> @test_mm256_mask_fmsubadd_pd(<4 x double> %__A, i8 zeroext %__U, <4 x double> %__B, <4 x double> %__C) {
     ; Merge-masked <4 x double> alternating multiply-subtract/add with %__A as the
     ; passthrough. The IR builds it from two fma calls blended by a shufflevector; both
     ; targets are expected to fold that into one vfmsubadd132pd with {%k1}.
 934 ; X86-LABEL: test_mm256_mask_fmsubadd_pd:
 935 ; X86:       # %bb.0: # %entry
 936 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 937 ; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 938 ; X86-NEXT:    vfmsubadd132pd %ymm1, %ymm2, %ymm0 {%k1} # encoding: [0x62,0xf2,0xed,0x29,0x97,0xc1]
 939 ; X86-NEXT:    # ymm0 {%k1} = (ymm0 * ymm1) -/+ ymm2
 940 ; X86-NEXT:    retl # encoding: [0xc3]
 941 ;
 942 ; X64-LABEL: test_mm256_mask_fmsubadd_pd:
 943 ; X64:       # %bb.0: # %entry
 944 ; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
 945 ; X64-NEXT:    vfmsubadd132pd %ymm1, %ymm2, %ymm0 {%k1} # encoding: [0x62,0xf2,0xed,0x29,0x97,0xc1]
 946 ; X64-NEXT:    # ymm0 {%k1} = (ymm0 * ymm1) -/+ ymm2
 947 ; X64-NEXT:    retq # encoding: [0xc3]
 948 entry:
       ; %0 is the subtract form (A*B - C); %1 is the add form (A*B + C).
 949   %sub.i = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
 950   %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %sub.i) #9
 951   %1 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %__C) #9
       ; Lanes 0 and 2 come from the add form, lanes 1 and 3 (indices 5, 7) from the subtract form.
 952   %2 = shufflevector <4 x double> %1, <4 x double> %0, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
       ; Only mask elements 0-3 feed the select; the upper four are dropped.
 953   %3 = bitcast i8 %__U to <8 x i1>
 954   %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
       ; Lanes whose mask element is false keep the value of %__A.
 955   %4 = select <4 x i1> %extract.i, <4 x double> %2, <4 x double> %__A
 956   ret <4 x double> %4
 957 }
 958
 959 define <4 x double> @test_mm256_mask3_fmaddsub_pd(<4 x double> %__A, <4 x double> %__B, <4 x double> %__C, i8 zeroext %__U) {
     ; Merge-masked <4 x double> alternating multiply-add/subtract whose passthrough is
     ; the addend %__C. The assertions expect vfmaddsub231pd to accumulate into ymm2 under
     ; {%k1}, followed by a vmovapd that copies ymm2 into the return register ymm0.
 960 ; X86-LABEL: test_mm256_mask3_fmaddsub_pd:
 961 ; X86:       # %bb.0: # %entry
 962 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 963 ; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 964 ; X86-NEXT:    vfmaddsub231pd %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x29,0xb6,0xd1]
 965 ; X86-NEXT:    # ymm2 {%k1} = (ymm0 * ymm1) +/- ymm2
 966 ; X86-NEXT:    vmovapd %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xc2]
 967 ; X86-NEXT:    retl # encoding: [0xc3]
 968 ;
 969 ; X64-LABEL: test_mm256_mask3_fmaddsub_pd:
 970 ; X64:       # %bb.0: # %entry
 971 ; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
 972 ; X64-NEXT:    vfmaddsub231pd %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x29,0xb6,0xd1]
 973 ; X64-NEXT:    # ymm2 {%k1} = (ymm0 * ymm1) +/- ymm2
 974 ; X64-NEXT:    vmovapd %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xc2]
 975 ; X64-NEXT:    retq # encoding: [0xc3]
 976 entry:
       ; %0 is the add form (A*B + C); %2 is the subtract form (A*B - C).
 977   %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %__C) #9
 978   %1 = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
 979   %2 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %1) #9
       ; Lanes 0 and 2 come from the subtract form, lanes 1 and 3 (indices 5, 7) from the add form.
 980   %3 = shufflevector <4 x double> %2, <4 x double> %0, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
       ; Only mask elements 0-3 feed the select; the upper four are dropped.
 981   %4 = bitcast i8 %__U to <8 x i1>
 982   %extract.i = shufflevector <8 x i1> %4, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
       ; Lanes whose mask element is false keep the value of %__C.
 983   %5 = select <4 x i1> %extract.i, <4 x double> %3, <4 x double> %__C
 984   ret <4 x double> %5
 985 }
 986
 987 define <4 x double> @test_mm256_maskz_fmaddsub_pd(i8 zeroext %__U, <4 x double> %__A, <4 x double> %__B, <4 x double> %__C) {
     ; Zero-masked <4 x double> alternating multiply-add/subtract. The IR builds it from
     ; two fma calls blended by a shufflevector; both targets are expected to fold that
     ; into one vfmaddsub213pd with {%k1} {z}.
 988 ; X86-LABEL: test_mm256_maskz_fmaddsub_pd:
 989 ; X86:       # %bb.0: # %entry
 990 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 991 ; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 992 ; X86-NEXT:    vfmaddsub213pd %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0xa9,0xa6,0xc2]
 993 ; X86-NEXT:    # ymm0 {%k1} {z} = (ymm1 * ymm0) +/- ymm2
 994 ; X86-NEXT:    retl # encoding: [0xc3]
 995 ;
 996 ; X64-LABEL: test_mm256_maskz_fmaddsub_pd:
 997 ; X64:       # %bb.0: # %entry
 998 ; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
 999 ; X64-NEXT:    vfmaddsub213pd %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0xa9,0xa6,0xc2]
1000 ; X64-NEXT:    # ymm0 {%k1} {z} = (ymm1 * ymm0) +/- ymm2
1001 ; X64-NEXT:    retq # encoding: [0xc3]
1002 entry:
       ; %0 is the add form (A*B + C); %2 is the subtract form (A*B - C).
1003   %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %__C) #9
1004   %1 = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
1005   %2 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %1) #9
       ; Lanes 0 and 2 come from the subtract form, lanes 1 and 3 (indices 5, 7) from the add form.
1006   %3 = shufflevector <4 x double> %2, <4 x double> %0, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
       ; Only mask elements 0-3 feed the select; the upper four are dropped.
1007   %4 = bitcast i8 %__U to <8 x i1>
1008   %extract.i = shufflevector <8 x i1> %4, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
       ; Lanes whose mask element is false are zeroed.
1009   %5 = select <4 x i1> %extract.i, <4 x double> %3, <4 x double> zeroinitializer
1010   ret <4 x double> %5
1011 }
1012
1013 define <4 x double> @test_mm256_maskz_fmsubadd_pd(i8 zeroext %__U, <4 x double> %__A, <4 x double> %__B, <4 x double> %__C) {
     ; Zero-masked <4 x double> alternating multiply-subtract/add. The IR builds it from
     ; two fma calls blended by a shufflevector; both targets are expected to fold that
     ; into one vfmsubadd213pd with {%k1} {z}.
1014 ; X86-LABEL: test_mm256_maskz_fmsubadd_pd:
1015 ; X86:       # %bb.0: # %entry
1016 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
1017 ; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
1018 ; X86-NEXT:    vfmsubadd213pd %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0xa9,0xa7,0xc2]
1019 ; X86-NEXT:    # ymm0 {%k1} {z} = (ymm1 * ymm0) -/+ ymm2
1020 ; X86-NEXT:    retl # encoding: [0xc3]
1021 ;
1022 ; X64-LABEL: test_mm256_maskz_fmsubadd_pd:
1023 ; X64:       # %bb.0: # %entry
1024 ; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
1025 ; X64-NEXT:    vfmsubadd213pd %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0xa9,0xa7,0xc2]
1026 ; X64-NEXT:    # ymm0 {%k1} {z} = (ymm1 * ymm0) -/+ ymm2
1027 ; X64-NEXT:    retq # encoding: [0xc3]
1028 entry:
       ; %0 is the subtract form (A*B - C); %1 is the add form (A*B + C).
1029   %sub.i = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
1030   %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %sub.i) #9
1031   %1 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %__C) #9
       ; Lanes 0 and 2 come from the add form, lanes 1 and 3 (indices 5, 7) from the subtract form.
1032   %2 = shufflevector <4 x double> %1, <4 x double> %0, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
       ; Only mask elements 0-3 feed the select; the upper four are dropped.
1033   %3 = bitcast i8 %__U to <8 x i1>
1034   %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
       ; Lanes whose mask element is false are zeroed.
1035   %4 = select <4 x i1> %extract.i, <4 x double> %2, <4 x double> zeroinitializer
1036   ret <4 x double> %4
1037 }
1038
; Merge-masked fmaddsub on <4 x float>.
; The IR computes fma(A, B, C) and fma(A, B, -C) (negation written as an fsub
; from -0.0) and interleaves them: elements 0 and 2 are A*B - C, elements 1
; and 3 are A*B + C. The first four elements of %__U bitcast to <8 x i1>
; choose per element between that result and the passthrough %__A.
; Both RUN configurations expect one vfmaddsub132ps predicated on %k1.
1039 define <4 x float> @test_mm_mask_fmaddsub_ps(<4 x float> %__A, i8 zeroext %__U, <4 x float> %__B, <4 x float> %__C) {
1040 ; X86-LABEL: test_mm_mask_fmaddsub_ps:
1041 ; X86:       # %bb.0: # %entry
1042 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
1043 ; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
1044 ; X86-NEXT:    vfmaddsub132ps %xmm1, %xmm2, %xmm0 {%k1} # encoding: [0x62,0xf2,0x6d,0x09,0x96,0xc1]
1045 ; X86-NEXT:    # xmm0 {%k1} = (xmm0 * xmm1) +/- xmm2
1046 ; X86-NEXT:    retl # encoding: [0xc3]
1047 ;
1048 ; X64-LABEL: test_mm_mask_fmaddsub_ps:
1049 ; X64:       # %bb.0: # %entry
1050 ; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
1051 ; X64-NEXT:    vfmaddsub132ps %xmm1, %xmm2, %xmm0 {%k1} # encoding: [0x62,0xf2,0x6d,0x09,0x96,0xc1]
1052 ; X64-NEXT:    # xmm0 {%k1} = (xmm0 * xmm1) +/- xmm2
1053 ; X64-NEXT:    retq # encoding: [0xc3]
1054 entry:
1055   %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %__C) #9
1056   %1 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
1057   %2 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %1) #9
; Indices 0 and 2 read %2 (A*B - C); indices 5 and 7 read elements 1 and 3 of %0 (A*B + C).
1058   %3 = shufflevector <4 x float> %2, <4 x float> %0, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
1059   %4 = bitcast i8 %__U to <8 x i1>
1060   %extract.i = shufflevector <8 x i1> %4, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
; Elements whose mask element is false keep the value from %__A.
1061   %5 = select <4 x i1> %extract.i, <4 x float> %3, <4 x float> %__A
1062   ret <4 x float> %5
1063 }
1064
; Merge-masked fmsubadd on <4 x float>.
; The IR computes fma(A, B, -C) (negation written as an fsub from -0.0) and
; fma(A, B, C) and interleaves them: elements 0 and 2 are A*B + C, elements 1
; and 3 are A*B - C. The first four elements of %__U bitcast to <8 x i1>
; choose per element between that result and the passthrough %__A.
; Both RUN configurations expect one vfmsubadd132ps predicated on %k1.
1065 define <4 x float> @test_mm_mask_fmsubadd_ps(<4 x float> %__A, i8 zeroext %__U, <4 x float> %__B, <4 x float> %__C) {
1066 ; X86-LABEL: test_mm_mask_fmsubadd_ps:
1067 ; X86:       # %bb.0: # %entry
1068 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
1069 ; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
1070 ; X86-NEXT:    vfmsubadd132ps %xmm1, %xmm2, %xmm0 {%k1} # encoding: [0x62,0xf2,0x6d,0x09,0x97,0xc1]
1071 ; X86-NEXT:    # xmm0 {%k1} = (xmm0 * xmm1) -/+ xmm2
1072 ; X86-NEXT:    retl # encoding: [0xc3]
1073 ;
1074 ; X64-LABEL: test_mm_mask_fmsubadd_ps:
1075 ; X64:       # %bb.0: # %entry
1076 ; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
1077 ; X64-NEXT:    vfmsubadd132ps %xmm1, %xmm2, %xmm0 {%k1} # encoding: [0x62,0xf2,0x6d,0x09,0x97,0xc1]
1078 ; X64-NEXT:    # xmm0 {%k1} = (xmm0 * xmm1) -/+ xmm2
1079 ; X64-NEXT:    retq # encoding: [0xc3]
1080 entry:
1081   %sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
1082   %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %sub.i) #9
1083   %1 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %__C) #9
; Indices 0 and 2 read %1 (A*B + C); indices 5 and 7 read elements 1 and 3 of %0 (A*B - C).
1084   %2 = shufflevector <4 x float> %1, <4 x float> %0, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
1085   %3 = bitcast i8 %__U to <8 x i1>
1086   %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
; Elements whose mask element is false keep the value from %__A.
1087   %4 = select <4 x i1> %extract.i, <4 x float> %2, <4 x float> %__A
1088   ret <4 x float> %4
1089 }
1090
; mask3 (merge into the addend) fmaddsub on <4 x float>.
; The IR interleaves fma(A, B, -C) and fma(A, B, C): elements 0 and 2 are
; A*B - C, elements 1 and 3 are A*B + C. The first four elements of %__U
; bitcast to <8 x i1> choose per element between that result and %__C.
; Both RUN configurations expect vfmaddsub231ps writing xmm2 under %k1,
; followed by a vmovaps that copies xmm2 into xmm0.
1091 define <4 x float> @test_mm_mask3_fmaddsub_ps(<4 x float> %__A, <4 x float> %__B, <4 x float> %__C, i8 zeroext %__U) {
1092 ; X86-LABEL: test_mm_mask3_fmaddsub_ps:
1093 ; X86:       # %bb.0: # %entry
1094 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
1095 ; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
1096 ; X86-NEXT:    vfmaddsub231ps %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0xb6,0xd1]
1097 ; X86-NEXT:    # xmm2 {%k1} = (xmm0 * xmm1) +/- xmm2
1098 ; X86-NEXT:    vmovaps %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc2]
1099 ; X86-NEXT:    retl # encoding: [0xc3]
1100 ;
1101 ; X64-LABEL: test_mm_mask3_fmaddsub_ps:
1102 ; X64:       # %bb.0: # %entry
1103 ; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
1104 ; X64-NEXT:    vfmaddsub231ps %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0xb6,0xd1]
1105 ; X64-NEXT:    # xmm2 {%k1} = (xmm0 * xmm1) +/- xmm2
1106 ; X64-NEXT:    vmovaps %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc2]
1107 ; X64-NEXT:    retq # encoding: [0xc3]
1108 entry:
1109   %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %__C) #9
1110   %1 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
1111   %2 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %1) #9
; Indices 0 and 2 read %2 (A*B - C); indices 5 and 7 read elements 1 and 3 of %0 (A*B + C).
1112   %3 = shufflevector <4 x float> %2, <4 x float> %0, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
1113   %4 = bitcast i8 %__U to <8 x i1>
1114   %extract.i = shufflevector <8 x i1> %4, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
; Elements whose mask element is false keep the value from %__C.
1115   %5 = select <4 x i1> %extract.i, <4 x float> %3, <4 x float> %__C
1116   ret <4 x float> %5
1117 }
1118
; Zero-masked fmaddsub on <4 x float>.
; The IR interleaves fma(A, B, -C) and fma(A, B, C): elements 0 and 2 are
; A*B - C, elements 1 and 3 are A*B + C. The first four elements of %__U
; bitcast to <8 x i1> choose per element between that result and zero.
; Both RUN configurations expect one vfmaddsub213ps with {%k1} {z}.
1119 define <4 x float> @test_mm_maskz_fmaddsub_ps(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B, <4 x float> %__C) {
1120 ; X86-LABEL: test_mm_maskz_fmaddsub_ps:
1121 ; X86:       # %bb.0: # %entry
1122 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
1123 ; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
1124 ; X86-NEXT:    vfmaddsub213ps %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0x89,0xa6,0xc2]
1125 ; X86-NEXT:    # xmm0 {%k1} {z} = (xmm1 * xmm0) +/- xmm2
1126 ; X86-NEXT:    retl # encoding: [0xc3]
1127 ;
1128 ; X64-LABEL: test_mm_maskz_fmaddsub_ps:
1129 ; X64:       # %bb.0: # %entry
1130 ; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
1131 ; X64-NEXT:    vfmaddsub213ps %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0x89,0xa6,0xc2]
1132 ; X64-NEXT:    # xmm0 {%k1} {z} = (xmm1 * xmm0) +/- xmm2
1133 ; X64-NEXT:    retq # encoding: [0xc3]
1134 entry:
1135   %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %__C) #9
1136   %1 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
1137   %2 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %1) #9
; Indices 0 and 2 read %2 (A*B - C); indices 5 and 7 read elements 1 and 3 of %0 (A*B + C).
1138   %3 = shufflevector <4 x float> %2, <4 x float> %0, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
1139   %4 = bitcast i8 %__U to <8 x i1>
1140   %extract.i = shufflevector <8 x i1> %4, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
; Elements whose mask element is false become zero.
1141   %5 = select <4 x i1> %extract.i, <4 x float> %3, <4 x float> zeroinitializer
1142   ret <4 x float> %5
1143 }
1144
; Zero-masked fmsubadd on <4 x float>.
; The IR interleaves fma(A, B, C) and fma(A, B, -C): elements 0 and 2 are
; A*B + C, elements 1 and 3 are A*B - C. The first four elements of %__U
; bitcast to <8 x i1> choose per element between that result and zero.
; Both RUN configurations expect one vfmsubadd213ps with {%k1} {z}.
1145 define <4 x float> @test_mm_maskz_fmsubadd_ps(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B, <4 x float> %__C) {
1146 ; X86-LABEL: test_mm_maskz_fmsubadd_ps:
1147 ; X86:       # %bb.0: # %entry
1148 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
1149 ; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
1150 ; X86-NEXT:    vfmsubadd213ps %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0x89,0xa7,0xc2]
1151 ; X86-NEXT:    # xmm0 {%k1} {z} = (xmm1 * xmm0) -/+ xmm2
1152 ; X86-NEXT:    retl # encoding: [0xc3]
1153 ;
1154 ; X64-LABEL: test_mm_maskz_fmsubadd_ps:
1155 ; X64:       # %bb.0: # %entry
1156 ; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
1157 ; X64-NEXT:    vfmsubadd213ps %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0x89,0xa7,0xc2]
1158 ; X64-NEXT:    # xmm0 {%k1} {z} = (xmm1 * xmm0) -/+ xmm2
1159 ; X64-NEXT:    retq # encoding: [0xc3]
1160 entry:
1161   %sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
1162   %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %sub.i) #9
1163   %1 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %__C) #9
; Indices 0 and 2 read %1 (A*B + C); indices 5 and 7 read elements 1 and 3 of %0 (A*B - C).
1164   %2 = shufflevector <4 x float> %1, <4 x float> %0, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
1165   %3 = bitcast i8 %__U to <8 x i1>
1166   %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
; Elements whose mask element is false become zero.
1167   %4 = select <4 x i1> %extract.i, <4 x float> %2, <4 x float> zeroinitializer
1168   ret <4 x float> %4
1169 }
1170
; Merge-masked fmaddsub on <8 x float>.
; The IR interleaves fma(A, B, -C) and fma(A, B, C): even elements are
; A*B - C, odd elements are A*B + C. All eight elements of %__U bitcast to
; <8 x i1> are used directly (no extract shuffle) to choose per element
; between that result and the passthrough %__A.
; Both RUN configurations expect one ymm vfmaddsub132ps predicated on %k1.
1171 define <8 x float> @test_mm256_mask_fmaddsub_ps(<8 x float> %__A, i8 zeroext %__U, <8 x float> %__B, <8 x float> %__C) {
1172 ; X86-LABEL: test_mm256_mask_fmaddsub_ps:
1173 ; X86:       # %bb.0: # %entry
1174 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
1175 ; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
1176 ; X86-NEXT:    vfmaddsub132ps %ymm1, %ymm2, %ymm0 {%k1} # encoding: [0x62,0xf2,0x6d,0x29,0x96,0xc1]
1177 ; X86-NEXT:    # ymm0 {%k1} = (ymm0 * ymm1) +/- ymm2
1178 ; X86-NEXT:    retl # encoding: [0xc3]
1179 ;
1180 ; X64-LABEL: test_mm256_mask_fmaddsub_ps:
1181 ; X64:       # %bb.0: # %entry
1182 ; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
1183 ; X64-NEXT:    vfmaddsub132ps %ymm1, %ymm2, %ymm0 {%k1} # encoding: [0x62,0xf2,0x6d,0x29,0x96,0xc1]
1184 ; X64-NEXT:    # ymm0 {%k1} = (ymm0 * ymm1) +/- ymm2
1185 ; X64-NEXT:    retq # encoding: [0xc3]
1186 entry:
1187   %0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %__C) #9
1188   %1 = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
1189   %2 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %1) #9
; Even indices read %2 (A*B - C); indices 9, 11, 13, 15 read the odd elements of %0 (A*B + C).
1190   %3 = shufflevector <8 x float> %2, <8 x float> %0, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
1191   %4 = bitcast i8 %__U to <8 x i1>
; Elements whose mask element is false keep the value from %__A.
1192   %5 = select <8 x i1> %4, <8 x float> %3, <8 x float> %__A
1193   ret <8 x float> %5
1194 }
1195
; Merge-masked fmsubadd on <8 x float>.
; The IR interleaves fma(A, B, C) and fma(A, B, -C): even elements are
; A*B + C, odd elements are A*B - C. All eight elements of %__U bitcast to
; <8 x i1> are used directly to choose per element between that result and
; the passthrough %__A.
; Both RUN configurations expect one ymm vfmsubadd132ps predicated on %k1.
1196 define <8 x float> @test_mm256_mask_fmsubadd_ps(<8 x float> %__A, i8 zeroext %__U, <8 x float> %__B, <8 x float> %__C) {
1197 ; X86-LABEL: test_mm256_mask_fmsubadd_ps:
1198 ; X86:       # %bb.0: # %entry
1199 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
1200 ; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
1201 ; X86-NEXT:    vfmsubadd132ps %ymm1, %ymm2, %ymm0 {%k1} # encoding: [0x62,0xf2,0x6d,0x29,0x97,0xc1]
1202 ; X86-NEXT:    # ymm0 {%k1} = (ymm0 * ymm1) -/+ ymm2
1203 ; X86-NEXT:    retl # encoding: [0xc3]
1204 ;
1205 ; X64-LABEL: test_mm256_mask_fmsubadd_ps:
1206 ; X64:       # %bb.0: # %entry
1207 ; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
1208 ; X64-NEXT:    vfmsubadd132ps %ymm1, %ymm2, %ymm0 {%k1} # encoding: [0x62,0xf2,0x6d,0x29,0x97,0xc1]
1209 ; X64-NEXT:    # ymm0 {%k1} = (ymm0 * ymm1) -/+ ymm2
1210 ; X64-NEXT:    retq # encoding: [0xc3]
1211 entry:
1212   %sub.i = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
1213   %0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %sub.i) #9
1214   %1 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %__C) #9
; Even indices read %1 (A*B + C); indices 9, 11, 13, 15 read the odd elements of %0 (A*B - C).
1215   %2 = shufflevector <8 x float> %1, <8 x float> %0, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
1216   %3 = bitcast i8 %__U to <8 x i1>
; Elements whose mask element is false keep the value from %__A.
1217   %4 = select <8 x i1> %3, <8 x float> %2, <8 x float> %__A
1218   ret <8 x float> %4
1219 }
1220
; mask3 (merge into the addend) fmaddsub on <8 x float>.
; The IR interleaves fma(A, B, -C) and fma(A, B, C): even elements are
; A*B - C, odd elements are A*B + C. All eight elements of %__U bitcast to
; <8 x i1> choose per element between that result and %__C.
; Both RUN configurations expect vfmaddsub231ps writing ymm2 under %k1,
; followed by a vmovaps that copies ymm2 into ymm0.
1221 define <8 x float> @test_mm256_mask3_fmaddsub_ps(<8 x float> %__A, <8 x float> %__B, <8 x float> %__C, i8 zeroext %__U) {
1222 ; X86-LABEL: test_mm256_mask3_fmaddsub_ps:
1223 ; X86:       # %bb.0: # %entry
1224 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
1225 ; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
1226 ; X86-NEXT:    vfmaddsub231ps %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0xb6,0xd1]
1227 ; X86-NEXT:    # ymm2 {%k1} = (ymm0 * ymm1) +/- ymm2
1228 ; X86-NEXT:    vmovaps %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc2]
1229 ; X86-NEXT:    retl # encoding: [0xc3]
1230 ;
1231 ; X64-LABEL: test_mm256_mask3_fmaddsub_ps:
1232 ; X64:       # %bb.0: # %entry
1233 ; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
1234 ; X64-NEXT:    vfmaddsub231ps %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0xb6,0xd1]
1235 ; X64-NEXT:    # ymm2 {%k1} = (ymm0 * ymm1) +/- ymm2
1236 ; X64-NEXT:    vmovaps %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc2]
1237 ; X64-NEXT:    retq # encoding: [0xc3]
1238 entry:
1239   %0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %__C) #9
1240   %1 = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
1241   %2 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %1) #9
; Even indices read %2 (A*B - C); indices 9, 11, 13, 15 read the odd elements of %0 (A*B + C).
1242   %3 = shufflevector <8 x float> %2, <8 x float> %0, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
1243   %4 = bitcast i8 %__U to <8 x i1>
; Elements whose mask element is false keep the value from %__C.
1244   %5 = select <8 x i1> %4, <8 x float> %3, <8 x float> %__C
1245   ret <8 x float> %5
1246 }
1247
; Zero-masked fmaddsub on <8 x float>.
; The IR interleaves fma(A, B, -C) and fma(A, B, C): even elements are
; A*B - C, odd elements are A*B + C. All eight elements of %__U bitcast to
; <8 x i1> choose per element between that result and zero.
; Both RUN configurations expect one ymm vfmaddsub213ps with {%k1} {z}.
1248 define <8 x float> @test_mm256_maskz_fmaddsub_ps(i8 zeroext %__U, <8 x float> %__A, <8 x float> %__B, <8 x float> %__C) {
1249 ; X86-LABEL: test_mm256_maskz_fmaddsub_ps:
1250 ; X86:       # %bb.0: # %entry
1251 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
1252 ; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
1253 ; X86-NEXT:    vfmaddsub213ps %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xa9,0xa6,0xc2]
1254 ; X86-NEXT:    # ymm0 {%k1} {z} = (ymm1 * ymm0) +/- ymm2
1255 ; X86-NEXT:    retl # encoding: [0xc3]
1256 ;
1257 ; X64-LABEL: test_mm256_maskz_fmaddsub_ps:
1258 ; X64:       # %bb.0: # %entry
1259 ; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
1260 ; X64-NEXT:    vfmaddsub213ps %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xa9,0xa6,0xc2]
1261 ; X64-NEXT:    # ymm0 {%k1} {z} = (ymm1 * ymm0) +/- ymm2
1262 ; X64-NEXT:    retq # encoding: [0xc3]
1263 entry:
1264   %0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %__C) #9
1265   %1 = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
1266   %2 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %1) #9
; Even indices read %2 (A*B - C); indices 9, 11, 13, 15 read the odd elements of %0 (A*B + C).
1267   %3 = shufflevector <8 x float> %2, <8 x float> %0, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
1268   %4 = bitcast i8 %__U to <8 x i1>
; Elements whose mask element is false become zero.
1269   %5 = select <8 x i1> %4, <8 x float> %3, <8 x float> zeroinitializer
1270   ret <8 x float> %5
1271 }
1272
; Zero-masked fmsubadd on <8 x float>.
; The IR interleaves fma(A, B, C) and fma(A, B, -C): even elements are
; A*B + C, odd elements are A*B - C. All eight elements of %__U bitcast to
; <8 x i1> choose per element between that result and zero.
; Both RUN configurations expect one ymm vfmsubadd213ps with {%k1} {z}.
1273 define <8 x float> @test_mm256_maskz_fmsubadd_ps(i8 zeroext %__U, <8 x float> %__A, <8 x float> %__B, <8 x float> %__C) {
1274 ; X86-LABEL: test_mm256_maskz_fmsubadd_ps:
1275 ; X86:       # %bb.0: # %entry
1276 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
1277 ; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
1278 ; X86-NEXT:    vfmsubadd213ps %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xa9,0xa7,0xc2]
1279 ; X86-NEXT:    # ymm0 {%k1} {z} = (ymm1 * ymm0) -/+ ymm2
1280 ; X86-NEXT:    retl # encoding: [0xc3]
1281 ;
1282 ; X64-LABEL: test_mm256_maskz_fmsubadd_ps:
1283 ; X64:       # %bb.0: # %entry
1284 ; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
1285 ; X64-NEXT:    vfmsubadd213ps %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xa9,0xa7,0xc2]
1286 ; X64-NEXT:    # ymm0 {%k1} {z} = (ymm1 * ymm0) -/+ ymm2
1287 ; X64-NEXT:    retq # encoding: [0xc3]
1288 entry:
1289   %sub.i = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
1290   %0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %sub.i) #9
1291   %1 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %__C) #9
; Even indices read %1 (A*B + C); indices 9, 11, 13, 15 read the odd elements of %0 (A*B - C).
1292   %2 = shufflevector <8 x float> %1, <8 x float> %0, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
1293   %3 = bitcast i8 %__U to <8 x i1>
; Elements whose mask element is false become zero.
1294   %4 = select <8 x i1> %3, <8 x float> %2, <8 x float> zeroinitializer
1295   ret <8 x float> %4
1296 }
1297
; mask3 (merge into the addend) fmsub on <2 x double>.
; The IR computes fma(A, B, -C), with the negation written as an fsub from
; -0.0. The first two elements of %__U bitcast to <8 x i1> choose per element
; between that result and %__C.
; Both RUN configurations expect vfmsub231pd writing xmm2 under %k1, followed
; by a vmovapd that copies xmm2 into xmm0.
1298 define <2 x double> @test_mm_mask3_fmsub_pd(<2 x double> %__A, <2 x double> %__B, <2 x double> %__C, i8 zeroext %__U) {
1299 ; X86-LABEL: test_mm_mask3_fmsub_pd:
1300 ; X86:       # %bb.0: # %entry
1301 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
1302 ; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
1303 ; X86-NEXT:    vfmsub231pd %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x09,0xba,0xd1]
1304 ; X86-NEXT:    # xmm2 {%k1} = (xmm0 * xmm1) - xmm2
1305 ; X86-NEXT:    vmovapd %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xc2]
1306 ; X86-NEXT:    retl # encoding: [0xc3]
1307 ;
1308 ; X64-LABEL: test_mm_mask3_fmsub_pd:
1309 ; X64:       # %bb.0: # %entry
1310 ; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
1311 ; X64-NEXT:    vfmsub231pd %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x09,0xba,0xd1]
1312 ; X64-NEXT:    # xmm2 {%k1} = (xmm0 * xmm1) - xmm2
1313 ; X64-NEXT:    vmovapd %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xc2]
1314 ; X64-NEXT:    retq # encoding: [0xc3]
1315 entry:
; %sub.i is -C, so the fma below yields A*B - C.
1316   %sub.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__C
1317   %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %sub.i) #9
1318   %1 = bitcast i8 %__U to <8 x i1>
1319   %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
; Elements whose mask element is false keep the value from %__C.
1320   %2 = select <2 x i1> %extract.i, <2 x double> %0, <2 x double> %__C
1321   ret <2 x double> %2
1322 }
1323
; mask3 (merge into the addend) fmsub on <4 x double>.
; The IR computes fma(A, B, -C), with the negation written as an fsub from
; -0.0. The first four elements of %__U bitcast to <8 x i1> choose per
; element between that result and %__C.
; Both RUN configurations expect vfmsub231pd writing ymm2 under %k1, followed
; by a vmovapd that copies ymm2 into ymm0.
1324 define <4 x double> @test_mm256_mask3_fmsub_pd(<4 x double> %__A, <4 x double> %__B, <4 x double> %__C, i8 zeroext %__U) {
1325 ; X86-LABEL: test_mm256_mask3_fmsub_pd:
1326 ; X86:       # %bb.0: # %entry
1327 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
1328 ; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
1329 ; X86-NEXT:    vfmsub231pd %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x29,0xba,0xd1]
1330 ; X86-NEXT:    # ymm2 {%k1} = (ymm0 * ymm1) - ymm2
1331 ; X86-NEXT:    vmovapd %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xc2]
1332 ; X86-NEXT:    retl # encoding: [0xc3]
1333 ;
1334 ; X64-LABEL: test_mm256_mask3_fmsub_pd:
1335 ; X64:       # %bb.0: # %entry
1336 ; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
1337 ; X64-NEXT:    vfmsub231pd %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x29,0xba,0xd1]
1338 ; X64-NEXT:    # ymm2 {%k1} = (ymm0 * ymm1) - ymm2
1339 ; X64-NEXT:    vmovapd %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xc2]
1340 ; X64-NEXT:    retq # encoding: [0xc3]
1341 entry:
; %sub.i is -C, so the fma below yields A*B - C.
1342   %sub.i = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
1343   %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %sub.i) #9
1344   %1 = bitcast i8 %__U to <8 x i1>
1345   %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
; Elements whose mask element is false keep the value from %__C.
1346   %2 = select <4 x i1> %extract.i, <4 x double> %0, <4 x double> %__C
1347   ret <4 x double> %2
1348 }
1349
; mask3 (merge into the addend) fmsub on <4 x float>.
; The IR computes fma(A, B, -C), with the negation written as an fsub from
; -0.0. The first four elements of %__U bitcast to <8 x i1> choose per
; element between that result and %__C.
; Both RUN configurations expect vfmsub231ps writing xmm2 under %k1, followed
; by a vmovaps that copies xmm2 into xmm0.
1350 define <4 x float> @test_mm_mask3_fmsub_ps(<4 x float> %__A, <4 x float> %__B, <4 x float> %__C, i8 zeroext %__U) {
1351 ; X86-LABEL: test_mm_mask3_fmsub_ps:
1352 ; X86:       # %bb.0: # %entry
1353 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
1354 ; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
1355 ; X86-NEXT:    vfmsub231ps %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0xba,0xd1]
1356 ; X86-NEXT:    # xmm2 {%k1} = (xmm0 * xmm1) - xmm2
1357 ; X86-NEXT:    vmovaps %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc2]
1358 ; X86-NEXT:    retl # encoding: [0xc3]
1359 ;
1360 ; X64-LABEL: test_mm_mask3_fmsub_ps:
1361 ; X64:       # %bb.0: # %entry
1362 ; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
1363 ; X64-NEXT:    vfmsub231ps %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0xba,0xd1]
1364 ; X64-NEXT:    # xmm2 {%k1} = (xmm0 * xmm1) - xmm2
1365 ; X64-NEXT:    vmovaps %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc2]
1366 ; X64-NEXT:    retq # encoding: [0xc3]
1367 entry:
; %sub.i is -C, so the fma below yields A*B - C.
1368   %sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
1369   %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %sub.i) #9
1370   %1 = bitcast i8 %__U to <8 x i1>
1371   %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
; Elements whose mask element is false keep the value from %__C.
1372   %2 = select <4 x i1> %extract.i, <4 x float> %0, <4 x float> %__C
1373   ret <4 x float> %2
1374 }
1375
; mask3 (merge into the addend) fmsub on <8 x float>.
; The IR computes fma(A, B, -C), with the negation written as an fsub from
; -0.0. All eight elements of %__U bitcast to <8 x i1> choose per element
; between that result and %__C.
; Both RUN configurations expect vfmsub231ps writing ymm2 under %k1, followed
; by a vmovaps that copies ymm2 into ymm0.
1376 define <8 x float> @test_mm256_mask3_fmsub_ps(<8 x float> %__A, <8 x float> %__B, <8 x float> %__C, i8 zeroext %__U) {
1377 ; X86-LABEL: test_mm256_mask3_fmsub_ps:
1378 ; X86:       # %bb.0: # %entry
1379 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
1380 ; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
1381 ; X86-NEXT:    vfmsub231ps %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0xba,0xd1]
1382 ; X86-NEXT:    # ymm2 {%k1} = (ymm0 * ymm1) - ymm2
1383 ; X86-NEXT:    vmovaps %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc2]
1384 ; X86-NEXT:    retl # encoding: [0xc3]
1385 ;
1386 ; X64-LABEL: test_mm256_mask3_fmsub_ps:
1387 ; X64:       # %bb.0: # %entry
1388 ; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
1389 ; X64-NEXT:    vfmsub231ps %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0xba,0xd1]
1390 ; X64-NEXT:    # ymm2 {%k1} = (ymm0 * ymm1) - ymm2
1391 ; X64-NEXT:    vmovaps %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc2]
1392 ; X64-NEXT:    retq # encoding: [0xc3]
1393 entry:
; %sub.i is -C, so the fma below yields A*B - C.
1394   %sub.i = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
1395   %0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %sub.i) #9
1396   %1 = bitcast i8 %__U to <8 x i1>
; Elements whose mask element is false keep the value from %__C.
1397   %2 = select <8 x i1> %1, <8 x float> %0, <8 x float> %__C
1398   ret <8 x float> %2
1399 }
1400
; mask3 (merge into the addend) fmsubadd on <2 x double>.
; The IR combines fma(A, B, C) and fma(A, B, -C): element 0 is A*B + C and
; element 1 is A*B - C. The first two elements of %__U bitcast to <8 x i1>
; choose per element between that result and %__C.
; Both RUN configurations expect vfmsubadd231pd writing xmm2 under %k1,
; followed by a vmovapd that copies xmm2 into xmm0.
1401 define <2 x double> @test_mm_mask3_fmsubadd_pd(<2 x double> %__A, <2 x double> %__B, <2 x double> %__C, i8 zeroext %__U) {
1402 ; X86-LABEL: test_mm_mask3_fmsubadd_pd:
1403 ; X86:       # %bb.0: # %entry
1404 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
1405 ; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
1406 ; X86-NEXT:    vfmsubadd231pd %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x09,0xb7,0xd1]
1407 ; X86-NEXT:    # xmm2 {%k1} = (xmm0 * xmm1) -/+ xmm2
1408 ; X86-NEXT:    vmovapd %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xc2]
1409 ; X86-NEXT:    retl # encoding: [0xc3]
1410 ;
1411 ; X64-LABEL: test_mm_mask3_fmsubadd_pd:
1412 ; X64:       # %bb.0: # %entry
1413 ; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
1414 ; X64-NEXT:    vfmsubadd231pd %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x09,0xb7,0xd1]
1415 ; X64-NEXT:    # xmm2 {%k1} = (xmm0 * xmm1) -/+ xmm2
1416 ; X64-NEXT:    vmovapd %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xc2]
1417 ; X64-NEXT:    retq # encoding: [0xc3]
1418 entry:
1419   %sub.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__C
1420   %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %sub.i) #9
1421   %1 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %__C) #9
; Index 0 reads %1 (A*B + C); index 3 reads element 1 of %0 (A*B - C).
1422   %2 = shufflevector <2 x double> %1, <2 x double> %0, <2 x i32> <i32 0, i32 3>
1423   %3 = bitcast i8 %__U to <8 x i1>
1424   %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
; Elements whose mask element is false keep the value from %__C.
1425   %4 = select <2 x i1> %extract.i, <2 x double> %2, <2 x double> %__C
1426   ret <2 x double> %4
1427 }
1428
; mask3 (merge into the addend) fmsubadd on <4 x double>.
; The IR interleaves fma(A, B, C) and fma(A, B, -C): elements 0 and 2 are
; A*B + C, elements 1 and 3 are A*B - C. The first four elements of %__U
; bitcast to <8 x i1> choose per element between that result and %__C.
; Both RUN configurations expect vfmsubadd231pd writing ymm2 under %k1,
; followed by a vmovapd that copies ymm2 into ymm0.
1429 define <4 x double> @test_mm256_mask3_fmsubadd_pd(<4 x double> %__A, <4 x double> %__B, <4 x double> %__C, i8 zeroext %__U) {
1430 ; X86-LABEL: test_mm256_mask3_fmsubadd_pd:
1431 ; X86:       # %bb.0: # %entry
1432 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
1433 ; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
1434 ; X86-NEXT:    vfmsubadd231pd %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x29,0xb7,0xd1]
1435 ; X86-NEXT:    # ymm2 {%k1} = (ymm0 * ymm1) -/+ ymm2
1436 ; X86-NEXT:    vmovapd %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xc2]
1437 ; X86-NEXT:    retl # encoding: [0xc3]
1438 ;
1439 ; X64-LABEL: test_mm256_mask3_fmsubadd_pd:
1440 ; X64:       # %bb.0: # %entry
1441 ; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
1442 ; X64-NEXT:    vfmsubadd231pd %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x29,0xb7,0xd1]
1443 ; X64-NEXT:    # ymm2 {%k1} = (ymm0 * ymm1) -/+ ymm2
1444 ; X64-NEXT:    vmovapd %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xc2]
1445 ; X64-NEXT:    retq # encoding: [0xc3]
1446 entry:
1447   %sub.i = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
1448   %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %sub.i) #9
1449   %1 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %__C) #9
; Indices 0 and 2 read %1 (A*B + C); indices 5 and 7 read elements 1 and 3 of %0 (A*B - C).
1450   %2 = shufflevector <4 x double> %1, <4 x double> %0, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
1451   %3 = bitcast i8 %__U to <8 x i1>
1452   %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
; Elements whose mask element is false keep the value from %__C.
1453   %4 = select <4 x i1> %extract.i, <4 x double> %2, <4 x double> %__C
1454   ret <4 x double> %4
1455 }
1456
; mask3 (merge into the addend) fmsubadd on <4 x float>.
; The IR interleaves fma(A, B, C) and fma(A, B, -C): elements 0 and 2 are
; A*B + C, elements 1 and 3 are A*B - C. The first four elements of %__U
; bitcast to <8 x i1> choose per element between that result and %__C.
; Both RUN configurations expect vfmsubadd231ps writing xmm2 under %k1,
; followed by a vmovaps that copies xmm2 into xmm0.
1457 define <4 x float> @test_mm_mask3_fmsubadd_ps(<4 x float> %__A, <4 x float> %__B, <4 x float> %__C, i8 zeroext %__U) {
1458 ; X86-LABEL: test_mm_mask3_fmsubadd_ps:
1459 ; X86:       # %bb.0: # %entry
1460 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
1461 ; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
1462 ; X86-NEXT:    vfmsubadd231ps %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0xb7,0xd1]
1463 ; X86-NEXT:    # xmm2 {%k1} = (xmm0 * xmm1) -/+ xmm2
1464 ; X86-NEXT:    vmovaps %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc2]
1465 ; X86-NEXT:    retl # encoding: [0xc3]
1466 ;
1467 ; X64-LABEL: test_mm_mask3_fmsubadd_ps:
1468 ; X64:       # %bb.0: # %entry
1469 ; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
1470 ; X64-NEXT:    vfmsubadd231ps %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0xb7,0xd1]
1471 ; X64-NEXT:    # xmm2 {%k1} = (xmm0 * xmm1) -/+ xmm2
1472 ; X64-NEXT:    vmovaps %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc2]
1473 ; X64-NEXT:    retq # encoding: [0xc3]
1474 entry:
1475   %sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
1476   %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %sub.i) #9
1477   %1 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %__C) #9
; Indices 0 and 2 read %1 (A*B + C); indices 5 and 7 read elements 1 and 3 of %0 (A*B - C).
1478   %2 = shufflevector <4 x float> %1, <4 x float> %0, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
1479   %3 = bitcast i8 %__U to <8 x i1>
1480   %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
; Elements whose mask element is false keep the value from %__C.
1481   %4 = select <4 x i1> %extract.i, <4 x float> %2, <4 x float> %__C
1482   ret <4 x float> %4
1483 }
1484
; mask3 (merge into the addend) fmsubadd on <8 x float>.
; The IR interleaves fma(A, B, C) and fma(A, B, -C): even elements are
; A*B + C, odd elements are A*B - C. All eight elements of %__U bitcast to
; <8 x i1> choose per element between that result and %__C.
; Both RUN configurations expect vfmsubadd231ps writing ymm2 under %k1,
; followed by a vmovaps that copies ymm2 into ymm0.
1485 define <8 x float> @test_mm256_mask3_fmsubadd_ps(<8 x float> %__A, <8 x float> %__B, <8 x float> %__C, i8 zeroext %__U) {
1486 ; X86-LABEL: test_mm256_mask3_fmsubadd_ps:
1487 ; X86:       # %bb.0: # %entry
1488 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
1489 ; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
1490 ; X86-NEXT:    vfmsubadd231ps %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0xb7,0xd1]
1491 ; X86-NEXT:    # ymm2 {%k1} = (ymm0 * ymm1) -/+ ymm2
1492 ; X86-NEXT:    vmovaps %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc2]
1493 ; X86-NEXT:    retl # encoding: [0xc3]
1494 ;
1495 ; X64-LABEL: test_mm256_mask3_fmsubadd_ps:
1496 ; X64:       # %bb.0: # %entry
1497 ; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
1498 ; X64-NEXT:    vfmsubadd231ps %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0xb7,0xd1]
1499 ; X64-NEXT:    # ymm2 {%k1} = (ymm0 * ymm1) -/+ ymm2
1500 ; X64-NEXT:    vmovaps %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc2]
1501 ; X64-NEXT:    retq # encoding: [0xc3]
1502 entry:
1503   %sub.i = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
1504   %0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %sub.i) #9
1505   %1 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %__C) #9
; Even indices read %1 (A*B + C); indices 9, 11, 13, 15 read the odd elements of %0 (A*B - C).
1506   %2 = shufflevector <8 x float> %1, <8 x float> %0, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
1507   %3 = bitcast i8 %__U to <8 x i1>
; Elements whose mask element is false keep the value from %__C.
1508   %4 = select <8 x i1> %3, <8 x float> %2, <8 x float> %__C
1509   ret <8 x float> %4
1510 }
1511
; Merge-masked fnmadd on <2 x double>.
; The IR negates %__B (written as an fsub from -0.0) and computes
; fma(A, -B, C). The first two elements of %__U bitcast to <8 x i1> choose
; per element between that result and the passthrough %__A.
; Both RUN configurations expect one vfnmadd132pd predicated on %k1.
1512 define <2 x double> @test_mm_mask_fnmadd_pd(<2 x double> %__A, i8 zeroext %__U, <2 x double> %__B, <2 x double> %__C) {
1513 ; X86-LABEL: test_mm_mask_fnmadd_pd:
1514 ; X86:       # %bb.0: # %entry
1515 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
1516 ; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
1517 ; X86-NEXT:    vfnmadd132pd %xmm1, %xmm2, %xmm0 {%k1} # encoding: [0x62,0xf2,0xed,0x09,0x9c,0xc1]
1518 ; X86-NEXT:    # xmm0 {%k1} = -(xmm0 * xmm1) + xmm2
1519 ; X86-NEXT:    retl # encoding: [0xc3]
1520 ;
1521 ; X64-LABEL: test_mm_mask_fnmadd_pd:
1522 ; X64:       # %bb.0: # %entry
1523 ; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
1524 ; X64-NEXT:    vfnmadd132pd %xmm1, %xmm2, %xmm0 {%k1} # encoding: [0x62,0xf2,0xed,0x09,0x9c,0xc1]
1525 ; X64-NEXT:    # xmm0 {%k1} = -(xmm0 * xmm1) + xmm2
1526 ; X64-NEXT:    retq # encoding: [0xc3]
1527 entry:
; %sub.i is -B; it is passed as the second multiplicand of the fma.
1528   %sub.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__B
1529   %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %sub.i, <2 x double> %__C) #9
1530   %1 = bitcast i8 %__U to <8 x i1>
1531   %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
; Elements whose mask element is false keep the value from %__A.
1532   %2 = select <2 x i1> %extract.i, <2 x double> %0, <2 x double> %__A
1533   ret <2 x double> %2
1534 }
1535
; Merge-masked fnmadd on <4 x double>.
; The IR negates %__B (written as an fsub from -0.0) and computes
; fma(A, -B, C). The first four elements of %__U bitcast to <8 x i1> choose
; per element between that result and the passthrough %__A.
; Both RUN configurations expect one ymm vfnmadd132pd predicated on %k1.
1536 define <4 x double> @test_mm256_mask_fnmadd_pd(<4 x double> %__A, i8 zeroext %__U, <4 x double> %__B, <4 x double> %__C) {
1537 ; X86-LABEL: test_mm256_mask_fnmadd_pd:
1538 ; X86:       # %bb.0: # %entry
1539 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
1540 ; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
1541 ; X86-NEXT:    vfnmadd132pd %ymm1, %ymm2, %ymm0 {%k1} # encoding: [0x62,0xf2,0xed,0x29,0x9c,0xc1]
1542 ; X86-NEXT:    # ymm0 {%k1} = -(ymm0 * ymm1) + ymm2
1543 ; X86-NEXT:    retl # encoding: [0xc3]
1544 ;
1545 ; X64-LABEL: test_mm256_mask_fnmadd_pd:
1546 ; X64:       # %bb.0: # %entry
1547 ; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
1548 ; X64-NEXT:    vfnmadd132pd %ymm1, %ymm2, %ymm0 {%k1} # encoding: [0x62,0xf2,0xed,0x29,0x9c,0xc1]
1549 ; X64-NEXT:    # ymm0 {%k1} = -(ymm0 * ymm1) + ymm2
1550 ; X64-NEXT:    retq # encoding: [0xc3]
1551 entry:
; %sub.i is -B; it is passed as the second multiplicand of the fma.
1552   %sub.i = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__B
1553   %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %sub.i, <4 x double> %__C) #9
1554   %1 = bitcast i8 %__U to <8 x i1>
1555   %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
; Elements whose mask element is false keep the value from %__A.
1556   %2 = select <4 x i1> %extract.i, <4 x double> %0, <4 x double> %__A
1557   ret <4 x double> %2
1558 }
1559
; Merge-masked negated multiply-add on <4 x float>: lanes whose mask bit is set
; receive fma(%__A, -%__B, %__C); the remaining lanes keep %__A.
; The assertions expect the mask to reach %k1 and one write-masked vfnmadd132ps on xmm registers.
1560 define <4 x float> @test_mm_mask_fnmadd_ps(<4 x float> %__A, i8 zeroext %__U, <4 x float> %__B, <4 x float> %__C) {
1561 ; X86-LABEL: test_mm_mask_fnmadd_ps:
1562 ; X86:       # %bb.0: # %entry
1563 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
1564 ; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
1565 ; X86-NEXT:    vfnmadd132ps %xmm1, %xmm2, %xmm0 {%k1} # encoding: [0x62,0xf2,0x6d,0x09,0x9c,0xc1]
1566 ; X86-NEXT:    # xmm0 {%k1} = -(xmm0 * xmm1) + xmm2
1567 ; X86-NEXT:    retl # encoding: [0xc3]
1568 ;
1569 ; X64-LABEL: test_mm_mask_fnmadd_ps:
1570 ; X64:       # %bb.0: # %entry
1571 ; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
1572 ; X64-NEXT:    vfnmadd132ps %xmm1, %xmm2, %xmm0 {%k1} # encoding: [0x62,0xf2,0x6d,0x09,0x9c,0xc1]
1573 ; X64-NEXT:    # xmm0 {%k1} = -(xmm0 * xmm1) + xmm2
1574 ; X64-NEXT:    retq # encoding: [0xc3]
1575 entry:
1576   %sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__B ; -%__B, written as a subtraction from -0.0
1577   %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %sub.i, <4 x float> %__C) #9 ; %__A * (-%__B) + %__C
1578   %1 = bitcast i8 %__U to <8 x i1> ; reinterpret the 8-bit mask as eight i1 elements
1579   %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> ; keep mask elements 0-3, one per result lane
1580   %2 = select <4 x i1> %extract.i, <4 x float> %0, <4 x float> %__A ; FMA result where the mask element is true, otherwise %__A
1581   ret <4 x float> %2
1582 }
1583
; Merge-masked negated multiply-add on <8 x float>: lanes whose mask bit is set
; receive fma(%__A, -%__B, %__C); the remaining lanes keep %__A.
; All eight mask elements are used, so no shufflevector extract is needed here.
; The assertions expect the mask to reach %k1 and one write-masked vfnmadd132ps on ymm registers.
1584 define <8 x float> @test_mm256_mask_fnmadd_ps(<8 x float> %__A, i8 zeroext %__U, <8 x float> %__B, <8 x float> %__C) {
1585 ; X86-LABEL: test_mm256_mask_fnmadd_ps:
1586 ; X86:       # %bb.0: # %entry
1587 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
1588 ; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
1589 ; X86-NEXT:    vfnmadd132ps %ymm1, %ymm2, %ymm0 {%k1} # encoding: [0x62,0xf2,0x6d,0x29,0x9c,0xc1]
1590 ; X86-NEXT:    # ymm0 {%k1} = -(ymm0 * ymm1) + ymm2
1591 ; X86-NEXT:    retl # encoding: [0xc3]
1592 ;
1593 ; X64-LABEL: test_mm256_mask_fnmadd_ps:
1594 ; X64:       # %bb.0: # %entry
1595 ; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
1596 ; X64-NEXT:    vfnmadd132ps %ymm1, %ymm2, %ymm0 {%k1} # encoding: [0x62,0xf2,0x6d,0x29,0x9c,0xc1]
1597 ; X64-NEXT:    # ymm0 {%k1} = -(ymm0 * ymm1) + ymm2
1598 ; X64-NEXT:    retq # encoding: [0xc3]
1599 entry:
1600   %sub.i = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__B ; -%__B, written as a subtraction from -0.0
1601   %0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %sub.i, <8 x float> %__C) #9 ; %__A * (-%__B) + %__C
1602   %1 = bitcast i8 %__U to <8 x i1> ; reinterpret the 8-bit mask as eight i1 elements, one per result lane
1603   %2 = select <8 x i1> %1, <8 x float> %0, <8 x float> %__A ; FMA result where the mask element is true, otherwise %__A
1604   ret <8 x float> %2
1605 }
1606
; Merge-masked negated multiply-subtract on <2 x double>: lanes whose mask bit is
; set receive fma(%__A, -%__B, -%__C); the remaining lanes keep %__A.
; The assertions expect the mask to reach %k1 and one write-masked vfnmsub132pd on xmm registers.
1607 define <2 x double> @test_mm_mask_fnmsub_pd(<2 x double> %__A, i8 zeroext %__U, <2 x double> %__B, <2 x double> %__C) {
1608 ; X86-LABEL: test_mm_mask_fnmsub_pd:
1609 ; X86:       # %bb.0: # %entry
1610 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
1611 ; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
1612 ; X86-NEXT:    vfnmsub132pd %xmm1, %xmm2, %xmm0 {%k1} # encoding: [0x62,0xf2,0xed,0x09,0x9e,0xc1]
1613 ; X86-NEXT:    # xmm0 {%k1} = -(xmm0 * xmm1) - xmm2
1614 ; X86-NEXT:    retl # encoding: [0xc3]
1615 ;
1616 ; X64-LABEL: test_mm_mask_fnmsub_pd:
1617 ; X64:       # %bb.0: # %entry
1618 ; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
1619 ; X64-NEXT:    vfnmsub132pd %xmm1, %xmm2, %xmm0 {%k1} # encoding: [0x62,0xf2,0xed,0x09,0x9e,0xc1]
1620 ; X64-NEXT:    # xmm0 {%k1} = -(xmm0 * xmm1) - xmm2
1621 ; X64-NEXT:    retq # encoding: [0xc3]
1622 entry:
1623   %sub.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__B ; -%__B, written as a subtraction from -0.0
1624   %sub1.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__C ; -%__C, same spelling
1625   %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %sub.i, <2 x double> %sub1.i) #9 ; %__A * (-%__B) + (-%__C)
1626   %1 = bitcast i8 %__U to <8 x i1> ; reinterpret the 8-bit mask as eight i1 elements
1627   %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1> ; keep mask elements 0-1, one per result lane
1628   %2 = select <2 x i1> %extract.i, <2 x double> %0, <2 x double> %__A ; FMA result where the mask element is true, otherwise %__A
1629   ret <2 x double> %2
1630 }
1631
; "mask3" negated multiply-subtract on <2 x double>: lanes whose mask bit is set
; receive fma(%__A, -%__B, -%__C); the remaining lanes keep %__C (the third operand),
; and the mask is the last parameter.
; The assertions expect a write-masked vfnmsub231pd whose destination is xmm2,
; followed by a vmovapd copying xmm2 into xmm0.
1632 define <2 x double> @test_mm_mask3_fnmsub_pd(<2 x double> %__A, <2 x double> %__B, <2 x double> %__C, i8 zeroext %__U) {
1633 ; X86-LABEL: test_mm_mask3_fnmsub_pd:
1634 ; X86:       # %bb.0: # %entry
1635 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
1636 ; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
1637 ; X86-NEXT:    vfnmsub231pd %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x09,0xbe,0xd1]
1638 ; X86-NEXT:    # xmm2 {%k1} = -(xmm0 * xmm1) - xmm2
1639 ; X86-NEXT:    vmovapd %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xc2]
1640 ; X86-NEXT:    retl # encoding: [0xc3]
1641 ;
1642 ; X64-LABEL: test_mm_mask3_fnmsub_pd:
1643 ; X64:       # %bb.0: # %entry
1644 ; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
1645 ; X64-NEXT:    vfnmsub231pd %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x09,0xbe,0xd1]
1646 ; X64-NEXT:    # xmm2 {%k1} = -(xmm0 * xmm1) - xmm2
1647 ; X64-NEXT:    vmovapd %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xc2]
1648 ; X64-NEXT:    retq # encoding: [0xc3]
1649 entry:
1650   %sub.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__B ; -%__B, written as a subtraction from -0.0
1651   %sub1.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__C ; -%__C, same spelling
1652   %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %sub.i, <2 x double> %sub1.i) #9 ; %__A * (-%__B) + (-%__C)
1653   %1 = bitcast i8 %__U to <8 x i1> ; reinterpret the 8-bit mask as eight i1 elements
1654   %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1> ; keep mask elements 0-1, one per result lane
1655   %2 = select <2 x i1> %extract.i, <2 x double> %0, <2 x double> %__C ; FMA result where the mask element is true, otherwise the un-negated %__C
1656   ret <2 x double> %2
1657 }
1658
; Merge-masked negated multiply-subtract on <4 x double>: lanes whose mask bit is
; set receive fma(%__A, -%__B, -%__C); the remaining lanes keep %__A.
; The assertions expect the mask to reach %k1 and one write-masked vfnmsub132pd on ymm registers.
1659 define <4 x double> @test_mm256_mask_fnmsub_pd(<4 x double> %__A, i8 zeroext %__U, <4 x double> %__B, <4 x double> %__C) {
1660 ; X86-LABEL: test_mm256_mask_fnmsub_pd:
1661 ; X86:       # %bb.0: # %entry
1662 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
1663 ; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
1664 ; X86-NEXT:    vfnmsub132pd %ymm1, %ymm2, %ymm0 {%k1} # encoding: [0x62,0xf2,0xed,0x29,0x9e,0xc1]
1665 ; X86-NEXT:    # ymm0 {%k1} = -(ymm0 * ymm1) - ymm2
1666 ; X86-NEXT:    retl # encoding: [0xc3]
1667 ;
1668 ; X64-LABEL: test_mm256_mask_fnmsub_pd:
1669 ; X64:       # %bb.0: # %entry
1670 ; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
1671 ; X64-NEXT:    vfnmsub132pd %ymm1, %ymm2, %ymm0 {%k1} # encoding: [0x62,0xf2,0xed,0x29,0x9e,0xc1]
1672 ; X64-NEXT:    # ymm0 {%k1} = -(ymm0 * ymm1) - ymm2
1673 ; X64-NEXT:    retq # encoding: [0xc3]
1674 entry:
1675   %sub.i = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__B ; -%__B, written as a subtraction from -0.0
1676   %sub1.i = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C ; -%__C, same spelling
1677   %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %sub.i, <4 x double> %sub1.i) #9 ; %__A * (-%__B) + (-%__C)
1678   %1 = bitcast i8 %__U to <8 x i1> ; reinterpret the 8-bit mask as eight i1 elements
1679   %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> ; keep mask elements 0-3, one per result lane
1680   %2 = select <4 x i1> %extract.i, <4 x double> %0, <4 x double> %__A ; FMA result where the mask element is true, otherwise %__A
1681   ret <4 x double> %2
1682 }
1683
; "mask3" negated multiply-subtract on <4 x double>: lanes whose mask bit is set
; receive fma(%__A, -%__B, -%__C); the remaining lanes keep %__C (the third operand),
; and the mask is the last parameter.
; The assertions expect a write-masked vfnmsub231pd whose destination is ymm2,
; followed by a vmovapd copying ymm2 into ymm0.
1684 define <4 x double> @test_mm256_mask3_fnmsub_pd(<4 x double> %__A, <4 x double> %__B, <4 x double> %__C, i8 zeroext %__U) {
1685 ; X86-LABEL: test_mm256_mask3_fnmsub_pd:
1686 ; X86:       # %bb.0: # %entry
1687 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
1688 ; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
1689 ; X86-NEXT:    vfnmsub231pd %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x29,0xbe,0xd1]
1690 ; X86-NEXT:    # ymm2 {%k1} = -(ymm0 * ymm1) - ymm2
1691 ; X86-NEXT:    vmovapd %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xc2]
1692 ; X86-NEXT:    retl # encoding: [0xc3]
1693 ;
1694 ; X64-LABEL: test_mm256_mask3_fnmsub_pd:
1695 ; X64:       # %bb.0: # %entry
1696 ; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
1697 ; X64-NEXT:    vfnmsub231pd %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x29,0xbe,0xd1]
1698 ; X64-NEXT:    # ymm2 {%k1} = -(ymm0 * ymm1) - ymm2
1699 ; X64-NEXT:    vmovapd %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xc2]
1700 ; X64-NEXT:    retq # encoding: [0xc3]
1701 entry:
1702   %sub.i = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__B ; -%__B, written as a subtraction from -0.0
1703   %sub1.i = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C ; -%__C, same spelling
1704   %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %sub.i, <4 x double> %sub1.i) #9 ; %__A * (-%__B) + (-%__C)
1705   %1 = bitcast i8 %__U to <8 x i1> ; reinterpret the 8-bit mask as eight i1 elements
1706   %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> ; keep mask elements 0-3, one per result lane
1707   %2 = select <4 x i1> %extract.i, <4 x double> %0, <4 x double> %__C ; FMA result where the mask element is true, otherwise the un-negated %__C
1708   ret <4 x double> %2
1709 }
1710
; Merge-masked negated multiply-subtract on <4 x float>: lanes whose mask bit is
; set receive fma(%__A, -%__B, -%__C); the remaining lanes keep %__A.
; The assertions expect the mask to reach %k1 and one write-masked vfnmsub132ps on xmm registers.
1711 define <4 x float> @test_mm_mask_fnmsub_ps(<4 x float> %__A, i8 zeroext %__U, <4 x float> %__B, <4 x float> %__C) {
1712 ; X86-LABEL: test_mm_mask_fnmsub_ps:
1713 ; X86:       # %bb.0: # %entry
1714 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
1715 ; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
1716 ; X86-NEXT:    vfnmsub132ps %xmm1, %xmm2, %xmm0 {%k1} # encoding: [0x62,0xf2,0x6d,0x09,0x9e,0xc1]
1717 ; X86-NEXT:    # xmm0 {%k1} = -(xmm0 * xmm1) - xmm2
1718 ; X86-NEXT:    retl # encoding: [0xc3]
1719 ;
1720 ; X64-LABEL: test_mm_mask_fnmsub_ps:
1721 ; X64:       # %bb.0: # %entry
1722 ; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
1723 ; X64-NEXT:    vfnmsub132ps %xmm1, %xmm2, %xmm0 {%k1} # encoding: [0x62,0xf2,0x6d,0x09,0x9e,0xc1]
1724 ; X64-NEXT:    # xmm0 {%k1} = -(xmm0 * xmm1) - xmm2
1725 ; X64-NEXT:    retq # encoding: [0xc3]
1726 entry:
1727   %sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__B ; -%__B, written as a subtraction from -0.0
1728   %sub1.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C ; -%__C, same spelling
1729   %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %sub.i, <4 x float> %sub1.i) #9 ; %__A * (-%__B) + (-%__C)
1730   %1 = bitcast i8 %__U to <8 x i1> ; reinterpret the 8-bit mask as eight i1 elements
1731   %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> ; keep mask elements 0-3, one per result lane
1732   %2 = select <4 x i1> %extract.i, <4 x float> %0, <4 x float> %__A ; FMA result where the mask element is true, otherwise %__A
1733   ret <4 x float> %2
1734 }
1735
; "mask3" negated multiply-subtract on <4 x float>: lanes whose mask bit is set
; receive fma(%__A, -%__B, -%__C); the remaining lanes keep %__C (the third operand),
; and the mask is the last parameter.
; The assertions expect a write-masked vfnmsub231ps whose destination is xmm2,
; followed by a vmovaps copying xmm2 into xmm0.
1736 define <4 x float> @test_mm_mask3_fnmsub_ps(<4 x float> %__A, <4 x float> %__B, <4 x float> %__C, i8 zeroext %__U) {
1737 ; X86-LABEL: test_mm_mask3_fnmsub_ps:
1738 ; X86:       # %bb.0: # %entry
1739 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
1740 ; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
1741 ; X86-NEXT:    vfnmsub231ps %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0xbe,0xd1]
1742 ; X86-NEXT:    # xmm2 {%k1} = -(xmm0 * xmm1) - xmm2
1743 ; X86-NEXT:    vmovaps %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc2]
1744 ; X86-NEXT:    retl # encoding: [0xc3]
1745 ;
1746 ; X64-LABEL: test_mm_mask3_fnmsub_ps:
1747 ; X64:       # %bb.0: # %entry
1748 ; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
1749 ; X64-NEXT:    vfnmsub231ps %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0xbe,0xd1]
1750 ; X64-NEXT:    # xmm2 {%k1} = -(xmm0 * xmm1) - xmm2
1751 ; X64-NEXT:    vmovaps %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc2]
1752 ; X64-NEXT:    retq # encoding: [0xc3]
1753 entry:
1754   %sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__B ; -%__B, written as a subtraction from -0.0
1755   %sub1.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C ; -%__C, same spelling
1756   %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %sub.i, <4 x float> %sub1.i) #9 ; %__A * (-%__B) + (-%__C)
1757   %1 = bitcast i8 %__U to <8 x i1> ; reinterpret the 8-bit mask as eight i1 elements
1758   %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> ; keep mask elements 0-3, one per result lane
1759   %2 = select <4 x i1> %extract.i, <4 x float> %0, <4 x float> %__C ; FMA result where the mask element is true, otherwise the un-negated %__C
1760   ret <4 x float> %2
1761 }
1762
; Merge-masked negated multiply-subtract on <8 x float>: lanes whose mask bit is
; set receive fma(%__A, -%__B, -%__C); the remaining lanes keep %__A.
; All eight mask elements are used, so no shufflevector extract is needed here.
; The assertions expect the mask to reach %k1 and one write-masked vfnmsub132ps on ymm registers.
1763 define <8 x float> @test_mm256_mask_fnmsub_ps(<8 x float> %__A, i8 zeroext %__U, <8 x float> %__B, <8 x float> %__C) {
1764 ; X86-LABEL: test_mm256_mask_fnmsub_ps:
1765 ; X86:       # %bb.0: # %entry
1766 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
1767 ; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
1768 ; X86-NEXT:    vfnmsub132ps %ymm1, %ymm2, %ymm0 {%k1} # encoding: [0x62,0xf2,0x6d,0x29,0x9e,0xc1]
1769 ; X86-NEXT:    # ymm0 {%k1} = -(ymm0 * ymm1) - ymm2
1770 ; X86-NEXT:    retl # encoding: [0xc3]
1771 ;
1772 ; X64-LABEL: test_mm256_mask_fnmsub_ps:
1773 ; X64:       # %bb.0: # %entry
1774 ; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
1775 ; X64-NEXT:    vfnmsub132ps %ymm1, %ymm2, %ymm0 {%k1} # encoding: [0x62,0xf2,0x6d,0x29,0x9e,0xc1]
1776 ; X64-NEXT:    # ymm0 {%k1} = -(ymm0 * ymm1) - ymm2
1777 ; X64-NEXT:    retq # encoding: [0xc3]
1778 entry:
1779   %sub.i = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__B ; -%__B, written as a subtraction from -0.0
1780   %sub1.i = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C ; -%__C, same spelling
1781   %0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %sub.i, <8 x float> %sub1.i) #9 ; %__A * (-%__B) + (-%__C)
1782   %1 = bitcast i8 %__U to <8 x i1> ; reinterpret the 8-bit mask as eight i1 elements, one per result lane
1783   %2 = select <8 x i1> %1, <8 x float> %0, <8 x float> %__A ; FMA result where the mask element is true, otherwise %__A
1784   ret <8 x float> %2
1785 }
1786
; "mask3" negated multiply-subtract on <8 x float>: lanes whose mask bit is set
; receive fma(%__A, -%__B, -%__C); the remaining lanes keep %__C (the third operand),
; and the mask is the last parameter.
; All eight mask elements are used, so no shufflevector extract is needed here.
; The assertions expect a write-masked vfnmsub231ps whose destination is ymm2,
; followed by a vmovaps copying ymm2 into ymm0.
1787 define <8 x float> @test_mm256_mask3_fnmsub_ps(<8 x float> %__A, <8 x float> %__B, <8 x float> %__C, i8 zeroext %__U) {
1788 ; X86-LABEL: test_mm256_mask3_fnmsub_ps:
1789 ; X86:       # %bb.0: # %entry
1790 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
1791 ; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
1792 ; X86-NEXT:    vfnmsub231ps %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0xbe,0xd1]
1793 ; X86-NEXT:    # ymm2 {%k1} = -(ymm0 * ymm1) - ymm2
1794 ; X86-NEXT:    vmovaps %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc2]
1795 ; X86-NEXT:    retl # encoding: [0xc3]
1796 ;
1797 ; X64-LABEL: test_mm256_mask3_fnmsub_ps:
1798 ; X64:       # %bb.0: # %entry
1799 ; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
1800 ; X64-NEXT:    vfnmsub231ps %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0xbe,0xd1]
1801 ; X64-NEXT:    # ymm2 {%k1} = -(ymm0 * ymm1) - ymm2
1802 ; X64-NEXT:    vmovaps %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc2]
1803 ; X64-NEXT:    retq # encoding: [0xc3]
1804 entry:
1805   %sub.i = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__B ; -%__B, written as a subtraction from -0.0
1806   %sub1.i = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C ; -%__C, same spelling
1807   %0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %sub.i, <8 x float> %sub1.i) #9 ; %__A * (-%__B) + (-%__C)
1808   %1 = bitcast i8 %__U to <8 x i1> ; reinterpret the 8-bit mask as eight i1 elements, one per result lane
1809   %2 = select <8 x i1> %1, <8 x float> %0, <8 x float> %__C ; FMA result where the mask element is true, otherwise the un-negated %__C
1810   ret <8 x float> %2
1811 }
1812
; Declarations of the llvm.fma intrinsic overloads called by the test functions
; above, one per vector type: <2 x double>, <4 x double>, <4 x float>, <8 x float>.
; Each takes three operands of the same vector type and returns that type.
1813 declare <2 x double> @llvm.fma.v2f64(<2 x double>, <2 x double>, <2 x double>) #8
1814 declare <4 x double> @llvm.fma.v4f64(<4 x double>, <4 x double>, <4 x double>) #8
1815 declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>) #8
1816 declare <8 x float> @llvm.fma.v8f32(<8 x float>, <8 x float>, <8 x float>) #8