llvm/test/CodeGen/X86/sse-scalar-fp-arith.ll

   1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
   2 ; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,X86-SSE,SSE2,X86-SSE2
   3 ; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,X86-SSE,SSE41,X86-SSE41
   4 ; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,X86-AVX,X86-AVX1
   5 ; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX,X86-AVX,X86-AVX512
   6 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,X64-SSE,SSE2,X64-SSE2
   7 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,X64-SSE,SSE41,X64-SSE41
   8 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,X64-AVX,X64-AVX1
   9 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX,X64-AVX,X64-AVX512
  10
  11 ; Ensure that the backend no longer emits unnecessary vector insert
  12 ; instructions immediately after SSE scalar fp instructions
  13 ; like addss or mulss.
  14
  15 define <4 x float> @test_add_ss(<4 x float> %a, <4 x float> %b) {
  16 ; SSE-LABEL: test_add_ss:
  17 ; SSE:       # %bb.0:
  18 ; SSE-NEXT:    addss %xmm1, %xmm0
  19 ; SSE-NEXT:    ret{{[l|q]}}
  20 ;
  21 ; AVX-LABEL: test_add_ss:
  22 ; AVX:       # %bb.0:
  23 ; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0
  24 ; AVX-NEXT:    ret{{[l|q]}}
  25   %b0 = extractelement <4 x float> %b, i32 0
  26   %a0 = extractelement <4 x float> %a, i32 0
  27   %sum = fadd float %a0, %b0 ; a[0] + b[0]
  28   %res = insertelement <4 x float> %a, float %sum, i32 0 ; lanes 1-3 keep %a
  29   ret <4 x float> %res
  30 }
  31
  32 define <4 x float> @test_sub_ss(<4 x float> %a, <4 x float> %b) {
  33 ; SSE-LABEL: test_sub_ss:
  34 ; SSE:       # %bb.0:
  35 ; SSE-NEXT:    subss %xmm1, %xmm0
  36 ; SSE-NEXT:    ret{{[l|q]}}
  37 ;
  38 ; AVX-LABEL: test_sub_ss:
  39 ; AVX:       # %bb.0:
  40 ; AVX-NEXT:    vsubss %xmm1, %xmm0, %xmm0
  41 ; AVX-NEXT:    ret{{[l|q]}}
  42   %b0 = extractelement <4 x float> %b, i32 0
  43   %a0 = extractelement <4 x float> %a, i32 0
  44   %diff = fsub float %a0, %b0 ; a[0] - b[0]
  45   %res = insertelement <4 x float> %a, float %diff, i32 0 ; lanes 1-3 keep %a
  46   ret <4 x float> %res
  47 }
  48
  49 define <4 x float> @test_mul_ss(<4 x float> %a, <4 x float> %b) {
  50 ; SSE-LABEL: test_mul_ss:
  51 ; SSE:       # %bb.0:
  52 ; SSE-NEXT:    mulss %xmm1, %xmm0
  53 ; SSE-NEXT:    ret{{[l|q]}}
  54 ;
  55 ; AVX-LABEL: test_mul_ss:
  56 ; AVX:       # %bb.0:
  57 ; AVX-NEXT:    vmulss %xmm1, %xmm0, %xmm0
  58 ; AVX-NEXT:    ret{{[l|q]}}
  59   %b0 = extractelement <4 x float> %b, i32 0
  60   %a0 = extractelement <4 x float> %a, i32 0
  61   %prod = fmul float %a0, %b0 ; a[0] * b[0]
  62   %res = insertelement <4 x float> %a, float %prod, i32 0 ; lanes 1-3 keep %a
  63   ret <4 x float> %res
  64 }
  65
  66 define <4 x float> @test_div_ss(<4 x float> %a, <4 x float> %b) {
  67 ; SSE-LABEL: test_div_ss:
  68 ; SSE:       # %bb.0:
  69 ; SSE-NEXT:    divss %xmm1, %xmm0
  70 ; SSE-NEXT:    ret{{[l|q]}}
  71 ;
  72 ; AVX-LABEL: test_div_ss:
  73 ; AVX:       # %bb.0:
  74 ; AVX-NEXT:    vdivss %xmm1, %xmm0, %xmm0
  75 ; AVX-NEXT:    ret{{[l|q]}}
  76   %b0 = extractelement <4 x float> %b, i32 0
  77   %a0 = extractelement <4 x float> %a, i32 0
  78   %quot = fdiv float %a0, %b0 ; a[0] / b[0]
  79   %res = insertelement <4 x float> %a, float %quot, i32 0 ; lanes 1-3 keep %a
  80   ret <4 x float> %res
  81 }
  82
  83 define <4 x float> @test_sqrt_ss(<4 x float> %a) {
  84 ; SSE-LABEL: test_sqrt_ss:
  85 ; SSE:       # %bb.0:
  86 ; SSE-NEXT:    sqrtss %xmm0, %xmm0
  87 ; SSE-NEXT:    ret{{[l|q]}}
  88 ;
  89 ; AVX-LABEL: test_sqrt_ss:
  90 ; AVX:       # %bb.0:
  91 ; AVX-NEXT:    vsqrtss %xmm0, %xmm0, %xmm0
  92 ; AVX-NEXT:    ret{{[l|q]}}
  93   %a0 = extractelement <4 x float> %a, i32 0
  94   %root = call float @llvm.sqrt.f32(float %a0) ; sqrt of a[0]
  95   %res = insertelement <4 x float> %a, float %root, i32 0 ; lanes 1-3 keep %a
  96   ret <4 x float> %res
  97 }
  98 declare float @llvm.sqrt.f32(float)
  99
 100 define <2 x double> @test_add_sd(<2 x double> %a, <2 x double> %b) {
 101 ; SSE-LABEL: test_add_sd:
 102 ; SSE:       # %bb.0:
 103 ; SSE-NEXT:    addsd %xmm1, %xmm0
 104 ; SSE-NEXT:    ret{{[l|q]}}
 105 ;
 106 ; AVX-LABEL: test_add_sd:
 107 ; AVX:       # %bb.0:
 108 ; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
 109 ; AVX-NEXT:    ret{{[l|q]}}
 110   %b0 = extractelement <2 x double> %b, i32 0
 111   %a0 = extractelement <2 x double> %a, i32 0
 112   %sum = fadd double %a0, %b0 ; a[0] + b[0]
 113   %res = insertelement <2 x double> %a, double %sum, i32 0 ; lane 1 keeps %a
 114   ret <2 x double> %res
 115 }
 116
 117 define <2 x double> @test_sub_sd(<2 x double> %a, <2 x double> %b) {
 118 ; SSE-LABEL: test_sub_sd:
 119 ; SSE:       # %bb.0:
 120 ; SSE-NEXT:    subsd %xmm1, %xmm0
 121 ; SSE-NEXT:    ret{{[l|q]}}
 122 ;
 123 ; AVX-LABEL: test_sub_sd:
 124 ; AVX:       # %bb.0:
 125 ; AVX-NEXT:    vsubsd %xmm1, %xmm0, %xmm0
 126 ; AVX-NEXT:    ret{{[l|q]}}
 127   %b0 = extractelement <2 x double> %b, i32 0
 128   %a0 = extractelement <2 x double> %a, i32 0
 129   %diff = fsub double %a0, %b0 ; a[0] - b[0]
 130   %res = insertelement <2 x double> %a, double %diff, i32 0 ; lane 1 keeps %a
 131   ret <2 x double> %res
 132 }
 133
 134 define <2 x double> @test_mul_sd(<2 x double> %a, <2 x double> %b) {
 135 ; SSE-LABEL: test_mul_sd:
 136 ; SSE:       # %bb.0:
 137 ; SSE-NEXT:    mulsd %xmm1, %xmm0
 138 ; SSE-NEXT:    ret{{[l|q]}}
 139 ;
 140 ; AVX-LABEL: test_mul_sd:
 141 ; AVX:       # %bb.0:
 142 ; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
 143 ; AVX-NEXT:    ret{{[l|q]}}
 144   %b0 = extractelement <2 x double> %b, i32 0
 145   %a0 = extractelement <2 x double> %a, i32 0
 146   %prod = fmul double %a0, %b0 ; a[0] * b[0]
 147   %res = insertelement <2 x double> %a, double %prod, i32 0 ; lane 1 keeps %a
 148   ret <2 x double> %res
 149 }
 150
 151 define <2 x double> @test_div_sd(<2 x double> %a, <2 x double> %b) {
 152 ; SSE-LABEL: test_div_sd:
 153 ; SSE:       # %bb.0:
 154 ; SSE-NEXT:    divsd %xmm1, %xmm0
 155 ; SSE-NEXT:    ret{{[l|q]}}
 156 ;
 157 ; AVX-LABEL: test_div_sd:
 158 ; AVX:       # %bb.0:
 159 ; AVX-NEXT:    vdivsd %xmm1, %xmm0, %xmm0
 160 ; AVX-NEXT:    ret{{[l|q]}}
 161   %b0 = extractelement <2 x double> %b, i32 0
 162   %a0 = extractelement <2 x double> %a, i32 0
 163   %quot = fdiv double %a0, %b0 ; a[0] / b[0]
 164   %res = insertelement <2 x double> %a, double %quot, i32 0 ; lane 1 keeps %a
 165   ret <2 x double> %res
 166 }
 167
 168 define <2 x double> @test_sqrt_sd(<2 x double> %a) {
 169 ; SSE-LABEL: test_sqrt_sd:
 170 ; SSE:       # %bb.0:
 171 ; SSE-NEXT:    sqrtsd %xmm0, %xmm0
 172 ; SSE-NEXT:    ret{{[l|q]}}
 173 ;
 174 ; AVX-LABEL: test_sqrt_sd:
 175 ; AVX:       # %bb.0:
 176 ; AVX-NEXT:    vsqrtsd %xmm0, %xmm0, %xmm0
 177 ; AVX-NEXT:    ret{{[l|q]}}
 178   %a0 = extractelement <2 x double> %a, i32 0
 179   %root = call double @llvm.sqrt.f64(double %a0) ; sqrt of a[0]
 180   %res = insertelement <2 x double> %a, double %root, i32 0 ; lane 1 keeps %a
 181   ret <2 x double> %res
 182 }
 183 declare double @llvm.sqrt.f64(double)
 184
 185 define <4 x float> @test2_add_ss(<4 x float> %a, <4 x float> %b) {
 186 ; SSE-LABEL: test2_add_ss:
 187 ; SSE:       # %bb.0:
 188 ; SSE-NEXT:    addss %xmm0, %xmm1
 189 ; SSE-NEXT:    movaps %xmm1, %xmm0
 190 ; SSE-NEXT:    ret{{[l|q]}}
 191 ;
 192 ; AVX-LABEL: test2_add_ss:
 193 ; AVX:       # %bb.0:
 194 ; AVX-NEXT:    vaddss %xmm0, %xmm1, %xmm0
 195 ; AVX-NEXT:    ret{{[l|q]}}
 196   %a0 = extractelement <4 x float> %a, i32 0
 197   %b0 = extractelement <4 x float> %b, i32 0
 198   %sum = fadd float %a0, %b0 ; a[0] + b[0]
 199   %res = insertelement <4 x float> %b, float %sum, i32 0 ; lanes 1-3 keep %b
 200   ret <4 x float> %res
 201 }
 202
 203 define <4 x float> @test2_sub_ss(<4 x float> %a, <4 x float> %b) {
 204 ; SSE-LABEL: test2_sub_ss:
 205 ; SSE:       # %bb.0:
 206 ; SSE-NEXT:    subss %xmm0, %xmm1
 207 ; SSE-NEXT:    movaps %xmm1, %xmm0
 208 ; SSE-NEXT:    ret{{[l|q]}}
 209 ;
 210 ; AVX-LABEL: test2_sub_ss:
 211 ; AVX:       # %bb.0:
 212 ; AVX-NEXT:    vsubss %xmm0, %xmm1, %xmm0
 213 ; AVX-NEXT:    ret{{[l|q]}}
 214   %a0 = extractelement <4 x float> %a, i32 0
 215   %b0 = extractelement <4 x float> %b, i32 0
 216   %diff = fsub float %b0, %a0 ; b[0] - a[0]
 217   %res = insertelement <4 x float> %b, float %diff, i32 0 ; lanes 1-3 keep %b
 218   ret <4 x float> %res
 219 }
 220
 221 define <4 x float> @test2_mul_ss(<4 x float> %a, <4 x float> %b) {
 222 ; SSE-LABEL: test2_mul_ss:
 223 ; SSE:       # %bb.0:
 224 ; SSE-NEXT:    mulss %xmm0, %xmm1
 225 ; SSE-NEXT:    movaps %xmm1, %xmm0
 226 ; SSE-NEXT:    ret{{[l|q]}}
 227 ;
 228 ; AVX-LABEL: test2_mul_ss:
 229 ; AVX:       # %bb.0:
 230 ; AVX-NEXT:    vmulss %xmm0, %xmm1, %xmm0
 231 ; AVX-NEXT:    ret{{[l|q]}}
 232   %a0 = extractelement <4 x float> %a, i32 0
 233   %b0 = extractelement <4 x float> %b, i32 0
 234   %prod = fmul float %a0, %b0 ; a[0] * b[0]
 235   %res = insertelement <4 x float> %b, float %prod, i32 0 ; lanes 1-3 keep %b
 236   ret <4 x float> %res
 237 }
 238
 239 define <4 x float> @test2_div_ss(<4 x float> %a, <4 x float> %b) {
 240 ; SSE-LABEL: test2_div_ss:
 241 ; SSE:       # %bb.0:
 242 ; SSE-NEXT:    divss %xmm0, %xmm1
 243 ; SSE-NEXT:    movaps %xmm1, %xmm0
 244 ; SSE-NEXT:    ret{{[l|q]}}
 245 ;
 246 ; AVX-LABEL: test2_div_ss:
 247 ; AVX:       # %bb.0:
 248 ; AVX-NEXT:    vdivss %xmm0, %xmm1, %xmm0
 249 ; AVX-NEXT:    ret{{[l|q]}}
 250   %a0 = extractelement <4 x float> %a, i32 0
 251   %b0 = extractelement <4 x float> %b, i32 0
 252   %quot = fdiv float %b0, %a0 ; b[0] / a[0]
 253   %res = insertelement <4 x float> %b, float %quot, i32 0 ; lanes 1-3 keep %b
 254   ret <4 x float> %res
 255 }
 256
 257 define <2 x double> @test2_add_sd(<2 x double> %a, <2 x double> %b) {
 258 ; SSE-LABEL: test2_add_sd:
 259 ; SSE:       # %bb.0:
 260 ; SSE-NEXT:    addsd %xmm0, %xmm1
 261 ; SSE-NEXT:    movapd %xmm1, %xmm0
 262 ; SSE-NEXT:    ret{{[l|q]}}
 263 ;
 264 ; AVX-LABEL: test2_add_sd:
 265 ; AVX:       # %bb.0:
 266 ; AVX-NEXT:    vaddsd %xmm0, %xmm1, %xmm0
 267 ; AVX-NEXT:    ret{{[l|q]}}
 268   %a0 = extractelement <2 x double> %a, i32 0
 269   %b0 = extractelement <2 x double> %b, i32 0
 270   %sum = fadd double %a0, %b0 ; a[0] + b[0]
 271   %res = insertelement <2 x double> %b, double %sum, i32 0 ; lane 1 keeps %b
 272   ret <2 x double> %res
 273 }
 274
 275 define <2 x double> @test2_sub_sd(<2 x double> %a, <2 x double> %b) {
 276 ; SSE-LABEL: test2_sub_sd:
 277 ; SSE:       # %bb.0:
 278 ; SSE-NEXT:    subsd %xmm0, %xmm1
 279 ; SSE-NEXT:    movapd %xmm1, %xmm0
 280 ; SSE-NEXT:    ret{{[l|q]}}
 281 ;
 282 ; AVX-LABEL: test2_sub_sd:
 283 ; AVX:       # %bb.0:
 284 ; AVX-NEXT:    vsubsd %xmm0, %xmm1, %xmm0
 285 ; AVX-NEXT:    ret{{[l|q]}}
 286   %a0 = extractelement <2 x double> %a, i32 0
 287   %b0 = extractelement <2 x double> %b, i32 0
 288   %diff = fsub double %b0, %a0 ; b[0] - a[0]
 289   %res = insertelement <2 x double> %b, double %diff, i32 0 ; lane 1 keeps %b
 290   ret <2 x double> %res
 291 }
 292
 293 define <2 x double> @test2_mul_sd(<2 x double> %a, <2 x double> %b) {
 294 ; SSE-LABEL: test2_mul_sd:
 295 ; SSE:       # %bb.0:
 296 ; SSE-NEXT:    mulsd %xmm0, %xmm1
 297 ; SSE-NEXT:    movapd %xmm1, %xmm0
 298 ; SSE-NEXT:    ret{{[l|q]}}
 299 ;
 300 ; AVX-LABEL: test2_mul_sd:
 301 ; AVX:       # %bb.0:
 302 ; AVX-NEXT:    vmulsd %xmm0, %xmm1, %xmm0
 303 ; AVX-NEXT:    ret{{[l|q]}}
 304   %a0 = extractelement <2 x double> %a, i32 0
 305   %b0 = extractelement <2 x double> %b, i32 0
 306   %prod = fmul double %a0, %b0 ; a[0] * b[0]
 307   %res = insertelement <2 x double> %b, double %prod, i32 0 ; lane 1 keeps %b
 308   ret <2 x double> %res
 309 }
 310
 311 define <2 x double> @test2_div_sd(<2 x double> %a, <2 x double> %b) {
 312 ; SSE-LABEL: test2_div_sd:
 313 ; SSE:       # %bb.0:
 314 ; SSE-NEXT:    divsd %xmm0, %xmm1
 315 ; SSE-NEXT:    movapd %xmm1, %xmm0
 316 ; SSE-NEXT:    ret{{[l|q]}}
 317 ;
 318 ; AVX-LABEL: test2_div_sd:
 319 ; AVX:       # %bb.0:
 320 ; AVX-NEXT:    vdivsd %xmm0, %xmm1, %xmm0
 321 ; AVX-NEXT:    ret{{[l|q]}}
 322   %a0 = extractelement <2 x double> %a, i32 0
 323   %b0 = extractelement <2 x double> %b, i32 0
 324   %quot = fdiv double %b0, %a0 ; b[0] / a[0]
 325   %res = insertelement <2 x double> %b, double %quot, i32 0 ; lane 1 keeps %b
 326   ret <2 x double> %res
 327 }
 328
 329 define <4 x float> @test_multiple_add_ss(<4 x float> %a, <4 x float> %b) {
 330 ; SSE-LABEL: test_multiple_add_ss:
 331 ; SSE:       # %bb.0:
 332 ; SSE-NEXT:    addss %xmm0, %xmm1
 333 ; SSE-NEXT:    addss %xmm1, %xmm0
 334 ; SSE-NEXT:    ret{{[l|q]}}
 335 ;
 336 ; AVX-LABEL: test_multiple_add_ss:
 337 ; AVX:       # %bb.0:
 338 ; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm1
 339 ; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0
 340 ; AVX-NEXT:    ret{{[l|q]}}
 341   %b0 = extractelement <4 x float> %b, i32 0
 342   %a0 = extractelement <4 x float> %a, i32 0
 343   %sum1 = fadd float %a0, %b0 ; a[0] + b[0]
 344   %sum2 = fadd float %a0, %sum1 ; a[0] + (a[0] + b[0])
 345   %res = insertelement <4 x float> %a, float %sum2, i32 0 ; lanes 1-3 keep %a
 346   ret <4 x float> %res
 347 }
 348
 349 define <4 x float> @test_multiple_sub_ss(<4 x float> %a, <4 x float> %b) {
 350 ; SSE-LABEL: test_multiple_sub_ss:
 351 ; SSE:       # %bb.0:
 352 ; SSE-NEXT:    movaps %xmm0, %xmm2
 353 ; SSE-NEXT:    subss %xmm1, %xmm2
 354 ; SSE-NEXT:    subss %xmm2, %xmm0
 355 ; SSE-NEXT:    ret{{[l|q]}}
 356 ;
 357 ; AVX-LABEL: test_multiple_sub_ss:
 358 ; AVX:       # %bb.0:
 359 ; AVX-NEXT:    vsubss %xmm1, %xmm0, %xmm1
 360 ; AVX-NEXT:    vsubss %xmm1, %xmm0, %xmm0
 361 ; AVX-NEXT:    ret{{[l|q]}}
 362   %b0 = extractelement <4 x float> %b, i32 0
 363   %a0 = extractelement <4 x float> %a, i32 0
 364   %diff1 = fsub float %a0, %b0 ; a[0] - b[0]
 365   %diff2 = fsub float %a0, %diff1 ; a[0] - (a[0] - b[0])
 366   %res = insertelement <4 x float> %a, float %diff2, i32 0 ; lanes 1-3 keep %a
 367   ret <4 x float> %res
 368 }
 369
 370 define <4 x float> @test_multiple_mul_ss(<4 x float> %a, <4 x float> %b) {
 371 ; SSE-LABEL: test_multiple_mul_ss:
 372 ; SSE:       # %bb.0:
 373 ; SSE-NEXT:    mulss %xmm0, %xmm1
 374 ; SSE-NEXT:    mulss %xmm1, %xmm0
 375 ; SSE-NEXT:    ret{{[l|q]}}
 376 ;
 377 ; AVX-LABEL: test_multiple_mul_ss:
 378 ; AVX:       # %bb.0:
 379 ; AVX-NEXT:    vmulss %xmm1, %xmm0, %xmm1
 380 ; AVX-NEXT:    vmulss %xmm1, %xmm0, %xmm0
 381 ; AVX-NEXT:    ret{{[l|q]}}
 382   %b0 = extractelement <4 x float> %b, i32 0
 383   %a0 = extractelement <4 x float> %a, i32 0
 384   %prod1 = fmul float %a0, %b0 ; a[0] * b[0]
 385   %prod2 = fmul float %a0, %prod1 ; a[0] * (a[0] * b[0])
 386   %res = insertelement <4 x float> %a, float %prod2, i32 0 ; lanes 1-3 keep %a
 387   ret <4 x float> %res
 388 }
 389
 390 define <4 x float> @test_multiple_div_ss(<4 x float> %a, <4 x float> %b) {
 391 ; SSE-LABEL: test_multiple_div_ss:
 392 ; SSE:       # %bb.0:
 393 ; SSE-NEXT:    movaps %xmm0, %xmm2
 394 ; SSE-NEXT:    divss %xmm1, %xmm2
 395 ; SSE-NEXT:    divss %xmm2, %xmm0
 396 ; SSE-NEXT:    ret{{[l|q]}}
 397 ;
 398 ; AVX-LABEL: test_multiple_div_ss:
 399 ; AVX:       # %bb.0:
 400 ; AVX-NEXT:    vdivss %xmm1, %xmm0, %xmm1
 401 ; AVX-NEXT:    vdivss %xmm1, %xmm0, %xmm0
 402 ; AVX-NEXT:    ret{{[l|q]}}
 403   %b0 = extractelement <4 x float> %b, i32 0
 404   %a0 = extractelement <4 x float> %a, i32 0
 405   %quot1 = fdiv float %a0, %b0 ; a[0] / b[0]
 406   %quot2 = fdiv float %a0, %quot1 ; a[0] / (a[0] / b[0])
 407   %res = insertelement <4 x float> %a, float %quot2, i32 0 ; lanes 1-3 keep %a
 408   ret <4 x float> %res
 409 }
 410
 411 ; With SSE4.1 or greater, the shuffles in the following tests may
 412 ; be lowered to X86Blendi nodes.
 413
 414 define <4 x float> @blend_add_ss(<4 x float> %a, float %b) {
 415 ; X86-SSE-LABEL: blend_add_ss:
 416 ; X86-SSE:       # %bb.0:
 417 ; X86-SSE-NEXT:    addss {{[0-9]+}}(%esp), %xmm0
 418 ; X86-SSE-NEXT:    retl
 419 ;
 420 ; X86-AVX-LABEL: blend_add_ss:
 421 ; X86-AVX:       # %bb.0:
 422 ; X86-AVX-NEXT:    vaddss {{[0-9]+}}(%esp), %xmm0, %xmm0
 423 ; X86-AVX-NEXT:    retl
 424 ;
 425 ; X64-SSE-LABEL: blend_add_ss:
 426 ; X64-SSE:       # %bb.0:
 427 ; X64-SSE-NEXT:    addss %xmm1, %xmm0
 428 ; X64-SSE-NEXT:    retq
 429 ;
 430 ; X64-AVX-LABEL: blend_add_ss:
 431 ; X64-AVX:       # %bb.0:
 432 ; X64-AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0
 433 ; X64-AVX-NEXT:    retq
 434
 435   %a0 = extractelement <4 x float> %a, i32 0
 436   %sum = fadd float %b, %a0 ; b + a[0]
 437   %lo = insertelement <4 x float> undef, float %sum, i32 0
 438   %res = shufflevector <4 x float> %lo, <4 x float> %a, <4 x i32> <i32 0, i32 5, i32 6, i32 7> ; lane 0 from %lo, lanes 1-3 from %a
 439   ret <4 x float> %res
 440 }
 441
 442 define <4 x float> @blend_sub_ss(<4 x float> %a, float %b) {
 443 ; X86-SSE-LABEL: blend_sub_ss:
 444 ; X86-SSE:       # %bb.0:
 445 ; X86-SSE-NEXT:    subss {{[0-9]+}}(%esp), %xmm0
 446 ; X86-SSE-NEXT:    retl
 447 ;
 448 ; X86-AVX-LABEL: blend_sub_ss:
 449 ; X86-AVX:       # %bb.0:
 450 ; X86-AVX-NEXT:    vsubss {{[0-9]+}}(%esp), %xmm0, %xmm0
 451 ; X86-AVX-NEXT:    retl
 452 ;
 453 ; X64-SSE-LABEL: blend_sub_ss:
 454 ; X64-SSE:       # %bb.0:
 455 ; X64-SSE-NEXT:    subss %xmm1, %xmm0
 456 ; X64-SSE-NEXT:    retq
 457 ;
 458 ; X64-AVX-LABEL: blend_sub_ss:
 459 ; X64-AVX:       # %bb.0:
 460 ; X64-AVX-NEXT:    vsubss %xmm1, %xmm0, %xmm0
 461 ; X64-AVX-NEXT:    retq
 462
 463   %a0 = extractelement <4 x float> %a, i32 0
 464   %diff = fsub float %a0, %b ; a[0] - b
 465   %lo = insertelement <4 x float> undef, float %diff, i32 0
 466   %res = shufflevector <4 x float> %lo, <4 x float> %a, <4 x i32> <i32 0, i32 5, i32 6, i32 7> ; lane 0 from %lo, lanes 1-3 from %a
 467   ret <4 x float> %res
 468 }
 469
 470 define <4 x float> @blend_mul_ss(<4 x float> %a, float %b) {
 471 ; X86-SSE-LABEL: blend_mul_ss:
 472 ; X86-SSE:       # %bb.0:
 473 ; X86-SSE-NEXT:    mulss {{[0-9]+}}(%esp), %xmm0
 474 ; X86-SSE-NEXT:    retl
 475 ;
 476 ; X86-AVX-LABEL: blend_mul_ss:
 477 ; X86-AVX:       # %bb.0:
 478 ; X86-AVX-NEXT:    vmulss {{[0-9]+}}(%esp), %xmm0, %xmm0
 479 ; X86-AVX-NEXT:    retl
 480 ;
 481 ; X64-SSE-LABEL: blend_mul_ss:
 482 ; X64-SSE:       # %bb.0:
 483 ; X64-SSE-NEXT:    mulss %xmm1, %xmm0
 484 ; X64-SSE-NEXT:    retq
 485 ;
 486 ; X64-AVX-LABEL: blend_mul_ss:
 487 ; X64-AVX:       # %bb.0:
 488 ; X64-AVX-NEXT:    vmulss %xmm1, %xmm0, %xmm0
 489 ; X64-AVX-NEXT:    retq
 490
 491   %a0 = extractelement <4 x float> %a, i32 0
 492   %prod = fmul float %b, %a0 ; b * a[0]
 493   %lo = insertelement <4 x float> undef, float %prod, i32 0
 494   %res = shufflevector <4 x float> %lo, <4 x float> %a, <4 x i32> <i32 0, i32 5, i32 6, i32 7> ; lane 0 from %lo, lanes 1-3 from %a
 495   ret <4 x float> %res
 496 }
 497
 498 define <4 x float> @blend_div_ss(<4 x float> %a, float %b) {
 499 ; X86-SSE-LABEL: blend_div_ss:
 500 ; X86-SSE:       # %bb.0:
 501 ; X86-SSE-NEXT:    divss {{[0-9]+}}(%esp), %xmm0
 502 ; X86-SSE-NEXT:    retl
 503 ;
 504 ; X86-AVX-LABEL: blend_div_ss:
 505 ; X86-AVX:       # %bb.0:
 506 ; X86-AVX-NEXT:    vdivss {{[0-9]+}}(%esp), %xmm0, %xmm0
 507 ; X86-AVX-NEXT:    retl
 508 ;
 509 ; X64-SSE-LABEL: blend_div_ss:
 510 ; X64-SSE:       # %bb.0:
 511 ; X64-SSE-NEXT:    divss %xmm1, %xmm0
 512 ; X64-SSE-NEXT:    retq
 513 ;
 514 ; X64-AVX-LABEL: blend_div_ss:
 515 ; X64-AVX:       # %bb.0:
 516 ; X64-AVX-NEXT:    vdivss %xmm1, %xmm0, %xmm0
 517 ; X64-AVX-NEXT:    retq
 518
 519   %a0 = extractelement <4 x float> %a, i32 0
 520   %quot = fdiv float %a0, %b ; a[0] / b
 521   %lo = insertelement <4 x float> undef, float %quot, i32 0
 522   %res = shufflevector <4 x float> %lo, <4 x float> %a, <4 x i32> <i32 0, i32 5, i32 6, i32 7> ; lane 0 from %lo, lanes 1-3 from %a
 523   ret <4 x float> %res
 524 }
 525
 526 define <2 x double> @blend_add_sd(<2 x double> %a, double %b) {
 527 ; X86-SSE-LABEL: blend_add_sd:
 528 ; X86-SSE:       # %bb.0:
 529 ; X86-SSE-NEXT:    addsd {{[0-9]+}}(%esp), %xmm0
 530 ; X86-SSE-NEXT:    retl
 531 ;
 532 ; X86-AVX-LABEL: blend_add_sd:
 533 ; X86-AVX:       # %bb.0:
 534 ; X86-AVX-NEXT:    vaddsd {{[0-9]+}}(%esp), %xmm0, %xmm0
 535 ; X86-AVX-NEXT:    retl
 536 ;
 537 ; X64-SSE-LABEL: blend_add_sd:
 538 ; X64-SSE:       # %bb.0:
 539 ; X64-SSE-NEXT:    addsd %xmm1, %xmm0
 540 ; X64-SSE-NEXT:    retq
 541 ;
 542 ; X64-AVX-LABEL: blend_add_sd:
 543 ; X64-AVX:       # %bb.0:
 544 ; X64-AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
 545 ; X64-AVX-NEXT:    retq
 546
 547   %a0 = extractelement <2 x double> %a, i32 0
 548   %sum = fadd double %b, %a0 ; b + a[0]
 549   %lo = insertelement <2 x double> undef, double %sum, i32 0
 550   %res = shufflevector <2 x double> %lo, <2 x double> %a, <2 x i32> <i32 0, i32 3> ; lane 0 from %lo, lane 1 from %a
 551   ret <2 x double> %res
 552 }
 553
 554 define <2 x double> @blend_sub_sd(<2 x double> %a, double %b) {
 555 ; X86-SSE-LABEL: blend_sub_sd:
 556 ; X86-SSE:       # %bb.0:
 557 ; X86-SSE-NEXT:    subsd {{[0-9]+}}(%esp), %xmm0
 558 ; X86-SSE-NEXT:    retl
 559 ;
 560 ; X86-AVX-LABEL: blend_sub_sd:
 561 ; X86-AVX:       # %bb.0:
 562 ; X86-AVX-NEXT:    vsubsd {{[0-9]+}}(%esp), %xmm0, %xmm0
 563 ; X86-AVX-NEXT:    retl
 564 ;
 565 ; X64-SSE-LABEL: blend_sub_sd:
 566 ; X64-SSE:       # %bb.0:
 567 ; X64-SSE-NEXT:    subsd %xmm1, %xmm0
 568 ; X64-SSE-NEXT:    retq
 569 ;
 570 ; X64-AVX-LABEL: blend_sub_sd:
 571 ; X64-AVX:       # %bb.0:
 572 ; X64-AVX-NEXT:    vsubsd %xmm1, %xmm0, %xmm0
 573 ; X64-AVX-NEXT:    retq
 574
 575   %a0 = extractelement <2 x double> %a, i32 0
 576   %diff = fsub double %a0, %b ; a[0] - b
 577   %lo = insertelement <2 x double> undef, double %diff, i32 0
 578   %res = shufflevector <2 x double> %lo, <2 x double> %a, <2 x i32> <i32 0, i32 3> ; lane 0 from %lo, lane 1 from %a
 579   ret <2 x double> %res
 580 }
 581
 582 define <2 x double> @blend_mul_sd(<2 x double> %a, double %b) {
 583 ; X86-SSE-LABEL: blend_mul_sd:
 584 ; X86-SSE:       # %bb.0:
 585 ; X86-SSE-NEXT:    mulsd {{[0-9]+}}(%esp), %xmm0
 586 ; X86-SSE-NEXT:    retl
 587 ;
 588 ; X86-AVX-LABEL: blend_mul_sd:
 589 ; X86-AVX:       # %bb.0:
 590 ; X86-AVX-NEXT:    vmulsd {{[0-9]+}}(%esp), %xmm0, %xmm0
 591 ; X86-AVX-NEXT:    retl
 592 ;
 593 ; X64-SSE-LABEL: blend_mul_sd:
 594 ; X64-SSE:       # %bb.0:
 595 ; X64-SSE-NEXT:    mulsd %xmm1, %xmm0
 596 ; X64-SSE-NEXT:    retq
 597 ;
 598 ; X64-AVX-LABEL: blend_mul_sd:
 599 ; X64-AVX:       # %bb.0:
 600 ; X64-AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
 601 ; X64-AVX-NEXT:    retq
 602
 603   %a0 = extractelement <2 x double> %a, i32 0
 604   %prod = fmul double %b, %a0 ; b * a[0]
 605   %lo = insertelement <2 x double> undef, double %prod, i32 0
 606   %res = shufflevector <2 x double> %lo, <2 x double> %a, <2 x i32> <i32 0, i32 3> ; lane 0 from %lo, lane 1 from %a
 607   ret <2 x double> %res
 608 }
 609
 610 define <2 x double> @blend_div_sd(<2 x double> %a, double %b) {
 611 ; X86-SSE-LABEL: blend_div_sd:
 612 ; X86-SSE:       # %bb.0:
 613 ; X86-SSE-NEXT:    divsd {{[0-9]+}}(%esp), %xmm0
 614 ; X86-SSE-NEXT:    retl
 615 ;
 616 ; X86-AVX-LABEL: blend_div_sd:
 617 ; X86-AVX:       # %bb.0:
 618 ; X86-AVX-NEXT:    vdivsd {{[0-9]+}}(%esp), %xmm0, %xmm0
 619 ; X86-AVX-NEXT:    retl
 620 ;
 621 ; X64-SSE-LABEL: blend_div_sd:
 622 ; X64-SSE:       # %bb.0:
 623 ; X64-SSE-NEXT:    divsd %xmm1, %xmm0
 624 ; X64-SSE-NEXT:    retq
 625 ;
 626 ; X64-AVX-LABEL: blend_div_sd:
 627 ; X64-AVX:       # %bb.0:
 628 ; X64-AVX-NEXT:    vdivsd %xmm1, %xmm0, %xmm0
 629 ; X64-AVX-NEXT:    retq
 630
 631   %a0 = extractelement <2 x double> %a, i32 0
 632   %quot = fdiv double %a0, %b ; a[0] / b
 633   %lo = insertelement <2 x double> undef, double %quot, i32 0
 634   %res = shufflevector <2 x double> %lo, <2 x double> %a, <2 x i32> <i32 0, i32 3> ; lane 0 from %lo, lane 1 from %a
 635   ret <2 x double> %res
 636 }
 637
 638 ; Ensure that the backend selects SSE/AVX scalar fp instructions
 639 ; from a packed fp instruction plus a vector insert.
 640
 641 define <4 x float> @insert_test_add_ss(<4 x float> %a, <4 x float> %b) {
 642 ; SSE-LABEL: insert_test_add_ss:
 643 ; SSE:       # %bb.0:
 644 ; SSE-NEXT:    addss %xmm1, %xmm0
 645 ; SSE-NEXT:    ret{{[l|q]}}
 646 ;
 647 ; AVX-LABEL: insert_test_add_ss:
 648 ; AVX:       # %bb.0:
 649 ; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0
 650 ; AVX-NEXT:    ret{{[l|q]}}
 651   %vsum = fadd <4 x float> %a, %b ; packed a + b
 652   %res = shufflevector <4 x float> %vsum, <4 x float> %a, <4 x i32> <i32 0, i32 5, i32 6, i32 7> ; lane 0 from %vsum, lanes 1-3 from %a
 653   ret <4 x float> %res
 654 }
 655
 656 define <4 x float> @insert_test_sub_ss(<4 x float> %a, <4 x float> %b) {
 657 ; SSE-LABEL: insert_test_sub_ss:
 658 ; SSE:       # %bb.0:
 659 ; SSE-NEXT:    subss %xmm1, %xmm0
 660 ; SSE-NEXT:    ret{{[l|q]}}
 661 ;
 662 ; AVX-LABEL: insert_test_sub_ss:
 663 ; AVX:       # %bb.0:
 664 ; AVX-NEXT:    vsubss %xmm1, %xmm0, %xmm0
 665 ; AVX-NEXT:    ret{{[l|q]}}
 666   %vdiff = fsub <4 x float> %a, %b ; packed a - b
 667   %res = shufflevector <4 x float> %vdiff, <4 x float> %a, <4 x i32> <i32 0, i32 5, i32 6, i32 7> ; lane 0 from %vdiff, lanes 1-3 from %a
 668   ret <4 x float> %res
 669 }
 670
 671 define <4 x float> @insert_test_mul_ss(<4 x float> %a, <4 x float> %b) {
 672 ; SSE-LABEL: insert_test_mul_ss:
 673 ; SSE:       # %bb.0:
 674 ; SSE-NEXT:    mulss %xmm1, %xmm0
 675 ; SSE-NEXT:    ret{{[l|q]}}
 676 ;
 677 ; AVX-LABEL: insert_test_mul_ss:
 678 ; AVX:       # %bb.0:
 679 ; AVX-NEXT:    vmulss %xmm1, %xmm0, %xmm0
 680 ; AVX-NEXT:    ret{{[l|q]}}
 681   %vprod = fmul <4 x float> %a, %b ; packed a * b
 682   %res = shufflevector <4 x float> %vprod, <4 x float> %a, <4 x i32> <i32 0, i32 5, i32 6, i32 7> ; lane 0 from %vprod, lanes 1-3 from %a
 683   ret <4 x float> %res
 684 }
 685
 686 define <4 x float> @insert_test_div_ss(<4 x float> %a, <4 x float> %b) {
 687 ; SSE-LABEL: insert_test_div_ss:
 688 ; SSE:       # %bb.0:
 689 ; SSE-NEXT:    divss %xmm1, %xmm0
 690 ; SSE-NEXT:    ret{{[l|q]}}
 691 ;
 692 ; AVX-LABEL: insert_test_div_ss:
 693 ; AVX:       # %bb.0:
 694 ; AVX-NEXT:    vdivss %xmm1, %xmm0, %xmm0
 695 ; AVX-NEXT:    ret{{[l|q]}}
 696   %vquot = fdiv <4 x float> %a, %b ; packed a / b
 697   %res = shufflevector <4 x float> %vquot, <4 x float> %a, <4 x i32> <i32 0, i32 5, i32 6, i32 7> ; lane 0 from %vquot, lanes 1-3 from %a
 698   ret <4 x float> %res
 699 }
 700
 701 define <2 x double> @insert_test_add_sd(<2 x double> %a, <2 x double> %b) {
 702 ; SSE-LABEL: insert_test_add_sd:
 703 ; SSE:       # %bb.0:
 704 ; SSE-NEXT:    addsd %xmm1, %xmm0
 705 ; SSE-NEXT:    ret{{[l|q]}}
 706 ;
 707 ; AVX-LABEL: insert_test_add_sd:
 708 ; AVX:       # %bb.0:
 709 ; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
 710 ; AVX-NEXT:    ret{{[l|q]}}
 711   %vsum = fadd <2 x double> %a, %b ; packed a + b
 712   %res = shufflevector <2 x double> %vsum, <2 x double> %a, <2 x i32> <i32 0, i32 3> ; lane 0 from %vsum, lane 1 from %a
 713   ret <2 x double> %res
 714 }
 715
 716 define <2 x double> @insert_test_sub_sd(<2 x double> %a, <2 x double> %b) {
 717 ; SSE-LABEL: insert_test_sub_sd:
 718 ; SSE:       # %bb.0:
 719 ; SSE-NEXT:    subsd %xmm1, %xmm0
 720 ; SSE-NEXT:    ret{{[l|q]}}
 721 ;
 722 ; AVX-LABEL: insert_test_sub_sd:
 723 ; AVX:       # %bb.0:
 724 ; AVX-NEXT:    vsubsd %xmm1, %xmm0, %xmm0
 725 ; AVX-NEXT:    ret{{[l|q]}}
 726   %vdiff = fsub <2 x double> %a, %b ; packed a - b
 727   %res = shufflevector <2 x double> %vdiff, <2 x double> %a, <2 x i32> <i32 0, i32 3> ; lane 0 from %vdiff, lane 1 from %a
 728   ret <2 x double> %res
 729 }
 730
 731 define <2 x double> @insert_test_mul_sd(<2 x double> %a, <2 x double> %b) {
 732 ; SSE-LABEL: insert_test_mul_sd:
 733 ; SSE:       # %bb.0:
 734 ; SSE-NEXT:    mulsd %xmm1, %xmm0
 735 ; SSE-NEXT:    ret{{[l|q]}}
 736 ;
 737 ; AVX-LABEL: insert_test_mul_sd:
 738 ; AVX:       # %bb.0:
 739 ; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
 740 ; AVX-NEXT:    ret{{[l|q]}}
 741   %vprod = fmul <2 x double> %a, %b ; packed a * b
 742   %res = shufflevector <2 x double> %vprod, <2 x double> %a, <2 x i32> <i32 0, i32 3> ; lane 0 from %vprod, lane 1 from %a
 743   ret <2 x double> %res
 744 }
 745
 746 define <2 x double> @insert_test_div_sd(<2 x double> %a, <2 x double> %b) {
 747 ; SSE-LABEL: insert_test_div_sd:
 748 ; SSE:       # %bb.0:
 749 ; SSE-NEXT:    divsd %xmm1, %xmm0
 750 ; SSE-NEXT:    ret{{[l|q]}}
 751 ;
 752 ; AVX-LABEL: insert_test_div_sd:
 753 ; AVX:       # %bb.0:
 754 ; AVX-NEXT:    vdivsd %xmm1, %xmm0, %xmm0
 755 ; AVX-NEXT:    ret{{[l|q]}}
 756   %vquot = fdiv <2 x double> %a, %b ; packed a / b
 757   %res = shufflevector <2 x double> %vquot, <2 x double> %a, <2 x i32> <i32 0, i32 3> ; lane 0 from %vquot, lane 1 from %a
 758   ret <2 x double> %res
 759 }
 760
 761 define <4 x float> @insert_test2_add_ss(<4 x float> %a, <4 x float> %b) {
 762 ; SSE-LABEL: insert_test2_add_ss:
 763 ; SSE:       # %bb.0:
 764 ; SSE-NEXT:    addss %xmm0, %xmm1
 765 ; SSE-NEXT:    movaps %xmm1, %xmm0
 766 ; SSE-NEXT:    ret{{[l|q]}}
 767 ;
 768 ; AVX-LABEL: insert_test2_add_ss:
 769 ; AVX:       # %bb.0:
 770 ; AVX-NEXT:    vaddss %xmm0, %xmm1, %xmm0
 771 ; AVX-NEXT:    ret{{[l|q]}}
 772   %vsum = fadd <4 x float> %b, %a ; packed b + a
 773   %res = shufflevector <4 x float> %vsum, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 6, i32 7> ; lane 0 from %vsum, lanes 1-3 from %b
 774   ret <4 x float> %res
 775 }
 776
 777 define <4 x float> @insert_test2_sub_ss(<4 x float> %a, <4 x float> %b) {
 778 ; SSE-LABEL: insert_test2_sub_ss:
 779 ; SSE:       # %bb.0:
 780 ; SSE-NEXT:    subss %xmm0, %xmm1
 781 ; SSE-NEXT:    movaps %xmm1, %xmm0
 782 ; SSE-NEXT:    ret{{[l|q]}}
 783 ;
 784 ; AVX-LABEL: insert_test2_sub_ss:
 785 ; AVX:       # %bb.0:
 786 ; AVX-NEXT:    vsubss %xmm0, %xmm1, %xmm0
 787 ; AVX-NEXT:    ret{{[l|q]}}
 788   %vdiff = fsub <4 x float> %b, %a ; packed b - a
 789   %res = shufflevector <4 x float> %vdiff, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 6, i32 7> ; lane 0 from %vdiff, lanes 1-3 from %b
 790   ret <4 x float> %res
 791 }
 792
 793 define <4 x float> @insert_test2_mul_ss(<4 x float> %a, <4 x float> %b) {
 794 ; SSE-LABEL: insert_test2_mul_ss:
 795 ; SSE:       # %bb.0:
 796 ; SSE-NEXT:    mulss %xmm0, %xmm1
 797 ; SSE-NEXT:    movaps %xmm1, %xmm0
 798 ; SSE-NEXT:    ret{{[l|q]}}
 799 ;
 800 ; AVX-LABEL: insert_test2_mul_ss:
 801 ; AVX:       # %bb.0:
 802 ; AVX-NEXT:    vmulss %xmm0, %xmm1, %xmm0
 803 ; AVX-NEXT:    ret{{[l|q]}}
 804   %vprod = fmul <4 x float> %b, %a ; packed b * a
 805   %res = shufflevector <4 x float> %vprod, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 6, i32 7> ; lane 0 from %vprod, lanes 1-3 from %b
 806   ret <4 x float> %res
 807 }
 808
 809 define <4 x float> @insert_test2_div_ss(<4 x float> %a, <4 x float> %b) {
 810 ; SSE-LABEL: insert_test2_div_ss:
 811 ; SSE:       # %bb.0:
 812 ; SSE-NEXT:    divss %xmm0, %xmm1
 813 ; SSE-NEXT:    movaps %xmm1, %xmm0
 814 ; SSE-NEXT:    ret{{[l|q]}}
 815 ;
 816 ; AVX-LABEL: insert_test2_div_ss:
 817 ; AVX:       # %bb.0:
 818 ; AVX-NEXT:    vdivss %xmm0, %xmm1, %xmm0
 819 ; AVX-NEXT:    ret{{[l|q]}}
 820   %vquot = fdiv <4 x float> %b, %a ; packed b / a
 821   %res = shufflevector <4 x float> %vquot, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 6, i32 7> ; lane 0 from %vquot, lanes 1-3 from %b
 822   ret <4 x float> %res
 823 }
 824
 825 define <2 x double> @insert_test2_add_sd(<2 x double> %a, <2 x double> %b) {
 826 ; SSE-LABEL: insert_test2_add_sd:
 827 ; SSE:       # %bb.0:
 828 ; SSE-NEXT:    addsd %xmm0, %xmm1
 829 ; SSE-NEXT:    movapd %xmm1, %xmm0
 830 ; SSE-NEXT:    ret{{[l|q]}}
 831 ;
 832 ; AVX-LABEL: insert_test2_add_sd:
 833 ; AVX:       # %bb.0:
 834 ; AVX-NEXT:    vaddsd %xmm0, %xmm1, %xmm0
 835 ; AVX-NEXT:    ret{{[l|q]}}
 836   %vsum = fadd <2 x double> %b, %a ; packed b + a
 837   %res = shufflevector <2 x double> %vsum, <2 x double> %b, <2 x i32> <i32 0, i32 3> ; lane 0 from %vsum, lane 1 from %b
 838   ret <2 x double> %res
 839 }
 840
 841 define <2 x double> @insert_test2_sub_sd(<2 x double> %a, <2 x double> %b) {
 842 ; SSE-LABEL: insert_test2_sub_sd:
 843 ; SSE:       # %bb.0:
 844 ; SSE-NEXT:    subsd %xmm0, %xmm1
 845 ; SSE-NEXT:    movapd %xmm1, %xmm0
 846 ; SSE-NEXT:    ret{{[l|q]}}
 847 ;
 848 ; AVX-LABEL: insert_test2_sub_sd:
 849 ; AVX:       # %bb.0:
 850 ; AVX-NEXT:    vsubsd %xmm0, %xmm1, %xmm0
 851 ; AVX-NEXT:    ret{{[l|q]}}
 852   %vdiff = fsub <2 x double> %b, %a ; packed b - a
 853   %res = shufflevector <2 x double> %vdiff, <2 x double> %b, <2 x i32> <i32 0, i32 3> ; lane 0 from %vdiff, lane 1 from %b
 854   ret <2 x double> %res
 855 }
 856
 857 define <2 x double> @insert_test2_mul_sd(<2 x double> %a, <2 x double> %b) {
 858 ; SSE-LABEL: insert_test2_mul_sd:
 859 ; SSE:       # %bb.0:
 860 ; SSE-NEXT:    mulsd %xmm0, %xmm1
 861 ; SSE-NEXT:    movapd %xmm1, %xmm0
 862 ; SSE-NEXT:    ret{{[l|q]}}
 863 ;
 864 ; AVX-LABEL: insert_test2_mul_sd:
 865 ; AVX:       # %bb.0:
 866 ; AVX-NEXT:    vmulsd %xmm0, %xmm1, %xmm0
 867 ; AVX-NEXT:    ret{{[l|q]}}
 868   %vprod = fmul <2 x double> %b, %a ; packed b * a
 869   %res = shufflevector <2 x double> %vprod, <2 x double> %b, <2 x i32> <i32 0, i32 3> ; lane 0 from %vprod, lane 1 from %b
 870   ret <2 x double> %res
 871 }
 872
 873 define <2 x double> @insert_test2_div_sd(<2 x double> %a, <2 x double> %b) {
 874 ; SSE-LABEL: insert_test2_div_sd:
 875 ; SSE:       # %bb.0:
 876 ; SSE-NEXT:    divsd %xmm0, %xmm1
 877 ; SSE-NEXT:    movapd %xmm1, %xmm0
 878 ; SSE-NEXT:    ret{{[l|q]}}
 879 ;
 880 ; AVX-LABEL: insert_test2_div_sd:
 881 ; AVX:       # %bb.0:
 882 ; AVX-NEXT:    vdivsd %xmm0, %xmm1, %xmm0
 883 ; AVX-NEXT:    ret{{[l|q]}}
 884   %vquot = fdiv <2 x double> %b, %a ; packed b / a
 885   %res = shufflevector <2 x double> %vquot, <2 x double> %b, <2 x i32> <i32 0, i32 3> ; lane 0 from %vquot, lane 1 from %b
 886   ret <2 x double> %res
 887 }
 888
 889 define <4 x float> @insert_test3_add_ss(<4 x float> %a, <4 x float> %b) {
 890 ; SSE-LABEL: insert_test3_add_ss:
 891 ; SSE:       # %bb.0:
 892 ; SSE-NEXT:    addss %xmm1, %xmm0
 893 ; SSE-NEXT:    ret{{[l|q]}}
 894 ;
 895 ; AVX-LABEL: insert_test3_add_ss:
 896 ; AVX:       # %bb.0:
 897 ; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0
 898 ; AVX-NEXT:    ret{{[l|q]}}
 899   %vsum = fadd <4 x float> %a, %b ; packed a + b
 900   %res = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> %a, <4 x float> %vsum ; lane 0 from %vsum, lanes 1-3 from %a
 901   ret <4 x float> %res
 902 }
 903
 904 define <4 x float> @insert_test3_sub_ss(<4 x float> %a, <4 x float> %b) { ; result = <a[0] - b[0], a[1], a[2], a[3]> built with a constant-mask select; checks expect a single scalar subtract
 905 ; SSE-LABEL: insert_test3_sub_ss:
 906 ; SSE:       # %bb.0:
 907 ; SSE-NEXT:    subss %xmm1, %xmm0
 908 ; SSE-NEXT:    ret{{[l|q]}}
 909 ;
 910 ; AVX-LABEL: insert_test3_sub_ss:
 911 ; AVX:       # %bb.0:
 912 ; AVX-NEXT:    vsubss %xmm1, %xmm0, %xmm0
 913 ; AVX-NEXT:    ret{{[l|q]}}
 914   %1 = fsub <4 x float> %a, %b ; packed subtract over all four lanes
 915   %2 = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> %a, <4 x float> %1 ; false in lane 0 picks %1, true in lanes 1-3 picks %a
 916   ret <4 x float> %2
 917 }
 918
 919 define <4 x float> @insert_test3_mul_ss(<4 x float> %a, <4 x float> %b) { ; result = <a[0] * b[0], a[1], a[2], a[3]> built with a constant-mask select; checks expect a single scalar multiply
 920 ; SSE-LABEL: insert_test3_mul_ss:
 921 ; SSE:       # %bb.0:
 922 ; SSE-NEXT:    mulss %xmm1, %xmm0
 923 ; SSE-NEXT:    ret{{[l|q]}}
 924 ;
 925 ; AVX-LABEL: insert_test3_mul_ss:
 926 ; AVX:       # %bb.0:
 927 ; AVX-NEXT:    vmulss %xmm1, %xmm0, %xmm0
 928 ; AVX-NEXT:    ret{{[l|q]}}
 929   %1 = fmul <4 x float> %a, %b ; packed multiply over all four lanes
 930   %2 = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> %a, <4 x float> %1 ; false in lane 0 picks %1, true in lanes 1-3 picks %a
 931   ret <4 x float> %2
 932 }
 933
 934 define <4 x float> @insert_test3_div_ss(<4 x float> %a, <4 x float> %b) { ; result = <a[0] / b[0], a[1], a[2], a[3]> built with a constant-mask select; checks expect a single scalar divide
 935 ; SSE-LABEL: insert_test3_div_ss:
 936 ; SSE:       # %bb.0:
 937 ; SSE-NEXT:    divss %xmm1, %xmm0
 938 ; SSE-NEXT:    ret{{[l|q]}}
 939 ;
 940 ; AVX-LABEL: insert_test3_div_ss:
 941 ; AVX:       # %bb.0:
 942 ; AVX-NEXT:    vdivss %xmm1, %xmm0, %xmm0
 943 ; AVX-NEXT:    ret{{[l|q]}}
 944   %1 = fdiv <4 x float> %a, %b ; packed divide over all four lanes
 945   %2 = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> %a, <4 x float> %1 ; false in lane 0 picks %1, true in lanes 1-3 picks %a
 946   ret <4 x float> %2
 947 }
 948
 949 define <2 x double> @insert_test3_add_sd(<2 x double> %a, <2 x double> %b) { ; result = <a[0] + b[0], a[1]> built with a constant-mask select; checks expect a single scalar add
 950 ; SSE-LABEL: insert_test3_add_sd:
 951 ; SSE:       # %bb.0:
 952 ; SSE-NEXT:    addsd %xmm1, %xmm0
 953 ; SSE-NEXT:    ret{{[l|q]}}
 954 ;
 955 ; AVX-LABEL: insert_test3_add_sd:
 956 ; AVX:       # %bb.0:
 957 ; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
 958 ; AVX-NEXT:    ret{{[l|q]}}
 959   %1 = fadd <2 x double> %a, %b ; packed add over both lanes
 960   %2 = select <2 x i1> <i1 false, i1 true>, <2 x double> %a, <2 x double> %1 ; false in lane 0 picks %1, true in lane 1 picks %a
 961   ret <2 x double> %2
 962 }
 963
 964 define <2 x double> @insert_test3_sub_sd(<2 x double> %a, <2 x double> %b) { ; result = <a[0] - b[0], a[1]> built with a constant-mask select; checks expect a single scalar subtract
 965 ; SSE-LABEL: insert_test3_sub_sd:
 966 ; SSE:       # %bb.0:
 967 ; SSE-NEXT:    subsd %xmm1, %xmm0
 968 ; SSE-NEXT:    ret{{[l|q]}}
 969 ;
 970 ; AVX-LABEL: insert_test3_sub_sd:
 971 ; AVX:       # %bb.0:
 972 ; AVX-NEXT:    vsubsd %xmm1, %xmm0, %xmm0
 973 ; AVX-NEXT:    ret{{[l|q]}}
 974   %1 = fsub <2 x double> %a, %b ; packed subtract over both lanes
 975   %2 = select <2 x i1> <i1 false, i1 true>, <2 x double> %a, <2 x double> %1 ; false in lane 0 picks %1, true in lane 1 picks %a
 976   ret <2 x double> %2
 977 }
 978
 979 define <2 x double> @insert_test3_mul_sd(<2 x double> %a, <2 x double> %b) { ; result = <a[0] * b[0], a[1]> built with a constant-mask select; checks expect a single scalar multiply
 980 ; SSE-LABEL: insert_test3_mul_sd:
 981 ; SSE:       # %bb.0:
 982 ; SSE-NEXT:    mulsd %xmm1, %xmm0
 983 ; SSE-NEXT:    ret{{[l|q]}}
 984 ;
 985 ; AVX-LABEL: insert_test3_mul_sd:
 986 ; AVX:       # %bb.0:
 987 ; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
 988 ; AVX-NEXT:    ret{{[l|q]}}
 989   %1 = fmul <2 x double> %a, %b ; packed multiply over both lanes
 990   %2 = select <2 x i1> <i1 false, i1 true>, <2 x double> %a, <2 x double> %1 ; false in lane 0 picks %1, true in lane 1 picks %a
 991   ret <2 x double> %2
 992 }
 993
 994 define <2 x double> @insert_test3_div_sd(<2 x double> %a, <2 x double> %b) { ; result = <a[0] / b[0], a[1]> built with a constant-mask select; checks expect a single scalar divide
 995 ; SSE-LABEL: insert_test3_div_sd:
 996 ; SSE:       # %bb.0:
 997 ; SSE-NEXT:    divsd %xmm1, %xmm0
 998 ; SSE-NEXT:    ret{{[l|q]}}
 999 ;
1000 ; AVX-LABEL: insert_test3_div_sd:
1001 ; AVX:       # %bb.0:
1002 ; AVX-NEXT:    vdivsd %xmm1, %xmm0, %xmm0
1003 ; AVX-NEXT:    ret{{[l|q]}}
1004   %1 = fdiv <2 x double> %a, %b ; packed divide over both lanes
1005   %2 = select <2 x i1> <i1 false, i1 true>, <2 x double> %a, <2 x double> %1 ; false in lane 0 picks %1, true in lane 1 picks %a
1006   ret <2 x double> %2
1007 }
1008
1009 define <4 x float> @insert_test4_add_ss(<4 x float> %a, <4 x float> %b) { ; result = <b[0] + a[0], b[1], b[2], b[3]> via select; checks expect one scalar add (plus a register copy on the SSE runs)
1010 ; SSE-LABEL: insert_test4_add_ss:
1011 ; SSE:       # %bb.0:
1012 ; SSE-NEXT:    addss %xmm0, %xmm1
1013 ; SSE-NEXT:    movaps %xmm1, %xmm0
1014 ; SSE-NEXT:    ret{{[l|q]}}
1015 ;
1016 ; AVX-LABEL: insert_test4_add_ss:
1017 ; AVX:       # %bb.0:
1018 ; AVX-NEXT:    vaddss %xmm0, %xmm1, %xmm0
1019 ; AVX-NEXT:    ret{{[l|q]}}
1020   %1 = fadd <4 x float> %b, %a ; packed add over all four lanes
1021   %2 = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> %b, <4 x float> %1 ; false in lane 0 picks %1, true in lanes 1-3 picks %b
1022   ret <4 x float> %2
1023 }
1024
1025 define <4 x float> @insert_test4_sub_ss(<4 x float> %a, <4 x float> %b) { ; result = <b[0] - a[0], b[1], b[2], b[3]> via select; checks expect one scalar subtract (plus a register copy on the SSE runs)
1026 ; SSE-LABEL: insert_test4_sub_ss:
1027 ; SSE:       # %bb.0:
1028 ; SSE-NEXT:    subss %xmm0, %xmm1
1029 ; SSE-NEXT:    movaps %xmm1, %xmm0
1030 ; SSE-NEXT:    ret{{[l|q]}}
1031 ;
1032 ; AVX-LABEL: insert_test4_sub_ss:
1033 ; AVX:       # %bb.0:
1034 ; AVX-NEXT:    vsubss %xmm0, %xmm1, %xmm0
1035 ; AVX-NEXT:    ret{{[l|q]}}
1036   %1 = fsub <4 x float> %b, %a ; packed subtract over all four lanes
1037   %2 = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> %b, <4 x float> %1 ; false in lane 0 picks %1, true in lanes 1-3 picks %b
1038   ret <4 x float> %2
1039 }
1040
1041 define <4 x float> @insert_test4_mul_ss(<4 x float> %a, <4 x float> %b) { ; result = <b[0] * a[0], b[1], b[2], b[3]> via select; checks expect one scalar multiply (plus a register copy on the SSE runs)
1042 ; SSE-LABEL: insert_test4_mul_ss:
1043 ; SSE:       # %bb.0:
1044 ; SSE-NEXT:    mulss %xmm0, %xmm1
1045 ; SSE-NEXT:    movaps %xmm1, %xmm0
1046 ; SSE-NEXT:    ret{{[l|q]}}
1047 ;
1048 ; AVX-LABEL: insert_test4_mul_ss:
1049 ; AVX:       # %bb.0:
1050 ; AVX-NEXT:    vmulss %xmm0, %xmm1, %xmm0
1051 ; AVX-NEXT:    ret{{[l|q]}}
1052   %1 = fmul <4 x float> %b, %a ; packed multiply over all four lanes
1053   %2 = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> %b, <4 x float> %1 ; false in lane 0 picks %1, true in lanes 1-3 picks %b
1054   ret <4 x float> %2
1055 }
1056
1057 define <4 x float> @insert_test4_div_ss(<4 x float> %a, <4 x float> %b) { ; result = <b[0] / a[0], b[1], b[2], b[3]> via select; checks expect one scalar divide (plus a register copy on the SSE runs)
1058 ; SSE-LABEL: insert_test4_div_ss:
1059 ; SSE:       # %bb.0:
1060 ; SSE-NEXT:    divss %xmm0, %xmm1
1061 ; SSE-NEXT:    movaps %xmm1, %xmm0
1062 ; SSE-NEXT:    ret{{[l|q]}}
1063 ;
1064 ; AVX-LABEL: insert_test4_div_ss:
1065 ; AVX:       # %bb.0:
1066 ; AVX-NEXT:    vdivss %xmm0, %xmm1, %xmm0
1067 ; AVX-NEXT:    ret{{[l|q]}}
1068   %1 = fdiv <4 x float> %b, %a ; packed divide over all four lanes
1069   %2 = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> %b, <4 x float> %1 ; false in lane 0 picks %1, true in lanes 1-3 picks %b
1070   ret <4 x float> %2
1071 }
1072
1073 define <2 x double> @insert_test4_add_sd(<2 x double> %a, <2 x double> %b) { ; result = <b[0] + a[0], b[1]> via select; checks expect one scalar add (plus a register copy on the SSE runs)
1074 ; SSE-LABEL: insert_test4_add_sd:
1075 ; SSE:       # %bb.0:
1076 ; SSE-NEXT:    addsd %xmm0, %xmm1
1077 ; SSE-NEXT:    movapd %xmm1, %xmm0
1078 ; SSE-NEXT:    ret{{[l|q]}}
1079 ;
1080 ; AVX-LABEL: insert_test4_add_sd:
1081 ; AVX:       # %bb.0:
1082 ; AVX-NEXT:    vaddsd %xmm0, %xmm1, %xmm0
1083 ; AVX-NEXT:    ret{{[l|q]}}
1084   %1 = fadd <2 x double> %b, %a ; packed add over both lanes
1085   %2 = select <2 x i1> <i1 false, i1 true>, <2 x double> %b, <2 x double> %1 ; false in lane 0 picks %1, true in lane 1 picks %b
1086   ret <2 x double> %2
1087 }
1088
1089 define <2 x double> @insert_test4_sub_sd(<2 x double> %a, <2 x double> %b) { ; result = <b[0] - a[0], b[1]> via select; checks expect one scalar subtract (plus a register copy on the SSE runs)
1090 ; SSE-LABEL: insert_test4_sub_sd:
1091 ; SSE:       # %bb.0:
1092 ; SSE-NEXT:    subsd %xmm0, %xmm1
1093 ; SSE-NEXT:    movapd %xmm1, %xmm0
1094 ; SSE-NEXT:    ret{{[l|q]}}
1095 ;
1096 ; AVX-LABEL: insert_test4_sub_sd:
1097 ; AVX:       # %bb.0:
1098 ; AVX-NEXT:    vsubsd %xmm0, %xmm1, %xmm0
1099 ; AVX-NEXT:    ret{{[l|q]}}
1100   %1 = fsub <2 x double> %b, %a ; packed subtract over both lanes
1101   %2 = select <2 x i1> <i1 false, i1 true>, <2 x double> %b, <2 x double> %1 ; false in lane 0 picks %1, true in lane 1 picks %b
1102   ret <2 x double> %2
1103 }
1104
1105 define <2 x double> @insert_test4_mul_sd(<2 x double> %a, <2 x double> %b) { ; result = <b[0] * a[0], b[1]> via select; checks expect one scalar multiply (plus a register copy on the SSE runs)
1106 ; SSE-LABEL: insert_test4_mul_sd:
1107 ; SSE:       # %bb.0:
1108 ; SSE-NEXT:    mulsd %xmm0, %xmm1
1109 ; SSE-NEXT:    movapd %xmm1, %xmm0
1110 ; SSE-NEXT:    ret{{[l|q]}}
1111 ;
1112 ; AVX-LABEL: insert_test4_mul_sd:
1113 ; AVX:       # %bb.0:
1114 ; AVX-NEXT:    vmulsd %xmm0, %xmm1, %xmm0
1115 ; AVX-NEXT:    ret{{[l|q]}}
1116   %1 = fmul <2 x double> %b, %a ; packed multiply over both lanes
1117   %2 = select <2 x i1> <i1 false, i1 true>, <2 x double> %b, <2 x double> %1 ; false in lane 0 picks %1, true in lane 1 picks %b
1118   ret <2 x double> %2
1119 }
1120
1121 define <2 x double> @insert_test4_div_sd(<2 x double> %a, <2 x double> %b) { ; result = <b[0] / a[0], b[1]> via select; checks expect one scalar divide (plus a register copy on the SSE runs)
1122 ; SSE-LABEL: insert_test4_div_sd:
1123 ; SSE:       # %bb.0:
1124 ; SSE-NEXT:    divsd %xmm0, %xmm1
1125 ; SSE-NEXT:    movapd %xmm1, %xmm0
1126 ; SSE-NEXT:    ret{{[l|q]}}
1127 ;
1128 ; AVX-LABEL: insert_test4_div_sd:
1129 ; AVX:       # %bb.0:
1130 ; AVX-NEXT:    vdivsd %xmm0, %xmm1, %xmm0
1131 ; AVX-NEXT:    ret{{[l|q]}}
1132   %1 = fdiv <2 x double> %b, %a ; packed divide over both lanes
1133   %2 = select <2 x i1> <i1 false, i1 true>, <2 x double> %b, <2 x double> %1 ; false in lane 0 picks %1, true in lane 1 picks %b
1134   ret <2 x double> %2
1135 }
1136
1137 define <4 x float> @insert_test5_add_ss(<4 x float> %a, <4 x float> %b) { ; result = <b[0] + a[0], a[1], a[2], a[3]>; checks expect a single scalar add writing %xmm0 with no extra copy
1138 ; SSE-LABEL: insert_test5_add_ss:
1139 ; SSE:       # %bb.0:
1140 ; SSE-NEXT:    addss %xmm1, %xmm0
1141 ; SSE-NEXT:    ret{{[l|q]}}
1142 ;
1143 ; AVX-LABEL: insert_test5_add_ss:
1144 ; AVX:       # %bb.0:
1145 ; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0
1146 ; AVX-NEXT:    ret{{[l|q]}}
1147   %1 = fadd <4 x float> %b, %a ; packed add, operands written as b + a
1148   %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 0, i32 5, i32 6, i32 7> ; mask 0 -> lane 0 of %1, masks 5-7 -> lanes 1-3 of %a
1149   ret <4 x float> %2
1150 }
1151
1152 define <4 x float> @insert_test5_sub_ss(<4 x float> %a, <4 x float> %b) { ; result = <b[0] - a[0], a[1], a[2], a[3]>; checks expect a packed subtract plus a lane-0 merge, not one scalar op (presumably because fsub cannot be commuted - TODO confirm)
1153 ; SSE2-LABEL: insert_test5_sub_ss:
1154 ; SSE2:       # %bb.0:
1155 ; SSE2-NEXT:    subps %xmm0, %xmm1
1156 ; SSE2-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
1157 ; SSE2-NEXT:    ret{{[l|q]}}
1158 ;
1159 ; SSE41-LABEL: insert_test5_sub_ss:
1160 ; SSE41:       # %bb.0:
1161 ; SSE41-NEXT:    subps %xmm0, %xmm1
1162 ; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
1163 ; SSE41-NEXT:    ret{{[l|q]}}
1164 ;
1165 ; AVX-LABEL: insert_test5_sub_ss:
1166 ; AVX:       # %bb.0:
1167 ; AVX-NEXT:    vsubps %xmm0, %xmm1, %xmm1
1168 ; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
1169 ; AVX-NEXT:    ret{{[l|q]}}
1170   %1 = fsub <4 x float> %b, %a ; packed subtract, b - a
1171   %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 0, i32 5, i32 6, i32 7> ; mask 0 -> lane 0 of %1, masks 5-7 -> lanes 1-3 of %a
1172   ret <4 x float> %2
1173 }
1174
1175 define <4 x float> @insert_test5_mul_ss(<4 x float> %a, <4 x float> %b) { ; result = <b[0] * a[0], a[1], a[2], a[3]>; checks expect a single scalar multiply writing %xmm0 with no extra copy
1176 ; SSE-LABEL: insert_test5_mul_ss:
1177 ; SSE:       # %bb.0:
1178 ; SSE-NEXT:    mulss %xmm1, %xmm0
1179 ; SSE-NEXT:    ret{{[l|q]}}
1180 ;
1181 ; AVX-LABEL: insert_test5_mul_ss:
1182 ; AVX:       # %bb.0:
1183 ; AVX-NEXT:    vmulss %xmm1, %xmm0, %xmm0
1184 ; AVX-NEXT:    ret{{[l|q]}}
1185   %1 = fmul <4 x float> %b, %a ; packed multiply, operands written as b * a
1186   %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 0, i32 5, i32 6, i32 7> ; mask 0 -> lane 0 of %1, masks 5-7 -> lanes 1-3 of %a
1187   ret <4 x float> %2
1188 }
1189
1190 define <4 x float> @insert_test5_div_ss(<4 x float> %a, <4 x float> %b) { ; result = <b[0] / a[0], a[1], a[2], a[3]>; checks expect a packed divide plus a lane-0 merge, not one scalar op (presumably because fdiv cannot be commuted - TODO confirm)
1191 ; SSE2-LABEL: insert_test5_div_ss:
1192 ; SSE2:       # %bb.0:
1193 ; SSE2-NEXT:    divps %xmm0, %xmm1
1194 ; SSE2-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
1195 ; SSE2-NEXT:    ret{{[l|q]}}
1196 ;
1197 ; SSE41-LABEL: insert_test5_div_ss:
1198 ; SSE41:       # %bb.0:
1199 ; SSE41-NEXT:    divps %xmm0, %xmm1
1200 ; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
1201 ; SSE41-NEXT:    ret{{[l|q]}}
1202 ;
1203 ; AVX-LABEL: insert_test5_div_ss:
1204 ; AVX:       # %bb.0:
1205 ; AVX-NEXT:    vdivps %xmm0, %xmm1, %xmm1
1206 ; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
1207 ; AVX-NEXT:    ret{{[l|q]}}
1208   %1 = fdiv <4 x float> %b, %a ; packed divide, b / a
1209   %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 0, i32 5, i32 6, i32 7> ; mask 0 -> lane 0 of %1, masks 5-7 -> lanes 1-3 of %a
1210   ret <4 x float> %2
1211 }
1212
1213 define <2 x double> @insert_test5_add_sd(<2 x double> %a, <2 x double> %b) { ; result = <b[0] + a[0], a[1]>; checks expect a single scalar add writing %xmm0 with no extra copy
1214 ; SSE-LABEL: insert_test5_add_sd:
1215 ; SSE:       # %bb.0:
1216 ; SSE-NEXT:    addsd %xmm1, %xmm0
1217 ; SSE-NEXT:    ret{{[l|q]}}
1218 ;
1219 ; AVX-LABEL: insert_test5_add_sd:
1220 ; AVX:       # %bb.0:
1221 ; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
1222 ; AVX-NEXT:    ret{{[l|q]}}
1223   %1 = fadd <2 x double> %b, %a ; packed add, operands written as b + a
1224   %2 = shufflevector <2 x double> %1, <2 x double> %a, <2 x i32> <i32 0, i32 3> ; mask 0 -> lane 0 of %1, mask 3 -> lane 1 of %a
1225   ret <2 x double> %2
1226 }
1227
1228 define <2 x double> @insert_test5_sub_sd(<2 x double> %a, <2 x double> %b) { ; result = <b[0] - a[0], a[1]>; checks expect a packed subtract plus a lane-0 merge, not one scalar op (presumably because fsub cannot be commuted - TODO confirm)
1229 ; SSE2-LABEL: insert_test5_sub_sd:
1230 ; SSE2:       # %bb.0:
1231 ; SSE2-NEXT:    subpd %xmm0, %xmm1
1232 ; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
1233 ; SSE2-NEXT:    ret{{[l|q]}}
1234 ;
1235 ; SSE41-LABEL: insert_test5_sub_sd:
1236 ; SSE41:       # %bb.0:
1237 ; SSE41-NEXT:    subpd %xmm0, %xmm1
1238 ; SSE41-NEXT:    blendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
1239 ; SSE41-NEXT:    ret{{[l|q]}}
1240 ;
1241 ; AVX-LABEL: insert_test5_sub_sd:
1242 ; AVX:       # %bb.0:
1243 ; AVX-NEXT:    vsubpd %xmm0, %xmm1, %xmm1
1244 ; AVX-NEXT:    vblendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
1245 ; AVX-NEXT:    ret{{[l|q]}}
1246   %1 = fsub <2 x double> %b, %a ; packed subtract, b - a
1247   %2 = shufflevector <2 x double> %1, <2 x double> %a, <2 x i32> <i32 0, i32 3> ; mask 0 -> lane 0 of %1, mask 3 -> lane 1 of %a
1248   ret <2 x double> %2
1249 }
1250
1251 define <2 x double> @insert_test5_mul_sd(<2 x double> %a, <2 x double> %b) { ; result = <b[0] * a[0], a[1]>; checks expect a single scalar multiply writing %xmm0 with no extra copy
1252 ; SSE-LABEL: insert_test5_mul_sd:
1253 ; SSE:       # %bb.0:
1254 ; SSE-NEXT:    mulsd %xmm1, %xmm0
1255 ; SSE-NEXT:    ret{{[l|q]}}
1256 ;
1257 ; AVX-LABEL: insert_test5_mul_sd:
1258 ; AVX:       # %bb.0:
1259 ; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
1260 ; AVX-NEXT:    ret{{[l|q]}}
1261   %1 = fmul <2 x double> %b, %a ; packed multiply, operands written as b * a
1262   %2 = shufflevector <2 x double> %1, <2 x double> %a, <2 x i32> <i32 0, i32 3> ; mask 0 -> lane 0 of %1, mask 3 -> lane 1 of %a
1263   ret <2 x double> %2
1264 }
1265
1266 define <2 x double> @insert_test5_div_sd(<2 x double> %a, <2 x double> %b) { ; result = <b[0] / a[0], a[1]>; checks expect a packed divide plus a lane-0 merge, not one scalar op (presumably because fdiv cannot be commuted - TODO confirm)
1267 ; SSE2-LABEL: insert_test5_div_sd:
1268 ; SSE2:       # %bb.0:
1269 ; SSE2-NEXT:    divpd %xmm0, %xmm1
1270 ; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
1271 ; SSE2-NEXT:    ret{{[l|q]}}
1272 ;
1273 ; SSE41-LABEL: insert_test5_div_sd:
1274 ; SSE41:       # %bb.0:
1275 ; SSE41-NEXT:    divpd %xmm0, %xmm1
1276 ; SSE41-NEXT:    blendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
1277 ; SSE41-NEXT:    ret{{[l|q]}}
1278 ;
1279 ; AVX-LABEL: insert_test5_div_sd:
1280 ; AVX:       # %bb.0:
1281 ; AVX-NEXT:    vdivpd %xmm0, %xmm1, %xmm1
1282 ; AVX-NEXT:    vblendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
1283 ; AVX-NEXT:    ret{{[l|q]}}
1284   %1 = fdiv <2 x double> %b, %a ; packed divide, b / a
1285   %2 = shufflevector <2 x double> %1, <2 x double> %a, <2 x i32> <i32 0, i32 3> ; mask 0 -> lane 0 of %1, mask 3 -> lane 1 of %a
1286   ret <2 x double> %2
1287 }
1288
1289 define <4 x float> @add_ss_mask(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) { ; lane 0 = (mask bit 0 ? a[0] + b[0] : c[0]), lanes 1-3 from %a; checks expect branches without AVX-512 and a %k1-masked vaddss with it
1290 ; X86-SSE2-LABEL: add_ss_mask:
1291 ; X86-SSE2:       # %bb.0:
1292 ; X86-SSE2-NEXT:    testb $1, {{[0-9]+}}(%esp)
1293 ; X86-SSE2-NEXT:    jne .LBB70_1
1294 ; X86-SSE2-NEXT:  # %bb.2:
1295 ; X86-SSE2-NEXT:    movss {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3]
1296 ; X86-SSE2-NEXT:    retl
1297 ; X86-SSE2-NEXT:  .LBB70_1:
1298 ; X86-SSE2-NEXT:    addss %xmm0, %xmm1
1299 ; X86-SSE2-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
1300 ; X86-SSE2-NEXT:    retl
1301 ;
1302 ; X86-SSE41-LABEL: add_ss_mask:
1303 ; X86-SSE41:       # %bb.0:
1304 ; X86-SSE41-NEXT:    testb $1, {{[0-9]+}}(%esp)
1305 ; X86-SSE41-NEXT:    jne .LBB70_1
1306 ; X86-SSE41-NEXT:  # %bb.2:
1307 ; X86-SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3]
1308 ; X86-SSE41-NEXT:    retl
1309 ; X86-SSE41-NEXT:  .LBB70_1:
1310 ; X86-SSE41-NEXT:    addss %xmm0, %xmm1
1311 ; X86-SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
1312 ; X86-SSE41-NEXT:    retl
1313 ;
1314 ; X86-AVX1-LABEL: add_ss_mask:
1315 ; X86-AVX1:       # %bb.0:
1316 ; X86-AVX1-NEXT:    testb $1, {{[0-9]+}}(%esp)
1317 ; X86-AVX1-NEXT:    je .LBB70_2
1318 ; X86-AVX1-NEXT:  # %bb.1:
1319 ; X86-AVX1-NEXT:    vaddss %xmm1, %xmm0, %xmm2
1320 ; X86-AVX1-NEXT:  .LBB70_2:
1321 ; X86-AVX1-NEXT:    vblendps {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3]
1322 ; X86-AVX1-NEXT:    retl
1323 ;
1324 ; X86-AVX512-LABEL: add_ss_mask:
1325 ; X86-AVX512:       # %bb.0:
1326 ; X86-AVX512-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
1327 ; X86-AVX512-NEXT:    kmovw %eax, %k1
1328 ; X86-AVX512-NEXT:    vaddss %xmm1, %xmm0, %xmm2 {%k1}
1329 ; X86-AVX512-NEXT:    vmovaps %xmm2, %xmm0
1330 ; X86-AVX512-NEXT:    retl
1331 ;
1332 ; X64-SSE2-LABEL: add_ss_mask:
1333 ; X64-SSE2:       # %bb.0:
1334 ; X64-SSE2-NEXT:    testb $1, %dil
1335 ; X64-SSE2-NEXT:    jne .LBB70_1
1336 ; X64-SSE2-NEXT:  # %bb.2:
1337 ; X64-SSE2-NEXT:    movss {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3]
1338 ; X64-SSE2-NEXT:    retq
1339 ; X64-SSE2-NEXT:  .LBB70_1:
1340 ; X64-SSE2-NEXT:    addss %xmm0, %xmm1
1341 ; X64-SSE2-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
1342 ; X64-SSE2-NEXT:    retq
1343 ;
1344 ; X64-SSE41-LABEL: add_ss_mask:
1345 ; X64-SSE41:       # %bb.0:
1346 ; X64-SSE41-NEXT:    testb $1, %dil
1347 ; X64-SSE41-NEXT:    jne .LBB70_1
1348 ; X64-SSE41-NEXT:  # %bb.2:
1349 ; X64-SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3]
1350 ; X64-SSE41-NEXT:    retq
1351 ; X64-SSE41-NEXT:  .LBB70_1:
1352 ; X64-SSE41-NEXT:    addss %xmm0, %xmm1
1353 ; X64-SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
1354 ; X64-SSE41-NEXT:    retq
1355 ;
1356 ; X64-AVX1-LABEL: add_ss_mask:
1357 ; X64-AVX1:       # %bb.0:
1358 ; X64-AVX1-NEXT:    testb $1, %dil
1359 ; X64-AVX1-NEXT:    je .LBB70_2
1360 ; X64-AVX1-NEXT:  # %bb.1:
1361 ; X64-AVX1-NEXT:    vaddss %xmm1, %xmm0, %xmm2
1362 ; X64-AVX1-NEXT:  .LBB70_2:
1363 ; X64-AVX1-NEXT:    vblendps {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3]
1364 ; X64-AVX1-NEXT:    retq
1365 ;
1366 ; X64-AVX512-LABEL: add_ss_mask:
1367 ; X64-AVX512:       # %bb.0:
1368 ; X64-AVX512-NEXT:    kmovw %edi, %k1
1369 ; X64-AVX512-NEXT:    vaddss %xmm1, %xmm0, %xmm2 {%k1}
1370 ; X64-AVX512-NEXT:    vmovaps %xmm2, %xmm0
1371 ; X64-AVX512-NEXT:    retq
1372   %1 = extractelement <4 x float> %a, i64 0 ; a[0]
1373   %2 = extractelement <4 x float> %b, i64 0 ; b[0]
1374   %3 = fadd float %1, %2 ; scalar sum a[0] + b[0]
1375   %4 = extractelement <4 x float> %c, i32 0 ; c[0], the pass-through value used when the mask bit is clear
1376   %5 = bitcast i8 %mask to <8 x i1> ; view the mask byte as eight 1-bit lanes
1377   %6 = extractelement <8 x i1> %5, i64 0 ; lane 0 of the mask; the checks test it with "testb $1"
1378   %7 = select i1 %6, float %3, float %4 ; sum if the bit is set, otherwise c[0]
1379   %8 = insertelement <4 x float> %a, float %7, i64 0 ; write the chosen scalar into lane 0 of %a
1380   ret <4 x float> %8
1381 }
1382
1383 define <2 x double> @add_sd_mask(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) { ; lane 0 = (mask bit 0 ? a[0] + b[0] : c[0]), lane 1 from %a; checks expect branches without AVX-512 and a %k1-masked vaddsd with it
1384 ; X86-SSE2-LABEL: add_sd_mask:
1385 ; X86-SSE2:       # %bb.0:
1386 ; X86-SSE2-NEXT:    testb $1, {{[0-9]+}}(%esp)
1387 ; X86-SSE2-NEXT:    jne .LBB71_1
1388 ; X86-SSE2-NEXT:  # %bb.2:
1389 ; X86-SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
1390 ; X86-SSE2-NEXT:    retl
1391 ; X86-SSE2-NEXT:  .LBB71_1:
1392 ; X86-SSE2-NEXT:    addsd %xmm0, %xmm1
1393 ; X86-SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
1394 ; X86-SSE2-NEXT:    retl
1395 ;
1396 ; X86-SSE41-LABEL: add_sd_mask:
1397 ; X86-SSE41:       # %bb.0:
1398 ; X86-SSE41-NEXT:    testb $1, {{[0-9]+}}(%esp)
1399 ; X86-SSE41-NEXT:    jne .LBB71_1
1400 ; X86-SSE41-NEXT:  # %bb.2:
1401 ; X86-SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3]
1402 ; X86-SSE41-NEXT:    retl
1403 ; X86-SSE41-NEXT:  .LBB71_1:
1404 ; X86-SSE41-NEXT:    addsd %xmm0, %xmm1
1405 ; X86-SSE41-NEXT:    blendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
1406 ; X86-SSE41-NEXT:    retl
1407 ;
1408 ; X86-AVX1-LABEL: add_sd_mask:
1409 ; X86-AVX1:       # %bb.0:
1410 ; X86-AVX1-NEXT:    testb $1, {{[0-9]+}}(%esp)
1411 ; X86-AVX1-NEXT:    je .LBB71_2
1412 ; X86-AVX1-NEXT:  # %bb.1:
1413 ; X86-AVX1-NEXT:    vaddsd %xmm1, %xmm0, %xmm2
1414 ; X86-AVX1-NEXT:  .LBB71_2:
1415 ; X86-AVX1-NEXT:    vblendpd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
1416 ; X86-AVX1-NEXT:    retl
1417 ;
1418 ; X86-AVX512-LABEL: add_sd_mask:
1419 ; X86-AVX512:       # %bb.0:
1420 ; X86-AVX512-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
1421 ; X86-AVX512-NEXT:    kmovw %eax, %k1
1422 ; X86-AVX512-NEXT:    vaddsd %xmm1, %xmm0, %xmm2 {%k1}
1423 ; X86-AVX512-NEXT:    vmovapd %xmm2, %xmm0
1424 ; X86-AVX512-NEXT:    retl
1425 ;
1426 ; X64-SSE2-LABEL: add_sd_mask:
1427 ; X64-SSE2:       # %bb.0:
1428 ; X64-SSE2-NEXT:    testb $1, %dil
1429 ; X64-SSE2-NEXT:    jne .LBB71_1
1430 ; X64-SSE2-NEXT:  # %bb.2:
1431 ; X64-SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
1432 ; X64-SSE2-NEXT:    retq
1433 ; X64-SSE2-NEXT:  .LBB71_1:
1434 ; X64-SSE2-NEXT:    addsd %xmm0, %xmm1
1435 ; X64-SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
1436 ; X64-SSE2-NEXT:    retq
1437 ;
1438 ; X64-SSE41-LABEL: add_sd_mask:
1439 ; X64-SSE41:       # %bb.0:
1440 ; X64-SSE41-NEXT:    testb $1, %dil
1441 ; X64-SSE41-NEXT:    jne .LBB71_1
1442 ; X64-SSE41-NEXT:  # %bb.2:
1443 ; X64-SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3]
1444 ; X64-SSE41-NEXT:    retq
1445 ; X64-SSE41-NEXT:  .LBB71_1:
1446 ; X64-SSE41-NEXT:    addsd %xmm0, %xmm1
1447 ; X64-SSE41-NEXT:    blendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
1448 ; X64-SSE41-NEXT:    retq
1449 ;
1450 ; X64-AVX1-LABEL: add_sd_mask:
1451 ; X64-AVX1:       # %bb.0:
1452 ; X64-AVX1-NEXT:    testb $1, %dil
1453 ; X64-AVX1-NEXT:    je .LBB71_2
1454 ; X64-AVX1-NEXT:  # %bb.1:
1455 ; X64-AVX1-NEXT:    vaddsd %xmm1, %xmm0, %xmm2
1456 ; X64-AVX1-NEXT:  .LBB71_2:
1457 ; X64-AVX1-NEXT:    vblendpd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
1458 ; X64-AVX1-NEXT:    retq
1459 ;
1460 ; X64-AVX512-LABEL: add_sd_mask:
1461 ; X64-AVX512:       # %bb.0:
1462 ; X64-AVX512-NEXT:    kmovw %edi, %k1
1463 ; X64-AVX512-NEXT:    vaddsd %xmm1, %xmm0, %xmm2 {%k1}
1464 ; X64-AVX512-NEXT:    vmovapd %xmm2, %xmm0
1465 ; X64-AVX512-NEXT:    retq
1466   %1 = extractelement <2 x double> %a, i64 0 ; a[0]
1467   %2 = extractelement <2 x double> %b, i64 0 ; b[0]
1468   %3 = fadd double %1, %2 ; scalar sum a[0] + b[0]
1469   %4 = extractelement <2 x double> %c, i32 0 ; c[0], the pass-through value used when the mask bit is clear
1470   %5 = bitcast i8 %mask to <8 x i1> ; view the mask byte as eight 1-bit lanes
1471   %6 = extractelement <8 x i1> %5, i64 0 ; lane 0 of the mask; the checks test it with "testb $1"
1472   %7 = select i1 %6, double %3, double %4 ; sum if the bit is set, otherwise c[0]
1473   %8 = insertelement <2 x double> %a, double %7, i64 0 ; write the chosen scalar into lane 0 of %a
1474   ret <2 x double> %8
1475 }
1476
1477 define float @PR26515(<4 x float> %0) nounwind { ; returns %0[2] + %0[0], written as a packed add whose only used lane is 0; checks expect a scalar add after moving the upper half down
1478 ; X86-SSE-LABEL: PR26515:
1479 ; X86-SSE:       # %bb.0:
1480 ; X86-SSE-NEXT:    pushl %eax
1481 ; X86-SSE-NEXT:    movaps %xmm0, %xmm1
1482 ; X86-SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
1483 ; X86-SSE-NEXT:    addss %xmm0, %xmm1
1484 ; X86-SSE-NEXT:    movss %xmm1, (%esp)
1485 ; X86-SSE-NEXT:    flds (%esp)
1486 ; X86-SSE-NEXT:    popl %eax
1487 ; X86-SSE-NEXT:    retl
1488 ;
1489 ; X86-AVX-LABEL: PR26515:
1490 ; X86-AVX:       # %bb.0:
1491 ; X86-AVX-NEXT:    pushl %eax
1492 ; X86-AVX-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
1493 ; X86-AVX-NEXT:    vaddss %xmm0, %xmm1, %xmm0
1494 ; X86-AVX-NEXT:    vmovss %xmm0, (%esp)
1495 ; X86-AVX-NEXT:    flds (%esp)
1496 ; X86-AVX-NEXT:    popl %eax
1497 ; X86-AVX-NEXT:    retl
1498 ;
1499 ; X64-SSE-LABEL: PR26515:
1500 ; X64-SSE:       # %bb.0:
1501 ; X64-SSE-NEXT:    movaps %xmm0, %xmm1
1502 ; X64-SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
1503 ; X64-SSE-NEXT:    addss %xmm1, %xmm0
1504 ; X64-SSE-NEXT:    retq
1505 ;
1506 ; X64-AVX-LABEL: PR26515:
1507 ; X64-AVX:       # %bb.0:
1508 ; X64-AVX-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
1509 ; X64-AVX-NEXT:    vaddss %xmm0, %xmm1, %xmm0
1510 ; X64-AVX-NEXT:    retq
1511   %2 = shufflevector <4 x float> %0, <4 x float> poison, <4 x i32> <i32 2, i32 undef, i32 undef, i32 undef> ; lane 0 = %0[2]; the remaining lanes are left undefined
1512   %3 = fadd <4 x float> %2, %0 ; packed add; lane 0 holds %0[2] + %0[0]
1513   %4 = extractelement <4 x float> %3, i64 0 ; only lane 0 of the sum is used
1514   ret float %4 ; the i686 checks store the result to the stack and reload it with flds
1515 }