1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse3 | FileCheck %s --check-prefixes=SSE,SSE-SLOW
3 ; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse3,fast-hops | FileCheck %s --check-prefixes=SSE,SSE-FAST
4 ; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX-SLOW,AVX1-SLOW
5 ; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx,fast-hops | FileCheck %s --check-prefixes=AVX,AVX-FAST,AVX1-FAST
6 ; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX,AVX-SLOW,AVX512,AVX512-SLOW
7 ; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512f,fast-hops | FileCheck %s --check-prefixes=AVX,AVX-FAST,AVX512,AVX512-FAST
9 ; Verify that we correctly fold horizontal binop even in the presence of UNDEFs.
; Lanes 0/1 hold adjacent-pair sums of %a and lane 3 holds b[2]+b[3] (lane 2
; undef) — this matches the (v)haddps lane layout, so a single hadd suffices.
11 define <4 x float> @test1_undef(<4 x float> %a, <4 x float> %b) {
12 ; SSE-LABEL: test1_undef:
14 ; SSE-NEXT: haddps %xmm1, %xmm0
17 ; AVX-LABEL: test1_undef:
19 ; AVX-NEXT: vhaddps %xmm1, %xmm0, %xmm0
21 %vecext = extractelement <4 x float> %a, i32 0
22 %vecext1 = extractelement <4 x float> %a, i32 1
23 %add = fadd float %vecext, %vecext1
24 %vecinit = insertelement <4 x float> undef, float %add, i32 0
25 %vecext2 = extractelement <4 x float> %a, i32 2
26 %vecext3 = extractelement <4 x float> %a, i32 3
27 %add4 = fadd float %vecext2, %vecext3
28 %vecinit5 = insertelement <4 x float> %vecinit, float %add4, i32 1
29 %vecext10 = extractelement <4 x float> %b, i32 2
30 %vecext11 = extractelement <4 x float> %b, i32 3
31 %add12 = fadd float %vecext10, %vecext11
32 %vecinit13 = insertelement <4 x float> %vecinit5, float %add12, i32 3
33 ret <4 x float> %vecinit13
; Lane 1 is left undef; lanes 0/2/3 are adjacent-pair sums of %a and %b in
; hadd order, so the whole expression still folds to one (v)haddps.
36 define <4 x float> @test2_undef(<4 x float> %a, <4 x float> %b) {
37 ; SSE-LABEL: test2_undef:
39 ; SSE-NEXT: haddps %xmm1, %xmm0
42 ; AVX-LABEL: test2_undef:
44 ; AVX-NEXT: vhaddps %xmm1, %xmm0, %xmm0
46 %vecext = extractelement <4 x float> %a, i32 0
47 %vecext1 = extractelement <4 x float> %a, i32 1
48 %add = fadd float %vecext, %vecext1
49 %vecinit = insertelement <4 x float> undef, float %add, i32 0
50 %vecext6 = extractelement <4 x float> %b, i32 0
51 %vecext7 = extractelement <4 x float> %b, i32 1
52 %add8 = fadd float %vecext6, %vecext7
53 %vecinit9 = insertelement <4 x float> %vecinit, float %add8, i32 2
54 %vecext10 = extractelement <4 x float> %b, i32 2
55 %vecext11 = extractelement <4 x float> %b, i32 3
56 %add12 = fadd float %vecext10, %vecext11
57 %vecinit13 = insertelement <4 x float> %vecinit9, float %add12, i32 3
58 ret <4 x float> %vecinit13
; Same as test2 but with lane 3 undef instead of lane 1: lanes 0/1 from %a,
; lane 2 from b[0]+b[1]. Still one (v)haddps.
61 define <4 x float> @test3_undef(<4 x float> %a, <4 x float> %b) {
62 ; SSE-LABEL: test3_undef:
64 ; SSE-NEXT: haddps %xmm1, %xmm0
67 ; AVX-LABEL: test3_undef:
69 ; AVX-NEXT: vhaddps %xmm1, %xmm0, %xmm0
71 %vecext = extractelement <4 x float> %a, i32 0
72 %vecext1 = extractelement <4 x float> %a, i32 1
73 %add = fadd float %vecext, %vecext1
74 %vecinit = insertelement <4 x float> undef, float %add, i32 0
75 %vecext2 = extractelement <4 x float> %a, i32 2
76 %vecext3 = extractelement <4 x float> %a, i32 3
77 %add4 = fadd float %vecext2, %vecext3
78 %vecinit5 = insertelement <4 x float> %vecinit, float %add4, i32 1
79 %vecext6 = extractelement <4 x float> %b, i32 0
80 %vecext7 = extractelement <4 x float> %b, i32 1
81 %add8 = fadd float %vecext6, %vecext7
82 %vecinit9 = insertelement <4 x float> %vecinit5, float %add8, i32 2
83 ret <4 x float> %vecinit9
; Only lane 0 is defined (a[0]+a[1]). Without fast-hops the backend prefers a
; scalar movshdup+addss; with fast-hops it emits a self hadd.
86 define <4 x float> @test4_undef(<4 x float> %a, <4 x float> %b) {
87 ; SSE-SLOW-LABEL: test4_undef:
89 ; SSE-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
90 ; SSE-SLOW-NEXT: addss %xmm1, %xmm0
93 ; SSE-FAST-LABEL: test4_undef:
95 ; SSE-FAST-NEXT: haddps %xmm0, %xmm0
98 ; AVX-SLOW-LABEL: test4_undef:
100 ; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
101 ; AVX-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0
102 ; AVX-SLOW-NEXT: retq
104 ; AVX-FAST-LABEL: test4_undef:
106 ; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
107 ; AVX-FAST-NEXT: retq
108 %vecext = extractelement <4 x float> %a, i32 0
109 %vecext1 = extractelement <4 x float> %a, i32 1
110 %add = fadd float %vecext, %vecext1
111 %vecinit = insertelement <4 x float> undef, float %add, i32 0
112 ret <4 x float> %vecinit
; f64 variant of test4: a[0]+a[1] into lane 0 only. Slow-hop targets use an
; unpack/permilpd plus addsd; fast-hops folds to (v)haddpd.
115 define <2 x double> @test5_undef(<2 x double> %a, <2 x double> %b) {
116 ; SSE-SLOW-LABEL: test5_undef:
118 ; SSE-SLOW-NEXT: movapd %xmm0, %xmm1
119 ; SSE-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
120 ; SSE-SLOW-NEXT: addsd %xmm0, %xmm1
121 ; SSE-SLOW-NEXT: movapd %xmm1, %xmm0
122 ; SSE-SLOW-NEXT: retq
124 ; SSE-FAST-LABEL: test5_undef:
126 ; SSE-FAST-NEXT: haddpd %xmm0, %xmm0
127 ; SSE-FAST-NEXT: retq
129 ; AVX-SLOW-LABEL: test5_undef:
131 ; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
132 ; AVX-SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0
133 ; AVX-SLOW-NEXT: retq
135 ; AVX-FAST-LABEL: test5_undef:
137 ; AVX-FAST-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
138 ; AVX-FAST-NEXT: retq
139 %vecext = extractelement <2 x double> %a, i32 0
140 %vecext1 = extractelement <2 x double> %a, i32 1
141 %add = fadd double %vecext, %vecext1
142 %vecinit = insertelement <2 x double> undef, double %add, i32 0
143 ret <2 x double> %vecinit
; Both pair-sums come from %a (lanes 0 and 1); %b is unused, so the hadd uses
; xmm0 for both operands.
146 define <4 x float> @test6_undef(<4 x float> %a, <4 x float> %b) {
147 ; SSE-LABEL: test6_undef:
149 ; SSE-NEXT: haddps %xmm0, %xmm0
152 ; AVX-LABEL: test6_undef:
154 ; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0
156 %vecext = extractelement <4 x float> %a, i32 0
157 %vecext1 = extractelement <4 x float> %a, i32 1
158 %add = fadd float %vecext, %vecext1
159 %vecinit = insertelement <4 x float> undef, float %add, i32 0
160 %vecext2 = extractelement <4 x float> %a, i32 2
161 %vecext3 = extractelement <4 x float> %a, i32 3
162 %add4 = fadd float %vecext2, %vecext3
163 %vecinit5 = insertelement <4 x float> %vecinit, float %add4, i32 1
164 ret <4 x float> %vecinit5
; Mirror of test6: both pair-sums come from %b and land in the high lanes
; (2 and 3), which is the hadd layout for the second source operand.
167 define <4 x float> @test7_undef(<4 x float> %a, <4 x float> %b) {
168 ; SSE-LABEL: test7_undef:
170 ; SSE-NEXT: haddps %xmm1, %xmm0
173 ; AVX-LABEL: test7_undef:
175 ; AVX-NEXT: vhaddps %xmm1, %xmm0, %xmm0
177 %vecext = extractelement <4 x float> %b, i32 0
178 %vecext1 = extractelement <4 x float> %b, i32 1
179 %add = fadd float %vecext, %vecext1
180 %vecinit = insertelement <4 x float> undef, float %add, i32 2
181 %vecext2 = extractelement <4 x float> %b, i32 2
182 %vecext3 = extractelement <4 x float> %b, i32 3
183 %add4 = fadd float %vecext2, %vecext3
184 %vecinit5 = insertelement <4 x float> %vecinit, float %add4, i32 3
185 ret <4 x float> %vecinit5
; Negative-layout case: the two pair-sums of %a land in lanes 0 and 2, which
; is NOT the hadd lane order. Slow-hop targets lower to scalar shuffles+addss;
; fast-hops targets still hadd but need a fixup shuffle afterwards.
188 define <4 x float> @test8_undef(<4 x float> %a, <4 x float> %b) {
189 ; SSE-SLOW-LABEL: test8_undef:
191 ; SSE-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
192 ; SSE-SLOW-NEXT: addss %xmm0, %xmm1
193 ; SSE-SLOW-NEXT: movaps %xmm0, %xmm2
194 ; SSE-SLOW-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
195 ; SSE-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
196 ; SSE-SLOW-NEXT: addss %xmm2, %xmm0
197 ; SSE-SLOW-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
198 ; SSE-SLOW-NEXT: movaps %xmm1, %xmm0
199 ; SSE-SLOW-NEXT: retq
201 ; SSE-FAST-LABEL: test8_undef:
203 ; SSE-FAST-NEXT: haddps %xmm0, %xmm0
204 ; SSE-FAST-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1,1,3]
205 ; SSE-FAST-NEXT: retq
207 ; AVX-SLOW-LABEL: test8_undef:
209 ; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
210 ; AVX-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm1
211 ; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
212 ; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
213 ; AVX-SLOW-NEXT: vaddss %xmm0, %xmm2, %xmm0
214 ; AVX-SLOW-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
215 ; AVX-SLOW-NEXT: retq
217 ; AVX-FAST-LABEL: test8_undef:
219 ; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
220 ; AVX-FAST-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,3]
221 ; AVX-FAST-NEXT: retq
222 %vecext = extractelement <4 x float> %a, i32 0
223 %vecext1 = extractelement <4 x float> %a, i32 1
224 %add = fadd float %vecext, %vecext1
225 %vecinit = insertelement <4 x float> undef, float %add, i32 0
226 %vecext2 = extractelement <4 x float> %a, i32 2
227 %vecext3 = extractelement <4 x float> %a, i32 3
228 %add4 = fadd float %vecext2, %vecext3
229 %vecinit5 = insertelement <4 x float> %vecinit, float %add4, i32 2
230 ret <4 x float> %vecinit5
; Lane 0 is a[0]+a[1], lane 3 is b[2]+b[3], lanes 1/2 undef — compatible with
; the hadd layout, so one (v)haddps covers both sums.
233 define <4 x float> @test9_undef(<4 x float> %a, <4 x float> %b) {
234 ; SSE-LABEL: test9_undef:
236 ; SSE-NEXT: haddps %xmm1, %xmm0
239 ; AVX-LABEL: test9_undef:
241 ; AVX-NEXT: vhaddps %xmm1, %xmm0, %xmm0
243 %vecext = extractelement <4 x float> %a, i32 0
244 %vecext1 = extractelement <4 x float> %a, i32 1
245 %add = fadd float %vecext, %vecext1
246 %vecinit = insertelement <4 x float> undef, float %add, i32 0
247 %vecext2 = extractelement <4 x float> %b, i32 2
248 %vecext3 = extractelement <4 x float> %b, i32 3
249 %add4 = fadd float %vecext2, %vecext3
250 %vecinit5 = insertelement <4 x float> %vecinit, float %add4, i32 3
251 ret <4 x float> %vecinit5
; 256-bit result but both defined lanes (0 and 3) live in the low 128-bit
; half, so lowering only needs a 128-bit hadd (xmm, not ymm).
254 define <8 x float> @test10_undef(<8 x float> %a, <8 x float> %b) {
255 ; SSE-LABEL: test10_undef:
257 ; SSE-NEXT: haddps %xmm2, %xmm0
260 ; AVX-LABEL: test10_undef:
262 ; AVX-NEXT: vhaddps %xmm1, %xmm0, %xmm0
264 %vecext = extractelement <8 x float> %a, i32 0
265 %vecext1 = extractelement <8 x float> %a, i32 1
266 %add = fadd float %vecext, %vecext1
267 %vecinit = insertelement <8 x float> undef, float %add, i32 0
268 %vecext2 = extractelement <8 x float> %b, i32 2
269 %vecext3 = extractelement <8 x float> %b, i32 3
270 %add4 = fadd float %vecext2, %vecext3
271 %vecinit5 = insertelement <8 x float> %vecinit, float %add4, i32 3
272 ret <8 x float> %vecinit5
; Defined lanes span both 128-bit halves (lane 0 from %a, lane 6 from the
; upper half of %b). AVX can use a single 256-bit vhaddps; SSE works per-xmm.
275 define <8 x float> @test11_undef(<8 x float> %a, <8 x float> %b) {
276 ; SSE-SLOW-LABEL: test11_undef:
278 ; SSE-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
279 ; SSE-SLOW-NEXT: addss %xmm1, %xmm0
280 ; SSE-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm3[1,1,3,3]
281 ; SSE-SLOW-NEXT: addss %xmm3, %xmm1
282 ; SSE-SLOW-NEXT: movddup {{.*#+}} xmm1 = xmm1[0,0]
283 ; SSE-SLOW-NEXT: retq
285 ; SSE-FAST-LABEL: test11_undef:
287 ; SSE-FAST-NEXT: haddps %xmm0, %xmm0
288 ; SSE-FAST-NEXT: haddps %xmm3, %xmm3
289 ; SSE-FAST-NEXT: movddup {{.*#+}} xmm1 = xmm3[0,0]
290 ; SSE-FAST-NEXT: retq
292 ; AVX-LABEL: test11_undef:
294 ; AVX-NEXT: vhaddps %ymm1, %ymm0, %ymm0
296 %vecext = extractelement <8 x float> %a, i32 0
297 %vecext1 = extractelement <8 x float> %a, i32 1
298 %add = fadd float %vecext, %vecext1
299 %vecinit = insertelement <8 x float> undef, float %add, i32 0
300 %vecext2 = extractelement <8 x float> %b, i32 4
301 %vecext3 = extractelement <8 x float> %b, i32 5
302 %add4 = fadd float %vecext2, %vecext3
303 %vecinit5 = insertelement <8 x float> %vecinit, float %add4, i32 6
304 ret <8 x float> %vecinit5
; Only lanes 0/1 are defined and both sums come from the low half of %a, so a
; 128-bit self hadd is enough even though the type is <8 x float>.
307 define <8 x float> @test12_undef(<8 x float> %a, <8 x float> %b) {
308 ; SSE-LABEL: test12_undef:
310 ; SSE-NEXT: haddps %xmm0, %xmm0
313 ; AVX-LABEL: test12_undef:
315 ; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0
317 %vecext = extractelement <8 x float> %a, i32 0
318 %vecext1 = extractelement <8 x float> %a, i32 1
319 %add = fadd float %vecext, %vecext1
320 %vecinit = insertelement <8 x float> undef, float %add, i32 0
321 %vecext2 = extractelement <8 x float> %a, i32 2
322 %vecext3 = extractelement <8 x float> %a, i32 3
323 %add4 = fadd float %vecext2, %vecext3
324 %vecinit5 = insertelement <8 x float> %vecinit, float %add4, i32 1
325 ret <8 x float> %vecinit5
; All four pair-sums of %a go into lanes 0-3. AVX extracts %a's high half and
; folds everything into one 128-bit vhaddps; SSE hadds the two xmm halves.
328 define <8 x float> @test13_undef(<8 x float> %a, <8 x float> %b) {
329 ; SSE-LABEL: test13_undef:
331 ; SSE-NEXT: haddps %xmm1, %xmm0
334 ; AVX-LABEL: test13_undef:
336 ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
337 ; AVX-NEXT: vhaddps %xmm1, %xmm0, %xmm0
339 %vecext = extractelement <8 x float> %a, i32 0
340 %vecext1 = extractelement <8 x float> %a, i32 1
341 %add1 = fadd float %vecext, %vecext1
342 %vecinit1 = insertelement <8 x float> undef, float %add1, i32 0
343 %vecext2 = extractelement <8 x float> %a, i32 2
344 %vecext3 = extractelement <8 x float> %a, i32 3
345 %add2 = fadd float %vecext2, %vecext3
346 %vecinit2 = insertelement <8 x float> %vecinit1, float %add2, i32 1
347 %vecext4 = extractelement <8 x float> %a, i32 4
348 %vecext5 = extractelement <8 x float> %a, i32 5
349 %add3 = fadd float %vecext4, %vecext5
350 %vecinit3 = insertelement <8 x float> %vecinit2, float %add3, i32 2
351 %vecext6 = extractelement <8 x float> %a, i32 6
352 %vecext7 = extractelement <8 x float> %a, i32 7
353 %add4 = fadd float %vecext6, %vecext7
354 %vecinit4 = insertelement <8 x float> %vecinit3, float %add4, i32 3
355 ret <8 x float> %vecinit4
; 512-bit variant of test13: the four sums still fit in one xmm. AVX1 folds
; to extract+vhaddps; AVX512 without fast-hops lowers to scalar shuffle/addss
; sequences (no 512-bit hadd instruction exists).
358 define <16 x float> @test13_v16f32_undef(<16 x float> %a, <16 x float> %b) {
359 ; SSE-LABEL: test13_v16f32_undef:
361 ; SSE-NEXT: haddps %xmm1, %xmm0
364 ; AVX1-SLOW-LABEL: test13_v16f32_undef:
365 ; AVX1-SLOW: # %bb.0:
366 ; AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
367 ; AVX1-SLOW-NEXT: vhaddps %xmm1, %xmm0, %xmm0
368 ; AVX1-SLOW-NEXT: retq
370 ; AVX-FAST-LABEL: test13_v16f32_undef:
372 ; AVX-FAST-NEXT: vextractf128 $1, %ymm0, %xmm1
373 ; AVX-FAST-NEXT: vhaddps %xmm1, %xmm0, %xmm0
374 ; AVX-FAST-NEXT: retq
376 ; AVX512-SLOW-LABEL: test13_v16f32_undef:
377 ; AVX512-SLOW: # %bb.0:
378 ; AVX512-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
379 ; AVX512-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm1
380 ; AVX512-SLOW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
381 ; AVX512-SLOW-NEXT: vpermilps {{.*#+}} xmm3 = xmm0[3,1,2,3]
382 ; AVX512-SLOW-NEXT: vaddss %xmm3, %xmm2, %xmm2
383 ; AVX512-SLOW-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[2,3]
384 ; AVX512-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm0
385 ; AVX512-SLOW-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
386 ; AVX512-SLOW-NEXT: vaddss %xmm2, %xmm0, %xmm2
387 ; AVX512-SLOW-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
388 ; AVX512-SLOW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
389 ; AVX512-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
390 ; AVX512-SLOW-NEXT: vaddss %xmm0, %xmm2, %xmm0
391 ; AVX512-SLOW-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
392 ; AVX512-SLOW-NEXT: retq
393 %vecext = extractelement <16 x float> %a, i32 0
394 %vecext1 = extractelement <16 x float> %a, i32 1
395 %add1 = fadd float %vecext, %vecext1
396 %vecinit1 = insertelement <16 x float> undef, float %add1, i32 0
397 %vecext2 = extractelement <16 x float> %a, i32 2
398 %vecext3 = extractelement <16 x float> %a, i32 3
399 %add2 = fadd float %vecext2, %vecext3
400 %vecinit2 = insertelement <16 x float> %vecinit1, float %add2, i32 1
401 %vecext4 = extractelement <16 x float> %a, i32 4
402 %vecext5 = extractelement <16 x float> %a, i32 5
403 %add3 = fadd float %vecext4, %vecext5
404 %vecinit3 = insertelement <16 x float> %vecinit2, float %add3, i32 2
405 %vecext6 = extractelement <16 x float> %a, i32 6
406 %vecext7 = extractelement <16 x float> %a, i32 7
407 %add4 = fadd float %vecext6, %vecext7
408 %vecinit4 = insertelement <16 x float> %vecinit3, float %add4, i32 3
409 ret <16 x float> %vecinit4
; Shuffle-form input: fadd of %x with a <undef,0> shuffle of itself. Fast-hops
; recognizes this as haddpd; slow targets keep movddup+addpd.
411 define <2 x double> @add_pd_003(<2 x double> %x) {
412 ; SSE-SLOW-LABEL: add_pd_003:
414 ; SSE-SLOW-NEXT: movddup {{.*#+}} xmm1 = xmm0[0,0]
415 ; SSE-SLOW-NEXT: addpd %xmm1, %xmm0
416 ; SSE-SLOW-NEXT: retq
418 ; SSE-FAST-LABEL: add_pd_003:
420 ; SSE-FAST-NEXT: haddpd %xmm0, %xmm0
421 ; SSE-FAST-NEXT: retq
423 ; AVX-SLOW-LABEL: add_pd_003:
425 ; AVX-SLOW-NEXT: vmovddup {{.*#+}} xmm1 = xmm0[0,0]
426 ; AVX-SLOW-NEXT: vaddpd %xmm0, %xmm1, %xmm0
427 ; AVX-SLOW-NEXT: retq
429 ; AVX-FAST-LABEL: add_pd_003:
431 ; AVX-FAST-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
432 ; AVX-FAST-NEXT: retq
433 %l = shufflevector <2 x double> %x, <2 x double> undef, <2 x i32> <i32 undef, i32 0>
434 %add = fadd <2 x double> %l, %x
435 ret <2 x double> %add
438 ; Change shuffle mask - no undefs.
; Same as add_pd_003 but with the full <1,0> swap mask (no undef lanes);
; fast-hops still folds to haddpd.
440 define <2 x double> @add_pd_003_2(<2 x double> %x) {
441 ; SSE-SLOW-LABEL: add_pd_003_2:
443 ; SSE-SLOW-NEXT: movapd %xmm0, %xmm1
444 ; SSE-SLOW-NEXT: shufpd {{.*#+}} xmm1 = xmm1[1],xmm0[0]
445 ; SSE-SLOW-NEXT: addpd %xmm0, %xmm1
446 ; SSE-SLOW-NEXT: movapd %xmm1, %xmm0
447 ; SSE-SLOW-NEXT: retq
449 ; SSE-FAST-LABEL: add_pd_003_2:
451 ; SSE-FAST-NEXT: haddpd %xmm0, %xmm0
452 ; SSE-FAST-NEXT: retq
454 ; AVX-SLOW-LABEL: add_pd_003_2:
456 ; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
457 ; AVX-SLOW-NEXT: vaddpd %xmm0, %xmm1, %xmm0
458 ; AVX-SLOW-NEXT: retq
460 ; AVX-FAST-LABEL: add_pd_003_2:
462 ; AVX-FAST-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
463 ; AVX-FAST-NEXT: retq
464 %l = shufflevector <2 x double> %x, <2 x double> undef, <2 x i32> <i32 1, i32 0>
465 %add = fadd <2 x double> %l, %x
466 ret <2 x double> %add
; Like add_pd_003 but with a trailing <1,undef> shuffle of the sum; the fixup
; permilpd remains after the hadd on both slow and fast paths.
469 define <2 x double> @add_pd_010(<2 x double> %x) {
470 ; SSE-SLOW-LABEL: add_pd_010:
472 ; SSE-SLOW-NEXT: movddup {{.*#+}} xmm1 = xmm0[0,0]
473 ; SSE-SLOW-NEXT: addpd %xmm0, %xmm1
474 ; SSE-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1,1]
475 ; SSE-SLOW-NEXT: movapd %xmm1, %xmm0
476 ; SSE-SLOW-NEXT: retq
478 ; SSE-FAST-LABEL: add_pd_010:
480 ; SSE-FAST-NEXT: haddpd %xmm0, %xmm0
481 ; SSE-FAST-NEXT: retq
483 ; AVX-SLOW-LABEL: add_pd_010:
485 ; AVX-SLOW-NEXT: vmovddup {{.*#+}} xmm1 = xmm0[0,0]
486 ; AVX-SLOW-NEXT: vaddpd %xmm0, %xmm1, %xmm0
487 ; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
488 ; AVX-SLOW-NEXT: retq
490 ; AVX-FAST-LABEL: add_pd_010:
492 ; AVX-FAST-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
493 ; AVX-FAST-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
494 ; AVX-FAST-NEXT: retq
495 %l = shufflevector <2 x double> %x, <2 x double> undef, <2 x i32> <i32 undef, i32 0>
496 %add = fadd <2 x double> %l, %x
497 %shuffle2 = shufflevector <2 x double> %add, <2 x double> undef, <2 x i32> <i32 1, i32 undef>
498 ret <2 x double> %shuffle2
; Two shuffles select even (%l) and odd (%r) elements of %x into lanes 2/3;
; the fadd of the pair is recognized as a haddps pattern.
501 define <4 x float> @add_ps_007(<4 x float> %x) {
502 ; SSE-LABEL: add_ps_007:
504 ; SSE-NEXT: haddps %xmm0, %xmm0
507 ; AVX-LABEL: add_ps_007:
509 ; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0
511 %l = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 0, i32 2>
512 %r = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 1, i32 3>
513 %add = fadd <4 x float> %l, %r
; add_ps_007 followed by a <3,2,undef,undef> shuffle of the sum; a trailing
; lane shuffle is expected after the hadd.
517 define <4 x float> @add_ps_030(<4 x float> %x) {
518 ; SSE-LABEL: add_ps_030:
520 ; SSE-NEXT: haddps %xmm0, %xmm0
521 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,2,2,3]
524 ; AVX-LABEL: add_ps_030:
526 ; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0
527 ; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,2,3]
529 %l = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 0, i32 2>
530 %r = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 1, i32 3>
531 %add = fadd <4 x float> %l, %r
532 %shuffle2 = shufflevector <4 x float> %add, <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 undef, i32 undef>
533 ret <4 x float> %shuffle2
; Variant of add_ps_007 with only lane 2 defined in both shuffles; still folds
; to a self hadd.
536 define <4 x float> @add_ps_007_2(<4 x float> %x) {
537 ; SSE-LABEL: add_ps_007_2:
539 ; SSE-NEXT: haddps %xmm0, %xmm0
542 ; AVX-LABEL: add_ps_007_2:
544 ; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0
546 %l = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 0, i32 undef>
547 %r = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 1, i32 undef>
548 %add = fadd <4 x float> %l, %r
; fadd of %x with a mostly-undef shuffle (only lane 3 <- element 2). Slow-hop
; targets use movsldup+addps; fast-hops folds to a self haddps.
552 define <4 x float> @add_ps_008(<4 x float> %x) {
553 ; SSE-SLOW-LABEL: add_ps_008:
555 ; SSE-SLOW-NEXT: movsldup {{.*#+}} xmm1 = xmm0[0,0,2,2]
556 ; SSE-SLOW-NEXT: addps %xmm1, %xmm0
557 ; SSE-SLOW-NEXT: retq
559 ; SSE-FAST-LABEL: add_ps_008:
561 ; SSE-FAST-NEXT: haddps %xmm0, %xmm0
562 ; SSE-FAST-NEXT: retq
564 ; AVX-SLOW-LABEL: add_ps_008:
566 ; AVX-SLOW-NEXT: vmovsldup {{.*#+}} xmm1 = xmm0[0,0,2,2]
567 ; AVX-SLOW-NEXT: vaddps %xmm0, %xmm1, %xmm0
568 ; AVX-SLOW-NEXT: retq
570 ; AVX-FAST-LABEL: add_ps_008:
572 ; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
573 ; AVX-FAST-NEXT: retq
574 %l = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 undef, i32 2>
575 %add = fadd <4 x float> %l, %x
; add_ps_008 followed by a shuffle extracting lane 3 into lane 0; the trailing
; [3,1,2,3] shuffle survives after the (h)add on both paths.
579 define <4 x float> @add_ps_017(<4 x float> %x) {
580 ; SSE-SLOW-LABEL: add_ps_017:
582 ; SSE-SLOW-NEXT: movsldup {{.*#+}} xmm1 = xmm0[0,0,2,2]
583 ; SSE-SLOW-NEXT: addps %xmm0, %xmm1
584 ; SSE-SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1,2,3]
585 ; SSE-SLOW-NEXT: movaps %xmm1, %xmm0
586 ; SSE-SLOW-NEXT: retq
588 ; SSE-FAST-LABEL: add_ps_017:
590 ; SSE-FAST-NEXT: haddps %xmm0, %xmm0
591 ; SSE-FAST-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
592 ; SSE-FAST-NEXT: retq
594 ; AVX-SLOW-LABEL: add_ps_017:
596 ; AVX-SLOW-NEXT: vmovsldup {{.*#+}} xmm1 = xmm0[0,0,2,2]
597 ; AVX-SLOW-NEXT: vaddps %xmm0, %xmm1, %xmm0
598 ; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
599 ; AVX-SLOW-NEXT: retq
601 ; AVX-FAST-LABEL: add_ps_017:
603 ; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
604 ; AVX-FAST-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
605 ; AVX-FAST-NEXT: retq
606 %l = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 undef, i32 2>
607 %add = fadd <4 x float> %l, %x
608 %shuffle2 = shufflevector <4 x float> %add, <4 x float> undef, <4 x i32> <i32 3, i32 undef, i32 undef, i32 undef>
609 ret <4 x float> %shuffle2
; add_ps_007_2 followed by a shuffle placing sum lane 2 into result lane 1;
; hadd plus a [0,2,2,3] fixup shuffle.
612 define <4 x float> @add_ps_018(<4 x float> %x) {
613 ; SSE-LABEL: add_ps_018:
615 ; SSE-NEXT: haddps %xmm0, %xmm0
616 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,2,3]
619 ; AVX-LABEL: add_ps_018:
621 ; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0
622 ; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3]
624 %l = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 0, i32 undef>
625 %r = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 1, i32 undef>
626 %add = fadd <4 x float> %l, %r
627 %shuffle2 = shufflevector <4 x float> %add, <4 x float> undef, <4 x i32> <i32 undef, i32 2, i32 undef, i32 undef>
628 ret <4 x float> %shuffle2
; Mixed-width case: 256-bit inputs, 128-bit result using only the low-half
; elements of each; lowers to a 128-bit hadd (AVX adds vzeroupper).
631 define <4 x float> @v8f32_inputs_v4f32_output_0101(<8 x float> %a, <8 x float> %b) {
632 ; SSE-LABEL: v8f32_inputs_v4f32_output_0101:
634 ; SSE-NEXT: haddps %xmm2, %xmm0
637 ; AVX-LABEL: v8f32_inputs_v4f32_output_0101:
639 ; AVX-NEXT: vhaddps %xmm1, %xmm0, %xmm0
640 ; AVX-NEXT: vzeroupper
642 %a0 = extractelement <8 x float> %a, i32 0
643 %a1 = extractelement <8 x float> %a, i32 1
644 %b0 = extractelement <8 x float> %b, i32 0
645 %b1 = extractelement <8 x float> %b, i32 1
646 %add0 = fadd float %a0, %a1
647 %add2 = fadd float %b0, %b1
648 %r0 = insertelement <4 x float> undef, float %add0, i32 0
649 %r = insertelement <4 x float> %r0, float %add2, i32 2
; One 256-bit and one 128-bit input feeding a 128-bit result; the fold still
; applies across the mismatched source widths.
653 define <4 x float> @v8f32_input0_v4f32_output_0123(<8 x float> %a, <4 x float> %b) {
654 ; SSE-LABEL: v8f32_input0_v4f32_output_0123:
656 ; SSE-NEXT: haddps %xmm2, %xmm0
659 ; AVX-LABEL: v8f32_input0_v4f32_output_0123:
661 ; AVX-NEXT: vhaddps %xmm1, %xmm0, %xmm0
662 ; AVX-NEXT: vzeroupper
664 %a0 = extractelement <8 x float> %a, i32 0
665 %a1 = extractelement <8 x float> %a, i32 1
666 %b2 = extractelement <4 x float> %b, i32 2
667 %b3 = extractelement <4 x float> %b, i32 3
668 %add0 = fadd float %a0, %a1
669 %add3 = fadd float %b2, %b3
670 %r0 = insertelement <4 x float> undef, float %add0, i32 0
671 %r = insertelement <4 x float> %r0, float %add3, i32 3
; Mirror of the previous test: the 256-bit vector is the second input, and the
; results land in lanes 1 and 2.
675 define <4 x float> @v8f32_input1_v4f32_output_2301(<4 x float> %a, <8 x float> %b) {
676 ; SSE-LABEL: v8f32_input1_v4f32_output_2301:
678 ; SSE-NEXT: haddps %xmm1, %xmm0
681 ; AVX-LABEL: v8f32_input1_v4f32_output_2301:
683 ; AVX-NEXT: vhaddps %xmm1, %xmm0, %xmm0
684 ; AVX-NEXT: vzeroupper
686 %a2 = extractelement <4 x float> %a, i32 2
687 %a3 = extractelement <4 x float> %a, i32 3
688 %b0 = extractelement <8 x float> %b, i32 0
689 %b1 = extractelement <8 x float> %b, i32 1
690 %add1 = fadd float %a2, %a3
691 %add2 = fadd float %b0, %b1
692 %r1 = insertelement <4 x float> undef, float %add1, i32 1
693 %r = insertelement <4 x float> %r1, float %add2, i32 2
; Both 256-bit inputs contribute their low-half elements 2/3; sums go to
; result lanes 1 and 3 — still a single 128-bit hadd.
697 define <4 x float> @v8f32_inputs_v4f32_output_2323(<8 x float> %a, <8 x float> %b) {
698 ; SSE-LABEL: v8f32_inputs_v4f32_output_2323:
700 ; SSE-NEXT: haddps %xmm2, %xmm0
703 ; AVX-LABEL: v8f32_inputs_v4f32_output_2323:
705 ; AVX-NEXT: vhaddps %xmm1, %xmm0, %xmm0
706 ; AVX-NEXT: vzeroupper
708 %a2 = extractelement <8 x float> %a, i32 2
709 %a3 = extractelement <8 x float> %a, i32 3
710 %b2 = extractelement <8 x float> %b, i32 2
711 %b3 = extractelement <8 x float> %b, i32 3
712 %add1 = fadd float %a2, %a3
713 %add3 = fadd float %b2, %b3
714 %r1 = insertelement <4 x float> undef, float %add1, i32 1
715 %r = insertelement <4 x float> %r1, float %add3, i32 3
; 512-bit inputs, 128-bit output. Register allocation differs by target (xmm2
; when 512-bit values are split across ymm pairs on AVX1, xmm1 on AVX512).
719 define <4 x float> @v16f32_inputs_v4f32_output_0123(<16 x float> %a, <16 x float> %b) {
720 ; SSE-LABEL: v16f32_inputs_v4f32_output_0123:
722 ; SSE-NEXT: haddps %xmm4, %xmm0
725 ; AVX1-SLOW-LABEL: v16f32_inputs_v4f32_output_0123:
726 ; AVX1-SLOW: # %bb.0:
727 ; AVX1-SLOW-NEXT: vhaddps %xmm2, %xmm0, %xmm0
728 ; AVX1-SLOW-NEXT: vzeroupper
729 ; AVX1-SLOW-NEXT: retq
731 ; AVX1-FAST-LABEL: v16f32_inputs_v4f32_output_0123:
732 ; AVX1-FAST: # %bb.0:
733 ; AVX1-FAST-NEXT: vhaddps %xmm2, %xmm0, %xmm0
734 ; AVX1-FAST-NEXT: vzeroupper
735 ; AVX1-FAST-NEXT: retq
737 ; AVX512-LABEL: v16f32_inputs_v4f32_output_0123:
739 ; AVX512-NEXT: vhaddps %xmm1, %xmm0, %xmm0
740 ; AVX512-NEXT: vzeroupper
742 %a0 = extractelement <16 x float> %a, i32 0
743 %a1 = extractelement <16 x float> %a, i32 1
744 %b2 = extractelement <16 x float> %b, i32 2
745 %b3 = extractelement <16 x float> %b, i32 3
746 %add0 = fadd float %a0, %a1
747 %add3 = fadd float %b2, %b3
748 %r0 = insertelement <4 x float> undef, float %add0, i32 0
749 %r = insertelement <4 x float> %r0, float %add3, i32 3
; 512-bit inputs with the sums landing in the upper half of a 256-bit result;
; AVX targets use a full-width ymm vhaddps, SSE works on the second xmm pair.
753 define <8 x float> @v16f32_inputs_v8f32_output_4567(<16 x float> %a, <16 x float> %b) {
754 ; SSE-LABEL: v16f32_inputs_v8f32_output_4567:
756 ; SSE-NEXT: haddps %xmm5, %xmm1
759 ; AVX1-SLOW-LABEL: v16f32_inputs_v8f32_output_4567:
760 ; AVX1-SLOW: # %bb.0:
761 ; AVX1-SLOW-NEXT: vhaddps %ymm2, %ymm0, %ymm0
762 ; AVX1-SLOW-NEXT: retq
764 ; AVX1-FAST-LABEL: v16f32_inputs_v8f32_output_4567:
765 ; AVX1-FAST: # %bb.0:
766 ; AVX1-FAST-NEXT: vhaddps %ymm2, %ymm0, %ymm0
767 ; AVX1-FAST-NEXT: retq
769 ; AVX512-LABEL: v16f32_inputs_v8f32_output_4567:
771 ; AVX512-NEXT: vhaddps %ymm1, %ymm0, %ymm0
773 %a4 = extractelement <16 x float> %a, i32 4
774 %a5 = extractelement <16 x float> %a, i32 5
775 %b6 = extractelement <16 x float> %b, i32 6
776 %b7 = extractelement <16 x float> %b, i32 7
777 %add4 = fadd float %a4, %a5
778 %add7 = fadd float %b6, %b7
779 %r4 = insertelement <8 x float> undef, float %add4, i32 4
780 %r = insertelement <8 x float> %r4, float %add7, i32 7
784 define <8 x float> @PR40243(<8 x float> %a, <8 x float> %b) {
785 ; SSE-LABEL: PR40243:
787 ; SSE-NEXT: haddps %xmm3, %xmm1
790 ; AVX-LABEL: PR40243:
792 ; AVX-NEXT: vhaddps %ymm1, %ymm0, %ymm0
794 %a4 = extractelement <8 x float> %a, i32 4
795 %a5 = extractelement <8 x float> %a, i32 5
796 %add4 = fadd float %a4, %a5
797 %b6 = extractelement <8 x float> %b, i32 6
798 %b7 = extractelement <8 x float> %b, i32 7
799 %add7 = fadd float %b6, %b7
800 %r4 = insertelement <8 x float> undef, float %add4, i32 4
801 %r = insertelement <8 x float> %r4, float %add7, i32 7