; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse3 | FileCheck %s --check-prefixes=SSE3,SSE3-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse3,fast-hops | FileCheck %s --check-prefixes=SSE3,SSE3-FAST
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX-SLOW,AVX1,AVX1-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx,fast-hops | FileCheck %s --check-prefixes=AVX,AVX-FAST,AVX1,AVX1-FAST
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX-SLOW,AVX2,AVX2-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2,fast-hops | FileCheck %s --check-prefixes=AVX,AVX-FAST,AVX2,AVX2-FAST
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX,AVX-SLOW,AVX512,AVX512-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512f,fast-hops | FileCheck %s --check-prefixes=AVX,AVX-FAST,AVX512,AVX512-FAST

define <2 x double> @haddpd1(<2 x double> %x, <2 x double> %y) {
; SSE3-LABEL: haddpd1:
; SSE3: # %bb.0:
; SSE3-NEXT: haddpd %xmm1, %xmm0
; SSE3-NEXT: retq
;
; AVX-LABEL: haddpd1:
; AVX: # %bb.0:
; AVX-NEXT: vhaddpd %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
  %a = shufflevector <2 x double> %x, <2 x double> %y, <2 x i32> <i32 0, i32 2>
  %b = shufflevector <2 x double> %x, <2 x double> %y, <2 x i32> <i32 1, i32 3>
  %r = fadd <2 x double> %a, %b
  ret <2 x double> %r
}

define <2 x double> @haddpd2(<2 x double> %x, <2 x double> %y) {
; SSE3-LABEL: haddpd2:
; SSE3: # %bb.0:
; SSE3-NEXT: haddpd %xmm1, %xmm0
; SSE3-NEXT: retq
;
; AVX-LABEL: haddpd2:
; AVX: # %bb.0:
; AVX-NEXT: vhaddpd %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
  %a = shufflevector <2 x double> %x, <2 x double> %y, <2 x i32> <i32 1, i32 2>
  %b = shufflevector <2 x double> %y, <2 x double> %x, <2 x i32> <i32 2, i32 1>
  %r = fadd <2 x double> %a, %b
  ret <2 x double> %r
}

define <2 x double> @haddpd3(<2 x double> %x) {
; SSE3-SLOW-LABEL: haddpd3:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movapd %xmm0, %xmm1
; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-SLOW-NEXT: addpd %xmm0, %xmm1
; SSE3-SLOW-NEXT: movapd %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: haddpd3:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: haddpd %xmm0, %xmm0
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: haddpd3:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-SLOW-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: haddpd3:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: retq
  %a = shufflevector <2 x double> %x, <2 x double> undef, <2 x i32> <i32 0, i32 undef>
  %b = shufflevector <2 x double> %x, <2 x double> undef, <2 x i32> <i32 1, i32 undef>
  %r = fadd <2 x double> %a, %b
  ret <2 x double> %r
}

define <4 x float> @haddps1(<4 x float> %x, <4 x float> %y) {
; SSE3-LABEL: haddps1:
; SSE3: # %bb.0:
; SSE3-NEXT: haddps %xmm1, %xmm0
; SSE3-NEXT: retq
;
; AVX-LABEL: haddps1:
; AVX: # %bb.0:
; AVX-NEXT: vhaddps %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
  %a = shufflevector <4 x float> %x, <4 x float> %y, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %b = shufflevector <4 x float> %x, <4 x float> %y, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  %r = fadd <4 x float> %a, %b
  ret <4 x float> %r
}

define <4 x float> @haddps2(<4 x float> %x, <4 x float> %y) {
; SSE3-LABEL: haddps2:
; SSE3: # %bb.0:
; SSE3-NEXT: haddps %xmm1, %xmm0
; SSE3-NEXT: retq
;
; AVX-LABEL: haddps2:
; AVX: # %bb.0:
; AVX-NEXT: vhaddps %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
  %a = shufflevector <4 x float> %x, <4 x float> %y, <4 x i32> <i32 1, i32 2, i32 5, i32 6>
  %b = shufflevector <4 x float> %y, <4 x float> %x, <4 x i32> <i32 4, i32 7, i32 0, i32 3>
  %r = fadd <4 x float> %a, %b
  ret <4 x float> %r
}

define <4 x float> @haddps3(<4 x float> %x) {
; SSE3-LABEL: haddps3:
; SSE3: # %bb.0:
; SSE3-NEXT: haddps %xmm0, %xmm0
; SSE3-NEXT: retq
;
; AVX-LABEL: haddps3:
; AVX: # %bb.0:
; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
  %a = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 2, i32 4, i32 6>
  %b = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 3, i32 5, i32 7>
  %r = fadd <4 x float> %a, %b
  ret <4 x float> %r
}

define <4 x float> @haddps4(<4 x float> %x) {
; SSE3-LABEL: haddps4:
; SSE3: # %bb.0:
; SSE3-NEXT: haddps %xmm0, %xmm0
; SSE3-NEXT: retq
;
; AVX-LABEL: haddps4:
; AVX: # %bb.0:
; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
  %a = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
  %b = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
  %r = fadd <4 x float> %a, %b
  ret <4 x float> %r
}

define <4 x float> @haddps5(<4 x float> %x) {
; SSE3-LABEL: haddps5:
; SSE3: # %bb.0:
; SSE3-NEXT: haddps %xmm0, %xmm0
; SSE3-NEXT: retq
;
; AVX-LABEL: haddps5:
; AVX: # %bb.0:
; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
  %a = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 0, i32 3, i32 undef, i32 undef>
  %b = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 1, i32 2, i32 undef, i32 undef>
  %r = fadd <4 x float> %a, %b
  ret <4 x float> %r
}

define <4 x float> @haddps6(<4 x float> %x) {
; SSE3-SLOW-LABEL: haddps6:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE3-SLOW-NEXT: addps %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: haddps6:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: haddps %xmm0, %xmm0
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: haddps6:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: haddps6:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: retq
  %a = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
  %b = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %r = fadd <4 x float> %a, %b
  ret <4 x float> %r
}

define <4 x float> @haddps7(<4 x float> %x) {
; SSE3-LABEL: haddps7:
; SSE3: # %bb.0:
; SSE3-NEXT: haddps %xmm0, %xmm0
; SSE3-NEXT: retq
;
; AVX-LABEL: haddps7:
; AVX: # %bb.0:
; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
  %a = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 3, i32 undef, i32 undef>
  %b = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 2, i32 undef, i32 undef>
  %r = fadd <4 x float> %a, %b
  ret <4 x float> %r
}

define <2 x double> @hsubpd1(<2 x double> %x, <2 x double> %y) {
; SSE3-LABEL: hsubpd1:
; SSE3: # %bb.0:
; SSE3-NEXT: hsubpd %xmm1, %xmm0
; SSE3-NEXT: retq
;
; AVX-LABEL: hsubpd1:
; AVX: # %bb.0:
; AVX-NEXT: vhsubpd %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
  %a = shufflevector <2 x double> %x, <2 x double> %y, <2 x i32> <i32 0, i32 2>
  %b = shufflevector <2 x double> %x, <2 x double> %y, <2 x i32> <i32 1, i32 3>
  %r = fsub <2 x double> %a, %b
  ret <2 x double> %r
}

define <2 x double> @hsubpd2(<2 x double> %x) {
; SSE3-SLOW-LABEL: hsubpd2:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movapd %xmm0, %xmm1
; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-SLOW-NEXT: subpd %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: hsubpd2:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: hsubpd %xmm0, %xmm0
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: hsubpd2:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-SLOW-NEXT: vsubpd %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: hsubpd2:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhsubpd %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: retq
  %a = shufflevector <2 x double> %x, <2 x double> undef, <2 x i32> <i32 0, i32 undef>
  %b = shufflevector <2 x double> %x, <2 x double> undef, <2 x i32> <i32 1, i32 undef>
  %r = fsub <2 x double> %a, %b
  ret <2 x double> %r
}

define <4 x float> @hsubps1(<4 x float> %x, <4 x float> %y) {
; SSE3-LABEL: hsubps1:
; SSE3: # %bb.0:
; SSE3-NEXT: hsubps %xmm1, %xmm0
; SSE3-NEXT: retq
;
; AVX-LABEL: hsubps1:
; AVX: # %bb.0:
; AVX-NEXT: vhsubps %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
  %a = shufflevector <4 x float> %x, <4 x float> %y, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %b = shufflevector <4 x float> %x, <4 x float> %y, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  %r = fsub <4 x float> %a, %b
  ret <4 x float> %r
}

define <4 x float> @hsubps2(<4 x float> %x) {
; SSE3-LABEL: hsubps2:
; SSE3: # %bb.0:
; SSE3-NEXT: hsubps %xmm0, %xmm0
; SSE3-NEXT: retq
;
; AVX-LABEL: hsubps2:
; AVX: # %bb.0:
; AVX-NEXT: vhsubps %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
  %a = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 2, i32 4, i32 6>
  %b = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 3, i32 5, i32 7>
  %r = fsub <4 x float> %a, %b
  ret <4 x float> %r
}

define <4 x float> @hsubps3(<4 x float> %x) {
; SSE3-LABEL: hsubps3:
; SSE3: # %bb.0:
; SSE3-NEXT: hsubps %xmm0, %xmm0
; SSE3-NEXT: retq
;
; AVX-LABEL: hsubps3:
; AVX: # %bb.0:
; AVX-NEXT: vhsubps %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
  %a = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
  %b = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
  %r = fsub <4 x float> %a, %b
  ret <4 x float> %r
}

define <4 x float> @hsubps4(<4 x float> %x) {
; SSE3-SLOW-LABEL: hsubps4:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE3-SLOW-NEXT: subps %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: hsubps4:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: hsubps %xmm0, %xmm0
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: hsubps4:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-SLOW-NEXT: vsubps %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: hsubps4:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhsubps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: retq
  %a = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
  %b = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %r = fsub <4 x float> %a, %b
  ret <4 x float> %r
}

define <8 x float> @vhaddps1(<8 x float> %x, <8 x float> %y) {
; SSE3-LABEL: vhaddps1:
; SSE3: # %bb.0:
; SSE3-NEXT: haddps %xmm2, %xmm0
; SSE3-NEXT: haddps %xmm3, %xmm1
; SSE3-NEXT: retq
;
; AVX-LABEL: vhaddps1:
; AVX: # %bb.0:
; AVX-NEXT: vhaddps %ymm1, %ymm0, %ymm0
; AVX-NEXT: retq
  %a = shufflevector <8 x float> %x, <8 x float> %y, <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 6, i32 12, i32 14>
  %b = shufflevector <8 x float> %x, <8 x float> %y, <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 5, i32 7, i32 13, i32 15>
  %r = fadd <8 x float> %a, %b
  ret <8 x float> %r
}

define <8 x float> @vhaddps2(<8 x float> %x, <8 x float> %y) {
; SSE3-LABEL: vhaddps2:
; SSE3: # %bb.0:
; SSE3-NEXT: haddps %xmm2, %xmm0
; SSE3-NEXT: haddps %xmm3, %xmm1
; SSE3-NEXT: retq
;
; AVX-LABEL: vhaddps2:
; AVX: # %bb.0:
; AVX-NEXT: vhaddps %ymm1, %ymm0, %ymm0
; AVX-NEXT: retq
  %a = shufflevector <8 x float> %x, <8 x float> %y, <8 x i32> <i32 1, i32 2, i32 9, i32 10, i32 5, i32 6, i32 13, i32 14>
  %b = shufflevector <8 x float> %y, <8 x float> %x, <8 x i32> <i32 8, i32 11, i32 0, i32 3, i32 12, i32 15, i32 4, i32 7>
  %r = fadd <8 x float> %a, %b
  ret <8 x float> %r
}

define <8 x float> @vhaddps3(<8 x float> %x) {
; SSE3-LABEL: vhaddps3:
; SSE3: # %bb.0:
; SSE3-NEXT: haddps %xmm0, %xmm0
; SSE3-NEXT: haddps %xmm1, %xmm1
; SSE3-NEXT: retq
;
; AVX-LABEL: vhaddps3:
; AVX: # %bb.0:
; AVX-NEXT: vhaddps %ymm0, %ymm0, %ymm0
; AVX-NEXT: retq
  %a = shufflevector <8 x float> %x, <8 x float> undef, <8 x i32> <i32 undef, i32 2, i32 8, i32 10, i32 4, i32 6, i32 undef, i32 14>
  %b = shufflevector <8 x float> %x, <8 x float> undef, <8 x i32> <i32 1, i32 3, i32 9, i32 undef, i32 5, i32 7, i32 13, i32 15>
  %r = fadd <8 x float> %a, %b
  ret <8 x float> %r
}

define <8 x float> @vhsubps1(<8 x float> %x, <8 x float> %y) {
; SSE3-LABEL: vhsubps1:
; SSE3: # %bb.0:
; SSE3-NEXT: hsubps %xmm2, %xmm0
; SSE3-NEXT: hsubps %xmm3, %xmm1
; SSE3-NEXT: retq
;
; AVX-LABEL: vhsubps1:
; AVX: # %bb.0:
; AVX-NEXT: vhsubps %ymm1, %ymm0, %ymm0
; AVX-NEXT: retq
  %a = shufflevector <8 x float> %x, <8 x float> %y, <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 6, i32 12, i32 14>
  %b = shufflevector <8 x float> %x, <8 x float> %y, <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 5, i32 7, i32 13, i32 15>
  %r = fsub <8 x float> %a, %b
  ret <8 x float> %r
}

define <8 x float> @vhsubps3(<8 x float> %x) {
; SSE3-LABEL: vhsubps3:
; SSE3: # %bb.0:
; SSE3-NEXT: hsubps %xmm0, %xmm0
; SSE3-NEXT: hsubps %xmm1, %xmm1
; SSE3-NEXT: retq
;
; AVX-LABEL: vhsubps3:
; AVX: # %bb.0:
; AVX-NEXT: vhsubps %ymm0, %ymm0, %ymm0
; AVX-NEXT: retq
  %a = shufflevector <8 x float> %x, <8 x float> undef, <8 x i32> <i32 undef, i32 2, i32 8, i32 10, i32 4, i32 6, i32 undef, i32 14>
  %b = shufflevector <8 x float> %x, <8 x float> undef, <8 x i32> <i32 1, i32 3, i32 9, i32 undef, i32 5, i32 7, i32 13, i32 15>
  %r = fsub <8 x float> %a, %b
  ret <8 x float> %r
}

define <4 x double> @vhaddpd1(<4 x double> %x, <4 x double> %y) {
; SSE3-LABEL: vhaddpd1:
; SSE3: # %bb.0:
; SSE3-NEXT: haddpd %xmm2, %xmm0
; SSE3-NEXT: haddpd %xmm3, %xmm1
; SSE3-NEXT: retq
;
; AVX-LABEL: vhaddpd1:
; AVX: # %bb.0:
; AVX-NEXT: vhaddpd %ymm1, %ymm0, %ymm0
; AVX-NEXT: retq
  %a = shufflevector <4 x double> %x, <4 x double> %y, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
  %b = shufflevector <4 x double> %x, <4 x double> %y, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
  %r = fadd <4 x double> %a, %b
  ret <4 x double> %r
}

define <4 x double> @vhsubpd1(<4 x double> %x, <4 x double> %y) {
; SSE3-LABEL: vhsubpd1:
; SSE3: # %bb.0:
; SSE3-NEXT: hsubpd %xmm2, %xmm0
; SSE3-NEXT: hsubpd %xmm3, %xmm1
; SSE3-NEXT: retq
;
; AVX-LABEL: vhsubpd1:
; AVX: # %bb.0:
; AVX-NEXT: vhsubpd %ymm1, %ymm0, %ymm0
; AVX-NEXT: retq
  %a = shufflevector <4 x double> %x, <4 x double> %y, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
  %b = shufflevector <4 x double> %x, <4 x double> %y, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
  %r = fsub <4 x double> %a, %b
  ret <4 x double> %r
}

define <2 x float> @haddps_v2f32(<4 x float> %v0) {
; SSE3-LABEL: haddps_v2f32:
; SSE3: # %bb.0:
; SSE3-NEXT: haddps %xmm0, %xmm0
; SSE3-NEXT: retq
;
; AVX-LABEL: haddps_v2f32:
; AVX: # %bb.0:
; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
  %v0.0 = extractelement <4 x float> %v0, i32 0
  %v0.1 = extractelement <4 x float> %v0, i32 1
  %v0.2 = extractelement <4 x float> %v0, i32 2
  %v0.3 = extractelement <4 x float> %v0, i32 3
  %op0 = fadd float %v0.0, %v0.1
  %op1 = fadd float %v0.2, %v0.3
  %res0 = insertelement <2 x float> undef, float %op0, i32 0
  %res1 = insertelement <2 x float> %res0, float %op1, i32 1
  ret <2 x float> %res1
}

; 128-bit vectors, float/double, fadd/fsub

define float @extract_extract01_v4f32_fadd_f32(<4 x float> %x) {
; SSE3-SLOW-LABEL: extract_extract01_v4f32_fadd_f32:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE3-SLOW-NEXT: addss %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: extract_extract01_v4f32_fadd_f32:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: haddps %xmm0, %xmm0
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: extract_extract01_v4f32_fadd_f32:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: extract_extract01_v4f32_fadd_f32:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: retq
  %x0 = extractelement <4 x float> %x, i32 0
  %x1 = extractelement <4 x float> %x, i32 1
  %x01 = fadd float %x0, %x1
  ret float %x01
}

define float @extract_extract23_v4f32_fadd_f32(<4 x float> %x) {
; SSE3-SLOW-LABEL: extract_extract23_v4f32_fadd_f32:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movaps %xmm0, %xmm1
; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
; SSE3-SLOW-NEXT: addss %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: extract_extract23_v4f32_fadd_f32:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: haddps %xmm0, %xmm0
; SSE3-FAST-NEXT: movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: extract_extract23_v4f32_fadd_f32:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
; AVX-SLOW-NEXT: vaddss %xmm0, %xmm1, %xmm0
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: extract_extract23_v4f32_fadd_f32:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; AVX-FAST-NEXT: retq
  %x0 = extractelement <4 x float> %x, i32 2
  %x1 = extractelement <4 x float> %x, i32 3
  %x01 = fadd float %x0, %x1
  ret float %x01
}

define float @extract_extract01_v4f32_fadd_f32_commute(<4 x float> %x) {
; SSE3-SLOW-LABEL: extract_extract01_v4f32_fadd_f32_commute:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE3-SLOW-NEXT: addss %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: extract_extract01_v4f32_fadd_f32_commute:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: haddps %xmm0, %xmm0
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: extract_extract01_v4f32_fadd_f32_commute:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-SLOW-NEXT: vaddss %xmm0, %xmm1, %xmm0
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: extract_extract01_v4f32_fadd_f32_commute:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: retq
  %x0 = extractelement <4 x float> %x, i32 0
  %x1 = extractelement <4 x float> %x, i32 1
  %x01 = fadd float %x1, %x0
  ret float %x01
}

define float @extract_extract23_v4f32_fadd_f32_commute(<4 x float> %x) {
; SSE3-SLOW-LABEL: extract_extract23_v4f32_fadd_f32_commute:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movaps %xmm0, %xmm1
; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
; SSE3-SLOW-NEXT: addss %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: extract_extract23_v4f32_fadd_f32_commute:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: haddps %xmm0, %xmm0
; SSE3-FAST-NEXT: movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: extract_extract23_v4f32_fadd_f32_commute:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
; AVX-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: extract_extract23_v4f32_fadd_f32_commute:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; AVX-FAST-NEXT: retq
  %x0 = extractelement <4 x float> %x, i32 2
  %x1 = extractelement <4 x float> %x, i32 3
  %x01 = fadd float %x1, %x0
  ret float %x01
}

define double @extract_extract01_v2f64_fadd_f64(<2 x double> %x) {
; SSE3-SLOW-LABEL: extract_extract01_v2f64_fadd_f64:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movapd %xmm0, %xmm1
; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-SLOW-NEXT: addsd %xmm0, %xmm1
; SSE3-SLOW-NEXT: movapd %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: extract_extract01_v2f64_fadd_f64:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: haddpd %xmm0, %xmm0
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: extract_extract01_v2f64_fadd_f64:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: extract_extract01_v2f64_fadd_f64:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: retq
  %x0 = extractelement <2 x double> %x, i32 0
  %x1 = extractelement <2 x double> %x, i32 1
  %x01 = fadd double %x0, %x1
  ret double %x01
}

define double @extract_extract01_v2f64_fadd_f64_commute(<2 x double> %x) {
; SSE3-SLOW-LABEL: extract_extract01_v2f64_fadd_f64_commute:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movapd %xmm0, %xmm1
; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-SLOW-NEXT: addsd %xmm0, %xmm1
; SSE3-SLOW-NEXT: movapd %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: extract_extract01_v2f64_fadd_f64_commute:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: haddpd %xmm0, %xmm0
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: extract_extract01_v2f64_fadd_f64_commute:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-SLOW-NEXT: vaddsd %xmm0, %xmm1, %xmm0
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: extract_extract01_v2f64_fadd_f64_commute:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: retq
  %x0 = extractelement <2 x double> %x, i32 0
  %x1 = extractelement <2 x double> %x, i32 1
  %x01 = fadd double %x1, %x0
  ret double %x01
}

define float @extract_extract01_v4f32_fsub_f32(<4 x float> %x) {
; SSE3-SLOW-LABEL: extract_extract01_v4f32_fsub_f32:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE3-SLOW-NEXT: subss %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: extract_extract01_v4f32_fsub_f32:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: hsubps %xmm0, %xmm0
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: extract_extract01_v4f32_fsub_f32:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-SLOW-NEXT: vsubss %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: extract_extract01_v4f32_fsub_f32:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhsubps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: retq
  %x0 = extractelement <4 x float> %x, i32 0
  %x1 = extractelement <4 x float> %x, i32 1
  %x01 = fsub float %x0, %x1
  ret float %x01
}

define float @extract_extract23_v4f32_fsub_f32(<4 x float> %x) {
; SSE3-SLOW-LABEL: extract_extract23_v4f32_fsub_f32:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movaps %xmm0, %xmm1
; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
; SSE3-SLOW-NEXT: subss %xmm0, %xmm1
; SSE3-SLOW-NEXT: movaps %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: extract_extract23_v4f32_fsub_f32:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: hsubps %xmm0, %xmm0
; SSE3-FAST-NEXT: movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: extract_extract23_v4f32_fsub_f32:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
; AVX-SLOW-NEXT: vsubss %xmm0, %xmm1, %xmm0
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: extract_extract23_v4f32_fsub_f32:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhsubps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; AVX-FAST-NEXT: retq
  %x0 = extractelement <4 x float> %x, i32 2
  %x1 = extractelement <4 x float> %x, i32 3
  %x01 = fsub float %x0, %x1
  ret float %x01
}

define float @extract_extract01_v4f32_fsub_f32_commute(<4 x float> %x) {
; SSE3-LABEL: extract_extract01_v4f32_fsub_f32_commute:
; SSE3: # %bb.0:
; SSE3-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE3-NEXT: subss %xmm0, %xmm1
; SSE3-NEXT: movaps %xmm1, %xmm0
; SSE3-NEXT: retq
;
; AVX-LABEL: extract_extract01_v4f32_fsub_f32_commute:
; AVX: # %bb.0:
; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-NEXT: vsubss %xmm0, %xmm1, %xmm0
; AVX-NEXT: retq
  %x0 = extractelement <4 x float> %x, i32 0
  %x1 = extractelement <4 x float> %x, i32 1
  %x01 = fsub float %x1, %x0
  ret float %x01
}

define float @extract_extract23_v4f32_fsub_f32_commute(<4 x float> %x) {
; SSE3-LABEL: extract_extract23_v4f32_fsub_f32_commute:
; SSE3: # %bb.0:
; SSE3-NEXT: movaps %xmm0, %xmm1
; SSE3-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
; SSE3-NEXT: subss %xmm1, %xmm0
; SSE3-NEXT: retq
;
; AVX-LABEL: extract_extract23_v4f32_fsub_f32_commute:
; AVX: # %bb.0:
; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
; AVX-NEXT: vsubss %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
  %x0 = extractelement <4 x float> %x, i32 2
  %x1 = extractelement <4 x float> %x, i32 3
  %x01 = fsub float %x1, %x0
  ret float %x01
}

define double @extract_extract01_v2f64_fsub_f64(<2 x double> %x) {
; SSE3-SLOW-LABEL: extract_extract01_v2f64_fsub_f64:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movapd %xmm0, %xmm1
; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-SLOW-NEXT: subsd %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: extract_extract01_v2f64_fsub_f64:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: hsubpd %xmm0, %xmm0
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: extract_extract01_v2f64_fsub_f64:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-SLOW-NEXT: vsubsd %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: extract_extract01_v2f64_fsub_f64:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhsubpd %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: retq
  %x0 = extractelement <2 x double> %x, i32 0
  %x1 = extractelement <2 x double> %x, i32 1
  %x01 = fsub double %x0, %x1
  ret double %x01
}

define double @extract_extract01_v2f64_fsub_f64_commute(<2 x double> %x) {
; SSE3-LABEL: extract_extract01_v2f64_fsub_f64_commute:
; SSE3: # %bb.0:
; SSE3-NEXT: movapd %xmm0, %xmm1
; SSE3-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-NEXT: subsd %xmm0, %xmm1
; SSE3-NEXT: movapd %xmm1, %xmm0
; SSE3-NEXT: retq
;
; AVX-LABEL: extract_extract01_v2f64_fsub_f64_commute:
; AVX: # %bb.0:
; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT: vsubsd %xmm0, %xmm1, %xmm0
; AVX-NEXT: retq
  %x0 = extractelement <2 x double> %x, i32 0
  %x1 = extractelement <2 x double> %x, i32 1
  %x01 = fsub double %x1, %x0
  ret double %x01
}

; 256-bit vectors, float/double, fadd/fsub

define float @extract_extract01_v8f32_fadd_f32(<8 x float> %x) {
; SSE3-SLOW-LABEL: extract_extract01_v8f32_fadd_f32:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE3-SLOW-NEXT: addss %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: extract_extract01_v8f32_fadd_f32:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: haddps %xmm0, %xmm0
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: extract_extract01_v8f32_fadd_f32:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: vzeroupper
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: extract_extract01_v8f32_fadd_f32:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vzeroupper
; AVX-FAST-NEXT: retq
  %x0 = extractelement <8 x float> %x, i32 0
  %x1 = extractelement <8 x float> %x, i32 1
  %x01 = fadd float %x0, %x1
  ret float %x01
}

define float @extract_extract23_v8f32_fadd_f32(<8 x float> %x) {
; SSE3-SLOW-LABEL: extract_extract23_v8f32_fadd_f32:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movaps %xmm0, %xmm1
; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
; SSE3-SLOW-NEXT: addss %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: extract_extract23_v8f32_fadd_f32:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: haddps %xmm0, %xmm0
; SSE3-FAST-NEXT: movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: extract_extract23_v8f32_fadd_f32:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
; AVX-SLOW-NEXT: vaddss %xmm0, %xmm1, %xmm0
; AVX-SLOW-NEXT: vzeroupper
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: extract_extract23_v8f32_fadd_f32:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; AVX-FAST-NEXT: vzeroupper
; AVX-FAST-NEXT: retq
  %x0 = extractelement <8 x float> %x, i32 2
  %x1 = extractelement <8 x float> %x, i32 3
  %x01 = fadd float %x0, %x1
  ret float %x01
}

define float @extract_extract67_v8f32_fadd_f32(<8 x float> %x) {
; SSE3-SLOW-LABEL: extract_extract67_v8f32_fadd_f32:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movaps %xmm1, %xmm0
; SSE3-SLOW-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1]
; SSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
; SSE3-SLOW-NEXT: addss %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: extract_extract67_v8f32_fadd_f32:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: haddps %xmm1, %xmm1
; SSE3-FAST-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: extract_extract67_v8f32_fadd_f32:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
; AVX-SLOW-NEXT: vaddss %xmm0, %xmm1, %xmm0
; AVX-SLOW-NEXT: vzeroupper
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: extract_extract67_v8f32_fadd_f32:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; AVX-FAST-NEXT: vzeroupper
; AVX-FAST-NEXT: retq
  %x0 = extractelement <8 x float> %x, i32 6
  %x1 = extractelement <8 x float> %x, i32 7
  %x01 = fadd float %x0, %x1
  ret float %x01
}

define float @extract_extract01_v8f32_fadd_f32_commute(<8 x float> %x) {
; SSE3-SLOW-LABEL: extract_extract01_v8f32_fadd_f32_commute:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE3-SLOW-NEXT: addss %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: extract_extract01_v8f32_fadd_f32_commute:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: haddps %xmm0, %xmm0
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: extract_extract01_v8f32_fadd_f32_commute:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-SLOW-NEXT: vaddss %xmm0, %xmm1, %xmm0
; AVX-SLOW-NEXT: vzeroupper
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: extract_extract01_v8f32_fadd_f32_commute:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vzeroupper
; AVX-FAST-NEXT: retq
  %x0 = extractelement <8 x float> %x, i32 0
  %x1 = extractelement <8 x float> %x, i32 1
  %x01 = fadd float %x1, %x0
  ret float %x01
}

define float @extract_extract23_v8f32_fadd_f32_commute(<8 x float> %x) {
; SSE3-SLOW-LABEL: extract_extract23_v8f32_fadd_f32_commute:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movaps %xmm0, %xmm1
; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
; SSE3-SLOW-NEXT: addss %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: extract_extract23_v8f32_fadd_f32_commute:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: haddps %xmm0, %xmm0
; SSE3-FAST-NEXT: movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: extract_extract23_v8f32_fadd_f32_commute:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
; AVX-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: vzeroupper
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: extract_extract23_v8f32_fadd_f32_commute:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; AVX-FAST-NEXT: vzeroupper
; AVX-FAST-NEXT: retq
  %x0 = extractelement <8 x float> %x, i32 2
  %x1 = extractelement <8 x float> %x, i32 3
  %x01 = fadd float %x1, %x0
  ret float %x01
}

define float @extract_extract67_v8f32_fadd_f32_commute(<8 x float> %x) {
; SSE3-SLOW-LABEL: extract_extract67_v8f32_fadd_f32_commute:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movaps %xmm1, %xmm0
; SSE3-SLOW-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1]
; SSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
; SSE3-SLOW-NEXT: addss %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: extract_extract67_v8f32_fadd_f32_commute:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: haddps %xmm1, %xmm1
; SSE3-FAST-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: extract_extract67_v8f32_fadd_f32_commute:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
; AVX-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: vzeroupper
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: extract_extract67_v8f32_fadd_f32_commute:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; AVX-FAST-NEXT: vzeroupper
; AVX-FAST-NEXT: retq
  %x0 = extractelement <8 x float> %x, i32 6
  %x1 = extractelement <8 x float> %x, i32 7
  %x01 = fadd float %x1, %x0
  ret float %x01
}

define double @extract_extract01_v4f64_fadd_f64(<4 x double> %x) {
; SSE3-SLOW-LABEL: extract_extract01_v4f64_fadd_f64:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movapd %xmm0, %xmm1
; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-SLOW-NEXT: addsd %xmm0, %xmm1
; SSE3-SLOW-NEXT: movapd %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: extract_extract01_v4f64_fadd_f64:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: haddpd %xmm0, %xmm0
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: extract_extract01_v4f64_fadd_f64:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: vzeroupper
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: extract_extract01_v4f64_fadd_f64:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vzeroupper
; AVX-FAST-NEXT: retq
  %x0 = extractelement <4 x double> %x, i32 0
  %x1 = extractelement <4 x double> %x, i32 1
  %x01 = fadd double %x0, %x1
  ret double %x01
}

define double @extract_extract23_v4f64_fadd_f64(<4 x double> %x) {
; SSE3-SLOW-LABEL: extract_extract23_v4f64_fadd_f64:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movapd %xmm1, %xmm0
; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; SSE3-SLOW-NEXT: addsd %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: extract_extract23_v4f64_fadd_f64:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: movapd %xmm1, %xmm0
; SSE3-FAST-NEXT: haddpd %xmm1, %xmm0
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: extract_extract23_v4f64_fadd_f64:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: vzeroupper
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: extract_extract23_v4f64_fadd_f64:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX-FAST-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vzeroupper
; AVX-FAST-NEXT: retq
  %x0 = extractelement <4 x double> %x, i32 2
  %x1 = extractelement <4 x double> %x, i32 3
  %x01 = fadd double %x0, %x1
  ret double %x01
}

define double @extract_extract01_v4f64_fadd_f64_commute(<4 x double> %x) {
; SSE3-SLOW-LABEL: extract_extract01_v4f64_fadd_f64_commute:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movapd %xmm0, %xmm1
; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-SLOW-NEXT: addsd %xmm0, %xmm1
; SSE3-SLOW-NEXT: movapd %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: extract_extract01_v4f64_fadd_f64_commute:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: haddpd %xmm0, %xmm0
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: extract_extract01_v4f64_fadd_f64_commute:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-SLOW-NEXT: vaddsd %xmm0, %xmm1, %xmm0
; AVX-SLOW-NEXT: vzeroupper
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: extract_extract01_v4f64_fadd_f64_commute:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vzeroupper
; AVX-FAST-NEXT: retq
  %x0 = extractelement <4 x double> %x, i32 0
  %x1 = extractelement <4 x double> %x, i32 1
  %x01 = fadd double %x1, %x0
  ret double %x01
}

define double @extract_extract23_v4f64_fadd_f64_commute(<4 x double> %x) {
; SSE3-SLOW-LABEL: extract_extract23_v4f64_fadd_f64_commute:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movapd %xmm1, %xmm0
; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; SSE3-SLOW-NEXT: addsd %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: extract_extract23_v4f64_fadd_f64_commute:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: movapd %xmm1, %xmm0
; SSE3-FAST-NEXT: haddpd %xmm1, %xmm0
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: extract_extract23_v4f64_fadd_f64_commute:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-SLOW-NEXT: vaddsd %xmm0, %xmm1, %xmm0
; AVX-SLOW-NEXT: vzeroupper
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: extract_extract23_v4f64_fadd_f64_commute:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX-FAST-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vzeroupper
; AVX-FAST-NEXT: retq
  %x0 = extractelement <4 x double> %x, i32 2
  %x1 = extractelement <4 x double> %x, i32 3
  %x01 = fadd double %x1, %x0
  ret double %x01
}

define float @extract_extract01_v8f32_fsub_f32(<8 x float> %x) {
; SSE3-SLOW-LABEL: extract_extract01_v8f32_fsub_f32:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE3-SLOW-NEXT: subss %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: extract_extract01_v8f32_fsub_f32:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: hsubps %xmm0, %xmm0
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: extract_extract01_v8f32_fsub_f32:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-SLOW-NEXT: vsubss %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: vzeroupper
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: extract_extract01_v8f32_fsub_f32:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhsubps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vzeroupper
; AVX-FAST-NEXT: retq
  %x0 = extractelement <8 x float> %x, i32 0
  %x1 = extractelement <8 x float> %x, i32 1
  %x01 = fsub float %x0, %x1
  ret float %x01
}

define float @extract_extract23_v8f32_fsub_f32(<8 x float> %x) {
; SSE3-SLOW-LABEL: extract_extract23_v8f32_fsub_f32:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movaps %xmm0, %xmm1
; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
; SSE3-SLOW-NEXT: subss %xmm0, %xmm1
; SSE3-SLOW-NEXT: movaps %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: extract_extract23_v8f32_fsub_f32:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: hsubps %xmm0, %xmm0
; SSE3-FAST-NEXT: movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: extract_extract23_v8f32_fsub_f32:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
; AVX-SLOW-NEXT: vsubss %xmm0, %xmm1, %xmm0
; AVX-SLOW-NEXT: vzeroupper
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: extract_extract23_v8f32_fsub_f32:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhsubps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; AVX-FAST-NEXT: vzeroupper
; AVX-FAST-NEXT: retq
  %x0 = extractelement <8 x float> %x, i32 2
  %x1 = extractelement <8 x float> %x, i32 3
  %x01 = fsub float %x0, %x1
  ret float %x01
}

define float @extract_extract45_v8f32_fsub_f32(<8 x float> %x) {
; SSE3-SLOW-LABEL: extract_extract45_v8f32_fsub_f32:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movaps %xmm1, %xmm0
; SSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE3-SLOW-NEXT: subss %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: extract_extract45_v8f32_fsub_f32:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: movaps %xmm1, %xmm0
; SSE3-FAST-NEXT: hsubps %xmm1, %xmm0
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: extract_extract45_v8f32_fsub_f32:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-SLOW-NEXT: vsubss %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: vzeroupper
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: extract_extract45_v8f32_fsub_f32:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX-FAST-NEXT: vhsubps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vzeroupper
; AVX-FAST-NEXT: retq
  %x0 = extractelement <8 x float> %x, i32 4
  %x1 = extractelement <8 x float> %x, i32 5
  %x01 = fsub float %x0, %x1
  ret float %x01
}

; Negative test...or get hoppy and negate?

define float @extract_extract01_v8f32_fsub_f32_commute(<8 x float> %x) {
; SSE3-LABEL: extract_extract01_v8f32_fsub_f32_commute:
; SSE3: # %bb.0:
; SSE3-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE3-NEXT: subss %xmm0, %xmm1
; SSE3-NEXT: movaps %xmm1, %xmm0
; SSE3-NEXT: retq
;
; AVX-LABEL: extract_extract01_v8f32_fsub_f32_commute:
; AVX: # %bb.0:
; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-NEXT: vsubss %xmm0, %xmm1, %xmm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
  %x0 = extractelement <8 x float> %x, i32 0
  %x1 = extractelement <8 x float> %x, i32 1
  %x01 = fsub float %x1, %x0
  ret float %x01
}

define double @extract_extract01_v4f64_fsub_f64(<4 x double> %x) {
; SSE3-SLOW-LABEL: extract_extract01_v4f64_fsub_f64:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movapd %xmm0, %xmm1
; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-SLOW-NEXT: subsd %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: extract_extract01_v4f64_fsub_f64:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: hsubpd %xmm0, %xmm0
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: extract_extract01_v4f64_fsub_f64:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-SLOW-NEXT: vsubsd %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: vzeroupper
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: extract_extract01_v4f64_fsub_f64:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhsubpd %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vzeroupper
; AVX-FAST-NEXT: retq
  %x0 = extractelement <4 x double> %x, i32 0
  %x1 = extractelement <4 x double> %x, i32 1
  %x01 = fsub double %x0, %x1
  ret double %x01
}

; Negative test...or get hoppy and negate?

define double @extract_extract01_v4f64_fsub_f64_commute(<4 x double> %x) {
; SSE3-LABEL: extract_extract01_v4f64_fsub_f64_commute:
; SSE3: # %bb.0:
; SSE3-NEXT: movapd %xmm0, %xmm1
; SSE3-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-NEXT: subsd %xmm0, %xmm1
; SSE3-NEXT: movapd %xmm1, %xmm0
; SSE3-NEXT: retq
;
; AVX-LABEL: extract_extract01_v4f64_fsub_f64_commute:
; AVX: # %bb.0:
; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT: vsubsd %xmm0, %xmm1, %xmm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
  %x0 = extractelement <4 x double> %x, i32 0
  %x1 = extractelement <4 x double> %x, i32 1
  %x01 = fsub double %x1, %x0
  ret double %x01
}

; 512-bit vectors, float/double, fadd/fsub

define float @extract_extract01_v16f32_fadd_f32(<16 x float> %x) {
; SSE3-SLOW-LABEL: extract_extract01_v16f32_fadd_f32:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE3-SLOW-NEXT: addss %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: extract_extract01_v16f32_fadd_f32:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: haddps %xmm0, %xmm0
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: extract_extract01_v16f32_fadd_f32:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: vzeroupper
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: extract_extract01_v16f32_fadd_f32:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vzeroupper
; AVX-FAST-NEXT: retq
  %x0 = extractelement <16 x float> %x, i32 0
  %x1 = extractelement <16 x float> %x, i32 1
  %x01 = fadd float %x0, %x1
  ret float %x01
}

define float @extract_extract01_v16f32_fadd_f32_commute(<16 x float> %x) {
; SSE3-SLOW-LABEL: extract_extract01_v16f32_fadd_f32_commute:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE3-SLOW-NEXT: addss %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: extract_extract01_v16f32_fadd_f32_commute:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: haddps %xmm0, %xmm0
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: extract_extract01_v16f32_fadd_f32_commute:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-SLOW-NEXT: vaddss %xmm0, %xmm1, %xmm0
; AVX-SLOW-NEXT: vzeroupper
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: extract_extract01_v16f32_fadd_f32_commute:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vzeroupper
; AVX-FAST-NEXT: retq
  %x0 = extractelement <16 x float> %x, i32 0
  %x1 = extractelement <16 x float> %x, i32 1
  %x01 = fadd float %x1, %x0
  ret float %x01
}

define double @extract_extract01_v8f64_fadd_f64(<8 x double> %x) {
; SSE3-SLOW-LABEL: extract_extract01_v8f64_fadd_f64:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movapd %xmm0, %xmm1
; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-SLOW-NEXT: addsd %xmm0, %xmm1
; SSE3-SLOW-NEXT: movapd %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: extract_extract01_v8f64_fadd_f64:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: haddpd %xmm0, %xmm0
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: extract_extract01_v8f64_fadd_f64:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: vzeroupper
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: extract_extract01_v8f64_fadd_f64:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vzeroupper
; AVX-FAST-NEXT: retq
  %x0 = extractelement <8 x double> %x, i32 0
  %x1 = extractelement <8 x double> %x, i32 1
  %x01 = fadd double %x0, %x1
  ret double %x01
}

define double @extract_extract01_v8f64_fadd_f64_commute(<8 x double> %x) {
; SSE3-SLOW-LABEL: extract_extract01_v8f64_fadd_f64_commute:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movapd %xmm0, %xmm1
; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-SLOW-NEXT: addsd %xmm0, %xmm1
; SSE3-SLOW-NEXT: movapd %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: extract_extract01_v8f64_fadd_f64_commute:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: haddpd %xmm0, %xmm0
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: extract_extract01_v8f64_fadd_f64_commute:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-SLOW-NEXT: vaddsd %xmm0, %xmm1, %xmm0
; AVX-SLOW-NEXT: vzeroupper
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: extract_extract01_v8f64_fadd_f64_commute:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vzeroupper
; AVX-FAST-NEXT: retq
  %x0 = extractelement <8 x double> %x, i32 0
  %x1 = extractelement <8 x double> %x, i32 1
  %x01 = fadd double %x1, %x0
  ret double %x01
}

define float @extract_extract01_v16f32_fsub_f32(<16 x float> %x) {
; SSE3-SLOW-LABEL: extract_extract01_v16f32_fsub_f32:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE3-SLOW-NEXT: subss %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: extract_extract01_v16f32_fsub_f32:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: hsubps %xmm0, %xmm0
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: extract_extract01_v16f32_fsub_f32:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-SLOW-NEXT: vsubss %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: vzeroupper
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: extract_extract01_v16f32_fsub_f32:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhsubps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vzeroupper
; AVX-FAST-NEXT: retq
  %x0 = extractelement <16 x float> %x, i32 0
  %x1 = extractelement <16 x float> %x, i32 1
  %x01 = fsub float %x0, %x1
  ret float %x01
}

define float @extract_extract01_v16f32_fsub_f32_commute(<16 x float> %x) {
; SSE3-LABEL: extract_extract01_v16f32_fsub_f32_commute:
; SSE3: # %bb.0:
; SSE3-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE3-NEXT: subss %xmm0, %xmm1
; SSE3-NEXT: movaps %xmm1, %xmm0
; SSE3-NEXT: retq
;
; AVX-LABEL: extract_extract01_v16f32_fsub_f32_commute:
; AVX: # %bb.0:
; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-NEXT: vsubss %xmm0, %xmm1, %xmm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
  %x0 = extractelement <16 x float> %x, i32 0
  %x1 = extractelement <16 x float> %x, i32 1
  %x01 = fsub float %x1, %x0
  ret float %x01
}

define double @extract_extract01_v8f64_fsub_f64(<8 x double> %x) {
; SSE3-SLOW-LABEL: extract_extract01_v8f64_fsub_f64:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movapd %xmm0, %xmm1
; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-SLOW-NEXT: subsd %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: extract_extract01_v8f64_fsub_f64:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: hsubpd %xmm0, %xmm0
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: extract_extract01_v8f64_fsub_f64:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-SLOW-NEXT: vsubsd %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: vzeroupper
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: extract_extract01_v8f64_fsub_f64:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhsubpd %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vzeroupper
; AVX-FAST-NEXT: retq
  %x0 = extractelement <8 x double> %x, i32 0
  %x1 = extractelement <8 x double> %x, i32 1
  %x01 = fsub double %x0, %x1
  ret double %x01
}

define double @extract_extract01_v8f64_fsub_f64_commute(<8 x double> %x) {
; SSE3-LABEL: extract_extract01_v8f64_fsub_f64_commute:
; SSE3: # %bb.0:
; SSE3-NEXT: movapd %xmm0, %xmm1
; SSE3-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-NEXT: subsd %xmm0, %xmm1
; SSE3-NEXT: movapd %xmm1, %xmm0
; SSE3-NEXT: retq
;
; AVX-LABEL: extract_extract01_v8f64_fsub_f64_commute:
; AVX: # %bb.0:
; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT: vsubsd %xmm0, %xmm1, %xmm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
  %x0 = extractelement <8 x double> %x, i32 0
  %x1 = extractelement <8 x double> %x, i32 1
  %x01 = fsub double %x1, %x0
  ret double %x01
}

; Check output when 1 or both extracts have extra uses.

define float @extract_extract01_v4f32_fadd_f32_uses1(<4 x float> %x, float* %p) {
; SSE3-SLOW-LABEL: extract_extract01_v4f32_fadd_f32_uses1:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movss %xmm0, (%rdi)
; SSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE3-SLOW-NEXT: addss %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: extract_extract01_v4f32_fadd_f32_uses1:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: movss %xmm0, (%rdi)
; SSE3-FAST-NEXT: haddps %xmm0, %xmm0
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: extract_extract01_v4f32_fadd_f32_uses1:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vmovss %xmm0, (%rdi)
; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: extract_extract01_v4f32_fadd_f32_uses1:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vmovss %xmm0, (%rdi)
; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: retq
  %x0 = extractelement <4 x float> %x, i32 0
  store float %x0, float* %p
  %x1 = extractelement <4 x float> %x, i32 1
  %x01 = fadd float %x0, %x1
  ret float %x01
}

define float @extract_extract01_v4f32_fadd_f32_uses2(<4 x float> %x, float* %p) {
; SSE3-SLOW-LABEL: extract_extract01_v4f32_fadd_f32_uses2:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE3-SLOW-NEXT: movss %xmm1, (%rdi)
; SSE3-SLOW-NEXT: addss %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: extract_extract01_v4f32_fadd_f32_uses2:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE3-FAST-NEXT: movss %xmm1, (%rdi)
; SSE3-FAST-NEXT: haddps %xmm0, %xmm0
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: extract_extract01_v4f32_fadd_f32_uses2:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-SLOW-NEXT: vmovss %xmm1, (%rdi)
; AVX-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: extract_extract01_v4f32_fadd_f32_uses2:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vextractps $1, %xmm0, (%rdi)
; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: retq
  %x0 = extractelement <4 x float> %x, i32 0
  %x1 = extractelement <4 x float> %x, i32 1
  store float %x1, float* %p
  %x01 = fadd float %x0, %x1
  ret float %x01
}

define float @extract_extract01_v4f32_fadd_f32_uses3(<4 x float> %x, float* %p1, float* %p2) {
; SSE3-LABEL: extract_extract01_v4f32_fadd_f32_uses3:
; SSE3: # %bb.0:
; SSE3-NEXT: movss %xmm0, (%rdi)
; SSE3-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE3-NEXT: movss %xmm1, (%rsi)
; SSE3-NEXT: addss %xmm1, %xmm0
; SSE3-NEXT: retq
;
; AVX-LABEL: extract_extract01_v4f32_fadd_f32_uses3:
; AVX: # %bb.0:
; AVX-NEXT: vmovss %xmm0, (%rdi)
; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-NEXT: vmovss %xmm1, (%rsi)
; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
  %x0 = extractelement <4 x float> %x, i32 0
  store float %x0, float* %p1
  %x1 = extractelement <4 x float> %x, i32 1
  store float %x1, float* %p2
  %x01 = fadd float %x0, %x1
  ret float %x01
}

; Repeat tests from general reductions to verify output for hoppy targets:
; PR38971: https://bugs.llvm.org/show_bug.cgi?id=38971

declare float @llvm.experimental.vector.reduce.v2.fadd.f32.v8f32(float, <8 x float>)
declare double @llvm.experimental.vector.reduce.v2.fadd.f64.v4f64(double, <4 x double>)

define float @fadd_reduce_v8f32(float %a0, <8 x float> %a1) {
; SSE3-SLOW-LABEL: fadd_reduce_v8f32:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: addps %xmm2, %xmm1
; SSE3-SLOW-NEXT: movaps %xmm1, %xmm2
; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
; SSE3-SLOW-NEXT: addps %xmm1, %xmm2
; SSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
; SSE3-SLOW-NEXT: addss %xmm2, %xmm1
; SSE3-SLOW-NEXT: addss %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: fadd_reduce_v8f32:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: haddps %xmm1, %xmm2
; SSE3-FAST-NEXT: haddps %xmm2, %xmm2
; SSE3-FAST-NEXT: haddps %xmm2, %xmm2
; SSE3-FAST-NEXT: addss %xmm2, %xmm0
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: fadd_reduce_v8f32:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX-SLOW-NEXT: vaddps %xmm2, %xmm1, %xmm1
; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX-SLOW-NEXT: vaddps %xmm2, %xmm1, %xmm1
; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX-SLOW-NEXT: vaddss %xmm2, %xmm1, %xmm1
; AVX-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: vzeroupper
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: fadd_reduce_v8f32:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX-FAST-NEXT: vhaddps %xmm1, %xmm2, %xmm1
; AVX-FAST-NEXT: vhaddps %xmm1, %xmm1, %xmm1
; AVX-FAST-NEXT: vhaddps %xmm1, %xmm1, %xmm1
; AVX-FAST-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX-FAST-NEXT: vzeroupper
; AVX-FAST-NEXT: retq
  %r = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v8f32(float %a0, <8 x float> %a1)
  ret float %r
}

define double @fadd_reduce_v4f64(double %a0, <4 x double> %a1) {
; SSE3-SLOW-LABEL: fadd_reduce_v4f64:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: addpd %xmm2, %xmm1
; SSE3-SLOW-NEXT: movapd %xmm1, %xmm2
; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
; SSE3-SLOW-NEXT: addsd %xmm1, %xmm2
; SSE3-SLOW-NEXT: addsd %xmm2, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: fadd_reduce_v4f64:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: haddpd %xmm1, %xmm2
; SSE3-FAST-NEXT: haddpd %xmm2, %xmm2
; SSE3-FAST-NEXT: addsd %xmm2, %xmm0
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: fadd_reduce_v4f64:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX-SLOW-NEXT: vaddpd %xmm2, %xmm1, %xmm1
; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX-SLOW-NEXT: vaddsd %xmm2, %xmm1, %xmm1
; AVX-SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: vzeroupper
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: fadd_reduce_v4f64:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX-FAST-NEXT: vhaddpd %xmm1, %xmm2, %xmm1
; AVX-FAST-NEXT: vhaddpd %xmm1, %xmm1, %xmm1
; AVX-FAST-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX-FAST-NEXT: vzeroupper
; AVX-FAST-NEXT: retq
  %r = call fast double @llvm.experimental.vector.reduce.v2.fadd.f64.v4f64(double %a0, <4 x double> %a1)
  ret double %r
}
1718 define float @PR39936_v8f32(<8 x float>) {
; SSE3-SLOW-LABEL: PR39936_v8f32:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: haddps %xmm1, %xmm0
; SSE3-SLOW-NEXT: haddps %xmm0, %xmm0
; SSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE3-SLOW-NEXT: addss %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: PR39936_v8f32:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: haddps %xmm1, %xmm0
; SSE3-FAST-NEXT: haddps %xmm0, %xmm0
; SSE3-FAST-NEXT: haddps %xmm0, %xmm0
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: PR39936_v8f32:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX-SLOW-NEXT: vhaddps %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: vzeroupper
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: PR39936_v8f32:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX-FAST-NEXT: vhaddps %xmm1, %xmm0, %xmm0
; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vzeroupper
; AVX-FAST-NEXT: retq
%2 = shufflevector <8 x float> %0, <8 x float> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 undef, i32 undef, i32 undef, i32 undef>
%3 = shufflevector <8 x float> %0, <8 x float> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
%4 = fadd <8 x float> %2, %3
%5 = shufflevector <8 x float> %4, <8 x float> undef, <8 x i32> <i32 0, i32 2, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
%6 = shufflevector <8 x float> %4, <8 x float> undef, <8 x i32> <i32 1, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
%7 = fadd <8 x float> %5, %6
%8 = shufflevector <8 x float> %7, <8 x float> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
%9 = fadd <8 x float> %7, %8
%10 = extractelement <8 x float> %9, i32 0
ret float %10
}

define float @hadd32_4(<4 x float> %x225) {
; SSE3-SLOW-LABEL: hadd32_4:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movaps %xmm0, %xmm1
; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-SLOW-NEXT: addps %xmm0, %xmm1
; SSE3-SLOW-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
; SSE3-SLOW-NEXT: addss %xmm0, %xmm1
; SSE3-SLOW-NEXT: movaps %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: hadd32_4:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: movaps %xmm0, %xmm1
; SSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-FAST-NEXT: addps %xmm0, %xmm1
; SSE3-FAST-NEXT: haddps %xmm1, %xmm1
; SSE3-FAST-NEXT: movaps %xmm1, %xmm0
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: hadd32_4:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: hadd32_4:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-FAST-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: retq
%x226 = shufflevector <4 x float> %x225, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
%x227 = fadd <4 x float> %x225, %x226
%x228 = shufflevector <4 x float> %x227, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
%x229 = fadd <4 x float> %x227, %x228
%x230 = extractelement <4 x float> %x229, i32 0
ret float %x230
}

define float @hadd32_8(<8 x float> %x225) {
; SSE3-SLOW-LABEL: hadd32_8:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movaps %xmm0, %xmm1
; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-SLOW-NEXT: addps %xmm0, %xmm1
; SSE3-SLOW-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
; SSE3-SLOW-NEXT: addss %xmm0, %xmm1
; SSE3-SLOW-NEXT: movaps %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: hadd32_8:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: movaps %xmm0, %xmm1
; SSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-FAST-NEXT: addps %xmm0, %xmm1
; SSE3-FAST-NEXT: haddps %xmm1, %xmm1
; SSE3-FAST-NEXT: movaps %xmm1, %xmm0
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: hadd32_8:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: vzeroupper
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: hadd32_8:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-FAST-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vzeroupper
; AVX-FAST-NEXT: retq
%x226 = shufflevector <8 x float> %x225, <8 x float> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
%x227 = fadd <8 x float> %x225, %x226
%x228 = shufflevector <8 x float> %x227, <8 x float> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
%x229 = fadd <8 x float> %x227, %x228
%x230 = extractelement <8 x float> %x229, i32 0
ret float %x230
}

define float @hadd32_16(<16 x float> %x225) {
; SSE3-SLOW-LABEL: hadd32_16:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movaps %xmm0, %xmm1
; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-SLOW-NEXT: addps %xmm0, %xmm1
; SSE3-SLOW-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
; SSE3-SLOW-NEXT: addss %xmm0, %xmm1
; SSE3-SLOW-NEXT: movaps %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: hadd32_16:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: movaps %xmm0, %xmm1
; SSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-FAST-NEXT: addps %xmm0, %xmm1
; SSE3-FAST-NEXT: haddps %xmm1, %xmm1
; SSE3-FAST-NEXT: movaps %xmm1, %xmm0
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: hadd32_16:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: vzeroupper
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: hadd32_16:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-FAST-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vzeroupper
; AVX-FAST-NEXT: retq
%x226 = shufflevector <16 x float> %x225, <16 x float> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
%x227 = fadd <16 x float> %x225, %x226
%x228 = shufflevector <16 x float> %x227, <16 x float> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
%x229 = fadd <16 x float> %x227, %x228
%x230 = extractelement <16 x float> %x229, i32 0
ret float %x230
}
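
; Repeat the partial reductions with optsize: forming the final haddps saves
; code size even on slow-hop targets, so the SLOW and FAST outputs converge
; into shared SSE3/AVX checks below.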

define float @hadd32_4_optsize(<4 x float> %x225) optsize {
; SSE3-LABEL: hadd32_4_optsize:
; SSE3: # %bb.0:
; SSE3-NEXT: movaps %xmm0, %xmm1
; SSE3-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-NEXT: addps %xmm0, %xmm1
; SSE3-NEXT: haddps %xmm1, %xmm1
; SSE3-NEXT: movaps %xmm1, %xmm0
; SSE3-NEXT: retq
;
; AVX-LABEL: hadd32_4_optsize:
; AVX: # %bb.0:
; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
%x226 = shufflevector <4 x float> %x225, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
%x227 = fadd <4 x float> %x225, %x226
%x228 = shufflevector <4 x float> %x227, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
%x229 = fadd <4 x float> %x227, %x228
%x230 = extractelement <4 x float> %x229, i32 0
ret float %x230
}

define float @hadd32_8_optsize(<8 x float> %x225) optsize {
; SSE3-LABEL: hadd32_8_optsize:
; SSE3: # %bb.0:
; SSE3-NEXT: movaps %xmm0, %xmm1
; SSE3-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-NEXT: addps %xmm0, %xmm1
; SSE3-NEXT: haddps %xmm1, %xmm1
; SSE3-NEXT: movaps %xmm1, %xmm0
; SSE3-NEXT: retq
;
; AVX-LABEL: hadd32_8_optsize:
; AVX: # %bb.0:
; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
%x226 = shufflevector <8 x float> %x225, <8 x float> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
%x227 = fadd <8 x float> %x225, %x226
%x228 = shufflevector <8 x float> %x227, <8 x float> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
%x229 = fadd <8 x float> %x227, %x228
%x230 = extractelement <8 x float> %x229, i32 0
ret float %x230
}

define float @hadd32_16_optsize(<16 x float> %x225) optsize {
; SSE3-LABEL: hadd32_16_optsize:
; SSE3: # %bb.0:
; SSE3-NEXT: movaps %xmm0, %xmm1
; SSE3-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-NEXT: addps %xmm0, %xmm1
; SSE3-NEXT: haddps %xmm1, %xmm1
; SSE3-NEXT: movaps %xmm1, %xmm0
; SSE3-NEXT: retq
;
; AVX-LABEL: hadd32_16_optsize:
; AVX: # %bb.0:
; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
%x226 = shufflevector <16 x float> %x225, <16 x float> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
%x227 = fadd <16 x float> %x225, %x226
%x228 = shufflevector <16 x float> %x227, <16 x float> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
%x229 = fadd <16 x float> %x227, %x228
%x230 = extractelement <16 x float> %x229, i32 0
ret float %x230
}

define float @partial_reduction_fadd_v8f32(<8 x float> %x) {
; SSE3-SLOW-LABEL: partial_reduction_fadd_v8f32:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movaps %xmm0, %xmm1
; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-SLOW-NEXT: addps %xmm0, %xmm1
; SSE3-SLOW-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
; SSE3-SLOW-NEXT: addss %xmm0, %xmm1
; SSE3-SLOW-NEXT: movaps %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: partial_reduction_fadd_v8f32:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: movaps %xmm0, %xmm1
; SSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-FAST-NEXT: addps %xmm0, %xmm1
; SSE3-FAST-NEXT: haddps %xmm1, %xmm1
; SSE3-FAST-NEXT: movaps %xmm1, %xmm0
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: partial_reduction_fadd_v8f32:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: vzeroupper
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: partial_reduction_fadd_v8f32:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vzeroupper
; AVX-FAST-NEXT: retq
%x23 = shufflevector <8 x float> %x, <8 x float> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
%x0213 = fadd <8 x float> %x, %x23
%x13 = shufflevector <8 x float> %x0213, <8 x float> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
%x0123 = fadd nsz reassoc <8 x float> %x0213, %x13
%r = extractelement <8 x float> %x0123, i32 0
ret float %r
}

; Negative test - only the fast-math flags on the final fadd in the
; sequence determine whether we can transform to horizontal ops.
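; For example (illustrative sketch of the IR below): in
;   %x0213 = fadd fast <8 x float> %x, %x23            ; flags here do not gate the fold
;   %x0123 = fadd ninf nnan <8 x float> %x0213, %x13   ; missing reassoc+nsz
; the final fadd lacks 'reassoc' and 'nsz', so slow-hop targets keep the
; shuffle+add lowering instead of forming haddps.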

define float @partial_reduction_fadd_v8f32_wrong_flags(<8 x float> %x) {
; SSE3-SLOW-LABEL: partial_reduction_fadd_v8f32_wrong_flags:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movaps %xmm0, %xmm1
; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-SLOW-NEXT: addps %xmm0, %xmm1
; SSE3-SLOW-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
; SSE3-SLOW-NEXT: addss %xmm0, %xmm1
; SSE3-SLOW-NEXT: movaps %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: partial_reduction_fadd_v8f32_wrong_flags:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: movaps %xmm0, %xmm1
; SSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-FAST-NEXT: addps %xmm0, %xmm1
; SSE3-FAST-NEXT: haddps %xmm1, %xmm1
; SSE3-FAST-NEXT: movaps %xmm1, %xmm0
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: partial_reduction_fadd_v8f32_wrong_flags:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: vzeroupper
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: partial_reduction_fadd_v8f32_wrong_flags:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-FAST-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vzeroupper
; AVX-FAST-NEXT: retq
%x23 = shufflevector <8 x float> %x, <8 x float> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
%x0213 = fadd fast <8 x float> %x, %x23
%x13 = shufflevector <8 x float> %x0213, <8 x float> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
%x0123 = fadd ninf nnan <8 x float> %x0213, %x13
%r = extractelement <8 x float> %x0123, i32 0
ret float %r
}

define float @partial_reduction_fadd_v16f32(<16 x float> %x) {
; SSE3-SLOW-LABEL: partial_reduction_fadd_v16f32:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movaps %xmm0, %xmm1
; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-SLOW-NEXT: addps %xmm0, %xmm1
; SSE3-SLOW-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
; SSE3-SLOW-NEXT: addss %xmm0, %xmm1
; SSE3-SLOW-NEXT: movaps %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: partial_reduction_fadd_v16f32:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: movaps %xmm0, %xmm1
; SSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-FAST-NEXT: addps %xmm0, %xmm1
; SSE3-FAST-NEXT: haddps %xmm1, %xmm1
; SSE3-FAST-NEXT: movaps %xmm1, %xmm0
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: partial_reduction_fadd_v16f32:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: vzeroupper
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: partial_reduction_fadd_v16f32:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vzeroupper
; AVX-FAST-NEXT: retq
%x23 = shufflevector <16 x float> %x, <16 x float> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
%x0213 = fadd <16 x float> %x, %x23
%x13 = shufflevector <16 x float> %x0213, <16 x float> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
%x0123 = fadd reassoc nsz <16 x float> %x0213, %x13
%r = extractelement <16 x float> %x0123, i32 0
ret float %r
}