test/CodeGen/X86/haddsub.ll

   1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
   2 ; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse3               | FileCheck %s --check-prefixes=SSE3,SSE3-SLOW
   3 ; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse3,fast-hops     | FileCheck %s --check-prefixes=SSE3,SSE3-FAST
   4 ; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx                | FileCheck %s --check-prefixes=AVX,AVX-SLOW,AVX1,AVX1-SLOW
   5 ; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx,fast-hops      | FileCheck %s --check-prefixes=AVX,AVX-FAST,AVX1,AVX1-FAST
   6 ; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2               | FileCheck %s --check-prefixes=AVX,AVX-SLOW,AVX2,AVX2-SLOW
   7 ; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2,fast-hops     | FileCheck %s --check-prefixes=AVX,AVX-FAST,AVX2,AVX2-FAST
   8 ; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512f            | FileCheck %s --check-prefixes=AVX,AVX-SLOW,AVX512,AVX512-SLOW
   9 ; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512f,fast-hops  | FileCheck %s --check-prefixes=AVX,AVX-FAST,AVX512,AVX512-FAST
  10
  11 define <2 x double> @haddpd1(<2 x double> %x, <2 x double> %y) {
  12 ; SSE3-LABEL: haddpd1:
  13 ; SSE3:       # %bb.0:
  14 ; SSE3-NEXT:    haddpd %xmm1, %xmm0
  15 ; SSE3-NEXT:    retq
  16 ;
  17 ; AVX-LABEL: haddpd1:
  18 ; AVX:       # %bb.0:
  19 ; AVX-NEXT:    vhaddpd %xmm1, %xmm0, %xmm0
  20 ; AVX-NEXT:    retq
       ; Two-input horizontal add of <2 x double>: %a = (x[0], y[0]) holds the even
       ; lanes of the x:y concatenation and %b = (x[1], y[1]) the odd lanes. The
       ; checks above expect the fadd to become one haddpd/vhaddpd in every run.
  21   %a = shufflevector <2 x double> %x, <2 x double> %y, <2 x i32> <i32 0, i32 2>
  22   %b = shufflevector <2 x double> %x, <2 x double> %y, <2 x i32> <i32 1, i32 3>
  23   %r = fadd <2 x double> %a, %b
  24   ret <2 x double> %r
  25 }
  26
  27 define <2 x double> @haddpd2(<2 x double> %x, <2 x double> %y) {
  28 ; SSE3-LABEL: haddpd2:
  29 ; SSE3:       # %bb.0:
  30 ; SSE3-NEXT:    haddpd %xmm1, %xmm0
  31 ; SSE3-NEXT:    retq
  32 ;
  33 ; AVX-LABEL: haddpd2:
  34 ; AVX:       # %bb.0:
  35 ; AVX-NEXT:    vhaddpd %xmm1, %xmm0, %xmm0
  36 ; AVX-NEXT:    retq
       ; Commuted variant of the two-input horizontal add: the second shuffle takes
       ; its operands as (y, x), so %a = (x[1], y[0]) and %b = (x[0], y[1]). The sum
       ; is (x[1]+x[0], y[0]+y[1]); the checks still expect one haddpd/vhaddpd.
  37   %a = shufflevector <2 x double> %x, <2 x double> %y, <2 x i32> <i32 1, i32 2>
  38   %b = shufflevector <2 x double> %y, <2 x double> %x, <2 x i32> <i32 2, i32 1>
  39   %r = fadd <2 x double> %a, %b
  40   ret <2 x double> %r
  41 }
  42
  43 define <2 x double> @haddpd3(<2 x double> %x) {
  44 ; SSE3-SLOW-LABEL: haddpd3:
  45 ; SSE3-SLOW:       # %bb.0:
  46 ; SSE3-SLOW-NEXT:    movapd %xmm0, %xmm1
  47 ; SSE3-SLOW-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
  48 ; SSE3-SLOW-NEXT:    addpd %xmm0, %xmm1
  49 ; SSE3-SLOW-NEXT:    movapd %xmm1, %xmm0
  50 ; SSE3-SLOW-NEXT:    retq
  51 ;
  52 ; SSE3-FAST-LABEL: haddpd3:
  53 ; SSE3-FAST:       # %bb.0:
  54 ; SSE3-FAST-NEXT:    haddpd %xmm0, %xmm0
  55 ; SSE3-FAST-NEXT:    retq
  56 ;
  57 ; AVX-SLOW-LABEL: haddpd3:
  58 ; AVX-SLOW:       # %bb.0:
  59 ; AVX-SLOW-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
  60 ; AVX-SLOW-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
  61 ; AVX-SLOW-NEXT:    retq
  62 ;
  63 ; AVX-FAST-LABEL: haddpd3:
  64 ; AVX-FAST:       # %bb.0:
  65 ; AVX-FAST-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
  66 ; AVX-FAST-NEXT:    retq
       ; Single-input case with lane 1 undef: only lane 0 (x[0] + x[1]) is defined.
       ; The checks expect haddpd/vhaddpd with both operands equal to x only in the
       ; fast-hops runs; the other runs expect a shuffle plus a vertical add.
  67   %a = shufflevector <2 x double> %x, <2 x double> undef, <2 x i32> <i32 0, i32 undef>
  68   %b = shufflevector <2 x double> %x, <2 x double> undef, <2 x i32> <i32 1, i32 undef>
  69   %r = fadd <2 x double> %a, %b
  70   ret <2 x double> %r
  71 }
  72
  73 define <4 x float> @haddps1(<4 x float> %x, <4 x float> %y) {
  74 ; SSE3-LABEL: haddps1:
  75 ; SSE3:       # %bb.0:
  76 ; SSE3-NEXT:    haddps %xmm1, %xmm0
  77 ; SSE3-NEXT:    retq
  78 ;
  79 ; AVX-LABEL: haddps1:
  80 ; AVX:       # %bb.0:
  81 ; AVX-NEXT:    vhaddps %xmm1, %xmm0, %xmm0
  82 ; AVX-NEXT:    retq
       ; Two-input horizontal add of <4 x float>: %a = (x[0], x[2], y[0], y[2]) are
       ; the even lanes of x:y and %b = (x[1], x[3], y[1], y[3]) the odd lanes. The
       ; checks expect one haddps/vhaddps in every run.
  83   %a = shufflevector <4 x float> %x, <4 x float> %y, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  84   %b = shufflevector <4 x float> %x, <4 x float> %y, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  85   %r = fadd <4 x float> %a, %b
  86   ret <4 x float> %r
  87 }
  88
  89 define <4 x float> @haddps2(<4 x float> %x, <4 x float> %y) {
  90 ; SSE3-LABEL: haddps2:
  91 ; SSE3:       # %bb.0:
  92 ; SSE3-NEXT:    haddps %xmm1, %xmm0
  93 ; SSE3-NEXT:    retq
  94 ;
  95 ; AVX-LABEL: haddps2:
  96 ; AVX:       # %bb.0:
  97 ; AVX-NEXT:    vhaddps %xmm1, %xmm0, %xmm0
  98 ; AVX-NEXT:    retq
       ; Commuted variant: the second shuffle takes (y, x), so
       ; %a = (x[1], x[2], y[1], y[2]) and %b = (x[0], x[3], y[0], y[3]). Each lane
       ; still adds one adjacent pair, with the pair order swapped in lanes 0 and 2.
       ; The checks expect the same single haddps/vhaddps.
  99   %a = shufflevector <4 x float> %x, <4 x float> %y, <4 x i32> <i32 1, i32 2, i32 5, i32 6>
 100   %b = shufflevector <4 x float> %y, <4 x float> %x, <4 x i32> <i32 4, i32 7, i32 0, i32 3>
 101   %r = fadd <4 x float> %a, %b
 102   ret <4 x float> %r
 103 }
 104
 105 define <4 x float> @haddps3(<4 x float> %x) {
 106 ; SSE3-LABEL: haddps3:
 107 ; SSE3:       # %bb.0:
 108 ; SSE3-NEXT:    haddps %xmm0, %xmm0
 109 ; SSE3-NEXT:    retq
 110 ;
 111 ; AVX-LABEL: haddps3:
 112 ; AVX:       # %bb.0:
 113 ; AVX-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
 114 ; AVX-NEXT:    retq
       ; Single-input case: the second shuffle operand is undef, so mask indices
       ; 4..7 select undef and lane 0 is explicitly undef. Only lane 1 (x[2] + x[3])
       ; reads %x. The checks expect haddps/vhaddps of x with itself in every run.
 115   %a = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 2, i32 4, i32 6>
 116   %b = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 3, i32 5, i32 7>
 117   %r = fadd <4 x float> %a, %b
 118   ret <4 x float> %r
 119 }
 120
 121 define <4 x float> @haddps4(<4 x float> %x) {
 122 ; SSE3-LABEL: haddps4:
 123 ; SSE3:       # %bb.0:
 124 ; SSE3-NEXT:    haddps %xmm0, %xmm0
 125 ; SSE3-NEXT:    retq
 126 ;
 127 ; AVX-LABEL: haddps4:
 128 ; AVX:       # %bb.0:
 129 ; AVX-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
 130 ; AVX-NEXT:    retq
       ; Single-input case with the upper two lanes undef: lane 0 = x[0] + x[1] and
       ; lane 1 = x[2] + x[3]. The checks expect haddps/vhaddps of x with itself in
       ; every run.
 131   %a = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
 132   %b = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
 133   %r = fadd <4 x float> %a, %b
 134   ret <4 x float> %r
 135 }
 136
 137 define <4 x float> @haddps5(<4 x float> %x) {
 138 ; SSE3-LABEL: haddps5:
 139 ; SSE3:       # %bb.0:
 140 ; SSE3-NEXT:    haddps %xmm0, %xmm0
 141 ; SSE3-NEXT:    retq
 142 ;
 143 ; AVX-LABEL: haddps5:
 144 ; AVX:       # %bb.0:
 145 ; AVX-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
 146 ; AVX-NEXT:    retq
       ; Same as the previous single-input case, but lane 1 adds its pair in swapped
       ; order: lane 0 = x[0] + x[1], lane 1 = x[3] + x[2]. The upper lanes are
       ; undef. The checks still expect haddps/vhaddps in every run.
 147   %a = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 0, i32 3, i32 undef, i32 undef>
 148   %b = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 1, i32 2, i32 undef, i32 undef>
 149   %r = fadd <4 x float> %a, %b
 150   ret <4 x float> %r
 151 }
 152
 153 define <4 x float> @haddps6(<4 x float> %x) {
 154 ; SSE3-SLOW-LABEL: haddps6:
 155 ; SSE3-SLOW:       # %bb.0:
 156 ; SSE3-SLOW-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
 157 ; SSE3-SLOW-NEXT:    addps %xmm1, %xmm0
 158 ; SSE3-SLOW-NEXT:    retq
 159 ;
 160 ; SSE3-FAST-LABEL: haddps6:
 161 ; SSE3-FAST:       # %bb.0:
 162 ; SSE3-FAST-NEXT:    haddps %xmm0, %xmm0
 163 ; SSE3-FAST-NEXT:    retq
 164 ;
 165 ; AVX-SLOW-LABEL: haddps6:
 166 ; AVX-SLOW:       # %bb.0:
 167 ; AVX-SLOW-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
 168 ; AVX-SLOW-NEXT:    vaddps %xmm1, %xmm0, %xmm0
 169 ; AVX-SLOW-NEXT:    retq
 170 ;
 171 ; AVX-FAST-LABEL: haddps6:
 172 ; AVX-FAST:       # %bb.0:
 173 ; AVX-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
 174 ; AVX-FAST-NEXT:    retq
       ; Only lane 0 (x[0] + x[1]) is defined; lanes 1..3 are undef. The checks
       ; expect haddps/vhaddps only in the fast-hops runs; the other runs expect
       ; movshdup/vmovshdup followed by a vertical add.
 175   %a = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
 176   %b = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
 177   %r = fadd <4 x float> %a, %b
 178   ret <4 x float> %r
 179 }
 180
 181 define <4 x float> @haddps7(<4 x float> %x) {
 182 ; SSE3-LABEL: haddps7:
 183 ; SSE3:       # %bb.0:
 184 ; SSE3-NEXT:    haddps %xmm0, %xmm0
 185 ; SSE3-NEXT:    retq
 186 ;
 187 ; AVX-LABEL: haddps7:
 188 ; AVX:       # %bb.0:
 189 ; AVX-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
 190 ; AVX-NEXT:    retq
       ; Only lane 1 (x[3] + x[2], pair in swapped order) is defined; all other
       ; lanes are undef. The checks expect haddps/vhaddps in every run, with or
       ; without fast-hops.
 191   %a = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 3, i32 undef, i32 undef>
 192   %b = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 2, i32 undef, i32 undef>
 193   %r = fadd <4 x float> %a, %b
 194   ret <4 x float> %r
 195 }
 196
 197 define <2 x double> @hsubpd1(<2 x double> %x, <2 x double> %y) {
 198 ; SSE3-LABEL: hsubpd1:
 199 ; SSE3:       # %bb.0:
 200 ; SSE3-NEXT:    hsubpd %xmm1, %xmm0
 201 ; SSE3-NEXT:    retq
 202 ;
 203 ; AVX-LABEL: hsubpd1:
 204 ; AVX:       # %bb.0:
 205 ; AVX-NEXT:    vhsubpd %xmm1, %xmm0, %xmm0
 206 ; AVX-NEXT:    retq
       ; Two-input horizontal subtract of <2 x double>: even lanes (x[0], y[0])
       ; minus odd lanes (x[1], y[1]) of the x:y concatenation. The checks expect
       ; one hsubpd/vhsubpd in every run.
 207   %a = shufflevector <2 x double> %x, <2 x double> %y, <2 x i32> <i32 0, i32 2>
 208   %b = shufflevector <2 x double> %x, <2 x double> %y, <2 x i32> <i32 1, i32 3>
 209   %r = fsub <2 x double> %a, %b
 210   ret <2 x double> %r
 211 }
 212
 213 define <2 x double> @hsubpd2(<2 x double> %x) {
 214 ; SSE3-SLOW-LABEL: hsubpd2:
 215 ; SSE3-SLOW:       # %bb.0:
 216 ; SSE3-SLOW-NEXT:    movapd %xmm0, %xmm1
 217 ; SSE3-SLOW-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
 218 ; SSE3-SLOW-NEXT:    subpd %xmm1, %xmm0
 219 ; SSE3-SLOW-NEXT:    retq
 220 ;
 221 ; SSE3-FAST-LABEL: hsubpd2:
 222 ; SSE3-FAST:       # %bb.0:
 223 ; SSE3-FAST-NEXT:    hsubpd %xmm0, %xmm0
 224 ; SSE3-FAST-NEXT:    retq
 225 ;
 226 ; AVX-SLOW-LABEL: hsubpd2:
 227 ; AVX-SLOW:       # %bb.0:
 228 ; AVX-SLOW-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
 229 ; AVX-SLOW-NEXT:    vsubpd %xmm1, %xmm0, %xmm0
 230 ; AVX-SLOW-NEXT:    retq
 231 ;
 232 ; AVX-FAST-LABEL: hsubpd2:
 233 ; AVX-FAST:       # %bb.0:
 234 ; AVX-FAST-NEXT:    vhsubpd %xmm0, %xmm0, %xmm0
 235 ; AVX-FAST-NEXT:    retq
       ; Single-input case with lane 1 undef: only lane 0 (x[0] - x[1]) is defined.
       ; The checks expect hsubpd/vhsubpd only in the fast-hops runs; the other
       ; runs expect a shuffle plus a vertical subtract.
 236   %a = shufflevector <2 x double> %x, <2 x double> undef, <2 x i32> <i32 0, i32 undef>
 237   %b = shufflevector <2 x double> %x, <2 x double> undef, <2 x i32> <i32 1, i32 undef>
 238   %r = fsub <2 x double> %a, %b
 239   ret <2 x double> %r
 240 }
 241
 242 define <4 x float> @hsubps1(<4 x float> %x, <4 x float> %y) {
 243 ; SSE3-LABEL: hsubps1:
 244 ; SSE3:       # %bb.0:
 245 ; SSE3-NEXT:    hsubps %xmm1, %xmm0
 246 ; SSE3-NEXT:    retq
 247 ;
 248 ; AVX-LABEL: hsubps1:
 249 ; AVX:       # %bb.0:
 250 ; AVX-NEXT:    vhsubps %xmm1, %xmm0, %xmm0
 251 ; AVX-NEXT:    retq
       ; Two-input horizontal subtract of <4 x float>: even lanes
       ; (x[0], x[2], y[0], y[2]) minus odd lanes (x[1], x[3], y[1], y[3]). The
       ; checks expect one hsubps/vhsubps in every run.
 252   %a = shufflevector <4 x float> %x, <4 x float> %y, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
 253   %b = shufflevector <4 x float> %x, <4 x float> %y, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
 254   %r = fsub <4 x float> %a, %b
 255   ret <4 x float> %r
 256 }
 257
 258 define <4 x float> @hsubps2(<4 x float> %x) {
 259 ; SSE3-LABEL: hsubps2:
 260 ; SSE3:       # %bb.0:
 261 ; SSE3-NEXT:    hsubps %xmm0, %xmm0
 262 ; SSE3-NEXT:    retq
 263 ;
 264 ; AVX-LABEL: hsubps2:
 265 ; AVX:       # %bb.0:
 266 ; AVX-NEXT:    vhsubps %xmm0, %xmm0, %xmm0
 267 ; AVX-NEXT:    retq
       ; Single-input case: the second shuffle operand is undef, so mask indices
       ; 4..7 select undef and lane 0 is explicitly undef. Only lane 1 (x[2] - x[3])
       ; reads %x. The checks expect hsubps/vhsubps of x with itself in every run.
 268   %a = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 2, i32 4, i32 6>
 269   %b = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 3, i32 5, i32 7>
 270   %r = fsub <4 x float> %a, %b
 271   ret <4 x float> %r
 272 }
 273
 274 define <4 x float> @hsubps3(<4 x float> %x) {
 275 ; SSE3-LABEL: hsubps3:
 276 ; SSE3:       # %bb.0:
 277 ; SSE3-NEXT:    hsubps %xmm0, %xmm0
 278 ; SSE3-NEXT:    retq
 279 ;
 280 ; AVX-LABEL: hsubps3:
 281 ; AVX:       # %bb.0:
 282 ; AVX-NEXT:    vhsubps %xmm0, %xmm0, %xmm0
 283 ; AVX-NEXT:    retq
       ; Single-input case with the upper two lanes undef: lane 0 = x[0] - x[1] and
       ; lane 1 = x[2] - x[3]. The checks expect hsubps/vhsubps of x with itself in
       ; every run.
 284   %a = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
 285   %b = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
 286   %r = fsub <4 x float> %a, %b
 287   ret <4 x float> %r
 288 }
 289
 290 define <4 x float> @hsubps4(<4 x float> %x) {
 291 ; SSE3-SLOW-LABEL: hsubps4:
 292 ; SSE3-SLOW:       # %bb.0:
 293 ; SSE3-SLOW-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
 294 ; SSE3-SLOW-NEXT:    subps %xmm1, %xmm0
 295 ; SSE3-SLOW-NEXT:    retq
 296 ;
 297 ; SSE3-FAST-LABEL: hsubps4:
 298 ; SSE3-FAST:       # %bb.0:
 299 ; SSE3-FAST-NEXT:    hsubps %xmm0, %xmm0
 300 ; SSE3-FAST-NEXT:    retq
 301 ;
 302 ; AVX-SLOW-LABEL: hsubps4:
 303 ; AVX-SLOW:       # %bb.0:
 304 ; AVX-SLOW-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
 305 ; AVX-SLOW-NEXT:    vsubps %xmm1, %xmm0, %xmm0
 306 ; AVX-SLOW-NEXT:    retq
 307 ;
 308 ; AVX-FAST-LABEL: hsubps4:
 309 ; AVX-FAST:       # %bb.0:
 310 ; AVX-FAST-NEXT:    vhsubps %xmm0, %xmm0, %xmm0
 311 ; AVX-FAST-NEXT:    retq
       ; Only lane 0 (x[0] - x[1]) is defined; lanes 1..3 are undef. The checks
       ; expect hsubps/vhsubps only in the fast-hops runs; the other runs expect
       ; movshdup/vmovshdup followed by a vertical subtract.
 312   %a = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
 313   %b = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
 314   %r = fsub <4 x float> %a, %b
 315   ret <4 x float> %r
 316 }
 317
 318 define <8 x float> @vhaddps1(<8 x float> %x, <8 x float> %y) {
 319 ; SSE3-LABEL: vhaddps1:
 320 ; SSE3:       # %bb.0:
 321 ; SSE3-NEXT:    haddps %xmm2, %xmm0
 322 ; SSE3-NEXT:    haddps %xmm3, %xmm1
 323 ; SSE3-NEXT:    retq
 324 ;
 325 ; AVX-LABEL: vhaddps1:
 326 ; AVX:       # %bb.0:
 327 ; AVX-NEXT:    vhaddps %ymm1, %ymm0, %ymm0
 328 ; AVX-NEXT:    retq
       ; 256-bit two-input horizontal add. The masks interleave x and y per group of
       ; four elements: %a = (x[0], x[2], y[0], y[2], x[4], x[6], y[4], y[6]) and %b
       ; holds the matching odd elements. The checks expect two 128-bit haddps in
       ; the SSE3 runs and one ymm vhaddps in the AVX runs.
 329   %a = shufflevector <8 x float> %x, <8 x float> %y, <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 6, i32 12, i32 14>
 330   %b = shufflevector <8 x float> %x, <8 x float> %y, <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 5, i32 7, i32 13, i32 15>
 331   %r = fadd <8 x float> %a, %b
 332   ret <8 x float> %r
 333 }
 334
 335 define <8 x float> @vhaddps2(<8 x float> %x, <8 x float> %y) {
 336 ; SSE3-LABEL: vhaddps2:
 337 ; SSE3:       # %bb.0:
 338 ; SSE3-NEXT:    haddps %xmm2, %xmm0
 339 ; SSE3-NEXT:    haddps %xmm3, %xmm1
 340 ; SSE3-NEXT:    retq
 341 ;
 342 ; AVX-LABEL: vhaddps2:
 343 ; AVX:       # %bb.0:
 344 ; AVX-NEXT:    vhaddps %ymm1, %ymm0, %ymm0
 345 ; AVX-NEXT:    retq
       ; Commuted 256-bit variant: the second shuffle takes (y, x), so
       ; %a = (x[1], x[2], y[1], y[2], x[5], x[6], y[5], y[6]) and
       ; %b = (x[0], x[3], y[0], y[3], x[4], x[7], y[4], y[7]). Each lane still adds
       ; one adjacent pair; the checks expect the same code as the uncommuted form.
 346   %a = shufflevector <8 x float> %x, <8 x float> %y, <8 x i32> <i32 1, i32 2, i32 9, i32 10, i32 5, i32 6, i32 13, i32 14>
 347   %b = shufflevector <8 x float> %y, <8 x float> %x, <8 x i32> <i32 8, i32 11, i32 0, i32 3, i32 12, i32 15, i32 4, i32 7>
 348   %r = fadd <8 x float> %a, %b
 349   ret <8 x float> %r
 350 }
 351
 352 define <8 x float> @vhaddps3(<8 x float> %x) {
 353 ; SSE3-LABEL: vhaddps3:
 354 ; SSE3:       # %bb.0:
 355 ; SSE3-NEXT:    haddps %xmm0, %xmm0
 356 ; SSE3-NEXT:    haddps %xmm1, %xmm1
 357 ; SSE3-NEXT:    retq
 358 ;
 359 ; AVX-LABEL: vhaddps3:
 360 ; AVX:       # %bb.0:
 361 ; AVX-NEXT:    vhaddps %ymm0, %ymm0, %ymm0
 362 ; AVX-NEXT:    retq
       ; 256-bit single-input case: the second shuffle operand is undef, so mask
       ; indices 8..15 select undef, and some mask elements are explicitly undef.
       ; The checks expect the horizontal add of x with itself: two xmm haddps in
       ; the SSE3 runs, one ymm vhaddps in the AVX runs.
 363   %a = shufflevector <8 x float> %x, <8 x float> undef, <8 x i32> <i32 undef, i32 2, i32 8, i32 10, i32 4, i32 6, i32 undef, i32 14>
 364   %b = shufflevector <8 x float> %x, <8 x float> undef, <8 x i32> <i32 1, i32 3, i32 9, i32 undef, i32 5, i32 7, i32 13, i32 15>
 365   %r = fadd <8 x float> %a, %b
 366   ret <8 x float> %r
 367 }
 368
 369 define <8 x float> @vhsubps1(<8 x float> %x, <8 x float> %y) {
 370 ; SSE3-LABEL: vhsubps1:
 371 ; SSE3:       # %bb.0:
 372 ; SSE3-NEXT:    hsubps %xmm2, %xmm0
 373 ; SSE3-NEXT:    hsubps %xmm3, %xmm1
 374 ; SSE3-NEXT:    retq
 375 ;
 376 ; AVX-LABEL: vhsubps1:
 377 ; AVX:       # %bb.0:
 378 ; AVX-NEXT:    vhsubps %ymm1, %ymm0, %ymm0
 379 ; AVX-NEXT:    retq
       ; 256-bit two-input horizontal subtract. The masks interleave x and y per
       ; group of four elements: %a = (x[0], x[2], y[0], y[2], x[4], x[6], y[4], y[6])
       ; and %b holds the matching odd elements. The checks expect two 128-bit
       ; hsubps in the SSE3 runs and one ymm vhsubps in the AVX runs.
 380   %a = shufflevector <8 x float> %x, <8 x float> %y, <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 6, i32 12, i32 14>
 381   %b = shufflevector <8 x float> %x, <8 x float> %y, <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 5, i32 7, i32 13, i32 15>
 382   %r = fsub <8 x float> %a, %b
 383   ret <8 x float> %r
 384 }
 385
 386 define <8 x float> @vhsubps3(<8 x float> %x) {
 387 ; SSE3-LABEL: vhsubps3:
 388 ; SSE3:       # %bb.0:
 389 ; SSE3-NEXT:    hsubps %xmm0, %xmm0
 390 ; SSE3-NEXT:    hsubps %xmm1, %xmm1
 391 ; SSE3-NEXT:    retq
 392 ;
 393 ; AVX-LABEL: vhsubps3:
 394 ; AVX:       # %bb.0:
 395 ; AVX-NEXT:    vhsubps %ymm0, %ymm0, %ymm0
 396 ; AVX-NEXT:    retq
       ; 256-bit single-input case: the second shuffle operand is undef, so mask
       ; indices 8..15 select undef, and some mask elements are explicitly undef.
       ; The checks expect the horizontal subtract of x with itself: two xmm hsubps
       ; in the SSE3 runs, one ymm vhsubps in the AVX runs.
 397   %a = shufflevector <8 x float> %x, <8 x float> undef, <8 x i32> <i32 undef, i32 2, i32 8, i32 10, i32 4, i32 6, i32 undef, i32 14>
 398   %b = shufflevector <8 x float> %x, <8 x float> undef, <8 x i32> <i32 1, i32 3, i32 9, i32 undef, i32 5, i32 7, i32 13, i32 15>
 399   %r = fsub <8 x float> %a, %b
 400   ret <8 x float> %r
 401 }
 402
 403 define <4 x double> @vhaddpd1(<4 x double> %x, <4 x double> %y) {
 404 ; SSE3-LABEL: vhaddpd1:
 405 ; SSE3:       # %bb.0:
 406 ; SSE3-NEXT:    haddpd %xmm2, %xmm0
 407 ; SSE3-NEXT:    haddpd %xmm3, %xmm1
 408 ; SSE3-NEXT:    retq
 409 ;
 410 ; AVX-LABEL: vhaddpd1:
 411 ; AVX:       # %bb.0:
 412 ; AVX-NEXT:    vhaddpd %ymm1, %ymm0, %ymm0
 413 ; AVX-NEXT:    retq
       ; 256-bit two-input horizontal add of doubles: %a = (x[0], y[0], x[2], y[2])
       ; and %b = (x[1], y[1], x[3], y[3]). The checks expect two 128-bit haddpd in
       ; the SSE3 runs and one ymm vhaddpd in the AVX runs.
 414   %a = shufflevector <4 x double> %x, <4 x double> %y, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
 415   %b = shufflevector <4 x double> %x, <4 x double> %y, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
 416   %r = fadd <4 x double> %a, %b
 417   ret <4 x double> %r
 418 }
 419
 420 define <4 x double> @vhsubpd1(<4 x double> %x, <4 x double> %y) {
 421 ; SSE3-LABEL: vhsubpd1:
 422 ; SSE3:       # %bb.0:
 423 ; SSE3-NEXT:    hsubpd %xmm2, %xmm0
 424 ; SSE3-NEXT:    hsubpd %xmm3, %xmm1
 425 ; SSE3-NEXT:    retq
 426 ;
 427 ; AVX-LABEL: vhsubpd1:
 428 ; AVX:       # %bb.0:
 429 ; AVX-NEXT:    vhsubpd %ymm1, %ymm0, %ymm0
 430 ; AVX-NEXT:    retq
       ; 256-bit two-input horizontal subtract of doubles:
       ; %a = (x[0], y[0], x[2], y[2]) minus %b = (x[1], y[1], x[3], y[3]). The
       ; checks expect two 128-bit hsubpd in the SSE3 runs and one ymm vhsubpd in
       ; the AVX runs.
 431   %a = shufflevector <4 x double> %x, <4 x double> %y, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
 432   %b = shufflevector <4 x double> %x, <4 x double> %y, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
 433   %r = fsub <4 x double> %a, %b
 434   ret <4 x double> %r
 435 }
 436
 437 define <2 x float> @haddps_v2f32(<4 x float> %v0) {
 438 ; SSE3-LABEL: haddps_v2f32:
 439 ; SSE3:       # %bb.0:
 440 ; SSE3-NEXT:    haddps %xmm0, %xmm0
 441 ; SSE3-NEXT:    retq
 442 ;
 443 ; AVX-LABEL: haddps_v2f32:
 444 ; AVX:       # %bb.0:
 445 ; AVX-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
 446 ; AVX-NEXT:    retq
       ; Scalarized form of a horizontal add: all four elements are extracted, the
       ; adjacent pairs (0,1) and (2,3) are added as scalars, and the two sums are
       ; inserted into a <2 x float> result. The checks expect one haddps/vhaddps
       ; of the input with itself in every run.
 447   %v0.0 = extractelement <4 x float> %v0, i32 0
 448   %v0.1 = extractelement <4 x float> %v0, i32 1
 449   %v0.2 = extractelement <4 x float> %v0, i32 2
 450   %v0.3 = extractelement <4 x float> %v0, i32 3
 451   %op0 = fadd float %v0.0, %v0.1
 452   %op1 = fadd float %v0.2, %v0.3
 453   %res0 = insertelement <2 x float> undef, float %op0, i32 0
 454   %res1 = insertelement <2 x float> %res0, float %op1, i32 1
 455   ret <2 x float> %res1
 456 }
 457
 458 ; 128-bit vectors, float/double, fadd/fsub
 459
 460 define float @extract_extract01_v4f32_fadd_f32(<4 x float> %x) {
 461 ; SSE3-SLOW-LABEL: extract_extract01_v4f32_fadd_f32:
 462 ; SSE3-SLOW:       # %bb.0:
 463 ; SSE3-SLOW-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
 464 ; SSE3-SLOW-NEXT:    addss %xmm1, %xmm0
 465 ; SSE3-SLOW-NEXT:    retq
 466 ;
 467 ; SSE3-FAST-LABEL: extract_extract01_v4f32_fadd_f32:
 468 ; SSE3-FAST:       # %bb.0:
 469 ; SSE3-FAST-NEXT:    haddps %xmm0, %xmm0
 470 ; SSE3-FAST-NEXT:    retq
 471 ;
 472 ; AVX-SLOW-LABEL: extract_extract01_v4f32_fadd_f32:
 473 ; AVX-SLOW:       # %bb.0:
 474 ; AVX-SLOW-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
 475 ; AVX-SLOW-NEXT:    vaddss %xmm1, %xmm0, %xmm0
 476 ; AVX-SLOW-NEXT:    retq
 477 ;
 478 ; AVX-FAST-LABEL: extract_extract01_v4f32_fadd_f32:
 479 ; AVX-FAST:       # %bb.0:
 480 ; AVX-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
 481 ; AVX-FAST-NEXT:    retq
       ; Scalar sum of elements 0 and 1 of a <4 x float>. The checks expect a
       ; horizontal add only in the fast-hops runs; the other runs expect
       ; movshdup/vmovshdup followed by a scalar add.
 482   %x0 = extractelement <4 x float> %x, i32 0
 483   %x1 = extractelement <4 x float> %x, i32 1
 484   %x01 = fadd float %x0, %x1
 485   ret float %x01
 486 }
 487
 488 define float @extract_extract23_v4f32_fadd_f32(<4 x float> %x) {
 489 ; SSE3-SLOW-LABEL: extract_extract23_v4f32_fadd_f32:
 490 ; SSE3-SLOW:       # %bb.0:
 491 ; SSE3-SLOW-NEXT:    movaps %xmm0, %xmm1
 492 ; SSE3-SLOW-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
 493 ; SSE3-SLOW-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
 494 ; SSE3-SLOW-NEXT:    addss %xmm1, %xmm0
 495 ; SSE3-SLOW-NEXT:    retq
 496 ;
 497 ; SSE3-FAST-LABEL: extract_extract23_v4f32_fadd_f32:
 498 ; SSE3-FAST:       # %bb.0:
 499 ; SSE3-FAST-NEXT:    haddps %xmm0, %xmm0
 500 ; SSE3-FAST-NEXT:    movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
 501 ; SSE3-FAST-NEXT:    retq
 502 ;
 503 ; AVX-SLOW-LABEL: extract_extract23_v4f32_fadd_f32:
 504 ; AVX-SLOW:       # %bb.0:
 505 ; AVX-SLOW-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
 506 ; AVX-SLOW-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
 507 ; AVX-SLOW-NEXT:    vaddss %xmm0, %xmm1, %xmm0
 508 ; AVX-SLOW-NEXT:    retq
 509 ;
 510 ; AVX-FAST-LABEL: extract_extract23_v4f32_fadd_f32:
 511 ; AVX-FAST:       # %bb.0:
 512 ; AVX-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
 513 ; AVX-FAST-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
 514 ; AVX-FAST-NEXT:    retq
       ; Scalar sum of elements 2 and 3 (locals are still named %x0/%x1). In the
       ; fast-hops runs the checks expect a horizontal add followed by
       ; movshdup/vmovshdup, which copies lane 1 of the result into lane 0. The
       ; other runs expect two shuffles plus a scalar add.
 515   %x0 = extractelement <4 x float> %x, i32 2
 516   %x1 = extractelement <4 x float> %x, i32 3
 517   %x01 = fadd float %x0, %x1
 518   ret float %x01
 519 }
 520
 521 define float @extract_extract01_v4f32_fadd_f32_commute(<4 x float> %x) {
 522 ; SSE3-SLOW-LABEL: extract_extract01_v4f32_fadd_f32_commute:
 523 ; SSE3-SLOW:       # %bb.0:
 524 ; SSE3-SLOW-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
 525 ; SSE3-SLOW-NEXT:    addss %xmm1, %xmm0
 526 ; SSE3-SLOW-NEXT:    retq
 527 ;
 528 ; SSE3-FAST-LABEL: extract_extract01_v4f32_fadd_f32_commute:
 529 ; SSE3-FAST:       # %bb.0:
 530 ; SSE3-FAST-NEXT:    haddps %xmm0, %xmm0
 531 ; SSE3-FAST-NEXT:    retq
 532 ;
 533 ; AVX-SLOW-LABEL: extract_extract01_v4f32_fadd_f32_commute:
 534 ; AVX-SLOW:       # %bb.0:
 535 ; AVX-SLOW-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
 536 ; AVX-SLOW-NEXT:    vaddss %xmm0, %xmm1, %xmm0
 537 ; AVX-SLOW-NEXT:    retq
 538 ;
 539 ; AVX-FAST-LABEL: extract_extract01_v4f32_fadd_f32_commute:
 540 ; AVX-FAST:       # %bb.0:
 541 ; AVX-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
 542 ; AVX-FAST-NEXT:    retq
       ; Same as the element-0/1 scalar fadd test, but with the fadd operands
       ; commuted (%x1 + %x0). The checks still expect a horizontal add in the
       ; fast-hops runs and a shuffle plus scalar add in the others.
 543   %x0 = extractelement <4 x float> %x, i32 0
 544   %x1 = extractelement <4 x float> %x, i32 1
 545   %x01 = fadd float %x1, %x0
 546   ret float %x01
 547 }
 548
 549 define float @extract_extract23_v4f32_fadd_f32_commute(<4 x float> %x) {
 550 ; SSE3-SLOW-LABEL: extract_extract23_v4f32_fadd_f32_commute:
 551 ; SSE3-SLOW:       # %bb.0:
 552 ; SSE3-SLOW-NEXT:    movaps %xmm0, %xmm1
 553 ; SSE3-SLOW-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
 554 ; SSE3-SLOW-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
 555 ; SSE3-SLOW-NEXT:    addss %xmm1, %xmm0
 556 ; SSE3-SLOW-NEXT:    retq
 557 ;
 558 ; SSE3-FAST-LABEL: extract_extract23_v4f32_fadd_f32_commute:
 559 ; SSE3-FAST:       # %bb.0:
 560 ; SSE3-FAST-NEXT:    haddps %xmm0, %xmm0
 561 ; SSE3-FAST-NEXT:    movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
 562 ; SSE3-FAST-NEXT:    retq
 563 ;
 564 ; AVX-SLOW-LABEL: extract_extract23_v4f32_fadd_f32_commute:
 565 ; AVX-SLOW:       # %bb.0:
 566 ; AVX-SLOW-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
 567 ; AVX-SLOW-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
 568 ; AVX-SLOW-NEXT:    vaddss %xmm1, %xmm0, %xmm0
 569 ; AVX-SLOW-NEXT:    retq
 570 ;
 571 ; AVX-FAST-LABEL: extract_extract23_v4f32_fadd_f32_commute:
 572 ; AVX-FAST:       # %bb.0:
 573 ; AVX-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
 574 ; AVX-FAST-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
 575 ; AVX-FAST-NEXT:    retq
       ; Scalar sum of elements 2 and 3 with the fadd operands commuted (element 3
       ; first). In the fast-hops runs the checks expect a horizontal add followed
       ; by movshdup/vmovshdup; the other runs expect two shuffles plus a scalar
       ; add.
 576   %x0 = extractelement <4 x float> %x, i32 2
 577   %x1 = extractelement <4 x float> %x, i32 3
 578   %x01 = fadd float %x1, %x0
 579   ret float %x01
 580 }
 581
 582 define double @extract_extract01_v2f64_fadd_f64(<2 x double> %x) {
 583 ; SSE3-SLOW-LABEL: extract_extract01_v2f64_fadd_f64:
 584 ; SSE3-SLOW:       # %bb.0:
 585 ; SSE3-SLOW-NEXT:    movapd %xmm0, %xmm1
 586 ; SSE3-SLOW-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
 587 ; SSE3-SLOW-NEXT:    addsd %xmm0, %xmm1
 588 ; SSE3-SLOW-NEXT:    movapd %xmm1, %xmm0
 589 ; SSE3-SLOW-NEXT:    retq
 590 ;
 591 ; SSE3-FAST-LABEL: extract_extract01_v2f64_fadd_f64:
 592 ; SSE3-FAST:       # %bb.0:
 593 ; SSE3-FAST-NEXT:    haddpd %xmm0, %xmm0
 594 ; SSE3-FAST-NEXT:    retq
 595 ;
 596 ; AVX-SLOW-LABEL: extract_extract01_v2f64_fadd_f64:
 597 ; AVX-SLOW:       # %bb.0:
 598 ; AVX-SLOW-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
 599 ; AVX-SLOW-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
 600 ; AVX-SLOW-NEXT:    retq
 601 ;
 602 ; AVX-FAST-LABEL: extract_extract01_v2f64_fadd_f64:
 603 ; AVX-FAST:       # %bb.0:
 604 ; AVX-FAST-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
 605 ; AVX-FAST-NEXT:    retq
       ; Scalar sum of both elements of a <2 x double>. The checks expect
       ; haddpd/vhaddpd only in the fast-hops runs; the other runs expect a shuffle
       ; plus a scalar add.
 606   %x0 = extractelement <2 x double> %x, i32 0
 607   %x1 = extractelement <2 x double> %x, i32 1
 608   %x01 = fadd double %x0, %x1
 609   ret double %x01
 610 }
 611
 612 define double @extract_extract01_v2f64_fadd_f64_commute(<2 x double> %x) {
 613 ; SSE3-SLOW-LABEL: extract_extract01_v2f64_fadd_f64_commute:
 614 ; SSE3-SLOW:       # %bb.0:
 615 ; SSE3-SLOW-NEXT:    movapd %xmm0, %xmm1
 616 ; SSE3-SLOW-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
 617 ; SSE3-SLOW-NEXT:    addsd %xmm0, %xmm1
 618 ; SSE3-SLOW-NEXT:    movapd %xmm1, %xmm0
 619 ; SSE3-SLOW-NEXT:    retq
 620 ;
 621 ; SSE3-FAST-LABEL: extract_extract01_v2f64_fadd_f64_commute:
 622 ; SSE3-FAST:       # %bb.0:
 623 ; SSE3-FAST-NEXT:    haddpd %xmm0, %xmm0
 624 ; SSE3-FAST-NEXT:    retq
 625 ;
 626 ; AVX-SLOW-LABEL: extract_extract01_v2f64_fadd_f64_commute:
 627 ; AVX-SLOW:       # %bb.0:
 628 ; AVX-SLOW-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
 629 ; AVX-SLOW-NEXT:    vaddsd %xmm0, %xmm1, %xmm0
 630 ; AVX-SLOW-NEXT:    retq
 631 ;
 632 ; AVX-FAST-LABEL: extract_extract01_v2f64_fadd_f64_commute:
 633 ; AVX-FAST:       # %bb.0:
 634 ; AVX-FAST-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
 635 ; AVX-FAST-NEXT:    retq
       ; Same as the <2 x double> scalar fadd test, but with the fadd operands
       ; commuted (%x1 + %x0). The checks still expect haddpd/vhaddpd in the
       ; fast-hops runs and a shuffle plus scalar add in the others.
 636   %x0 = extractelement <2 x double> %x, i32 0
 637   %x1 = extractelement <2 x double> %x, i32 1
 638   %x01 = fadd double %x1, %x0
 639   ret double %x01
 640 }
 641
 642 define float @extract_extract01_v4f32_fsub_f32(<4 x float> %x) {
 643 ; SSE3-SLOW-LABEL: extract_extract01_v4f32_fsub_f32:
 644 ; SSE3-SLOW:       # %bb.0:
 645 ; SSE3-SLOW-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
 646 ; SSE3-SLOW-NEXT:    subss %xmm1, %xmm0
 647 ; SSE3-SLOW-NEXT:    retq
 648 ;
 649 ; SSE3-FAST-LABEL: extract_extract01_v4f32_fsub_f32:
 650 ; SSE3-FAST:       # %bb.0:
 651 ; SSE3-FAST-NEXT:    hsubps %xmm0, %xmm0
 652 ; SSE3-FAST-NEXT:    retq
 653 ;
 654 ; AVX-SLOW-LABEL: extract_extract01_v4f32_fsub_f32:
 655 ; AVX-SLOW:       # %bb.0:
 656 ; AVX-SLOW-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
 657 ; AVX-SLOW-NEXT:    vsubss %xmm1, %xmm0, %xmm0
 658 ; AVX-SLOW-NEXT:    retq
 659 ;
 660 ; AVX-FAST-LABEL: extract_extract01_v4f32_fsub_f32:
 661 ; AVX-FAST:       # %bb.0:
 662 ; AVX-FAST-NEXT:    vhsubps %xmm0, %xmm0, %xmm0
 663 ; AVX-FAST-NEXT:    retq
       ; Scalar difference element 0 minus element 1 of a <4 x float>. The checks
       ; expect hsubps/vhsubps only in the fast-hops runs; the other runs expect
       ; movshdup/vmovshdup followed by a scalar subtract.
 664   %x0 = extractelement <4 x float> %x, i32 0
 665   %x1 = extractelement <4 x float> %x, i32 1
 666   %x01 = fsub float %x0, %x1
 667   ret float %x01
 668 }
 669
 670 define float @extract_extract23_v4f32_fsub_f32(<4 x float> %x) {
 671 ; SSE3-SLOW-LABEL: extract_extract23_v4f32_fsub_f32:
 672 ; SSE3-SLOW:       # %bb.0:
 673 ; SSE3-SLOW-NEXT:    movaps %xmm0, %xmm1
 674 ; SSE3-SLOW-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
 675 ; SSE3-SLOW-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
 676 ; SSE3-SLOW-NEXT:    subss %xmm0, %xmm1
 677 ; SSE3-SLOW-NEXT:    movaps %xmm1, %xmm0
 678 ; SSE3-SLOW-NEXT:    retq
 679 ;
 680 ; SSE3-FAST-LABEL: extract_extract23_v4f32_fsub_f32:
 681 ; SSE3-FAST:       # %bb.0:
 682 ; SSE3-FAST-NEXT:    hsubps %xmm0, %xmm0
 683 ; SSE3-FAST-NEXT:    movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
 684 ; SSE3-FAST-NEXT:    retq
 685 ;
 686 ; AVX-SLOW-LABEL: extract_extract23_v4f32_fsub_f32:
 687 ; AVX-SLOW:       # %bb.0:
 688 ; AVX-SLOW-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
 689 ; AVX-SLOW-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
 690 ; AVX-SLOW-NEXT:    vsubss %xmm0, %xmm1, %xmm0
 691 ; AVX-SLOW-NEXT:    retq
 692 ;
 693 ; AVX-FAST-LABEL: extract_extract23_v4f32_fsub_f32:
 694 ; AVX-FAST:       # %bb.0:
 695 ; AVX-FAST-NEXT:    vhsubps %xmm0, %xmm0, %xmm0
 696 ; AVX-FAST-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
 697 ; AVX-FAST-NEXT:    retq
       ; Scalar difference element 2 minus element 3. In the fast-hops runs the
       ; checks expect a horizontal subtract followed by movshdup/vmovshdup, which
       ; copies lane 1 of the result into lane 0. The other runs expect two
       ; shuffles plus a scalar subtract.
 698   %x0 = extractelement <4 x float> %x, i32 2
 699   %x1 = extractelement <4 x float> %x, i32 3
 700   %x01 = fsub float %x0, %x1
 701   ret float %x01
 702 }
 703
 704 define float @extract_extract01_v4f32_fsub_f32_commute(<4 x float> %x) {
 705 ; SSE3-LABEL: extract_extract01_v4f32_fsub_f32_commute:
 706 ; SSE3:       # %bb.0:
 707 ; SSE3-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
 708 ; SSE3-NEXT:    subss %xmm0, %xmm1
 709 ; SSE3-NEXT:    movaps %xmm1, %xmm0
 710 ; SSE3-NEXT:    retq
 711 ;
 712 ; AVX-LABEL: extract_extract01_v4f32_fsub_f32_commute:
 713 ; AVX:       # %bb.0:
 714 ; AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
 715 ; AVX-NEXT:    vsubss %xmm0, %xmm1, %xmm0
 716 ; AVX-NEXT:    retq
       ; Reversed difference: element 1 minus element 0. fsub is not commutative,
       ; and the checks expect no horizontal subtract in any run, even with
       ; fast-hops: the output is a shuffle plus a scalar subtract.
 717   %x0 = extractelement <4 x float> %x, i32 0
 718   %x1 = extractelement <4 x float> %x, i32 1
 719   %x01 = fsub float %x1, %x0
 720   ret float %x01
 721 }
 722
 723 define float @extract_extract23_v4f32_fsub_f32_commute(<4 x float> %x) {
 724 ; SSE3-LABEL: extract_extract23_v4f32_fsub_f32_commute:
 725 ; SSE3:       # %bb.0:
 726 ; SSE3-NEXT:    movaps %xmm0, %xmm1
 727 ; SSE3-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
 728 ; SSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
 729 ; SSE3-NEXT:    subss %xmm1, %xmm0
 730 ; SSE3-NEXT:    retq
 731 ;
 732 ; AVX-LABEL: extract_extract23_v4f32_fsub_f32_commute:
 733 ; AVX:       # %bb.0:
 734 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
 735 ; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
 736 ; AVX-NEXT:    vsubss %xmm1, %xmm0, %xmm0
 737 ; AVX-NEXT:    retq
       ; Reversed difference: element 3 minus element 2. The checks expect no
       ; horizontal subtract in any run, even with fast-hops: the output is two
       ; shuffles plus a scalar subtract.
 738   %x0 = extractelement <4 x float> %x, i32 2
 739   %x1 = extractelement <4 x float> %x, i32 3
 740   %x01 = fsub float %x1, %x0
 741   ret float %x01
 742 }
 743
 744 define double @extract_extract01_v2f64_fsub_f64(<2 x double> %x) {
 745 ; SSE3-SLOW-LABEL: extract_extract01_v2f64_fsub_f64:
 746 ; SSE3-SLOW:       # %bb.0:
 747 ; SSE3-SLOW-NEXT:    movapd %xmm0, %xmm1
 748 ; SSE3-SLOW-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
 749 ; SSE3-SLOW-NEXT:    subsd %xmm1, %xmm0
 750 ; SSE3-SLOW-NEXT:    retq
 751 ;
 752 ; SSE3-FAST-LABEL: extract_extract01_v2f64_fsub_f64:
 753 ; SSE3-FAST:       # %bb.0:
 754 ; SSE3-FAST-NEXT:    hsubpd %xmm0, %xmm0
 755 ; SSE3-FAST-NEXT:    retq
 756 ;
 757 ; AVX-SLOW-LABEL: extract_extract01_v2f64_fsub_f64:
 758 ; AVX-SLOW:       # %bb.0:
 759 ; AVX-SLOW-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
 760 ; AVX-SLOW-NEXT:    vsubsd %xmm1, %xmm0, %xmm0
 761 ; AVX-SLOW-NEXT:    retq
 762 ;
 763 ; AVX-FAST-LABEL: extract_extract01_v2f64_fsub_f64:
 764 ; AVX-FAST:       # %bb.0:
 765 ; AVX-FAST-NEXT:    vhsubpd %xmm0, %xmm0, %xmm0
 766 ; AVX-FAST-NEXT:    retq
       ; Scalar difference element 0 minus element 1 of a <2 x double>. The checks
       ; expect hsubpd/vhsubpd only in the fast-hops runs; the other runs expect a
       ; shuffle plus a scalar subtract.
 767   %x0 = extractelement <2 x double> %x, i32 0
 768   %x1 = extractelement <2 x double> %x, i32 1
 769   %x01 = fsub double %x0, %x1
 770   ret double %x01
 771 }
 772
 773 define double @extract_extract01_v2f64_fsub_f64_commute(<2 x double> %x) {
 774 ; SSE3-LABEL: extract_extract01_v2f64_fsub_f64_commute:
 775 ; SSE3:       # %bb.0:
 776 ; SSE3-NEXT:    movapd %xmm0, %xmm1
 777 ; SSE3-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
 778 ; SSE3-NEXT:    subsd %xmm0, %xmm1
 779 ; SSE3-NEXT:    movapd %xmm1, %xmm0
 780 ; SSE3-NEXT:    retq
 781 ;
 782 ; AVX-LABEL: extract_extract01_v2f64_fsub_f64_commute:
 783 ; AVX:       # %bb.0:
 784 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
 785 ; AVX-NEXT:    vsubsd %xmm0, %xmm1, %xmm0
 786 ; AVX-NEXT:    retq
       ; Reversed difference: element 1 minus element 0 of a <2 x double>. The
       ; checks expect no horizontal subtract in any run, even with fast-hops: the
       ; output is a shuffle plus a scalar subtract.
 787   %x0 = extractelement <2 x double> %x, i32 0
 788   %x1 = extractelement <2 x double> %x, i32 1
 789   %x01 = fsub double %x1, %x0
 790   ret double %x01
 791 }
 792
 793 ; 256-bit vectors, float/double, fadd/fsub
 794
 795 define float @extract_extract01_v8f32_fadd_f32(<8 x float> %x) {
 796 ; SSE3-SLOW-LABEL: extract_extract01_v8f32_fadd_f32:
 797 ; SSE3-SLOW:       # %bb.0:
 798 ; SSE3-SLOW-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
 799 ; SSE3-SLOW-NEXT:    addss %xmm1, %xmm0
 800 ; SSE3-SLOW-NEXT:    retq
 801 ;
 802 ; SSE3-FAST-LABEL: extract_extract01_v8f32_fadd_f32:
 803 ; SSE3-FAST:       # %bb.0:
 804 ; SSE3-FAST-NEXT:    haddps %xmm0, %xmm0
 805 ; SSE3-FAST-NEXT:    retq
 806 ;
 807 ; AVX-SLOW-LABEL: extract_extract01_v8f32_fadd_f32:
 808 ; AVX-SLOW:       # %bb.0:
 809 ; AVX-SLOW-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
 810 ; AVX-SLOW-NEXT:    vaddss %xmm1, %xmm0, %xmm0
 811 ; AVX-SLOW-NEXT:    vzeroupper
 812 ; AVX-SLOW-NEXT:    retq
 813 ;
 814 ; AVX-FAST-LABEL: extract_extract01_v8f32_fadd_f32:
 815 ; AVX-FAST:       # %bb.0:
 816 ; AVX-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
 817 ; AVX-FAST-NEXT:    vzeroupper
 818 ; AVX-FAST-NEXT:    retq
       ; Scalar sum of elements 0 and 1 of a 256-bit <8 x float>. The checks expect
       ; the same xmm-width code as the 128-bit version of this test (a horizontal
       ; add only with fast-hops), plus a vzeroupper before the return in the AVX
       ; runs.
 819   %x0 = extractelement <8 x float> %x, i32 0
 820   %x1 = extractelement <8 x float> %x, i32 1
 821   %x01 = fadd float %x0, %x1
 822   ret float %x01
 823 }
 824
 825 define float @extract_extract23_v8f32_fadd_f32(<8 x float> %x) { ; Scalar x[2] + x[3] taken from an <8 x float>; expected code only reads xmm0.
 826 ; SSE3-SLOW-LABEL: extract_extract23_v8f32_fadd_f32:
 827 ; SSE3-SLOW:       # %bb.0:
 828 ; SSE3-SLOW-NEXT:    movaps %xmm0, %xmm1
 829 ; SSE3-SLOW-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
 830 ; SSE3-SLOW-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
 831 ; SSE3-SLOW-NEXT:    addss %xmm1, %xmm0
 832 ; SSE3-SLOW-NEXT:    retq
 833 ;
 834 ; SSE3-FAST-LABEL: extract_extract23_v8f32_fadd_f32:
 835 ; SSE3-FAST:       # %bb.0:
 836 ; SSE3-FAST-NEXT:    haddps %xmm0, %xmm0
 837 ; SSE3-FAST-NEXT:    movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
 838 ; SSE3-FAST-NEXT:    retq
 839 ;
 840 ; AVX-SLOW-LABEL: extract_extract23_v8f32_fadd_f32:
 841 ; AVX-SLOW:       # %bb.0:
 842 ; AVX-SLOW-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
 843 ; AVX-SLOW-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
 844 ; AVX-SLOW-NEXT:    vaddss %xmm0, %xmm1, %xmm0
 845 ; AVX-SLOW-NEXT:    vzeroupper
 846 ; AVX-SLOW-NEXT:    retq
 847 ;
 848 ; AVX-FAST-LABEL: extract_extract23_v8f32_fadd_f32:
 849 ; AVX-FAST:       # %bb.0:
 850 ; AVX-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
 851 ; AVX-FAST-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
 852 ; AVX-FAST-NEXT:    vzeroupper
 853 ; AVX-FAST-NEXT:    retq
 854   %x0 = extractelement <8 x float> %x, i32 2
 855   %x1 = extractelement <8 x float> %x, i32 3
 856   %x01 = fadd float %x0, %x1 ; fast-hops runs expect a horizontal add, then a shuffle that copies element 1 of the result into element 0
 857   ret float %x01
 858 }
 859
 860 define float @extract_extract67_v8f32_fadd_f32(<8 x float> %x) { ; Scalar x[6] + x[7]: both lanes sit in the upper 128 bits of the <8 x float>.
 861 ; SSE3-SLOW-LABEL: extract_extract67_v8f32_fadd_f32:
 862 ; SSE3-SLOW:       # %bb.0:
 863 ; SSE3-SLOW-NEXT:    movaps %xmm1, %xmm0
 864 ; SSE3-SLOW-NEXT:    movhlps {{.*#+}} xmm1 = xmm1[1,1]
 865 ; SSE3-SLOW-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
 866 ; SSE3-SLOW-NEXT:    addss %xmm1, %xmm0
 867 ; SSE3-SLOW-NEXT:    retq
 868 ;
 869 ; SSE3-FAST-LABEL: extract_extract67_v8f32_fadd_f32:
 870 ; SSE3-FAST:       # %bb.0:
 871 ; SSE3-FAST-NEXT:    haddps %xmm1, %xmm1
 872 ; SSE3-FAST-NEXT:    movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
 873 ; SSE3-FAST-NEXT:    retq
 874 ;
 875 ; AVX-SLOW-LABEL: extract_extract67_v8f32_fadd_f32:
 876 ; AVX-SLOW:       # %bb.0:
 877 ; AVX-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm0
 878 ; AVX-SLOW-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
 879 ; AVX-SLOW-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
 880 ; AVX-SLOW-NEXT:    vaddss %xmm0, %xmm1, %xmm0
 881 ; AVX-SLOW-NEXT:    vzeroupper
 882 ; AVX-SLOW-NEXT:    retq
 883 ;
 884 ; AVX-FAST-LABEL: extract_extract67_v8f32_fadd_f32:
 885 ; AVX-FAST:       # %bb.0:
 886 ; AVX-FAST-NEXT:    vextractf128 $1, %ymm0, %xmm0
 887 ; AVX-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
 888 ; AVX-FAST-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
 889 ; AVX-FAST-NEXT:    vzeroupper
 890 ; AVX-FAST-NEXT:    retq
 891   %x0 = extractelement <8 x float> %x, i32 6
 892   %x1 = extractelement <8 x float> %x, i32 7
 893   %x01 = fadd float %x0, %x1 ; expected code works on the upper half: xmm1 directly, or ymm0 after vextractf128 $1
 894   ret float %x01
 895 }
 896
 897 define float @extract_extract01_v8f32_fadd_f32_commute(<8 x float> %x) { ; Scalar x[1] + x[0]: lanes 0 and 1 of an <8 x float> with the fadd operands swapped.
 898 ; SSE3-SLOW-LABEL: extract_extract01_v8f32_fadd_f32_commute:
 899 ; SSE3-SLOW:       # %bb.0:
 900 ; SSE3-SLOW-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
 901 ; SSE3-SLOW-NEXT:    addss %xmm1, %xmm0
 902 ; SSE3-SLOW-NEXT:    retq
 903 ;
 904 ; SSE3-FAST-LABEL: extract_extract01_v8f32_fadd_f32_commute:
 905 ; SSE3-FAST:       # %bb.0:
 906 ; SSE3-FAST-NEXT:    haddps %xmm0, %xmm0
 907 ; SSE3-FAST-NEXT:    retq
 908 ;
 909 ; AVX-SLOW-LABEL: extract_extract01_v8f32_fadd_f32_commute:
 910 ; AVX-SLOW:       # %bb.0:
 911 ; AVX-SLOW-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
 912 ; AVX-SLOW-NEXT:    vaddss %xmm0, %xmm1, %xmm0
 913 ; AVX-SLOW-NEXT:    vzeroupper
 914 ; AVX-SLOW-NEXT:    retq
 915 ;
 916 ; AVX-FAST-LABEL: extract_extract01_v8f32_fadd_f32_commute:
 917 ; AVX-FAST:       # %bb.0:
 918 ; AVX-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
 919 ; AVX-FAST-NEXT:    vzeroupper
 920 ; AVX-FAST-NEXT:    retq
 921   %x0 = extractelement <8 x float> %x, i32 0
 922   %x1 = extractelement <8 x float> %x, i32 1
 923   %x01 = fadd float %x1, %x0 ; swapped fadd operands; fast-hops runs still expect one horizontal add on xmm0
 924   ret float %x01
 925 }
 926
 927 define float @extract_extract23_v8f32_fadd_f32_commute(<8 x float> %x) { ; Scalar x[3] + x[2]: lanes 2 and 3 of an <8 x float> with the fadd operands swapped.
 928 ; SSE3-SLOW-LABEL: extract_extract23_v8f32_fadd_f32_commute:
 929 ; SSE3-SLOW:       # %bb.0:
 930 ; SSE3-SLOW-NEXT:    movaps %xmm0, %xmm1
 931 ; SSE3-SLOW-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
 932 ; SSE3-SLOW-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
 933 ; SSE3-SLOW-NEXT:    addss %xmm1, %xmm0
 934 ; SSE3-SLOW-NEXT:    retq
 935 ;
 936 ; SSE3-FAST-LABEL: extract_extract23_v8f32_fadd_f32_commute:
 937 ; SSE3-FAST:       # %bb.0:
 938 ; SSE3-FAST-NEXT:    haddps %xmm0, %xmm0
 939 ; SSE3-FAST-NEXT:    movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
 940 ; SSE3-FAST-NEXT:    retq
 941 ;
 942 ; AVX-SLOW-LABEL: extract_extract23_v8f32_fadd_f32_commute:
 943 ; AVX-SLOW:       # %bb.0:
 944 ; AVX-SLOW-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
 945 ; AVX-SLOW-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
 946 ; AVX-SLOW-NEXT:    vaddss %xmm1, %xmm0, %xmm0
 947 ; AVX-SLOW-NEXT:    vzeroupper
 948 ; AVX-SLOW-NEXT:    retq
 949 ;
 950 ; AVX-FAST-LABEL: extract_extract23_v8f32_fadd_f32_commute:
 951 ; AVX-FAST:       # %bb.0:
 952 ; AVX-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
 953 ; AVX-FAST-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
 954 ; AVX-FAST-NEXT:    vzeroupper
 955 ; AVX-FAST-NEXT:    retq
 956   %x0 = extractelement <8 x float> %x, i32 2
 957   %x1 = extractelement <8 x float> %x, i32 3
 958   %x01 = fadd float %x1, %x0 ; swapped fadd operands; fast-hops runs still expect a horizontal add followed by a shuffle
 959   ret float %x01
 960 }
 961
 962 define float @extract_extract67_v8f32_fadd_f32_commute(<8 x float> %x) { ; Scalar x[7] + x[6]: upper-half lanes of an <8 x float> with the fadd operands swapped.
 963 ; SSE3-SLOW-LABEL: extract_extract67_v8f32_fadd_f32_commute:
 964 ; SSE3-SLOW:       # %bb.0:
 965 ; SSE3-SLOW-NEXT:    movaps %xmm1, %xmm0
 966 ; SSE3-SLOW-NEXT:    movhlps {{.*#+}} xmm1 = xmm1[1,1]
 967 ; SSE3-SLOW-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
 968 ; SSE3-SLOW-NEXT:    addss %xmm1, %xmm0
 969 ; SSE3-SLOW-NEXT:    retq
 970 ;
 971 ; SSE3-FAST-LABEL: extract_extract67_v8f32_fadd_f32_commute:
 972 ; SSE3-FAST:       # %bb.0:
 973 ; SSE3-FAST-NEXT:    haddps %xmm1, %xmm1
 974 ; SSE3-FAST-NEXT:    movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
 975 ; SSE3-FAST-NEXT:    retq
 976 ;
 977 ; AVX-SLOW-LABEL: extract_extract67_v8f32_fadd_f32_commute:
 978 ; AVX-SLOW:       # %bb.0:
 979 ; AVX-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm0
 980 ; AVX-SLOW-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
 981 ; AVX-SLOW-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
 982 ; AVX-SLOW-NEXT:    vaddss %xmm1, %xmm0, %xmm0
 983 ; AVX-SLOW-NEXT:    vzeroupper
 984 ; AVX-SLOW-NEXT:    retq
 985 ;
 986 ; AVX-FAST-LABEL: extract_extract67_v8f32_fadd_f32_commute:
 987 ; AVX-FAST:       # %bb.0:
 988 ; AVX-FAST-NEXT:    vextractf128 $1, %ymm0, %xmm0
 989 ; AVX-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
 990 ; AVX-FAST-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
 991 ; AVX-FAST-NEXT:    vzeroupper
 992 ; AVX-FAST-NEXT:    retq
 993   %x0 = extractelement <8 x float> %x, i32 6
 994   %x1 = extractelement <8 x float> %x, i32 7
 995   %x01 = fadd float %x1, %x0 ; swapped fadd operands; expected code works on the upper half (xmm1, or ymm0 after vextractf128 $1)
 996   ret float %x01
 997 }
 998
 999 define double @extract_extract01_v4f64_fadd_f64(<4 x double> %x) { ; Scalar x[0] + x[1] taken from a <4 x double>.
1000 ; SSE3-SLOW-LABEL: extract_extract01_v4f64_fadd_f64:
1001 ; SSE3-SLOW:       # %bb.0:
1002 ; SSE3-SLOW-NEXT:    movapd %xmm0, %xmm1
1003 ; SSE3-SLOW-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
1004 ; SSE3-SLOW-NEXT:    addsd %xmm0, %xmm1
1005 ; SSE3-SLOW-NEXT:    movapd %xmm1, %xmm0
1006 ; SSE3-SLOW-NEXT:    retq
1007 ;
1008 ; SSE3-FAST-LABEL: extract_extract01_v4f64_fadd_f64:
1009 ; SSE3-FAST:       # %bb.0:
1010 ; SSE3-FAST-NEXT:    haddpd %xmm0, %xmm0
1011 ; SSE3-FAST-NEXT:    retq
1012 ;
1013 ; AVX-SLOW-LABEL: extract_extract01_v4f64_fadd_f64:
1014 ; AVX-SLOW:       # %bb.0:
1015 ; AVX-SLOW-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
1016 ; AVX-SLOW-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
1017 ; AVX-SLOW-NEXT:    vzeroupper
1018 ; AVX-SLOW-NEXT:    retq
1019 ;
1020 ; AVX-FAST-LABEL: extract_extract01_v4f64_fadd_f64:
1021 ; AVX-FAST:       # %bb.0:
1022 ; AVX-FAST-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
1023 ; AVX-FAST-NEXT:    vzeroupper
1024 ; AVX-FAST-NEXT:    retq
1025   %x0 = extractelement <4 x double> %x, i32 0
1026   %x1 = extractelement <4 x double> %x, i32 1
1027   %x01 = fadd double %x0, %x1 ; fast-hops runs expect one horizontal add on xmm0; the other runs expect a shuffle plus scalar add
1028   ret double %x01
1029 }
1030
1031 define double @extract_extract23_v4f64_fadd_f64(<4 x double> %x) { ; Scalar x[2] + x[3]: both lanes sit in the upper 128 bits of the <4 x double>.
1032 ; SSE3-SLOW-LABEL: extract_extract23_v4f64_fadd_f64:
1033 ; SSE3-SLOW:       # %bb.0:
1034 ; SSE3-SLOW-NEXT:    movapd %xmm1, %xmm0
1035 ; SSE3-SLOW-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
1036 ; SSE3-SLOW-NEXT:    addsd %xmm1, %xmm0
1037 ; SSE3-SLOW-NEXT:    retq
1038 ;
1039 ; SSE3-FAST-LABEL: extract_extract23_v4f64_fadd_f64:
1040 ; SSE3-FAST:       # %bb.0:
1041 ; SSE3-FAST-NEXT:    movapd %xmm1, %xmm0
1042 ; SSE3-FAST-NEXT:    haddpd %xmm1, %xmm0
1043 ; SSE3-FAST-NEXT:    retq
1044 ;
1045 ; AVX-SLOW-LABEL: extract_extract23_v4f64_fadd_f64:
1046 ; AVX-SLOW:       # %bb.0:
1047 ; AVX-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm0
1048 ; AVX-SLOW-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
1049 ; AVX-SLOW-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
1050 ; AVX-SLOW-NEXT:    vzeroupper
1051 ; AVX-SLOW-NEXT:    retq
1052 ;
1053 ; AVX-FAST-LABEL: extract_extract23_v4f64_fadd_f64:
1054 ; AVX-FAST:       # %bb.0:
1055 ; AVX-FAST-NEXT:    vextractf128 $1, %ymm0, %xmm0
1056 ; AVX-FAST-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
1057 ; AVX-FAST-NEXT:    vzeroupper
1058 ; AVX-FAST-NEXT:    retq
1059   %x0 = extractelement <4 x double> %x, i32 2
1060   %x1 = extractelement <4 x double> %x, i32 3
1061   %x01 = fadd double %x0, %x1 ; expected code works on the upper half: xmm1 directly, or ymm0 after vextractf128 $1
1062   ret double %x01
1063 }
1064
1065 define double @extract_extract01_v4f64_fadd_f64_commute(<4 x double> %x) { ; Scalar x[1] + x[0]: lanes 0 and 1 of a <4 x double> with the fadd operands swapped.
1066 ; SSE3-SLOW-LABEL: extract_extract01_v4f64_fadd_f64_commute:
1067 ; SSE3-SLOW:       # %bb.0:
1068 ; SSE3-SLOW-NEXT:    movapd %xmm0, %xmm1
1069 ; SSE3-SLOW-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
1070 ; SSE3-SLOW-NEXT:    addsd %xmm0, %xmm1
1071 ; SSE3-SLOW-NEXT:    movapd %xmm1, %xmm0
1072 ; SSE3-SLOW-NEXT:    retq
1073 ;
1074 ; SSE3-FAST-LABEL: extract_extract01_v4f64_fadd_f64_commute:
1075 ; SSE3-FAST:       # %bb.0:
1076 ; SSE3-FAST-NEXT:    haddpd %xmm0, %xmm0
1077 ; SSE3-FAST-NEXT:    retq
1078 ;
1079 ; AVX-SLOW-LABEL: extract_extract01_v4f64_fadd_f64_commute:
1080 ; AVX-SLOW:       # %bb.0:
1081 ; AVX-SLOW-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
1082 ; AVX-SLOW-NEXT:    vaddsd %xmm0, %xmm1, %xmm0
1083 ; AVX-SLOW-NEXT:    vzeroupper
1084 ; AVX-SLOW-NEXT:    retq
1085 ;
1086 ; AVX-FAST-LABEL: extract_extract01_v4f64_fadd_f64_commute:
1087 ; AVX-FAST:       # %bb.0:
1088 ; AVX-FAST-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
1089 ; AVX-FAST-NEXT:    vzeroupper
1090 ; AVX-FAST-NEXT:    retq
1091   %x0 = extractelement <4 x double> %x, i32 0
1092   %x1 = extractelement <4 x double> %x, i32 1
1093   %x01 = fadd double %x1, %x0 ; swapped fadd operands; fast-hops runs still expect one horizontal add on xmm0
1094   ret double %x01
1095 }
1096
1097 define double @extract_extract23_v4f64_fadd_f64_commute(<4 x double> %x) { ; Scalar x[3] + x[2]: upper-half lanes of a <4 x double> with the fadd operands swapped.
1098 ; SSE3-SLOW-LABEL: extract_extract23_v4f64_fadd_f64_commute:
1099 ; SSE3-SLOW:       # %bb.0:
1100 ; SSE3-SLOW-NEXT:    movapd %xmm1, %xmm0
1101 ; SSE3-SLOW-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
1102 ; SSE3-SLOW-NEXT:    addsd %xmm1, %xmm0
1103 ; SSE3-SLOW-NEXT:    retq
1104 ;
1105 ; SSE3-FAST-LABEL: extract_extract23_v4f64_fadd_f64_commute:
1106 ; SSE3-FAST:       # %bb.0:
1107 ; SSE3-FAST-NEXT:    movapd %xmm1, %xmm0
1108 ; SSE3-FAST-NEXT:    haddpd %xmm1, %xmm0
1109 ; SSE3-FAST-NEXT:    retq
1110 ;
1111 ; AVX-SLOW-LABEL: extract_extract23_v4f64_fadd_f64_commute:
1112 ; AVX-SLOW:       # %bb.0:
1113 ; AVX-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm0
1114 ; AVX-SLOW-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
1115 ; AVX-SLOW-NEXT:    vaddsd %xmm0, %xmm1, %xmm0
1116 ; AVX-SLOW-NEXT:    vzeroupper
1117 ; AVX-SLOW-NEXT:    retq
1118 ;
1119 ; AVX-FAST-LABEL: extract_extract23_v4f64_fadd_f64_commute:
1120 ; AVX-FAST:       # %bb.0:
1121 ; AVX-FAST-NEXT:    vextractf128 $1, %ymm0, %xmm0
1122 ; AVX-FAST-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
1123 ; AVX-FAST-NEXT:    vzeroupper
1124 ; AVX-FAST-NEXT:    retq
1125   %x0 = extractelement <4 x double> %x, i32 2
1126   %x1 = extractelement <4 x double> %x, i32 3
1127   %x01 = fadd double %x1, %x0 ; swapped fadd operands; expected code works on the upper half (xmm1, or ymm0 after vextractf128 $1)
1128   ret double %x01
1129 }
1130
1131 define float @extract_extract01_v8f32_fsub_f32(<8 x float> %x) { ; Scalar x[0] - x[1] taken from an <8 x float>.
1132 ; SSE3-SLOW-LABEL: extract_extract01_v8f32_fsub_f32:
1133 ; SSE3-SLOW:       # %bb.0:
1134 ; SSE3-SLOW-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
1135 ; SSE3-SLOW-NEXT:    subss %xmm1, %xmm0
1136 ; SSE3-SLOW-NEXT:    retq
1137 ;
1138 ; SSE3-FAST-LABEL: extract_extract01_v8f32_fsub_f32:
1139 ; SSE3-FAST:       # %bb.0:
1140 ; SSE3-FAST-NEXT:    hsubps %xmm0, %xmm0
1141 ; SSE3-FAST-NEXT:    retq
1142 ;
1143 ; AVX-SLOW-LABEL: extract_extract01_v8f32_fsub_f32:
1144 ; AVX-SLOW:       # %bb.0:
1145 ; AVX-SLOW-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
1146 ; AVX-SLOW-NEXT:    vsubss %xmm1, %xmm0, %xmm0
1147 ; AVX-SLOW-NEXT:    vzeroupper
1148 ; AVX-SLOW-NEXT:    retq
1149 ;
1150 ; AVX-FAST-LABEL: extract_extract01_v8f32_fsub_f32:
1151 ; AVX-FAST:       # %bb.0:
1152 ; AVX-FAST-NEXT:    vhsubps %xmm0, %xmm0, %xmm0
1153 ; AVX-FAST-NEXT:    vzeroupper
1154 ; AVX-FAST-NEXT:    retq
1155   %x0 = extractelement <8 x float> %x, i32 0
1156   %x1 = extractelement <8 x float> %x, i32 1
1157   %x01 = fsub float %x0, %x1 ; fast-hops runs expect one horizontal sub on xmm0; the other runs expect a shuffle plus scalar sub
1158   ret float %x01
1159 }
1160
1161 define float @extract_extract23_v8f32_fsub_f32(<8 x float> %x) { ; Scalar x[2] - x[3] taken from an <8 x float>; expected code only reads xmm0.
1162 ; SSE3-SLOW-LABEL: extract_extract23_v8f32_fsub_f32:
1163 ; SSE3-SLOW:       # %bb.0:
1164 ; SSE3-SLOW-NEXT:    movaps %xmm0, %xmm1
1165 ; SSE3-SLOW-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
1166 ; SSE3-SLOW-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
1167 ; SSE3-SLOW-NEXT:    subss %xmm0, %xmm1
1168 ; SSE3-SLOW-NEXT:    movaps %xmm1, %xmm0
1169 ; SSE3-SLOW-NEXT:    retq
1170 ;
1171 ; SSE3-FAST-LABEL: extract_extract23_v8f32_fsub_f32:
1172 ; SSE3-FAST:       # %bb.0:
1173 ; SSE3-FAST-NEXT:    hsubps %xmm0, %xmm0
1174 ; SSE3-FAST-NEXT:    movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
1175 ; SSE3-FAST-NEXT:    retq
1176 ;
1177 ; AVX-SLOW-LABEL: extract_extract23_v8f32_fsub_f32:
1178 ; AVX-SLOW:       # %bb.0:
1179 ; AVX-SLOW-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
1180 ; AVX-SLOW-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
1181 ; AVX-SLOW-NEXT:    vsubss %xmm0, %xmm1, %xmm0
1182 ; AVX-SLOW-NEXT:    vzeroupper
1183 ; AVX-SLOW-NEXT:    retq
1184 ;
1185 ; AVX-FAST-LABEL: extract_extract23_v8f32_fsub_f32:
1186 ; AVX-FAST:       # %bb.0:
1187 ; AVX-FAST-NEXT:    vhsubps %xmm0, %xmm0, %xmm0
1188 ; AVX-FAST-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
1189 ; AVX-FAST-NEXT:    vzeroupper
1190 ; AVX-FAST-NEXT:    retq
1191   %x0 = extractelement <8 x float> %x, i32 2
1192   %x1 = extractelement <8 x float> %x, i32 3
1193   %x01 = fsub float %x0, %x1 ; fast-hops runs expect a horizontal sub, then a shuffle that copies element 1 of the result into element 0
1194   ret float %x01
1195 }
1196
1197 define float @extract_extract45_v8f32_fsub_f32(<8 x float> %x) { ; Scalar x[4] - x[5]: both lanes sit in the upper 128 bits of the <8 x float>.
1198 ; SSE3-SLOW-LABEL: extract_extract45_v8f32_fsub_f32:
1199 ; SSE3-SLOW:       # %bb.0:
1200 ; SSE3-SLOW-NEXT:    movaps %xmm1, %xmm0
1201 ; SSE3-SLOW-NEXT:    movshdup {{.*#+}} xmm1 = xmm1[1,1,3,3]
1202 ; SSE3-SLOW-NEXT:    subss %xmm1, %xmm0
1203 ; SSE3-SLOW-NEXT:    retq
1204 ;
1205 ; SSE3-FAST-LABEL: extract_extract45_v8f32_fsub_f32:
1206 ; SSE3-FAST:       # %bb.0:
1207 ; SSE3-FAST-NEXT:    movaps %xmm1, %xmm0
1208 ; SSE3-FAST-NEXT:    hsubps %xmm1, %xmm0
1209 ; SSE3-FAST-NEXT:    retq
1210 ;
1211 ; AVX-SLOW-LABEL: extract_extract45_v8f32_fsub_f32:
1212 ; AVX-SLOW:       # %bb.0:
1213 ; AVX-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm0
1214 ; AVX-SLOW-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
1215 ; AVX-SLOW-NEXT:    vsubss %xmm1, %xmm0, %xmm0
1216 ; AVX-SLOW-NEXT:    vzeroupper
1217 ; AVX-SLOW-NEXT:    retq
1218 ;
1219 ; AVX-FAST-LABEL: extract_extract45_v8f32_fsub_f32:
1220 ; AVX-FAST:       # %bb.0:
1221 ; AVX-FAST-NEXT:    vextractf128 $1, %ymm0, %xmm0
1222 ; AVX-FAST-NEXT:    vhsubps %xmm0, %xmm0, %xmm0
1223 ; AVX-FAST-NEXT:    vzeroupper
1224 ; AVX-FAST-NEXT:    retq
1225   %x0 = extractelement <8 x float> %x, i32 4
1226   %x1 = extractelement <8 x float> %x, i32 5
1227   %x01 = fsub float %x0, %x1 ; upper half is xmm1, or ymm0 after vextractf128 $1; fast-hops runs then expect a horizontal sub
1228   ret float %x01
1229 }
1230
1231 ; Negative test: reversed operands (x[1] - x[0]), so no hsub is expected...or get hoppy and negate?
1232
1233 define float @extract_extract01_v8f32_fsub_f32_commute(<8 x float> %x) { ; Scalar x[1] - x[0] (reversed operand order) taken from an <8 x float>.
1234 ; SSE3-LABEL: extract_extract01_v8f32_fsub_f32_commute:
1235 ; SSE3:       # %bb.0:
1236 ; SSE3-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
1237 ; SSE3-NEXT:    subss %xmm0, %xmm1
1238 ; SSE3-NEXT:    movaps %xmm1, %xmm0
1239 ; SSE3-NEXT:    retq
1240 ;
1241 ; AVX-LABEL: extract_extract01_v8f32_fsub_f32_commute:
1242 ; AVX:       # %bb.0:
1243 ; AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
1244 ; AVX-NEXT:    vsubss %xmm0, %xmm1, %xmm0
1245 ; AVX-NEXT:    vzeroupper
1246 ; AVX-NEXT:    retq
1247   %x0 = extractelement <8 x float> %x, i32 0
1248   %x1 = extractelement <8 x float> %x, i32 1
1249   %x01 = fsub float %x1, %x0 ; one shared check block for fast-hops and slow-hops runs: shuffle plus scalar sub, no horizontal sub expected
1250   ret float %x01
1251 }
1252
1253 define double @extract_extract01_v4f64_fsub_f64(<4 x double> %x) { ; Scalar x[0] - x[1] taken from a <4 x double>.
1254 ; SSE3-SLOW-LABEL: extract_extract01_v4f64_fsub_f64:
1255 ; SSE3-SLOW:       # %bb.0:
1256 ; SSE3-SLOW-NEXT:    movapd %xmm0, %xmm1
1257 ; SSE3-SLOW-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
1258 ; SSE3-SLOW-NEXT:    subsd %xmm1, %xmm0
1259 ; SSE3-SLOW-NEXT:    retq
1260 ;
1261 ; SSE3-FAST-LABEL: extract_extract01_v4f64_fsub_f64:
1262 ; SSE3-FAST:       # %bb.0:
1263 ; SSE3-FAST-NEXT:    hsubpd %xmm0, %xmm0
1264 ; SSE3-FAST-NEXT:    retq
1265 ;
1266 ; AVX-SLOW-LABEL: extract_extract01_v4f64_fsub_f64:
1267 ; AVX-SLOW:       # %bb.0:
1268 ; AVX-SLOW-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
1269 ; AVX-SLOW-NEXT:    vsubsd %xmm1, %xmm0, %xmm0
1270 ; AVX-SLOW-NEXT:    vzeroupper
1271 ; AVX-SLOW-NEXT:    retq
1272 ;
1273 ; AVX-FAST-LABEL: extract_extract01_v4f64_fsub_f64:
1274 ; AVX-FAST:       # %bb.0:
1275 ; AVX-FAST-NEXT:    vhsubpd %xmm0, %xmm0, %xmm0
1276 ; AVX-FAST-NEXT:    vzeroupper
1277 ; AVX-FAST-NEXT:    retq
1278   %x0 = extractelement <4 x double> %x, i32 0
1279   %x1 = extractelement <4 x double> %x, i32 1
1280   %x01 = fsub double %x0, %x1 ; fast-hops runs expect one horizontal sub on xmm0; the other runs expect a shuffle plus scalar sub
1281   ret double %x01
1282 }
1283
1284 ; Negative test: reversed operands (x[1] - x[0]), so no hsub is expected...or get hoppy and negate?
1285
1286 define double @extract_extract01_v4f64_fsub_f64_commute(<4 x double> %x) { ; Scalar x[1] - x[0] (reversed operand order) taken from a <4 x double>.
1287 ; SSE3-LABEL: extract_extract01_v4f64_fsub_f64_commute:
1288 ; SSE3:       # %bb.0:
1289 ; SSE3-NEXT:    movapd %xmm0, %xmm1
1290 ; SSE3-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
1291 ; SSE3-NEXT:    subsd %xmm0, %xmm1
1292 ; SSE3-NEXT:    movapd %xmm1, %xmm0
1293 ; SSE3-NEXT:    retq
1294 ;
1295 ; AVX-LABEL: extract_extract01_v4f64_fsub_f64_commute:
1296 ; AVX:       # %bb.0:
1297 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
1298 ; AVX-NEXT:    vsubsd %xmm0, %xmm1, %xmm0
1299 ; AVX-NEXT:    vzeroupper
1300 ; AVX-NEXT:    retq
1301   %x0 = extractelement <4 x double> %x, i32 0
1302   %x1 = extractelement <4 x double> %x, i32 1
1303   %x01 = fsub double %x1, %x0 ; one shared check block for fast-hops and slow-hops runs: shuffle plus scalar sub, no horizontal sub expected
1304   ret double %x01
1305 }
1306
1307 ; 512-bit vectors, float/double, fadd/fsub
1308
1309 define float @extract_extract01_v16f32_fadd_f32(<16 x float> %x) { ; Scalar x[0] + x[1] taken from a <16 x float>; expected code only touches xmm0.
1310 ; SSE3-SLOW-LABEL: extract_extract01_v16f32_fadd_f32:
1311 ; SSE3-SLOW:       # %bb.0:
1312 ; SSE3-SLOW-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
1313 ; SSE3-SLOW-NEXT:    addss %xmm1, %xmm0
1314 ; SSE3-SLOW-NEXT:    retq
1315 ;
1316 ; SSE3-FAST-LABEL: extract_extract01_v16f32_fadd_f32:
1317 ; SSE3-FAST:       # %bb.0:
1318 ; SSE3-FAST-NEXT:    haddps %xmm0, %xmm0
1319 ; SSE3-FAST-NEXT:    retq
1320 ;
1321 ; AVX-SLOW-LABEL: extract_extract01_v16f32_fadd_f32:
1322 ; AVX-SLOW:       # %bb.0:
1323 ; AVX-SLOW-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
1324 ; AVX-SLOW-NEXT:    vaddss %xmm1, %xmm0, %xmm0
1325 ; AVX-SLOW-NEXT:    vzeroupper
1326 ; AVX-SLOW-NEXT:    retq
1327 ;
1328 ; AVX-FAST-LABEL: extract_extract01_v16f32_fadd_f32:
1329 ; AVX-FAST:       # %bb.0:
1330 ; AVX-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
1331 ; AVX-FAST-NEXT:    vzeroupper
1332 ; AVX-FAST-NEXT:    retq
1333   %x0 = extractelement <16 x float> %x, i32 0
1334   %x1 = extractelement <16 x float> %x, i32 1
1335   %x01 = fadd float %x0, %x1 ; fast-hops runs expect one horizontal add on xmm0; the other runs expect a shuffle plus scalar add
1336   ret float %x01
1337 }
1338
1339 define float @extract_extract01_v16f32_fadd_f32_commute(<16 x float> %x) { ; Scalar x[1] + x[0]: lanes 0 and 1 of a <16 x float> with the fadd operands swapped.
1340 ; SSE3-SLOW-LABEL: extract_extract01_v16f32_fadd_f32_commute:
1341 ; SSE3-SLOW:       # %bb.0:
1342 ; SSE3-SLOW-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
1343 ; SSE3-SLOW-NEXT:    addss %xmm1, %xmm0
1344 ; SSE3-SLOW-NEXT:    retq
1345 ;
1346 ; SSE3-FAST-LABEL: extract_extract01_v16f32_fadd_f32_commute:
1347 ; SSE3-FAST:       # %bb.0:
1348 ; SSE3-FAST-NEXT:    haddps %xmm0, %xmm0
1349 ; SSE3-FAST-NEXT:    retq
1350 ;
1351 ; AVX-SLOW-LABEL: extract_extract01_v16f32_fadd_f32_commute:
1352 ; AVX-SLOW:       # %bb.0:
1353 ; AVX-SLOW-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
1354 ; AVX-SLOW-NEXT:    vaddss %xmm0, %xmm1, %xmm0
1355 ; AVX-SLOW-NEXT:    vzeroupper
1356 ; AVX-SLOW-NEXT:    retq
1357 ;
1358 ; AVX-FAST-LABEL: extract_extract01_v16f32_fadd_f32_commute:
1359 ; AVX-FAST:       # %bb.0:
1360 ; AVX-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
1361 ; AVX-FAST-NEXT:    vzeroupper
1362 ; AVX-FAST-NEXT:    retq
1363   %x0 = extractelement <16 x float> %x, i32 0
1364   %x1 = extractelement <16 x float> %x, i32 1
1365   %x01 = fadd float %x1, %x0 ; swapped fadd operands; fast-hops runs still expect one horizontal add on xmm0
1366   ret float %x01
1367 }
1368
1369 define double @extract_extract01_v8f64_fadd_f64(<8 x double> %x) { ; Scalar x[0] + x[1] taken from an <8 x double>; expected code only touches xmm0/xmm1 scratch.
1370 ; SSE3-SLOW-LABEL: extract_extract01_v8f64_fadd_f64:
1371 ; SSE3-SLOW:       # %bb.0:
1372 ; SSE3-SLOW-NEXT:    movapd %xmm0, %xmm1
1373 ; SSE3-SLOW-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
1374 ; SSE3-SLOW-NEXT:    addsd %xmm0, %xmm1
1375 ; SSE3-SLOW-NEXT:    movapd %xmm1, %xmm0
1376 ; SSE3-SLOW-NEXT:    retq
1377 ;
1378 ; SSE3-FAST-LABEL: extract_extract01_v8f64_fadd_f64:
1379 ; SSE3-FAST:       # %bb.0:
1380 ; SSE3-FAST-NEXT:    haddpd %xmm0, %xmm0
1381 ; SSE3-FAST-NEXT:    retq
1382 ;
1383 ; AVX-SLOW-LABEL: extract_extract01_v8f64_fadd_f64:
1384 ; AVX-SLOW:       # %bb.0:
1385 ; AVX-SLOW-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
1386 ; AVX-SLOW-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
1387 ; AVX-SLOW-NEXT:    vzeroupper
1388 ; AVX-SLOW-NEXT:    retq
1389 ;
1390 ; AVX-FAST-LABEL: extract_extract01_v8f64_fadd_f64:
1391 ; AVX-FAST:       # %bb.0:
1392 ; AVX-FAST-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
1393 ; AVX-FAST-NEXT:    vzeroupper
1394 ; AVX-FAST-NEXT:    retq
1395   %x0 = extractelement <8 x double> %x, i32 0
1396   %x1 = extractelement <8 x double> %x, i32 1
1397   %x01 = fadd double %x0, %x1 ; fast-hops runs expect one horizontal add on xmm0; the other runs expect a shuffle plus scalar add
1398   ret double %x01
1399 }
1400
1401 define double @extract_extract01_v8f64_fadd_f64_commute(<8 x double> %x) { ; Scalar x[1] + x[0]: lanes 0 and 1 of an <8 x double> with the fadd operands swapped.
1402 ; SSE3-SLOW-LABEL: extract_extract01_v8f64_fadd_f64_commute:
1403 ; SSE3-SLOW:       # %bb.0:
1404 ; SSE3-SLOW-NEXT:    movapd %xmm0, %xmm1
1405 ; SSE3-SLOW-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
1406 ; SSE3-SLOW-NEXT:    addsd %xmm0, %xmm1
1407 ; SSE3-SLOW-NEXT:    movapd %xmm1, %xmm0
1408 ; SSE3-SLOW-NEXT:    retq
1409 ;
1410 ; SSE3-FAST-LABEL: extract_extract01_v8f64_fadd_f64_commute:
1411 ; SSE3-FAST:       # %bb.0:
1412 ; SSE3-FAST-NEXT:    haddpd %xmm0, %xmm0
1413 ; SSE3-FAST-NEXT:    retq
1414 ;
1415 ; AVX-SLOW-LABEL: extract_extract01_v8f64_fadd_f64_commute:
1416 ; AVX-SLOW:       # %bb.0:
1417 ; AVX-SLOW-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
1418 ; AVX-SLOW-NEXT:    vaddsd %xmm0, %xmm1, %xmm0
1419 ; AVX-SLOW-NEXT:    vzeroupper
1420 ; AVX-SLOW-NEXT:    retq
1421 ;
1422 ; AVX-FAST-LABEL: extract_extract01_v8f64_fadd_f64_commute:
1423 ; AVX-FAST:       # %bb.0:
1424 ; AVX-FAST-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
1425 ; AVX-FAST-NEXT:    vzeroupper
1426 ; AVX-FAST-NEXT:    retq
1427   %x0 = extractelement <8 x double> %x, i32 0
1428   %x1 = extractelement <8 x double> %x, i32 1
1429   %x01 = fadd double %x1, %x0 ; swapped fadd operands; fast-hops runs still expect one horizontal add on xmm0
1430   ret double %x01
1431 }
1432
1433 define float @extract_extract01_v16f32_fsub_f32(<16 x float> %x) { ; Scalar x[0] - x[1] taken from a <16 x float>; expected code only touches xmm0/xmm1 scratch.
1434 ; SSE3-SLOW-LABEL: extract_extract01_v16f32_fsub_f32:
1435 ; SSE3-SLOW:       # %bb.0:
1436 ; SSE3-SLOW-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
1437 ; SSE3-SLOW-NEXT:    subss %xmm1, %xmm0
1438 ; SSE3-SLOW-NEXT:    retq
1439 ;
1440 ; SSE3-FAST-LABEL: extract_extract01_v16f32_fsub_f32:
1441 ; SSE3-FAST:       # %bb.0:
1442 ; SSE3-FAST-NEXT:    hsubps %xmm0, %xmm0
1443 ; SSE3-FAST-NEXT:    retq
1444 ;
1445 ; AVX-SLOW-LABEL: extract_extract01_v16f32_fsub_f32:
1446 ; AVX-SLOW:       # %bb.0:
1447 ; AVX-SLOW-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
1448 ; AVX-SLOW-NEXT:    vsubss %xmm1, %xmm0, %xmm0
1449 ; AVX-SLOW-NEXT:    vzeroupper
1450 ; AVX-SLOW-NEXT:    retq
1451 ;
1452 ; AVX-FAST-LABEL: extract_extract01_v16f32_fsub_f32:
1453 ; AVX-FAST:       # %bb.0:
1454 ; AVX-FAST-NEXT:    vhsubps %xmm0, %xmm0, %xmm0
1455 ; AVX-FAST-NEXT:    vzeroupper
1456 ; AVX-FAST-NEXT:    retq
1457   %x0 = extractelement <16 x float> %x, i32 0
1458   %x1 = extractelement <16 x float> %x, i32 1
1459   %x01 = fsub float %x0, %x1 ; fast-hops runs expect one horizontal sub on xmm0; the other runs expect a shuffle plus scalar sub
1460   ret float %x01
1461 }
1462
1463 define float @extract_extract01_v16f32_fsub_f32_commute(<16 x float> %x) { ; Scalar x[1] - x[0] (reversed operand order) taken from a <16 x float>.
1464 ; SSE3-LABEL: extract_extract01_v16f32_fsub_f32_commute:
1465 ; SSE3:       # %bb.0:
1466 ; SSE3-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
1467 ; SSE3-NEXT:    subss %xmm0, %xmm1
1468 ; SSE3-NEXT:    movaps %xmm1, %xmm0
1469 ; SSE3-NEXT:    retq
1470 ;
1471 ; AVX-LABEL: extract_extract01_v16f32_fsub_f32_commute:
1472 ; AVX:       # %bb.0:
1473 ; AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
1474 ; AVX-NEXT:    vsubss %xmm0, %xmm1, %xmm0
1475 ; AVX-NEXT:    vzeroupper
1476 ; AVX-NEXT:    retq
1477   %x0 = extractelement <16 x float> %x, i32 0
1478   %x1 = extractelement <16 x float> %x, i32 1
1479   %x01 = fsub float %x1, %x0 ; one shared check block for fast-hops and slow-hops runs: shuffle plus scalar sub, no horizontal sub expected
1480   ret float %x01
1481 }
1482
1483 define double @extract_extract01_v8f64_fsub_f64(<8 x double> %x) { ; Scalar x[0] - x[1] taken from an <8 x double>; expected code only touches xmm0/xmm1 scratch.
1484 ; SSE3-SLOW-LABEL: extract_extract01_v8f64_fsub_f64:
1485 ; SSE3-SLOW:       # %bb.0:
1486 ; SSE3-SLOW-NEXT:    movapd %xmm0, %xmm1
1487 ; SSE3-SLOW-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
1488 ; SSE3-SLOW-NEXT:    subsd %xmm1, %xmm0
1489 ; SSE3-SLOW-NEXT:    retq
1490 ;
1491 ; SSE3-FAST-LABEL: extract_extract01_v8f64_fsub_f64:
1492 ; SSE3-FAST:       # %bb.0:
1493 ; SSE3-FAST-NEXT:    hsubpd %xmm0, %xmm0
1494 ; SSE3-FAST-NEXT:    retq
1495 ;
1496 ; AVX-SLOW-LABEL: extract_extract01_v8f64_fsub_f64:
1497 ; AVX-SLOW:       # %bb.0:
1498 ; AVX-SLOW-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
1499 ; AVX-SLOW-NEXT:    vsubsd %xmm1, %xmm0, %xmm0
1500 ; AVX-SLOW-NEXT:    vzeroupper
1501 ; AVX-SLOW-NEXT:    retq
1502 ;
1503 ; AVX-FAST-LABEL: extract_extract01_v8f64_fsub_f64:
1504 ; AVX-FAST:       # %bb.0:
1505 ; AVX-FAST-NEXT:    vhsubpd %xmm0, %xmm0, %xmm0
1506 ; AVX-FAST-NEXT:    vzeroupper
1507 ; AVX-FAST-NEXT:    retq
1508   %x0 = extractelement <8 x double> %x, i32 0
1509   %x1 = extractelement <8 x double> %x, i32 1
1510   %x01 = fsub double %x0, %x1 ; fast-hops runs expect one horizontal sub on xmm0; the other runs expect a shuffle plus scalar sub
1511   ret double %x01
1512 }
1513
1514 define double @extract_extract01_v8f64_fsub_f64_commute(<8 x double> %x) { ; Scalar x[1] - x[0] (reversed operand order) taken from an <8 x double>.
1515 ; SSE3-LABEL: extract_extract01_v8f64_fsub_f64_commute:
1516 ; SSE3:       # %bb.0:
1517 ; SSE3-NEXT:    movapd %xmm0, %xmm1
1518 ; SSE3-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
1519 ; SSE3-NEXT:    subsd %xmm0, %xmm1
1520 ; SSE3-NEXT:    movapd %xmm1, %xmm0
1521 ; SSE3-NEXT:    retq
1522 ;
1523 ; AVX-LABEL: extract_extract01_v8f64_fsub_f64_commute:
1524 ; AVX:       # %bb.0:
1525 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
1526 ; AVX-NEXT:    vsubsd %xmm0, %xmm1, %xmm0
1527 ; AVX-NEXT:    vzeroupper
1528 ; AVX-NEXT:    retq
1529   %x0 = extractelement <8 x double> %x, i32 0
1530   %x1 = extractelement <8 x double> %x, i32 1
1531   %x01 = fsub double %x1, %x0 ; one shared check block for fast-hops and slow-hops runs: shuffle plus scalar sub, no horizontal sub expected
1532   ret double %x01
1533 }
1534
1535 ; Check output when 1 or both extracts have extra uses.
1536
1537 define float @extract_extract01_v4f32_fadd_f32_uses1(<4 x float> %x, float* %p) { ; Scalar x[0] + x[1] where x[0] is additionally stored through %p.
1538 ; SSE3-SLOW-LABEL: extract_extract01_v4f32_fadd_f32_uses1:
1539 ; SSE3-SLOW:       # %bb.0:
1540 ; SSE3-SLOW-NEXT:    movss %xmm0, (%rdi)
1541 ; SSE3-SLOW-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
1542 ; SSE3-SLOW-NEXT:    addss %xmm1, %xmm0
1543 ; SSE3-SLOW-NEXT:    retq
1544 ;
1545 ; SSE3-FAST-LABEL: extract_extract01_v4f32_fadd_f32_uses1:
1546 ; SSE3-FAST:       # %bb.0:
1547 ; SSE3-FAST-NEXT:    movss %xmm0, (%rdi)
1548 ; SSE3-FAST-NEXT:    haddps %xmm0, %xmm0
1549 ; SSE3-FAST-NEXT:    retq
1550 ;
1551 ; AVX-SLOW-LABEL: extract_extract01_v4f32_fadd_f32_uses1:
1552 ; AVX-SLOW:       # %bb.0:
1553 ; AVX-SLOW-NEXT:    vmovss %xmm0, (%rdi)
1554 ; AVX-SLOW-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
1555 ; AVX-SLOW-NEXT:    vaddss %xmm1, %xmm0, %xmm0
1556 ; AVX-SLOW-NEXT:    retq
1557 ;
1558 ; AVX-FAST-LABEL: extract_extract01_v4f32_fadd_f32_uses1:
1559 ; AVX-FAST:       # %bb.0:
1560 ; AVX-FAST-NEXT:    vmovss %xmm0, (%rdi)
1561 ; AVX-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
1562 ; AVX-FAST-NEXT:    retq
1563   %x0 = extractelement <4 x float> %x, i32 0
1564   store float %x0, float* %p ; extra use of the lane-0 extract
1565   %x1 = extractelement <4 x float> %x, i32 1
1566   %x01 = fadd float %x0, %x1 ; fast-hops runs still expect a horizontal add despite the extra use of %x0
1567   ret float %x01
1568 }
1569
1570 define float @extract_extract01_v4f32_fadd_f32_uses2(<4 x float> %x, float* %p) { ; Scalar x[0] + x[1] where x[1] is additionally stored through %p.
1571 ; SSE3-SLOW-LABEL: extract_extract01_v4f32_fadd_f32_uses2:
1572 ; SSE3-SLOW:       # %bb.0:
1573 ; SSE3-SLOW-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
1574 ; SSE3-SLOW-NEXT:    movss %xmm1, (%rdi)
1575 ; SSE3-SLOW-NEXT:    addss %xmm1, %xmm0
1576 ; SSE3-SLOW-NEXT:    retq
1577 ;
1578 ; SSE3-FAST-LABEL: extract_extract01_v4f32_fadd_f32_uses2:
1579 ; SSE3-FAST:       # %bb.0:
1580 ; SSE3-FAST-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
1581 ; SSE3-FAST-NEXT:    movss %xmm1, (%rdi)
1582 ; SSE3-FAST-NEXT:    haddps %xmm0, %xmm0
1583 ; SSE3-FAST-NEXT:    retq
1584 ;
1585 ; AVX-SLOW-LABEL: extract_extract01_v4f32_fadd_f32_uses2:
1586 ; AVX-SLOW:       # %bb.0:
1587 ; AVX-SLOW-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
1588 ; AVX-SLOW-NEXT:    vmovss %xmm1, (%rdi)
1589 ; AVX-SLOW-NEXT:    vaddss %xmm1, %xmm0, %xmm0
1590 ; AVX-SLOW-NEXT:    retq
1591 ;
1592 ; AVX-FAST-LABEL: extract_extract01_v4f32_fadd_f32_uses2:
1593 ; AVX-FAST:       # %bb.0:
1594 ; AVX-FAST-NEXT:    vextractps $1, %xmm0, (%rdi)
1595 ; AVX-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
1596 ; AVX-FAST-NEXT:    retq
1597   %x0 = extractelement <4 x float> %x, i32 0
1598   %x1 = extractelement <4 x float> %x, i32 1
1599   store float %x1, float* %p ; extra use of the lane-1 extract
1600   %x01 = fadd float %x0, %x1 ; fast-hops runs still expect a horizontal add despite the extra use of %x1
1601   ret float %x01
1602 }
1603
1604 define float @extract_extract01_v4f32_fadd_f32_uses3(<4 x float> %x, float* %p1, float* %p2) { ; Scalar x[0] + x[1] where both extracts are additionally stored (x[0] via %p1, x[1] via %p2).
1605 ; SSE3-LABEL: extract_extract01_v4f32_fadd_f32_uses3:
1606 ; SSE3:       # %bb.0:
1607 ; SSE3-NEXT:    movss %xmm0, (%rdi)
1608 ; SSE3-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
1609 ; SSE3-NEXT:    movss %xmm1, (%rsi)
1610 ; SSE3-NEXT:    addss %xmm1, %xmm0
1611 ; SSE3-NEXT:    retq
1612 ;
1613 ; AVX-LABEL: extract_extract01_v4f32_fadd_f32_uses3:
1614 ; AVX:       # %bb.0:
1615 ; AVX-NEXT:    vmovss %xmm0, (%rdi)
1616 ; AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
1617 ; AVX-NEXT:    vmovss %xmm1, (%rsi)
1618 ; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0
1619 ; AVX-NEXT:    retq
1620   %x0 = extractelement <4 x float> %x, i32 0
1621   store float %x0, float* %p1 ; extra use of the lane-0 extract
1622   %x1 = extractelement <4 x float> %x, i32 1
1623   store float %x1, float* %p2 ; extra use of the lane-1 extract
1624   %x01 = fadd float %x0, %x1 ; one shared check block for fast-hops and slow-hops runs: with both extracts reused, a scalar add is expected
1625   ret float %x01
1626 }
1627
1628 ; Repeat tests from general reductions to verify output for targets with fast horizontal ops (fast-hops):
1629 ; PR38971: https://bugs.llvm.org/show_bug.cgi?id=38971
1630
; Experimental fadd reduction intrinsics called by the two fadd_reduce_* tests
; below. Each takes a scalar operand followed by the vector operand to reduce.
1631 declare float @llvm.experimental.vector.reduce.v2.fadd.f32.v8f32(float, <8 x float>)
1632 declare double @llvm.experimental.vector.reduce.v2.fadd.f64.v4f64(double, <4 x double>)
1633
; Fast-math fadd reduction of an <8 x float> together with the scalar %a0.
; Expected output: the vector is narrowed with packed adds, then the last pair
; of lanes is summed with a horizontal add on the fast-hops runs and with a
; shuffle plus scalar add on the other runs; %a0 is added as the final step.
1634 define float @fadd_reduce_v8f32(float %a0, <8 x float> %a1) {
1635 ; SSE3-SLOW-LABEL: fadd_reduce_v8f32:
1636 ; SSE3-SLOW:       # %bb.0:
1637 ; SSE3-SLOW-NEXT:    addps %xmm2, %xmm1
1638 ; SSE3-SLOW-NEXT:    movaps %xmm1, %xmm2
1639 ; SSE3-SLOW-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
1640 ; SSE3-SLOW-NEXT:    addps %xmm1, %xmm2
1641 ; SSE3-SLOW-NEXT:    movshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
1642 ; SSE3-SLOW-NEXT:    addss %xmm2, %xmm1
1643 ; SSE3-SLOW-NEXT:    addss %xmm1, %xmm0
1644 ; SSE3-SLOW-NEXT:    retq
1645 ;
1646 ; SSE3-FAST-LABEL: fadd_reduce_v8f32:
1647 ; SSE3-FAST:       # %bb.0:
1648 ; SSE3-FAST-NEXT:    addps %xmm2, %xmm1
1649 ; SSE3-FAST-NEXT:    movaps %xmm1, %xmm2
1650 ; SSE3-FAST-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
1651 ; SSE3-FAST-NEXT:    addps %xmm1, %xmm2
1652 ; SSE3-FAST-NEXT:    haddps %xmm2, %xmm2
1653 ; SSE3-FAST-NEXT:    addss %xmm2, %xmm0
1654 ; SSE3-FAST-NEXT:    retq
1655 ;
1656 ; AVX-SLOW-LABEL: fadd_reduce_v8f32:
1657 ; AVX-SLOW:       # %bb.0:
1658 ; AVX-SLOW-NEXT:    vextractf128 $1, %ymm1, %xmm2
1659 ; AVX-SLOW-NEXT:    vaddps %xmm2, %xmm1, %xmm1
1660 ; AVX-SLOW-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
1661 ; AVX-SLOW-NEXT:    vaddps %xmm2, %xmm1, %xmm1
1662 ; AVX-SLOW-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
1663 ; AVX-SLOW-NEXT:    vaddss %xmm2, %xmm1, %xmm1
1664 ; AVX-SLOW-NEXT:    vaddss %xmm1, %xmm0, %xmm0
1665 ; AVX-SLOW-NEXT:    vzeroupper
1666 ; AVX-SLOW-NEXT:    retq
1667 ;
1668 ; AVX-FAST-LABEL: fadd_reduce_v8f32:
1669 ; AVX-FAST:       # %bb.0:
1670 ; AVX-FAST-NEXT:    vextractf128 $1, %ymm1, %xmm2
1671 ; AVX-FAST-NEXT:    vaddps %xmm2, %xmm1, %xmm1
1672 ; AVX-FAST-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
1673 ; AVX-FAST-NEXT:    vaddps %xmm2, %xmm1, %xmm1
1674 ; AVX-FAST-NEXT:    vhaddps %xmm1, %xmm1, %xmm1
1675 ; AVX-FAST-NEXT:    vaddss %xmm1, %xmm0, %xmm0
1676 ; AVX-FAST-NEXT:    vzeroupper
1677 ; AVX-FAST-NEXT:    retq
1678   %r = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v8f32(float %a0, <8 x float> %a1)
1679   ret float %r
1680 }
1681
; Fast-math fadd reduction of a <4 x double> together with the scalar %a0.
; Expected output: the two 128-bit halves are added with a packed add, then the
; remaining pair is summed with a horizontal add on the fast-hops runs and with
; a shuffle plus scalar add on the other runs; %a0 is added as the final step.
1682 define double @fadd_reduce_v4f64(double %a0, <4 x double> %a1) {
1683 ; SSE3-SLOW-LABEL: fadd_reduce_v4f64:
1684 ; SSE3-SLOW:       # %bb.0:
1685 ; SSE3-SLOW-NEXT:    addpd %xmm2, %xmm1
1686 ; SSE3-SLOW-NEXT:    movapd %xmm1, %xmm2
1687 ; SSE3-SLOW-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
1688 ; SSE3-SLOW-NEXT:    addsd %xmm1, %xmm2
1689 ; SSE3-SLOW-NEXT:    addsd %xmm2, %xmm0
1690 ; SSE3-SLOW-NEXT:    retq
1691 ;
1692 ; SSE3-FAST-LABEL: fadd_reduce_v4f64:
1693 ; SSE3-FAST:       # %bb.0:
1694 ; SSE3-FAST-NEXT:    addpd %xmm2, %xmm1
1695 ; SSE3-FAST-NEXT:    haddpd %xmm1, %xmm1
1696 ; SSE3-FAST-NEXT:    addsd %xmm1, %xmm0
1697 ; SSE3-FAST-NEXT:    retq
1698 ;
1699 ; AVX-SLOW-LABEL: fadd_reduce_v4f64:
1700 ; AVX-SLOW:       # %bb.0:
1701 ; AVX-SLOW-NEXT:    vextractf128 $1, %ymm1, %xmm2
1702 ; AVX-SLOW-NEXT:    vaddpd %xmm2, %xmm1, %xmm1
1703 ; AVX-SLOW-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
1704 ; AVX-SLOW-NEXT:    vaddsd %xmm2, %xmm1, %xmm1
1705 ; AVX-SLOW-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
1706 ; AVX-SLOW-NEXT:    vzeroupper
1707 ; AVX-SLOW-NEXT:    retq
1708 ;
1709 ; AVX-FAST-LABEL: fadd_reduce_v4f64:
1710 ; AVX-FAST:       # %bb.0:
1711 ; AVX-FAST-NEXT:    vextractf128 $1, %ymm1, %xmm2
1712 ; AVX-FAST-NEXT:    vaddpd %xmm2, %xmm1, %xmm1
1713 ; AVX-FAST-NEXT:    vhaddpd %xmm1, %xmm1, %xmm1
1714 ; AVX-FAST-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
1715 ; AVX-FAST-NEXT:    vzeroupper
1716 ; AVX-FAST-NEXT:    retq
1717   %r = call fast double @llvm.experimental.vector.reduce.v2.fadd.f64.v4f64(double %a0, <4 x double> %a1)
1718   ret double %r
1719 }
1720
; Regression test named for PR39936: a pairwise fadd reduction of an
; <8 x float> written out by hand as even/odd shufflevector pairs followed by
; fadd, ending with an extract of lane 0. The expected output uses horizontal
; adds for the first two steps on every run; the last step is a third
; horizontal add on the fast-hops runs and a shuffle plus scalar add otherwise.
1721 define float @PR39936_v8f32(<8 x float>) {
1740 ; SSE3-SLOW-LABEL: PR39936_v8f32:
1741 ; SSE3-SLOW:       # %bb.0:
1742 ; SSE3-SLOW-NEXT:    haddps %xmm1, %xmm0
1743 ; SSE3-SLOW-NEXT:    haddps %xmm0, %xmm0
1744 ; SSE3-SLOW-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
1745 ; SSE3-SLOW-NEXT:    addss %xmm1, %xmm0
1746 ; SSE3-SLOW-NEXT:    retq
1747 ;
1748 ; SSE3-FAST-LABEL: PR39936_v8f32:
1749 ; SSE3-FAST:       # %bb.0:
1750 ; SSE3-FAST-NEXT:    haddps %xmm1, %xmm0
1751 ; SSE3-FAST-NEXT:    haddps %xmm0, %xmm0
1752 ; SSE3-FAST-NEXT:    haddps %xmm0, %xmm0
1753 ; SSE3-FAST-NEXT:    retq
1754 ;
1755 ; AVX-SLOW-LABEL: PR39936_v8f32:
1756 ; AVX-SLOW:       # %bb.0:
1757 ; AVX-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm1
1758 ; AVX-SLOW-NEXT:    vhaddps %xmm1, %xmm0, %xmm0
1759 ; AVX-SLOW-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
1760 ; AVX-SLOW-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
1761 ; AVX-SLOW-NEXT:    vaddss %xmm1, %xmm0, %xmm0
1762 ; AVX-SLOW-NEXT:    vzeroupper
1763 ; AVX-SLOW-NEXT:    retq
1764 ;
1765 ; AVX-FAST-LABEL: PR39936_v8f32:
1766 ; AVX-FAST:       # %bb.0:
1767 ; AVX-FAST-NEXT:    vextractf128 $1, %ymm0, %xmm1
1768 ; AVX-FAST-NEXT:    vhaddps %xmm1, %xmm0, %xmm0
1769 ; AVX-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
1770 ; AVX-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
1771 ; AVX-FAST-NEXT:    vzeroupper
1772 ; AVX-FAST-NEXT:    retq
1773   %2 = shufflevector <8 x float> %0, <8 x float> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 undef, i32 undef, i32 undef, i32 undef>
1774   %3 = shufflevector <8 x float> %0, <8 x float> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
1775   %4 = fadd <8 x float> %2, %3
1776   %5 = shufflevector <8 x float> %4, <8 x float> undef, <8 x i32> <i32 0, i32 2, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
1777   %6 = shufflevector <8 x float> %4, <8 x float> undef, <8 x i32> <i32 1, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
1778   %7 = fadd <8 x float> %5, %6
1779   %8 = shufflevector <8 x float> %7, <8 x float> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
1780   %9 = fadd <8 x float> %7, %8
1781   %10 = extractelement <8 x float> %9, i32 0
1782   ret float %10
1783 }