test/CodeGen/X86/haddsub-shuf.ll

   1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
   2 ; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3           | FileCheck %s --check-prefixes=SSSE3,SSSE3_SLOW
   3 ; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3,fast-hops | FileCheck %s --check-prefixes=SSSE3,SSSE3_FAST
   4 ; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx             | FileCheck %s --check-prefixes=AVX,AVX1,AVX1_SLOW
   5 ; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx,fast-hops   | FileCheck %s --check-prefixes=AVX,AVX1,AVX1_FAST
   6 ; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2            | FileCheck %s --check-prefixes=AVX,AVX2,AVX2_SLOW
   7 ; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2,fast-hops  | FileCheck %s --check-prefixes=AVX,AVX2,AVX2_FAST
   8
   9 ; The tests below check for matching the horizontal op and, where possible, eliminating the shuffle.
  10 ; PR34111 - https://bugs.llvm.org/show_bug.cgi?id=34111
  11
  ; Adds the even elements (0,2) of %a to the odd elements (1,3) and returns
  ; the two sums in the upper half of a <4 x float> (lower half undef).
  ; Every run line expects a single (v)haddps with no separate shuffle.
  12 define <4 x float> @hadd_v4f32(<4 x float> %a) {
  13 ; SSSE3-LABEL: hadd_v4f32:
  14 ; SSSE3:       # %bb.0:
  15 ; SSSE3-NEXT:    haddps %xmm0, %xmm0
  16 ; SSSE3-NEXT:    retq
  17 ;
  18 ; AVX-LABEL: hadd_v4f32:
  19 ; AVX:       # %bb.0:
  20 ; AVX-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
  21 ; AVX-NEXT:    retq
  22   %a02 = shufflevector <4 x float> %a, <4 x float> undef, <2 x i32> <i32 0, i32 2>
  23   %a13 = shufflevector <4 x float> %a, <4 x float> undef, <2 x i32> <i32 1, i32 3>
  24   %hop = fadd <2 x float> %a02, %a13
  25   %shuf = shufflevector <2 x float> %hop, <2 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 0, i32 1>
  26   ret <4 x float> %shuf
  27 }
  28
  ; 256-bit input, 128-bit add: the even elements of %a are added to the odd
  ; elements as <4 x float>, then a widening shuffle places sums 0,1 in result
  ; lanes 2,3 and sums 2,3 in result lanes 6,7 (other lanes undef).
  ; The checks expect one 128-bit (v)haddps plus shuffle/move instructions
  ; that build the wide result.
  29 define <8 x float> @hadd_v8f32a(<8 x float> %a) {
  30 ; SSSE3-LABEL: hadd_v8f32a:
  31 ; SSSE3:       # %bb.0:
  32 ; SSSE3-NEXT:    movaps %xmm0, %xmm2
  33 ; SSSE3-NEXT:    haddps %xmm1, %xmm2
  34 ; SSSE3-NEXT:    movddup {{.*#+}} xmm0 = xmm2[0,0]
  35 ; SSSE3-NEXT:    movaps %xmm2, %xmm1
  36 ; SSSE3-NEXT:    retq
  37 ;
  38 ; AVX1-LABEL: hadd_v8f32a:
  39 ; AVX1:       # %bb.0:
  40 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
  41 ; AVX1-NEXT:    vhaddps %xmm1, %xmm0, %xmm0
  42 ; AVX1-NEXT:    vmovddup {{.*#+}} xmm1 = xmm0[0,0]
  43 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
  44 ; AVX1-NEXT:    retq
  45 ;
  46 ; AVX2-LABEL: hadd_v8f32a:
  47 ; AVX2:       # %bb.0:
  48 ; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm1
  49 ; AVX2-NEXT:    vhaddps %xmm1, %xmm0, %xmm0
  50 ; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,0,2,1]
  51 ; AVX2-NEXT:    retq
  52   %a0 = shufflevector <8 x float> %a, <8 x float> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  53   %a1 = shufflevector <8 x float> %a, <8 x float> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  54   %hop = fadd <4 x float> %a0, %a1
  55   %shuf = shufflevector <4 x float> %hop, <4 x float> undef, <8 x i32> <i32 undef, i32 undef, i32 0, i32 1, i32 undef, i32 undef, i32 2, i32 3>
  56   ret <8 x float> %shuf
  57 }
  58
  ; Even/odd add performed on full <8 x float> vectors (two sums per 128-bit
  ; half, other lanes undef); the final shuffle repeats sums 0,1 and 4,5
  ; within their own half.
  ; Expected: one haddps per xmm register on the SSSE3 runs and a single
  ; 256-bit vhaddps on the AVX runs.
  59 define <8 x float> @hadd_v8f32b(<8 x float> %a) {
  60 ; SSSE3-LABEL: hadd_v8f32b:
  61 ; SSSE3:       # %bb.0:
  62 ; SSSE3-NEXT:    haddps %xmm0, %xmm0
  63 ; SSSE3-NEXT:    haddps %xmm1, %xmm1
  64 ; SSSE3-NEXT:    retq
  65 ;
  66 ; AVX-LABEL: hadd_v8f32b:
  67 ; AVX:       # %bb.0:
  68 ; AVX-NEXT:    vhaddps %ymm0, %ymm0, %ymm0
  69 ; AVX-NEXT:    retq
  70   %a0 = shufflevector <8 x float> %a, <8 x float> undef, <8 x i32> <i32 0, i32 2, i32 undef, i32 undef, i32 4, i32 6, i32 undef, i32 undef>
  71   %a1 = shufflevector <8 x float> %a, <8 x float> undef, <8 x i32> <i32 1, i32 3, i32 undef, i32 undef, i32 5, i32 7, i32 undef, i32 undef>
  72   %hop = fadd <8 x float> %a0, %a1
  73   %shuf = shufflevector <8 x float> %hop, <8 x float> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 4, i32 5, i32 4, i32 5>
  74   ret <8 x float> %shuf
  75 }
  76
  ; Subtracts the odd elements (1,3) of %a from the even elements (0,2) and
  ; repeats the two differences across the result (mask 0,1,0,1).
  ; Every run line expects a single (v)hsubps with no separate shuffle.
  77 define <4 x float> @hsub_v4f32(<4 x float> %a) {
  78 ; SSSE3-LABEL: hsub_v4f32:
  79 ; SSSE3:       # %bb.0:
  80 ; SSSE3-NEXT:    hsubps %xmm0, %xmm0
  81 ; SSSE3-NEXT:    retq
  82 ;
  83 ; AVX-LABEL: hsub_v4f32:
  84 ; AVX:       # %bb.0:
  85 ; AVX-NEXT:    vhsubps %xmm0, %xmm0, %xmm0
  86 ; AVX-NEXT:    retq
  87   %a02 = shufflevector <4 x float> %a, <4 x float> undef, <2 x i32> <i32 0, i32 2>
  88   %a13 = shufflevector <4 x float> %a, <4 x float> undef, <2 x i32> <i32 1, i32 3>
  89   %hop = fsub <2 x float> %a02, %a13
  90   %shuf = shufflevector <2 x float> %hop, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
  91   ret <4 x float> %shuf
  92 }
  93
  ; 256-bit input, 128-bit subtract: even elements of %a minus odd elements as
  ; <4 x float>, then a widening shuffle places differences 0,1 in result
  ; lanes 2,3 and differences 2,3 in result lanes 6,7 (other lanes undef).
  ; The checks expect one 128-bit (v)hsubps plus shuffle/move instructions
  ; that build the wide result.
  94 define <8 x float> @hsub_v8f32a(<8 x float> %a) {
  95 ; SSSE3-LABEL: hsub_v8f32a:
  96 ; SSSE3:       # %bb.0:
  97 ; SSSE3-NEXT:    movaps %xmm0, %xmm2
  98 ; SSSE3-NEXT:    hsubps %xmm1, %xmm2
  99 ; SSSE3-NEXT:    movddup {{.*#+}} xmm0 = xmm2[0,0]
 100 ; SSSE3-NEXT:    movaps %xmm2, %xmm1
 101 ; SSSE3-NEXT:    retq
 102 ;
 103 ; AVX1-LABEL: hsub_v8f32a:
 104 ; AVX1:       # %bb.0:
 105 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
 106 ; AVX1-NEXT:    vhsubps %xmm1, %xmm0, %xmm0
 107 ; AVX1-NEXT:    vmovddup {{.*#+}} xmm1 = xmm0[0,0]
 108 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
 109 ; AVX1-NEXT:    retq
 110 ;
 111 ; AVX2-LABEL: hsub_v8f32a:
 112 ; AVX2:       # %bb.0:
 113 ; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm1
 114 ; AVX2-NEXT:    vhsubps %xmm1, %xmm0, %xmm0
 115 ; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,0,2,1]
 116 ; AVX2-NEXT:    retq
 117   %a0 = shufflevector <8 x float> %a, <8 x float> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
 118   %a1 = shufflevector <8 x float> %a, <8 x float> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
 119   %hop = fsub <4 x float> %a0, %a1
 120   %shuf = shufflevector <4 x float> %hop, <4 x float> undef, <8 x i32> <i32 undef, i32 undef, i32 0, i32 1, i32 undef, i32 undef, i32 2, i32 3>
 121   ret <8 x float> %shuf
 122 }
 123
 ; Even-minus-odd subtract performed on full <8 x float> vectors (two
 ; differences per 128-bit half, other lanes undef); the final shuffle repeats
 ; differences 0,1 and 4,5 within their own half.
 ; Expected: one hsubps per xmm register on the SSSE3 runs and a single
 ; 256-bit vhsubps on the AVX runs.
 124 define <8 x float> @hsub_v8f32b(<8 x float> %a) {
 125 ; SSSE3-LABEL: hsub_v8f32b:
 126 ; SSSE3:       # %bb.0:
 127 ; SSSE3-NEXT:    hsubps %xmm0, %xmm0
 128 ; SSSE3-NEXT:    hsubps %xmm1, %xmm1
 129 ; SSSE3-NEXT:    retq
 130 ;
 131 ; AVX-LABEL: hsub_v8f32b:
 132 ; AVX:       # %bb.0:
 133 ; AVX-NEXT:    vhsubps %ymm0, %ymm0, %ymm0
 134 ; AVX-NEXT:    retq
 135   %a0 = shufflevector <8 x float> %a, <8 x float> undef, <8 x i32> <i32 0, i32 2, i32 undef, i32 undef, i32 4, i32 6, i32 undef, i32 undef>
 136   %a1 = shufflevector <8 x float> %a, <8 x float> undef, <8 x i32> <i32 1, i32 3, i32 undef, i32 undef, i32 5, i32 7, i32 undef, i32 undef>
 137   %hop = fsub <8 x float> %a0, %a1
 138   %shuf = shufflevector <8 x float> %hop, <8 x float> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 4, i32 5, i32 4, i32 5>
 139   ret <8 x float> %shuf
 140 }
 141
 ; Adds element 0 and element 1 of %a (second lane of each operand is undef)
 ; and splats the sum to both result lanes.
 ; Runs without fast-hops expect a scalar add followed by a movddup splat;
 ; runs with fast-hops expect a single (v)haddpd.
 142 define <2 x double> @hadd_v2f64(<2 x double> %a) {
 143 ; SSSE3_SLOW-LABEL: hadd_v2f64:
 144 ; SSSE3_SLOW:       # %bb.0:
 145 ; SSSE3_SLOW-NEXT:    movapd %xmm0, %xmm1
 146 ; SSSE3_SLOW-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
 147 ; SSSE3_SLOW-NEXT:    addsd %xmm0, %xmm1
 148 ; SSSE3_SLOW-NEXT:    movddup {{.*#+}} xmm0 = xmm1[0,0]
 149 ; SSSE3_SLOW-NEXT:    retq
 150 ;
 151 ; SSSE3_FAST-LABEL: hadd_v2f64:
 152 ; SSSE3_FAST:       # %bb.0:
 153 ; SSSE3_FAST-NEXT:    haddpd %xmm0, %xmm0
 154 ; SSSE3_FAST-NEXT:    retq
 155 ;
 156 ; AVX1_SLOW-LABEL: hadd_v2f64:
 157 ; AVX1_SLOW:       # %bb.0:
 158 ; AVX1_SLOW-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
 159 ; AVX1_SLOW-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
 160 ; AVX1_SLOW-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
 161 ; AVX1_SLOW-NEXT:    retq
 162 ;
 163 ; AVX1_FAST-LABEL: hadd_v2f64:
 164 ; AVX1_FAST:       # %bb.0:
 165 ; AVX1_FAST-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
 166 ; AVX1_FAST-NEXT:    retq
 167 ;
 168 ; AVX2_SLOW-LABEL: hadd_v2f64:
 169 ; AVX2_SLOW:       # %bb.0:
 170 ; AVX2_SLOW-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
 171 ; AVX2_SLOW-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
 172 ; AVX2_SLOW-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
 173 ; AVX2_SLOW-NEXT:    retq
 174 ;
 175 ; AVX2_FAST-LABEL: hadd_v2f64:
 176 ; AVX2_FAST:       # %bb.0:
 177 ; AVX2_FAST-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
 178 ; AVX2_FAST-NEXT:    retq
 179   %a0 = shufflevector <2 x double> %a, <2 x double> undef, <2 x i32> <i32 0, i32 undef>
 180   %a1 = shufflevector <2 x double> %a, <2 x double> undef, <2 x i32> <i32 1, i32 undef>
 181   %hop = fadd <2 x double> %a0, %a1
 182   %shuf = shufflevector <2 x double> %hop, <2 x double> undef, <2 x i32> <i32 0, i32 0>
 183   ret <2 x double> %shuf
 184 }
 185
 ; Scalar form of the v2f64 horizontal add: extracts elements 0 and 1 of %a,
 ; adds them as doubles, inserts the sum at lane 0 and splats it.
 ; Runs without fast-hops expect a scalar add followed by a movddup splat;
 ; runs with fast-hops expect a single (v)haddpd.
 186 define <2 x double> @hadd_v2f64_scalar_splat(<2 x double> %a) {
 187 ; SSSE3_SLOW-LABEL: hadd_v2f64_scalar_splat:
 188 ; SSSE3_SLOW:       # %bb.0:
 189 ; SSSE3_SLOW-NEXT:    movapd %xmm0, %xmm1
 190 ; SSSE3_SLOW-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
 191 ; SSSE3_SLOW-NEXT:    addsd %xmm0, %xmm1
 192 ; SSSE3_SLOW-NEXT:    movddup {{.*#+}} xmm0 = xmm1[0,0]
 193 ; SSSE3_SLOW-NEXT:    retq
 194 ;
 195 ; SSSE3_FAST-LABEL: hadd_v2f64_scalar_splat:
 196 ; SSSE3_FAST:       # %bb.0:
 197 ; SSSE3_FAST-NEXT:    haddpd %xmm0, %xmm0
 198 ; SSSE3_FAST-NEXT:    retq
 199 ;
 200 ; AVX1_SLOW-LABEL: hadd_v2f64_scalar_splat:
 201 ; AVX1_SLOW:       # %bb.0:
 202 ; AVX1_SLOW-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
 203 ; AVX1_SLOW-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
 204 ; AVX1_SLOW-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
 205 ; AVX1_SLOW-NEXT:    retq
 206 ;
 207 ; AVX1_FAST-LABEL: hadd_v2f64_scalar_splat:
 208 ; AVX1_FAST:       # %bb.0:
 209 ; AVX1_FAST-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
 210 ; AVX1_FAST-NEXT:    retq
 211 ;
 212 ; AVX2_SLOW-LABEL: hadd_v2f64_scalar_splat:
 213 ; AVX2_SLOW:       # %bb.0:
 214 ; AVX2_SLOW-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
 215 ; AVX2_SLOW-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
 216 ; AVX2_SLOW-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
 217 ; AVX2_SLOW-NEXT:    retq
 218 ;
 219 ; AVX2_FAST-LABEL: hadd_v2f64_scalar_splat:
 220 ; AVX2_FAST:       # %bb.0:
 221 ; AVX2_FAST-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
 222 ; AVX2_FAST-NEXT:    retq
 223   %a0 = extractelement <2 x double> %a, i32 0
 224   %a1 = extractelement <2 x double> %a, i32 1
 225   %hop = fadd double %a0, %a1
 226   %ins = insertelement <2 x double> undef, double %hop, i32 0
 227   %shuf = shufflevector <2 x double> %ins, <2 x double> undef, <2 x i32> <i32 0, i32 0>
 228   ret <2 x double> %shuf
 229 }
 230
 ; Two scalar pair sums, (%a[0] + %a[1]) and (%a[2] + %a[3]), are inserted at
 ; lanes 0 and 2 and each is duplicated within its 128-bit half (mask 0,0,2,2).
 ; Expected: scalar adds plus movddup on SSSE3 without fast-hops, two haddpd
 ; on SSSE3 with fast-hops, and a single 256-bit vhaddpd on all AVX runs.
 231 define <4 x double> @hadd_v4f64_scalar_splat(<4 x double> %a) {
 232 ; SSSE3_SLOW-LABEL: hadd_v4f64_scalar_splat:
 233 ; SSSE3_SLOW:       # %bb.0:
 234 ; SSSE3_SLOW-NEXT:    movapd %xmm0, %xmm2
 235 ; SSSE3_SLOW-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
 236 ; SSSE3_SLOW-NEXT:    addsd %xmm0, %xmm2
 237 ; SSSE3_SLOW-NEXT:    movapd %xmm1, %xmm3
 238 ; SSSE3_SLOW-NEXT:    unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm1[1]
 239 ; SSSE3_SLOW-NEXT:    addsd %xmm1, %xmm3
 240 ; SSSE3_SLOW-NEXT:    movddup {{.*#+}} xmm0 = xmm2[0,0]
 241 ; SSSE3_SLOW-NEXT:    movddup {{.*#+}} xmm1 = xmm3[0,0]
 242 ; SSSE3_SLOW-NEXT:    retq
 243 ;
 244 ; SSSE3_FAST-LABEL: hadd_v4f64_scalar_splat:
 245 ; SSSE3_FAST:       # %bb.0:
 246 ; SSSE3_FAST-NEXT:    haddpd %xmm0, %xmm0
 247 ; SSSE3_FAST-NEXT:    haddpd %xmm1, %xmm1
 248 ; SSSE3_FAST-NEXT:    retq
 249 ;
 250 ; AVX-LABEL: hadd_v4f64_scalar_splat:
 251 ; AVX:       # %bb.0:
 252 ; AVX-NEXT:    vhaddpd %ymm0, %ymm0, %ymm0
 253 ; AVX-NEXT:    retq
 254   %a0 = extractelement <4 x double> %a, i32 0
 255   %a1 = extractelement <4 x double> %a, i32 1
 256   %hop0 = fadd double %a0, %a1
 257   %a2 = extractelement <4 x double> %a, i32 2
 258   %a3 = extractelement <4 x double> %a, i32 3
 259   %hop1 = fadd double %a2, %a3
 260   %ins = insertelement <4 x double> undef, double %hop0, i32 0
 261   %ins2 = insertelement <4 x double> %ins,  double %hop1, i32 2
 262   %shuf = shufflevector <4 x double> %ins2, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
 263   ret <4 x double> %shuf
 264 }
 265
 ; Same two scalar pair sums as hadd_v4f64_scalar_splat, but the final shuffle
 ; mask is all zeros, so only %hop0 (%a[0] + %a[1]) reaches the result and is
 ; broadcast to all four lanes; %hop1 is not selected by the mask.
 ; The checks show only the low-half sum being computed (scalar add without
 ; fast-hops, (v)haddpd with fast-hops) and then copied/broadcast to the
 ; upper half.
 266 define <4 x double> @hadd_v4f64_scalar_broadcast(<4 x double> %a) {
 267 ; SSSE3_SLOW-LABEL: hadd_v4f64_scalar_broadcast:
 268 ; SSSE3_SLOW:       # %bb.0:
 269 ; SSSE3_SLOW-NEXT:    movapd %xmm0, %xmm1
 270 ; SSSE3_SLOW-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
 271 ; SSSE3_SLOW-NEXT:    addsd %xmm0, %xmm1
 272 ; SSSE3_SLOW-NEXT:    movddup {{.*#+}} xmm0 = xmm1[0,0]
 273 ; SSSE3_SLOW-NEXT:    movapd %xmm0, %xmm1
 274 ; SSSE3_SLOW-NEXT:    retq
 275 ;
 276 ; SSSE3_FAST-LABEL: hadd_v4f64_scalar_broadcast:
 277 ; SSSE3_FAST:       # %bb.0:
 278 ; SSSE3_FAST-NEXT:    haddpd %xmm0, %xmm0
 279 ; SSSE3_FAST-NEXT:    movapd %xmm0, %xmm1
 280 ; SSSE3_FAST-NEXT:    retq
 281 ;
 282 ; AVX1_SLOW-LABEL: hadd_v4f64_scalar_broadcast:
 283 ; AVX1_SLOW:       # %bb.0:
 284 ; AVX1_SLOW-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
 285 ; AVX1_SLOW-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
 286 ; AVX1_SLOW-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
 287 ; AVX1_SLOW-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
 288 ; AVX1_SLOW-NEXT:    retq
 289 ;
 290 ; AVX1_FAST-LABEL: hadd_v4f64_scalar_broadcast:
 291 ; AVX1_FAST:       # %bb.0:
 292 ; AVX1_FAST-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
 293 ; AVX1_FAST-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
 294 ; AVX1_FAST-NEXT:    retq
 295 ;
 296 ; AVX2_SLOW-LABEL: hadd_v4f64_scalar_broadcast:
 297 ; AVX2_SLOW:       # %bb.0:
 298 ; AVX2_SLOW-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
 299 ; AVX2_SLOW-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
 300 ; AVX2_SLOW-NEXT:    vbroadcastsd %xmm0, %ymm0
 301 ; AVX2_SLOW-NEXT:    retq
 302 ;
 303 ; AVX2_FAST-LABEL: hadd_v4f64_scalar_broadcast:
 304 ; AVX2_FAST:       # %bb.0:
 305 ; AVX2_FAST-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
 306 ; AVX2_FAST-NEXT:    vbroadcastsd %xmm0, %ymm0
 307 ; AVX2_FAST-NEXT:    retq
 308   %a0 = extractelement <4 x double> %a, i32 0
 309   %a1 = extractelement <4 x double> %a, i32 1
 310   %hop0 = fadd double %a0, %a1
 311   %a2 = extractelement <4 x double> %a, i32 2
 312   %a3 = extractelement <4 x double> %a, i32 3
 313   %hop1 = fadd double %a2, %a3
 314   %ins = insertelement <4 x double> undef, double %hop0, i32 0
 315   %ins2 = insertelement <4 x double> %ins,  double %hop1, i32 2
 316   %shuf = shufflevector <4 x double> %ins2, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
 317   ret <4 x double> %shuf
 318 }
 319
 ; Vector form of the v4f64 horizontal add: lanes 0 and 2 of %a are added to
 ; lanes 1 and 3 (other lanes undef), and each sum is duplicated within its
 ; 128-bit half (mask 0,0,2,2).
 ; Runs without fast-hops expect a lane swap, a plain vector add and a
 ; movddup-style splat; runs with fast-hops expect (v)haddpd only.
 320 define <4 x double> @hadd_v4f64(<4 x double> %a) {
 321 ; SSSE3_SLOW-LABEL: hadd_v4f64:
 322 ; SSSE3_SLOW:       # %bb.0:
 323 ; SSSE3_SLOW-NEXT:    movapd %xmm0, %xmm2
 324 ; SSSE3_SLOW-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
 325 ; SSSE3_SLOW-NEXT:    addsd %xmm0, %xmm2
 326 ; SSSE3_SLOW-NEXT:    movddup {{.*#+}} xmm0 = xmm2[0,0]
 327 ; SSSE3_SLOW-NEXT:    movapd %xmm1, %xmm2
 328 ; SSSE3_SLOW-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
 329 ; SSSE3_SLOW-NEXT:    addsd %xmm1, %xmm2
 330 ; SSSE3_SLOW-NEXT:    movddup {{.*#+}} xmm1 = xmm2[0,0]
 331 ; SSSE3_SLOW-NEXT:    retq
 332 ;
 333 ; SSSE3_FAST-LABEL: hadd_v4f64:
 334 ; SSSE3_FAST:       # %bb.0:
 335 ; SSSE3_FAST-NEXT:    haddpd %xmm0, %xmm0
 336 ; SSSE3_FAST-NEXT:    haddpd %xmm1, %xmm1
 337 ; SSSE3_FAST-NEXT:    retq
 338 ;
 339 ; AVX1_SLOW-LABEL: hadd_v4f64:
 340 ; AVX1_SLOW:       # %bb.0:
 341 ; AVX1_SLOW-NEXT:    vpermilpd {{.*#+}} ymm1 = ymm0[1,0,3,2]
 342 ; AVX1_SLOW-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
 343 ; AVX1_SLOW-NEXT:    vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
 344 ; AVX1_SLOW-NEXT:    retq
 345 ;
 346 ; AVX1_FAST-LABEL: hadd_v4f64:
 347 ; AVX1_FAST:       # %bb.0:
 348 ; AVX1_FAST-NEXT:    vhaddpd %ymm0, %ymm0, %ymm0
 349 ; AVX1_FAST-NEXT:    retq
 350 ;
 351 ; AVX2_SLOW-LABEL: hadd_v4f64:
 352 ; AVX2_SLOW:       # %bb.0:
 353 ; AVX2_SLOW-NEXT:    vpermilpd {{.*#+}} ymm1 = ymm0[1,0,3,2]
 354 ; AVX2_SLOW-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
 355 ; AVX2_SLOW-NEXT:    vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
 356 ; AVX2_SLOW-NEXT:    retq
 357 ;
 358 ; AVX2_FAST-LABEL: hadd_v4f64:
 359 ; AVX2_FAST:       # %bb.0:
 360 ; AVX2_FAST-NEXT:    vhaddpd %ymm0, %ymm0, %ymm0
 361 ; AVX2_FAST-NEXT:    retq
 362   %a0 = shufflevector <4 x double> %a, <4 x double> undef, <4 x i32> <i32 0, i32 undef, i32 2, i32 undef>
 363   %a1 = shufflevector <4 x double> %a, <4 x double> undef, <4 x i32> <i32 1, i32 undef, i32 3, i32 undef>
 364   %hop = fadd <4 x double> %a0, %a1
 365   %shuf = shufflevector <4 x double> %hop, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
 366   ret <4 x double> %shuf
 367 }
 368
 ; Subtracts element 1 of %a from element 0 (second lane of each operand is
 ; undef) and places the difference in result lane 1 (lane 0 undef).
 ; Runs without fast-hops expect a scalar subtract followed by a movddup
 ; splat; runs with fast-hops expect a single (v)hsubpd.
 369 define <2 x double> @hsub_v2f64(<2 x double> %a) {
 370 ; SSSE3_SLOW-LABEL: hsub_v2f64:
 371 ; SSSE3_SLOW:       # %bb.0:
 372 ; SSSE3_SLOW-NEXT:    movapd %xmm0, %xmm1
 373 ; SSSE3_SLOW-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
 374 ; SSSE3_SLOW-NEXT:    subsd %xmm1, %xmm0
 375 ; SSSE3_SLOW-NEXT:    movddup {{.*#+}} xmm0 = xmm0[0,0]
 376 ; SSSE3_SLOW-NEXT:    retq
 377 ;
 378 ; SSSE3_FAST-LABEL: hsub_v2f64:
 379 ; SSSE3_FAST:       # %bb.0:
 380 ; SSSE3_FAST-NEXT:    hsubpd %xmm0, %xmm0
 381 ; SSSE3_FAST-NEXT:    retq
 382 ;
 383 ; AVX1_SLOW-LABEL: hsub_v2f64:
 384 ; AVX1_SLOW:       # %bb.0:
 385 ; AVX1_SLOW-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
 386 ; AVX1_SLOW-NEXT:    vsubsd %xmm1, %xmm0, %xmm0
 387 ; AVX1_SLOW-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
 388 ; AVX1_SLOW-NEXT:    retq
 389 ;
 390 ; AVX1_FAST-LABEL: hsub_v2f64:
 391 ; AVX1_FAST:       # %bb.0:
 392 ; AVX1_FAST-NEXT:    vhsubpd %xmm0, %xmm0, %xmm0
 393 ; AVX1_FAST-NEXT:    retq
 394 ;
 395 ; AVX2_SLOW-LABEL: hsub_v2f64:
 396 ; AVX2_SLOW:       # %bb.0:
 397 ; AVX2_SLOW-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
 398 ; AVX2_SLOW-NEXT:    vsubsd %xmm1, %xmm0, %xmm0
 399 ; AVX2_SLOW-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
 400 ; AVX2_SLOW-NEXT:    retq
 401 ;
 402 ; AVX2_FAST-LABEL: hsub_v2f64:
 403 ; AVX2_FAST:       # %bb.0:
 404 ; AVX2_FAST-NEXT:    vhsubpd %xmm0, %xmm0, %xmm0
 405 ; AVX2_FAST-NEXT:    retq
 406   %a0 = shufflevector <2 x double> %a, <2 x double> undef, <2 x i32> <i32 0, i32 undef>
 407   %a1 = shufflevector <2 x double> %a, <2 x double> undef, <2 x i32> <i32 1, i32 undef>
 408   %hop = fsub <2 x double> %a0, %a1
 409   %shuf = shufflevector <2 x double> %hop, <2 x double> undef, <2 x i32> <i32 undef, i32 0>
 410   ret <2 x double> %shuf
 411 }
 412
 ; Vector form of the v4f64 horizontal subtract: lanes 1 and 3 of %a are
 ; subtracted from lanes 0 and 2 (other lanes undef), and each difference is
 ; duplicated within its 128-bit half (mask 0,0,2,2).
 ; Runs without fast-hops expect a lane swap, a plain subtract and a
 ; movddup-style splat; runs with fast-hops expect (v)hsubpd only.
 413 define <4 x double> @hsub_v4f64(<4 x double> %a) {
 414 ; SSSE3_SLOW-LABEL: hsub_v4f64:
 415 ; SSSE3_SLOW:       # %bb.0:
 416 ; SSSE3_SLOW-NEXT:    movapd %xmm0, %xmm2
 417 ; SSSE3_SLOW-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
 418 ; SSSE3_SLOW-NEXT:    subsd %xmm2, %xmm0
 419 ; SSSE3_SLOW-NEXT:    movddup {{.*#+}} xmm0 = xmm0[0,0]
 420 ; SSSE3_SLOW-NEXT:    movapd %xmm1, %xmm2
 421 ; SSSE3_SLOW-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
 422 ; SSSE3_SLOW-NEXT:    subsd %xmm2, %xmm1
 423 ; SSSE3_SLOW-NEXT:    movddup {{.*#+}} xmm1 = xmm1[0,0]
 424 ; SSSE3_SLOW-NEXT:    retq
 425 ;
 426 ; SSSE3_FAST-LABEL: hsub_v4f64:
 427 ; SSSE3_FAST:       # %bb.0:
 428 ; SSSE3_FAST-NEXT:    hsubpd %xmm0, %xmm0
 429 ; SSSE3_FAST-NEXT:    hsubpd %xmm1, %xmm1
 430 ; SSSE3_FAST-NEXT:    retq
 431 ;
 432 ; AVX1_SLOW-LABEL: hsub_v4f64:
 433 ; AVX1_SLOW:       # %bb.0:
 434 ; AVX1_SLOW-NEXT:    vpermilpd {{.*#+}} ymm1 = ymm0[1,0,3,2]
 435 ; AVX1_SLOW-NEXT:    vsubpd %ymm1, %ymm0, %ymm0
 436 ; AVX1_SLOW-NEXT:    vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
 437 ; AVX1_SLOW-NEXT:    retq
 438 ;
 439 ; AVX1_FAST-LABEL: hsub_v4f64:
 440 ; AVX1_FAST:       # %bb.0:
 441 ; AVX1_FAST-NEXT:    vhsubpd %ymm0, %ymm0, %ymm0
 442 ; AVX1_FAST-NEXT:    retq
 443 ;
 444 ; AVX2_SLOW-LABEL: hsub_v4f64:
 445 ; AVX2_SLOW:       # %bb.0:
 446 ; AVX2_SLOW-NEXT:    vpermilpd {{.*#+}} ymm1 = ymm0[1,0,3,2]
 447 ; AVX2_SLOW-NEXT:    vsubpd %ymm1, %ymm0, %ymm0
 448 ; AVX2_SLOW-NEXT:    vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
 449 ; AVX2_SLOW-NEXT:    retq
 450 ;
 451 ; AVX2_FAST-LABEL: hsub_v4f64:
 452 ; AVX2_FAST:       # %bb.0:
 453 ; AVX2_FAST-NEXT:    vhsubpd %ymm0, %ymm0, %ymm0
 454 ; AVX2_FAST-NEXT:    retq
 455   %a0 = shufflevector <4 x double> %a, <4 x double> undef, <4 x i32> <i32 0, i32 undef, i32 2, i32 undef>
 456   %a1 = shufflevector <4 x double> %a, <4 x double> undef, <4 x i32> <i32 1, i32 undef, i32 3, i32 undef>
 457   %hop = fsub <4 x double> %a0, %a1
 458   %shuf = shufflevector <4 x double> %hop, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
 459   ret <4 x double> %shuf
 460 }
 461
 ; Integer variant: adds the even elements (0,2) of %a to the odd elements
 ; (1,3) and scatters the two sums to result lanes 0 and 3 (lanes 1,2 undef).
 ; Every run line expects a single (v)phaddd with no separate shuffle.
 462 define <4 x i32> @hadd_v4i32(<4 x i32> %a) {
 463 ; SSSE3-LABEL: hadd_v4i32:
 464 ; SSSE3:       # %bb.0:
 465 ; SSSE3-NEXT:    phaddd %xmm0, %xmm0
 466 ; SSSE3-NEXT:    retq
 467 ;
 468 ; AVX-LABEL: hadd_v4i32:
 469 ; AVX:       # %bb.0:
 470 ; AVX-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
 471 ; AVX-NEXT:    retq
 472   %a02 = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
 473   %a13 = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
 474   %hop = add <4 x i32> %a02, %a13
 475   %shuf = shufflevector <4 x i32> %hop, <4 x i32> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 1>
 476   ret <4 x i32> %shuf
 477 }
 478
 ; 256-bit integer input, 128-bit add: even elements of %a plus odd elements
 ; as <4 x i32>, then a widening shuffle places sums 0,1 in result lanes 2,3
 ; and sums 2,3 in result lanes 6,7 (other lanes undef).
 ; The checks expect one 128-bit (v)phaddd plus shuffle/move instructions
 ; that build the wide result.
 479 define <8 x i32> @hadd_v8i32a(<8 x i32> %a) {
 480 ; SSSE3-LABEL: hadd_v8i32a:
 481 ; SSSE3:       # %bb.0:
 482 ; SSSE3-NEXT:    movdqa %xmm0, %xmm2
 483 ; SSSE3-NEXT:    phaddd %xmm1, %xmm2
 484 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,1,0,1]
 485 ; SSSE3-NEXT:    movdqa %xmm2, %xmm1
 486 ; SSSE3-NEXT:    retq
 487 ;
 488 ; AVX1-LABEL: hadd_v8i32a:
 489 ; AVX1:       # %bb.0:
 490 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
 491 ; AVX1-NEXT:    vphaddd %xmm1, %xmm0, %xmm0
 492 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[0,1,0,1]
 493 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
 494 ; AVX1-NEXT:    retq
 495 ;
 496 ; AVX2-LABEL: hadd_v8i32a:
 497 ; AVX2:       # %bb.0:
 498 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
 499 ; AVX2-NEXT:    vphaddd %xmm1, %xmm0, %xmm0
 500 ; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1]
 501 ; AVX2-NEXT:    retq
 502   %a0 = shufflevector <8 x i32> %a, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
 503   %a1 = shufflevector <8 x i32> %a, <8 x i32> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
 504   %hop = add <4 x i32> %a0, %a1
 505   %shuf = shufflevector <4 x i32> %hop, <4 x i32> undef, <8 x i32> <i32 undef, i32 undef, i32 0, i32 1, i32 undef, i32 undef, i32 2, i32 3>
 506   ret <8 x i32> %shuf
 507 }
 508
 ; Even/odd integer add performed on full <8 x i32> vectors (two sums per
 ; 128-bit half, other lanes undef); the final shuffle repeats sums 0,1 and
 ; 4,5 within their own half.
 ; Expected: one phaddd per xmm register on SSSE3; on the AVX1 runs the op is
 ; split into two 128-bit vphaddd that are re-joined and followed by a
 ; vmovddup; on the AVX2 runs a single 256-bit vphaddd.
 509 define <8 x i32> @hadd_v8i32b(<8 x i32> %a) {
 510 ; SSSE3-LABEL: hadd_v8i32b:
 511 ; SSSE3:       # %bb.0:
 512 ; SSSE3-NEXT:    phaddd %xmm0, %xmm0
 513 ; SSSE3-NEXT:    phaddd %xmm1, %xmm1
 514 ; SSSE3-NEXT:    retq
 515 ;
 516 ; AVX1-LABEL: hadd_v8i32b:
 517 ; AVX1:       # %bb.0:
 518 ; AVX1-NEXT:    vphaddd %xmm0, %xmm0, %xmm1
 519 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
 520 ; AVX1-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
 521 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
 522 ; AVX1-NEXT:    vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
 523 ; AVX1-NEXT:    retq
 524 ;
 525 ; AVX2-LABEL: hadd_v8i32b:
 526 ; AVX2:       # %bb.0:
 527 ; AVX2-NEXT:    vphaddd %ymm0, %ymm0, %ymm0
 528 ; AVX2-NEXT:    retq
 529   %a0 = shufflevector <8 x i32> %a, <8 x i32> undef, <8 x i32> <i32 0, i32 2, i32 undef, i32 undef, i32 4, i32 6, i32 undef, i32 undef>
 530   %a1 = shufflevector <8 x i32> %a, <8 x i32> undef, <8 x i32> <i32 1, i32 3, i32 undef, i32 undef, i32 5, i32 7, i32 undef, i32 undef>
 531   %hop = add <8 x i32> %a0, %a1
 532   %shuf = shufflevector <8 x i32> %hop, <8 x i32> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 4, i32 5, i32 4, i32 5>
 533   ret <8 x i32> %shuf
 534 }
 535
 ; Integer variant: subtracts the odd elements (1,3) of %a from the even
 ; elements (0,2) and places difference 1 in result lane 1 and difference 0
 ; in result lane 2 (lanes 0,3 undef).
 ; Every run line expects a single (v)phsubd with no separate shuffle.
 536 define <4 x i32> @hsub_v4i32(<4 x i32> %a) {
 537 ; SSSE3-LABEL: hsub_v4i32:
 538 ; SSSE3:       # %bb.0:
 539 ; SSSE3-NEXT:    phsubd %xmm0, %xmm0
 540 ; SSSE3-NEXT:    retq
 541 ;
 542 ; AVX-LABEL: hsub_v4i32:
 543 ; AVX:       # %bb.0:
 544 ; AVX-NEXT:    vphsubd %xmm0, %xmm0, %xmm0
 545 ; AVX-NEXT:    retq
 546   %a02 = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
 547   %a13 = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
 548   %hop = sub <4 x i32> %a02, %a13
 549   %shuf = shufflevector <4 x i32> %hop, <4 x i32> undef, <4 x i32> <i32 undef, i32 1, i32 0, i32 undef>
 550   ret <4 x i32> %shuf
 551 }
 552
 ; 256-bit integer input, 128-bit subtract: even elements of %a minus odd
 ; elements as <4 x i32>, then a widening shuffle places differences 0,1 in
 ; result lanes 2,3 and differences 2,3 in result lanes 6,7 (others undef).
 ; The checks expect one 128-bit (v)phsubd plus shuffle/move instructions
 ; that build the wide result.
 553 define <8 x i32> @hsub_v8i32a(<8 x i32> %a) {
 554 ; SSSE3-LABEL: hsub_v8i32a:
 555 ; SSSE3:       # %bb.0:
 556 ; SSSE3-NEXT:    movdqa %xmm0, %xmm2
 557 ; SSSE3-NEXT:    phsubd %xmm1, %xmm2
 558 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,1,0,1]
 559 ; SSSE3-NEXT:    movdqa %xmm2, %xmm1
 560 ; SSSE3-NEXT:    retq
 561 ;
 562 ; AVX1-LABEL: hsub_v8i32a:
 563 ; AVX1:       # %bb.0:
 564 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
 565 ; AVX1-NEXT:    vphsubd %xmm1, %xmm0, %xmm0
 566 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[0,1,0,1]
 567 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
 568 ; AVX1-NEXT:    retq
 569 ;
 570 ; AVX2-LABEL: hsub_v8i32a:
 571 ; AVX2:       # %bb.0:
 572 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
 573 ; AVX2-NEXT:    vphsubd %xmm1, %xmm0, %xmm0
 574 ; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1]
 575 ; AVX2-NEXT:    retq
 576   %a0 = shufflevector <8 x i32> %a, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
 577   %a1 = shufflevector <8 x i32> %a, <8 x i32> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
 578   %hop = sub <4 x i32> %a0, %a1
 579   %shuf = shufflevector <4 x i32> %hop, <4 x i32> undef, <8 x i32> <i32 undef, i32 undef, i32 0, i32 1, i32 undef, i32 undef, i32 2, i32 3>
 580   ret <8 x i32> %shuf
 581 }
 582
 ; Even-minus-odd integer subtract performed on full <8 x i32> vectors (two
 ; differences per 128-bit half, other lanes undef); the final shuffle repeats
 ; differences 0,1 and 4,5 within their own half.
 ; Expected: one phsubd per xmm register on SSSE3; on the AVX1 runs the op is
 ; split into two 128-bit vphsubd that are re-joined and followed by a
 ; vmovddup; on the AVX2 runs a single 256-bit vphsubd.
 583 define <8 x i32> @hsub_v8i32b(<8 x i32> %a) {
 584 ; SSSE3-LABEL: hsub_v8i32b:
 585 ; SSSE3:       # %bb.0:
 586 ; SSSE3-NEXT:    phsubd %xmm0, %xmm0
 587 ; SSSE3-NEXT:    phsubd %xmm1, %xmm1
 588 ; SSSE3-NEXT:    retq
 589 ;
 590 ; AVX1-LABEL: hsub_v8i32b:
 591 ; AVX1:       # %bb.0:
 592 ; AVX1-NEXT:    vphsubd %xmm0, %xmm0, %xmm1
 593 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
 594 ; AVX1-NEXT:    vphsubd %xmm0, %xmm0, %xmm0
 595 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
 596 ; AVX1-NEXT:    vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
 597 ; AVX1-NEXT:    retq
 598 ;
 599 ; AVX2-LABEL: hsub_v8i32b:
 600 ; AVX2:       # %bb.0:
 601 ; AVX2-NEXT:    vphsubd %ymm0, %ymm0, %ymm0
 602 ; AVX2-NEXT:    retq
 603   %a0 = shufflevector <8 x i32> %a, <8 x i32> undef, <8 x i32> <i32 0, i32 2, i32 undef, i32 undef, i32 4, i32 6, i32 undef, i32 undef>
 604   %a1 = shufflevector <8 x i32> %a, <8 x i32> undef, <8 x i32> <i32 1, i32 3, i32 undef, i32 undef, i32 5, i32 7, i32 undef, i32 undef>
 605   %hop = sub <8 x i32> %a0, %a1
 606   %shuf = shufflevector <8 x i32> %hop, <8 x i32> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 4, i32 5, i32 4, i32 5>
 607   ret <8 x i32> %shuf
 608 }
 609
 ; 16-bit variant: adds the even elements (0,2,4,6) of %a to the odd elements
 ; (1,3,5,7) and places the four sums in the upper half of the result
 ; (lower half undef).
 ; Every run line expects a single (v)phaddw with no separate shuffle.
 610 define <8 x i16> @hadd_v8i16(<8 x i16> %a) {
 611 ; SSSE3-LABEL: hadd_v8i16:
 612 ; SSSE3:       # %bb.0:
 613 ; SSSE3-NEXT:    phaddw %xmm0, %xmm0
 614 ; SSSE3-NEXT:    retq
 615 ;
 616 ; AVX-LABEL: hadd_v8i16:
 617 ; AVX:       # %bb.0:
 618 ; AVX-NEXT:    vphaddw %xmm0, %xmm0, %xmm0
 619 ; AVX-NEXT:    retq
 620   %a0246 = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 undef, i32 undef, i32 undef, i32 undef>
 621   %a1357 = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
 622   %hop = add <8 x i16> %a0246, %a1357
 623   %shuf = shufflevector <8 x i16> %hop, <8 x i16> undef, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 0, i32 1, i32 2, i32 3>
 624   ret <8 x i16> %shuf
 625 }
 626
 ; 256-bit i16 input, 128-bit add: even elements of %a plus odd elements as
 ; <8 x i16>, then a widening shuffle places sums 0-3 in result lanes 4-7 and
 ; sums 4-7 in result lanes 12-15 (other lanes undef).
 ; The checks expect one 128-bit (v)phaddw plus shuffle/move instructions
 ; that build the wide result.
 627 define <16 x i16> @hadd_v16i16a(<16 x i16> %a) {
 628 ; SSSE3-LABEL: hadd_v16i16a:
 629 ; SSSE3:       # %bb.0:
 630 ; SSSE3-NEXT:    movdqa %xmm0, %xmm2
 631 ; SSSE3-NEXT:    phaddw %xmm1, %xmm2
 632 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,1,0,1]
 633 ; SSSE3-NEXT:    movdqa %xmm2, %xmm1
 634 ; SSSE3-NEXT:    retq
 635 ;
 636 ; AVX1-LABEL: hadd_v16i16a:
 637 ; AVX1:       # %bb.0:
 638 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
 639 ; AVX1-NEXT:    vphaddw %xmm1, %xmm0, %xmm0
 640 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[0,1,0,1]
 641 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
 642 ; AVX1-NEXT:    retq
 643 ;
 644 ; AVX2-LABEL: hadd_v16i16a:
 645 ; AVX2:       # %bb.0:
 646 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
 647 ; AVX2-NEXT:    vphaddw %xmm1, %xmm0, %xmm0
 648 ; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1]
 649 ; AVX2-NEXT:    retq
 650   %a0 = shufflevector <16 x i16> %a, <16 x i16> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
 651   %a1 = shufflevector <16 x i16> %a, <16 x i16> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
 652   %hop = add <8 x i16> %a0, %a1
 653   %shuf = shufflevector <8 x i16> %hop, <8 x i16> undef, <16 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 4, i32 5, i32 6, i32 7>
 654   ret <16 x i16> %shuf
 655 }
 656
 ; Even/odd i16 add performed on full <16 x i16> vectors (four sums per
 ; 128-bit half, other lanes undef); the final shuffle repeats sums 0-3 and
 ; 8-11 within their own half.
 ; Expected: one phaddw per xmm register on SSSE3; on the AVX1 runs the op is
 ; split into two 128-bit vphaddw that are re-joined and followed by a
 ; vmovddup; on the AVX2 runs a single 256-bit vphaddw.
 657 define <16 x i16> @hadd_v16i16b(<16 x i16> %a) {
 658 ; SSSE3-LABEL: hadd_v16i16b:
 659 ; SSSE3:       # %bb.0:
 660 ; SSSE3-NEXT:    phaddw %xmm0, %xmm0
 661 ; SSSE3-NEXT:    phaddw %xmm1, %xmm1
 662 ; SSSE3-NEXT:    retq
 663 ;
 664 ; AVX1-LABEL: hadd_v16i16b:
 665 ; AVX1:       # %bb.0:
 666 ; AVX1-NEXT:    vphaddw %xmm0, %xmm0, %xmm1
 667 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
 668 ; AVX1-NEXT:    vphaddw %xmm0, %xmm0, %xmm0
 669 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
 670 ; AVX1-NEXT:    vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
 671 ; AVX1-NEXT:    retq
 672 ;
 673 ; AVX2-LABEL: hadd_v16i16b:
 674 ; AVX2:       # %bb.0:
 675 ; AVX2-NEXT:    vphaddw %ymm0, %ymm0, %ymm0
 676 ; AVX2-NEXT:    retq
 677   %a0 = shufflevector <16 x i16> %a, <16 x i16> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 undef, i32 undef, i32 undef, i32 undef, i32 8, i32 10, i32 12, i32 14, i32 undef, i32 undef, i32 undef, i32 undef>
 678   %a1 = shufflevector <16 x i16> %a, <16 x i16> undef, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 9, i32 11, i32 13, i32 15, i32 undef, i32 undef, i32 undef, i32 undef>
 679   %hop = add <16 x i16> %a0, %a1
 680   %shuf = shufflevector <16 x i16> %hop, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11, i32 8, i32 9, i32 10, i32 11>
 681   ret <16 x i16> %shuf
 682 }
 683
 ; 16-bit variant: subtracts the odd elements (1,3,5,7) of %a from the even
 ; elements (0,2,4,6); the result mask picks differences 0 and 2 for lanes
 ; 0 and 2 and differences 1 and 3 for lanes 5 and 7 (other lanes undef).
 ; Every run line expects a single (v)phsubw with no separate shuffle.
 684 define <8 x i16> @hsub_v8i16(<8 x i16> %a) {
 685 ; SSSE3-LABEL: hsub_v8i16:
 686 ; SSSE3:       # %bb.0:
 687 ; SSSE3-NEXT:    phsubw %xmm0, %xmm0
 688 ; SSSE3-NEXT:    retq
 689 ;
 690 ; AVX-LABEL: hsub_v8i16:
 691 ; AVX:       # %bb.0:
 692 ; AVX-NEXT:    vphsubw %xmm0, %xmm0, %xmm0
 693 ; AVX-NEXT:    retq
 694   %a0246 = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 undef, i32 undef, i32 undef, i32 undef>
 695   %a1357 = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
 696   %hop = sub <8 x i16> %a0246, %a1357
 697   %shuf = shufflevector <8 x i16> %hop, <8 x i16> undef, <8 x i32> <i32 0, i32 undef, i32 2, i32 undef, i32 undef, i32 1, i32 undef, i32 3>
 698   ret <8 x i16> %shuf
 699 }
 700
 ; 256-bit i16 input, 128-bit subtract: even elements of %a minus odd
 ; elements as <8 x i16>, then a widening shuffle places differences 0-3 in
 ; result lanes 4-7 and differences 4-7 in result lanes 12-15 (others undef).
 ; The checks expect one 128-bit (v)phsubw plus shuffle/move instructions
 ; that build the wide result.
 701 define <16 x i16> @hsub_v16i16a(<16 x i16> %a) {
 702 ; SSSE3-LABEL: hsub_v16i16a:
 703 ; SSSE3:       # %bb.0:
 704 ; SSSE3-NEXT:    movdqa %xmm0, %xmm2
 705 ; SSSE3-NEXT:    phsubw %xmm1, %xmm2
 706 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,1,0,1]
 707 ; SSSE3-NEXT:    movdqa %xmm2, %xmm1
 708 ; SSSE3-NEXT:    retq
 709 ;
 710 ; AVX1-LABEL: hsub_v16i16a:
 711 ; AVX1:       # %bb.0:
 712 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
 713 ; AVX1-NEXT:    vphsubw %xmm1, %xmm0, %xmm0
 714 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[0,1,0,1]
 715 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
 716 ; AVX1-NEXT:    retq
 717 ;
 718 ; AVX2-LABEL: hsub_v16i16a:
 719 ; AVX2:       # %bb.0:
 720 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
 721 ; AVX2-NEXT:    vphsubw %xmm1, %xmm0, %xmm0
 722 ; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1]
 723 ; AVX2-NEXT:    retq
 724   %a0 = shufflevector <16 x i16> %a, <16 x i16> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
 725   %a1 = shufflevector <16 x i16> %a, <16 x i16> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
 726   %hop = sub <8 x i16> %a0, %a1
 727   %shuf = shufflevector <8 x i16> %hop, <8 x i16> undef, <16 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 4, i32 5, i32 6, i32 7>
 728   ret <16 x i16> %shuf
 729 }
 730
 ; Even-minus-odd i16 subtract performed on full <16 x i16> vectors (four
 ; differences per 128-bit half, other lanes undef); the final shuffle repeats
 ; differences 0-3 and 8-11 within their own half.
 ; Expected: one phsubw per xmm register on SSSE3; on the AVX1 runs the op is
 ; split into two 128-bit vphsubw that are re-joined and followed by a
 ; vmovddup; on the AVX2 runs a single 256-bit vphsubw.
 731 define <16 x i16> @hsub_v16i16b(<16 x i16> %a) {
 732 ; SSSE3-LABEL: hsub_v16i16b:
 733 ; SSSE3:       # %bb.0:
 734 ; SSSE3-NEXT:    phsubw %xmm0, %xmm0
 735 ; SSSE3-NEXT:    phsubw %xmm1, %xmm1
 736 ; SSSE3-NEXT:    retq
 737 ;
 738 ; AVX1-LABEL: hsub_v16i16b:
 739 ; AVX1:       # %bb.0:
 740 ; AVX1-NEXT:    vphsubw %xmm0, %xmm0, %xmm1
 741 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
 742 ; AVX1-NEXT:    vphsubw %xmm0, %xmm0, %xmm0
 743 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
 744 ; AVX1-NEXT:    vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
 745 ; AVX1-NEXT:    retq
 746 ;
 747 ; AVX2-LABEL: hsub_v16i16b:
 748 ; AVX2:       # %bb.0:
 749 ; AVX2-NEXT:    vphsubw %ymm0, %ymm0, %ymm0
 750 ; AVX2-NEXT:    retq
 751   %a0 = shufflevector <16 x i16> %a, <16 x i16> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 undef, i32 undef, i32 undef, i32 undef, i32 8, i32 10, i32 12, i32 14, i32 undef, i32 undef, i32 undef, i32 undef>
 752   %a1 = shufflevector <16 x i16> %a, <16 x i16> undef, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 9, i32 11, i32 13, i32 15, i32 undef, i32 undef, i32 undef, i32 undef>
 753   %hop = sub <16 x i16> %a0, %a1
 754   %shuf = shufflevector <16 x i16> %hop, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11, i32 8, i32 9, i32 10, i32 11>
 755   ret <16 x i16> %shuf
 756 }
 757
 ; Calls the llvm.x86.sse3.hadd.ps intrinsic with %a0 as both operands and
 ; splats element 0 of the result (zeroinitializer shuffle mask).
 ; Unlike the tests above, the checks keep the splat: (v)haddps followed by
 ; shufps, vpermilps or vbroadcastss depending on the run line.
 758 define <4 x float> @broadcast_haddps_v4f32(<4 x float> %a0) {
 759 ; SSSE3-LABEL: broadcast_haddps_v4f32:
 760 ; SSSE3:       # %bb.0:
 761 ; SSSE3-NEXT:    haddps %xmm0, %xmm0
 762 ; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,0,0,0]
 763 ; SSSE3-NEXT:    retq
 764 ;
 765 ; AVX1-LABEL: broadcast_haddps_v4f32:
 766 ; AVX1:       # %bb.0:
 767 ; AVX1-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
 768 ; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0]
 769 ; AVX1-NEXT:    retq
 770 ;
 771 ; AVX2-LABEL: broadcast_haddps_v4f32:
 772 ; AVX2:       # %bb.0:
 773 ; AVX2-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
 774 ; AVX2-NEXT:    vbroadcastss %xmm0, %xmm0
 775 ; AVX2-NEXT:    retq
 776   %1 = tail call <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float> %a0, <4 x float> %a0)
 777   %2 = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> zeroinitializer
 778   ret <4 x float> %2
 779 }
 780
 ; Declaration of the horizontal-add intrinsic called by
 ; broadcast_haddps_v4f32.
 781 declare <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float>, <4 x float>)