llvm/test/CodeGen/X86/vector-shuffle-combining.ll

   1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
   2 ; RUN: llc < %s -mcpu=x86-64 -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE2
   3 ; RUN: llc < %s -mcpu=x86-64 -mattr=+ssse3 | FileCheck %s --check-prefixes=CHECK,SSE,SSSE3
   4 ; RUN: llc < %s -mcpu=x86-64 -mattr=+sse4.1 | FileCheck %s --check-prefixes=CHECK,SSE,SSE41
   5 ; RUN: llc < %s -mcpu=x86-64 -mattr=+avx | FileCheck %s --check-prefixes=CHECK,AVX,AVX1
   6 ; RUN: llc < %s -mcpu=x86-64 -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2,AVX2-SLOW
   7 ; RUN: llc < %s -mcpu=x86-64 -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=CHECK,AVX,AVX2,AVX2-FAST,AVX2-FAST-ALL
   8 ; RUN: llc < %s -mcpu=x86-64 -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=CHECK,AVX,AVX2,AVX2-FAST,AVX2-FAST-PERLANE
   9 ;
  10 ; Verify that the DAG combiner correctly folds bitwise operations across
  11 ; shuffles, nested shuffles with undef, pairs of nested shuffles, and other
  12 ; basic and always-safe patterns. Also test that the DAG combiner will combine
  13 ; target-specific shuffle instructions where reasonable.
  14
  15 target triple = "x86_64-unknown-unknown"
  16
  17 declare <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32>, i8) ; 32-bit element shuffle intrinsic; the i8 is the shuffle immediate
  18 declare <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16>, i8) ; 16-bit shuffle intrinsic; expected output below only permutes elements 0-3
  19 declare <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16>, i8) ; 16-bit shuffle intrinsic; expected output below only permutes elements 4-7
  20
  21 define <4 x i32> @combine_pshufd1(<4 x i32> %a) { ; the same pshufd immediate applied twice
  22 ; CHECK-LABEL: combine_pshufd1:
  23 ; CHECK:       # %bb.0: # %entry
  24 ; CHECK-NEXT:    retq
  25 entry:
  26   %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 27) ; i8 27 = 0x1B
  27   %c = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %b, i8 27) ; expected to cancel the first shuffle: the checks allow only retq
  28   ret <4 x i32> %c
  29 }
  30
  31 define <4 x i32> @combine_pshufd2(<4 x i32> %a) { ; pshufd / pshuflw / pshufd chain across bitcasts
  32 ; CHECK-LABEL: combine_pshufd2:
  33 ; CHECK:       # %bb.0: # %entry
  34 ; CHECK-NEXT:    retq
  35 entry:
  36   %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 27)
  37   %b.cast = bitcast <4 x i32> %b to <8 x i16>
  38   %c = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %b.cast, i8 -28) ; i8 -28 = 0xE4
  39   %c.cast = bitcast <8 x i16> %c to <4 x i32>
  40   %d = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %c.cast, i8 27) ; whole chain is expected to fold away: the checks allow only retq
  41   ret <4 x i32> %d
  42 }
  43
  44 define <4 x i32> @combine_pshufd3(<4 x i32> %a) { ; pshufd / pshufhw / pshufd chain across bitcasts
  45 ; CHECK-LABEL: combine_pshufd3:
  46 ; CHECK:       # %bb.0: # %entry
  47 ; CHECK-NEXT:    retq
  48 entry:
  49   %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 27)
  50   %b.cast = bitcast <4 x i32> %b to <8 x i16>
  51   %c = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %b.cast, i8 -28) ; i8 -28 = 0xE4
  52   %c.cast = bitcast <8 x i16> %c to <4 x i32>
  53   %d = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %c.cast, i8 27) ; whole chain is expected to fold away: the checks allow only retq
  54   ret <4 x i32> %d
  55 }
  56
  57 define <4 x i32> @combine_pshufd4(<4 x i32> %a) { ; pshufd / pshufhw / pshufd: expected to collapse to one pshufhw
  58 ; SSE-LABEL: combine_pshufd4:
  59 ; SSE:       # %bb.0: # %entry
  60 ; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
  61 ; SSE-NEXT:    retq
  62 ;
  63 ; AVX-LABEL: combine_pshufd4:
  64 ; AVX:       # %bb.0: # %entry
  65 ; AVX-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
  66 ; AVX-NEXT:    retq
  67 entry:
  68   %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 -31) ; i8 -31 = 0xE1
  69   %b.cast = bitcast <4 x i32> %b to <8 x i16>
  70   %c = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %b.cast, i8 27) ; the only shuffle expected to survive (elements 4-7 reversed)
  71   %c.cast = bitcast <8 x i16> %c to <4 x i32>
  72   %d = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %c.cast, i8 -31)
  73   ret <4 x i32> %d
  74 }
  75
  76 define <4 x i32> @combine_pshufd5(<4 x i32> %a) { ; pshufd / pshuflw / pshufd: expected to collapse to one pshuflw
  77 ; SSE-LABEL: combine_pshufd5:
  78 ; SSE:       # %bb.0: # %entry
  79 ; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
  80 ; SSE-NEXT:    retq
  81 ;
  82 ; AVX-LABEL: combine_pshufd5:
  83 ; AVX:       # %bb.0: # %entry
  84 ; AVX-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
  85 ; AVX-NEXT:    retq
  86 entry:
  87   %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 -76) ; i8 -76 = 0xB4
  88   %b.cast = bitcast <4 x i32> %b to <8 x i16>
  89   %c = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %b.cast, i8 27) ; the only shuffle expected to survive (elements 0-3 reversed)
  90   %c.cast = bitcast <8 x i16> %c to <4 x i32>
  91   %d = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %c.cast, i8 -76)
  92   ret <4 x i32> %d
  93 }
  94
  95 define <4 x i32> @combine_pshufd6(<4 x i32> %a) { ; two pshufd in a row: expected to merge into a splat of element 0
  96 ; SSE-LABEL: combine_pshufd6:
  97 ; SSE:       # %bb.0: # %entry
  98 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
  99 ; SSE-NEXT:    retq
 100 ;
 101 ; AVX1-LABEL: combine_pshufd6:
 102 ; AVX1:       # %bb.0: # %entry
 103 ; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,0,0,0]
 104 ; AVX1-NEXT:    retq
 105 ;
 106 ; AVX2-LABEL: combine_pshufd6:
 107 ; AVX2:       # %bb.0: # %entry
 108 ; AVX2-NEXT:    vbroadcastss %xmm0, %xmm0
 109 ; AVX2-NEXT:    retq
 110 entry:
 111   %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 0) ; immediate 0: first shuffle of the pair
 112   %c = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %b, i8 8) ; second shuffle; with AVX2 the pair is expected as a broadcast
 113   ret <4 x i32> %c
 114 }
 115
 116 define <8 x i16> @combine_pshuflw1(<8 x i16> %a) { ; the same pshuflw immediate applied twice
 117 ; CHECK-LABEL: combine_pshuflw1:
 118 ; CHECK:       # %bb.0: # %entry
 119 ; CHECK-NEXT:    retq
 120 entry:
 121   %b = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %a, i8 27) ; i8 27 = 0x1B
 122   %c = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %b, i8 27) ; expected to cancel the first shuffle: the checks allow only retq
 123   ret <8 x i16> %c
 124 }
 125
 126 define <8 x i16> @combine_pshuflw2(<8 x i16> %a) { ; pshuflw / pshufhw / pshuflw chain
 127 ; CHECK-LABEL: combine_pshuflw2:
 128 ; CHECK:       # %bb.0: # %entry
 129 ; CHECK-NEXT:    retq
 130 entry:
 131   %b = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %a, i8 27)
 132   %c = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %b, i8 -28) ; i8 -28 = 0xE4
 133   %d = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %c, i8 27) ; whole chain is expected to fold away: the checks allow only retq
 134   ret <8 x i16> %d
 135 }
 136
 137 define <8 x i16> @combine_pshuflw3(<8 x i16> %a) { ; pshuflw / pshufhw / pshuflw: only the pshufhw is expected to remain
 138 ; SSE-LABEL: combine_pshuflw3:
 139 ; SSE:       # %bb.0: # %entry
 140 ; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
 141 ; SSE-NEXT:    retq
 142 ;
 143 ; AVX-LABEL: combine_pshuflw3:
 144 ; AVX:       # %bb.0: # %entry
 145 ; AVX-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
 146 ; AVX-NEXT:    retq
 147 entry:
 148   %b = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %a, i8 27)
 149   %c = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %b, i8 27) ; survives as the single expected shuffle
 150   %d = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %c, i8 27) ; expected to cancel the first pshuflw
 151   ret <8 x i16> %d
 152 }
 153
 154 define <8 x i16> @combine_pshufhw1(<8 x i16> %a) { ; pshufhw / pshuflw / pshufhw: only the pshuflw is expected to remain
 155 ; SSE-LABEL: combine_pshufhw1:
 156 ; SSE:       # %bb.0: # %entry
 157 ; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
 158 ; SSE-NEXT:    retq
 159 ;
 160 ; AVX-LABEL: combine_pshufhw1:
 161 ; AVX:       # %bb.0: # %entry
 162 ; AVX-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
 163 ; AVX-NEXT:    retq
 164 entry:
 165   %b = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %a, i8 27)
 166   %c = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %b, i8 27) ; survives as the single expected shuffle
 167   %d = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %c, i8 27) ; expected to cancel the first pshufhw
 168   ret <8 x i16> %d
 169 }
 170
 171 define <4 x i32> @combine_bitwise_ops_test1(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { ; and of two shuffles sharing one mask: expected as one and plus one shuffle
 172 ; SSE-LABEL: combine_bitwise_ops_test1:
 173 ; SSE:       # %bb.0:
 174 ; SSE-NEXT:    pand %xmm1, %xmm0
 175 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
 176 ; SSE-NEXT:    retq
 177 ;
 178 ; AVX-LABEL: combine_bitwise_ops_test1:
 179 ; AVX:       # %bb.0:
 180 ; AVX-NEXT:    vandps %xmm1, %xmm0, %xmm0
 181 ; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
 182 ; AVX-NEXT:    retq
 183   %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3> ; indices 0-3 read only the first operand, so %c is unused
 184   %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3>
 185   %and = and <4 x i32> %shuf1, %shuf2
 186   ret <4 x i32> %and
 187 }
 188
 189 define <4 x i32> @combine_bitwise_ops_test2(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { ; or of two shuffles sharing one mask: expected as one or plus one shuffle
 190 ; SSE-LABEL: combine_bitwise_ops_test2:
 191 ; SSE:       # %bb.0:
 192 ; SSE-NEXT:    por %xmm1, %xmm0
 193 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
 194 ; SSE-NEXT:    retq
 195 ;
 196 ; AVX-LABEL: combine_bitwise_ops_test2:
 197 ; AVX:       # %bb.0:
 198 ; AVX-NEXT:    vorps %xmm1, %xmm0, %xmm0
 199 ; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
 200 ; AVX-NEXT:    retq
 201   %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3> ; indices 0-3 read only the first operand, so %c is unused
 202   %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3>
 203   %or = or <4 x i32> %shuf1, %shuf2
 204   ret <4 x i32> %or
 205 }
 206
 207 define <4 x i32> @combine_bitwise_ops_test3(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { ; xor of two shuffles sharing one mask: expected as one xor plus one shuffle
 208 ; SSE-LABEL: combine_bitwise_ops_test3:
 209 ; SSE:       # %bb.0:
 210 ; SSE-NEXT:    pxor %xmm1, %xmm0
 211 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
 212 ; SSE-NEXT:    retq
 213 ;
 214 ; AVX-LABEL: combine_bitwise_ops_test3:
 215 ; AVX:       # %bb.0:
 216 ; AVX-NEXT:    vxorps %xmm1, %xmm0, %xmm0
 217 ; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
 218 ; AVX-NEXT:    retq
 219   %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3> ; indices 0-3 read only the first operand, so %c is unused
 220   %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3>
 221   %xor = xor <4 x i32> %shuf1, %shuf2
 222   ret <4 x i32> %xor
 223 }
 224
 225 define <4 x i32> @combine_bitwise_ops_test4(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { ; and of two shuffles, operands swapped: same expected output as an unswapped [0,2,1,3] shuffle
 226 ; SSE-LABEL: combine_bitwise_ops_test4:
 227 ; SSE:       # %bb.0:
 228 ; SSE-NEXT:    pand %xmm1, %xmm0
 229 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
 230 ; SSE-NEXT:    retq
 231 ;
 232 ; AVX-LABEL: combine_bitwise_ops_test4:
 233 ; AVX:       # %bb.0:
 234 ; AVX-NEXT:    vandps %xmm1, %xmm0, %xmm0
 235 ; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
 236 ; AVX-NEXT:    retq
 237   %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 4, i32 6, i32 5, i32 7> ; indices 4-7 read only the second operand, so %c is unused
 238   %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 4, i32 6, i32 5, i32 7>
 239   %and = and <4 x i32> %shuf1, %shuf2
 240   ret <4 x i32> %and
 241 }
 242
 243 define <4 x i32> @combine_bitwise_ops_test5(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { ; or of two shuffles, operands swapped: same expected output as an unswapped [0,2,1,3] shuffle
 244 ; SSE-LABEL: combine_bitwise_ops_test5:
 245 ; SSE:       # %bb.0:
 246 ; SSE-NEXT:    por %xmm1, %xmm0
 247 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
 248 ; SSE-NEXT:    retq
 249 ;
 250 ; AVX-LABEL: combine_bitwise_ops_test5:
 251 ; AVX:       # %bb.0:
 252 ; AVX-NEXT:    vorps %xmm1, %xmm0, %xmm0
 253 ; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
 254 ; AVX-NEXT:    retq
 255   %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 4, i32 6, i32 5, i32 7> ; indices 4-7 read only the second operand, so %c is unused
 256   %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 4, i32 6, i32 5, i32 7>
 257   %or = or <4 x i32> %shuf1, %shuf2
 258   ret <4 x i32> %or
 259 }
 260
 261 define <4 x i32> @combine_bitwise_ops_test6(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { ; xor of two shuffles, operands swapped: same expected output as an unswapped [0,2,1,3] shuffle
 262 ; SSE-LABEL: combine_bitwise_ops_test6:
 263 ; SSE:       # %bb.0:
 264 ; SSE-NEXT:    pxor %xmm1, %xmm0
 265 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
 266 ; SSE-NEXT:    retq
 267 ;
 268 ; AVX-LABEL: combine_bitwise_ops_test6:
 269 ; AVX:       # %bb.0:
 270 ; AVX-NEXT:    vxorps %xmm1, %xmm0, %xmm0
 271 ; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
 272 ; AVX-NEXT:    retq
 273   %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 4, i32 6, i32 5, i32 7> ; indices 4-7 read only the second operand, so %c is unused
 274   %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 4, i32 6, i32 5, i32 7>
 275   %xor = xor <4 x i32> %shuf1, %shuf2
 276   ret <4 x i32> %xor
 277 }
 278
 279
 280 ; Verify that DAGCombiner moves the shuffle after the xor/and/or even if shuffles
 281 ; are not performing swizzle operations.
 282
 283 define <4 x i32> @combine_bitwise_ops_test1b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { ; and of two two-source shuffles: expected as one and, then one merge with xmm2
 284 ; SSE2-LABEL: combine_bitwise_ops_test1b:
 285 ; SSE2:       # %bb.0:
 286 ; SSE2-NEXT:    pand %xmm1, %xmm0
 287 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 288 ; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3]
 289 ; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
 290 ; SSE2-NEXT:    retq
 291 ;
 292 ; SSSE3-LABEL: combine_bitwise_ops_test1b:
 293 ; SSSE3:       # %bb.0:
 294 ; SSSE3-NEXT:    pand %xmm1, %xmm0
 295 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 296 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3]
 297 ; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
 298 ; SSSE3-NEXT:    retq
 299 ;
 300 ; SSE41-LABEL: combine_bitwise_ops_test1b:
 301 ; SSE41:       # %bb.0:
 302 ; SSE41-NEXT:    andps %xmm1, %xmm0
 303 ; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
 304 ; SSE41-NEXT:    retq
 305 ;
 306 ; AVX-LABEL: combine_bitwise_ops_test1b:
 307 ; AVX:       # %bb.0:
 308 ; AVX-NEXT:    vandps %xmm1, %xmm0, %xmm0
 309 ; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
 310 ; AVX-NEXT:    retq
 311   %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7> ; lanes 0,2 from the first operand; lanes 1,3 from %c
 312   %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7>
 313   %and = and <4 x i32> %shuf1, %shuf2
 314   ret <4 x i32> %and
 315 }
 316
 317 define <4 x i32> @combine_bitwise_ops_test2b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { ; or of two two-source shuffles: expected as one or, then one merge with xmm2
 318 ; SSE2-LABEL: combine_bitwise_ops_test2b:
 319 ; SSE2:       # %bb.0:
 320 ; SSE2-NEXT:    por %xmm1, %xmm0
 321 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 322 ; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3]
 323 ; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
 324 ; SSE2-NEXT:    retq
 325 ;
 326 ; SSSE3-LABEL: combine_bitwise_ops_test2b:
 327 ; SSSE3:       # %bb.0:
 328 ; SSSE3-NEXT:    por %xmm1, %xmm0
 329 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 330 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3]
 331 ; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
 332 ; SSSE3-NEXT:    retq
 333 ;
 334 ; SSE41-LABEL: combine_bitwise_ops_test2b:
 335 ; SSE41:       # %bb.0:
 336 ; SSE41-NEXT:    orps %xmm1, %xmm0
 337 ; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
 338 ; SSE41-NEXT:    retq
 339 ;
 340 ; AVX-LABEL: combine_bitwise_ops_test2b:
 341 ; AVX:       # %bb.0:
 342 ; AVX-NEXT:    vorps %xmm1, %xmm0, %xmm0
 343 ; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
 344 ; AVX-NEXT:    retq
 345   %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7> ; lanes 0,2 from the first operand; lanes 1,3 from %c
 346   %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7>
 347   %or = or <4 x i32> %shuf1, %shuf2
 348   ret <4 x i32> %or
 349 }
 350
 351 define <4 x i32> @combine_bitwise_ops_test3b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { ; xor of two two-source shuffles: lanes 1,3 are expected to be zeroed rather than taken from xmm2
 352 ; SSE2-LABEL: combine_bitwise_ops_test3b:
 353 ; SSE2:       # %bb.0:
 354 ; SSE2-NEXT:    xorps %xmm1, %xmm0
 355 ; SSE2-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 356 ; SSE2-NEXT:    retq
 357 ;
 358 ; SSSE3-LABEL: combine_bitwise_ops_test3b:
 359 ; SSSE3:       # %bb.0:
 360 ; SSSE3-NEXT:    xorps %xmm1, %xmm0
 361 ; SSSE3-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 362 ; SSSE3-NEXT:    retq
 363 ;
 364 ; SSE41-LABEL: combine_bitwise_ops_test3b:
 365 ; SSE41:       # %bb.0:
 366 ; SSE41-NEXT:    xorps %xmm1, %xmm0
 367 ; SSE41-NEXT:    xorps %xmm1, %xmm1
 368 ; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
 369 ; SSE41-NEXT:    retq
 370 ;
 371 ; AVX-LABEL: combine_bitwise_ops_test3b:
 372 ; AVX:       # %bb.0:
 373 ; AVX-NEXT:    vxorps %xmm1, %xmm0, %xmm0
 374 ; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
 375 ; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
 376 ; AVX-NEXT:    retq
 377   %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7> ; lanes 0,2 from the first operand; lanes 1,3 from %c
 378   %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7>
 379   %xor = xor <4 x i32> %shuf1, %shuf2 ; lanes 1,3 are %c xor %c, i.e. zero
 380   ret <4 x i32> %xor
 381 }
 382
 383 define <4 x i32> @combine_bitwise_ops_test4b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { ; and of two two-source shuffles, %c as first operand: expected as one and, then one merge with xmm2
 384 ; SSE2-LABEL: combine_bitwise_ops_test4b:
 385 ; SSE2:       # %bb.0:
 386 ; SSE2-NEXT:    pand %xmm1, %xmm0
 387 ; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3]
 388 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
 389 ; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
 390 ; SSE2-NEXT:    retq
 391 ;
 392 ; SSSE3-LABEL: combine_bitwise_ops_test4b:
 393 ; SSSE3:       # %bb.0:
 394 ; SSSE3-NEXT:    pand %xmm1, %xmm0
 395 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3]
 396 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
 397 ; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
 398 ; SSSE3-NEXT:    retq
 399 ;
 400 ; SSE41-LABEL: combine_bitwise_ops_test4b:
 401 ; SSE41:       # %bb.0:
 402 ; SSE41-NEXT:    andps %xmm1, %xmm0
 403 ; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3]
 404 ; SSE41-NEXT:    retq
 405 ;
 406 ; AVX-LABEL: combine_bitwise_ops_test4b:
 407 ; AVX:       # %bb.0:
 408 ; AVX-NEXT:    vandps %xmm1, %xmm0, %xmm0
 409 ; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3]
 410 ; AVX-NEXT:    retq
 411   %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 5, i32 2, i32 7> ; lanes 0,2 from %c; lanes 1,3 from the second operand
 412   %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 5, i32 2, i32 7>
 413   %and = and <4 x i32> %shuf1, %shuf2
 414   ret <4 x i32> %and
 415 }
 416
 417 define <4 x i32> @combine_bitwise_ops_test5b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { ; or of two two-source shuffles, %c as first operand: expected as one or, then one merge with xmm2
 418 ; SSE2-LABEL: combine_bitwise_ops_test5b:
 419 ; SSE2:       # %bb.0:
 420 ; SSE2-NEXT:    por %xmm1, %xmm0
 421 ; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3]
 422 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
 423 ; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
 424 ; SSE2-NEXT:    retq
 425 ;
 426 ; SSSE3-LABEL: combine_bitwise_ops_test5b:
 427 ; SSSE3:       # %bb.0:
 428 ; SSSE3-NEXT:    por %xmm1, %xmm0
 429 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3]
 430 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
 431 ; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
 432 ; SSSE3-NEXT:    retq
 433 ;
 434 ; SSE41-LABEL: combine_bitwise_ops_test5b:
 435 ; SSE41:       # %bb.0:
 436 ; SSE41-NEXT:    orps %xmm1, %xmm0
 437 ; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3]
 438 ; SSE41-NEXT:    retq
 439 ;
 440 ; AVX-LABEL: combine_bitwise_ops_test5b:
 441 ; AVX:       # %bb.0:
 442 ; AVX-NEXT:    vorps %xmm1, %xmm0, %xmm0
 443 ; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3]
 444 ; AVX-NEXT:    retq
 445   %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 5, i32 2, i32 7> ; lanes 0,2 from %c; lanes 1,3 from the second operand
 446   %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 5, i32 2, i32 7>
 447   %or = or <4 x i32> %shuf1, %shuf2
 448   ret <4 x i32> %or
 449 }
 450
 451 define <4 x i32> @combine_bitwise_ops_test6b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { ; xor of two two-source shuffles, %c as first operand: lanes 0,2 are expected to be zeroed
 452 ; SSE2-LABEL: combine_bitwise_ops_test6b:
 453 ; SSE2:       # %bb.0:
 454 ; SSE2-NEXT:    xorps %xmm1, %xmm0
 455 ; SSE2-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 456 ; SSE2-NEXT:    retq
 457 ;
 458 ; SSSE3-LABEL: combine_bitwise_ops_test6b:
 459 ; SSSE3:       # %bb.0:
 460 ; SSSE3-NEXT:    xorps %xmm1, %xmm0
 461 ; SSSE3-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 462 ; SSSE3-NEXT:    retq
 463 ;
 464 ; SSE41-LABEL: combine_bitwise_ops_test6b:
 465 ; SSE41:       # %bb.0:
 466 ; SSE41-NEXT:    xorps %xmm1, %xmm0
 467 ; SSE41-NEXT:    xorps %xmm1, %xmm1
 468 ; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3]
 469 ; SSE41-NEXT:    retq
 470 ;
 471 ; AVX-LABEL: combine_bitwise_ops_test6b:
 472 ; AVX:       # %bb.0:
 473 ; AVX-NEXT:    vxorps %xmm1, %xmm0, %xmm0
 474 ; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
 475 ; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3]
 476 ; AVX-NEXT:    retq
 477   %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 5, i32 2, i32 7> ; lanes 0,2 from %c; lanes 1,3 from the second operand
 478   %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 5, i32 2, i32 7>
 479   %xor = xor <4 x i32> %shuf1, %shuf2 ; lanes 0,2 are %c xor %c, i.e. zero
 480   ret <4 x i32> %xor
 481 }
 482
 483 define <4 x i32> @combine_bitwise_ops_test1c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { ; and of two two-source shuffles: expected as one and, then one two-source shufps
 484 ; SSE-LABEL: combine_bitwise_ops_test1c:
 485 ; SSE:       # %bb.0:
 486 ; SSE-NEXT:    andps %xmm1, %xmm0
 487 ; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[1,3]
 488 ; SSE-NEXT:    retq
 489 ;
 490 ; AVX-LABEL: combine_bitwise_ops_test1c:
 491 ; AVX:       # %bb.0:
 492 ; AVX-NEXT:    vandps %xmm1, %xmm0, %xmm0
 493 ; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[1,3]
 494 ; AVX-NEXT:    retq
 495   %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7> ; low half from the first operand, high half from %c
 496   %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7>
 497   %and = and <4 x i32> %shuf1, %shuf2
 498   ret <4 x i32> %and
 499 }
 500
 501 define <4 x i32> @combine_bitwise_ops_test2c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { ; or of two two-source shuffles: expected as one or, then one two-source shufps
 502 ; SSE-LABEL: combine_bitwise_ops_test2c:
 503 ; SSE:       # %bb.0:
 504 ; SSE-NEXT:    orps %xmm1, %xmm0
 505 ; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[1,3]
 506 ; SSE-NEXT:    retq
 507 ;
 508 ; AVX-LABEL: combine_bitwise_ops_test2c:
 509 ; AVX:       # %bb.0:
 510 ; AVX-NEXT:    vorps %xmm1, %xmm0, %xmm0
 511 ; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[1,3]
 512 ; AVX-NEXT:    retq
 513   %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7> ; low half from the first operand, high half from %c
 514   %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7>
 515   %or = or <4 x i32> %shuf1, %shuf2
 516   ret <4 x i32> %or
 517 }
 518
 519 define <4 x i32> @combine_bitwise_ops_test3c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { ; xor of two two-source shuffles: the high half is expected to be zero
 520 ; SSE2-LABEL: combine_bitwise_ops_test3c:
 521 ; SSE2:       # %bb.0:
 522 ; SSE2-NEXT:    xorps %xmm1, %xmm0
 523 ; SSE2-NEXT:    xorps %xmm1, %xmm1
 524 ; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3]
 525 ; SSE2-NEXT:    retq
 526 ;
 527 ; SSSE3-LABEL: combine_bitwise_ops_test3c:
 528 ; SSSE3:       # %bb.0:
 529 ; SSSE3-NEXT:    xorps %xmm1, %xmm0
 530 ; SSSE3-NEXT:    xorps %xmm1, %xmm1
 531 ; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3]
 532 ; SSSE3-NEXT:    retq
 533 ;
 534 ; SSE41-LABEL: combine_bitwise_ops_test3c:
 535 ; SSE41:       # %bb.0:
 536 ; SSE41-NEXT:    xorps %xmm1, %xmm0
 537 ; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
 538 ; SSE41-NEXT:    retq
 539 ;
 540 ; AVX-LABEL: combine_bitwise_ops_test3c:
 541 ; AVX:       # %bb.0:
 542 ; AVX-NEXT:    vxorps %xmm1, %xmm0, %xmm0
 543 ; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
 544 ; AVX-NEXT:    retq
 545   %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7> ; low half from the first operand, high half from %c
 546   %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7>
 547   %xor = xor <4 x i32> %shuf1, %shuf2 ; high half is %c xor %c, i.e. zero
 548   ret <4 x i32> %xor
 549 }
 550
 551 define <4 x i32> @combine_bitwise_ops_test4c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { ; and of two two-source shuffles, %c as first operand: expected as one and, then one shufps
 552 ; SSE-LABEL: combine_bitwise_ops_test4c:
 553 ; SSE:       # %bb.0:
 554 ; SSE-NEXT:    andps %xmm1, %xmm0
 555 ; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2],xmm0[1,3]
 556 ; SSE-NEXT:    movaps %xmm2, %xmm0
 557 ; SSE-NEXT:    retq
 558 ;
 559 ; AVX-LABEL: combine_bitwise_ops_test4c:
 560 ; AVX:       # %bb.0:
 561 ; AVX-NEXT:    vandps %xmm1, %xmm0, %xmm0
 562 ; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm2[0,2],xmm0[1,3]
 563 ; AVX-NEXT:    retq
 564   %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 2, i32 5, i32 7> ; low half from %c, high half from the second operand
 565   %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 2, i32 5, i32 7>
 566   %and = and <4 x i32> %shuf1, %shuf2 ; the non-AVX output shuffles into xmm2 and needs an extra movaps into xmm0
 567   ret <4 x i32> %and
 568 }
 569
 570 define <4 x i32> @combine_bitwise_ops_test5c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { ; or of two two-source shuffles, %c as first operand: expected as one or, then one shufps
 571 ; SSE-LABEL: combine_bitwise_ops_test5c:
 572 ; SSE:       # %bb.0:
 573 ; SSE-NEXT:    orps %xmm1, %xmm0
 574 ; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2],xmm0[1,3]
 575 ; SSE-NEXT:    movaps %xmm2, %xmm0
 576 ; SSE-NEXT:    retq
 577 ;
 578 ; AVX-LABEL: combine_bitwise_ops_test5c:
 579 ; AVX:       # %bb.0:
 580 ; AVX-NEXT:    vorps %xmm1, %xmm0, %xmm0
 581 ; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm2[0,2],xmm0[1,3]
 582 ; AVX-NEXT:    retq
 583   %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 2, i32 5, i32 7> ; low half from %c, high half from the second operand
 584   %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 2, i32 5, i32 7>
 585   %or = or <4 x i32> %shuf1, %shuf2 ; the non-AVX output shuffles into xmm2 and needs an extra movaps into xmm0
 586   ret <4 x i32> %or
 587 }
 588
 589 define <4 x i32> @combine_bitwise_ops_test6c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { ; xor of two two-source shuffles, %c as first operand: the low half is expected to be zero
 590 ; SSE2-LABEL: combine_bitwise_ops_test6c:
 591 ; SSE2:       # %bb.0:
 592 ; SSE2-NEXT:    xorps %xmm1, %xmm0
 593 ; SSE2-NEXT:    xorps %xmm1, %xmm1
 594 ; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[1,3]
 595 ; SSE2-NEXT:    movaps %xmm1, %xmm0
 596 ; SSE2-NEXT:    retq
 597 ;
 598 ; SSSE3-LABEL: combine_bitwise_ops_test6c:
 599 ; SSSE3:       # %bb.0:
 600 ; SSSE3-NEXT:    xorps %xmm1, %xmm0
 601 ; SSSE3-NEXT:    xorps %xmm1, %xmm1
 602 ; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[1,3]
 603 ; SSSE3-NEXT:    movaps %xmm1, %xmm0
 604 ; SSSE3-NEXT:    retq
 605 ;
 606 ; SSE41-LABEL: combine_bitwise_ops_test6c:
 607 ; SSE41:       # %bb.0:
 608 ; SSE41-NEXT:    xorps %xmm1, %xmm0
 609 ; SSE41-NEXT:    insertps {{.*#+}} xmm0 = zero,zero,xmm0[1,3]
 610 ; SSE41-NEXT:    retq
 611 ;
 612 ; AVX-LABEL: combine_bitwise_ops_test6c:
 613 ; AVX:       # %bb.0:
 614 ; AVX-NEXT:    vxorps %xmm1, %xmm0, %xmm0
 615 ; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = zero,zero,xmm0[1,3]
 616 ; AVX-NEXT:    retq
 617   %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 2, i32 5, i32 7> ; low half from %c, high half from the second operand
 618   %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 2, i32 5, i32 7>
 619   %xor = xor <4 x i32> %shuf1, %shuf2 ; low half is %c xor %c, i.e. zero
 620   ret <4 x i32> %xor
 621 }
 622
 623 define <4 x i32> @combine_nested_undef_test1(<4 x i32> %A, <4 x i32> %B) { ; two-source shuffle feeding a shuffle with an undef operand: expected as one single-register shuffle
 624 ; SSE-LABEL: combine_nested_undef_test1:
 625 ; SSE:       # %bb.0:
 626 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,1,0,1]
 627 ; SSE-NEXT:    retq
 628 ;
 629 ; AVX-LABEL: combine_nested_undef_test1:
 630 ; AVX:       # %bb.0:
 631 ; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[3,1,0,1]
 632 ; AVX-NEXT:    retq
 633   %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 4, i32 3, i32 1> ; lane 1 is the only lane taken from %B
 634   %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 4, i32 0, i32 3> ; never reads lane 1 of %1; index 4 selects the undef operand
 635   ret <4 x i32> %2
 636 }
 637
 638 define <4 x i32> @combine_nested_undef_test2(<4 x i32> %A, <4 x i32> %B) { ; two-source shuffle feeding a shuffle with an undef operand: expected as one single-register shuffle
 639 ; SSE-LABEL: combine_nested_undef_test2:
 640 ; SSE:       # %bb.0:
 641 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,1,0,3]
 642 ; SSE-NEXT:    retq
 643 ;
 644 ; AVX-LABEL: combine_nested_undef_test2:
 645 ; AVX:       # %bb.0:
 646 ; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[2,1,0,3]
 647 ; AVX-NEXT:    retq
 648   %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 5, i32 2, i32 3> ; lane 1 is the only lane taken from %B
 649   %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 4, i32 0, i32 3> ; never reads lane 1 of %1; index 4 selects the undef operand
 650   ret <4 x i32> %2
 651 }
 652
 653 define <4 x i32> @combine_nested_undef_test3(<4 x i32> %A, <4 x i32> %B) { ; two-source shuffle feeding a shuffle with an undef operand: expected as one single-register shuffle
 654 ; SSE-LABEL: combine_nested_undef_test3:
 655 ; SSE:       # %bb.0:
 656 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,1,0,3]
 657 ; SSE-NEXT:    retq
 658 ;
 659 ; AVX-LABEL: combine_nested_undef_test3:
 660 ; AVX:       # %bb.0:
 661 ; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[2,1,0,3]
 662 ; AVX-NEXT:    retq
 663   %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 6, i32 2, i32 3> ; lane 1 is the only lane taken from %B
 664   %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 4, i32 0, i32 3> ; never reads lane 1 of %1; index 4 selects the undef operand
 665   ret <4 x i32> %2
 666 }
 667
 668 define <4 x i32> @combine_nested_undef_test4(<4 x i32> %A, <4 x i32> %B) { ; nested shuffles expected as one single-register shuffle (vmovddup with AVX2)
 669 ; SSE-LABEL: combine_nested_undef_test4:
 670 ; SSE:       # %bb.0:
 671 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
 672 ; SSE-NEXT:    retq
 673 ;
 674 ; AVX1-LABEL: combine_nested_undef_test4:
 675 ; AVX1:       # %bb.0:
 676 ; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,1,0,1]
 677 ; AVX1-NEXT:    retq
 678 ;
 679 ; AVX2-LABEL: combine_nested_undef_test4:
 680 ; AVX2:       # %bb.0:
 681 ; AVX2-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
 682 ; AVX2-NEXT:    retq
 683   %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 4, i32 7, i32 1> ; lanes 1,2 come from %B
 684   %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 4, i32 4, i32 0, i32 3> ; reads only lanes 0,3 of %1; index 4 selects the undef operand
 685   ret <4 x i32> %2
 686 }
 687
 688 define <4 x i32> @combine_nested_undef_test5(<4 x i32> %A, <4 x i32> %B) { ; two-source shuffle feeding a shuffle with an undef operand: expected as one single-register shuffle
 689 ; SSE-LABEL: combine_nested_undef_test5:
 690 ; SSE:       # %bb.0:
 691 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
 692 ; SSE-NEXT:    retq
 693 ;
 694 ; AVX-LABEL: combine_nested_undef_test5:
 695 ; AVX:       # %bb.0:
 696 ; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3]
 697 ; AVX-NEXT:    retq
 698   %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 5, i32 5, i32 2, i32 3> ; lanes 0,1 come from %B
 699   %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 4, i32 4, i32 3> ; reads only lanes 2,3 of %1; index 4 selects the undef operand
 700   ret <4 x i32> %2
 701 }
 702
 703 define <4 x i32> @combine_nested_undef_test6(<4 x i32> %A, <4 x i32> %B) { ; two-source shuffle feeding a shuffle with an undef operand: expected as one single-register shuffle
 704 ; SSE-LABEL: combine_nested_undef_test6:
 705 ; SSE:       # %bb.0:
 706 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
 707 ; SSE-NEXT:    retq
 708 ;
 709 ; AVX-LABEL: combine_nested_undef_test6:
 710 ; AVX:       # %bb.0:
 711 ; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[2,3,0,1]
 712 ; AVX-NEXT:    retq
 713   %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 6, i32 2, i32 4> ; lanes 1,3 come from %B
 714   %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 4, i32 0, i32 4> ; reads only lanes 0,2 of %1; index 4 selects the undef operand
 715   ret <4 x i32> %2
 716 }
 717
 718 define <4 x i32> @combine_nested_undef_test7(<4 x i32> %A, <4 x i32> %B) { ; two-source shuffle feeding a shuffle with an undef operand: expected as one single-register shuffle
 719 ; SSE-LABEL: combine_nested_undef_test7:
 720 ; SSE:       # %bb.0:
 721 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,0,2]
 722 ; SSE-NEXT:    retq
 723 ;
 724 ; AVX-LABEL: combine_nested_undef_test7:
 725 ; AVX:       # %bb.0:
 726 ; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2,0,2]
 727 ; AVX-NEXT:    retq
 728   %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 5, i32 2, i32 7> ; lanes 1,3 come from %B
 729   %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 2, i32 0, i32 2> ; reads only lanes 0,2 of %1
 730   ret <4 x i32> %2
 731 }
 732
 733 define <4 x i32> @combine_nested_undef_test8(<4 x i32> %A, <4 x i32> %B) { ; two-source shuffle feeding a shuffle with an undef operand: expected as one single-register shuffle
 734 ; SSE-LABEL: combine_nested_undef_test8:
 735 ; SSE:       # %bb.0:
 736 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
 737 ; SSE-NEXT:    retq
 738 ;
 739 ; AVX-LABEL: combine_nested_undef_test8:
 740 ; AVX:       # %bb.0:
 741 ; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,1,3,3]
 742 ; AVX-NEXT:    retq
 743   %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 1, i32 6, i32 3> ; lanes 0,2 come from %B
 744   %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 1, i32 4, i32 3, i32 4> ; reads only lanes 1,3 of %1; index 4 selects the undef operand
 745   ret <4 x i32> %2
 746 }
 747
 748 define <4 x i32> @combine_nested_undef_test9(<4 x i32> %A, <4 x i32> %B) { ; two-source shuffle feeding a shuffle with an undef operand: expected as one single-register shuffle
 749 ; SSE-LABEL: combine_nested_undef_test9:
 750 ; SSE:       # %bb.0:
 751 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,3,2,2]
 752 ; SSE-NEXT:    retq
 753 ;
 754 ; AVX-LABEL: combine_nested_undef_test9:
 755 ; AVX:       # %bb.0:
 756 ; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,3,2,2]
 757 ; AVX-NEXT:    retq
 758   %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 1, i32 3, i32 2, i32 5> ; lane 3 is the only lane taken from %B
 759   %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 4, i32 2> ; never reads lane 3 of %1; index 4 selects the undef operand
 760   ret <4 x i32> %2
 761 }
 762
 763 define <4 x i32> @combine_nested_undef_test10(<4 x i32> %A, <4 x i32> %B) {
 764 ; SSE-LABEL: combine_nested_undef_test10:
 765 ; SSE:       # %bb.0:
 766 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
 767 ; SSE-NEXT:    retq
 768 ;
 769 ; AVX-LABEL: combine_nested_undef_test10:
 770 ; AVX:       # %bb.0:
 771 ; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
 772 ; AVX-NEXT:    retq
     ; Inner shuffle builds %1 = [A1, A1, B1, B1]. The outer mask <0,4,1,4> yields
     ; [A1, undef, A1, undef], so every defined lane is A1 and the expected code
     ; is a single splat-style shuffle of %A.
 773   %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 1, i32 1, i32 5, i32 5>
 774   %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 4, i32 1, i32 4>
 775   ret <4 x i32> %2
 776 }
 777
 778 define <4 x i32> @combine_nested_undef_test11(<4 x i32> %A, <4 x i32> %B) {
 779 ; SSE-LABEL: combine_nested_undef_test11:
 780 ; SSE:       # %bb.0:
 781 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,1]
 782 ; SSE-NEXT:    retq
 783 ;
 784 ; AVX-LABEL: combine_nested_undef_test11:
 785 ; AVX:       # %bb.0:
 786 ; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,1,2,1]
 787 ; AVX-NEXT:    retq
     ; Inner shuffle builds %1 = [A1, A2, B1, B0]. The outer mask <0,4,1,0> yields
     ; [A1, undef, A2, A1]; neither %B lane of %1 is read, so the pair is expected
     ; to fold into one shuffle of %A.
 788   %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 1, i32 2, i32 5, i32 4>
 789   %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 4, i32 1, i32 0>
 790   ret <4 x i32> %2
 791 }
 792
 793 define <4 x i32> @combine_nested_undef_test12(<4 x i32> %A, <4 x i32> %B) {
 794 ; SSE-LABEL: combine_nested_undef_test12:
 795 ; SSE:       # %bb.0:
 796 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
 797 ; SSE-NEXT:    retq
 798 ;
 799 ; AVX1-LABEL: combine_nested_undef_test12:
 800 ; AVX1:       # %bb.0:
 801 ; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,1,0,1]
 802 ; AVX1-NEXT:    retq
 803 ;
 804 ; AVX2-LABEL: combine_nested_undef_test12:
 805 ; AVX2:       # %bb.0:
 806 ; AVX2-NEXT:    vbroadcastss %xmm0, %xmm0
 807 ; AVX2-NEXT:    retq
     ; Inner shuffle builds %1 = [A0, A0, A2, B0]. The outer mask <1,4,0,4> yields
     ; [A0, undef, A0, undef], so only A0 is live. The undef lanes are free, which
     ; is why both a [0,1,0,1] shuffle and a broadcast of lane 0 are acceptable.
 808   %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 0, i32 2, i32 4>
 809   %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 1, i32 4, i32 0, i32 4>
 810   ret <4 x i32> %2
 811 }
 812
 813 ; The following pair of shuffles is folded into vector %A.
 814 define <4 x i32> @combine_nested_undef_test13(<4 x i32> %A, <4 x i32> %B) {
 815 ; CHECK-LABEL: combine_nested_undef_test13:
 816 ; CHECK:       # %bb.0:
 817 ; CHECK-NEXT:    retq
     ; Inner shuffle builds %1 = [A1, B0, A2, B2]. The outer mask <4,0,2,4> yields
     ; [undef, A1, A2, undef]: the defined lanes 1 and 2 already sit in the same
     ; positions in %A, so %A can be returned unchanged with no shuffle at all.
 818   %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 1, i32 4, i32 2, i32 6>
 819   %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 4, i32 0, i32 2, i32 4>
 820   ret <4 x i32> %2
 821 }
 822
 823 ; The following pair of shuffles is folded into vector %B.
 824 define <4 x i32> @combine_nested_undef_test14(<4 x i32> %A, <4 x i32> %B) {
 825 ; SSE-LABEL: combine_nested_undef_test14:
 826 ; SSE:       # %bb.0:
 827 ; SSE-NEXT:    movaps %xmm1, %xmm0
 828 ; SSE-NEXT:    retq
 829 ;
 830 ; AVX-LABEL: combine_nested_undef_test14:
 831 ; AVX:       # %bb.0:
 832 ; AVX-NEXT:    vmovaps %xmm1, %xmm0
 833 ; AVX-NEXT:    retq
     ; Inner shuffle builds %1 = [A0, B2, A2, B0]. The outer mask <3,4,1,4> yields
     ; [B0, undef, B2, undef]: the defined lanes 0 and 2 match %B in place, so the
     ; result is simply %B and only a register copy is expected.
 834   %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 6, i32 2, i32 4>
 835   %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 3, i32 4, i32 1, i32 4>
 836   ret <4 x i32> %2
 837 }
 838
 839
 840 ; Verify that we don't optimize the following cases. We expect more than one shuffle.
 841 ;
 842 ; FIXME: Many of these already don't make sense, and the rest should stop
 843 ; making sense with the new vector shuffle lowering. Revisit at least testing for
 844 ; it.
 845
 846 define <4 x i32> @combine_nested_undef_test15(<4 x i32> %A, <4 x i32> %B) {
 847 ; SSE2-LABEL: combine_nested_undef_test15:
 848 ; SSE2:       # %bb.0:
 849 ; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0]
 850 ; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[0,1]
 851 ; SSE2-NEXT:    movaps %xmm1, %xmm0
 852 ; SSE2-NEXT:    retq
 853 ;
 854 ; SSSE3-LABEL: combine_nested_undef_test15:
 855 ; SSSE3:       # %bb.0:
 856 ; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0]
 857 ; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[0,1]
 858 ; SSSE3-NEXT:    movaps %xmm1, %xmm0
 859 ; SSSE3-NEXT:    retq
 860 ;
 861 ; SSE41-LABEL: combine_nested_undef_test15:
 862 ; SSE41:       # %bb.0:
 863 ; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
 864 ; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,1,0,1]
 865 ; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
 866 ; SSE41-NEXT:    retq
 867 ;
 868 ; AVX1-LABEL: combine_nested_undef_test15:
 869 ; AVX1:       # %bb.0:
 870 ; AVX1-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[0,0,1,1]
 871 ; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[3,1,0,1]
 872 ; AVX1-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
 873 ; AVX1-NEXT:    retq
 874 ;
 875 ; AVX2-LABEL: combine_nested_undef_test15:
 876 ; AVX2:       # %bb.0:
 877 ; AVX2-NEXT:    vbroadcastss %xmm1, %xmm1
 878 ; AVX2-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[3,1,0,1]
 879 ; AVX2-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
 880 ; AVX2-NEXT:    retq
     ; Inner shuffle builds %1 = [A0, B0, A3, A1]. The outer mask <2,1,0,3> yields
     ; [A3, B0, A0, A1], which still mixes lanes of %A and %B, so it cannot
     ; collapse to a one-input shuffle; every target is expected to need several
     ; instructions.
 881   %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 4, i32 3, i32 1>
 882   %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 1, i32 0, i32 3>
 883   ret <4 x i32> %2
 884 }
 885
 886 define <4 x i32> @combine_nested_undef_test16(<4 x i32> %A, <4 x i32> %B) {
 887 ; SSE2-LABEL: combine_nested_undef_test16:
 888 ; SSE2:       # %bb.0:
 889 ; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
 890 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,0,2,3]
 891 ; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
 892 ; SSE2-NEXT:    retq
 893 ;
 894 ; SSSE3-LABEL: combine_nested_undef_test16:
 895 ; SSSE3:       # %bb.0:
 896 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
 897 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,0,2,3]
 898 ; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
 899 ; SSSE3-NEXT:    retq
 900 ;
 901 ; SSE41-LABEL: combine_nested_undef_test16:
 902 ; SSE41:       # %bb.0:
 903 ; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
 904 ; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
 905 ; SSE41-NEXT:    retq
 906 ;
 907 ; AVX-LABEL: combine_nested_undef_test16:
 908 ; AVX:       # %bb.0:
 909 ; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[2,3,0,1]
 910 ; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
 911 ; AVX-NEXT:    retq
     ; Inner shuffle builds %1 = [A0, B1, A2, B3]. The outer mask <2,1,0,3> yields
     ; [A2, B1, A0, B3]: the %A lanes are swapped while the %B lanes stay put, so
     ; both inputs remain live and more than one instruction is expected.
 912   %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
 913   %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 1, i32 0, i32 3>
 914   ret <4 x i32> %2
 915 }
 916
 917 define <4 x i32> @combine_nested_undef_test17(<4 x i32> %A, <4 x i32> %B) {
 918 ; SSE2-LABEL: combine_nested_undef_test17:
 919 ; SSE2:       # %bb.0:
 920 ; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[1,0]
 921 ; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,1],xmm1[0,2]
 922 ; SSE2-NEXT:    retq
 923 ;
 924 ; SSSE3-LABEL: combine_nested_undef_test17:
 925 ; SSSE3:       # %bb.0:
 926 ; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[1,0]
 927 ; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,1],xmm1[0,2]
 928 ; SSSE3-NEXT:    retq
 929 ;
 930 ; SSE41-LABEL: combine_nested_undef_test17:
 931 ; SSE41:       # %bb.0:
 932 ; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5,6,7]
 933 ; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,1,0,1]
 934 ; SSE41-NEXT:    retq
 935 ;
 936 ; AVX-LABEL: combine_nested_undef_test17:
 937 ; AVX:       # %bb.0:
 938 ; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
 939 ; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[3,1,0,1]
 940 ; AVX-NEXT:    retq
     ; Inner shuffle builds %1 = [B0, A1, A3, A1]. The outer mask <2,1,0,3> yields
     ; [A3, A1, B0, A1]; the B0 lane is still read, so both inputs stay live and
     ; the expected code is a two-instruction sequence.
 941   %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 1, i32 3, i32 1>
 942   %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 1, i32 0, i32 3>
 943   ret <4 x i32> %2
 944 }
 945
 946 define <4 x i32> @combine_nested_undef_test18(<4 x i32> %A, <4 x i32> %B) {
 947 ; SSE-LABEL: combine_nested_undef_test18:
 948 ; SSE:       # %bb.0:
 949 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,0,3]
 950 ; SSE-NEXT:    retq
 951 ;
 952 ; AVX-LABEL: combine_nested_undef_test18:
 953 ; AVX:       # %bb.0:
 954 ; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm1[1,1,0,3]
 955 ; AVX-NEXT:    retq
     ; Inner shuffle builds %1 = [B0, B1, A2, B3]. The outer mask <1,1,0,3> yields
     ; [B1, B1, B0, B3] and never reads the A2 lane, so only %B is live. Unlike
     ; its neighbours in this group, the expected output here is a single shuffle
     ; of %B.
 956   %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 5, i32 2, i32 7>
 957   %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 0, i32 3>
 958   ret <4 x i32> %2
 959 }
 960
 961 define <4 x i32> @combine_nested_undef_test19(<4 x i32> %A, <4 x i32> %B) {
 962 ; SSE2-LABEL: combine_nested_undef_test19:
 963 ; SSE2:       # %bb.0:
 964 ; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
 965 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,0,0,0]
 966 ; SSE2-NEXT:    retq
 967 ;
 968 ; SSSE3-LABEL: combine_nested_undef_test19:
 969 ; SSSE3:       # %bb.0:
 970 ; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
 971 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,0,0,0]
 972 ; SSSE3-NEXT:    retq
 973 ;
 974 ; SSE41-LABEL: combine_nested_undef_test19:
 975 ; SSE41:       # %bb.0:
 976 ; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
 977 ; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,0,0,0]
 978 ; SSE41-NEXT:    retq
 979 ;
 980 ; AVX-LABEL: combine_nested_undef_test19:
 981 ; AVX:       # %bb.0:
 982 ; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
 983 ; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,0,0,0]
 984 ; AVX-NEXT:    retq
     ; Inner shuffle builds %1 = [A0, B0, B1, B2]. The outer mask <2,0,0,0> yields
     ; [B1, A0, A0, A0], which needs one lane of %B and one lane of %A, so a
     ; combine-then-shuffle pair of instructions is expected.
 985   %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 4, i32 5, i32 6>
 986   %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 0, i32 0, i32 0>
 987   ret <4 x i32> %2
 988 }
 989
 990 define <4 x i32> @combine_nested_undef_test20(<4 x i32> %A, <4 x i32> %B) {
 991 ; SSE2-LABEL: combine_nested_undef_test20:
 992 ; SSE2:       # %bb.0:
 993 ; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,3]
 994 ; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2,3,1]
 995 ; SSE2-NEXT:    movaps %xmm1, %xmm0
 996 ; SSE2-NEXT:    retq
 997 ;
 998 ; SSSE3-LABEL: combine_nested_undef_test20:
 999 ; SSSE3:       # %bb.0:
1000 ; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,3]
1001 ; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2,3,1]
1002 ; SSSE3-NEXT:    movaps %xmm1, %xmm0
1003 ; SSSE3-NEXT:    retq
1004 ;
1005 ; SSE41-LABEL: combine_nested_undef_test20:
1006 ; SSE41:       # %bb.0:
1007 ; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
1008 ; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,3,0]
1009 ; SSE41-NEXT:    retq
1010 ;
1011 ; AVX-LABEL: combine_nested_undef_test20:
1012 ; AVX:       # %bb.0:
1013 ; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
1014 ; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2,3,0]
1015 ; AVX-NEXT:    retq
     ; Inner shuffle builds %1 = [A3, A2, B0, B0]. The outer mask <2,1,0,3> yields
     ; [B0, A2, A3, B0], which reads lanes of both %A and %B, so more than one
     ; instruction is expected on every target.
1016   %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 3, i32 2, i32 4, i32 4>
1017   %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 1, i32 0, i32 3>
1018   ret <4 x i32> %2
1019 }
1020
1021 define <4 x i32> @combine_nested_undef_test21(<4 x i32> %A, <4 x i32> %B) {
1022 ; SSE2-LABEL: combine_nested_undef_test21:
1023 ; SSE2:       # %bb.0:
1024 ; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
1025 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,3,0,3]
1026 ; SSE2-NEXT:    retq
1027 ;
1028 ; SSSE3-LABEL: combine_nested_undef_test21:
1029 ; SSSE3:       # %bb.0:
1030 ; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
1031 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,3,0,3]
1032 ; SSSE3-NEXT:    retq
1033 ;
1034 ; SSE41-LABEL: combine_nested_undef_test21:
1035 ; SSE41:       # %bb.0:
1036 ; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7]
1037 ; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
1038 ; SSE41-NEXT:    retq
1039 ;
1040 ; AVX1-LABEL: combine_nested_undef_test21:
1041 ; AVX1:       # %bb.0:
1042 ; AVX1-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
1043 ; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,1,0,1]
1044 ; AVX1-NEXT:    retq
1045 ;
1046 ; AVX2-LABEL: combine_nested_undef_test21:
1047 ; AVX2:       # %bb.0:
1048 ; AVX2-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
1049 ; AVX2-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
1050 ; AVX2-NEXT:    retq
     ; Inner shuffle builds %1 = [B0, A1, A3, A1]. The outer mask <0,1,0,3> yields
     ; [B0, A1, B0, A1]: the low pair (B0, A1) repeated twice. Both inputs are
     ; live, so the expected code merges them first and then duplicates the pair.
1051   %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 1, i32 3, i32 1>
1052   %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 3>
1053   ret <4 x i32> %2
1054 }
1055
1056
1057 ; Test that we correctly combine shuffles according to rule
1058 ;  shuffle(shuffle(x, y), undef) -> shuffle(y, undef)
1059
1060 define <4 x i32> @combine_nested_undef_test22(<4 x i32> %A, <4 x i32> %B) {
1061 ; SSE-LABEL: combine_nested_undef_test22:
1062 ; SSE:       # %bb.0:
1063 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,3]
1064 ; SSE-NEXT:    retq
1065 ;
1066 ; AVX-LABEL: combine_nested_undef_test22:
1067 ; AVX:       # %bb.0:
1068 ; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm1[1,1,1,3]
1069 ; AVX-NEXT:    retq
     ; Inner shuffle builds %1 = [B0, B1, A2, B3]. The outer mask <1,1,1,3> yields
     ; [B1, B1, B1, B3] and skips the A2 lane, so the pair is expected to fold
     ; into one shuffle of the second inner operand, %B.
1070   %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 5, i32 2, i32 7>
1071   %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 3>
1072   ret <4 x i32> %2
1073 }
1074
1075 define <4 x i32> @combine_nested_undef_test23(<4 x i32> %A, <4 x i32> %B) {
1076 ; SSE-LABEL: combine_nested_undef_test23:
1077 ; SSE:       # %bb.0:
1078 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,1,0,3]
1079 ; SSE-NEXT:    retq
1080 ;
1081 ; AVX-LABEL: combine_nested_undef_test23:
1082 ; AVX:       # %bb.0:
1083 ; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm1[0,1,0,3]
1084 ; AVX-NEXT:    retq
     ; Inner shuffle builds %1 = [B0, B1, A2, B3]. The outer mask <0,1,0,3> yields
     ; [B0, B1, B0, B3] and skips the A2 lane, so the pair is expected to fold
     ; into one shuffle of %B.
1085   %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 5, i32 2, i32 7>
1086   %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 3>
1087   ret <4 x i32> %2
1088 }
1089
1090 define <4 x i32> @combine_nested_undef_test24(<4 x i32> %A, <4 x i32> %B) {
1091 ; SSE-LABEL: combine_nested_undef_test24:
1092 ; SSE:       # %bb.0:
1093 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,3,2,3]
1094 ; SSE-NEXT:    retq
1095 ;
1096 ; AVX-LABEL: combine_nested_undef_test24:
1097 ; AVX:       # %bb.0:
1098 ; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm1[0,3,2,3]
1099 ; AVX-NEXT:    retq
     ; Inner shuffle builds %1 = [B0, A1, B2, B3]. The outer mask <0,3,2,4> yields
     ; [B0, B3, B2, undef]; the A1 lane is never read and the last lane is free,
     ; so the pair is expected to fold into one shuffle of %B.
1100   %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 1, i32 6, i32 7>
1101   %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 3, i32 2, i32 4>
1102   ret <4 x i32> %2
1103 }
1104
1105 define <4 x i32> @combine_nested_undef_test25(<4 x i32> %A, <4 x i32> %B) {
1106 ; SSE-LABEL: combine_nested_undef_test25:
1107 ; SSE:       # %bb.0:
1108 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
1109 ; SSE-NEXT:    retq
1110 ;
1111 ; AVX1-LABEL: combine_nested_undef_test25:
1112 ; AVX1:       # %bb.0:
1113 ; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,1,0,1]
1114 ; AVX1-NEXT:    retq
1115 ;
1116 ; AVX2-LABEL: combine_nested_undef_test25:
1117 ; AVX2:       # %bb.0:
1118 ; AVX2-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
1119 ; AVX2-NEXT:    retq
     ; Note the swapped operand order: the inner shuffle takes (%B, %A) and builds
     ; %1 = [B1, A1, B2, A0]. The outer mask <3,1,3,1> yields [A0, A1, A0, A1],
     ; i.e. the low half of %A duplicated, so one shuffle of %A is expected.
1120   %1 = shufflevector <4 x i32> %B, <4 x i32> %A, <4 x i32> <i32 1, i32 5, i32 2, i32 4>
1121   %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 3, i32 1, i32 3, i32 1>
1122   ret <4 x i32> %2
1123 }
1124
1125 define <4 x i32> @combine_nested_undef_test26(<4 x i32> %A, <4 x i32> %B) {
1126 ; SSE-LABEL: combine_nested_undef_test26:
1127 ; SSE:       # %bb.0:
1128 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
1129 ; SSE-NEXT:    retq
1130 ;
1131 ; AVX-LABEL: combine_nested_undef_test26:
1132 ; AVX:       # %bb.0:
1133 ; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3]
1134 ; AVX-NEXT:    retq
     ; Inner shuffle takes (%B, %A) and builds %1 = [B1, B2, A2, A3]. The outer
     ; mask <2,3,2,3> yields [A2, A3, A2, A3], i.e. the high half of %A
     ; duplicated, so one shuffle of %A is expected.
1135   %1 = shufflevector <4 x i32> %B, <4 x i32> %A, <4 x i32> <i32 1, i32 2, i32 6, i32 7>
1136   %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 2, i32 3>
1137   ret <4 x i32> %2
1138 }
1139
1140 define <4 x i32> @combine_nested_undef_test27(<4 x i32> %A, <4 x i32> %B) {
1141 ; SSE-LABEL: combine_nested_undef_test27:
1142 ; SSE:       # %bb.0:
1143 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
1144 ; SSE-NEXT:    retq
1145 ;
1146 ; AVX1-LABEL: combine_nested_undef_test27:
1147 ; AVX1:       # %bb.0:
1148 ; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,1,0,1]
1149 ; AVX1-NEXT:    retq
1150 ;
1151 ; AVX2-LABEL: combine_nested_undef_test27:
1152 ; AVX2:       # %bb.0:
1153 ; AVX2-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
1154 ; AVX2-NEXT:    retq
     ; Inner shuffle takes (%B, %A) and builds %1 = [B2, B1, A1, A0]. The outer
     ; mask <3,2,3,2> yields [A0, A1, A0, A1], i.e. the low half of %A
     ; duplicated, so one shuffle of %A is expected.
1155   %1 = shufflevector <4 x i32> %B, <4 x i32> %A, <4 x i32> <i32 2, i32 1, i32 5, i32 4>
1156   %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 3, i32 2>
1157   ret <4 x i32> %2
1158 }
1159
1160 define <4 x i32> @combine_nested_undef_test28(<4 x i32> %A, <4 x i32> %B) {
1161 ; SSE-LABEL: combine_nested_undef_test28:
1162 ; SSE:       # %bb.0:
1163 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,1,0]
1164 ; SSE-NEXT:    retq
1165 ;
1166 ; AVX-LABEL: combine_nested_undef_test28:
1167 ; AVX:       # %bb.0:
1168 ; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,1,1,0]
1169 ; AVX-NEXT:    retq
     ; Inner shuffle takes (%B, %A) and builds %1 = [B1, B2, A0, A1]. The outer
     ; mask <2,3,3,2> yields [A0, A1, A1, A0], which reads only %A lanes, so one
     ; shuffle of %A is expected.
1170   %1 = shufflevector <4 x i32> %B, <4 x i32> %A, <4 x i32> <i32 1, i32 2, i32 4, i32 5>
1171   %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 3, i32 2>
1172   ret <4 x i32> %2
1173 }
1174
1175 define <4 x float> @combine_test1(<4 x float> %a, <4 x float> %b) {
1176 ; SSE-LABEL: combine_test1:
1177 ; SSE:       # %bb.0:
1178 ; SSE-NEXT:    movaps %xmm1, %xmm0
1179 ; SSE-NEXT:    retq
1180 ;
1181 ; AVX-LABEL: combine_test1:
1182 ; AVX:       # %bb.0:
1183 ; AVX-NEXT:    vmovaps %xmm1, %xmm0
1184 ; AVX-NEXT:    retq
     ; Two complementary blends. The first builds %1 = [b0, a1, b2, a3]; the
     ; second keeps lanes 0 and 2 of %1 and takes lanes 1 and 3 from %b, giving
     ; [b0, b1, b2, b3]. The result is exactly %b, so only a register copy is
     ; expected.
1185   %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
1186   %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
1187   ret <4 x float> %2
1188 }
1189
1190 define <4 x float> @combine_test2(<4 x float> %a, <4 x float> %b) {
1191 ; SSE2-LABEL: combine_test2:
1192 ; SSE2:       # %bb.0:
1193 ; SSE2-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
1194 ; SSE2-NEXT:    movaps %xmm1, %xmm0
1195 ; SSE2-NEXT:    retq
1196 ;
1197 ; SSSE3-LABEL: combine_test2:
1198 ; SSSE3:       # %bb.0:
1199 ; SSSE3-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
1200 ; SSSE3-NEXT:    movaps %xmm1, %xmm0
1201 ; SSSE3-NEXT:    retq
1202 ;
1203 ; SSE41-LABEL: combine_test2:
1204 ; SSE41:       # %bb.0:
1205 ; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
1206 ; SSE41-NEXT:    retq
1207 ;
1208 ; AVX-LABEL: combine_test2:
1209 ; AVX:       # %bb.0:
1210 ; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
1211 ; AVX-NEXT:    retq
     ; The first blend builds %1 = [a0, b1, a2, b3]; the second replaces lane 2
     ; with b2, giving [a0, b1, b2, b3]. That is %b with only lane 0 taken from
     ; %a, so a single low-element insert or blend is expected.
1212   %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
1213   %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 3>
1214   ret <4 x float> %2
1215 }
1216
1217 define <4 x float> @combine_test3(<4 x float> %a, <4 x float> %b) {
1218 ; SSE-LABEL: combine_test3:
1219 ; SSE:       # %bb.0:
1220 ; SSE-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1221 ; SSE-NEXT:    retq
1222 ;
1223 ; AVX-LABEL: combine_test3:
1224 ; AVX:       # %bb.0:
1225 ; AVX-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1226 ; AVX-NEXT:    retq
     ; The first shuffle builds %1 = [a0, b1, a1, b3]; the second picks lanes 0
     ; and 2 of %1 followed by b0 and lane 1 of %1, giving [a0, a1, b0, b1]. That
     ; is the low half of %a followed by the low half of %b.
1227   %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 1, i32 7>
1228   %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 1>
1229   ret <4 x float> %2
1230 }
1231
1232 define <4 x float> @combine_test4(<4 x float> %a, <4 x float> %b) {
1233 ; SSE-LABEL: combine_test4:
1234 ; SSE:       # %bb.0:
1235 ; SSE-NEXT:    movhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1]
1236 ; SSE-NEXT:    retq
1237 ;
1238 ; AVX-LABEL: combine_test4:
1239 ; AVX:       # %bb.0:
1240 ; AVX-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
1241 ; AVX-NEXT:    retq
     ; The first shuffle builds %1 = [a2, a3, b1, b1]; the second takes b2, b3
     ; from %b and then lanes 0 and 1 of %1, giving [b2, b3, a2, a3]. That is the
     ; high half of %b followed by the high half of %a.
1242   %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 2, i32 3, i32 5, i32 5>
1243   %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
1244   ret <4 x float> %2
1245 }
1246
1247 define <4 x float> @combine_test5(<4 x float> %a, <4 x float> %b) {
1248 ; SSE2-LABEL: combine_test5:
1249 ; SSE2:       # %bb.0:
1250 ; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0]
1251 ; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
1252 ; SSE2-NEXT:    retq
1253 ;
1254 ; SSSE3-LABEL: combine_test5:
1255 ; SSSE3:       # %bb.0:
1256 ; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0]
1257 ; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
1258 ; SSSE3-NEXT:    retq
1259 ;
1260 ; SSE41-LABEL: combine_test5:
1261 ; SSE41:       # %bb.0:
1262 ; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
1263 ; SSE41-NEXT:    retq
1264 ;
1265 ; AVX-LABEL: combine_test5:
1266 ; AVX:       # %bb.0:
1267 ; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
1268 ; AVX-NEXT:    retq
     ; The first blend builds %1 = [b0, a1, b2, a3]; the second replaces lane 3
     ; with b3, giving [b0, a1, b2, b3]. That is %b with only lane 1 taken from
     ; %a: one blend where a blend instruction is available, two shufps otherwise.
1269   %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
1270   %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
1271   ret <4 x float> %2
1272 }
1273
1274 define <4 x i32> @combine_test6(<4 x i32> %a, <4 x i32> %b) {
1275 ; SSE-LABEL: combine_test6:
1276 ; SSE:       # %bb.0:
1277 ; SSE-NEXT:    movaps %xmm1, %xmm0
1278 ; SSE-NEXT:    retq
1279 ;
1280 ; AVX-LABEL: combine_test6:
1281 ; AVX:       # %bb.0:
1282 ; AVX-NEXT:    vmovaps %xmm1, %xmm0
1283 ; AVX-NEXT:    retq
     ; Integer variant of the complementary-blend case. The first blend builds
     ; %1 = [b0, a1, b2, a3]; the second keeps lanes 0 and 2 of %1 and takes lanes
     ; 1 and 3 from %b, giving [b0, b1, b2, b3], which is exactly %b.
1284   %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
1285   %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
1286   ret <4 x i32> %2
1287 }
1288
1289 define <4 x i32> @combine_test7(<4 x i32> %a, <4 x i32> %b) {
1290 ; SSE2-LABEL: combine_test7:
1291 ; SSE2:       # %bb.0:
1292 ; SSE2-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
1293 ; SSE2-NEXT:    movaps %xmm1, %xmm0
1294 ; SSE2-NEXT:    retq
1295 ;
1296 ; SSSE3-LABEL: combine_test7:
1297 ; SSSE3:       # %bb.0:
1298 ; SSSE3-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
1299 ; SSSE3-NEXT:    movaps %xmm1, %xmm0
1300 ; SSSE3-NEXT:    retq
1301 ;
1302 ; SSE41-LABEL: combine_test7:
1303 ; SSE41:       # %bb.0:
1304 ; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
1305 ; SSE41-NEXT:    retq
1306 ;
1307 ; AVX-LABEL: combine_test7:
1308 ; AVX:       # %bb.0:
1309 ; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
1310 ; AVX-NEXT:    retq
     ; Integer variant. The first blend builds %1 = [a0, b1, a2, b3]; the second
     ; replaces lane 2 with b2, giving [a0, b1, b2, b3], i.e. %b with only lane 0
     ; taken from %a.
1311   %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
1312   %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 3>
1313   ret <4 x i32> %2
1314 }
1315
1316 define <4 x i32> @combine_test8(<4 x i32> %a, <4 x i32> %b) {
1317 ; SSE-LABEL: combine_test8:
1318 ; SSE:       # %bb.0:
1319 ; SSE-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1320 ; SSE-NEXT:    retq
1321 ;
1322 ; AVX-LABEL: combine_test8:
1323 ; AVX:       # %bb.0:
1324 ; AVX-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1325 ; AVX-NEXT:    retq
     ; Integer variant. The first shuffle builds %1 = [a0, b1, a1, b3]; the second
     ; picks lanes 0 and 2 of %1, then b0, then lane 1 of %1, giving
     ; [a0, a1, b0, b1], i.e. the low half of %a followed by the low half of %b.
1326   %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 5, i32 1, i32 7>
1327   %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 1>
1328   ret <4 x i32> %2
1329 }
1330
1331 define <4 x i32> @combine_test9(<4 x i32> %a, <4 x i32> %b) {
1332 ; SSE-LABEL: combine_test9:
1333 ; SSE:       # %bb.0:
1334 ; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
1335 ; SSE-NEXT:    movaps %xmm1, %xmm0
1336 ; SSE-NEXT:    retq
1337 ;
1338 ; AVX-LABEL: combine_test9:
1339 ; AVX:       # %bb.0:
1340 ; AVX-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
1341 ; AVX-NEXT:    retq
     ; Integer variant. The first shuffle builds %1 = [a2, a3, b1, b1]; the second
     ; takes b2, b3 from %b and then lanes 0 and 1 of %1, giving [b2, b3, a2, a3],
     ; i.e. the high half of %b followed by the high half of %a.
1342   %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 3, i32 5, i32 5>
1343   %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
1344   ret <4 x i32> %2
1345 }
1346
1347 define <4 x i32> @combine_test10(<4 x i32> %a, <4 x i32> %b) {
1348 ; SSE2-LABEL: combine_test10:
1349 ; SSE2:       # %bb.0:
1350 ; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0]
1351 ; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
1352 ; SSE2-NEXT:    retq
1353 ;
1354 ; SSSE3-LABEL: combine_test10:
1355 ; SSSE3:       # %bb.0:
1356 ; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0]
1357 ; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
1358 ; SSSE3-NEXT:    retq
1359 ;
1360 ; SSE41-LABEL: combine_test10:
1361 ; SSE41:       # %bb.0:
1362 ; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
1363 ; SSE41-NEXT:    retq
1364 ;
1365 ; AVX-LABEL: combine_test10:
1366 ; AVX:       # %bb.0:
1367 ; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
1368 ; AVX-NEXT:    retq
     ; Integer variant. The first blend builds %1 = [b0, a1, b2, a3]; the second
     ; replaces lane 3 with b3, giving [b0, a1, b2, b3], i.e. %b with only lane 1
     ; taken from %a.
1369   %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
1370   %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
1371   ret <4 x i32> %2
1372 }
1373
1374 define <4 x float> @combine_test11(<4 x float> %a, <4 x float> %b) {
1375 ; CHECK-LABEL: combine_test11:
1376 ; CHECK:       # %bb.0:
1377 ; CHECK-NEXT:    retq
     ; Here the outer shuffle's second operand is %a rather than %b. The first
     ; blend builds %1 = [b0, a1, b2, a3]; the second takes lanes 0 and 2 from %a
     ; and keeps lanes 1 and 3 of %1, giving [a0, a1, a2, a3]. The result is %a
     ; itself, so no instruction besides the return is expected.
1378   %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
1379   %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
1380   ret <4 x float> %2
1381 }
1382
1383 define <4 x float> @combine_test12(<4 x float> %a, <4 x float> %b) {
1384 ; SSE2-LABEL: combine_test12:
1385 ; SSE2:       # %bb.0:
1386 ; SSE2-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
1387 ; SSE2-NEXT:    movaps %xmm1, %xmm0
1388 ; SSE2-NEXT:    retq
1389 ;
1390 ; SSSE3-LABEL: combine_test12:
1391 ; SSSE3:       # %bb.0:
1392 ; SSSE3-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
1393 ; SSSE3-NEXT:    movaps %xmm1, %xmm0
1394 ; SSSE3-NEXT:    retq
1395 ;
1396 ; SSE41-LABEL: combine_test12:
1397 ; SSE41:       # %bb.0:
1398 ; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
1399 ; SSE41-NEXT:    retq
1400 ;
1401 ; AVX-LABEL: combine_test12:
1402 ; AVX:       # %bb.0:
1403 ; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
1404 ; AVX-NEXT:    retq
     ; The first blend builds %1 = [a0, b1, b2, b3]. The second re-inserts a0 into
     ; lane 0 from %a, which is the value already there, so it is redundant. The
     ; result stays [a0, b1, b2, b3] and a single insert or blend is expected.
1405   %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
1406   %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
1407   ret <4 x float> %2
1408 }
1409
1410 define <4 x float> @combine_test13(<4 x float> %a, <4 x float> %b) {
1411 ; SSE-LABEL: combine_test13:
1412 ; SSE:       # %bb.0:
1413 ; SSE-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1414 ; SSE-NEXT:    retq
1415 ;
1416 ; AVX-LABEL: combine_test13:
1417 ; AVX:       # %bb.0:
1418 ; AVX-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1419 ; AVX-NEXT:    retq
     ; The first shuffle builds %1 = [a0, a1, b0, b1]. The second overwrites lanes
     ; 0 and 1 with a0, a1 from %a, which are already there, so the result stays
     ; [a0, a1, b0, b1]: low half of %a followed by low half of %b.
1420   %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
1421   %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
1422   ret <4 x float> %2
1423 }
1424
1425 define <4 x float> @combine_test14(<4 x float> %a, <4 x float> %b) {
1426 ; SSE-LABEL: combine_test14:
1427 ; SSE:       # %bb.0:
1428 ; SSE-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
1429 ; SSE-NEXT:    retq
1430 ;
1431 ; AVX-LABEL: combine_test14:
1432 ; AVX:       # %bb.0:
1433 ; AVX-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
1434 ; AVX-NEXT:    retq
     ; The first shuffle builds %1 = [b2, b3, b1, b1]. The second takes a2, a3
     ; from %a and then lanes 0 and 1 of %1, giving [a2, a3, b2, b3]: the high
     ; half of %a followed by the high half of %b.
1435   %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 6, i32 7, i32 5, i32 5>
1436   %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
1437   ret <4 x float> %2
1438 }
1439
1440 define <4 x float> @combine_test15(<4 x float> %a, <4 x float> %b) {
1441 ; SSE2-LABEL: combine_test15:
1442 ; SSE2:       # %bb.0:
1443 ; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0]
1444 ; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
1445 ; SSE2-NEXT:    retq
1446 ;
1447 ; SSSE3-LABEL: combine_test15:
1448 ; SSSE3:       # %bb.0:
1449 ; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0]
1450 ; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
1451 ; SSSE3-NEXT:    retq
1452 ;
1453 ; SSE41-LABEL: combine_test15:
1454 ; SSE41:       # %bb.0:
1455 ; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
1456 ; SSE41-NEXT:    retq
1457 ;
1458 ; AVX-LABEL: combine_test15:
1459 ; AVX:       # %bb.0:
1460 ; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
1461 ; AVX-NEXT:    retq
     ; The first blend builds %1 = [b0, a1, b2, b3]. The second re-inserts a1 into
     ; lane 1 from %a, which is already there, so the result stays
     ; [b0, a1, b2, b3]: %b with only lane 1 taken from %a.
1462   %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 7>
1463   %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 0, i32 5, i32 2, i32 3>
1464   ret <4 x float> %2
1465 }
1466
1467 define <4 x i32> @combine_test16(<4 x i32> %a, <4 x i32> %b) {
1468 ; CHECK-LABEL: combine_test16:
1469 ; CHECK:       # %bb.0:
1470 ; CHECK-NEXT:    retq
     ; Integer variant. The first blend builds %1 = [b0, a1, b2, a3]; the second
     ; takes lanes 0 and 2 from %a and keeps lanes 1 and 3 of %1, giving
     ; [a0, a1, a2, a3]. The result is %a itself, so only the return is expected.
1471   %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
1472   %2 = shufflevector <4 x i32> %1, <4 x i32> %a, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
1473   ret <4 x i32> %2
1474 }
1475
1476 define <4 x i32> @combine_test17(<4 x i32> %a, <4 x i32> %b) {
1477 ; SSE2-LABEL: combine_test17:
1478 ; SSE2:       # %bb.0:
1479 ; SSE2-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
1480 ; SSE2-NEXT:    movaps %xmm1, %xmm0
1481 ; SSE2-NEXT:    retq
1482 ;
1483 ; SSSE3-LABEL: combine_test17:
1484 ; SSSE3:       # %bb.0:
1485 ; SSSE3-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
1486 ; SSSE3-NEXT:    movaps %xmm1, %xmm0
1487 ; SSSE3-NEXT:    retq
1488 ;
1489 ; SSE41-LABEL: combine_test17:
1490 ; SSE41:       # %bb.0:
1491 ; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
1492 ; SSE41-NEXT:    retq
1493 ;
1494 ; AVX-LABEL: combine_test17:
1495 ; AVX:       # %bb.0:
1496 ; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
1497 ; AVX-NEXT:    retq
     ; Integer variant. The first blend builds %1 = [a0, b1, b2, b3]; the second
     ; re-inserts a0 into lane 0 from %a, which is already there, so the result
     ; stays [a0, b1, b2, b3].
1498   %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
1499   %2 = shufflevector <4 x i32> %1, <4 x i32> %a, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
1500   ret <4 x i32> %2
1501 }
1502
1503 define <4 x i32> @combine_test18(<4 x i32> %a, <4 x i32> %b) {
1504 ; SSE-LABEL: combine_test18:
1505 ; SSE:       # %bb.0:
1506 ; SSE-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1507 ; SSE-NEXT:    retq
1508 ;
1509 ; AVX-LABEL: combine_test18:
1510 ; AVX:       # %bb.0:
1511 ; AVX-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1512 ; AVX-NEXT:    retq
     ; Integer variant. The first shuffle builds %1 = [a0, a1, b0, b1]; the second
     ; overwrites lanes 0 and 1 with a0, a1 from %a, which are already there, so
     ; the result stays [a0, a1, b0, b1].
1513   %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
1514   %2 = shufflevector <4 x i32> %1, <4 x i32> %a, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
1515   ret <4 x i32> %2
1516 }
1517
1518 define <4 x i32> @combine_test19(<4 x i32> %a, <4 x i32> %b) {
1519 ; SSE-LABEL: combine_test19:
1520 ; SSE:       # %bb.0:
1521 ; SSE-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
1522 ; SSE-NEXT:    retq
1523 ;
1524 ; AVX-LABEL: combine_test19:
1525 ; AVX:       # %bb.0:
1526 ; AVX-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
1527 ; AVX-NEXT:    retq
     ; Integer variant. The first shuffle builds %1 = [b2, b3, b1, b1]; the second
     ; takes a2, a3 from %a and then lanes 0 and 1 of %1, giving [a2, a3, b2, b3],
     ; i.e. the high half of %a followed by the high half of %b.
1528   %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 6, i32 7, i32 5, i32 5>
1529   %2 = shufflevector <4 x i32> %1, <4 x i32> %a, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
1530   ret <4 x i32> %2
1531 }
1532
1533 define <4 x i32> @combine_test20(<4 x i32> %a, <4 x i32> %b) {
1534 ; SSE2-LABEL: combine_test20:
1535 ; SSE2:       # %bb.0:
1536 ; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0]
1537 ; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
1538 ; SSE2-NEXT:    retq
1539 ;
1540 ; SSSE3-LABEL: combine_test20:
1541 ; SSSE3:       # %bb.0:
1542 ; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0]
1543 ; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
1544 ; SSSE3-NEXT:    retq
1545 ;
1546 ; SSE41-LABEL: combine_test20:
1547 ; SSE41:       # %bb.0:
1548 ; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
1549 ; SSE41-NEXT:    retq
1550 ;
1551 ; AVX-LABEL: combine_test20:
1552 ; AVX:       # %bb.0:
1553 ; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
1554 ; AVX-NEXT:    retq
     ; Integer variant. The first blend builds %1 = [b0, a1, b2, b3]; the second
     ; re-inserts a1 into lane 1 from %a, which is already there, so the result
     ; stays [b0, a1, b2, b3].
1555   %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 7>
1556   %2 = shufflevector <4 x i32> %1, <4 x i32> %a, <4 x i32> <i32 0, i32 5, i32 2, i32 3>
1557   ret <4 x i32> %2
1558 }
1559
; Splits one <8 x i32> into two <4 x i32> results: a[0,1,4,5] is stored to
; %ptr and a[2,3,6,7] is returned. Both shuffles use %a for both operands and
; only reference indices below 8, i.e. only the first operand.
1560 define <4 x i32> @combine_test21(<8 x i32> %a, ptr %ptr) {
1561 ; SSE-LABEL: combine_test21:
1562 ; SSE:       # %bb.0:
1563 ; SSE-NEXT:    movaps %xmm0, %xmm2
1564 ; SSE-NEXT:    movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0]
1565 ; SSE-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
1566 ; SSE-NEXT:    movaps %xmm2, (%rdi)
1567 ; SSE-NEXT:    retq
1568 ;
1569 ; AVX1-LABEL: combine_test21:
1570 ; AVX1:       # %bb.0:
1571 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
1572 ; AVX1-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm0[0],xmm1[0]
1573 ; AVX1-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
1574 ; AVX1-NEXT:    vmovaps %xmm2, (%rdi)
1575 ; AVX1-NEXT:    vzeroupper
1576 ; AVX1-NEXT:    retq
1577 ;
1578 ; AVX2-LABEL: combine_test21:
1579 ; AVX2:       # %bb.0:
1580 ; AVX2-NEXT:    vpermpd {{.*#+}} ymm1 = ymm0[1,3,2,3]
1581 ; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3]
1582 ; AVX2-NEXT:    vmovaps %xmm0, (%rdi)
1583 ; AVX2-NEXT:    vmovaps %xmm1, %xmm0
1584 ; AVX2-NEXT:    vzeroupper
1585 ; AVX2-NEXT:    retq
1586   %1 = shufflevector <8 x i32> %a, <8 x i32> %a, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
1587   %2 = shufflevector <8 x i32> %a, <8 x i32> %a, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
1588   store <4 x i32> %1, ptr %ptr, align 16
1589   ret <4 x i32> %2
1590 }
1591
; Concatenates two loaded <2 x float> values into the low four lanes of an
; <8 x float>; the upper four lanes are undef. Mask indices 2 and 3 select
; elements 0 and 1 of the second operand because each input has two elements.
1592 define <8 x float> @combine_test22(ptr %a, ptr %b) {
1593 ; SSE-LABEL: combine_test22:
1594 ; SSE:       # %bb.0:
1595 ; SSE-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
1596 ; SSE-NEXT:    movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
1597 ; SSE-NEXT:    retq
1598 ;
1599 ; AVX-LABEL: combine_test22:
1600 ; AVX:       # %bb.0:
1601 ; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
1602 ; AVX-NEXT:    vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
1603 ; AVX-NEXT:    retq
1604 ; Current AVX2 lowering of this is still awful, not adding a test case.
; NOTE(review): the RUN lines at the top of the file also pass the shared AVX
; prefix to the avx2 runs, so the assertions above appear to cover AVX2
; already; the remark above may be stale -- confirm before relying on it.
1605   %1 = load <2 x float>, ptr %a, align 8
1606   %2 = load <2 x float>, ptr %b, align 8
1607   %3 = shufflevector <2 x float> %1, <2 x float> %2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
1608   ret <8 x float> %3
1609 }
1610
1611 ; PR22359
; Extracts v[0,1] and v[2,3] as two <2 x float> values and stores them to
; consecutive <2 x float> slots at %ptr (8-byte aligned). The expected output
; is a single unaligned 16-byte store of xmm0 covering both halves.
1612 define void @combine_test23(<8 x float> %v, ptr %ptr) {
1613 ; SSE-LABEL: combine_test23:
1614 ; SSE:       # %bb.0:
1615 ; SSE-NEXT:    movups %xmm0, (%rdi)
1616 ; SSE-NEXT:    retq
1617 ;
1618 ; AVX-LABEL: combine_test23:
1619 ; AVX:       # %bb.0:
1620 ; AVX-NEXT:    vmovups %xmm0, (%rdi)
1621 ; AVX-NEXT:    vzeroupper
1622 ; AVX-NEXT:    retq
1623   %idx2 = getelementptr inbounds <2 x float>, ptr %ptr, i64 1
1624   %shuffle0 = shufflevector <8 x float> %v, <8 x float> undef, <2 x i32> <i32 0, i32 1>
1625   %shuffle1 = shufflevector <8 x float> %v, <8 x float> undef, <2 x i32> <i32 2, i32 3>
1626   store <2 x float> %shuffle0, ptr %ptr, align 8
1627   store <2 x float> %shuffle1, ptr %idx2, align 8
1628   ret void
1629 }
1630
1631 ; Check some negative cases.
1632 ; FIXME: Do any of these really make sense? Are they redundant with the above tests?
1633
; %1 = <b0, a1, b2, a3>; the second mask <0, 5, 2, 0> yields <b0, b1, b2, b0>,
; so the result depends only on %b (and is not a plain blend).
1634 define <4 x float> @combine_test1b(<4 x float> %a, <4 x float> %b) {
1635 ; SSE-LABEL: combine_test1b:
1636 ; SSE:       # %bb.0:
1637 ; SSE-NEXT:    movaps %xmm1, %xmm0
1638 ; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
1639 ; SSE-NEXT:    retq
1640 ;
1641 ; AVX-LABEL: combine_test1b:
1642 ; AVX:       # %bb.0:
1643 ; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm1[0,1,2,0]
1644 ; AVX-NEXT:    retq
1645   %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
1646   %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 0>
1647   ret <4 x float> %2
1648 }
1649
; %1 = <b0, a1, b2, a3>; the second mask <0, 5, 0, 5> yields <b0, b1, b0, b1>,
; i.e. the low 64 bits of %b duplicated into both halves.
1650 define <4 x float> @combine_test2b(<4 x float> %a, <4 x float> %b) {
1651 ; SSE2-LABEL: combine_test2b:
1652 ; SSE2:       # %bb.0:
1653 ; SSE2-NEXT:    movaps %xmm1, %xmm0
1654 ; SSE2-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1655 ; SSE2-NEXT:    retq
1656 ;
1657 ; SSSE3-LABEL: combine_test2b:
1658 ; SSSE3:       # %bb.0:
1659 ; SSSE3-NEXT:    movddup {{.*#+}} xmm0 = xmm1[0,0]
1660 ; SSSE3-NEXT:    retq
1661 ;
1662 ; SSE41-LABEL: combine_test2b:
1663 ; SSE41:       # %bb.0:
1664 ; SSE41-NEXT:    movddup {{.*#+}} xmm0 = xmm1[0,0]
1665 ; SSE41-NEXT:    retq
1666 ;
1667 ; AVX-LABEL: combine_test2b:
1668 ; AVX:       # %bb.0:
1669 ; AVX-NEXT:    vmovddup {{.*#+}} xmm0 = xmm1[0,0]
1670 ; AVX-NEXT:    retq
1671   %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
1672   %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 0, i32 5>
1673   ret <4 x float> %2
1674 }
1675
; %1 = <a0, a0, b2, a3>; the second mask <0, 7, 2, 7> yields <a0, b3, b2, b3>.
; The result mixes both inputs with a repeated lane, and every expected
; lowering below uses two shuffle instructions.
1676 define <4 x float> @combine_test3b(<4 x float> %a, <4 x float> %b) {
1677 ; SSE2-LABEL: combine_test3b:
1678 ; SSE2:       # %bb.0:
1679 ; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[3,0]
1680 ; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3]
1681 ; SSE2-NEXT:    retq
1682 ;
1683 ; SSSE3-LABEL: combine_test3b:
1684 ; SSSE3:       # %bb.0:
1685 ; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[3,0]
1686 ; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3]
1687 ; SSSE3-NEXT:    retq
1688 ;
1689 ; SSE41-LABEL: combine_test3b:
1690 ; SSE41:       # %bb.0:
1691 ; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
1692 ; SSE41-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,3,2,3]
1693 ; SSE41-NEXT:    retq
1694 ;
1695 ; AVX-LABEL: combine_test3b:
1696 ; AVX:       # %bb.0:
1697 ; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
1698 ; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,3,2,3]
1699 ; AVX-NEXT:    retq
1700   %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 0, i32 6, i32 3>
1701   %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 7, i32 2, i32 7>
1702   ret <4 x float> %2
1703 }
1704
; %1 = <b0, a1, b2, a3>; the second mask <5, 5, 2, 7> yields <b1, b1, b2, b3>,
; so the result depends only on %b.
1705 define <4 x float> @combine_test4b(<4 x float> %a, <4 x float> %b) {
1706 ; SSE-LABEL: combine_test4b:
1707 ; SSE:       # %bb.0:
1708 ; SSE-NEXT:    movaps %xmm1, %xmm0
1709 ; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[2,3]
1710 ; SSE-NEXT:    retq
1711 ;
1712 ; AVX-LABEL: combine_test4b:
1713 ; AVX:       # %bb.0:
1714 ; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm1[1,1,2,3]
1715 ; AVX-NEXT:    retq
1716   %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
1717   %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 5, i32 5, i32 2, i32 7>
1718   ret <4 x float> %2
1719 }
1720
1721
1722 ; Verify that we correctly fold shuffles even when we use illegal vector types.
1723
; <4 x i8> variant: %1 = <A0, B1, A2, B3>, and the second mask <0, 1, 6, 3>
; yields <A0, B1, B2, B3>, i.e. byte 0 from %A and bytes 1-3 from %B.
1724 define <4 x i8> @combine_test1c(ptr %a, ptr %b) {
1725 ; SSE2-LABEL: combine_test1c:
1726 ; SSE2:       # %bb.0:
1727 ; SSE2-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
1728 ; SSE2-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
1729 ; SSE2-NEXT:    movaps {{.*#+}} xmm0 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
1730 ; SSE2-NEXT:    andps %xmm0, %xmm2
1731 ; SSE2-NEXT:    andnps %xmm1, %xmm0
1732 ; SSE2-NEXT:    orps %xmm2, %xmm0
1733 ; SSE2-NEXT:    retq
1734 ;
1735 ; SSSE3-LABEL: combine_test1c:
1736 ; SSSE3:       # %bb.0:
1737 ; SSSE3-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
1738 ; SSSE3-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1739 ; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
1740 ; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[1,2,4,6,u,u,u,u,u,u,u,u,u,u,u,u]
1741 ; SSSE3-NEXT:    retq
1742 ;
1743 ; SSE41-LABEL: combine_test1c:
1744 ; SSE41:       # %bb.0:
1745 ; SSE41-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
1746 ; SSE41-NEXT:    movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
1747 ; SSE41-NEXT:    movaps {{.*#+}} xmm0 = <0,255,255,255,u,u,u,u,u,u,u,u,u,u,u,u>
1748 ; SSE41-NEXT:    pblendvb %xmm0, %xmm2, %xmm1
1749 ; SSE41-NEXT:    movdqa %xmm1, %xmm0
1750 ; SSE41-NEXT:    retq
1751 ;
1752 ; AVX1-LABEL: combine_test1c:
1753 ; AVX1:       # %bb.0:
1754 ; AVX1-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1755 ; AVX1-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
1756 ; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm2 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255]
1757 ; AVX1-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
1758 ; AVX1-NEXT:    retq
1759 ;
1760 ; AVX2-LABEL: combine_test1c:
1761 ; AVX2:       # %bb.0:
1762 ; AVX2-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1763 ; AVX2-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
1764 ; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255]
1765 ; AVX2-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
1766 ; AVX2-NEXT:    retq
1767   %A = load <4 x i8>, ptr %a
1768   %B = load <4 x i8>, ptr %b
1769   %1 = shufflevector <4 x i8> %A, <4 x i8> %B, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
1770   %2 = shufflevector <4 x i8> %1, <4 x i8> %B, <4 x i32> <i32 0, i32 1, i32 6, i32 3>
1771   ret <4 x i8> %2
1772 }
1773
; <4 x i8> variant: %1 = <A0, B1, A1, B1>, and the second mask <0, 2, 4, 1>
; yields <A0, A1, B0, B1>, i.e. the low two bytes of %A then of %B.
1774 define <4 x i8> @combine_test2c(ptr %a, ptr %b) {
1775 ; SSE-LABEL: combine_test2c:
1776 ; SSE:       # %bb.0:
1777 ; SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1778 ; SSE-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
1779 ; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
1780 ; SSE-NEXT:    retq
1781 ;
1782 ; AVX-LABEL: combine_test2c:
1783 ; AVX:       # %bb.0:
1784 ; AVX-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1785 ; AVX-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
1786 ; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
1787 ; AVX-NEXT:    retq
1788   %A = load <4 x i8>, ptr %a
1789   %B = load <4 x i8>, ptr %b
1790   %1 = shufflevector <4 x i8> %A, <4 x i8> %B, <4 x i32> <i32 0, i32 5, i32 1, i32 5>
1791   %2 = shufflevector <4 x i8> %1, <4 x i8> %B, <4 x i32> <i32 0, i32 2, i32 4, i32 1>
1792   ret <4 x i8> %2
1793 }
1794
; <4 x i8> variant: %1 = <A2, A3, B1, B1>, and the second mask <6, 7, 0, 1>
; yields <B2, B3, A2, A3>, i.e. the high two bytes of %B then of %A.
1795 define <4 x i8> @combine_test3c(ptr %a, ptr %b) {
1796 ; SSE-LABEL: combine_test3c:
1797 ; SSE:       # %bb.0:
1798 ; SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1799 ; SSE-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
1800 ; SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
1801 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
1802 ; SSE-NEXT:    retq
1803 ;
1804 ; AVX-LABEL: combine_test3c:
1805 ; AVX:       # %bb.0:
1806 ; AVX-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1807 ; AVX-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
1808 ; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
1809 ; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
1810 ; AVX-NEXT:    retq
1811   %A = load <4 x i8>, ptr %a
1812   %B = load <4 x i8>, ptr %b
1813   %1 = shufflevector <4 x i8> %A, <4 x i8> %B, <4 x i32> <i32 2, i32 3, i32 5, i32 5>
1814   %2 = shufflevector <4 x i8> %1, <4 x i8> %B, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
1815   ret <4 x i8> %2
1816 }
1817
; <4 x i8> variant: %1 = <B0, A1, B2, A3>, and the second mask <0, 1, 2, 7>
; yields <B0, A1, B2, B3>, i.e. only byte 1 comes from %A.
1818 define <4 x i8> @combine_test4c(ptr %a, ptr %b) {
1819 ; SSE2-LABEL: combine_test4c:
1820 ; SSE2:       # %bb.0:
1821 ; SSE2-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
1822 ; SSE2-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
1823 ; SSE2-NEXT:    movaps {{.*#+}} xmm0 = [255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
1824 ; SSE2-NEXT:    andps %xmm0, %xmm2
1825 ; SSE2-NEXT:    andnps %xmm1, %xmm0
1826 ; SSE2-NEXT:    orps %xmm2, %xmm0
1827 ; SSE2-NEXT:    retq
1828 ;
1829 ; SSSE3-LABEL: combine_test4c:
1830 ; SSSE3:       # %bb.0:
1831 ; SSSE3-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
1832 ; SSSE3-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1833 ; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
1834 ; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,3,4,6,u,u,u,u,u,u,u,u,u,u,u,u]
1835 ; SSSE3-NEXT:    retq
1836 ;
1837 ; SSE41-LABEL: combine_test4c:
1838 ; SSE41:       # %bb.0:
1839 ; SSE41-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
1840 ; SSE41-NEXT:    movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
1841 ; SSE41-NEXT:    movaps {{.*#+}} xmm0 = <255,0,255,255,u,u,u,u,u,u,u,u,u,u,u,u>
1842 ; SSE41-NEXT:    pblendvb %xmm0, %xmm2, %xmm1
1843 ; SSE41-NEXT:    movdqa %xmm1, %xmm0
1844 ; SSE41-NEXT:    retq
1845 ;
1846 ; AVX1-LABEL: combine_test4c:
1847 ; AVX1:       # %bb.0:
1848 ; AVX1-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1849 ; AVX1-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
1850 ; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm2 = [255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255]
1851 ; AVX1-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
1852 ; AVX1-NEXT:    retq
1853 ;
1854 ; AVX2-LABEL: combine_test4c:
1855 ; AVX2:       # %bb.0:
1856 ; AVX2-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1857 ; AVX2-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
1858 ; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255]
1859 ; AVX2-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
1860 ; AVX2-NEXT:    retq
1861   %A = load <4 x i8>, ptr %a
1862   %B = load <4 x i8>, ptr %b
1863   %1 = shufflevector <4 x i8> %A, <4 x i8> %B, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
1864   %2 = shufflevector <4 x i8> %1, <4 x i8> %B, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
1865   ret <4 x i8> %2
1866 }
1867
1868
1869 ; The following test cases are generated from this C++ code
1870 ;
1871 ;__m128 blend_01(__m128 a, __m128 b)
1872 ;{
1873 ;  __m128 s = a;
1874 ;  s = _mm_blend_ps( s, b, 1<<0 );
1875 ;  s = _mm_blend_ps( s, b, 1<<1 );
1876 ;  return s;
1877 ;}
1878 ;
1879 ;__m128 blend_02(__m128 a, __m128 b)
1880 ;{
1881 ;  __m128 s = a;
1882 ;  s = _mm_blend_ps( s, b, 1<<0 );
1883 ;  s = _mm_blend_ps( s, b, 1<<2 );
1884 ;  return s;
1885 ;}
1886 ;
1887 ;__m128 blend_123(__m128 a, __m128 b)
1888 ;{
1889 ;  __m128 s = a;
1890 ;  s = _mm_blend_ps( s, b, 1<<1 );
1891 ;  s = _mm_blend_ps( s, b, 1<<2 );
1892 ;  s = _mm_blend_ps( s, b, 1<<3 );
1893 ;  return s;
1894 ;}
1895
1896 ; Ideally, we should collapse the following shuffles into a single one.
1897
; Two chained blends with %b: %shuffle = <b0, undef, a2, a3>, then lane 1 is
; filled from %b, giving <b0, b1, a2, a3>.
1898 define <4 x float> @combine_blend_01(<4 x float> %a, <4 x float> %b) {
1899 ; SSE2-LABEL: combine_blend_01:
1900 ; SSE2:       # %bb.0:
1901 ; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
1902 ; SSE2-NEXT:    retq
1903 ;
1904 ; SSSE3-LABEL: combine_blend_01:
1905 ; SSSE3:       # %bb.0:
1906 ; SSSE3-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
1907 ; SSSE3-NEXT:    retq
1908 ;
1909 ; SSE41-LABEL: combine_blend_01:
1910 ; SSE41:       # %bb.0:
1911 ; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
1912 ; SSE41-NEXT:    retq
1913 ;
1914 ; AVX-LABEL: combine_blend_01:
1915 ; AVX:       # %bb.0:
1916 ; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
1917 ; AVX-NEXT:    retq
1918   %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 undef, i32 2, i32 3>
1919   %shuffle6 = shufflevector <4 x float> %shuffle, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 3>
1920   ret <4 x float> %shuffle6
1921 }
1922
; Two chained blends with %b: %shuffle = <b0, a1, undef, a3>, then lane 2 is
; filled from %b, giving <b0, a1, b2, a3>.
1923 define <4 x float> @combine_blend_02(<4 x float> %a, <4 x float> %b) {
1924 ; SSE2-LABEL: combine_blend_02:
1925 ; SSE2:       # %bb.0:
1926 ; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[1,3]
1927 ; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2,1,3]
1928 ; SSE2-NEXT:    movaps %xmm1, %xmm0
1929 ; SSE2-NEXT:    retq
1930 ;
1931 ; SSSE3-LABEL: combine_blend_02:
1932 ; SSSE3:       # %bb.0:
1933 ; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[1,3]
1934 ; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2,1,3]
1935 ; SSSE3-NEXT:    movaps %xmm1, %xmm0
1936 ; SSSE3-NEXT:    retq
1937 ;
1938 ; SSE41-LABEL: combine_blend_02:
1939 ; SSE41:       # %bb.0:
1940 ; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3]
1941 ; SSE41-NEXT:    retq
1942 ;
1943 ; AVX-LABEL: combine_blend_02:
1944 ; AVX:       # %bb.0:
1945 ; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3]
1946 ; AVX-NEXT:    retq
1947   %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 undef, i32 3>
1948   %shuffle6 = shufflevector <4 x float> %shuffle, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 3>
1949   ret <4 x float> %shuffle6
1950 }
1951
; Three chained blends with %b, filling lanes 1, 2 and 3 in turn:
; <a0, b1, u, u> -> <a0, b1, b2, u> -> <a0, b1, b2, b3>.
1952 define <4 x float> @combine_blend_123(<4 x float> %a, <4 x float> %b) {
1953 ; SSE2-LABEL: combine_blend_123:
1954 ; SSE2:       # %bb.0:
1955 ; SSE2-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
1956 ; SSE2-NEXT:    movaps %xmm1, %xmm0
1957 ; SSE2-NEXT:    retq
1958 ;
1959 ; SSSE3-LABEL: combine_blend_123:
1960 ; SSSE3:       # %bb.0:
1961 ; SSSE3-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
1962 ; SSSE3-NEXT:    movaps %xmm1, %xmm0
1963 ; SSSE3-NEXT:    retq
1964 ;
1965 ; SSE41-LABEL: combine_blend_123:
1966 ; SSE41:       # %bb.0:
1967 ; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
1968 ; SSE41-NEXT:    retq
1969 ;
1970 ; AVX-LABEL: combine_blend_123:
1971 ; AVX:       # %bb.0:
1972 ; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
1973 ; AVX-NEXT:    retq
1974   %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 undef, i32 undef>
1975   %shuffle6 = shufflevector <4 x float> %shuffle, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 undef>
1976   %shuffle12 = shufflevector <4 x float> %shuffle6, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
1977   ret <4 x float> %shuffle12
1978 }
1979
; %1 = <a2, b3, b1, a3>; the second mask <6, 1, 0, 3> yields <b2, b3, a2, a3>,
; i.e. the high half of %b followed by the high half of %a.
1980 define <4 x i32> @combine_test_movhl_1(<4 x i32> %a, <4 x i32> %b) {
1981 ; SSE-LABEL: combine_test_movhl_1:
1982 ; SSE:       # %bb.0:
1983 ; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
1984 ; SSE-NEXT:    movaps %xmm1, %xmm0
1985 ; SSE-NEXT:    retq
1986 ;
1987 ; AVX-LABEL: combine_test_movhl_1:
1988 ; AVX:       # %bb.0:
1989 ; AVX-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
1990 ; AVX-NEXT:    retq
1991   %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 7, i32 5, i32 3>
1992   %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 6, i32 1, i32 0, i32 3>
1993   ret <4 x i32> %2
1994 }
1995
; %1 = <a2, a0, a3, b2>; the second mask <3, 7, 0, 2> yields <b2, b3, a2, a3>,
; i.e. the high half of %b followed by the high half of %a.
1996 define <4 x i32> @combine_test_movhl_2(<4 x i32> %a, <4 x i32> %b) {
1997 ; SSE-LABEL: combine_test_movhl_2:
1998 ; SSE:       # %bb.0:
1999 ; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
2000 ; SSE-NEXT:    movaps %xmm1, %xmm0
2001 ; SSE-NEXT:    retq
2002 ;
2003 ; AVX-LABEL: combine_test_movhl_2:
2004 ; AVX:       # %bb.0:
2005 ; AVX-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
2006 ; AVX-NEXT:    retq
2007   %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 0, i32 3, i32 6>
2008   %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 3, i32 7, i32 0, i32 2>
2009   ret <4 x i32> %2
2010 }
2011
; %1 = <b3, b2, a3, a2>; the second mask <6, 0, 3, 2> yields <b2, b3, a2, a3>,
; i.e. the high half of %b followed by the high half of %a.
2012 define <4 x i32> @combine_test_movhl_3(<4 x i32> %a, <4 x i32> %b) {
2013 ; SSE-LABEL: combine_test_movhl_3:
2014 ; SSE:       # %bb.0:
2015 ; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
2016 ; SSE-NEXT:    movaps %xmm1, %xmm0
2017 ; SSE-NEXT:    retq
2018 ;
2019 ; AVX-LABEL: combine_test_movhl_3:
2020 ; AVX:       # %bb.0:
2021 ; AVX-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
2022 ; AVX-NEXT:    retq
2023   %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 7, i32 6, i32 3, i32 2>
2024   %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 6, i32 0, i32 3, i32 2>
2025   ret <4 x i32> %2
2026 }
2027
; OR of two byte shuffles that each interleave an input with zero bytes
; (mask index 16 selects element 0 of the zeroinitializer operand), followed
; by an AND whose constant clears only byte 1 of the result.
;   %1 = <0, 0, 0, x15, 0, x1, 0, x14, 0, x2, 0, x13, 0, x3, 0, 0>
;   %2 = <y7, 0, y0, 0, y8, 0, y1, 0, y9, 0, y10, 0, y7, 0, y7, 0>
2028 define <16 x i8> @combine_and_or_shuffle(<16 x i8> %x, <16 x i8> %y) {
2029 ; SSE2-LABEL: combine_and_or_shuffle:
2030 ; SSE2:       # %bb.0:
2031 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3]
2032 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
2033 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,3,2,1]
2034 ; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,3,1,2,4,5,6,7]
2035 ; SSE2-NEXT:    pshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,6,5,7,7]
2036 ; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
2037 ; SSE2-NEXT:    pxor %xmm3, %xmm3
2038 ; SSE2-NEXT:    movdqa %xmm1, %xmm0
2039 ; SSE2-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm3[8],xmm0[9],xmm3[9],xmm0[10],xmm3[10],xmm0[11],xmm3[11],xmm0[12],xmm3[12],xmm0[13],xmm3[13],xmm0[14],xmm3[14],xmm0[15],xmm3[15]
2040 ; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,2,4,5,6,7]
2041 ; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[0,0,1,3]
2042 ; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [65535,65535,0,65535,0,0,65535,65535]
2043 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
2044 ; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3]
2045 ; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[3,0,2,1,4,5,6,7]
2046 ; SSE2-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,7,7,7]
2047 ; SSE2-NEXT:    pand %xmm0, %xmm1
2048 ; SSE2-NEXT:    pandn %xmm4, %xmm0
2049 ; SSE2-NEXT:    por %xmm1, %xmm0
2050 ; SSE2-NEXT:    packuswb %xmm0, %xmm0
2051 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
2052 ; SSE2-NEXT:    por %xmm2, %xmm0
2053 ; SSE2-NEXT:    retq
2054 ;
2055 ; SSSE3-LABEL: combine_and_or_shuffle:
2056 ; SSSE3:       # %bb.0:
2057 ; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = zero,xmm0[u],zero,xmm0[15],zero,xmm0[1],zero,xmm0[14],zero,xmm0[2],zero,xmm0[13],zero,xmm0[3],zero,zero
2058 ; SSSE3-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[7,u,0],zero,xmm1[8],zero,xmm1[1],zero,xmm1[9],zero,xmm1[10],zero,xmm1[7],zero,xmm1[7],zero
2059 ; SSSE3-NEXT:    por %xmm1, %xmm0
2060 ; SSSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
2061 ; SSSE3-NEXT:    retq
2062 ;
2063 ; SSE41-LABEL: combine_and_or_shuffle:
2064 ; SSE41:       # %bb.0:
2065 ; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = zero,xmm0[u],zero,xmm0[15],zero,xmm0[1],zero,xmm0[14],zero,xmm0[2],zero,xmm0[13],zero,xmm0[3],zero,zero
2066 ; SSE41-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[7,u,0],zero,xmm1[8],zero,xmm1[1],zero,xmm1[9],zero,xmm1[10],zero,xmm1[7],zero,xmm1[7],zero
2067 ; SSE41-NEXT:    por %xmm1, %xmm0
2068 ; SSE41-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
2069 ; SSE41-NEXT:    retq
2070 ;
2071 ; AVX-LABEL: combine_and_or_shuffle:
2072 ; AVX:       # %bb.0:
2073 ; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = zero,xmm0[u],zero,xmm0[15],zero,xmm0[1],zero,xmm0[14],zero,xmm0[2],zero,xmm0[13],zero,xmm0[3],zero,zero
2074 ; AVX-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[7,u,0],zero,xmm1[8],zero,xmm1[1],zero,xmm1[9],zero,xmm1[10],zero,xmm1[7],zero,xmm1[7],zero
2075 ; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
2076 ; AVX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2077 ; AVX-NEXT:    retq
2078   %1 = shufflevector <16 x i8> %x, <16 x i8> zeroinitializer, <16 x i32> <i32 16, i32 16, i32 16, i32 15, i32 16, i32 1, i32 16, i32 14, i32 16, i32 2, i32 16, i32 13, i32 16, i32 3, i32 16, i32 16>
2079   %2 = shufflevector <16 x i8> %y, <16 x i8> zeroinitializer, <16 x i32> <i32 7, i32 16, i32 0, i32 16, i32 8, i32 16, i32 1, i32 16, i32 9, i32 16, i32 10, i32 16, i32 7, i32 16, i32 7, i32 16>
2080   %3 = or <16 x i8> %1, %2
2081   %4 = and <16 x i8> %3, <i8 -1, i8 0, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
2082   ret <16 x i8> %4
2083 }
2084
2085 ; Verify that we fold shuffles according to rule:
2086 ;  (shuffle(shuffle A, Undef, M0), B, M1) -> (shuffle A, B, M2)
2087
; %1 = <undef, a2, a3, a1> (index 4 selects the undef operand); the outer mask
; <4, 5, 1, 2> yields <b0, b1, a2, a3>.
2088 define <4 x float> @combine_undef_input_test1(<4 x float> %a, <4 x float> %b) {
2089 ; SSE2-LABEL: combine_undef_input_test1:
2090 ; SSE2:       # %bb.0:
2091 ; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
2092 ; SSE2-NEXT:    retq
2093 ;
2094 ; SSSE3-LABEL: combine_undef_input_test1:
2095 ; SSSE3:       # %bb.0:
2096 ; SSSE3-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
2097 ; SSSE3-NEXT:    retq
2098 ;
2099 ; SSE41-LABEL: combine_undef_input_test1:
2100 ; SSE41:       # %bb.0:
2101 ; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
2102 ; SSE41-NEXT:    retq
2103 ;
2104 ; AVX-LABEL: combine_undef_input_test1:
2105 ; AVX:       # %bb.0:
2106 ; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
2107 ; AVX-NEXT:    retq
2108   %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 4, i32 2, i32 3, i32 1>
2109   %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 4, i32 5, i32 1, i32 2>
2110   ret <4 x float> %2
2111 }
2112
; %1 = <undef, a0, a1, undef>; the outer mask <1, 2, 4, 5> yields
; <a0, a1, b0, b1>.
2113 define <4 x float> @combine_undef_input_test2(<4 x float> %a, <4 x float> %b) {
2114 ; SSE-LABEL: combine_undef_input_test2:
2115 ; SSE:       # %bb.0:
2116 ; SSE-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
2117 ; SSE-NEXT:    retq
2118 ;
2119 ; AVX-LABEL: combine_undef_input_test2:
2120 ; AVX:       # %bb.0:
2121 ; AVX-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
2122 ; AVX-NEXT:    retq
2123   %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 6, i32 0, i32 1, i32 7>
2124   %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 1, i32 2, i32 4, i32 5>
2125   ret <4 x float> %2
2126 }
2127
; %1 = <a0, undef, a1, undef>; the outer mask <0, 2, 4, 1> yields
; <a0, a1, b0, undef>, so the last lane is unconstrained.
2128 define <4 x float> @combine_undef_input_test3(<4 x float> %a, <4 x float> %b) {
2129 ; SSE-LABEL: combine_undef_input_test3:
2130 ; SSE:       # %bb.0:
2131 ; SSE-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
2132 ; SSE-NEXT:    retq
2133 ;
2134 ; AVX-LABEL: combine_undef_input_test3:
2135 ; AVX:       # %bb.0:
2136 ; AVX-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
2137 ; AVX-NEXT:    retq
2138   %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 5, i32 1, i32 7>
2139   %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 1>
2140   ret <4 x float> %2
2141 }
2142
; %1 = <a2, a3, undef, undef>; the outer mask <6, 7, 0, 1> yields
; <b2, b3, a2, a3>.
2143 define <4 x float> @combine_undef_input_test4(<4 x float> %a, <4 x float> %b) {
2144 ; SSE-LABEL: combine_undef_input_test4:
2145 ; SSE:       # %bb.0:
2146 ; SSE-NEXT:    movhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1]
2147 ; SSE-NEXT:    retq
2148 ;
2149 ; AVX-LABEL: combine_undef_input_test4:
2150 ; AVX:       # %bb.0:
2151 ; AVX-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
2152 ; AVX-NEXT:    retq
2153   %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 5, i32 5>
2154   %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
2155   ret <4 x float> %2
2156 }
2157
; %1 = <a0, undef, a1, a3>; the outer mask <0, 2, 6, 7> yields
; <a0, a1, b2, b3>.
2158 define <4 x float> @combine_undef_input_test5(<4 x float> %a, <4 x float> %b) {
2159 ; SSE2-LABEL: combine_undef_input_test5:
2160 ; SSE2:       # %bb.0:
2161 ; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
2162 ; SSE2-NEXT:    retq
2163 ;
2164 ; SSSE3-LABEL: combine_undef_input_test5:
2165 ; SSSE3:       # %bb.0:
2166 ; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
2167 ; SSSE3-NEXT:    retq
2168 ;
2169 ; SSE41-LABEL: combine_undef_input_test5:
2170 ; SSE41:       # %bb.0:
2171 ; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
2172 ; SSE41-NEXT:    retq
2173 ;
2174 ; AVX-LABEL: combine_undef_input_test5:
2175 ; AVX:       # %bb.0:
2176 ; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
2177 ; AVX-NEXT:    retq
2178   %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 4, i32 1, i32 3>
2179   %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 2, i32 6, i32 7>
2180   ret <4 x float> %2
2181 }
2182
2183
2184 ; Verify that we fold shuffles according to rule:
2185 ;  (shuffle(shuffle A, Undef, M0), A, M1) -> (shuffle A, Undef, M2)
2186
; %1 = <undef, a2, a3, a1>; the outer mask <4, 5, 1, 2> against %a yields
; <a0, a1, a2, a3>, i.e. the identity, so only the return is expected.
2187 define <4 x float> @combine_undef_input_test6(<4 x float> %a) {
2188 ; CHECK-LABEL: combine_undef_input_test6:
2189 ; CHECK:       # %bb.0:
2190 ; CHECK-NEXT:    retq
2191   %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 4, i32 2, i32 3, i32 1>
2192   %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 4, i32 5, i32 1, i32 2>
2193   ret <4 x float> %2
2194 }
2195
; %1 = <undef, a0, a1, undef>; the outer mask <1, 2, 4, 5> against %a yields
; <a0, a1, a0, a1>, i.e. the low 64 bits of %a duplicated.
2196 define <4 x float> @combine_undef_input_test7(<4 x float> %a) {
2197 ; SSE2-LABEL: combine_undef_input_test7:
2198 ; SSE2:       # %bb.0:
2199 ; SSE2-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0,0]
2200 ; SSE2-NEXT:    retq
2201 ;
2202 ; SSSE3-LABEL: combine_undef_input_test7:
2203 ; SSSE3:       # %bb.0:
2204 ; SSSE3-NEXT:    movddup {{.*#+}} xmm0 = xmm0[0,0]
2205 ; SSSE3-NEXT:    retq
2206 ;
2207 ; SSE41-LABEL: combine_undef_input_test7:
2208 ; SSE41:       # %bb.0:
2209 ; SSE41-NEXT:    movddup {{.*#+}} xmm0 = xmm0[0,0]
2210 ; SSE41-NEXT:    retq
2211 ;
2212 ; AVX-LABEL: combine_undef_input_test7:
2213 ; AVX:       # %bb.0:
2214 ; AVX-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
2215 ; AVX-NEXT:    retq
2216   %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 6, i32 0, i32 1, i32 7>
2217   %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 1, i32 2, i32 4, i32 5>
2218   ret <4 x float> %2
2219 }
2220
; %1 = <a0, undef, a1, undef>; the outer mask <0, 2, 4, 1> against %a yields
; <a0, a1, a0, undef>, so the last lane is unconstrained.
2221 define <4 x float> @combine_undef_input_test8(<4 x float> %a) {
2222 ; SSE2-LABEL: combine_undef_input_test8:
2223 ; SSE2:       # %bb.0:
2224 ; SSE2-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0,0]
2225 ; SSE2-NEXT:    retq
2226 ;
2227 ; SSSE3-LABEL: combine_undef_input_test8:
2228 ; SSSE3:       # %bb.0:
2229 ; SSSE3-NEXT:    movddup {{.*#+}} xmm0 = xmm0[0,0]
2230 ; SSSE3-NEXT:    retq
2231 ;
2232 ; SSE41-LABEL: combine_undef_input_test8:
2233 ; SSE41:       # %bb.0:
2234 ; SSE41-NEXT:    movddup {{.*#+}} xmm0 = xmm0[0,0]
2235 ; SSE41-NEXT:    retq
2236 ;
2237 ; AVX-LABEL: combine_undef_input_test8:
2238 ; AVX:       # %bb.0:
2239 ; AVX-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
2240 ; AVX-NEXT:    retq
2241   %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 5, i32 1, i32 7>
2242   %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 0, i32 2, i32 4, i32 1>
2243   ret <4 x float> %2
2244 }
2245
; %1 = <a2, a3, undef, undef>; the outer mask <6, 7, 0, 1> against %a yields
; <a2, a3, a2, a3>, i.e. the high 64 bits of %a duplicated.
2246 define <4 x float> @combine_undef_input_test9(<4 x float> %a) {
2247 ; SSE-LABEL: combine_undef_input_test9:
2248 ; SSE:       # %bb.0:
2249 ; SSE-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
2250 ; SSE-NEXT:    retq
2251 ;
2252 ; AVX-LABEL: combine_undef_input_test9:
2253 ; AVX:       # %bb.0:
2254 ; AVX-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1,1]
2255 ; AVX-NEXT:    retq
2256   %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 5, i32 5>
2257   %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
2258   ret <4 x float> %2
2259 }
2260
; %1 = <a0, undef, a1, a3>; the outer mask <0, 2, 6, 7> against %a yields
; <a0, a1, a2, a3>, i.e. the identity, so only the return is expected.
2261 define <4 x float> @combine_undef_input_test10(<4 x float> %a) {
2262 ; CHECK-LABEL: combine_undef_input_test10:
2263 ; CHECK:       # %bb.0:
2264 ; CHECK-NEXT:    retq
2265   %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 4, i32 1, i32 3>
2266   %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 0, i32 2, i32 6, i32 7>
2267   ret <4 x float> %2
2268 }
2269
; Commuted form of combine_undef_input_test1: the inner shuffle is the second
; operand of the outer one. %1 = <undef, a2, a3, a1>; the outer mask
; <0, 1, 5, 6> yields <b0, b1, a2, a3>.
2270 define <4 x float> @combine_undef_input_test11(<4 x float> %a, <4 x float> %b) {
2271 ; SSE2-LABEL: combine_undef_input_test11:
2272 ; SSE2:       # %bb.0:
2273 ; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
2274 ; SSE2-NEXT:    retq
2275 ;
2276 ; SSSE3-LABEL: combine_undef_input_test11:
2277 ; SSSE3:       # %bb.0:
2278 ; SSSE3-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
2279 ; SSSE3-NEXT:    retq
2280 ;
2281 ; SSE41-LABEL: combine_undef_input_test11:
2282 ; SSE41:       # %bb.0:
2283 ; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
2284 ; SSE41-NEXT:    retq
2285 ;
2286 ; AVX-LABEL: combine_undef_input_test11:
2287 ; AVX:       # %bb.0:
2288 ; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
2289 ; AVX-NEXT:    retq
2290   %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 4, i32 2, i32 3, i32 1>
2291   %2 = shufflevector <4 x float> %b, <4 x float> %1, <4 x i32> <i32 0, i32 1, i32 5, i32 6>
2292   ret <4 x float> %2
2293 }
2294
; Lane trace: %1 = [undef, a0, a1, undef]; %2 = [a0, a1, b0, b1].
; The expected code is one instruction concatenating the low halves of xmm0
; (%a) and xmm1 (%b).
2295 define <4 x float> @combine_undef_input_test12(<4 x float> %a, <4 x float> %b) {
2296 ; SSE-LABEL: combine_undef_input_test12:
2297 ; SSE:       # %bb.0:
2298 ; SSE-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
2299 ; SSE-NEXT:    retq
2300 ;
2301 ; AVX-LABEL: combine_undef_input_test12:
2302 ; AVX:       # %bb.0:
2303 ; AVX-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
2304 ; AVX-NEXT:    retq
2305   %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 6, i32 0, i32 1, i32 7>
2306   %2 = shufflevector <4 x float> %b, <4 x float> %1, <4 x i32> <i32 5, i32 6, i32 0, i32 1>
2307   ret <4 x float> %2
2308 }
2309
; Lane trace: %1 = [a0, undef, a1, undef]; %2 = [a0, undef, b0, undef].
; Lanes 1 and 3 of the result are undef, so the expected single instruction
; (low half of xmm0 followed by low half of xmm1) may put any value there.
2310 define <4 x float> @combine_undef_input_test13(<4 x float> %a, <4 x float> %b) {
2311 ; SSE-LABEL: combine_undef_input_test13:
2312 ; SSE:       # %bb.0:
2313 ; SSE-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
2314 ; SSE-NEXT:    retq
2315 ;
2316 ; AVX-LABEL: combine_undef_input_test13:
2317 ; AVX:       # %bb.0:
2318 ; AVX-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
2319 ; AVX-NEXT:    retq
2320   %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 5, i32 1, i32 7>
2321   %2 = shufflevector <4 x float> %b, <4 x float> %1, <4 x i32> <i32 4, i32 5, i32 0, i32 5>
2322   ret <4 x float> %2
2323 }
2324
; Lane trace: %1 = [a2, a3, undef, undef]; %2 = [b2, b3, a2, a3].
; The expected code is one instruction concatenating the high half of xmm1
; (%b) with the high half of xmm0 (%a).
2325 define <4 x float> @combine_undef_input_test14(<4 x float> %a, <4 x float> %b) {
2326 ; SSE-LABEL: combine_undef_input_test14:
2327 ; SSE:       # %bb.0:
2328 ; SSE-NEXT:    movhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1]
2329 ; SSE-NEXT:    retq
2330 ;
2331 ; AVX-LABEL: combine_undef_input_test14:
2332 ; AVX:       # %bb.0:
2333 ; AVX-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
2334 ; AVX-NEXT:    retq
2335   %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 5, i32 5>
2336   %2 = shufflevector <4 x float> %b, <4 x float> %1, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
2337   ret <4 x float> %2
2338 }
2339
; Lane trace: %1 = [a0, undef, a1, a3]; %2 = [a0, a1, b2, b3].
; The expected code is one instruction keeping the low half of xmm0 (%a) and
; taking the high half from xmm1 (%b).
2340 define <4 x float> @combine_undef_input_test15(<4 x float> %a, <4 x float> %b) {
2341 ; SSE2-LABEL: combine_undef_input_test15:
2342 ; SSE2:       # %bb.0:
2343 ; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
2344 ; SSE2-NEXT:    retq
2345 ;
2346 ; SSSE3-LABEL: combine_undef_input_test15:
2347 ; SSSE3:       # %bb.0:
2348 ; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
2349 ; SSSE3-NEXT:    retq
2350 ;
2351 ; SSE41-LABEL: combine_undef_input_test15:
2352 ; SSE41:       # %bb.0:
2353 ; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
2354 ; SSE41-NEXT:    retq
2355 ;
2356 ; AVX-LABEL: combine_undef_input_test15:
2357 ; AVX:       # %bb.0:
2358 ; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
2359 ; AVX-NEXT:    retq
2360   %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 4, i32 1, i32 3>
2361   %2 = shufflevector <4 x float> %b, <4 x float> %1, <4 x i32> <i32 4, i32 6, i32 2, i32 3>
2362   ret <4 x float> %2
2363 }
2364
2365
2366 ; Verify that shuffles are canonicalized according to the rule:
2367 ;  shuffle(B, shuffle(A, Undef)) -> shuffle(shuffle(A, Undef), B)
2368 ;
2369 ; This allows the following combine rule to trigger:
2370 ;  (shuffle(shuffle A, Undef, M0), A, M1) -> (shuffle A, Undef, M2)
2371 ;
2372 ; As a result, all the shuffle pairs in each function below should be
2373 ; combined into a single legal shuffle operation.
2374
; Inner shuffle of %a is the second operand of an outer shuffle of %a itself.
; Lane trace: %1 = [undef, a2, a3, a1]; %2 = [a0, a1, a2, a3].
; The pair is the identity on %a, so only the return is expected.
2375 define <4 x float> @combine_undef_input_test16(<4 x float> %a) {
2376 ; CHECK-LABEL: combine_undef_input_test16:
2377 ; CHECK:       # %bb.0:
2378 ; CHECK-NEXT:    retq
2379   %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 4, i32 2, i32 3, i32 1>
2380   %2 = shufflevector <4 x float> %a, <4 x float> %1, <4 x i32> <i32 0, i32 1, i32 5, i32 3>
2381   ret <4 x float> %2
2382 }
2383
; Lane trace: %1 = [undef, a0, a1, undef]; %2 = [a0, a1, a0, a1].
; The expected code is one instruction duplicating the low 64 bits of xmm0.
2384 define <4 x float> @combine_undef_input_test17(<4 x float> %a) {
2385 ; SSE2-LABEL: combine_undef_input_test17:
2386 ; SSE2:       # %bb.0:
2387 ; SSE2-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0,0]
2388 ; SSE2-NEXT:    retq
2389 ;
2390 ; SSSE3-LABEL: combine_undef_input_test17:
2391 ; SSSE3:       # %bb.0:
2392 ; SSSE3-NEXT:    movddup {{.*#+}} xmm0 = xmm0[0,0]
2393 ; SSSE3-NEXT:    retq
2394 ;
2395 ; SSE41-LABEL: combine_undef_input_test17:
2396 ; SSE41:       # %bb.0:
2397 ; SSE41-NEXT:    movddup {{.*#+}} xmm0 = xmm0[0,0]
2398 ; SSE41-NEXT:    retq
2399 ;
2400 ; AVX-LABEL: combine_undef_input_test17:
2401 ; AVX:       # %bb.0:
2402 ; AVX-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
2403 ; AVX-NEXT:    retq
2404   %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 6, i32 0, i32 1, i32 7>
2405   %2 = shufflevector <4 x float> %a, <4 x float> %1, <4 x i32> <i32 5, i32 6, i32 0, i32 1>
2406   ret <4 x float> %2
2407 }
2408
; Lane trace: %1 = [a0, undef, a1, undef]; %2 = [a0, a1, a0, undef].
; Lane 3 is undef, so the expected single instruction duplicating the low
; 64 bits of xmm0 is a valid lowering.
2409 define <4 x float> @combine_undef_input_test18(<4 x float> %a) {
2410 ; SSE2-LABEL: combine_undef_input_test18:
2411 ; SSE2:       # %bb.0:
2412 ; SSE2-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0,0]
2413 ; SSE2-NEXT:    retq
2414 ;
2415 ; SSSE3-LABEL: combine_undef_input_test18:
2416 ; SSSE3:       # %bb.0:
2417 ; SSSE3-NEXT:    movddup {{.*#+}} xmm0 = xmm0[0,0]
2418 ; SSSE3-NEXT:    retq
2419 ;
2420 ; SSE41-LABEL: combine_undef_input_test18:
2421 ; SSE41:       # %bb.0:
2422 ; SSE41-NEXT:    movddup {{.*#+}} xmm0 = xmm0[0,0]
2423 ; SSE41-NEXT:    retq
2424 ;
2425 ; AVX-LABEL: combine_undef_input_test18:
2426 ; AVX:       # %bb.0:
2427 ; AVX-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
2428 ; AVX-NEXT:    retq
2429   %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 5, i32 1, i32 7>
2430   %2 = shufflevector <4 x float> %a, <4 x float> %1, <4 x i32> <i32 4, i32 6, i32 0, i32 5>
2431   ret <4 x float> %2
2432 }
2433
; Lane trace: %1 = [a2, a3, undef, undef]; %2 = [a2, a3, a2, a3].
; The expected code is one instruction duplicating the high 64 bits of xmm0.
2434 define <4 x float> @combine_undef_input_test19(<4 x float> %a) {
2435 ; SSE-LABEL: combine_undef_input_test19:
2436 ; SSE:       # %bb.0:
2437 ; SSE-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
2438 ; SSE-NEXT:    retq
2439 ;
2440 ; AVX-LABEL: combine_undef_input_test19:
2441 ; AVX:       # %bb.0:
2442 ; AVX-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1,1]
2443 ; AVX-NEXT:    retq
2444   %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 5, i32 5>
2445   %2 = shufflevector <4 x float> %a, <4 x float> %1, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
2446   ret <4 x float> %2
2447 }
2448
; Lane trace: %1 = [a0, undef, a1, a3]; %2 = [a0, a1, a2, a3].
; The pair is the identity on %a, so only the return is expected.
2449 define <4 x float> @combine_undef_input_test20(<4 x float> %a) {
2450 ; CHECK-LABEL: combine_undef_input_test20:
2451 ; CHECK:       # %bb.0:
2452 ; CHECK-NEXT:    retq
2453   %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 4, i32 1, i32 3>
2454   %2 = shufflevector <4 x float> %a, <4 x float> %1, <4 x i32> <i32 4, i32 6, i32 2, i32 3>
2455   ret <4 x float> %2
2456 }
2457
2458 ; These tests are designed to test the ability to combine away unnecessary
2459 ; operations feeding into a shuffle. The AVX cases are the important ones as
2460 ; they leverage operations which cannot be done naturally on the entire vector
2461 ; and thus are decomposed into multiple smaller operations.
2462
; Adds a constant vector to %a, then shuffles the sum so that both halves of
; the result are lanes 7,6,5,4 of the sum. Lanes 0-3 of the add are never
; read by the shuffle.
2463 define <8 x i32> @combine_unneeded_subvector1(<8 x i32> %a) {
2464 ; SSE-LABEL: combine_unneeded_subvector1:
2465 ; SSE:       # %bb.0:
2466 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[3,2,1,0]
2467 ; SSE-NEXT:    paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
2468 ; SSE-NEXT:    movdqa %xmm0, %xmm1
2469 ; SSE-NEXT:    retq
2470 ;
2471 ; AVX1-LABEL: combine_unneeded_subvector1:
2472 ; AVX1:       # %bb.0:
2473 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
2474 ; AVX1-NEXT:    vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2475 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
2476 ; AVX1-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
2477 ; AVX1-NEXT:    retq
2478 ;
2479 ; AVX2-SLOW-LABEL: combine_unneeded_subvector1:
2480 ; AVX2-SLOW:       # %bb.0:
2481 ; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
2482 ; AVX2-SLOW-NEXT:    vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
2483 ; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3]
2484 ; AVX2-SLOW-NEXT:    retq
2485 ;
2486 ; AVX2-FAST-ALL-LABEL: combine_unneeded_subvector1:
2487 ; AVX2-FAST-ALL:       # %bb.0:
2488 ; AVX2-FAST-ALL-NEXT:    vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
2489 ; AVX2-FAST-ALL-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [7,6,5,4,7,6,5,4]
2490 ; AVX2-FAST-ALL-NEXT:    # ymm1 = mem[0,1,0,1]
2491 ; AVX2-FAST-ALL-NEXT:    vpermd %ymm0, %ymm1, %ymm0
2492 ; AVX2-FAST-ALL-NEXT:    retq
2493 ;
2494 ; AVX2-FAST-PERLANE-LABEL: combine_unneeded_subvector1:
2495 ; AVX2-FAST-PERLANE:       # %bb.0:
2496 ; AVX2-FAST-PERLANE-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
2497 ; AVX2-FAST-PERLANE-NEXT:    vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
2498 ; AVX2-FAST-PERLANE-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3]
2499 ; AVX2-FAST-PERLANE-NEXT:    retq
2500   %b = add <8 x i32> %a, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
2501   %c = shufflevector <8 x i32> %b, <8 x i32> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 7, i32 6, i32 5, i32 4>
2502   ret <8 x i32> %c
2503 }
2504
; %c = %a + constant. The shuffle builds the result from lanes 7,6,5,4 of %b
; (low half) and lanes 7,6,5,4 of %c (high half); lanes 0-3 of both inputs
; are never read.
2505 define <8 x i32> @combine_unneeded_subvector2(<8 x i32> %a, <8 x i32> %b) {
2506 ; SSE-LABEL: combine_unneeded_subvector2:
2507 ; SSE:       # %bb.0:
2508 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[3,2,1,0]
2509 ; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[3,2,1,0]
2510 ; SSE-NEXT:    paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
2511 ; SSE-NEXT:    retq
2512 ;
2513 ; AVX1-LABEL: combine_unneeded_subvector2:
2514 ; AVX1:       # %bb.0:
2515 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
2516 ; AVX1-NEXT:    vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2517 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
2518 ; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
2519 ; AVX1-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
2520 ; AVX1-NEXT:    retq
2521 ;
2522 ; AVX2-LABEL: combine_unneeded_subvector2:
2523 ; AVX2:       # %bb.0:
2524 ; AVX2-NEXT:    vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
2525 ; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
2526 ; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
2527 ; AVX2-NEXT:    retq
2528   %c = add <8 x i32> %a, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
2529   %d = shufflevector <8 x i32> %b, <8 x i32> %c, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 15, i32 14, i32 13, i32 12>
2530   ret <8 x i32> %d
2531 }
2532
; Lane trace: %c = [a0, b2, a2, b0]; %d = [b2, a1, a2, a3].
; Net effect: %a with lane 0 replaced by lane 2 of %b. The SSE4.1 and AVX
; runs expect a single insertps; the SSE2 and SSSE3 runs expect two shufps
; plus a register copy.
2533 define <4 x float> @combine_insertps1(<4 x float> %a, <4 x float> %b) {
2534 ; SSE2-LABEL: combine_insertps1:
2535 ; SSE2:       # %bb.0:
2536 ; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[1,0]
2537 ; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[2,3]
2538 ; SSE2-NEXT:    movaps %xmm1, %xmm0
2539 ; SSE2-NEXT:    retq
2540 ;
2541 ; SSSE3-LABEL: combine_insertps1:
2542 ; SSSE3:       # %bb.0:
2543 ; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[1,0]
2544 ; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[2,3]
2545 ; SSSE3-NEXT:    movaps %xmm1, %xmm0
2546 ; SSSE3-NEXT:    retq
2547 ;
2548 ; SSE41-LABEL: combine_insertps1:
2549 ; SSE41:       # %bb.0:
2550 ; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm1[2],xmm0[1,2,3]
2551 ; SSE41-NEXT:    retq
2552 ;
2553 ; AVX-LABEL: combine_insertps1:
2554 ; AVX:       # %bb.0:
2555 ; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[2],xmm0[1,2,3]
2556 ; AVX-NEXT:    retq
2557
2558   %c = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32><i32 0, i32 6, i32 2, i32 4>
2559   %d = shufflevector <4 x float> %a, <4 x float> %c, <4 x i32> <i32 5, i32 1, i32 6, i32 3>
2560   ret <4 x float> %d
2561 }
2562
; Lane trace: %c = [a0, a1, b2, b3]; %d = [a0, b2, a2, a3].
; Net effect: %a with lane 1 replaced by lane 2 of %b. The SSE4.1 and AVX
; runs expect a single insertps.
2563 define <4 x float> @combine_insertps2(<4 x float> %a, <4 x float> %b) {
2564 ; SSE2-LABEL: combine_insertps2:
2565 ; SSE2:       # %bb.0:
2566 ; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[0,0]
2567 ; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3]
2568 ; SSE2-NEXT:    movaps %xmm1, %xmm0
2569 ; SSE2-NEXT:    retq
2570 ;
2571 ; SSSE3-LABEL: combine_insertps2:
2572 ; SSSE3:       # %bb.0:
2573 ; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[0,0]
2574 ; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3]
2575 ; SSSE3-NEXT:    movaps %xmm1, %xmm0
2576 ; SSSE3-NEXT:    retq
2577 ;
2578 ; SSE41-LABEL: combine_insertps2:
2579 ; SSE41:       # %bb.0:
2580 ; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],xmm1[2],xmm0[2,3]
2581 ; SSE41-NEXT:    retq
2582 ;
2583 ; AVX-LABEL: combine_insertps2:
2584 ; AVX:       # %bb.0:
2585 ; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[2],xmm0[2,3]
2586 ; AVX-NEXT:    retq
2587
2588   %c = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32><i32 0, i32 1, i32 6, i32 7>
2589   %d = shufflevector <4 x float> %a, <4 x float> %c, <4 x i32> <i32 4, i32 6, i32 2, i32 3>
2590   ret <4 x float> %d
2591 }
2592
; Lane trace: %c = [a0, b0, a2, b1]; %d = [a0, a1, b0, a3].
; Net effect: %a with lane 2 replaced by lane 0 of %b. The SSE4.1 and AVX
; runs expect a single insertps.
2593 define <4 x float> @combine_insertps3(<4 x float> %a, <4 x float> %b) {
2594 ; SSE2-LABEL: combine_insertps3:
2595 ; SSE2:       # %bb.0:
2596 ; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0]
2597 ; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
2598 ; SSE2-NEXT:    retq
2599 ;
2600 ; SSSE3-LABEL: combine_insertps3:
2601 ; SSSE3:       # %bb.0:
2602 ; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0]
2603 ; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
2604 ; SSSE3-NEXT:    retq
2605 ;
2606 ; SSE41-LABEL: combine_insertps3:
2607 ; SSE41:       # %bb.0:
2608 ; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
2609 ; SSE41-NEXT:    retq
2610 ;
2611 ; AVX-LABEL: combine_insertps3:
2612 ; AVX:       # %bb.0:
2613 ; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
2614 ; AVX-NEXT:    retq
2615
2616   %c = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32><i32 0, i32 4, i32 2, i32 5>
2617   %d = shufflevector <4 x float> %a, <4 x float> %c, <4 x i32><i32 4, i32 1, i32 5, i32 3>
2618   ret <4 x float> %d
2619 }
2620
; Lane trace: %c = [a0, b0, a2, b1]; %d = [a0, a1, a2, b0].
; Net effect: %a with lane 3 replaced by lane 0 of %b. The SSE4.1 and AVX
; runs expect a single insertps.
2621 define <4 x float> @combine_insertps4(<4 x float> %a, <4 x float> %b) {
2622 ; SSE2-LABEL: combine_insertps4:
2623 ; SSE2:       # %bb.0:
2624 ; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
2625 ; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
2626 ; SSE2-NEXT:    retq
2627 ;
2628 ; SSSE3-LABEL: combine_insertps4:
2629 ; SSSE3:       # %bb.0:
2630 ; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
2631 ; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
2632 ; SSSE3-NEXT:    retq
2633 ;
2634 ; SSE41-LABEL: combine_insertps4:
2635 ; SSE41:       # %bb.0:
2636 ; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
2637 ; SSE41-NEXT:    retq
2638 ;
2639 ; AVX-LABEL: combine_insertps4:
2640 ; AVX:       # %bb.0:
2641 ; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
2642 ; AVX-NEXT:    retq
2643
2644   %c = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32><i32 0, i32 4, i32 2, i32 5>
2645   %d = shufflevector <4 x float> %a, <4 x float> %c, <4 x i32><i32 4, i32 1, i32 5, i32 3>
2646   ret <4 x float> %d
2647 }
2648
; Loads a double from %a0, builds <2 x double> [loaded, 0.0], bitcasts it to
; <4 x float>, then replaces float lane 2 with 0.0 from a constant vector
; (mask <0, 1, 4, 3>) and stores the result to %a1.
; The expected code is just a zero-extending scalar load followed by the
; store, with no separate blend instruction.
2649 define void @combine_scalar_load_with_blend_with_zero(ptr %a0, ptr %a1) {
2650 ; SSE-LABEL: combine_scalar_load_with_blend_with_zero:
2651 ; SSE:       # %bb.0:
2652 ; SSE-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
2653 ; SSE-NEXT:    movaps %xmm0, (%rsi)
2654 ; SSE-NEXT:    retq
2655 ;
2656 ; AVX-LABEL: combine_scalar_load_with_blend_with_zero:
2657 ; AVX:       # %bb.0:
2658 ; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
2659 ; AVX-NEXT:    vmovaps %xmm0, (%rsi)
2660 ; AVX-NEXT:    retq
2661   %1 = load double, ptr %a0, align 8
2662   %2 = insertelement <2 x double> undef, double %1, i32 0
2663   %3 = insertelement <2 x double> %2, double 0.000000e+00, i32 1
2664   %4 = bitcast <2 x double> %3 to <4 x float>
2665   %5 = shufflevector <4 x float> %4, <4 x float> <float 0.000000e+00, float undef, float undef, float undef>, <4 x i32> <i32 0, i32 1, i32 4, i32 3>
2666   store <4 x float> %5, ptr %a1, align 16
2667   ret void
2668 }
2669
2670 ; PR30371
; Places scalar %f in lane 0 of an otherwise-undef vector, then shuffles it
; with the constant <undef, 4.0, 5.0, 3.0>, giving [%f, 4.0, 5.0, 3.0].
; The SSE4.1 and AVX runs expect a single blend with the constant read from
; memory; the SSE2 and SSSE3 runs load the constant and merge lane 0 with
; movss.
2671 define <4 x float> @combine_constant_insertion_v4f32(float %f) {
2672 ; SSE2-LABEL: combine_constant_insertion_v4f32:
2673 ; SSE2:       # %bb.0:
2674 ; SSE2-NEXT:    movaps {{.*#+}} xmm1 = <u,4.0E+0,5.0E+0,3.0E+0>
2675 ; SSE2-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
2676 ; SSE2-NEXT:    movaps %xmm1, %xmm0
2677 ; SSE2-NEXT:    retq
2678 ;
2679 ; SSSE3-LABEL: combine_constant_insertion_v4f32:
2680 ; SSSE3:       # %bb.0:
2681 ; SSSE3-NEXT:    movaps {{.*#+}} xmm1 = <u,4.0E+0,5.0E+0,3.0E+0>
2682 ; SSSE3-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
2683 ; SSSE3-NEXT:    movaps %xmm1, %xmm0
2684 ; SSSE3-NEXT:    retq
2685 ;
2686 ; SSE41-LABEL: combine_constant_insertion_v4f32:
2687 ; SSE41:       # %bb.0:
2688 ; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],mem[1,2,3]
2689 ; SSE41-NEXT:    retq
2690 ;
2691 ; AVX-LABEL: combine_constant_insertion_v4f32:
2692 ; AVX:       # %bb.0:
2693 ; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],mem[1,2,3]
2694 ; AVX-NEXT:    retq
2695   %a0 = insertelement <4 x float> undef, float %f, i32 0
2696   %ret = shufflevector <4 x float> %a0, <4 x float> <float undef, float 4.0, float 5.0, float 3.0>, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
2697   ret <4 x float> %ret
2698 }
2699
; Integer counterpart of combine_constant_insertion_v4f32: places %f in lane 0
; and takes lanes 1-3 from the constant <undef, 4, 5, 30>, giving
; [%f, 4, 5, 30]. The SSE4.1 and AVX runs expect a constant load followed by
; an insert of %edi into lane 0.
2699 define <4 x i32> @combine_constant_insertion_v4i32(i32 %f) {
2700 ; SSE2-LABEL: combine_constant_insertion_v4i32:
2701 ; SSE2:       # %bb.0:
2702 ; SSE2-NEXT:    movd %edi, %xmm1
2703 ; SSE2-NEXT:    movaps {{.*#+}} xmm0 = <u,4,5,30>
2704 ; SSE2-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
2705 ; SSE2-NEXT:    retq
2706 ;
2707 ; SSSE3-LABEL: combine_constant_insertion_v4i32:
2708 ; SSSE3:       # %bb.0:
2709 ; SSSE3-NEXT:    movd %edi, %xmm1
2710 ; SSSE3-NEXT:    movaps {{.*#+}} xmm0 = <u,4,5,30>
2711 ; SSSE3-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
2712 ; SSSE3-NEXT:    retq
2713 ;
2714 ; SSE41-LABEL: combine_constant_insertion_v4i32:
2715 ; SSE41:       # %bb.0:
2716 ; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = <u,4,5,30>
2717 ; SSE41-NEXT:    pinsrd $0, %edi, %xmm0
2718 ; SSE41-NEXT:    retq
2719 ;
2720 ; AVX-LABEL: combine_constant_insertion_v4i32:
2721 ; AVX:       # %bb.0:
2722 ; AVX-NEXT:    vmovdqa {{.*#+}} xmm0 = <u,4,5,30>
2723 ; AVX-NEXT:    vpinsrd $0, %edi, %xmm0, %xmm0
2724 ; AVX-NEXT:    retq
2725   %a0 = insertelement <4 x i32> undef, i32 %f, i32 0
2726   %ret = shufflevector <4 x i32> %a0, <4 x i32> <i32 undef, i32 4, i32 5, i32 30>, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
2727   ret <4 x i32> %ret
2728 }
2730
; Regression test named after a bug report. %b is unused.
; Lane trace: %s1 = [a1, a3, a1, a3]; %s2 = [a0, a2, a0, a2];
; %r2 = %s1 + %s2; %s3 interleaves the low lanes of %s2 and %r2, giving
; [a0, a1+a0, a2, a3+a2].
; The SSSE3, SSE4.1 and AVX runs expect the add to be a horizontal add.
2731 define <4 x float> @PR22377(<4 x float> %a, <4 x float> %b) {
2732 ; SSE2-LABEL: PR22377:
2733 ; SSE2:       # %bb.0: # %entry
2734 ; SSE2-NEXT:    movaps %xmm0, %xmm1
2735 ; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,3],xmm0[2,3]
2736 ; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2,0,2]
2737 ; SSE2-NEXT:    addps %xmm0, %xmm1
2738 ; SSE2-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
2739 ; SSE2-NEXT:    retq
2740 ;
2741 ; SSSE3-LABEL: PR22377:
2742 ; SSSE3:       # %bb.0: # %entry
2743 ; SSSE3-NEXT:    movaps %xmm0, %xmm1
2744 ; SSSE3-NEXT:    haddps %xmm0, %xmm1
2745 ; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,1]
2746 ; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
2747 ; SSSE3-NEXT:    retq
2748 ;
2749 ; SSE41-LABEL: PR22377:
2750 ; SSE41:       # %bb.0: # %entry
2751 ; SSE41-NEXT:    movaps %xmm0, %xmm1
2752 ; SSE41-NEXT:    haddps %xmm0, %xmm1
2753 ; SSE41-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,1]
2754 ; SSE41-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
2755 ; SSE41-NEXT:    retq
2756 ;
2757 ; AVX-LABEL: PR22377:
2758 ; AVX:       # %bb.0: # %entry
2759 ; AVX-NEXT:    vhaddps %xmm0, %xmm0, %xmm1
2760 ; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,1]
2761 ; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
2762 ; AVX-NEXT:    retq
2763 entry:
2764   %s1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 1, i32 3, i32 1, i32 3>
2765   %s2 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 2, i32 0, i32 2>
2766   %r2 = fadd <4 x float> %s1, %s2
2767   %s3 = shufflevector <4 x float> %s2, <4 x float> %r2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
2768   ret <4 x float> %s3
2769 }
2770
; Regression test named after a bug report.
; Lane trace: %s1 = [a3, a0, a1, a2] (a rotation of %a);
; %s2 = [b0, a0, a1, a2] (%s1 with lane 0 taken from %b); result = %s1 + %s2.
; %s1 has two uses (the second shuffle and the add), and the expected code
; keeps the rotation as its own instruction in every run.
2771 define <4 x float> @PR22390(<4 x float> %a, <4 x float> %b) {
2772 ; SSE2-LABEL: PR22390:
2773 ; SSE2:       # %bb.0: # %entry
2774 ; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,0,1,2]
2775 ; SSE2-NEXT:    movaps %xmm0, %xmm2
2776 ; SSE2-NEXT:    movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
2777 ; SSE2-NEXT:    addps %xmm2, %xmm0
2778 ; SSE2-NEXT:    retq
2779 ;
2780 ; SSSE3-LABEL: PR22390:
2781 ; SSSE3:       # %bb.0: # %entry
2782 ; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,0,1,2]
2783 ; SSSE3-NEXT:    movaps %xmm0, %xmm2
2784 ; SSSE3-NEXT:    movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
2785 ; SSSE3-NEXT:    addps %xmm2, %xmm0
2786 ; SSSE3-NEXT:    retq
2787 ;
2788 ; SSE41-LABEL: PR22390:
2789 ; SSE41:       # %bb.0: # %entry
2790 ; SSE41-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,0,1,2]
2791 ; SSE41-NEXT:    blendps {{.*#+}} xmm1 = xmm1[0],xmm0[1,2,3]
2792 ; SSE41-NEXT:    addps %xmm1, %xmm0
2793 ; SSE41-NEXT:    retq
2794 ;
2795 ; AVX-LABEL: PR22390:
2796 ; AVX:       # %bb.0: # %entry
2797 ; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[3,0,1,2]
2798 ; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0],xmm0[1,2,3]
2799 ; AVX-NEXT:    vaddps %xmm1, %xmm0, %xmm0
2800 ; AVX-NEXT:    retq
2801 entry:
2802   %s1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 3, i32 0, i32 1, i32 2>
2803   %s2 = shufflevector <4 x float> %s1, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
2804   %r2 = fadd <4 x float> %s1, %s2
2805   ret <4 x float> %r2
2806 }
2807
; Regression test named after a bug report.
; Lane trace: %s1 = [a0, a1, b2, b3, b4, b5, b6, b7];
; %s2 = [a1, a0, b7, b6, b5, b4, b3, b2].
; Only lanes 0-1 of %a are used; every other lane comes from %b.
2808 define <8 x float> @PR22412(<8 x float> %a, <8 x float> %b) {
2809 ; SSE-LABEL: PR22412:
2810 ; SSE:       # %bb.0: # %entry
2811 ; SSE-NEXT:    movaps %xmm3, %xmm1
2812 ; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],xmm3[3,2]
2813 ; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,0],xmm2[3,2]
2814 ; SSE-NEXT:    retq
2815 ;
2816 ; AVX1-LABEL: PR22412:
2817 ; AVX1:       # %bb.0: # %entry
2818 ; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm1[2,3,0,1]
2819 ; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
2820 ; AVX1-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[1,0],ymm2[3,2],ymm0[5,4],ymm2[7,6]
2821 ; AVX1-NEXT:    retq
2822 ;
2823 ; AVX2-LABEL: PR22412:
2824 ; AVX2:       # %bb.0: # %entry
2825 ; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
2826 ; AVX2-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[2,3,0,1]
2827 ; AVX2-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[1,0],ymm1[3,2],ymm0[5,4],ymm1[7,6]
2828 ; AVX2-NEXT:    retq
2829 entry:
2830   %s1 = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 1, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
2831   %s2 = shufflevector <8 x float> %s1, <8 x float> undef, <8 x i32> <i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2>
2832   ret <8 x float> %s2
2833 }
2834
; Regression test named after a bug report.
; Lane trace: %shuf1 = [x0, 0.0, undef, undef];
; %shuf2 = [x0, 0.0, 4.0, 1.0].
; Net effect: lane 0 of %x, a zero, then the constants 4.0 and 1.0.
2835 define <4 x float> @PR30264(<4 x float> %x) {
2836 ; SSE2-LABEL: PR30264:
2837 ; SSE2:       # %bb.0:
2838 ; SSE2-NEXT:    xorps %xmm1, %xmm1
2839 ; SSE2-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
2840 ; SSE2-NEXT:    shufpd {{.*#+}} xmm1 = xmm1[0],mem[1]
2841 ; SSE2-NEXT:    movapd %xmm1, %xmm0
2842 ; SSE2-NEXT:    retq
2843 ;
2844 ; SSSE3-LABEL: PR30264:
2845 ; SSSE3:       # %bb.0:
2846 ; SSSE3-NEXT:    xorps %xmm1, %xmm1
2847 ; SSSE3-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
2848 ; SSSE3-NEXT:    shufpd {{.*#+}} xmm1 = xmm1[0],mem[1]
2849 ; SSSE3-NEXT:    movapd %xmm1, %xmm0
2850 ; SSSE3-NEXT:    retq
2851 ;
2852 ; SSE41-LABEL: PR30264:
2853 ; SSE41:       # %bb.0:
2854 ; SSE41-NEXT:    movaps {{.*#+}} xmm1 = <u,u,4.0E+0,1.0E+0>
2855 ; SSE41-NEXT:    insertps {{.*#+}} xmm1 = xmm0[0],zero,xmm1[2,3]
2856 ; SSE41-NEXT:    movaps %xmm1, %xmm0
2857 ; SSE41-NEXT:    retq
2858 ;
2859 ; AVX-LABEL: PR30264:
2860 ; AVX:       # %bb.0:
2861 ; AVX-NEXT:    vmovddup {{.*#+}} xmm1 = [4.0E+0,1.0E+0,4.0E+0,1.0E+0]
2862 ; AVX-NEXT:    # xmm1 = mem[0,0]
2863 ; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],zero,xmm1[2,3]
2864 ; AVX-NEXT:    retq
2865   %shuf1 = shufflevector <4 x float> %x, <4 x float> <float undef, float 0.0, float undef, float undef>, <4 x i32> <i32 0, i32 5, i32 undef, i32 undef>
2866   %shuf2 = shufflevector <4 x float> %shuf1, <4 x float> <float undef, float undef, float 4.0, float 1.0>, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
2867   ret <4 x float> %shuf2
2868 }
2869
; Regression test named after a bug report.
; The shuffle places bytes 8..15 of %x in the even byte positions (odd
; positions undef); after the bitcast to <8 x i16>, the shl/ashr by 8
; sign-extends the low byte of each 16-bit lane.
; The expected code is a high-byte unpack of xmm0 with itself followed by one
; arithmetic right shift.
2869 define <8 x i16> @PR39549(<16 x i8> %x) {
2870 ; SSE-LABEL: PR39549:
2871 ; SSE:       # %bb.0:
2872 ; SSE-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
2873 ; SSE-NEXT:    psraw $8, %xmm0
2874 ; SSE-NEXT:    retq
2875 ;
2876 ; AVX-LABEL: PR39549:
2877 ; AVX:       # %bb.0:
2878 ; AVX-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
2879 ; AVX-NEXT:    vpsraw $8, %xmm0, %xmm0
2880 ; AVX-NEXT:    retq
2881   %a = shufflevector <16 x i8> %x, <16 x i8> undef, <16 x i32> <i32 8, i32 undef, i32 9, i32 undef, i32 10, i32 undef, i32 11, i32 undef, i32 12, i32 undef, i32 13, i32 undef, i32 14, i32 undef, i32 15, i32 undef>
2882   %b = bitcast <16 x i8> %a to <8 x i16>
2883   %c = shl <8 x i16> %b, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
2884   %d = ashr <8 x i16> %c, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
2885   ret <8 x i16> %d
2886 }
2888
; Regression test named after a bug report.
; Splits %a1 into four <4 x i8> vectors holding byte k (k = 0..3) of each
; 4-byte group, zero-extends each to i32, shifts byte k left by 8*k, ORs the
; four together, and adds the result to %a0.
; The expected code is a single add of xmm1 to xmm0, i.e. the whole
; split/extend/shift/or chain is treated as %a1 reinterpreted as <4 x i32>.
2887 define <4 x i32> @PR41545(<4 x i32> %a0, <16 x i8> %a1) {
2888 ; SSE-LABEL: PR41545:
2889 ; SSE:       # %bb.0:
2890 ; SSE-NEXT:    paddd %xmm1, %xmm0
2891 ; SSE-NEXT:    retq
2892 ;
2893 ; AVX-LABEL: PR41545:
2894 ; AVX:       # %bb.0:
2895 ; AVX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
2896 ; AVX-NEXT:    retq
2897   %1  = shufflevector <16 x i8> %a1, <16 x i8> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
2898   %2  = shufflevector <16 x i8> %a1, <16 x i8> undef, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
2899   %3  = shufflevector <16 x i8> %a1, <16 x i8> undef, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
2900   %4  = shufflevector <16 x i8> %a1, <16 x i8> undef, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
2901   %5  = zext <4 x i8> %1 to <4 x i32>
2902   %6  = zext <4 x i8> %2 to <4 x i32>
2903   %7  = zext <4 x i8> %3 to <4 x i32>
2904   %8  = zext <4 x i8> %4 to <4 x i32>
2905   %9  = shl <4 x i32> %6, <i32 8, i32 8, i32 8, i32 8>
2906   %10 = shl <4 x i32> %7, <i32 16, i32 16, i32 16, i32 16>
2907   %11 = shl <4 x i32> %8, <i32 24, i32 24, i32 24, i32 24>
2908   %12 = or <4 x i32> %5, %9
2909   %13 = or <4 x i32> %12, %10
2910   %14 = or <4 x i32> %13, %11
2911   %15 = add <4 x i32> %a0, %14
2912   ret <4 x i32> %15
2913 }
2916
; Builds a permutation of %a from one shufflevector (which sets lane 0 to a2)
; followed by seven extractelement/insertelement pairs.
; Final lanes: [a2, a1, a0, a3, a6, a5, a4, a7], i.e. lanes 0<->2 and 4<->6
; swapped. The expected code is one low-word shuffle plus one high-word
; shuffle, or a single byte shuffle in the AVX2-FAST runs.
2915 define <8 x i16> @shuffle_extract_insert(<8 x i16> %a) {
2916 ; SSE-LABEL: shuffle_extract_insert:
2917 ; SSE:       # %bb.0:
2918 ; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[2,1,0,3,4,5,6,7]
2919 ; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,4,7]
2920 ; SSE-NEXT:    retq
2921 ;
2922 ; AVX1-LABEL: shuffle_extract_insert:
2923 ; AVX1:       # %bb.0:
2924 ; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[2,1,0,3,4,5,6,7]
2925 ; AVX1-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,4,7]
2926 ; AVX1-NEXT:    retq
2927 ;
2928 ; AVX2-SLOW-LABEL: shuffle_extract_insert:
2929 ; AVX2-SLOW:       # %bb.0:
2930 ; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[2,1,0,3,4,5,6,7]
2931 ; AVX2-SLOW-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,4,7]
2932 ; AVX2-SLOW-NEXT:    retq
2933 ;
2934 ; AVX2-FAST-LABEL: shuffle_extract_insert:
2935 ; AVX2-FAST:       # %bb.0:
2936 ; AVX2-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[4,5,2,3,0,1,6,7,12,13,10,11,8,9,14,15]
2937 ; AVX2-FAST-NEXT:    retq
2938   %a0 = extractelement <8 x i16> %a, i32 0
2939   %a1 = extractelement <8 x i16> %a, i32 1
2940   %a3 = extractelement <8 x i16> %a, i32 3
2941   %a4 = extractelement <8 x i16> %a, i32 4
2942   %a5 = extractelement <8 x i16> %a, i32 5
2943   %a6 = extractelement <8 x i16> %a, i32 6
2944   %a7 = extractelement <8 x i16> %a, i32 7
2945   %1 = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> <i32 2, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
2946   %2 = insertelement <8 x i16> %1, i16 %a1, i32 1
2947   %3 = insertelement <8 x i16> %2, i16 %a0, i32 2
2948   %4 = insertelement <8 x i16> %3, i16 %a3, i32 3
2949   %5 = insertelement <8 x i16> %4, i16 %a6, i32 4
2950   %6 = insertelement <8 x i16> %5, i16 %a5, i32 5
2951   %7 = insertelement <8 x i16> %6, i16 %a4, i32 6
2952   %8 = insertelement <8 x i16> %7, i16 %a7, i32 7
2953   ret <8 x i16> %8
2954 }
2957
; Two-input variant of shuffle_extract_insert: one shufflevector sets lanes
; 0-1 to [a2, b0], then six extractelement/insertelement pairs fill lanes 2-7.
; Final lanes: [a2, b0, a0, b3, a6, b5, a4, b7], alternating %a and %b.
; The values named %b11, %b13 and %b15 are lanes 3, 5 and 7 of %b.
; From SSSE3 on, the expected code is one byte shuffle per input followed by a
; word interleave.
2955 define <8 x i16> @shuffle_extract_insert_double(<8 x i16> %a, <8 x i16> %b) {
2956 ; SSE2-LABEL: shuffle_extract_insert_double:
2957 ; SSE2:       # %bb.0:
2958 ; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7]
2959 ; SSE2-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,6,7]
2960 ; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
2961 ; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,1,3,2,4,5,6,7]
2962 ; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
2963 ; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
2964 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
2965 ; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7]
2966 ; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
2967 ; SSE2-NEXT:    retq
2968 ;
2969 ; SSSE3-LABEL: shuffle_extract_insert_double:
2970 ; SSSE3:       # %bb.0:
2971 ; SSSE3-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[0,1,6,7,10,11,14,15,u,u,u,u,u,u,u,u]
2972 ; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u]
2973 ; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
2974 ; SSSE3-NEXT:    retq
2975 ;
2976 ; SSE41-LABEL: shuffle_extract_insert_double:
2977 ; SSE41:       # %bb.0:
2978 ; SSE41-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[0,1,6,7,10,11,14,15,u,u,u,u,u,u,u,u]
2979 ; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u]
2980 ; SSE41-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
2981 ; SSE41-NEXT:    retq
2982 ;
2983 ; AVX-LABEL: shuffle_extract_insert_double:
2984 ; AVX:       # %bb.0:
2985 ; AVX-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[0,1,6,7,10,11,14,15,u,u,u,u,u,u,u,u]
2986 ; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u]
2987 ; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
2988 ; AVX-NEXT:    retq
2989   %a0 = extractelement <8 x i16> %a, i32 0
2990   %a4 = extractelement <8 x i16> %a, i32 4
2991   %a6 = extractelement <8 x i16> %a, i32 6
2992   %b11 = extractelement <8 x i16> %b, i32 3
2993   %b13 = extractelement <8 x i16> %b, i32 5
2994   %b15 = extractelement <8 x i16> %b, i32 7
2995   %1 = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 2, i32 8, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
2996   %2 = insertelement <8 x i16> %1, i16 %a0, i32 2
2997   %3 = insertelement <8 x i16> %2, i16 %b11, i32 3
2998   %4 = insertelement <8 x i16> %3, i16 %a6, i32 4
2999   %5 = insertelement <8 x i16> %4, i16 %b13, i32 5
3000   %6 = insertelement <8 x i16> %5, i16 %a4, i32 6
3001   %7 = insertelement <8 x i16> %6, i16 %b15, i32 7
3002   ret <8 x i16> %7
3003 }
3007
3008 define <8 x i16> @shuffle_extract_concat_insert(<4 x i16> %lhsa, <4 x i16> %rhsa, <8 x i16> %b) {
3009 ; SSE2-LABEL: shuffle_extract_concat_insert:
3010 ; SSE2:       # %bb.0:
3011 ; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
3012 ; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
3013 ; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
3014 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
3015 ; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7]
3016 ; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm2[0,3,2,3,4,5,6,7]
3017 ; SSE2-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,6,7]
3018 ; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
3019 ; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,1,3,2,4,5,6,7]
3020 ; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
3021 ; SSE2-NEXT:    retq
3022 ;
3023 ; SSSE3-LABEL: shuffle_extract_concat_insert:
3024 ; SSSE3:       # %bb.0:
3025 ; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
3026 ; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u]
3027 ; SSSE3-NEXT:    pshufb {{.*#+}} xmm2 = xmm2[0,1,6,7,10,11,14,15,u,u,u,u,u,u,u,u]
3028 ; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
3029 ; SSSE3-NEXT:    retq
3030 ;
3031 ; SSE41-LABEL: shuffle_extract_concat_insert:
3032 ; SSE41:       # %bb.0:
3033 ; SSE41-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
3034 ; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u]
3035 ; SSE41-NEXT:    pshufb {{.*#+}} xmm2 = xmm2[0,1,6,7,10,11,14,15,u,u,u,u,u,u,u,u]
3036 ; SSE41-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
3037 ; SSE41-NEXT:    retq
3038 ;
3039 ; AVX-LABEL: shuffle_extract_concat_insert:
3040 ; AVX:       # %bb.0:
3041 ; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
3042 ; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u]
3043 ; AVX-NEXT:    vpshufb {{.*#+}} xmm1 = xmm2[0,1,6,7,10,11,14,15,u,u,u,u,u,u,u,u]
3044 ; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
3045 ; AVX-NEXT:    retq
3046   %a = shufflevector <4 x i16> %lhsa, <4 x i16> %rhsa, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
3047   %a0 = extractelement <8 x i16> %a, i32 0
3048   %a4 = extractelement <8 x i16> %a, i32 4
3049   %a6 = extractelement <8 x i16> %a, i32 6
3050   %b11 = extractelement <8 x i16> %b, i32 3
3051   %b13 = extractelement <8 x i16> %b, i32 5
3052   %b15 = extractelement <8 x i16> %b, i32 7
3053   %1 = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 2, i32 8, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
3054   %2 = insertelement <8 x i16> %1, i16 %a0, i32 2
3055   %3 = insertelement <8 x i16> %2, i16 %b11, i32 3
3056   %4 = insertelement <8 x i16> %3, i16 %a6, i32 4
3057   %5 = insertelement <8 x i16> %4, i16 %b13, i32 5
3058   %6 = insertelement <8 x i16> %5, i16 %a4, i32 6
3059   %7 = insertelement <8 x i16> %6, i16 %b15, i32 7
3060   ret <8 x i16> %7
3061 }
3062
3063 define <8 x i16> @shuffle_scalar_to_vector_extract(ptr %p0, ptr %p1, ptr %p2) {
3064 ; SSE2-LABEL: shuffle_scalar_to_vector_extract:
3065 ; SSE2:       # %bb.0:
3066 ; SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
3067 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
3068 ; SSE2-NEXT:    psraw $8, %xmm1
3069 ; SSE2-NEXT:    pextrw $7, %xmm1, %eax
3070 ; SSE2-NEXT:    movd %eax, %xmm2
3071 ; SSE2-NEXT:    movsbl (%rsi), %eax
3072 ; SSE2-NEXT:    movd %eax, %xmm0
3073 ; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
3074 ; SSE2-NEXT:    movsbl (%rdx), %eax
3075 ; SSE2-NEXT:    movd %eax, %xmm0
3076 ; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
3077 ; SSE2-NEXT:    pxor %xmm0, %xmm0
3078 ; SSE2-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
3079 ; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
3080 ; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
3081 ; SSE2-NEXT:    retq
3082 ;
3083 ; SSSE3-LABEL: shuffle_scalar_to_vector_extract:
3084 ; SSSE3:       # %bb.0:
3085 ; SSSE3-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
3086 ; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
3087 ; SSSE3-NEXT:    psraw $8, %xmm1
3088 ; SSSE3-NEXT:    movsbl (%rsi), %eax
3089 ; SSSE3-NEXT:    movd %eax, %xmm2
3090 ; SSSE3-NEXT:    palignr {{.*#+}} xmm2 = xmm1[14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
3091 ; SSSE3-NEXT:    movsbl (%rdx), %eax
3092 ; SSSE3-NEXT:    movd %eax, %xmm0
3093 ; SSSE3-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
3094 ; SSSE3-NEXT:    pxor %xmm0, %xmm0
3095 ; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
3096 ; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
3097 ; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
3098 ; SSSE3-NEXT:    retq
3099 ;
3100 ; SSE41-LABEL: shuffle_scalar_to_vector_extract:
3101 ; SSE41:       # %bb.0:
3102 ; SSE41-NEXT:    pmovsxbw (%rdi), %xmm0
3103 ; SSE41-NEXT:    pextrw $4, %xmm0, %eax
3104 ; SSE41-NEXT:    pextrw $7, %xmm0, %ecx
3105 ; SSE41-NEXT:    pxor %xmm0, %xmm0
3106 ; SSE41-NEXT:    pinsrw $1, %eax, %xmm0
3107 ; SSE41-NEXT:    movl $65531, %eax # imm = 0xFFFB
3108 ; SSE41-NEXT:    pinsrw $2, %eax, %xmm0
3109 ; SSE41-NEXT:    pinsrw $4, %ecx, %xmm0
3110 ; SSE41-NEXT:    movsbl (%rsi), %eax
3111 ; SSE41-NEXT:    pinsrw $5, %eax, %xmm0
3112 ; SSE41-NEXT:    movsbl (%rdx), %eax
3113 ; SSE41-NEXT:    pinsrw $6, %eax, %xmm0
3114 ; SSE41-NEXT:    retq
3115 ;
3116 ; AVX-LABEL: shuffle_scalar_to_vector_extract:
3117 ; AVX:       # %bb.0:
3118 ; AVX-NEXT:    vpmovsxbw (%rdi), %xmm0
3119 ; AVX-NEXT:    vpextrw $4, %xmm0, %eax
3120 ; AVX-NEXT:    vpextrw $7, %xmm0, %ecx
3121 ; AVX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
3122 ; AVX-NEXT:    vpinsrw $1, %eax, %xmm0, %xmm0
3123 ; AVX-NEXT:    movl $65531, %eax # imm = 0xFFFB
3124 ; AVX-NEXT:    vpinsrw $2, %eax, %xmm0, %xmm0
3125 ; AVX-NEXT:    vpinsrw $4, %ecx, %xmm0, %xmm0
3126 ; AVX-NEXT:    movsbl (%rsi), %eax
3127 ; AVX-NEXT:    vpinsrw $5, %eax, %xmm0, %xmm0
3128 ; AVX-NEXT:    movsbl (%rdx), %eax
3129 ; AVX-NEXT:    vpinsrw $6, %eax, %xmm0, %xmm0
3130 ; AVX-NEXT:    retq
3131   %tmp = load <8 x i8>, ptr %p0, align 1
3132   %tmp1 = sext <8 x i8> %tmp to <8 x i16>
3133   %tmp2 = load i8, ptr %p1, align 1
3134   %cvt1 = sext i8 %tmp2 to i16
3135   %tmp3 = load i8, ptr %p2, align 1
3136   %cvt2 = sext i8 %tmp3 to i16
3137   %tmp4 = extractelement <8 x i16> %tmp1, i32 4
3138   %tmp5 = extractelement <8 x i16> %tmp1, i32 7
3139   %tmp6 = insertelement <8 x i16> <i16 undef, i16 undef, i16 -5, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef>, i16 undef, i32 0
3140   %tmp7 = insertelement <8 x i16> %tmp6, i16 %tmp4, i32 1
3141   %tmp8 = insertelement <8 x i16> %tmp7, i16 undef, i32 3
3142   %tmp9 = insertelement <8 x i16> %tmp8, i16 %tmp5, i32 4
3143   %tmp10 = insertelement <8 x i16> %tmp9, i16 %cvt1, i32 5
3144   %tmp11 = insertelement <8 x i16> %tmp10, i16 %cvt2, i32 6
3145   %tmp12 = insertelement <8 x i16> %tmp11, i16 undef, i32 7
3146   %tmp13 = shufflevector <8 x i16> %tmp12, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 10, i32 3, i32 4, i32 5, i32 6, i32 7>
3147   ret <8 x i16> %tmp13
3148 }
3149
3150 ; Bug noticed in D96345
3151 define i32 @shuffle_binops_with_undef() {
3152 ; SSE-LABEL: shuffle_binops_with_undef:
3153 ; SSE:       # %bb.0: # %entry
3154 ; SSE-NEXT:    movdqa (%rax), %xmm0
3155 ; SSE-NEXT:    paddw %xmm0, %xmm0
3156 ; SSE-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
3157 ; SSE-NEXT:    psrlw %xmm1, %xmm0
3158 ; SSE-NEXT:    movdqa %xmm0, (%rax)
3159 ; SSE-NEXT:    retq
3160 ;
3161 ; AVX-LABEL: shuffle_binops_with_undef:
3162 ; AVX:       # %bb.0: # %entry
3163 ; AVX-NEXT:    vmovdqa (%rax), %xmm0
3164 ; AVX-NEXT:    vpaddw %xmm0, %xmm0, %xmm0
3165 ; AVX-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
3166 ; AVX-NEXT:    vpsrlw %xmm1, %xmm0, %xmm0
3167 ; AVX-NEXT:    vmovdqa %xmm0, (%rax)
3168 ; AVX-NEXT:    retq
3169 entry:
3170   %load0 = load <8 x i16>, ptr undef, align 16
3171   %load1 = load <8 x i16>, ptr undef, align 16
3172   %shuf0 = shufflevector <16 x i8> undef, <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison>, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
3173   %addi = add <8 x i16> %load0, %load1
3174   %bc0 = bitcast <8 x i16> %addi to <2 x i64>
3175   %bc1 = bitcast <16 x i8> %shuf0 to <8 x i16>
3176   %shuf1 = shufflevector <8 x i16> %load1, <8 x i16> poison, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
3177   %addi24 = add <8 x i16> %shuf1, %bc1
3178   %bc2 = bitcast <8 x i16> %addi24 to <2 x i64>
3179   %shuf2 = shufflevector <2 x i64> %bc0, <2 x i64> %bc2, <2 x i32> <i32 0, i32 2>
3180   %bc3 = bitcast <2 x i64> %shuf2 to <8 x i16>
3181   %psrli = call <8 x i16> @llvm.x86.sse2.psrli.w(<8 x i16> %bc3, i32 ptrtoint (ptr @shuffle_binops_with_undef to i32))
3182   store <8 x i16> %psrli, ptr undef, align 16
3183   ret i32 undef
3184 }
3185 declare <8 x i16> @llvm.x86.sse2.psrli.w(<8 x i16>, i32)
3186
3187 define void @PR43024() {
3188 ; SSE2-LABEL: PR43024:
3189 ; SSE2:       # %bb.0:
3190 ; SSE2-NEXT:    movaps {{.*#+}} xmm0 = [NaN,NaN,0.0E+0,0.0E+0]
3191 ; SSE2-NEXT:    movaps %xmm0, (%rax)
3192 ; SSE2-NEXT:    addss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
3193 ; SSE2-NEXT:    xorps %xmm1, %xmm1
3194 ; SSE2-NEXT:    addss %xmm1, %xmm0
3195 ; SSE2-NEXT:    addss %xmm1, %xmm0
3196 ; SSE2-NEXT:    movss %xmm0, (%rax)
3197 ; SSE2-NEXT:    retq
3198 ;
3199 ; SSSE3-LABEL: PR43024:
3200 ; SSSE3:       # %bb.0:
3201 ; SSSE3-NEXT:    movaps {{.*#+}} xmm0 = [NaN,NaN,0.0E+0,0.0E+0]
3202 ; SSSE3-NEXT:    movaps %xmm0, (%rax)
3203 ; SSSE3-NEXT:    addss %xmm0, %xmm0
3204 ; SSSE3-NEXT:    xorps %xmm1, %xmm1
3205 ; SSSE3-NEXT:    addss %xmm1, %xmm0
3206 ; SSSE3-NEXT:    addss %xmm1, %xmm0
3207 ; SSSE3-NEXT:    movss %xmm0, (%rax)
3208 ; SSSE3-NEXT:    retq
3209 ;
3210 ; SSE41-LABEL: PR43024:
3211 ; SSE41:       # %bb.0:
3212 ; SSE41-NEXT:    movaps {{.*#+}} xmm0 = [NaN,NaN,0.0E+0,0.0E+0]
3213 ; SSE41-NEXT:    movaps %xmm0, (%rax)
3214 ; SSE41-NEXT:    addss %xmm0, %xmm0
3215 ; SSE41-NEXT:    xorps %xmm1, %xmm1
3216 ; SSE41-NEXT:    addss %xmm1, %xmm0
3217 ; SSE41-NEXT:    addss %xmm1, %xmm0
3218 ; SSE41-NEXT:    movss %xmm0, (%rax)
3219 ; SSE41-NEXT:    retq
3220 ;
3221 ; AVX-LABEL: PR43024:
3222 ; AVX:       # %bb.0:
3223 ; AVX-NEXT:    vmovaps {{.*#+}} xmm0 = [NaN,NaN,0.0E+0,0.0E+0]
3224 ; AVX-NEXT:    vmovaps %xmm0, (%rax)
3225 ; AVX-NEXT:    vaddss {{\.?LCPI[0-9]+_[0-9]+}}+4(%rip), %xmm0, %xmm0
3226 ; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
3227 ; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0
3228 ; AVX-NEXT:    vaddss {{\.?LCPI[0-9]+_[0-9]+}}+12(%rip), %xmm0, %xmm0
3229 ; AVX-NEXT:    vmovss %xmm0, (%rax)
3230 ; AVX-NEXT:    retq
3231   store <4 x float> <float 0x7FF8000000000000, float 0x7FF8000000000000, float 0x0, float 0x0>, ptr undef, align 16
3232   %1 = load <4 x float>, ptr undef, align 16
3233   %2 = fmul <4 x float> %1, <float 0x0, float 0x0, float 0x0, float 0x0>
3234   %3 = shufflevector <4 x float> %2, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
3235   %4 = fadd <4 x float> %2, %3
3236   %5 = fadd <4 x float> zeroinitializer, %4
3237   %6 = shufflevector <4 x float> %2, <4 x float> undef, <4 x i32> <i32 3, i32 undef, i32 undef, i32 undef>
3238   %7 = fadd <4 x float> %6, %5
3239   %8 = extractelement <4 x float> %7, i32 0
3240   store float %8, ptr undef, align 8
3241   ret void
3242 }
3243
3244 define void @PR45604(ptr %dst, ptr %src) {
3245 ; SSE2-LABEL: PR45604:
3246 ; SSE2:       # %bb.0:
3247 ; SSE2-NEXT:    movdqa (%rsi), %xmm0
3248 ; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,0,0]
3249 ; SSE2-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5]
3250 ; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [0,65535,65535,65535,0,65535,65535,65535]
3251 ; SSE2-NEXT:    movdqa %xmm2, %xmm3
3252 ; SSE2-NEXT:    pandn %xmm1, %xmm3
3253 ; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [0,0,0,0,11,0,0,0,0,0,0,0,11,0,0,0]
3254 ; SSE2-NEXT:    por %xmm1, %xmm3
3255 ; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[1,1,1,1]
3256 ; SSE2-NEXT:    pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,5,5,5]
3257 ; SSE2-NEXT:    movdqa %xmm2, %xmm5
3258 ; SSE2-NEXT:    pandn %xmm4, %xmm5
3259 ; SSE2-NEXT:    por %xmm1, %xmm5
3260 ; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[2,2,2,2]
3261 ; SSE2-NEXT:    pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,5,5,5]
3262 ; SSE2-NEXT:    movdqa %xmm2, %xmm6
3263 ; SSE2-NEXT:    pandn %xmm4, %xmm6
3264 ; SSE2-NEXT:    por %xmm1, %xmm6
3265 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
3266 ; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5]
3267 ; SSE2-NEXT:    pandn %xmm0, %xmm2
3268 ; SSE2-NEXT:    por %xmm1, %xmm2
3269 ; SSE2-NEXT:    movdqa %xmm2, 48(%rdi)
3270 ; SSE2-NEXT:    movdqa %xmm6, 32(%rdi)
3271 ; SSE2-NEXT:    movdqa %xmm5, 16(%rdi)
3272 ; SSE2-NEXT:    movdqa %xmm3, (%rdi)
3273 ; SSE2-NEXT:    retq
3274 ;
3275 ; SSSE3-LABEL: PR45604:
3276 ; SSSE3:       # %bb.0:
3277 ; SSSE3-NEXT:    movdqa (%rsi), %xmm0
3278 ; SSSE3-NEXT:    movdqa %xmm0, %xmm1
3279 ; SSSE3-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[0,1],zero,zero,zero,zero,zero,zero,xmm1[2,3],zero,zero,zero,zero,zero,zero
3280 ; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [0,0,0,0,11,0,0,0,0,0,0,0,11,0,0,0]
3281 ; SSSE3-NEXT:    por %xmm2, %xmm1
3282 ; SSSE3-NEXT:    movdqa %xmm0, %xmm3
3283 ; SSSE3-NEXT:    pshufb {{.*#+}} xmm3 = xmm3[4,5],zero,zero,zero,zero,zero,zero,xmm3[6,7],zero,zero,zero,zero,zero,zero
3284 ; SSSE3-NEXT:    por %xmm2, %xmm3
3285 ; SSSE3-NEXT:    movdqa %xmm0, %xmm4
3286 ; SSSE3-NEXT:    pshufb {{.*#+}} xmm4 = xmm4[8,9],zero,zero,zero,zero,zero,zero,xmm4[10,11],zero,zero,zero,zero,zero,zero
3287 ; SSSE3-NEXT:    por %xmm2, %xmm4
3288 ; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[12,13],zero,zero,zero,zero,zero,zero,xmm0[14,15],zero,zero,zero,zero,zero,zero
3289 ; SSSE3-NEXT:    por %xmm2, %xmm0
3290 ; SSSE3-NEXT:    movdqa %xmm0, 48(%rdi)
3291 ; SSSE3-NEXT:    movdqa %xmm4, 32(%rdi)
3292 ; SSSE3-NEXT:    movdqa %xmm3, 16(%rdi)
3293 ; SSSE3-NEXT:    movdqa %xmm1, (%rdi)
3294 ; SSSE3-NEXT:    retq
3295 ;
3296 ; SSE41-LABEL: PR45604:
3297 ; SSE41:       # %bb.0:
3298 ; SSE41-NEXT:    movdqa (%rsi), %xmm0
3299 ; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
3300 ; SSE41-NEXT:    pmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
3301 ; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = <u,0,11,0,u,0,11,0>
3302 ; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
3303 ; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
3304 ; SSE41-NEXT:    pmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero
3305 ; SSE41-NEXT:    pblendw {{.*#+}} xmm3 = xmm3[0],xmm2[1,2,3],xmm3[4],xmm2[5,6,7]
3306 ; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[3,3,3,3]
3307 ; SSE41-NEXT:    pmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
3308 ; SSE41-NEXT:    pblendw {{.*#+}} xmm4 = xmm4[0],xmm2[1,2,3],xmm4[4],xmm2[5,6,7]
3309 ; SSE41-NEXT:    pmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
3310 ; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7]
3311 ; SSE41-NEXT:    movdqa %xmm0, (%rdi)
3312 ; SSE41-NEXT:    movdqa %xmm4, 48(%rdi)
3313 ; SSE41-NEXT:    movdqa %xmm3, 32(%rdi)
3314 ; SSE41-NEXT:    movdqa %xmm1, 16(%rdi)
3315 ; SSE41-NEXT:    retq
3316 ;
3317 ; AVX1-LABEL: PR45604:
3318 ; AVX1:       # %bb.0:
3319 ; AVX1-NEXT:    vmovdqa (%rsi), %xmm0
3320 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[3,3,3,3]
3321 ; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
3322 ; AVX1-NEXT:    vmovddup {{.*#+}} xmm2 = [11,11,11,0,11,11,11,0]
3323 ; AVX1-NEXT:    # xmm2 = mem[0,0]
3324 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
3325 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
3326 ; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero
3327 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7]
3328 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm3, %ymm1
3329 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1]
3330 ; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero
3331 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7]
3332 ; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
3333 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
3334 ; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm0
3335 ; AVX1-NEXT:    vmovups %ymm0, (%rdi)
3336 ; AVX1-NEXT:    vmovups %ymm1, 32(%rdi)
3337 ; AVX1-NEXT:    vzeroupper
3338 ; AVX1-NEXT:    retq
3339 ;
3340 ; AVX2-LABEL: PR45604:
3341 ; AVX2:       # %bb.0:
3342 ; AVX2-NEXT:    vmovdqa (%rsi), %xmm0
3343 ; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[0,2,0,2]
3344 ; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = <0,1,8,9,u,u,u,u,2,3,10,11,u,u,u,u,4,5,12,13,u,u,u,u,6,7,14,15,u,u,u,u>
3345 ; AVX2-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
3346 ; AVX2-NEXT:    vpbroadcastd {{.*#+}} ymm3 = [11,0,0,0,11,0,0,0,11,0,0,0,11,0,0,0,11,0,0,0,11,0,0,0,11,0,0,0,11,0,0,0]
3347 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2],ymm3[3],ymm1[4],ymm3[5],ymm1[6],ymm3[7]
3348 ; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[1,3,1,3]
3349 ; AVX2-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
3350 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2],ymm3[3],ymm0[4],ymm3[5],ymm0[6],ymm3[7]
3351 ; AVX2-NEXT:    vmovdqu %ymm0, 32(%rdi)
3352 ; AVX2-NEXT:    vmovdqu %ymm1, (%rdi)
3353 ; AVX2-NEXT:    vzeroupper
3354 ; AVX2-NEXT:    retq
3355   %v1 = load <8 x i16>, ptr %src, align 16
3356   %v2 = shufflevector <8 x i16> %v1, <8 x i16> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
3357   %v3 = shufflevector <16 x i16> %v2, <16 x i16> <i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, <32 x i32> <i32 0, i32 8, i32 16, i32 24, i32 1, i32 9, i32 17, i32 25, i32 2, i32 10, i32 18, i32 26, i32 3, i32 11, i32 19, i32 27, i32 4, i32 12, i32 20, i32 28, i32 5, i32 13, i32 21, i32 29, i32 6, i32 14, i32 22, i32 30, i32 7, i32 15, i32 23, i32 31>
3358   store <32 x i16> %v3, ptr %dst, align 16
3359   ret void
3360 }
3361
3362 ; getFauxShuffle AND/ANDN decoding wrongly assumed an undef src always gives an undef dst.
3363 define <2 x i64> @PR55157(ptr %0) {
3364 ; SSE-LABEL: PR55157:
3365 ; SSE:       # %bb.0:
3366 ; SSE-NEXT:    xorps %xmm0, %xmm0
3367 ; SSE-NEXT:    retq
3368 ;
3369 ; AVX-LABEL: PR55157:
3370 ; AVX:       # %bb.0:
3371 ; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
3372 ; AVX-NEXT:    retq
3373   %2 = load <16 x i8>, ptr %0, align 16
3374   %3 = icmp eq <16 x i8> %2, zeroinitializer
3375   %4 = tail call <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8> zeroinitializer, <16 x i8> zeroinitializer)
3376   %5 = select <16 x i1> %3, <16 x i8> <i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, <16 x i8> %4
3377   %6 = shufflevector <16 x i8> %5, <16 x i8> poison, <16 x i32> <i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15>
3378   %7 = bitcast <16 x i8> %6 to <2 x i64>
3379   ret <2 x i64> %7
3380 }
3381 declare <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8>, <16 x i8>)
3382
3383 ; SelectionDAG::isSplatValue - incorrect handling of undef sub-elements
3384 define <2 x i64> @PR56520(<16 x i8> %0) {
3385 ; SSE-LABEL: PR56520:
3386 ; SSE:       # %bb.0:
3387 ; SSE-NEXT:    pxor %xmm1, %xmm1
3388 ; SSE-NEXT:    pcmpeqb %xmm0, %xmm1
3389 ; SSE-NEXT:    movd %xmm1, %eax
3390 ; SSE-NEXT:    movsbl %al, %eax
3391 ; SSE-NEXT:    movd %eax, %xmm0
3392 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
3393 ; SSE-NEXT:    retq
3394 ;
3395 ; AVX1-LABEL: PR56520:
3396 ; AVX1:       # %bb.0:
3397 ; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
3398 ; AVX1-NEXT:    vpcmpeqb %xmm1, %xmm0, %xmm0
3399 ; AVX1-NEXT:    vmovd %xmm0, %eax
3400 ; AVX1-NEXT:    movsbl %al, %eax
3401 ; AVX1-NEXT:    vmovd %eax, %xmm0
3402 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
3403 ; AVX1-NEXT:    retq
3404 ;
3405 ; AVX2-SLOW-LABEL: PR56520:
3406 ; AVX2-SLOW:       # %bb.0:
3407 ; AVX2-SLOW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
3408 ; AVX2-SLOW-NEXT:    vpcmpeqb %xmm1, %xmm0, %xmm0
3409 ; AVX2-SLOW-NEXT:    vmovd %xmm0, %eax
3410 ; AVX2-SLOW-NEXT:    movsbl %al, %eax
3411 ; AVX2-SLOW-NEXT:    vmovd %eax, %xmm0
3412 ; AVX2-SLOW-NEXT:    vpbroadcastq %xmm0, %xmm0
3413 ; AVX2-SLOW-NEXT:    retq
3414 ;
3415 ; AVX2-FAST-LABEL: PR56520:
3416 ; AVX2-FAST:       # %bb.0:
3417 ; AVX2-FAST-NEXT:    vpxor %xmm1, %xmm1, %xmm1
3418 ; AVX2-FAST-NEXT:    vpcmpeqb %xmm1, %xmm0, %xmm0
3419 ; AVX2-FAST-NEXT:    vmovd %xmm0, %eax
3420 ; AVX2-FAST-NEXT:    movsbl %al, %eax
3421 ; AVX2-FAST-NEXT:    vmovd %eax, %xmm0
3422 ; AVX2-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3],zero,zero,zero,zero,xmm0[0,1,2,3],zero,zero,zero,zero
3423 ; AVX2-FAST-NEXT:    retq
3424   %2 = icmp eq <16 x i8> zeroinitializer, %0
3425   %3 = extractelement <16 x i1> %2, i64 0
3426   %4 = sext i1 %3 to i32
3427   %5 = insertelement <2 x i32> zeroinitializer, i32 %4, i64 0
3428   %6 = zext <2 x i32> %5 to <2 x i64>
3429   %7 = shufflevector <2 x i64> %6, <2 x i64> zeroinitializer, <2 x i32> zeroinitializer
3430   ret <2 x i64> %7
3431 }
3432
3433 define <4 x i32> @PR63700(i128 %0) {
3434 ; SSE2-LABEL: PR63700:
3435 ; SSE2:       # %bb.0:
3436 ; SSE2-NEXT:    movd %edi, %xmm0
3437 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
3438 ; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
3439 ; SSE2-NEXT:    retq
3440 ;
3441 ; SSSE3-LABEL: PR63700:
3442 ; SSSE3:       # %bb.0:
3443 ; SSSE3-NEXT:    movd %edi, %xmm0
3444 ; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,1,2,3],zero,zero,zero,zero,xmm0[0,1,2,3],zero,zero,zero,zero
3445 ; SSSE3-NEXT:    retq
3446 ;
3447 ; SSE41-LABEL: PR63700:
3448 ; SSE41:       # %bb.0:
3449 ; SSE41-NEXT:    movd %edi, %xmm0
3450 ; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
3451 ; SSE41-NEXT:    pmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
3452 ; SSE41-NEXT:    retq
3453 ;
3454 ; AVX1-LABEL: PR63700:
3455 ; AVX1:       # %bb.0:
3456 ; AVX1-NEXT:    vmovd %edi, %xmm0
3457 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
3458 ; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
3459 ; AVX1-NEXT:    retq
3460 ;
3461 ; AVX2-SLOW-LABEL: PR63700:
3462 ; AVX2-SLOW:       # %bb.0:
3463 ; AVX2-SLOW-NEXT:    vmovd %edi, %xmm0
3464 ; AVX2-SLOW-NEXT:    vpbroadcastd %xmm0, %xmm0
3465 ; AVX2-SLOW-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
3466 ; AVX2-SLOW-NEXT:    retq
3467 ;
3468 ; AVX2-FAST-LABEL: PR63700:
3469 ; AVX2-FAST:       # %bb.0:
3470 ; AVX2-FAST-NEXT:    vmovq %rdi, %xmm0
3471 ; AVX2-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3],zero,zero,zero,zero,xmm0[0,1,2,3],zero,zero,zero,zero
3472 ; AVX2-FAST-NEXT:    retq
3473   %vcmp = bitcast i128 %0 to <4 x i32>
3474   %shuffle.i = shufflevector <4 x i32> %vcmp, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 0, i32 undef, i32 undef>
3475   %shuffle.i11 = shufflevector <4 x i32> %shuffle.i, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
3476   ret <4 x i32> %shuffle.i11
3477 }
3478
3479 ; Test case reported on D105827: a float loaded from memory is splatted, scaled by <0.0, -2.0>, widened to <4 x float>, added and stored.
3480 define void @SpinningCube() {
3481 ; SSE2-LABEL: SpinningCube:
3482 ; SSE2:       # %bb.0: # %entry
3483 ; SSE2-NEXT:    movl $1065353216, (%rax) # imm = 0x3F800000
3484 ; SSE2-NEXT:    movaps {{.*#+}} xmm0 = <u,u,u,1.0E+0>
3485 ; SSE2-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
3486 ; SSE2-NEXT:    movapd {{.*#+}} xmm2 = <u,u,-2.0E+0,u>
3487 ; SSE2-NEXT:    movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
3488 ; SSE2-NEXT:    xorps %xmm3, %xmm3
3489 ; SSE2-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,0]
3490 ; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,3]
3491 ; SSE2-NEXT:    addps %xmm3, %xmm1
3492 ; SSE2-NEXT:    movaps %xmm1, (%rax)
3493 ; SSE2-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
3494 ; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0,0,2]
3495 ; SSE2-NEXT:    mulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
3496 ; SSE2-NEXT:    addps %xmm0, %xmm1
3497 ; SSE2-NEXT:    movaps %xmm1, (%rax)
3498 ; SSE2-NEXT:    retq
3499 ;
3500 ; SSSE3-LABEL: SpinningCube:
3501 ; SSSE3:       # %bb.0: # %entry
3502 ; SSSE3-NEXT:    movl $1065353216, (%rax) # imm = 0x3F800000
3503 ; SSSE3-NEXT:    movaps {{.*#+}} xmm0 = <u,u,u,1.0E+0>
3504 ; SSSE3-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
3505 ; SSSE3-NEXT:    movapd {{.*#+}} xmm2 = <u,u,-2.0E+0,u>
3506 ; SSSE3-NEXT:    movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
3507 ; SSSE3-NEXT:    xorps %xmm3, %xmm3
3508 ; SSSE3-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,0]
3509 ; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,3]
3510 ; SSSE3-NEXT:    addps %xmm3, %xmm1
3511 ; SSSE3-NEXT:    movaps %xmm1, (%rax)
3512 ; SSSE3-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
3513 ; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0,0,2]
3514 ; SSSE3-NEXT:    mulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
3515 ; SSSE3-NEXT:    addps %xmm0, %xmm1
3516 ; SSSE3-NEXT:    movaps %xmm1, (%rax)
3517 ; SSSE3-NEXT:    retq
3518 ;
3519 ; SSE41-LABEL: SpinningCube:
3520 ; SSE41:       # %bb.0: # %entry
3521 ; SSE41-NEXT:    movl $1065353216, (%rax) # imm = 0x3F800000
3522 ; SSE41-NEXT:    movaps {{.*#+}} xmm0 = <u,u,u,1.0E+0>
3523 ; SSE41-NEXT:    movaps {{.*#+}} xmm1 = <0.0E+0,0.0E+0,-2.0E+0,u>
3524 ; SSE41-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
3525 ; SSE41-NEXT:    movaps %xmm1, %xmm3
3526 ; SSE41-NEXT:    insertps {{.*#+}} xmm3 = xmm3[0,1,2],xmm2[0]
3527 ; SSE41-NEXT:    movaps %xmm0, %xmm4
3528 ; SSE41-NEXT:    insertps {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[2,3]
3529 ; SSE41-NEXT:    addps %xmm3, %xmm4
3530 ; SSE41-NEXT:    movaps %xmm4, (%rax)
3531 ; SSE41-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
3532 ; SSE41-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,0,0,2]
3533 ; SSE41-NEXT:    mulps %xmm1, %xmm2
3534 ; SSE41-NEXT:    addps %xmm0, %xmm2
3535 ; SSE41-NEXT:    movaps %xmm2, (%rax)
3536 ; SSE41-NEXT:    retq
3537 ;
3538 ; AVX-LABEL: SpinningCube:
3539 ; AVX:       # %bb.0: # %entry
3540 ; AVX-NEXT:    movl $1065353216, (%rax) # imm = 0x3F800000
3541 ; AVX-NEXT:    vbroadcastss {{.*#+}} xmm0 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
3542 ; AVX-NEXT:    vmovaps {{.*#+}} xmm1 = <0.0E+0,0.0E+0,-2.0E+0,u>
3543 ; AVX-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
3544 ; AVX-NEXT:    vinsertps {{.*#+}} xmm3 = xmm1[0,1,2],xmm2[0]
3545 ; AVX-NEXT:    vinsertps {{.*#+}} xmm2 = xmm0[0],xmm2[0],xmm0[2,3]
3546 ; AVX-NEXT:    vaddps %xmm2, %xmm3, %xmm2
3547 ; AVX-NEXT:    vmovaps %xmm2, (%rax)
3548 ; AVX-NEXT:    vbroadcastss (%rax), %xmm2
3549 ; AVX-NEXT:    vmulps %xmm1, %xmm2, %xmm1
3550 ; AVX-NEXT:    vaddps %xmm0, %xmm1, %xmm0
3551 ; AVX-NEXT:    vmovaps %xmm0, (%rax)
3552 ; AVX-NEXT:    retq
3553 entry:
3554   store float 1.000000e+00, ptr undef, align 4
3555   %0 = load float, ptr undef, align 4
3556   %1 = fmul float undef, 0.000000e+00
3557   %2 = insertelement <4 x float> poison, float %0, i32 3
3558   %3 = load float, ptr undef, align 4
3559   %4 = insertelement <2 x float> poison, float %3, i32 0
3560   %5 = shufflevector <2 x float> %4, <2 x float> poison, <2 x i32> zeroinitializer
3561   %6 = fmul <2 x float> %5, <float 0.000000e+00, float -2.000000e+00>
3562   %7 = fadd float %1, undef
3563   %8 = shufflevector <2 x float> %6, <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
3564   %9 = shufflevector <4 x float> undef, <4 x float> %8, <4 x i32> <i32 0, i32 4, i32 5, i32 undef>
3565   %10 = insertelement <4 x float> %9, float %7, i32 3
3566   %11 = insertelement <4 x float> %2, float 0x7FF8000000000000, i32 1
3567   %12 = insertelement <4 x float> %11, float undef, i32 0
3568   %13 = insertelement <4 x float> %12, float undef, i32 2
3569   %14 = fadd <4 x float> %10, %13
3570   store <4 x float> %14, ptr undef, align 16
3571   %15 = load float, ptr undef, align 4
3572   %16 = insertelement <2 x float> poison, float %15, i32 0
3573   %17 = shufflevector <2 x float> %16, <2 x float> poison, <2 x i32> zeroinitializer
3574   %18 = fmul <2 x float> %17, <float 0.000000e+00, float -2.000000e+00>
3575   %19 = shufflevector <2 x float> %18, <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
3576   %20 = shufflevector <4 x float> undef, <4 x float> %19, <4 x i32> <i32 0, i32 4, i32 5, i32 undef>
3577   %21 = fadd <4 x float> %20, %2
3578   store <4 x float> %21, ptr undef, align 16
3579   ret void
3580 }
3581
3582 ; Infinite loop test case reported on 5ca77541446d
3583 define void @autogen_SD25931() {
3584 ; CHECK-LABEL: autogen_SD25931:
3585 ; CHECK:       # %bb.0: # %BB
3586 ; CHECK-NEXT:    .p2align 4, 0x90
3587 ; CHECK-NEXT:  .LBB140_1: # %CF242
3588 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
3589 ; CHECK-NEXT:    jmp .LBB140_1
3590 BB:
3591   %Cmp16 = icmp uge <2 x i1> zeroinitializer, zeroinitializer
3592   %Shuff19 = shufflevector <2 x i1> zeroinitializer, <2 x i1> %Cmp16, <2 x i32> <i32 3, i32 1>
3593   %Shuff33 = shufflevector <2 x i1> %Shuff19, <2 x i1> zeroinitializer, <2 x i32> <i32 0, i32 2>
3594   br label %CF250
3595
3596 CF250:                                            ; preds = %CF250, %BB
3597   br i1 poison, label %CF250, label %CF259
3598
3599 CF259:                                            ; preds = %CF250
3600   %Cmp83 = icmp ule <2 x i1> %Shuff19, zeroinitializer
3601   br label %CF242
3602
3603 CF242:                                            ; preds = %CF242, %CF259
3604   %Shuff153 = shufflevector <2 x i1> %Shuff33, <2 x i1> poison, <2 x i32> <i32 3, i32 1>
3605   %Shuff161 = shufflevector <2 x i1> zeroinitializer, <2 x i1> %Cmp83, <2 x i32> <i32 1, i32 3>
3606   br label %CF242
3607 }