llvm/test/CodeGen/X86/vector-shuffle-combining-avx2.ll

   1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
   2 ; RUN: llc < %s -mtriple=i686-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,X86,AVX2
   3 ; RUN: llc < %s -mtriple=i686-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,X86,AVX512
   4 ; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,X64,AVX2
   5 ; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,X64,AVX512
   6
   7 declare <8 x i32> @llvm.x86.avx2.permd(<8 x i32>, <8 x i32>)
   8 declare <8 x float> @llvm.x86.avx2.permps(<8 x float>, <8 x i32>)
   9 declare <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8>, <16 x i8>)
  10 declare <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8>, <32 x i8>)
  11
  12 define <32 x i8> @combine_pshufb_pslldq(<32 x i8> %a0) {
     ; The pshufb mask uses index 128 for bytes 0-7 of each 128-bit lane and
     ; indices 0-7 for bytes 8-15. The shufflevector then selects only bytes
     ; 0-7 and 16-23 of that result plus elements of zeroinitializer (index 32),
     ; so the checks expect the whole function to fold to a zeroing vxorps.
  13 ; CHECK-LABEL: combine_pshufb_pslldq:
  14 ; CHECK:       # %bb.0:
  15 ; CHECK-NEXT:    vxorps %xmm0, %xmm0, %xmm0
  16 ; CHECK-NEXT:    ret{{[l|q]}}
  17   %1 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> <i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>)
  18   %2 = shufflevector <32 x i8> %1, <32 x i8> zeroinitializer, <32 x i32> <i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
  19   ret <32 x i8> %2
  20 }
  21
  22 define <32 x i8> @combine_pshufb_psrldq(<32 x i8> %a0) {
     ; Mirror of the pslldq case: the pshufb mask uses indices 8-15 for bytes
     ; 0-7 of each lane and index 128 for bytes 8-15. The shufflevector selects
     ; only bytes 8-15 and 24-31 of that result plus zeroinitializer elements,
     ; so the checks expect a single zeroing vxorps.
  23 ; CHECK-LABEL: combine_pshufb_psrldq:
  24 ; CHECK:       # %bb.0:
  25 ; CHECK-NEXT:    vxorps %xmm0, %xmm0, %xmm0
  26 ; CHECK-NEXT:    ret{{[l|q]}}
  27   %1 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> <i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128>)
  28   %2 = shufflevector <32 x i8> %1, <32 x i8> zeroinitializer, <32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32>
  29   ret <32 x i8> %2
  30 }
  31
  32 define <32 x i8> @combine_pshufb_vpermd(<8 x i32> %a) {
     ; permd with indices <0,1,2,3,4,5,6,4> followed by a byte shuffle that is
     ; the identity except that byte 30 is repeated in position 31. The checks
     ; expect both to merge into one in-lane vpshufb whose last four bytes come
     ; from bytes 16,17,18,18 (i.e. from dword 4).
  33 ; CHECK-LABEL: combine_pshufb_vpermd:
  34 ; CHECK:       # %bb.0:
  35 ; CHECK-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,16,17,18,18]
  36 ; CHECK-NEXT:    ret{{[l|q]}}
  37   %tmp0 = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %a, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 4>)
  38   %tmp1 = bitcast <8 x i32> %tmp0 to <32 x i8>
  39   %tmp2 = shufflevector <32 x i8> %tmp1, <32 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 30>
  40   ret <32 x i8> %tmp2
  41 }
  42
  43 define <32 x i8> @combine_pshufb_vpermps(<8 x float> %a) {
     ; Floating-point twin of combine_pshufb_vpermd: permps with indices
     ; <0,1,2,3,4,5,6,4>, then a near-identity byte shuffle (byte 30 repeated in
     ; position 31). The checks expect a single vpshufb with the same byte
     ; pattern as the integer variant.
  44 ; CHECK-LABEL: combine_pshufb_vpermps:
  45 ; CHECK:       # %bb.0:
  46 ; CHECK-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,16,17,18,18]
  47 ; CHECK-NEXT:    ret{{[l|q]}}
  48   %tmp0 = call <8 x float> @llvm.x86.avx2.permps(<8 x float> %a, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 4>)
  49   %tmp1 = bitcast <8 x float> %tmp0 to <32 x i8>
  50   %tmp2 = shufflevector <32 x i8> %tmp1, <32 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 30>
  51   ret <32 x i8> %tmp2
  52 }
  53
  54 define <32 x i8> @combine_and_pshufb(<32 x i8> %a0) {
     ; A shufflevector that replaces bytes 2-3 with zero (an AND-like byte
     ; mask) feeds a pshufb whose mask keeps bytes 0-1 and 8-9 of each lane and
     ; uses -1 everywhere else. The checks expect a zero register plus one
     ; vpblendw that keeps words 0, 4, 8 and 12 of the input.
  55 ; CHECK-LABEL: combine_and_pshufb:
  56 ; CHECK:       # %bb.0:
  57 ; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
  58 ; CHECK-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7],ymm0[8],ymm1[9,10,11],ymm0[12],ymm1[13,14,15]
  59 ; CHECK-NEXT:    ret{{[l|q]}}
  60   %1 = shufflevector <32 x i8> %a0, <32 x i8> zeroinitializer, <32 x i32> <i32 0, i32 1, i32 32, i32 32, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
  61   %2 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %1, <32 x i8> <i8 0, i8 1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 8, i8 9, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 0, i8 1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 8, i8 9, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
  62   ret <32 x i8> %2
  63 }
  64
  65 define <32 x i8> @combine_pshufb_and(<32 x i8> %a0) {
     ; Same two operations as combine_and_pshufb but in the opposite order: the
     ; pshufb runs first and the zeroing shufflevector (bytes 2-3 taken from
     ; zeroinitializer) second. The checks expect the identical vpxor +
     ; vpblendw sequence.
  66 ; CHECK-LABEL: combine_pshufb_and:
  67 ; CHECK:       # %bb.0:
  68 ; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
  69 ; CHECK-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7],ymm0[8],ymm1[9,10,11],ymm0[12],ymm1[13,14,15]
  70 ; CHECK-NEXT:    ret{{[l|q]}}
  71   %1 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> <i8 0, i8 1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 8, i8 9, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 0, i8 1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 8, i8 9, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
  72   %2 = shufflevector <32 x i8> %1, <32 x i8> zeroinitializer, <32 x i32> <i32 0, i32 1, i32 32, i32 32, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
  73   ret <32 x i8> %2
  74 }
  75
  76 define <4 x i64> @combine_permq_pshufb_as_vextracti128(<4 x i64> %a0) {
     ; Reverses the four i64 elements, then a pshufb swaps the two qwords of the
     ; low lane and uses index 255 for the whole upper lane. The checks expect
     ; this to become vextracti128 $1 (upper half moved to xmm0), followed by
     ; the vpaddq for the trailing constant add. X86 and X64 differ only in the
     ; constant-pool addressing and the ret suffix.
  77 ; X86-LABEL: combine_permq_pshufb_as_vextracti128:
  78 ; X86:       # %bb.0:
  79 ; X86-NEXT:    vextracti128 $1, %ymm0, %xmm0
  80 ; X86-NEXT:    vpaddq {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0
  81 ; X86-NEXT:    retl
  82 ;
  83 ; X64-LABEL: combine_permq_pshufb_as_vextracti128:
  84 ; X64:       # %bb.0:
  85 ; X64-NEXT:    vextracti128 $1, %ymm0, %xmm0
  86 ; X64-NEXT:    vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
  87 ; X64-NEXT:    retq
  88   %1 = shufflevector <4 x i64> %a0, <4 x i64> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
  89   %2 = bitcast <4 x i64> %1 to <32 x i8>
  90   %3 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %2, <32 x i8> <i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255>)
  91   %4 = bitcast <32 x i8> %3 to <4 x i64>
  92   %5 = add <4 x i64> %4, <i64 1, i64 1, i64 3, i64 3>
  93   ret <4 x i64> %5
  94 }
  95
  96 define <4 x i64> @combine_permq_pshufb_as_vmovdqa(<4 x i64> %a0) {
     ; Swaps the qwords within each 128-bit half, then a pshufb swaps the low
     ; lane's qwords back and uses index 255 for the whole upper lane. The
     ; checks expect an xmm-to-xmm vmovdqa (low half kept) followed by the
     ; vpaddq for the trailing constant add.
  97 ; X86-LABEL: combine_permq_pshufb_as_vmovdqa:
  98 ; X86:       # %bb.0:
  99 ; X86-NEXT:    vmovdqa %xmm0, %xmm0
 100 ; X86-NEXT:    vpaddq {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0
 101 ; X86-NEXT:    retl
 102 ;
 103 ; X64-LABEL: combine_permq_pshufb_as_vmovdqa:
 104 ; X64:       # %bb.0:
 105 ; X64-NEXT:    vmovdqa %xmm0, %xmm0
 106 ; X64-NEXT:    vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
 107 ; X64-NEXT:    retq
 108   %1 = shufflevector <4 x i64> %a0, <4 x i64> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
 109   %2 = bitcast <4 x i64> %1 to <32 x i8>
 110   %3 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %2, <32 x i8> <i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255>)
 111   %4 = bitcast <32 x i8> %3 to <4 x i64>
 112   %5 = add <4 x i64> %4, <i64 1, i64 1, i64 3, i64 3>
 113   ret <4 x i64> %5
 114 }
 115
 116 define <8 x i32> @combine_as_vpermd(<8 x i32> %a0) {
     ; Two different permutations of %a0 (a half-swapping shufflevector and a
     ; permd) are merged by a two-input shufflevector. Composing the three
     ; masks gives source elements [4,5,4,5,6,7,0,7], which is the index
     ; constant the checks expect for a single variable permute (emitted as
     ; vpermps).
 117 ; CHECK-LABEL: combine_as_vpermd:
 118 ; CHECK:       # %bb.0:
 119 ; CHECK-NEXT:    vmovaps {{.*#+}} ymm1 = [4,5,4,5,6,7,0,7]
 120 ; CHECK-NEXT:    vpermps %ymm0, %ymm1, %ymm0
 121 ; CHECK-NEXT:    ret{{[l|q]}}
 122   %1 = shufflevector <8 x i32> %a0, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
 123   %2 = tail call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %a0, <8 x i32> <i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 7, i32 6>)
 124   %3 = shufflevector <8 x i32> %1, <8 x i32> %2, <8 x i32> <i32 0, i32 8, i32 9, i32 1, i32 15, i32 14, i32 4, i32 3>
 125   ret <8 x i32> %3
 126 }
 127
 128 define <8 x float> @combine_as_vpermps(<8 x float> %a0) {
     ; Float variant of combine_as_vpermd. The permps index vector contains an
     ; undef element, and the final shufflevector reads that element into
     ; result position 5, so the expected index constant is <6,4,7,5,1,u,4,7>
     ; with an undefined entry.
 129 ; CHECK-LABEL: combine_as_vpermps:
 130 ; CHECK:       # %bb.0:
 131 ; CHECK-NEXT:    vmovaps {{.*#+}} ymm1 = <6,4,7,5,1,u,4,7>
 132 ; CHECK-NEXT:    vpermps %ymm0, %ymm1, %ymm0
 133 ; CHECK-NEXT:    ret{{[l|q]}}
 134   %1 = shufflevector <8 x float> %a0, <8 x float> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
 135   %2 = tail call <8 x float> @llvm.x86.avx2.permps(<8 x float> %a0, <8 x i32> <i32 1, i32 undef, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>)
 136   %3 = shufflevector <8 x float> %1, <8 x float> %2, <8 x i32> <i32 15, i32 0, i32 14, i32 1, i32 8, i32 9, i32 4, i32 3>
 137   ret <8 x float> %3
 138 }
 139
 140 define <32 x i8> @combine_permq_pshufb_as_vmovaps(<4 x i64> %a0) {
     ; Same shuffle pair as combine_permq_pshufb_as_vmovdqa but without the
     ; trailing integer add: the low half is kept and the upper lane's pshufb
     ; indices are all 255. The checks expect just an xmm-to-xmm vmovaps.
 141 ; CHECK-LABEL: combine_permq_pshufb_as_vmovaps:
 142 ; CHECK:       # %bb.0:
 143 ; CHECK-NEXT:    vmovaps %xmm0, %xmm0
 144 ; CHECK-NEXT:    ret{{[l|q]}}
 145   %1 = shufflevector <4 x i64> %a0, <4 x i64> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
 146   %2 = bitcast <4 x i64> %1 to <32 x i8>
 147   %3 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %2, <32 x i8> <i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255>)
 148   ret <32 x i8> %3
 149 }
 150
 151 define <32 x i8> @combine_permq_pshufb_as_vpblendd(<4 x i64> %a0) {
     ; Swaps the qwords within each half, then a pshufb uses index 255 for the
     ; whole low lane and swaps the upper lane's qwords back. The checks expect
     ; a blend of a zero register (elements 0-3) with the original upper half
     ; (elements 4-7), emitted as vblendps.
 152 ; CHECK-LABEL: combine_permq_pshufb_as_vpblendd:
 153 ; CHECK:       # %bb.0:
 154 ; CHECK-NEXT:    vxorps %xmm1, %xmm1, %xmm1
 155 ; CHECK-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
 156 ; CHECK-NEXT:    ret{{[l|q]}}
 157   %1 = shufflevector <4 x i64> %a0, <4 x i64> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
 158   %2 = bitcast <4 x i64> %1 to <32 x i8>
 159   %3 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %2, <32 x i8> <i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>)
 160   ret <32 x i8> %3
 161 }
 162
 163 define <16 x i8> @combine_pshufb_as_vpbroadcastb128(<16 x i8> %a) {
     ; A 128-bit pshufb with an all-zero mask selects byte 0 for every result
     ; byte; the checks expect it to be emitted as vpbroadcastb.
 164 ; CHECK-LABEL: combine_pshufb_as_vpbroadcastb128:
 165 ; CHECK:       # %bb.0:
 166 ; CHECK-NEXT:    vpbroadcastb %xmm0, %xmm0
 167 ; CHECK-NEXT:    ret{{[l|q]}}
 168   %1 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a, <16 x i8> zeroinitializer)
 169   ret <16 x i8> %1
 170 }
 171
 172 define <32 x i8> @combine_pshufb_as_vpbroadcastb256(<2 x i64> %a) {
     ; Widens %a to 256 bits (only element 0 defined), applies a pshufb with an
     ; all-zero mask, then a permd with all-zero indices. The checks expect the
     ; chain to collapse into one vpbroadcastb from xmm0 to ymm0.
 173 ; CHECK-LABEL: combine_pshufb_as_vpbroadcastb256:
 174 ; CHECK:       # %bb.0:
 175 ; CHECK-NEXT:    vpbroadcastb %xmm0, %ymm0
 176 ; CHECK-NEXT:    ret{{[l|q]}}
 177   %1 = shufflevector <2 x i64> %a, <2 x i64> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
 178   %2 = bitcast <4 x i64> %1 to <32 x i8>
 179   %3 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %2, <32 x i8> zeroinitializer)
 180   %4 = bitcast <32 x i8> %3 to <8 x i32>
 181   %5 = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %4, <8 x i32> zeroinitializer)
 182   %6 = bitcast <8 x i32> %5 to <32 x i8>
 183   ret <32 x i8> %6
 184 }
 185
 186 define <16 x i8> @combine_pshufb_as_vpbroadcastw128(<16 x i8> %a) {
     ; The pshufb mask repeats byte indices 0,1 in every 16-bit slot, i.e. a
     ; splat of word 0; the checks expect vpbroadcastw.
 187 ; CHECK-LABEL: combine_pshufb_as_vpbroadcastw128:
 188 ; CHECK:       # %bb.0:
 189 ; CHECK-NEXT:    vpbroadcastw %xmm0, %xmm0
 190 ; CHECK-NEXT:    ret{{[l|q]}}
 191   %1 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a, <16 x i8> <i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1>)
 192   ret <16 x i8> %1
 193 }
 194
 195 define <32 x i8> @combine_pshufb_as_vpbroadcastw256(<2 x i64> %a) {
     ; Widens %a to 256 bits, applies a pshufb whose mask repeats byte indices
     ; 0,1 in every 16-bit slot, then a permd with all-zero indices. The checks
     ; expect one vpbroadcastw from xmm0 to ymm0.
 196 ; CHECK-LABEL: combine_pshufb_as_vpbroadcastw256:
 197 ; CHECK:       # %bb.0:
 198 ; CHECK-NEXT:    vpbroadcastw %xmm0, %ymm0
 199 ; CHECK-NEXT:    ret{{[l|q]}}
 200   %1 = shufflevector <2 x i64> %a, <2 x i64> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
 201   %2 = bitcast <4 x i64> %1 to <32 x i8>
 202   %3 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %2, <32 x i8> <i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1>)
 203   %4 = bitcast <32 x i8> %3 to <8 x i32>
 204   %5 = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %4, <8 x i32> zeroinitializer)
 205   %6 = bitcast <8 x i32> %5 to <32 x i8>
 206   ret <32 x i8> %6
 207 }
 208
 209 define <16 x i8> @combine_pshufb_as_vpbroadcastd128(<16 x i8> %a) {
     ; The pshufb mask repeats byte indices 0-3 in every 32-bit slot (splat of
     ; dword 0), followed by a byte-wise add of a constant vector. The checks
     ; expect vpbroadcastd plus a vpaddb with a constant-pool operand; X86 and
     ; X64 differ only in how that constant is addressed and the ret suffix.
 210 ; X86-LABEL: combine_pshufb_as_vpbroadcastd128:
 211 ; X86:       # %bb.0:
 212 ; X86-NEXT:    vpbroadcastd %xmm0, %xmm0
 213 ; X86-NEXT:    vpaddb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
 214 ; X86-NEXT:    retl
 215 ;
 216 ; X64-LABEL: combine_pshufb_as_vpbroadcastd128:
 217 ; X64:       # %bb.0:
 218 ; X64-NEXT:    vpbroadcastd %xmm0, %xmm0
 219 ; X64-NEXT:    vpaddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
 220 ; X64-NEXT:    retq
 221   %1 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3>)
 222   %2 = add <16 x i8> %1, <i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3>
 223   ret <16 x i8> %2
 224 }
 225
 226 define <8 x i32> @combine_permd_as_vpbroadcastd256(<4 x i32> %a) {
     ; Widens %a to 8 x i32 (only element 0 defined) and applies a permd with
     ; all-zero indices, then adds a constant vector. The checks expect
     ; vpbroadcastd from xmm0 to ymm0 followed by vpaddd with a constant-pool
     ; operand.
 227 ; X86-LABEL: combine_permd_as_vpbroadcastd256:
 228 ; X86:       # %bb.0:
 229 ; X86-NEXT:    vpbroadcastd %xmm0, %ymm0
 230 ; X86-NEXT:    vpaddd {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0
 231 ; X86-NEXT:    retl
 232 ;
 233 ; X64-LABEL: combine_permd_as_vpbroadcastd256:
 234 ; X64:       # %bb.0:
 235 ; X64-NEXT:    vpbroadcastd %xmm0, %ymm0
 236 ; X64-NEXT:    vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
 237 ; X64-NEXT:    retq
 238   %1 = shufflevector <4 x i32> %a, <4 x i32> undef, <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
 239   %2 = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %1, <8 x i32> zeroinitializer)
 240   %3 = add <8 x i32> %2, <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
 241   ret <8 x i32> %3
 242 }
 243
 244 define <16 x i8> @combine_pshufb_as_vpbroadcastq128(<16 x i8> %a) {
     ; The pshufb mask repeats byte indices 0-7 in both 64-bit halves (splat of
     ; qword 0). The checks expect this to be emitted as vmovddup.
 245 ; CHECK-LABEL: combine_pshufb_as_vpbroadcastq128:
 246 ; CHECK:       # %bb.0:
 247 ; CHECK-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
 248 ; CHECK-NEXT:    ret{{[l|q]}}
 249   %1 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>)
 250   ret <16 x i8> %1
 251 }
 252
 253 define <8 x i32> @combine_permd_as_vpbroadcastq256(<4 x i32> %a) {
     ; Widens %a to 8 x i32 and applies a permd whose indices repeat 0,1 (splat
     ; of the low 64 bits), then adds a constant vector. The checks expect
     ; vpbroadcastq from xmm0 to ymm0 followed by vpaddd with a constant-pool
     ; operand.
 254 ; X86-LABEL: combine_permd_as_vpbroadcastq256:
 255 ; X86:       # %bb.0:
 256 ; X86-NEXT:    vpbroadcastq %xmm0, %ymm0
 257 ; X86-NEXT:    vpaddd {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0
 258 ; X86-NEXT:    retl
 259 ;
 260 ; X64-LABEL: combine_permd_as_vpbroadcastq256:
 261 ; X64:       # %bb.0:
 262 ; X64-NEXT:    vpbroadcastq %xmm0, %ymm0
 263 ; X64-NEXT:    vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
 264 ; X64-NEXT:    retq
 265   %1 = shufflevector <4 x i32> %a, <4 x i32> undef, <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
 266   %2 = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %1, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>)
 267   %3 = add <8 x i32> %2, <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
 268   ret <8 x i32> %3
 269 }
 270
 271 define <4 x float> @combine_pshufb_as_vpbroadcastss128(<4 x float> %a) {
     ; A float vector is bitcast to bytes and run through a pshufb whose mask
     ; repeats byte indices 0-3 (splat of element 0), then bitcast back. The
     ; checks expect vbroadcastss.
 272 ; CHECK-LABEL: combine_pshufb_as_vpbroadcastss128:
 273 ; CHECK:       # %bb.0:
 274 ; CHECK-NEXT:    vbroadcastss %xmm0, %xmm0
 275 ; CHECK-NEXT:    ret{{[l|q]}}
 276   %1 = bitcast <4 x float> %a to <16 x i8>
 277   %2 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %1, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3>)
 278   %3 = bitcast <16 x i8> %2 to <4 x float>
 279   ret <4 x float> %3
 280 }
 281
 282 define <8 x float> @combine_permps_as_vpbroadcastss256(<4 x float> %a) {
     ; Widens %a to 8 x float (only element 0 defined) and applies a permps
     ; with all-zero indices. The checks expect vbroadcastss from xmm0 to ymm0.
 283 ; CHECK-LABEL: combine_permps_as_vpbroadcastss256:
 284 ; CHECK:       # %bb.0:
 285 ; CHECK-NEXT:    vbroadcastss %xmm0, %ymm0
 286 ; CHECK-NEXT:    ret{{[l|q]}}
 287   %1 = shufflevector <4 x float> %a, <4 x float> undef, <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
 288   %2 = call <8 x float> @llvm.x86.avx2.permps(<8 x float> %1, <8 x i32> zeroinitializer)
 289   ret <8 x float> %2
 290 }
 291
 292 define <4 x double> @combine_permps_as_vpbroadcastsd256(<2 x double> %a) {
     ; Widens %a to 4 x double, bitcasts to 8 x float and applies a permps
     ; whose indices repeat 0,1 (splat of the low 64 bits). The checks expect
     ; vbroadcastsd from xmm0 to ymm0.
 293 ; CHECK-LABEL: combine_permps_as_vpbroadcastsd256:
 294 ; CHECK:       # %bb.0:
 295 ; CHECK-NEXT:    vbroadcastsd %xmm0, %ymm0
 296 ; CHECK-NEXT:    ret{{[l|q]}}
 297   %1 = shufflevector <2 x double> %a, <2 x double> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
 298   %2 = bitcast <4 x double> %1 to <8 x float>
 299   %3 = call <8 x float> @llvm.x86.avx2.permps(<8 x float> %2, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>)
 300   %4 = bitcast <8 x float> %3 to <4 x double>
 301   ret <4 x double> %4
 302 }
 303
 304 define <16 x i8> @combine_vpbroadcast_pshufb_as_vpbroadcastb128(<16 x i8> %a) {
     ; A byte splat (shufflevector with an all-zero mask) is followed by a
     ; pshufb with an all-zero mask. The checks expect the pair to fold into a
     ; single vpbroadcastb.
 305 ; CHECK-LABEL: combine_vpbroadcast_pshufb_as_vpbroadcastb128:
 306 ; CHECK:       # %bb.0:
 307 ; CHECK-NEXT:    vpbroadcastb %xmm0, %xmm0
 308 ; CHECK-NEXT:    ret{{[l|q]}}
 309   %1 = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> zeroinitializer
 310   %2 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %1, <16 x i8> zeroinitializer)
 311   ret <16 x i8> %2
 312 }
 313
 314 define <32 x i8> @combine_vpbroadcast_pshufb_as_vpbroadcastb256(<32 x i8> %a) {
     ; 256-bit version: a byte splat via shufflevector followed by a pshufb
     ; with an all-zero mask. The checks expect a single vpbroadcastb from xmm0
     ; to ymm0.
 315 ; CHECK-LABEL: combine_vpbroadcast_pshufb_as_vpbroadcastb256:
 316 ; CHECK:       # %bb.0:
 317 ; CHECK-NEXT:    vpbroadcastb %xmm0, %ymm0
 318 ; CHECK-NEXT:    ret{{[l|q]}}
 319   %1 = shufflevector <32 x i8> %a, <32 x i8> undef, <32 x i32> zeroinitializer
 320   %2 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %1, <32 x i8> zeroinitializer)
 321   ret <32 x i8> %2
 322 }
 323
 324 define <4 x float> @combine_vpbroadcast_pshufb_as_vpbroadcastss128(<4 x float> %a) {
     ; A float splat via shufflevector is followed by a pshufb whose mask
     ; repeats byte indices 0-3 (a second splat of element 0). The checks
     ; expect a single vbroadcastss.
 325 ; CHECK-LABEL: combine_vpbroadcast_pshufb_as_vpbroadcastss128:
 326 ; CHECK:       # %bb.0:
 327 ; CHECK-NEXT:    vbroadcastss %xmm0, %xmm0
 328 ; CHECK-NEXT:    ret{{[l|q]}}
 329   %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> zeroinitializer
 330   %2 = bitcast <4 x float> %1 to <16 x i8>
 331   %3 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %2, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3>)
 332   %4 = bitcast <16 x i8> %3 to <4 x float>
 333   ret <4 x float> %4
 334 }
 335
 336 define <8 x float> @combine_vpbroadcast_permd_as_vpbroadcastss256(<4 x float> %a) {
     ; A widening float splat via shufflevector is followed by a permps with
     ; all-zero indices. The checks expect a single vbroadcastss from xmm0 to
     ; ymm0.
 337 ; CHECK-LABEL: combine_vpbroadcast_permd_as_vpbroadcastss256:
 338 ; CHECK:       # %bb.0:
 339 ; CHECK-NEXT:    vbroadcastss %xmm0, %ymm0
 340 ; CHECK-NEXT:    ret{{[l|q]}}
 341   %1 = shufflevector <4 x float> %a, <4 x float> undef, <8 x i32> zeroinitializer
 342   %2 = call <8 x float> @llvm.x86.avx2.permps(<8 x float> %1, <8 x i32> zeroinitializer)
 343   ret <8 x float> %2
 344 }
 345
 346 define <4 x double> @combine_vpbroadcast_permd_as_vpbroadcastsd256(<2 x double> %a) {
     ; A widening double splat via shufflevector is followed by a permps whose
     ; indices repeat 0,1 (a second splat of the low 64 bits). The checks
     ; expect a single vbroadcastsd from xmm0 to ymm0.
 347 ; CHECK-LABEL: combine_vpbroadcast_permd_as_vpbroadcastsd256:
 348 ; CHECK:       # %bb.0:
 349 ; CHECK-NEXT:    vbroadcastsd %xmm0, %ymm0
 350 ; CHECK-NEXT:    ret{{[l|q]}}
 351   %1 = shufflevector <2 x double> %a, <2 x double> undef, <4 x i32> zeroinitializer
 352   %2 = bitcast <4 x double> %1 to <8 x float>
 353   %3 = call <8 x float> @llvm.x86.avx2.permps(<8 x float> %2, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>)
 354   %4 = bitcast <8 x float> %3 to <4 x double>
 355   ret <4 x double> %4
 356 }
 357
 358 define <8 x i32> @combine_permd_as_permq(<8 x i32> %a) {
     ; The permd indices come in adjacent even/odd pairs (0,1),(4,5),(4,5),
     ; (2,3), so the permute can be expressed on 64-bit elements as [0,2,2,1].
     ; The checks expect an immediate-controlled vpermpd instead of a variable
     ; permute.
 359 ; CHECK-LABEL: combine_permd_as_permq:
 360 ; CHECK:       # %bb.0:
 361 ; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,1]
 362 ; CHECK-NEXT:    ret{{[l|q]}}
 363   %1 = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %a, <8 x i32> <i32 0, i32 1, i32 4, i32 5, i32 4, i32 5, i32 2, i32 3>)
 364   ret <8 x i32> %1
 365 }
 366
 367 define <8 x float> @combine_permps_as_permpd(<8 x float> %a) {
     ; The permps indices come in adjacent even/odd pairs (6,7),(4,5),(0,1),
     ; (2,3), which corresponds to the 64-bit element order [3,2,0,1]. The
     ; checks expect an immediate-controlled vpermpd.
 368 ; CHECK-LABEL: combine_permps_as_permpd:
 369 ; CHECK:       # %bb.0:
 370 ; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[3,2,0,1]
 371 ; CHECK-NEXT:    ret{{[l|q]}}
 372   %1 = call <8 x float> @llvm.x86.avx2.permps(<8 x float> %a, <8 x i32> <i32 6, i32 7, i32 4, i32 5, i32 0, i32 1, i32 2, i32 3>)
 373   ret <8 x float> %1
 374 }
 375
 376 define <8 x float> @combine_permps_as_vpermilps(<8 x float> %a, i32 %a1) {
     ; Element 0 of the permps index vector is replaced by the runtime value
     ; %a1, but the trailing shufflevector overwrites result element 0 with
     ; element 1, so the variable index is never observed. The remaining
     ; indices stay within their 128-bit lane, and the checks expect a constant
     ; vpermilps with pattern [2,2,1,0,7,6,5,4].
 377 ; CHECK-LABEL: combine_permps_as_vpermilps:
 378 ; CHECK:       # %bb.0:
 379 ; CHECK-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[2,2,1,0,7,6,5,4]
 380 ; CHECK-NEXT:    ret{{[l|q]}}
 381   %1 = insertelement <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>, i32 %a1, i32 0
 382   %2 = call <8 x float> @llvm.x86.avx2.permps(<8 x float> %a, <8 x i32> %1)
 383   %3 = shufflevector <8 x float> %2, <8 x float> undef, <8 x i32> <i32 1, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 384   ret <8 x float> %3
 385 }
 386
 387 define <4 x i64> @combine_pshufb_as_zext(<32 x i8> %a0) {
     ; The shufflevector rearranges the low 16 bytes of %a0 across both lanes,
     ; and the pshufb then picks one byte pair per 64-bit slot with -1 for the
     ; other six bytes. Composed, each i64 holds one of the low four words of
     ; %a0 with zero above it, so the checks expect a single vpmovzxwq.
 388 ; CHECK-LABEL: combine_pshufb_as_zext:
 389 ; CHECK:       # %bb.0:
 390 ; CHECK-NEXT:    vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
 391 ; CHECK-NEXT:    ret{{[l|q]}}
 392   %1 = shufflevector <32 x i8> %a0, <32 x i8> undef, <32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 393   %2 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %1, <32 x i8> <i8 8, i8 9, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 10, i8 11, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 4, i8 5, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 6, i8 7, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
 394   %3 = bitcast <32 x i8> %2 to <4 x i64>
 395   ret <4 x i64> %3
 396 }
 397
 398 define <4 x i64> @combine_pshufb_as_zext128(<32 x i8> %a0) {
     ; The shufflevector byte-reverses the low 16 bytes of %a0 into both lanes,
     ; and the pshufb then picks byte pairs 15,14 and 13,12 of each lane with
     ; -1 elsewhere. Unlike combine_pshufb_as_zext, the recorded output is not
     ; a single extend: the checks currently expect a vpshufb / vpermq /
     ; vpshufb sequence.
 399 ; CHECK-LABEL: combine_pshufb_as_zext128:
 400 ; CHECK:       # %bb.0:
 401 ; CHECK-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,7,6,5,4,3,2,1,0]
 402 ; CHECK-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
 403 ; CHECK-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[15,14],zero,zero,zero,zero,zero,zero,ymm0[13,12],zero,zero,zero,zero,zero,zero,ymm0[31,30],zero,zero,zero,zero,zero,zero,ymm0[29,28],zero,zero,zero,zero,zero,zero
 404 ; CHECK-NEXT:    ret{{[l|q]}}
 405   %1 = shufflevector <32 x i8> %a0, <32 x i8> undef, <32 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
 406   %2 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %1, <32 x i8> <i8 15, i8 14, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 13, i8 12, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 15, i8 14, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 13, i8 12, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
 407   %3 = bitcast <32 x i8> %2 to <4 x i64>
 408   ret <4 x i64> %3
 409 }
 410
 411 define <4 x double> @combine_pshufb_as_vzmovl_64(<4 x double> %a0) {
     ; The pshufb mask keeps bytes 0-7 in place and uses -1 for the other 24
     ; bytes, i.e. the low 64-bit element is kept and the rest is zero. The
     ; checks expect a vmovq on xmm0.
 412 ; CHECK-LABEL: combine_pshufb_as_vzmovl_64:
 413 ; CHECK:       # %bb.0:
 414 ; CHECK-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
 415 ; CHECK-NEXT:    ret{{[l|q]}}
 416   %1 = bitcast <4 x double> %a0 to <32 x i8>
 417   %2 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %1, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
 418   %3 = bitcast <32 x i8> %2 to <4 x double>
 419   ret <4 x double> %3
 420 }
 421
 422 define <8 x float> @combine_pshufb_as_vzmovl_32(<8 x float> %a0) {
     ; The pshufb mask keeps bytes 0-3 in place and uses -1 for the other 28
     ; bytes, i.e. the low 32-bit element is kept and the rest is zero. The
     ; checks expect a zero register plus an xmm vblendps that keeps element 0.
 423 ; CHECK-LABEL: combine_pshufb_as_vzmovl_32:
 424 ; CHECK:       # %bb.0:
 425 ; CHECK-NEXT:    vxorps %xmm1, %xmm1, %xmm1
 426 ; CHECK-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 427 ; CHECK-NEXT:    ret{{[l|q]}}
 428   %1 = bitcast <8 x float> %a0 to <32 x i8>
 429   %2 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %1, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
 430   %3 = bitcast <32 x i8> %2 to <8 x float>
 431   ret <8 x float> %3
 432 }
 433
 434 define <32 x i8> @combine_pshufb_as_pslldq(<32 x i8> %a0) {
     ; Per 128-bit lane the pshufb mask has ten 128 entries followed by byte
     ; indices 0-5, i.e. each lane is shifted left by ten bytes with zero fill.
     ; The checks expect a single vpslldq.
 435 ; CHECK-LABEL: combine_pshufb_as_pslldq:
 436 ; CHECK:       # %bb.0:
 437 ; CHECK-NEXT:    vpslldq {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[0,1,2,3,4,5],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,18,19,20,21]
 438 ; CHECK-NEXT:    ret{{[l|q]}}
 439   %res0 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> <i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5>)
 440   ret <32 x i8> %res0
 441 }
 442
 443 define <32 x i8> @combine_pshufb_as_psrldq(<32 x i8> %a0) {
     ; Per 128-bit lane the pshufb mask selects byte 15 into position 0 and
     ; uses 128 for the other fifteen bytes, i.e. each lane is shifted right by
     ; fifteen bytes with zero fill. The checks expect a single vpsrldq.
 444 ; CHECK-LABEL: combine_pshufb_as_psrldq:
 445 ; CHECK:       # %bb.0:
 446 ; CHECK-NEXT:    vpsrldq {{.*#+}} ymm0 = ymm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 447 ; CHECK-NEXT:    ret{{[l|q]}}
 448   %res0 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> <i8 15, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 15, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128>)
 449   ret <32 x i8> %res0
 450 }
 451
 452 define <32 x i8> @combine_pshufb_as_psrlw(<32 x i8> %a0) {
     ; In every 16-bit slot the pshufb mask moves the odd (high) byte into the
     ; low position and uses 128 for the high position. The checks expect this
     ; to be emitted as a word shift right by 8 bits (vpsrlw $8).
 453 ; CHECK-LABEL: combine_pshufb_as_psrlw:
 454 ; CHECK:       # %bb.0:
 455 ; CHECK-NEXT:    vpsrlw $8, %ymm0, %ymm0
 456 ; CHECK-NEXT:    ret{{[l|q]}}
 457   %res0 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> <i8 1, i8 128, i8 3, i8 128, i8 5, i8 128, i8 7, i8 128, i8 9, i8 128, i8 11, i8 128, i8 13, i8 128, i8 15, i8 128, i8 17, i8 128, i8 19, i8 128, i8 21, i8 128, i8 23, i8 128, i8 25, i8 128, i8 27, i8 128, i8 29, i8 128, i8 31, i8 128>)
 458   ret <32 x i8> %res0
 459 }
 460
 461 define <32 x i8> @combine_pshufb_as_pslld(<32 x i8> %a0) {
     ; In every 32-bit slot the pshufb mask uses 128 for the three low bytes
     ; and moves the slot's lowest byte into the top position. The checks
     ; expect this to be emitted as a dword shift left by 24 bits (vpslld $24).
 462 ; CHECK-LABEL: combine_pshufb_as_pslld:
 463 ; CHECK:       # %bb.0:
 464 ; CHECK-NEXT:    vpslld $24, %ymm0, %ymm0
 465 ; CHECK-NEXT:    ret{{[l|q]}}
 466   %res0 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> <i8 128, i8 128, i8 128, i8 0, i8 128, i8 128, i8 128, i8 4, i8 128, i8 128, i8 128, i8 8, i8 128, i8 128, i8 128, i8 12, i8 128, i8 128, i8 128, i8 16, i8 128, i8 128, i8 128, i8 20, i8 128, i8 128, i8 128, i8 24, i8 128, i8 128, i8 128, i8 28>)
 467   ret <32 x i8> %res0
 468 }
 469
 470 define <32 x i8> @combine_pshufb_as_psrlq(<32 x i8> %a0) {
     ; In every 64-bit slot the pshufb mask moves bytes 5-7 down to positions
     ; 0-2 and uses 128 for the upper five bytes. The checks expect this to be
     ; emitted as a qword shift right by 40 bits (vpsrlq $40).
 471 ; CHECK-LABEL: combine_pshufb_as_psrlq:
 472 ; CHECK:       # %bb.0:
 473 ; CHECK-NEXT:    vpsrlq $40, %ymm0, %ymm0
 474 ; CHECK-NEXT:    ret{{[l|q]}}
 475   %res0 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> <i8 5, i8 6, i8 7, i8 128, i8 128, i8 128, i8 128, i8 128, i8 13, i8 14, i8 15, i8 128, i8 128, i8 128, i8 128, i8 128, i8 21, i8 22, i8 23, i8 128, i8 128, i8 128, i8 128, i8 128, i8 29, i8 30, i8 31, i8 128, i8 128, i8 128, i8 128, i8 128>)
 476   ret <32 x i8> %res0
 477 }
 478
 479 define <32 x i8> @combine_pshufb_as_pshuflw(<32 x i8> %a0) {
     ; The pshufb mask swaps adjacent words within the low 8 bytes of each lane
     ; and leaves bytes 8-15 in place. The checks expect a single vpshuflw with
     ; word order [1,0,3,2] in the low half of each lane.
 480 ; CHECK-LABEL: combine_pshufb_as_pshuflw:
 481 ; CHECK:       # %bb.0:
 482 ; CHECK-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[1,0,3,2,4,5,6,7,9,8,11,10,12,13,14,15]
 483 ; CHECK-NEXT:    ret{{[l|q]}}
 484   %res0 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> <i8 2, i8 3, i8 0, i8 1, i8 6, i8 7, i8 4, i8 5, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 2, i8 3, i8 0, i8 1, i8 6, i8 7, i8 4, i8 5, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>)
 485   ret <32 x i8> %res0
 486 }
 487
 488 define <32 x i8> @combine_pshufb_as_pshufhw(<32 x i8> %a0) {
     ; The pshufb mask leaves bytes 0-7 of each lane in place and swaps
     ; adjacent words within bytes 8-15. The checks expect a single vpshufhw
     ; with word order [5,4,7,6] in the high half of each lane.
 489 ; CHECK-LABEL: combine_pshufb_as_pshufhw:
 490 ; CHECK:       # %bb.0:
 491 ; CHECK-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,5,4,7,6,8,9,10,11,13,12,15,14]
 492 ; CHECK-NEXT:    ret{{[l|q]}}
 493   %res0 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 10, i8 11, i8 8, i8 9, i8 14, i8 15, i8 12, i8 13, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 10, i8 11, i8 8, i8 9, i8 14, i8 15, i8 12, i8 13>)
 494   ret <32 x i8> %res0
 495 }
 496
 497 define <32 x i8> @combine_pshufb_not_as_pshufw(<32 x i8> %a0) {
     ; Two chained pshufbs: the first swaps adjacent words in the low 8 bytes
     ; of each lane, the second in the high 8 bytes. Together every adjacent
     ; word pair is swapped, which needs both halves of a lane changed at once.
     ; The AVX2 checks expect one merged vpshufb; the AVX512 checks expect a
     ; 16-bit dword rotate (vprold $16) performed on the widened zmm register.
 498 ; AVX2-LABEL: combine_pshufb_not_as_pshufw:
 499 ; AVX2:       # %bb.0:
 500 ; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13,18,19,16,17,22,23,20,21,26,27,24,25,30,31,28,29]
 501 ; AVX2-NEXT:    ret{{[l|q]}}
 502 ;
 503 ; AVX512-LABEL: combine_pshufb_not_as_pshufw:
 504 ; AVX512:       # %bb.0:
 505 ; AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
 506 ; AVX512-NEXT:    vprold $16, %zmm0, %zmm0
 507 ; AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
 508 ; AVX512-NEXT:    ret{{[l|q]}}
 509   %res0 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> <i8 2, i8 3, i8 0, i8 1, i8 6, i8 7, i8 4, i8 5, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 2, i8 3, i8 0, i8 1, i8 6, i8 7, i8 4, i8 5, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>)
 510   %res1 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %res0, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 10, i8 11, i8 8, i8 9, i8 14, i8 15, i8 12, i8 13, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 10, i8 11, i8 8, i8 9, i8 14, i8 15, i8 12, i8 13>)
 511   ret <32 x i8> %res1
 512 }
 513
 514 define <32 x i8> @combine_pshufb_as_unpacklo_undef(<32 x i8> %a0) {
     ; The shufflevector reads only even-numbered bytes of the pshufb result, and
     ; the pshufb mask is undef in every even position, so the checks expect
     ; nothing but the return instruction.
 515 ; CHECK-LABEL: combine_pshufb_as_unpacklo_undef:
 516 ; CHECK:       # %bb.0:
 517 ; CHECK-NEXT:    ret{{[l|q]}}
 518   %1 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> <i8 undef, i8 0, i8 undef, i8 1, i8 undef, i8 2, i8 undef, i8 3, i8 undef, i8 4, i8 undef, i8 5, i8 undef, i8 6, i8 undef, i8 7, i8 undef, i8 16, i8 undef, i8 17, i8 undef, i8 18, i8 undef, i8 19, i8 undef, i8 20, i8 undef, i8 21, i8 undef, i8 22, i8 undef, i8 23>)
 519   %2 = shufflevector <32 x i8> %1, <32 x i8> undef, <32 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14, i32 16, i32 16, i32 18, i32 18, i32 20, i32 20, i32 22, i32 22, i32 24, i32 24, i32 26, i32 26, i32 28, i32 28, i32 30, i32 30>
 520   ret <32 x i8> %2
 521 }
 522
 523 define <32 x i8> @combine_pshufb_as_unpacklo_zero(<32 x i8> %a0) {
     ; The mask takes byte pairs 0-1, 2-3, 4-5, 6-7 of each 16-byte half (written
     ; as 16-23 for the upper half), each followed by two -1 entries.
     ; The checks expect vpunpcklwd interleaving the input with a zeroed register.
 524 ; CHECK-LABEL: combine_pshufb_as_unpacklo_zero:
 525 ; CHECK:       # %bb.0:
 526 ; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 527 ; CHECK-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11]
 528 ; CHECK-NEXT:    ret{{[l|q]}}
 529   %1 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> <i8 0, i8 1, i8 -1, i8 -1, i8 2, i8 3, i8 -1, i8 -1, i8 4, i8 5, i8 -1, i8 -1, i8 6, i8 7, i8 -1, i8 -1, i8 16, i8 17, i8 -1, i8 -1, i8 18, i8 19, i8 -1, i8 -1, i8 20, i8 21, i8 -1, i8 -1, i8 22, i8 23, i8 -1, i8 -1>)
 530   ret <32 x i8> %1
 531 }
 532
 533 define <32 x i8> @combine_pshufb_as_unpackhi_zero(<32 x i8> %a0) {
     ; The mask alternates a -1 entry with bytes 8-15 of each 16-byte half
     ; (written as 24-31 for the upper half).
     ; The checks expect vpunpckhbw with the zeroed register supplying the
     ; even-numbered result bytes.
 534 ; CHECK-LABEL: combine_pshufb_as_unpackhi_zero:
 535 ; CHECK:       # %bb.0:
 536 ; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 537 ; CHECK-NEXT:    vpunpckhbw {{.*#+}} ymm0 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
 538 ; CHECK-NEXT:    ret{{[l|q]}}
 539   %1 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> <i8 -1, i8 8, i8 -1, i8 9, i8 -1, i8 10, i8 -1, i8 11, i8 -1, i8 12, i8 -1, i8 13, i8 -1, i8 14, i8 -1, i8 15, i8 -1, i8 24, i8 -1, i8 25, i8 -1, i8 26, i8 -1, i8 27, i8 -1, i8 28, i8 -1, i8 29, i8 -1, i8 30, i8 -1, i8 31>)
 540   ret <32 x i8> %1
 541 }
 542
 543 define <32 x i8> @combine_psrlw_pshufb(<16 x i16> %a0) {
     ; Logical right shift of every i16 by 8, then a pshufb that swaps the two
     ; bytes of each i16. The checks expect the pair to collapse into a single
     ; vandps against a constant-pool mask (addressing differs per target).
 544 ; X86-LABEL: combine_psrlw_pshufb:
 545 ; X86:       # %bb.0:
 546 ; X86-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0
 547 ; X86-NEXT:    retl
 548 ;
 549 ; X64-LABEL: combine_psrlw_pshufb:
 550 ; X64:       # %bb.0:
 551 ; X64-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
 552 ; X64-NEXT:    retq
 553   %1 = lshr <16 x i16> %a0, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
 554   %2 = bitcast <16 x i16> %1 to <32 x i8>
 555   %3 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %2, <32 x i8> <i8 1, i8 0, i8 3, i8 2, i8 5, i8 4, i8 7, i8 6, i8 9, i8 8, i8 11, i8 10, i8 13, i8 12, i8 15, i8 14, i8 17, i8 16, i8 19, i8 18, i8 21, i8 20, i8 23, i8 22, i8 25, i8 24, i8 27, i8 26, i8 29, i8 28, i8 31, i8 30>)
 556   ret <32 x i8> %3
 557 }
 558
 559 define <32 x i8> @combine_pslld_pshufb(<8 x i32> %a0) {
     ; Left shift of every i32 by 24, then a pshufb that reverses the four bytes
     ; of each i32. The checks expect the pair to collapse into a single vandps
     ; against a constant-pool mask (addressing differs per target).
 560 ; X86-LABEL: combine_pslld_pshufb:
 561 ; X86:       # %bb.0:
 562 ; X86-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0
 563 ; X86-NEXT:    retl
 564 ;
 565 ; X64-LABEL: combine_pslld_pshufb:
 566 ; X64:       # %bb.0:
 567 ; X64-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
 568 ; X64-NEXT:    retq
 569   %1 = shl <8 x i32> %a0, <i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24>
 570   %2 = bitcast <8 x i32> %1 to <32 x i8>
 571   %3 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %2, <32 x i8> <i8 3, i8 2, i8 1, i8 0, i8 7, i8 6, i8 5, i8 4, i8 11, i8 10, i8 9, i8 8, i8 15, i8 14, i8 13, i8 12, i8 19, i8 18, i8 17, i8 16, i8 23, i8 22, i8 21, i8 20, i8 27, i8 26, i8 25, i8 24, i8 31, i8 30, i8 29, i8 28>)
 572   ret <32 x i8> %3
 573 }
 574
 575 define <32 x i8> @combine_psrlq_pshufb(<4 x i64> %a0) {
     ; Logical right shift of every i64 by 32, then a pshufb that (mostly)
     ; reverses the bytes of each i64. The checks expect one vpshufb whose
     ; result mixes zeroed bytes with bytes of the unshifted input.
     ; NOTE(review): the upper-half mask skips index 16 and ends with an extra
     ; 23, unlike the strict 7..0 / 15..8 reversal of the lower half. The
     ; expected vpshufb matches the mask as written - confirm it is intentional.
 576 ; CHECK-LABEL: combine_psrlq_pshufb:
 577 ; CHECK:       # %bb.0:
 578 ; CHECK-NEXT:    vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,ymm0[7,6,5,4],zero,zero,zero,zero,ymm0[15,14,13,12],zero,zero,zero,zero,ymm0[23,22,21],zero,zero,zero,zero,ymm0[31,30,29,28],zero
 579 ; CHECK-NEXT:    ret{{[l|q]}}
 580   %1 = lshr <4 x i64> %a0, <i64 32, i64 32, i64 32, i64 32>
 581   %2 = bitcast <4 x i64> %1 to <32 x i8>
 582   %3 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %2, <32 x i8> <i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 15, i8 14, i8 13, i8 12, i8 11, i8 10, i8 9, i8 8, i8 23, i8 22, i8 21, i8 20, i8 19, i8 18, i8 17, i8 31, i8 30, i8 29, i8 28, i8 27, i8 26, i8 25, i8 24, i8 23>)
 583   ret <32 x i8> %3
 584 }
 585
 586 define <32 x i8> @combine_unpack_unpack_pshufb(<32 x i8> %a0) {
     ; %1 repeats bytes 0-3 of each 16-byte half; %2 and %3 pull out bytes 4-7 and
     ; 8-11 of each half. %4/%5 interleave %1 with %2/%3 and %6 interleaves those
     ; results again. The checks expect the whole chain as a single vpshufb.
 587 ; CHECK-LABEL: combine_unpack_unpack_pshufb:
 588 ; CHECK:       # %bb.0:
 589 ; CHECK-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,0,4,8,1,1,5,9,2,2,6,10,3,3,7,11,16,16,20,24,17,17,21,25,18,18,22,26,19,19,23,27]
 590 ; CHECK-NEXT:    ret{{[l|q]}}
 591   %1 = shufflevector <32 x i8> %a0, <32 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 16, i32 17, i32 18, i32 19, i32 16, i32 17, i32 18, i32 19, i32 16, i32 17, i32 18, i32 19>
 592   %2 = shufflevector <32 x i8> %a0, <32 x i8> undef, <32 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 20, i32 21, i32 22, i32 23, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
 593   %3 = shufflevector <32 x i8> %a0, <32 x i8> undef, <32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 24, i32 25, i32 26, i32 27, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
 594   %4 = shufflevector <32 x i8> %1, <32 x i8> %2, <32 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 16, i32 48, i32 17, i32 49, i32 18, i32 50, i32 19, i32 51, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
 595   %5 = shufflevector <32 x i8> %1, <32 x i8> %3, <32 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 16, i32 48, i32 17, i32 49, i32 18, i32 50, i32 19, i32 51, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
 596   %6 = shufflevector <32 x i8> %4, <32 x i8> %5, <32 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 4, i32 36, i32 5, i32 37, i32 6, i32 38, i32 7, i32 39, i32 16, i32 48, i32 17, i32 49, i32 18, i32 50, i32 19, i32 51, i32 20, i32 52, i32 21, i32 53, i32 22, i32 54, i32 23, i32 55>
 597   ret <32 x i8> %6
 598 }
 599
 600 define <16 x i16> @shuffle_combine_packssdw_pshufb(<8 x i32> %a0) {
     ; Arithmetic shift of every i32 by 31, packssdw of the result with itself,
     ; then an i16 shufflevector. The expected output has no pack instruction:
     ; only vpsrad $31 followed by one vpshufb.
 601 ; CHECK-LABEL: shuffle_combine_packssdw_pshufb:
 602 ; CHECK:       # %bb.0:
 603 ; CHECK-NEXT:    vpsrad $31, %ymm0, %ymm0
 604 ; CHECK-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[12,13,8,9,4,5,0,1,12,13,8,9,4,5,0,1,16,17,20,21,24,25,28,29,28,29,24,25,20,21,16,17]
 605 ; CHECK-NEXT:    ret{{[l|q]}}
 606   %1 = ashr <8 x i32> %a0, <i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
 607   %2 = tail call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> %1, <8 x i32> %1)
 608   %3 = shufflevector <16 x i16> %2, <16 x i16> undef, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 3, i32 2, i32 1, i32 0, i32 8, i32 9, i32 10, i32 11, i32 11, i32 10, i32 9, i32 8>
 609   ret <16 x i16> %3
 610 }
 611 declare <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32>, <8 x i32>) nounwind readnone
 612
 613 define <32 x i8> @shuffle_combine_packsswb_pshufb(<16 x i16> %a0, <16 x i16> %a1) {
     ; Both inputs are shifted arithmetically by 15 and packed with packsswb; the
     ; pshufb mask then uses indices 0-7 only. The expected output never touches
     ; ymm1: just vpsraw $15 on ymm0 followed by one vpshufb.
 614 ; CHECK-LABEL: shuffle_combine_packsswb_pshufb:
 615 ; CHECK:       # %bb.0:
 616 ; CHECK-NEXT:    vpsraw $15, %ymm0, %ymm0
 617 ; CHECK-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[14,12,10,8,6,4,2,0,14,12,10,8,6,4,2,0,30,28,26,24,22,20,18,16,30,28,26,24,22,20,18,16]
 618 ; CHECK-NEXT:    ret{{[l|q]}}
 619   %1 = ashr <16 x i16> %a0, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
 620   %2 = ashr <16 x i16> %a1, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
 621   %3 = tail call <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16> %1, <16 x i16> %2)
 622   %4 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %3, <32 x i8> <i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
 623   ret <32 x i8> %4
 624 }
 625 declare <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16>, <16 x i16>) nounwind readnone
 626
 627 define <16 x i16> @shuffle_combine_packusdw_pshufb(<8 x i32> %a0, <8 x i32> %a1) {
     ; Logical shift of every i32 of %a0 by 16, packusdw of the result with
     ; itself, then an i16 shufflevector (%a1 is not referenced by the body).
     ; The expected output is a single vpshufb with no shift or pack instruction.
 628 ; CHECK-LABEL: shuffle_combine_packusdw_pshufb:
 629 ; CHECK:       # %bb.0:
 630 ; CHECK-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[14,15,10,11,6,7,2,3,14,15,10,11,6,7,2,3,18,19,22,23,26,27,30,31,30,31,26,27,22,23,18,19]
 631 ; CHECK-NEXT:    ret{{[l|q]}}
 632   %1 = lshr <8 x i32> %a0, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
 633   %2 = tail call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> %1, <8 x i32> %1)
 634   %3 = shufflevector <16 x i16> %2, <16 x i16> undef, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 3, i32 2, i32 1, i32 0, i32 8, i32 9, i32 10, i32 11, i32 11, i32 10, i32 9, i32 8>
 635   ret <16 x i16> %3
 636 }
 637 declare <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32>, <8 x i32>) nounwind readnone
 638
 639 define <32 x i8> @shuffle_combine_packuswb_pshufb(<16 x i16> %a0, <16 x i16> %a1) {
     ; Both inputs are shifted logically by 8 and packed with packuswb; the pshufb
     ; mask then uses indices 0-7 only. The expected output is a single vpshufb
     ; picking the odd bytes of ymm0, with no shift, pack, or use of ymm1.
 640 ; CHECK-LABEL: shuffle_combine_packuswb_pshufb:
 641 ; CHECK:       # %bb.0:
 642 ; CHECK-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[15,13,11,9,7,5,3,1,15,13,11,9,7,5,3,1,31,29,27,25,23,21,19,17,31,29,27,25,23,21,19,17]
 643 ; CHECK-NEXT:    ret{{[l|q]}}
 644   %1 = lshr <16 x i16> %a0, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
 645   %2 = lshr <16 x i16> %a1, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
 646   %3 = tail call <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16> %1, <16 x i16> %2)
 647   %4 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %3, <32 x i8> <i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
 648   ret <32 x i8> %4
 649 }
 650 declare <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16>, <16 x i16>) nounwind readnone
 651
 652 define <32 x i8> @combine_pshufb_as_packsswb(<16 x i16> %a0, <16 x i16> %a1) nounwind {
     ; Both inputs are shifted arithmetically by 11. One pshufb moves the even
     ; bytes of the first into bytes 0-7 of each 16-byte half, another moves the
     ; even bytes of the second into bytes 8-15, and the two are OR'ed together.
     ; The checks expect the pshufb/or sequence to become a single vpacksswb.
 653 ; CHECK-LABEL: combine_pshufb_as_packsswb:
 654 ; CHECK:       # %bb.0:
 655 ; CHECK-NEXT:    vpsraw $11, %ymm0, %ymm0
 656 ; CHECK-NEXT:    vpsraw $11, %ymm1, %ymm1
 657 ; CHECK-NEXT:    vpacksswb %ymm1, %ymm0, %ymm0
 658 ; CHECK-NEXT:    ret{{[l|q]}}
 659   %1 = ashr <16 x i16> %a0, <i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11>
 660   %2 = ashr <16 x i16> %a1, <i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11>
 661   %3 = bitcast <16 x i16> %1 to <32 x i8>
 662   %4 = bitcast <16 x i16> %2 to <32 x i8>
 663   %5 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %3, <32 x i8> <i8 0, i8 2, i8 4, i8 6, i8 8, i8 10, i8 12, i8 14, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 0, i8 2, i8 4, i8 6, i8 8, i8 10, i8 12, i8 14, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
 664   %6 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %4, <32 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 0, i8 2, i8 4, i8 6, i8 8, i8 10, i8 12, i8 14, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 0, i8 2, i8 4, i8 6, i8 8, i8 10, i8 12, i8 14>)
 665   %7 = or <32 x i8> %5, %6
 666   ret <32 x i8> %7
 667 }
 668
 669 define <32 x i8> @combine_pshufb_as_packuswb(<16 x i16> %a0, <16 x i16> %a1) nounwind {
     ; Same shape as the packsswb variant but with logical shifts by 11: the even
     ; bytes of each shifted input are gathered by two pshufb calls and OR'ed.
     ; The checks expect the pshufb/or sequence to become a single vpackuswb.
 670 ; CHECK-LABEL: combine_pshufb_as_packuswb:
 671 ; CHECK:       # %bb.0:
 672 ; CHECK-NEXT:    vpsrlw $11, %ymm0, %ymm0
 673 ; CHECK-NEXT:    vpsrlw $11, %ymm1, %ymm1
 674 ; CHECK-NEXT:    vpackuswb %ymm1, %ymm0, %ymm0
 675 ; CHECK-NEXT:    ret{{[l|q]}}
 676   %1 = lshr <16 x i16> %a0, <i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11>
 677   %2 = lshr <16 x i16> %a1, <i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11>
 678   %3 = bitcast <16 x i16> %1 to <32 x i8>
 679   %4 = bitcast <16 x i16> %2 to <32 x i8>
 680   %5 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %3, <32 x i8> <i8 0, i8 2, i8 4, i8 6, i8 8, i8 10, i8 12, i8 14, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 0, i8 2, i8 4, i8 6, i8 8, i8 10, i8 12, i8 14, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
 681   %6 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %4, <32 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 0, i8 2, i8 4, i8 6, i8 8, i8 10, i8 12, i8 14, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 0, i8 2, i8 4, i8 6, i8 8, i8 10, i8 12, i8 14>)
 682   %7 = or <32 x i8> %5, %6
 683   ret <32 x i8> %7
 684 }
 685
 686 define <16 x i8> @combine_pshufb_insertion_as_broadcast_v2i64(i64 %a0) {
     ; The scalar is inserted into element 0 of an otherwise undef <2 x i64> and a
     ; 128-bit pshufb then repeats bytes 0-7 twice. The checks expect a broadcast:
     ; vmovddup from memory on the i686 runs, vmovq + vpbroadcastq on x86_64.
 687 ; X86-LABEL: combine_pshufb_insertion_as_broadcast_v2i64:
 688 ; X86:       # %bb.0:
 689 ; X86-NEXT:    vmovddup {{.*#+}} xmm0 = mem[0,0]
 690 ; X86-NEXT:    retl
 691 ;
 692 ; X64-LABEL: combine_pshufb_insertion_as_broadcast_v2i64:
 693 ; X64:       # %bb.0:
 694 ; X64-NEXT:    vmovq %rdi, %xmm0
 695 ; X64-NEXT:    vpbroadcastq %xmm0, %xmm0
 696 ; X64-NEXT:    retq
 697   %1 = insertelement <2 x i64> undef, i64 %a0, i32 0
 698   %2 = bitcast <2 x i64> %1 to <16 x i8>
 699   %3 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %2, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>)
 700   ret <16 x i8> %3
 701 }
 702
 703 define <8 x i32> @combine_permd_insertion_as_broadcast_v4i64(i64 %a0) {
     ; The scalar is inserted into element 0 of an otherwise undef <4 x i64> and a
     ; permd with indices 0,1 repeated four times replicates that i64.
     ; The checks expect a broadcast: vbroadcastsd from the stack on the i686
     ; runs, vmovq + vpbroadcastq on x86_64.
 704 ; X86-LABEL: combine_permd_insertion_as_broadcast_v4i64:
 705 ; X86:       # %bb.0:
 706 ; X86-NEXT:    vbroadcastsd {{[0-9]+}}(%esp), %ymm0
 707 ; X86-NEXT:    retl
 708 ;
 709 ; X64-LABEL: combine_permd_insertion_as_broadcast_v4i64:
 710 ; X64:       # %bb.0:
 711 ; X64-NEXT:    vmovq %rdi, %xmm0
 712 ; X64-NEXT:    vpbroadcastq %xmm0, %ymm0
 713 ; X64-NEXT:    retq
 714   %1 = insertelement <4 x i64> undef, i64 %a0, i32 0
 715   %2 = bitcast <4 x i64> %1 to <8 x i32>
 716   %3 = tail call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %2, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>)
 717   ret <8 x i32> %3
 718 }
 719
 720 define <32 x i8> @combine_pshufb_pshufb_or_as_blend(<32 x i8> %a0, <32 x i8> %a1) {
     ; The first pshufb keeps bytes 0-7 of each 16-byte half of %a0 (rest -1), the
     ; second keeps bytes 8-15 of each half of %a1 (rest -1), and the results are
     ; OR'ed. The checks expect the whole sequence as a single vblendps.
 721 ; CHECK-LABEL: combine_pshufb_pshufb_or_as_blend:
 722 ; CHECK:       # %bb.0:
 723 ; CHECK-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
 724 ; CHECK-NEXT:    ret{{[l|q]}}
 725   %1 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
 726   %2 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a1, <32 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>)
 727   %3 = or <32 x i8> %1, %2
 728   ret <32 x i8> %3
 729 }
 730
 731 define <32 x i8> @combine_pshufb_pshufb_or_as_unpcklbw(<32 x i8> %a0, <32 x i8> %a1) {
     ; The first pshufb spreads bytes 0-7 of each 16-byte half of %a0 into the
     ; even positions, the second spreads bytes 0-7 of %a1 into the odd positions
     ; (other entries are -1), and the results are OR'ed.
     ; The checks expect the whole sequence as a single vpunpcklbw.
 732 ; CHECK-LABEL: combine_pshufb_pshufb_or_as_unpcklbw:
 733 ; CHECK:       # %bb.0:
 734 ; CHECK-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
 735 ; CHECK-NEXT:    ret{{[l|q]}}
 736   %1 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> <i8 0, i8 -1, i8 1, i8 -1, i8 2, i8 -1, i8 3, i8 -1, i8 4, i8 -1, i8 5, i8 -1, i8 6, i8 -1, i8 7, i8 -1, i8 0, i8 -1, i8 1, i8 -1, i8 2, i8 -1, i8 3, i8 -1, i8 4, i8 -1, i8 5, i8 -1, i8 6, i8 -1, i8 7, i8 -1>)
 737   %2 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a1, <32 x i8> <i8 -1, i8 0, i8 -1, i8 1, i8 -1, i8 2, i8 -1, i8 3, i8 -1, i8 4, i8 -1, i8 5, i8 -1, i8 6, i8 -1, i8 7, i8 -1, i8 0, i8 -1, i8 1, i8 -1, i8 2, i8 -1, i8 3, i8 -1, i8 4, i8 -1, i8 5, i8 -1, i8 6, i8 -1, i8 7>)
 738   %3 = or <32 x i8> %1, %2
 739   ret <32 x i8> %3
 740 }
 741
 742 define <32 x i8> @combine_pshufb_pshufb_or_pshufb(<32 x i8> %a0) {
     ; Two pshufb calls on the same input place bytes 0-3 of each 16-byte half in
     ; complementary 4-byte slots (other entries -1); their OR is fed to a third
     ; pshufb. The checks expect the chain to collapse into one vpermilps that
     ; replicates the first 32-bit element of each half.
 743 ; CHECK-LABEL: combine_pshufb_pshufb_or_pshufb:
 744 ; CHECK:       # %bb.0:
 745 ; CHECK-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
 746 ; CHECK-NEXT:    ret{{[l|q]}}
 747   %1 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 -1, i8 -1, i8 -1, i8 -1, i8 0, i8 1, i8 2, i8 3, i8 -1, i8 -1, i8 -1, i8 -1, i8 0, i8 1, i8 2, i8 3, i8 -1, i8 -1, i8 -1, i8 -1, i8 0, i8 1, i8 2, i8 3, i8 -1, i8 -1, i8 -1, i8 -1>)
 748   %2 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 0, i8 1, i8 2, i8 3, i8 -1, i8 -1, i8 -1, i8 -1, i8 0, i8 1, i8 2, i8 3, i8 -1, i8 -1, i8 -1, i8 -1, i8 0, i8 1, i8 2, i8 3, i8 -1, i8 -1, i8 -1, i8 -1, i8 0, i8 1, i8 2, i8 3>)
 749   %3 = or <32 x i8> %1, %2
 750   %4 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %3, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>)
 751   ret <32 x i8> %4
 752 }
 753
 754 define <8 x i32> @constant_fold_permd() {
     ; permd with a constant data vector (1..8) and constant indices; the checks
     ; expect the call folded to a load of the already-permuted constant.
 755 ; CHECK-LABEL: constant_fold_permd:
 756 ; CHECK:       # %bb.0:
 757 ; CHECK-NEXT:    vmovaps {{.*#+}} ymm0 = [5,7,3,2,8,2,6,1]
 758 ; CHECK-NEXT:    ret{{[l|q]}}
 759   %1 = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>, <8 x i32> <i32 4, i32 6, i32 2, i32 1, i32 7, i32 1, i32 5, i32 0>)
 760   ret <8 x i32> %1
 761 }
 762
 763 define <8 x float> @constant_fold_permps() {
     ; Float counterpart of the permd constant-fold test: constant data (1.0..8.0)
     ; and the same constant indices; the checks expect a load of the
     ; already-permuted constant.
 764 ; CHECK-LABEL: constant_fold_permps:
 765 ; CHECK:       # %bb.0:
 766 ; CHECK-NEXT:    vmovaps {{.*#+}} ymm0 = [5.0E+0,7.0E+0,3.0E+0,2.0E+0,8.0E+0,2.0E+0,6.0E+0,1.0E+0]
 767 ; CHECK-NEXT:    ret{{[l|q]}}
 768   %1 = call <8 x float> @llvm.x86.avx2.permps(<8 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0>, <8 x i32> <i32 4, i32 6, i32 2, i32 1, i32 7, i32 1, i32 5, i32 0>)
 769   ret <8 x float> %1
 770 }
 771
 772 define <32 x i8> @constant_fold_pshufb_256() {
     ; pshufb with constant data and a constant mask that mixes in-range indices,
     ; -1 entries and undef entries. The checks expect a load of the folded
     ; constant, where -1 entries appear as 0 and undef entries as 'u'.
 773 ; CHECK-LABEL: constant_fold_pshufb_256:
 774 ; CHECK:       # %bb.0:
 775 ; CHECK-NEXT:    vmovaps {{.*#+}} ymm0 = <14,0,0,0,u,u,0,0,0,0,0,0,0,0,8,9,255,0,0,0,u,u,0,0,241,0,0,0,0,0,249,250>
 776 ; CHECK-NEXT:    ret{{[l|q]}}
 777   %1 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> <i8 15, i8 14, i8 13, i8 12, i8 11, i8 10, i8 9, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 -1, i8 -2, i8 -3, i8 -4, i8 -5, i8 -6, i8 -7, i8 -8, i8 -9, i8 -10, i8 -11, i8 -12, i8 -13, i8 -14, i8 -15>, <32 x i8> <i8 1, i8 -1, i8 -1, i8 -1, i8 undef, i8 undef, i8 -1, i8 -1, i8 15, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 7, i8 6, i8 1, i8 -1, i8 -1, i8 -1, i8 undef, i8 undef, i8 -1, i8 -1, i8 15, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 7, i8 6>)
 778   ret <32 x i8> %1
 779 }
 780
 781 define i32 @broadcast_v2i64_multiuse(i64* %p0) {
     ; The loaded i64 is used twice: truncated directly to i32, and splatted into
     ; a <2 x i64>, truncated to <2 x i32> and read back from element 1. The two
     ; i32 values are added. The expected output has no vector instructions:
     ; one 32-bit load followed by adding the register to itself.
 782 ; X86-LABEL: broadcast_v2i64_multiuse:
 783 ; X86:       # %bb.0: # %entry
 784 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 785 ; X86-NEXT:    movl (%eax), %eax
 786 ; X86-NEXT:    addl %eax, %eax
 787 ; X86-NEXT:    retl
 788 ;
 789 ; X64-LABEL: broadcast_v2i64_multiuse:
 790 ; X64:       # %bb.0: # %entry
 791 ; X64-NEXT:    movl (%rdi), %eax
 792 ; X64-NEXT:    addl %eax, %eax
 793 ; X64-NEXT:    retq
 794 entry:
 795   %tmp = load i64, i64* %p0, align 8
 796   %tmp1 = trunc i64 %tmp to i32
 797   %tmp2 = insertelement <2 x i64> undef, i64 %tmp, i32 0
 798   %tmp3 = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <2 x i32> zeroinitializer
 799   %tmp4 = trunc <2 x i64> %tmp3 to <2 x i32>
 800   %tmp5 = extractelement <2 x i32> %tmp4, i32 1
 801   %tmp6 = add i32 %tmp1, %tmp5
 802   ret i32 %tmp6
 803 }
 804
 805 define <32 x i8> @PR27320(<8 x i32> %a0) {
     ; Regression test, presumably for the bug report the function is named after.
     ; An i32 shuffle that moves elements 0-2 and 3-5 into separate 16-byte halves
     ; is followed by a byte shuffle that repeats some bytes within each half.
     ; The checks expect a cross-half vpermq followed by one in-half vpshufb.
 806 ; CHECK-LABEL: PR27320:
 807 ; CHECK:       # %bb.0:
 808 ; CHECK-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,1,2]
 809 ; CHECK-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,1,2,3,4,4,5,6,7,7,8,9,10,10,11,20,21,21,22,23,24,24,25,26,27,27,28,29,30,30,31]
 810 ; CHECK-NEXT:    ret{{[l|q]}}
 811   %1 = shufflevector <8 x i32> %a0, <8 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 undef, i32 3, i32 4, i32 5, i32 undef>
 812   %2 = bitcast <8 x i32> %1 to <32 x i8>
 813   %3 = shufflevector <32 x i8> %2, <32 x i8> undef, <32 x i32> <i32 0, i32 1, i32 1, i32 2, i32 3, i32 4, i32 4, i32 5, i32 6, i32 7, i32 7, i32 8, i32 9, i32 10, i32 10, i32 11, i32 16, i32 17, i32 17, i32 18, i32 19, i32 20, i32 20, i32 21, i32 22, i32 23, i32 23, i32 24, i32 25, i32 26, i32 26, i32 27>
 814   ret <32 x i8> %3
 815 }
 816
 817 define internal fastcc <8 x float> @PR34577(<8 x float> %inp0, <8 x float> %inp1, <8 x float> %inp2) {
     ; Regression test, presumably for the bug report the function is named after.
     ; A chain of shuffle -> select-with-zero -> shuffle-with-zero -> shuffle.
     ; Tracing the masks, only %inp0[2] survives from the first three steps (in
     ; element 4 of %shuf1); the other elements of %shuf1 are zero. %shuf2 then
     ; mixes those with elements of %inp1.
     ; The +avx2 runs expect vpermpd/blend-with-zero, a vpermps of ymm1 and a
     ; final blend; the +avx512f runs expect a two-source vpermi2ps on zmm instead.
 818 ; AVX2-LABEL: PR34577:
 819 ; AVX2:       # %bb.0: # %entry
 820 ; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[1,1,1,1]
 821 ; AVX2-NEXT:    vxorps %xmm2, %xmm2, %xmm2
 822 ; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
 823 ; AVX2-NEXT:    vmovaps {{.*#+}} ymm2 = <u,u,7,2,u,u,3,2>
 824 ; AVX2-NEXT:    vpermps %ymm1, %ymm2, %ymm1
 825 ; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
 826 ; AVX2-NEXT:    ret{{[l|q]}}
 827 ;
 828 ; AVX512-LABEL: PR34577:
 829 ; AVX512:       # %bb.0: # %entry
 830 ; AVX512-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
 831 ; AVX512-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[1,1,1,1]
 832 ; AVX512-NEXT:    vxorps %xmm2, %xmm2, %xmm2
 833 ; AVX512-NEXT:    vblendps {{.*#+}} ymm2 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7]
 834 ; AVX512-NEXT:    vmovaps {{.*#+}} ymm0 = <23,18,7,2,20,u,3,2>
 835 ; AVX512-NEXT:    vpermi2ps %zmm2, %zmm1, %zmm0
 836 ; AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
 837 ; AVX512-NEXT:    ret{{[l|q]}}
 838 entry:
 839   %shuf0 = shufflevector <8 x float> %inp0, <8 x float> %inp2, <8 x i32> <i32 1, i32 10, i32 11, i32 13, i32 2, i32 13, i32 5, i32 0>
 840   %sel = select <8 x i1> <i1 false, i1 true, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false>, <8 x float> %shuf0, <8 x float> zeroinitializer
 841   %shuf1 = shufflevector <8 x float> zeroinitializer, <8 x float> %sel, <8 x i32> <i32 6, i32 11, i32 6, i32 15, i32 12, i32 11, i32 1, i32 3>
 842   %shuf2 = shufflevector <8 x float> %inp1, <8 x float> %shuf1, <8 x i32> <i32 15, i32 10, i32 7, i32 2, i32 12, i32 undef, i32 3, i32 2>
 843   ret <8 x float> %shuf2
 844 }
 845
 846 define void @packss_zext_v8i1() {
     ; Built entirely from undef inputs: an undef compare is zero-extended,
     ; widened with zeros, truncated, sign-extended, shuffled and fed to packssdw
     ; together with an undef operand; the result is stored through an undef
     ; pointer. The checks expect only a store of an all-zero ymm register
     ; (through an uninitialised address register) plus vzeroupper.
     ; NOTE(review): reads like a reduced crash/regression reproducer - the
     ; originating report is not identified in this file.
 847 ; X86-LABEL: packss_zext_v8i1:
 848 ; X86:       # %bb.0:
 849 ; X86-NEXT:    vxorps %xmm0, %xmm0, %xmm0
 850 ; X86-NEXT:    vmovups %ymm0, (%eax)
 851 ; X86-NEXT:    vzeroupper
 852 ; X86-NEXT:    retl
 853 ;
 854 ; X64-LABEL: packss_zext_v8i1:
 855 ; X64:       # %bb.0:
 856 ; X64-NEXT:    vxorps %xmm0, %xmm0, %xmm0
 857 ; X64-NEXT:    vmovups %ymm0, (%rax)
 858 ; X64-NEXT:    vzeroupper
 859 ; X64-NEXT:    retq
 860   %tmp0 = icmp sgt <8 x i32> undef, undef
 861   %tmp1 = zext <8 x i1> %tmp0 to <8 x i32>
 862   %tmp2 = shufflevector <8 x i32> %tmp1, <8 x i32> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 863   %tmp3 = trunc <16 x i32> %tmp2 to <16 x i16>
 864   %tmp4 = add <16 x i16> zeroinitializer, %tmp3
 865   %tmp6 = sext <16 x i16> %tmp4 to <16 x i32>
 866   %tmp10 = shufflevector <16 x i32> %tmp6, <16 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
 867   %tmp11 = tail call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> undef, <8 x i32> %tmp10)
 868   store <16 x i16> %tmp11, <16 x i16>* undef, align 2
 869   ret void
 870 }