llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll

   1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
   2 ; RUN: llc < %s -disable-peephole -mtriple=i686-unknown -mattr=+avx | FileCheck %s --check-prefixes=CHECK,X86,AVX,AVX1,X86-AVX1
   3 ; RUN: llc < %s -disable-peephole -mtriple=i686-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,X86,AVX,AVX2,X86-AVX2
   4 ; RUN: llc < %s -disable-peephole -mtriple=i686-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,X86,AVX512,X86-AVX512
   5 ; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefixes=CHECK,X64,AVX,AVX1,X64-AVX1
   6 ; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,X64,AVX,AVX2,X64-AVX2
   7 ; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,X64,AVX512,X64-AVX512
   8 ;
   9 ; Combine tests involving AVX target shuffles
  10
  11 declare <4 x float> @llvm.x86.avx.vpermil.ps(<4 x float>, i8)
  12 declare <8 x float> @llvm.x86.avx.vpermil.ps.256(<8 x float>, i8)
  13 declare <2 x double> @llvm.x86.avx.vpermil.pd(<2 x double>, i8)
  14 declare <4 x double> @llvm.x86.avx.vpermil.pd.256(<4 x double>, i8)
  15
  16 declare <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float>, <4 x i32>)
  17 declare <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float>, <8 x i32>)
  18 declare <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double>, <2 x i64>)
  19 declare <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double>, <4 x i64>)
  20
  21 declare <8 x i32> @llvm.x86.avx.vperm2f128.si.256(<8 x i32>, <8 x i32>, i8)
  22 declare <8 x float> @llvm.x86.avx.vperm2f128.ps.256(<8 x float>, <8 x float>, i8)
  23 declare <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double>, <4 x double>, i8)
  24
  ; Applies the lane-reversing mask <3,2,1,0> twice through vpermilvar.ps.
  ; The assertions expect the pair to cancel out, leaving only the return.
  25 define <4 x float> @combine_vpermilvar_4f32_identity(<4 x float> %a0) {
  26 ; CHECK-LABEL: combine_vpermilvar_4f32_identity:
  27 ; CHECK:       # %bb.0:
  28 ; CHECK-NEXT:    ret{{[l|q]}}
  29   %1 = tail call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %a0, <4 x i32> <i32 3, i32 2, i32 1, i32 0>)
  30   %2 = tail call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float>  %1, <4 x i32> <i32 3, i32 2, i32 1, i32 0>)
  31   ret <4 x float> %2
  32 }
  33
  ; vpermilvar.ps with the constant mask <0,1,0,1> (low float pair repeated).
  ; The assertions expect it to be emitted as a single vmovddup on xmm0.
  34 define <4 x float> @combine_vpermilvar_4f32_movddup(<4 x float> %a0) {
  35 ; CHECK-LABEL: combine_vpermilvar_4f32_movddup:
  36 ; CHECK:       # %bb.0:
  37 ; CHECK-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
  38 ; CHECK-NEXT:    ret{{[l|q]}}
  39   %1 = tail call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %a0, <4 x i32> <i32 0, i32 1, i32 0, i32 1>)
  40   ret <4 x float> %1
  41 }
  ; vpermilvar.ps with mask <0,1,0,1> applied to a vector loaded from %a0.
  ; The assertions expect the load to be folded into vmovddup's memory operand;
  ; the 32-bit runs additionally fetch the pointer argument from the stack.
  42 define <4 x float> @combine_vpermilvar_4f32_movddup_load(<4 x float> *%a0) {
  43 ; X86-LABEL: combine_vpermilvar_4f32_movddup_load:
  44 ; X86:       # %bb.0:
  45 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
  46 ; X86-NEXT:    vmovddup {{.*#+}} xmm0 = mem[0,0]
  47 ; X86-NEXT:    retl
  48 ;
  49 ; X64-LABEL: combine_vpermilvar_4f32_movddup_load:
  50 ; X64:       # %bb.0:
  51 ; X64-NEXT:    vmovddup {{.*#+}} xmm0 = mem[0,0]
  52 ; X64-NEXT:    retq
  53   %1 = load <4 x float>, <4 x float> *%a0
  54   %2 = tail call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %1, <4 x i32> <i32 0, i32 1, i32 0, i32 1>)
  55   ret <4 x float> %2
  56 }
  57
  ; vpermilvar.ps with mask <undef,1,3,3>. The undef index leaves element 0
  ; unconstrained, and the assertions expect vmovshdup ([1,1,3,3]) to be chosen.
  58 define <4 x float> @combine_vpermilvar_4f32_movshdup(<4 x float> %a0) {
  59 ; CHECK-LABEL: combine_vpermilvar_4f32_movshdup:
  60 ; CHECK:       # %bb.0:
  61 ; CHECK-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
  62 ; CHECK-NEXT:    ret{{[l|q]}}
  63   %1 = tail call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %a0, <4 x i32> <i32 undef, i32 1, i32 3, i32 3>)
  64   ret <4 x float> %1
  65 }
  66
  ; vpermilvar.ps with mask <0,0,2,undef>. The undef index leaves element 3
  ; unconstrained, and the assertions expect vmovsldup ([0,0,2,2]) to be chosen.
  67 define <4 x float> @combine_vpermilvar_4f32_movsldup(<4 x float> %a0) {
  68 ; CHECK-LABEL: combine_vpermilvar_4f32_movsldup:
  69 ; CHECK:       # %bb.0:
  70 ; CHECK-NEXT:    vmovsldup {{.*#+}} xmm0 = xmm0[0,0,2,2]
  71 ; CHECK-NEXT:    ret{{[l|q]}}
  72   %1 = tail call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %a0, <4 x i32> <i32 0, i32 0, i32 2, i32 undef>)
  73   ret <4 x float> %1
  74 }
  75
  ; vpermilvar.ps with mask <2,2,3,3> (the pattern of unpacking the high half
  ; with itself). The recorded output is an immediate-form vpermilps rather
  ; than an unpack instruction.
  76 define <4 x float> @combine_vpermilvar_4f32_unpckh(<4 x float> %a0) {
  77 ; CHECK-LABEL: combine_vpermilvar_4f32_unpckh:
  78 ; CHECK:       # %bb.0:
  79 ; CHECK-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[2,2,3,3]
  80 ; CHECK-NEXT:    ret{{[l|q]}}
  81   %1 = tail call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %a0, <4 x i32> <i32 2, i32 2, i32 3, i32 3>)
  82   ret <4 x float> %1
  83 }
  84
  ; vpermilvar.ps with mask <0,0,1,1> (the pattern of unpacking the low half
  ; with itself). The recorded output is an immediate-form vpermilps rather
  ; than an unpack instruction.
  85 define <4 x float> @combine_vpermilvar_4f32_unpckl(<4 x float> %a0) {
  86 ; CHECK-LABEL: combine_vpermilvar_4f32_unpckl:
  87 ; CHECK:       # %bb.0:
  88 ; CHECK-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,1]
  89 ; CHECK-NEXT:    ret{{[l|q]}}
  90   %1 = tail call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %a0, <4 x i32> <i32 0, i32 0, i32 1, i32 1>)
  91   ret <4 x float> %1
  92 }
  93
  ; Two chained vpermilvar.ps.256 calls whose masks differ only in the last
  ; index (undef in the first call, 1 in the second). The assertions expect the
  ; pair to cancel out, leaving only the return.
  94 define <8 x float> @combine_vpermilvar_8f32_identity(<8 x float> %a0) {
  95 ; CHECK-LABEL: combine_vpermilvar_8f32_identity:
  96 ; CHECK:       # %bb.0:
  97 ; CHECK-NEXT:    ret{{[l|q]}}
  98   %1 = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %a0, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 2, i32 3, i32 0, i32 undef>)
  99   %2 = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float>  %1, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 2, i32 3, i32 0, i32 1>)
 100   ret <8 x float> %2
 101 }
 102
 ; Two chained vpermilvar.ps.256 calls, each with an undef last index.
 ; The assertions expect them to merge into one immediate vpermilps whose
 ; pattern [1,0,3,2,6,u,4,u] gives the test its name.
 103 define <8 x float> @combine_vpermilvar_8f32_10326u4u(<8 x float> %a0) {
 104 ; CHECK-LABEL: combine_vpermilvar_8f32_10326u4u:
 105 ; CHECK:       # %bb.0:
 106 ; CHECK-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[1,0,3,2,6,u,4,u]
 107 ; CHECK-NEXT:    ret{{[l|q]}}
 108   %1 = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %a0, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 0, i32 1, i32 2, i32 undef>)
 109   %2 = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float>  %1, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 undef>)
 110   ret <8 x float> %2
 111 }
 112
 ; Reverses the elements of each 4-float half, swaps the two halves with a
 ; shufflevector (<4,5,6,7,0,1,2,3>), then reverses each half again.
 ; The assertions expect one half-swapping instruction in total - vperm2f128
 ; for the AVX1 runs and vpermpd for the AVX2 and AVX512 runs.
 113 define <8 x float> @combine_vpermilvar_vperm2f128_8f32(<8 x float> %a0) {
 114 ; AVX1-LABEL: combine_vpermilvar_vperm2f128_8f32:
 115 ; AVX1:       # %bb.0:
 116 ; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
 117 ; AVX1-NEXT:    ret{{[l|q]}}
 118 ;
 119 ; AVX2-LABEL: combine_vpermilvar_vperm2f128_8f32:
 120 ; AVX2:       # %bb.0:
 121 ; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,3,0,1]
 122 ; AVX2-NEXT:    ret{{[l|q]}}
 123 ;
 124 ; AVX512-LABEL: combine_vpermilvar_vperm2f128_8f32:
 125 ; AVX512:       # %bb.0:
 126 ; AVX512-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,3,0,1]
 127 ; AVX512-NEXT:    ret{{[l|q]}}
 128   %1 = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %a0, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 3, i32 2, i32 1, i32 0>)
 129   %2 = shufflevector <8 x float> %1, <8 x float> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
 130   %3 = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float>  %2, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 3, i32 2, i32 1, i32 0>)
 131   ret <8 x float> %3
 132 }
 133
 ; Per-half reversal, then a shufflevector against zeroinitializer that puts
 ; zeros in the low half and the old low half in the high half, then per-half
 ; reversal again.
 ; The AVX1/AVX2 runs (shared AVX prefix) expect a single vperm2f128 with
 ; zeroed low half. The AVX512 runs currently record a longer sequence
 ; (vpermt2ps against a zero register followed by vpermilps).
 134 define <8 x float> @combine_vpermilvar_vperm2f128_zero_8f32(<8 x float> %a0) {
 135 ; AVX-LABEL: combine_vpermilvar_vperm2f128_zero_8f32:
 136 ; AVX:       # %bb.0:
 137 ; AVX-NEXT:    vperm2f128 {{.*#+}} ymm0 = zero,zero,ymm0[0,1]
 138 ; AVX-NEXT:    ret{{[l|q]}}
 139 ;
 140 ; AVX512-LABEL: combine_vpermilvar_vperm2f128_zero_8f32:
 141 ; AVX512:       # %bb.0:
 142 ; AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
 143 ; AVX512-NEXT:    vmovaps {{.*#+}} ymm1 = [16,17,18,19,3,2,1,0]
 144 ; AVX512-NEXT:    vxorps %xmm2, %xmm2, %xmm2
 145 ; AVX512-NEXT:    vpermt2ps %zmm2, %zmm1, %zmm0
 146 ; AVX512-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
 147 ; AVX512-NEXT:    ret{{[l|q]}}
 148   %1 = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %a0, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 3, i32 2, i32 1, i32 0>)
 149   %2 = shufflevector <8 x float> %1, <8 x float> zeroinitializer, <8 x i32> <i32 8, i32 8, i32 8, i32 8, i32 0, i32 1, i32 2, i32 3>
 150   %3 = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float>  %2, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 3, i32 2, i32 1, i32 0>)
 151   ret <8 x float> %3
 152 }
 153
 ; vpermilvar.pd.256 with mask <2,0,2,0>, then a shufflevector against
 ; zeroinitializer (<4,5,0,1>: zeros low, old low half high), then the same
 ; vpermilvar.pd.256 again.
 ; All runs expect a single vperm2f128 with a zeroed low half.
 154 define <4 x double> @combine_vperm2f128_vpermilvar_as_vperm2f128(<4 x double> %a0) {
 155 ; CHECK-LABEL: combine_vperm2f128_vpermilvar_as_vperm2f128:
 156 ; CHECK:       # %bb.0:
 157 ; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = zero,zero,ymm0[0,1]
 158 ; CHECK-NEXT:    ret{{[l|q]}}
 159   %1 = tail call <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double> %a0, <4 x i64> <i64 2, i64 0, i64 2, i64 0>)
 160   %2 = shufflevector <4 x double> %1, <4 x double> zeroinitializer, <4 x i32> <i32 4, i32 5, i32 0, i32 1>
 161   %3 = tail call <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double> %2, <4 x i64> <i64 2, i64 0, i64 2, i64 0>)
 162   ret <4 x double> %3
 163 }
 164
 ; vpermilvar.pd.256 with mask <2,0,2,0>, then a shufflevector against
 ; zeroinitializer (<0,1,4,5>: low half kept, zeros in the high half), then
 ; the same vpermilvar.pd.256 again.
 ; All runs expect the whole chain to become one xmm-sized vmovaps.
 165 define <4 x double> @combine_vperm2f128_vpermilvar_as_vmovaps(<4 x double> %a0) {
 166 ; CHECK-LABEL: combine_vperm2f128_vpermilvar_as_vmovaps:
 167 ; CHECK:       # %bb.0:
 168 ; CHECK-NEXT:    vmovaps %xmm0, %xmm0
 169 ; CHECK-NEXT:    ret{{[l|q]}}
 170   %1 = tail call <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double> %a0, <4 x i64> <i64 2, i64 0, i64 2, i64 0>)
 171   %2 = shufflevector <4 x double> %1, <4 x double> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
 172   %3 = tail call <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double> %2, <4 x i64> <i64 2, i64 0, i64 2, i64 0>)
 173   ret <4 x double> %3
 174 }
 175
 ; vpermilvar.ps.256 with mask <0,1,0,1,4,5,4,5> (low float pair of each half
 ; repeated). The assertions expect a single 256-bit vmovddup.
 176 define <8 x float> @combine_vpermilvar_8f32_movddup(<8 x float> %a0) {
 177 ; CHECK-LABEL: combine_vpermilvar_8f32_movddup:
 178 ; CHECK:       # %bb.0:
 179 ; CHECK-NEXT:    vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
 180 ; CHECK-NEXT:    ret{{[l|q]}}
 181   %1 = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %a0, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 4, i32 5, i32 4, i32 5>)
 182   ret <8 x float> %1
 183 }
 ; vpermilvar.ps.256 with mask <0,1,0,1,4,5,4,5> applied to a vector loaded
 ; from %a0. The assertions expect the load to be folded into a 256-bit
 ; vmovddup; the 32-bit runs additionally fetch the pointer from the stack.
 184 define <8 x float> @combine_vpermilvar_8f32_movddup_load(<8 x float> *%a0) {
 185 ; X86-LABEL: combine_vpermilvar_8f32_movddup_load:
 186 ; X86:       # %bb.0:
 187 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 188 ; X86-NEXT:    vmovddup {{.*#+}} ymm0 = mem[0,0,2,2]
 189 ; X86-NEXT:    retl
 190 ;
 191 ; X64-LABEL: combine_vpermilvar_8f32_movddup_load:
 192 ; X64:       # %bb.0:
 193 ; X64-NEXT:    vmovddup {{.*#+}} ymm0 = mem[0,0,2,2]
 194 ; X64-NEXT:    retq
 195   %1 = load <8 x float>, <8 x float> *%a0
 196   %2 = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %1, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 4, i32 5, i32 4, i32 5>)
 197   ret <8 x float> %2
 198 }
 199
 ; vpermilvar.ps.256 with mask <1,1,3,3,undef,5,7,7>. The undef index leaves
 ; element 4 unconstrained, and the assertions expect a 256-bit vmovshdup.
 200 define <8 x float> @combine_vpermilvar_8f32_movshdup(<8 x float> %a0) {
 201 ; CHECK-LABEL: combine_vpermilvar_8f32_movshdup:
 202 ; CHECK:       # %bb.0:
 203 ; CHECK-NEXT:    vmovshdup {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7]
 204 ; CHECK-NEXT:    ret{{[l|q]}}
 205   %1 = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %a0, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 undef, i32 5, i32 7, i32 7>)
 206   ret <8 x float> %1
 207 }
 ; Demanded-elements variant: the runtime value %a1 is inserted into mask
 ; element 7, but the trailing shufflevector (<0,1,2,3,4,5,6,6>) never reads
 ; result element 7. The assertions expect the variable mask element to be
 ; ignored and a plain vmovshdup to be emitted.
 208 define <8 x float> @demandedelts_vpermilvar_8f32_movshdup(<8 x float> %a0, i32 %a1) {
 209 ; CHECK-LABEL: demandedelts_vpermilvar_8f32_movshdup:
 210 ; CHECK:       # %bb.0:
 211 ; CHECK-NEXT:    vmovshdup {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7]
 212 ; CHECK-NEXT:    ret{{[l|q]}}
 213   %1 = insertelement <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 undef, i32 5, i32 7, i32 7>, i32 %a1, i32 7
 214   %2 = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %a0, <8 x i32> %1)
 215   %3 = shufflevector <8 x float> %2, <8 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 6>
 216   ret <8 x float> %3
 217 }
 218
 ; vpermilvar.ps.256 with mask <0,0,2,2,4,4,6,6> (even elements duplicated).
 ; The assertions expect a single 256-bit vmovsldup.
 219 define <8 x float> @combine_vpermilvar_8f32_movsldup(<8 x float> %a0) {
 220 ; CHECK-LABEL: combine_vpermilvar_8f32_movsldup:
 221 ; CHECK:       # %bb.0:
 222 ; CHECK-NEXT:    vmovsldup {{.*#+}} ymm0 = ymm0[0,0,2,2,4,4,6,6]
 223 ; CHECK-NEXT:    ret{{[l|q]}}
 224   %1 = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %a0, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>)
 225   ret <8 x float> %1
 226 }
 ; Demanded-elements variant: the runtime value %a1 is inserted into mask
 ; element 0, but the trailing shufflevector (<1,1,2,3,4,5,6,7>) never reads
 ; result element 0. The assertions expect the variable mask element to be
 ; ignored and a plain vmovsldup to be emitted.
 227 define <8 x float> @demandedelts_vpermilvar_8f32_movsldup(<8 x float> %a0, i32 %a1) {
 228 ; CHECK-LABEL: demandedelts_vpermilvar_8f32_movsldup:
 229 ; CHECK:       # %bb.0:
 230 ; CHECK-NEXT:    vmovsldup {{.*#+}} ymm0 = ymm0[0,0,2,2,4,4,6,6]
 231 ; CHECK-NEXT:    ret{{[l|q]}}
 232   %1 = insertelement <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>, i32 %a1, i32 0
 233   %2 = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %a0, <8 x i32> %1)
 234   %3 = shufflevector <8 x float> %2, <8 x float> undef, <8 x i32> <i32 1, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 235   ret <8 x float> %3
 236 }
 237
 ; Applies vpermilvar.pd with mask <2,0> twice in a row.
 ; The assertions expect the pair to cancel out, leaving only the return.
 238 define <2 x double> @combine_vpermilvar_2f64_identity(<2 x double> %a0) {
 239 ; CHECK-LABEL: combine_vpermilvar_2f64_identity:
 240 ; CHECK:       # %bb.0:
 241 ; CHECK-NEXT:    ret{{[l|q]}}
 242   %1 = tail call <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double> %a0, <2 x i64> <i64 2, i64 0>)
 243   %2 = tail call <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double>  %1, <2 x i64> <i64 2, i64 0>)
 244   ret <2 x double> %2
 245 }
 246
 ; vpermilvar.pd with the all-zero mask <0,0>.
 ; The assertions expect a single vmovddup duplicating element 0.
 247 define <2 x double> @combine_vpermilvar_2f64_movddup(<2 x double> %a0) {
 248 ; CHECK-LABEL: combine_vpermilvar_2f64_movddup:
 249 ; CHECK:       # %bb.0:
 250 ; CHECK-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
 251 ; CHECK-NEXT:    ret{{[l|q]}}
 252   %1 = tail call <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double> %a0, <2 x i64> <i64 0, i64 0>)
 253   ret <2 x double> %1
 254 }
 255
 ; Applies vpermilvar.pd.256 with mask <2,0,2,0> twice in a row.
 ; The assertions expect the pair to cancel out, leaving only the return.
 256 define <4 x double> @combine_vpermilvar_4f64_identity(<4 x double> %a0) {
 257 ; CHECK-LABEL: combine_vpermilvar_4f64_identity:
 258 ; CHECK:       # %bb.0:
 259 ; CHECK-NEXT:    ret{{[l|q]}}
 260   %1 = tail call <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double> %a0, <4 x i64> <i64 2, i64 0, i64 2, i64 0>)
 261   %2 = tail call <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double>  %1, <4 x i64> <i64 2, i64 0, i64 2, i64 0>)
 262   ret <4 x double> %2
 263 }
 264
 ; vpermilvar.pd.256 with mask <0,0,4,4>.
 ; The assertions expect a single 256-bit vmovddup ([0,0,2,2]), i.e. the mask
 ; values 0 and 4 both select the low element of their 128-bit half.
 265 define <4 x double> @combine_vpermilvar_4f64_movddup(<4 x double> %a0) {
 266 ; CHECK-LABEL: combine_vpermilvar_4f64_movddup:
 267 ; CHECK:       # %bb.0:
 268 ; CHECK-NEXT:    vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
 269 ; CHECK-NEXT:    ret{{[l|q]}}
 270   %1 = tail call <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double> %a0, <4 x i64> <i64 0, i64 0, i64 4, i64 4>)
 271   ret <4 x double> %1
 272 }
 273
 ; Chain of four vpermilvar.ps calls with constant masks, each feeding the
 ; next. The assertions expect the whole chain to merge into one immediate
 ; vpermilps with pattern [2,0,3,1].
 274 define <4 x float> @combine_vpermilvar_4f32_4stage(<4 x float> %a0) {
 275 ; CHECK-LABEL: combine_vpermilvar_4f32_4stage:
 276 ; CHECK:       # %bb.0:
 277 ; CHECK-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[2,0,3,1]
 278 ; CHECK-NEXT:    ret{{[l|q]}}
 279   %1 = tail call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %a0, <4 x i32> <i32 3, i32 2, i32 1, i32 0>)
 280   %2 = tail call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float>  %1, <4 x i32> <i32 2, i32 3, i32 0, i32 1>)
 281   %3 = tail call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float>  %2, <4 x i32> <i32 0, i32 2, i32 1, i32 3>)
 282   %4 = tail call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float>  %3, <4 x i32> <i32 3, i32 2, i32 1, i32 0>)
 283   ret <4 x float> %4
 284 }
 285
 ; 256-bit version of the four-stage chain: four vpermilvar.ps.256 calls with
 ; constant masks (same pattern in both halves), each feeding the next.
 ; The assertions expect one immediate vpermilps, [2,0,3,1,6,4,7,5].
 286 define <8 x float> @combine_vpermilvar_8f32_4stage(<8 x float> %a0) {
 287 ; CHECK-LABEL: combine_vpermilvar_8f32_4stage:
 288 ; CHECK:       # %bb.0:
 289 ; CHECK-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[2,0,3,1,6,4,7,5]
 290 ; CHECK-NEXT:    ret{{[l|q]}}
 291   %1 = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %a0, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 3, i32 2, i32 1, i32 0>)
 292   %2 = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float>  %1, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>)
 293   %3 = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float>  %2, <8 x i32> <i32 0, i32 2, i32 1, i32 3, i32 0, i32 2, i32 1, i32 3>)
 294   %4 = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float>  %3, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 3, i32 2, i32 1, i32 0>)
 295   ret <8 x float> %4
 296 }
 297
 ; Reverses the vector with vpermilvar.ps, then a shufflevector against
 ; zeroinitializer picks elements <2,4,1,4> (index 4 is a zero element).
 ; The assertions expect both steps to merge into one vinsertps producing
 ; xmm0[1],zero,xmm0[2],zero.
 298 define <4 x float> @combine_vpermilvar_4f32_as_insertps(<4 x float> %a0) {
 299 ; CHECK-LABEL: combine_vpermilvar_4f32_as_insertps:
 300 ; CHECK:       # %bb.0:
 301 ; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[1],zero,xmm0[2],zero
 302 ; CHECK-NEXT:    ret{{[l|q]}}
 303   %1 = call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %a0, <4 x i32> <i32 3, i32 2, i32 1, i32 0>)
 304   %2 = shufflevector <4 x float> %1, <4 x float> zeroinitializer, <4 x i32> <i32 2, i32 4, i32 1, i32 4>
 305   ret <4 x float> %2
 306 }
 307
 ; vpermilvar.pd on all-constant operands (data <1.0,2.0>, mask <2,0>).
 ; The assertions expect the shuffle to be folded away, leaving a load of the
 ; precomputed constant [2.0,1.0].
 308 define <2 x double> @constant_fold_vpermilvar_pd() {
 309 ; CHECK-LABEL: constant_fold_vpermilvar_pd:
 310 ; CHECK:       # %bb.0:
 311 ; CHECK-NEXT:    vmovaps {{.*#+}} xmm0 = [2.0E+0,1.0E+0]
 312 ; CHECK-NEXT:    ret{{[l|q]}}
 313   %1 = call <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double> <double 1.0, double 2.0>, <2 x i64> <i64 2, i64 0>)
 314   ret <2 x double> %1
 315 }
 316
 ; vpermilvar.pd.256 on all-constant operands (data <1.0,2.0,3.0,4.0>,
 ; mask <2,0,0,2>). The assertions expect the shuffle to be folded away,
 ; leaving a load of the precomputed constant [2.0,1.0,3.0,4.0].
 317 define <4 x double> @constant_fold_vpermilvar_pd_256() {
 318 ; CHECK-LABEL: constant_fold_vpermilvar_pd_256:
 319 ; CHECK:       # %bb.0:
 320 ; CHECK-NEXT:    vmovaps {{.*#+}} ymm0 = [2.0E+0,1.0E+0,3.0E+0,4.0E+0]
 321 ; CHECK-NEXT:    ret{{[l|q]}}
 322   %1 = call <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double> <double 1.0, double 2.0, double 3.0, double 4.0>, <4 x i64> <i64 2, i64 0, i64 0, i64 2>)
 323   ret <4 x double> %1
 324 }
 325
 ; vpermilvar.ps on all-constant operands (data <1.0,2.0,3.0,4.0>,
 ; mask <3,0,2,1>). The assertions expect the shuffle to be folded away,
 ; leaving a load of the precomputed constant [4.0,1.0,3.0,2.0].
 326 define <4 x float> @constant_fold_vpermilvar_ps() {
 327 ; CHECK-LABEL: constant_fold_vpermilvar_ps:
 328 ; CHECK:       # %bb.0:
 329 ; CHECK-NEXT:    vmovaps {{.*#+}} xmm0 = [4.0E+0,1.0E+0,3.0E+0,2.0E+0]
 330 ; CHECK-NEXT:    ret{{[l|q]}}
 331   %1 = call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> <float 1.0, float 2.0, float 3.0, float 4.0>, <4 x i32> <i32 3, i32 0, i32 2, i32 1>)
 332   ret <4 x float> %1
 333 }
 334
 ; vpermilvar.ps.256 on all-constant operands (data 1.0 through 8.0,
 ; mask <4,0,2,1,0,1,1,1>). The assertions expect a load of the precomputed
 ; constant [1,1,3,2,5,6,6,6]; note the out-of-range mask value 4 in element 0
 ; yields 1.0, the same element that index 0 would select.
 335 define <8 x float> @constant_fold_vpermilvar_ps_256() {
 336 ; CHECK-LABEL: constant_fold_vpermilvar_ps_256:
 337 ; CHECK:       # %bb.0:
 338 ; CHECK-NEXT:    vmovaps {{.*#+}} ymm0 = [1.0E+0,1.0E+0,3.0E+0,2.0E+0,5.0E+0,6.0E+0,6.0E+0,6.0E+0]
 339 ; CHECK-NEXT:    ret{{[l|q]}}
 340   %1 = call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0>, <8 x i32> <i32 4, i32 0, i32 2, i32 1, i32 0, i32 1, i32 1, i32 1>)
 341   ret <8 x float> %1
 342 }
 343
 ; Regression test named after a bug report (PR39483 - presumably an LLVM
 ; bug-tracker id; the report itself is not visible in this file).
 ; Loads a <24 x float> from a null pointer, extracts every third element
 ; starting at index 2 with a shufflevector, multiplies by zero, adds zero and
 ; stores the result through an undef pointer. The function ends in
 ; 'unreachable', so none of the recorded sequences contains a return.
 ; Separate assertion blocks are kept per triple/feature combination because
 ; the store address register (and the shuffle lowering) differs between them.
 344 define void @PR39483() {
 345 ; X86-AVX1-LABEL: PR39483:
 346 ; X86-AVX1:       # %bb.0: # %entry
 347 ; X86-AVX1-NEXT:    vmovups 32, %ymm0
 348 ; X86-AVX1-NEXT:    vmovups 64, %xmm1
 349 ; X86-AVX1-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[0,1],mem[0,3]
 350 ; X86-AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
 351 ; X86-AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7]
 352 ; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
 353 ; X86-AVX1-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
 354 ; X86-AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[2,1,0,3]
 355 ; X86-AVX1-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm2[1,0]
 356 ; X86-AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
 357 ; X86-AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
 358 ; X86-AVX1-NEXT:    vxorps %xmm1, %xmm1, %xmm1
 359 ; X86-AVX1-NEXT:    vmulps %ymm1, %ymm0, %ymm0
 360 ; X86-AVX1-NEXT:    vaddps %ymm1, %ymm0, %ymm0
 361 ; X86-AVX1-NEXT:    vmovups %ymm0, (%eax)
 362 ;
 363 ; X86-AVX2-LABEL: PR39483:
 364 ; X86-AVX2:       # %bb.0: # %entry
 365 ; X86-AVX2-NEXT:    vmovups 32, %ymm0
 366 ; X86-AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7]
 367 ; X86-AVX2-NEXT:    vmovaps {{.*#+}} ymm1 = <2,5,0,3,6,u,u,u>
 368 ; X86-AVX2-NEXT:    vpermps %ymm0, %ymm1, %ymm0
 369 ; X86-AVX2-NEXT:    vpermilps {{.*#+}} ymm1 = mem[0,1,0,3,4,5,4,7]
 370 ; X86-AVX2-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3]
 371 ; X86-AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
 372 ; X86-AVX2-NEXT:    vxorps %xmm1, %xmm1, %xmm1
 373 ; X86-AVX2-NEXT:    vmulps %ymm1, %ymm0, %ymm0
 374 ; X86-AVX2-NEXT:    vaddps %ymm1, %ymm0, %ymm0
 375 ; X86-AVX2-NEXT:    vmovups %ymm0, (%eax)
 376 ;
 377 ; X86-AVX512-LABEL: PR39483:
 378 ; X86-AVX512:       # %bb.0: # %entry
 379 ; X86-AVX512-NEXT:    vmovups 0, %zmm0
 380 ; X86-AVX512-NEXT:    vmovups 64, %ymm1
 381 ; X86-AVX512-NEXT:    vmovaps {{.*#+}} ymm2 = [2,5,8,11,14,17,20,23]
 382 ; X86-AVX512-NEXT:    vpermi2ps %zmm1, %zmm0, %zmm2
 383 ; X86-AVX512-NEXT:    vxorps %xmm0, %xmm0, %xmm0
 384 ; X86-AVX512-NEXT:    vmulps %ymm0, %ymm2, %ymm1
 385 ; X86-AVX512-NEXT:    vaddps %ymm0, %ymm1, %ymm0
 386 ; X86-AVX512-NEXT:    vmovups %ymm0, (%eax)
 387 ;
 388 ; X64-AVX1-LABEL: PR39483:
 389 ; X64-AVX1:       # %bb.0: # %entry
 390 ; X64-AVX1-NEXT:    vmovups 32, %ymm0
 391 ; X64-AVX1-NEXT:    vmovups 64, %xmm1
 392 ; X64-AVX1-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[0,1],mem[0,3]
 393 ; X64-AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
 394 ; X64-AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7]
 395 ; X64-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
 396 ; X64-AVX1-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
 397 ; X64-AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[2,1,0,3]
 398 ; X64-AVX1-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm2[1,0]
 399 ; X64-AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
 400 ; X64-AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
 401 ; X64-AVX1-NEXT:    vxorps %xmm1, %xmm1, %xmm1
 402 ; X64-AVX1-NEXT:    vmulps %ymm1, %ymm0, %ymm0
 403 ; X64-AVX1-NEXT:    vaddps %ymm1, %ymm0, %ymm0
 404 ; X64-AVX1-NEXT:    vmovups %ymm0, (%rax)
 405 ;
 406 ; X64-AVX2-LABEL: PR39483:
 407 ; X64-AVX2:       # %bb.0: # %entry
 408 ; X64-AVX2-NEXT:    vmovups 32, %ymm0
 409 ; X64-AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7]
 410 ; X64-AVX2-NEXT:    vmovaps {{.*#+}} ymm1 = <2,5,0,3,6,u,u,u>
 411 ; X64-AVX2-NEXT:    vpermps %ymm0, %ymm1, %ymm0
 412 ; X64-AVX2-NEXT:    vpermilps {{.*#+}} ymm1 = mem[0,1,0,3,4,5,4,7]
 413 ; X64-AVX2-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3]
 414 ; X64-AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
 415 ; X64-AVX2-NEXT:    vxorps %xmm1, %xmm1, %xmm1
 416 ; X64-AVX2-NEXT:    vmulps %ymm1, %ymm0, %ymm0
 417 ; X64-AVX2-NEXT:    vaddps %ymm1, %ymm0, %ymm0
 418 ; X64-AVX2-NEXT:    vmovups %ymm0, (%rax)
 419 ;
 420 ; X64-AVX512-LABEL: PR39483:
 421 ; X64-AVX512:       # %bb.0: # %entry
 422 ; X64-AVX512-NEXT:    vmovups 0, %zmm0
 423 ; X64-AVX512-NEXT:    vmovups 64, %ymm1
 424 ; X64-AVX512-NEXT:    vmovaps {{.*#+}} ymm2 = [2,5,8,11,14,17,20,23]
 425 ; X64-AVX512-NEXT:    vpermi2ps %zmm1, %zmm0, %zmm2
 426 ; X64-AVX512-NEXT:    vxorps %xmm0, %xmm0, %xmm0
 427 ; X64-AVX512-NEXT:    vmulps %ymm0, %ymm2, %ymm1
 428 ; X64-AVX512-NEXT:    vaddps %ymm0, %ymm1, %ymm0
 429 ; X64-AVX512-NEXT:    vmovups %ymm0, (%rax)
 430 entry:
 431   %wide.vec = load <24 x float>, <24 x float>* null, align 4
 432   %strided.vec18 = shufflevector <24 x float> %wide.vec, <24 x float> undef, <8 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23>
 433   %0 = fmul <8 x float> %strided.vec18, zeroinitializer
 434   %1 = fadd <8 x float> zeroinitializer, %0
 435   store <8 x float> %1, <8 x float>* undef, align 16
 436   unreachable
 437 }
 438
 ; Regression test named after a bug report (PR48908 - presumably an LLVM
 ; bug-tracker id; the report itself is not visible in this file).
 ; Builds two intermediate vectors (%t0 from %v0/%v1, %t1 from %v1/%v2) and
 ; shuffles them into %r0 and %r1; a third intermediate %t2 (with two undef
 ; elements) is shuffled with %v2 into %r2. Each result is stored, 32-byte
 ; aligned, through its own noalias output pointer.
 ; Separate assertion blocks are kept per triple/feature combination; the
 ; 32-bit runs first fetch the three output pointers from the stack.
 439 define void @PR48908(<4 x double> %v0, <4 x double> %v1, <4 x double> %v2, <4 x double>* noalias %out0, <4 x double>* noalias %out1, <4 x double>* noalias %out2) {
 440 ; X86-AVX1-LABEL: PR48908:
 441 ; X86-AVX1:       # %bb.0:
 442 ; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax
 443 ; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 444 ; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %edx
 445 ; X86-AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm3
 446 ; X86-AVX1-NEXT:    vpermilpd {{.*#+}} ymm3 = ymm3[0,1,2,2]
 447 ; X86-AVX1-NEXT:    vperm2f128 {{.*#+}} ymm4 = ymm1[2,3],ymm2[0,1]
 448 ; X86-AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm5
 449 ; X86-AVX1-NEXT:    vshufpd {{.*#+}} ymm4 = ymm5[1],ymm4[0],ymm5[2],ymm4[3]
 450 ; X86-AVX1-NEXT:    vperm2f128 {{.*#+}} ymm5 = ymm0[0,1],ymm2[0,1]
 451 ; X86-AVX1-NEXT:    vperm2f128 {{.*#+}} ymm3 = ymm3[2,3,0,1]
 452 ; X86-AVX1-NEXT:    vblendpd {{.*#+}} ymm3 = ymm5[0],ymm3[1],ymm5[2],ymm3[3]
 453 ; X86-AVX1-NEXT:    vmovapd %ymm3, (%edx)
 454 ; X86-AVX1-NEXT:    vperm2f128 {{.*#+}} ymm3 = ymm4[2,3,0,1]
 455 ; X86-AVX1-NEXT:    vblendpd {{.*#+}} ymm4 = ymm4[0,1],ymm0[2],ymm4[3]
 456 ; X86-AVX1-NEXT:    vblendpd {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2],ymm3[3]
 457 ; X86-AVX1-NEXT:    vmovapd %ymm3, (%ecx)
 458 ; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
 459 ; X86-AVX1-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3]
 460 ; X86-AVX1-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm2[1],ymm0[3],ymm2[3]
 461 ; X86-AVX1-NEXT:    vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[3],ymm0[3]
 462 ; X86-AVX1-NEXT:    vmovapd %ymm0, (%eax)
 463 ; X86-AVX1-NEXT:    vzeroupper
 464 ; X86-AVX1-NEXT:    retl
 465 ;
 466 ; X86-AVX2-LABEL: PR48908:
 467 ; X86-AVX2:       # %bb.0:
 468 ; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 469 ; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 470 ; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %edx
 471 ; X86-AVX2-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm3
 472 ; X86-AVX2-NEXT:    vperm2f128 {{.*#+}} ymm4 = ymm1[2,3],ymm2[0,1]
 473 ; X86-AVX2-NEXT:    vpermilpd {{.*#+}} xmm5 = xmm1[1,0]
 474 ; X86-AVX2-NEXT:    vperm2f128 {{.*#+}} ymm6 = ymm0[0,1],ymm2[0,1]
 475 ; X86-AVX2-NEXT:    vpermpd {{.*#+}} ymm3 = ymm3[0,2,2,1]
 476 ; X86-AVX2-NEXT:    vblendpd {{.*#+}} ymm3 = ymm6[0],ymm3[1],ymm6[2],ymm3[3]
 477 ; X86-AVX2-NEXT:    vmovapd %ymm3, (%edx)
 478 ; X86-AVX2-NEXT:    vblendpd {{.*#+}} ymm3 = ymm5[0,1],ymm0[2],ymm5[3]
 479 ; X86-AVX2-NEXT:    vpermpd {{.*#+}} ymm4 = ymm4[0,3,2,0]
 480 ; X86-AVX2-NEXT:    vblendpd {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2],ymm4[3]
 481 ; X86-AVX2-NEXT:    vmovapd %ymm3, (%ecx)
 482 ; X86-AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm0
 483 ; X86-AVX2-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3]
 484 ; X86-AVX2-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm2[1],ymm0[3],ymm2[3]
 485 ; X86-AVX2-NEXT:    vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[3],ymm0[3]
 486 ; X86-AVX2-NEXT:    vmovapd %ymm0, (%eax)
 487 ; X86-AVX2-NEXT:    vzeroupper
 488 ; X86-AVX2-NEXT:    retl
 489 ;
 490 ; X86-AVX512-LABEL: PR48908:
 491 ; X86-AVX512:       # %bb.0:
 492 ; X86-AVX512-NEXT:    # kill: def $ymm2 killed $ymm2 def $zmm2
 493 ; X86-AVX512-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
 494 ; X86-AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
 495 ; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
 496 ; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 497 ; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %edx
 498 ; X86-AVX512-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm3
 499 ; X86-AVX512-NEXT:    vshufpd {{.*#+}} ymm3 = ymm0[0],ymm3[1],ymm0[2],ymm3[2]
 500 ; X86-AVX512-NEXT:    vperm2f128 {{.*#+}} ymm4 = ymm1[2,3],ymm2[0,1]
 501 ; X86-AVX512-NEXT:    vshufpd {{.*#+}} ymm4 = ymm1[1],ymm4[0],ymm1[2],ymm4[3]
 502 ; X86-AVX512-NEXT:    vmovapd {{.*#+}} ymm5 = [0,0,3,0,8,0,1,0]
 503 ; X86-AVX512-NEXT:    vpermt2pd %zmm2, %zmm5, %zmm3
 504 ; X86-AVX512-NEXT:    vmovapd %ymm3, (%edx)
 505 ; X86-AVX512-NEXT:    vmovapd {{.*#+}} ymm3 = [0,0,3,0,10,0,1,0]
 506 ; X86-AVX512-NEXT:    vpermt2pd %zmm0, %zmm3, %zmm4
 507 ; X86-AVX512-NEXT:    vmovapd %ymm4, (%ecx)
 508 ; X86-AVX512-NEXT:    vmovapd {{.*#+}} ymm3 = <3,0,11,0,u,u,u,u>
 509 ; X86-AVX512-NEXT:    vpermi2pd %zmm1, %zmm0, %zmm3
 510 ; X86-AVX512-NEXT:    vmovapd {{.*#+}} ymm0 = [2,0,8,0,9,0,3,0]
 511 ; X86-AVX512-NEXT:    vpermi2pd %zmm3, %zmm2, %zmm0
 512 ; X86-AVX512-NEXT:    vmovapd %ymm0, (%eax)
 513 ; X86-AVX512-NEXT:    vzeroupper
 514 ; X86-AVX512-NEXT:    retl
 515 ;
 516 ; X64-AVX1-LABEL: PR48908:
 517 ; X64-AVX1:       # %bb.0:
 518 ; X64-AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm3
 519 ; X64-AVX1-NEXT:    vpermilpd {{.*#+}} ymm3 = ymm3[0,1,2,2]
 520 ; X64-AVX1-NEXT:    vperm2f128 {{.*#+}} ymm4 = ymm1[2,3],ymm2[0,1]
 521 ; X64-AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm5
 522 ; X64-AVX1-NEXT:    vshufpd {{.*#+}} ymm4 = ymm5[1],ymm4[0],ymm5[2],ymm4[3]
 523 ; X64-AVX1-NEXT:    vperm2f128 {{.*#+}} ymm5 = ymm0[0,1],ymm2[0,1]
 524 ; X64-AVX1-NEXT:    vperm2f128 {{.*#+}} ymm3 = ymm3[2,3,0,1]
 525 ; X64-AVX1-NEXT:    vblendpd {{.*#+}} ymm3 = ymm5[0],ymm3[1],ymm5[2],ymm3[3]
 526 ; X64-AVX1-NEXT:    vmovapd %ymm3, (%rdi)
 527 ; X64-AVX1-NEXT:    vperm2f128 {{.*#+}} ymm3 = ymm4[2,3,0,1]
 528 ; X64-AVX1-NEXT:    vblendpd {{.*#+}} ymm4 = ymm4[0,1],ymm0[2],ymm4[3]
 529 ; X64-AVX1-NEXT:    vblendpd {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2],ymm3[3]
 530 ; X64-AVX1-NEXT:    vmovapd %ymm3, (%rsi)
 531 ; X64-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
 532 ; X64-AVX1-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3]
 533 ; X64-AVX1-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm2[1],ymm0[3],ymm2[3]
 534 ; X64-AVX1-NEXT:    vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[3],ymm0[3]
 535 ; X64-AVX1-NEXT:    vmovapd %ymm0, (%rdx)
 536 ; X64-AVX1-NEXT:    vzeroupper
 537 ; X64-AVX1-NEXT:    retq
 538 ;
 539 ; X64-AVX2-LABEL: PR48908:
 540 ; X64-AVX2:       # %bb.0:
 541 ; X64-AVX2-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm3
 542 ; X64-AVX2-NEXT:    vperm2f128 {{.*#+}} ymm4 = ymm1[2,3],ymm2[0,1]
 543 ; X64-AVX2-NEXT:    vpermilpd {{.*#+}} xmm5 = xmm1[1,0]
 544 ; X64-AVX2-NEXT:    vperm2f128 {{.*#+}} ymm6 = ymm0[0,1],ymm2[0,1]
 545 ; X64-AVX2-NEXT:    vpermpd {{.*#+}} ymm3 = ymm3[0,2,2,1]
 546 ; X64-AVX2-NEXT:    vblendpd {{.*#+}} ymm3 = ymm6[0],ymm3[1],ymm6[2],ymm3[3]
 547 ; X64-AVX2-NEXT:    vmovapd %ymm3, (%rdi)
 548 ; X64-AVX2-NEXT:    vblendpd {{.*#+}} ymm3 = ymm5[0,1],ymm0[2],ymm5[3]
 549 ; X64-AVX2-NEXT:    vpermpd {{.*#+}} ymm4 = ymm4[0,3,2,0]
 550 ; X64-AVX2-NEXT:    vblendpd {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2],ymm4[3]
 551 ; X64-AVX2-NEXT:    vmovapd %ymm3, (%rsi)
 552 ; X64-AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm0
 553 ; X64-AVX2-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3]
 554 ; X64-AVX2-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm2[1],ymm0[3],ymm2[3]
 555 ; X64-AVX2-NEXT:    vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[3],ymm0[3]
 556 ; X64-AVX2-NEXT:    vmovapd %ymm0, (%rdx)
 557 ; X64-AVX2-NEXT:    vzeroupper
 558 ; X64-AVX2-NEXT:    retq
 559 ;
 560 ; X64-AVX512-LABEL: PR48908:
 561 ; X64-AVX512:       # %bb.0:
 562 ; X64-AVX512-NEXT:    # kill: def $ymm2 killed $ymm2 def $zmm2
 563 ; X64-AVX512-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
 564 ; X64-AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
 565 ; X64-AVX512-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm3
 566 ; X64-AVX512-NEXT:    vshufpd {{.*#+}} ymm3 = ymm0[0],ymm3[1],ymm0[2],ymm3[2]
 567 ; X64-AVX512-NEXT:    vperm2f128 {{.*#+}} ymm4 = ymm1[2,3],ymm2[0,1]
 568 ; X64-AVX512-NEXT:    vshufpd {{.*#+}} ymm4 = ymm1[1],ymm4[0],ymm1[2],ymm4[3]
 569 ; X64-AVX512-NEXT:    vmovapd {{.*#+}} ymm5 = [0,3,8,1]
 570 ; X64-AVX512-NEXT:    vpermt2pd %zmm2, %zmm5, %zmm3
 571 ; X64-AVX512-NEXT:    vmovapd %ymm3, (%rdi)
 572 ; X64-AVX512-NEXT:    vmovapd {{.*#+}} ymm3 = [0,3,10,1]
 573 ; X64-AVX512-NEXT:    vpermt2pd %zmm0, %zmm3, %zmm4
 574 ; X64-AVX512-NEXT:    vmovapd %ymm4, (%rsi)
 575 ; X64-AVX512-NEXT:    vmovapd {{.*#+}} ymm3 = <3,11,u,u>
 576 ; X64-AVX512-NEXT:    vpermi2pd %zmm1, %zmm0, %zmm3
 577 ; X64-AVX512-NEXT:    vmovapd {{.*#+}} ymm0 = [2,8,9,3]
 578 ; X64-AVX512-NEXT:    vpermi2pd %zmm3, %zmm2, %zmm0
 579 ; X64-AVX512-NEXT:    vmovapd %ymm0, (%rdx)
 580 ; X64-AVX512-NEXT:    vzeroupper
 581 ; X64-AVX512-NEXT:    retq
 582   %t0 = shufflevector <4 x double> %v0, <4 x double> %v1, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
 583   %t1 = shufflevector <4 x double> %v1, <4 x double> %v2, <4 x i32> <i32 1, i32 2, i32 4, i32 5>
 584   %r0 = shufflevector <4 x double> %t0, <4 x double> %t1, <4 x i32> <i32 0, i32 3, i32 6, i32 1>
 585   store <4 x double> %r0, <4 x double>* %out0, align 32
 586   %r1 = shufflevector <4 x double> %t0, <4 x double> %t1, <4 x i32> <i32 4, i32 7, i32 2, i32 5>
 587   store <4 x double> %r1, <4 x double>* %out1, align 32
 588   %t2 = shufflevector <4 x double> %v0, <4 x double> %v1, <4 x i32> <i32 3, i32 7, i32 undef, i32 undef>
 589   %r2 = shufflevector <4 x double> %t2, <4 x double> %v2, <4 x i32> <i32 6, i32 0, i32 1, i32 7>
 590   store <4 x double> %r2, <4 x double>* %out2, align 32
 591   ret void
 592 }
 593
 ; Concatenates the 2-element input with itself (%cat = x0,x1,x0,x1) and then
 ; interleaves the result with mask <0,2,1,3>, giving x0,x0,x1,x1.
 ; The AVX1 runs expect vinsertf128 followed by vpermilpd; the AVX2 and AVX512
 ; runs expect a single vpermpd reading only the original low half.
 594 define <4 x i64> @concat_self_v4i64(<2 x i64> %x) {
 595 ; AVX1-LABEL: concat_self_v4i64:
 596 ; AVX1:       # %bb.0:
 597 ; AVX1-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
 598 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
 599 ; AVX1-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[0,0,3,3]
 600 ; AVX1-NEXT:    ret{{[l|q]}}
 601 ;
 602 ; AVX2-LABEL: concat_self_v4i64:
 603 ; AVX2:       # %bb.0:
 604 ; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
 605 ; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,1]
 606 ; AVX2-NEXT:    ret{{[l|q]}}
 607 ;
 608 ; AVX512-LABEL: concat_self_v4i64:
 609 ; AVX512:       # %bb.0:
 610 ; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
 611 ; AVX512-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,1]
 612 ; AVX512-NEXT:    ret{{[l|q]}}
 613   %cat = shufflevector <2 x i64> %x, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
 614   %s = shufflevector <4 x i64> %cat, <4 x i64> undef, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
 615   ret <4 x i64> %s
 616 }
 617
 ; Concatenates the 4-element input with itself, shuffles the concatenation
 ; with mask <7,6,5,4,0,2,1,3>, and adds the shuffled value to the unshuffled
 ; concatenation - so %cat has a second use besides the shuffle.
 ; The AVX1 runs expect two 128-bit vpshufd/vpaddd pairs joined by
 ; vinsertf128; the AVX2 and AVX512 runs expect vinserti128, a vpermd with a
 ; constant index vector, and one 256-bit vpaddd.
 618 define <8 x i32> @concat_self_v8i32(<4 x i32> %x) {
 619 ; AVX1-LABEL: concat_self_v8i32:
 620 ; AVX1:       # %bb.0:
 621 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[3,2,1,0]
 622 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[0,2,1,3]
 623 ; AVX1-NEXT:    vpaddd %xmm0, %xmm2, %xmm2
 624 ; AVX1-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
 625 ; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
 626 ; AVX1-NEXT:    ret{{[l|q]}}
 627 ;
 628 ; AVX2-LABEL: concat_self_v8i32:
 629 ; AVX2:       # %bb.0:
 630 ; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
 631 ; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm1
 632 ; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [3,2,1,0,0,2,1,3]
 633 ; AVX2-NEXT:    vpermd %ymm0, %ymm2, %ymm0
 634 ; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
 635 ; AVX2-NEXT:    ret{{[l|q]}}
 636 ;
 637 ; AVX512-LABEL: concat_self_v8i32:
 638 ; AVX512:       # %bb.0:
 639 ; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
 640 ; AVX512-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm1
 641 ; AVX512-NEXT:    vmovdqa {{.*#+}} ymm2 = [3,2,1,0,0,2,1,3]
 642 ; AVX512-NEXT:    vpermd %ymm0, %ymm2, %ymm0
 643 ; AVX512-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
 644 ; AVX512-NEXT:    ret{{[l|q]}}
 645   %cat = shufflevector <4 x i32> %x, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
 646   %s = shufflevector <8 x i32> %cat, <8 x i32> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 0, i32 2, i32 1, i32 3>
 647   %a = add <8 x i32> %s, %cat
 648   ret <8 x i32> %a
 649 }