test/CodeGen/X86/vector-shuffle-128-unpck.ll

   1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
   2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX1OR2 --check-prefix=AVX1
   3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX1OR2 --check-prefix=AVX2OR512VL --check-prefix=AVX2 --check-prefix=AVX2-SLOW
   4 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefix=ALL --check-prefix=AVX1OR2 --check-prefix=AVX2OR512VL --check-prefix=AVX2 --check-prefix=AVX2-FAST
   5 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512dq | FileCheck %s --check-prefix=ALL --check-prefix=AVX2OR512VL --check-prefix=AVX512VL --check-prefix=AVX512VL-SLOW
   6 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512dq,+fast-variable-shuffle | FileCheck %s --check-prefix=ALL --check-prefix=AVX2OR512VL --check-prefix=AVX512VL --check-prefix=AVX512VL-FAST
   7
   8 define <2 x i64> @unpckh_unary_extracted_v4i64(<4 x i64> %x) {
   9 ; AVX1-LABEL: unpckh_unary_extracted_v4i64:
  10 ; AVX1:       # %bb.0:
  11 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
  12 ; AVX1-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
  13 ; AVX1-NEXT:    vzeroupper
  14 ; AVX1-NEXT:    retq
  15 ;
  16 ; AVX2OR512VL-LABEL: unpckh_unary_extracted_v4i64:
  17 ; AVX2OR512VL:       # %bb.0:
  18 ; AVX2OR512VL-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[1,3,2,3]
  19 ; AVX2OR512VL-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
  20 ; AVX2OR512VL-NEXT:    vzeroupper
  21 ; AVX2OR512VL-NEXT:    retq
  22   %extrl = shufflevector <4 x i64> %x, <4 x i64> undef, <2 x i32> <i32 0, i32 1> ; elements 0-1 of %x (low 128 bits)
  23   %extrh = shufflevector <4 x i64> %x, <4 x i64> undef, <2 x i32> <i32 2, i32 3> ; elements 2-3 of %x (high 128 bits)
  24   %r = shufflevector <2 x i64> %extrl, <2 x i64> %extrh, <2 x i32> <i32 1, i32 3> ; r = x[1], x[3]: the upper element of each half
  25   ret <2 x i64> %r
  26 }
  27
  28 define <2 x double> @unpckh_unary_extracted_v8f64(<4 x double> %x) {
  29 ; AVX1-LABEL: unpckh_unary_extracted_v8f64:
  30 ; AVX1:       # %bb.0:
  31 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
  32 ; AVX1-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
  33 ; AVX1-NEXT:    vzeroupper
  34 ; AVX1-NEXT:    retq
  35 ;
  36 ; AVX2OR512VL-LABEL: unpckh_unary_extracted_v8f64:
  37 ; AVX2OR512VL:       # %bb.0:
  38 ; AVX2OR512VL-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[1,3,2,3]
  39 ; AVX2OR512VL-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
  40 ; AVX2OR512VL-NEXT:    vzeroupper
  41 ; AVX2OR512VL-NEXT:    retq
  42   %extrl = shufflevector <4 x double> %x, <4 x double> undef, <2 x i32> <i32 0, i32 1> ; elements 0-1 of %x (low 128 bits)
  43   %extrh = shufflevector <4 x double> %x, <4 x double> undef, <2 x i32> <i32 2, i32 3> ; elements 2-3 of %x (high 128 bits)
  44   %r = shufflevector <2 x double> %extrl, <2 x double> %extrh, <2 x i32> <i32 1, i32 3> ; r = x[1], x[3]: the upper element of each half
  45   ret <2 x double> %r ; NOTE(review): the function name says v8f64, but %x is <4 x double> - confirm whether the name is a typo
  46 }
  47
  48 ; vpermps requires a constant load for the index op. It's unlikely to be profitable.
  49
  50 define <4 x i32> @unpckh_unary_extracted_v8i32(<8 x i32> %x) {
  51 ; ALL-LABEL: unpckh_unary_extracted_v8i32:
  52 ; ALL:       # %bb.0:
  53 ; ALL-NEXT:    vextractf128 $1, %ymm0, %xmm1
  54 ; ALL-NEXT:    vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
  55 ; ALL-NEXT:    vzeroupper
  56 ; ALL-NEXT:    retq
  57   %extrl = shufflevector <8 x i32> %x, <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> ; elements 0-3 of %x (low 128 bits)
  58   %extrh = shufflevector <8 x i32> %x, <8 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> ; elements 4-7 of %x (high 128 bits)
  59   %r = shufflevector <4 x i32> %extrl, <4 x i32> %extrh, <4 x i32> <i32 2, i32 6, i32 3, i32 7> ; r = x[2], x[6], x[3], x[7]: interleave the top two elements of each half
  60   ret <4 x i32> %r
  61 }
  62
  63 define <4 x float> @unpckh_unary_extracted_v8f32(<8 x float> %x) {
  64 ; ALL-LABEL: unpckh_unary_extracted_v8f32:
  65 ; ALL:       # %bb.0:
  66 ; ALL-NEXT:    vextractf128 $1, %ymm0, %xmm1
  67 ; ALL-NEXT:    vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
  68 ; ALL-NEXT:    vzeroupper
  69 ; ALL-NEXT:    retq
  70   %extrl = shufflevector <8 x float> %x, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> ; elements 0-3 of %x (low 128 bits)
  71   %extrh = shufflevector <8 x float> %x, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> ; elements 4-7 of %x (high 128 bits)
  72   %r = shufflevector <4 x float> %extrl, <4 x float> %extrh, <4 x i32> <i32 2, i32 6, i32 3, i32 7> ; r = x[2], x[6], x[3], x[7]: interleave the top two elements of each half
  73   ret <4 x float> %r
  74 }
  75
  76 define <8 x i16> @unpckh_unary_extracted_v16i16(<16 x i16> %x) {
  77 ; AVX1-LABEL: unpckh_unary_extracted_v16i16:
  78 ; AVX1:       # %bb.0:
  79 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
  80 ; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
  81 ; AVX1-NEXT:    vzeroupper
  82 ; AVX1-NEXT:    retq
  83 ;
  84 ; AVX2OR512VL-LABEL: unpckh_unary_extracted_v16i16:
  85 ; AVX2OR512VL:       # %bb.0:
  86 ; AVX2OR512VL-NEXT:    vextracti128 $1, %ymm0, %xmm1
  87 ; AVX2OR512VL-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
  88 ; AVX2OR512VL-NEXT:    vzeroupper
  89 ; AVX2OR512VL-NEXT:    retq
  90   %extrl = shufflevector <16 x i16> %x, <16 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> ; elements 0-7 of %x (low 128 bits)
  91   %extrh = shufflevector <16 x i16> %x, <16 x i16> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> ; elements 8-15 of %x (high 128 bits)
  92   %r = shufflevector <8 x i16> %extrl, <8 x i16> %extrh, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15> ; interleave elements 4-7 of %extrl with elements 4-7 of %extrh
  93   ret <8 x i16> %r
  94 }
  95
  96 define <16 x i8> @unpckh_unary_extracted_v32i8(<32 x i8> %x) {
  97 ; AVX1-LABEL: unpckh_unary_extracted_v32i8:
  98 ; AVX1:       # %bb.0:
  99 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
 100 ; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
 101 ; AVX1-NEXT:    vzeroupper
 102 ; AVX1-NEXT:    retq
 103 ;
 104 ; AVX2OR512VL-LABEL: unpckh_unary_extracted_v32i8:
 105 ; AVX2OR512VL:       # %bb.0:
 106 ; AVX2OR512VL-NEXT:    vextracti128 $1, %ymm0, %xmm1
 107 ; AVX2OR512VL-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
 108 ; AVX2OR512VL-NEXT:    vzeroupper
 109 ; AVX2OR512VL-NEXT:    retq
 110   %extrl = shufflevector <32 x i8> %x, <32 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> ; elements 0-15 of %x (low 128 bits)
 111   %extrh = shufflevector <32 x i8> %x, <32 x i8> undef, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> ; elements 16-31 of %x (high 128 bits)
 112   %r = shufflevector <16 x i8> %extrl, <16 x i8> %extrh, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31> ; interleave elements 8-15 of %extrl with elements 8-15 of %extrh
 113   ret <16 x i8> %r
 114 }
 115
 116 define <2 x i64> @unpckl_unary_extracted_v4i64(<4 x i64> %x) {
 117 ; AVX1-LABEL: unpckl_unary_extracted_v4i64:
 118 ; AVX1:       # %bb.0:
 119 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
 120 ; AVX1-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 121 ; AVX1-NEXT:    vzeroupper
 122 ; AVX1-NEXT:    retq
 123 ;
 124 ; AVX2OR512VL-LABEL: unpckl_unary_extracted_v4i64:
 125 ; AVX2OR512VL:       # %bb.0:
 126 ; AVX2OR512VL-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3]
 127 ; AVX2OR512VL-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
 128 ; AVX2OR512VL-NEXT:    vzeroupper
 129 ; AVX2OR512VL-NEXT:    retq
 130   %extrl = shufflevector <4 x i64> %x, <4 x i64> undef, <2 x i32> <i32 0, i32 1> ; elements 0-1 of %x (low 128 bits)
 131   %extrh = shufflevector <4 x i64> %x, <4 x i64> undef, <2 x i32> <i32 2, i32 3> ; elements 2-3 of %x (high 128 bits)
 132   %r = shufflevector <2 x i64> %extrl, <2 x i64> %extrh, <2 x i32> <i32 0, i32 2> ; r = x[0], x[2]: the lower element of each half
 133   ret <2 x i64> %r
 134 }
 135
 136 define <2 x double> @unpckl_unary_extracted_v8f64(<4 x double> %x) {
 137 ; AVX1-LABEL: unpckl_unary_extracted_v8f64:
 138 ; AVX1:       # %bb.0:
 139 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
 140 ; AVX1-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 141 ; AVX1-NEXT:    vzeroupper
 142 ; AVX1-NEXT:    retq
 143 ;
 144 ; AVX2OR512VL-LABEL: unpckl_unary_extracted_v8f64:
 145 ; AVX2OR512VL:       # %bb.0:
 146 ; AVX2OR512VL-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3]
 147 ; AVX2OR512VL-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
 148 ; AVX2OR512VL-NEXT:    vzeroupper
 149 ; AVX2OR512VL-NEXT:    retq
 150   %extrl = shufflevector <4 x double> %x, <4 x double> undef, <2 x i32> <i32 0, i32 1> ; elements 0-1 of %x (low 128 bits)
 151   %extrh = shufflevector <4 x double> %x, <4 x double> undef, <2 x i32> <i32 2, i32 3> ; elements 2-3 of %x (high 128 bits)
 152   %r = shufflevector <2 x double> %extrl, <2 x double> %extrh, <2 x i32> <i32 0, i32 2> ; r = x[0], x[2]: the lower element of each half
 153   ret <2 x double> %r ; NOTE(review): the function name says v8f64, but %x is <4 x double> - confirm whether the name is a typo
 154 }
 155
 156 ; vpermps requires a constant load for the index op. It's unlikely to be profitable.
 157
 158 define <4 x i32> @unpckl_unary_extracted_v8i32(<8 x i32> %x) {
 159 ; ALL-LABEL: unpckl_unary_extracted_v8i32:
 160 ; ALL:       # %bb.0:
 161 ; ALL-NEXT:    vextractf128 $1, %ymm0, %xmm1
 162 ; ALL-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
 163 ; ALL-NEXT:    vzeroupper
 164 ; ALL-NEXT:    retq
 165   %extrl = shufflevector <8 x i32> %x, <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> ; elements 0-3 of %x (low 128 bits)
 166   %extrh = shufflevector <8 x i32> %x, <8 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> ; elements 4-7 of %x (high 128 bits)
 167   %r = shufflevector <4 x i32> %extrl, <4 x i32> %extrh, <4 x i32> <i32 0, i32 4, i32 1, i32 5> ; r = x[0], x[4], x[1], x[5]: interleave the bottom two elements of each half
 168   ret <4 x i32> %r
 169 }
 170
 171 define <4 x float> @unpckl_unary_extracted_v8f32(<8 x float> %x) {
 172 ; ALL-LABEL: unpckl_unary_extracted_v8f32:
 173 ; ALL:       # %bb.0:
 174 ; ALL-NEXT:    vextractf128 $1, %ymm0, %xmm1
 175 ; ALL-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
 176 ; ALL-NEXT:    vzeroupper
 177 ; ALL-NEXT:    retq
 178   %extrl = shufflevector <8 x float> %x, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> ; elements 0-3 of %x (low 128 bits)
 179   %extrh = shufflevector <8 x float> %x, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> ; elements 4-7 of %x (high 128 bits)
 180   %r = shufflevector <4 x float> %extrl, <4 x float> %extrh, <4 x i32> <i32 0, i32 4, i32 1, i32 5> ; r = x[0], x[4], x[1], x[5]: interleave the bottom two elements of each half
 181   ret <4 x float> %r
 182 }
 183
 184 define <8 x i16> @unpckl_unary_extracted_v16i16(<16 x i16> %x) {
 185 ; AVX1-LABEL: unpckl_unary_extracted_v16i16:
 186 ; AVX1:       # %bb.0:
 187 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
 188 ; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
 189 ; AVX1-NEXT:    vzeroupper
 190 ; AVX1-NEXT:    retq
 191 ;
 192 ; AVX2OR512VL-LABEL: unpckl_unary_extracted_v16i16:
 193 ; AVX2OR512VL:       # %bb.0:
 194 ; AVX2OR512VL-NEXT:    vextracti128 $1, %ymm0, %xmm1
 195 ; AVX2OR512VL-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
 196 ; AVX2OR512VL-NEXT:    vzeroupper
 197 ; AVX2OR512VL-NEXT:    retq
 198   %extrl = shufflevector <16 x i16> %x, <16 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> ; elements 0-7 of %x (low 128 bits)
 199   %extrh = shufflevector <16 x i16> %x, <16 x i16> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> ; elements 8-15 of %x (high 128 bits)
 200   %r = shufflevector <8 x i16> %extrl, <8 x i16> %extrh, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11> ; interleave elements 0-3 of %extrl with elements 0-3 of %extrh
 201   ret <8 x i16> %r
 202 }
 203
 204 define <16 x i8> @unpckl_unary_extracted_v32i8(<32 x i8> %x) {
 205 ; AVX1-LABEL: unpckl_unary_extracted_v32i8:
 206 ; AVX1:       # %bb.0:
 207 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
 208 ; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
 209 ; AVX1-NEXT:    vzeroupper
 210 ; AVX1-NEXT:    retq
 211 ;
 212 ; AVX2OR512VL-LABEL: unpckl_unary_extracted_v32i8:
 213 ; AVX2OR512VL:       # %bb.0:
 214 ; AVX2OR512VL-NEXT:    vextracti128 $1, %ymm0, %xmm1
 215 ; AVX2OR512VL-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
 216 ; AVX2OR512VL-NEXT:    vzeroupper
 217 ; AVX2OR512VL-NEXT:    retq
 218   %extrl = shufflevector <32 x i8> %x, <32 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> ; elements 0-15 of %x (low 128 bits)
 219   %extrh = shufflevector <32 x i8> %x, <32 x i8> undef, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> ; elements 16-31 of %x (high 128 bits)
 220   %r = shufflevector <16 x i8> %extrl, <16 x i8> %extrh, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23> ; interleave elements 0-7 of %extrl with elements 0-7 of %extrh
 221   ret <16 x i8> %r
 222 }
 223
 224 ; This used to cause an infinite loop because the unpack shuffle mask was not recognized in commuted form.
 225
 226 define <8 x i32> @extract_unpckl_v8i32(<8 x i32> %a) {
 227 ; ALL-LABEL: extract_unpckl_v8i32:
 228 ; ALL:       # %bb.0:
 229 ; ALL-NEXT:    vextractf128 $1, %ymm0, %xmm1
 230 ; ALL-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
 231 ; ALL-NEXT:    retq
 232   %shuffle = shufflevector <8 x i32> %a, <8 x i32> undef, <8 x i32> <i32 4, i32 undef, i32 5, i32 1, i32 undef, i32 undef, i32 undef, i32 undef> ; lanes 0-3 = a[4], undef, a[5], a[1] (high-half elements come first); lanes 4-7 are undef
 233   ret <8 x i32> %shuffle
 234 }
 235