test/CodeGen/X86/avx-vperm2x128.ll

   1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
   2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx -disable-peephole | FileCheck %s --check-prefix=ALL --check-prefix=AVX1
   3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 -disable-peephole | FileCheck %s --check-prefix=ALL --check-prefix=AVX2
   4
   5 define <8 x float> @shuffle_v8f32_45670123(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp { ; swap the two 128-bit halves of %a (register operand)
   6 ; AVX1-LABEL: shuffle_v8f32_45670123:
   7 ; AVX1:       # %bb.0: # %entry
   8 ; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
   9 ; AVX1-NEXT:    retq
  10 ;
  11 ; AVX2-LABEL: shuffle_v8f32_45670123:
  12 ; AVX2:       # %bb.0: # %entry
  13 ; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,3,0,1]
  14 ; AVX2-NEXT:    retq
  15 entry:
  16   %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3> ; indices 4-7 then 0-3 all come from %a; %b is never selected
  17   ret <8 x float> %shuffle
  18 }
  19
  20 define <8 x float> @shuffle_v8f32_45670123_mem(<8 x float>* %pa, <8 x float>* %pb) nounwind uwtable readonly ssp { ; half-swap of a vector loaded from %pa; readonly (not readnone) because the body loads through its pointer arguments
  21 ; AVX1-LABEL: shuffle_v8f32_45670123_mem:
  22 ; AVX1:       # %bb.0: # %entry
  23 ; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = mem[2,3,0,1]
  24 ; AVX1-NEXT:    retq
  25 ;
  26 ; AVX2-LABEL: shuffle_v8f32_45670123_mem:
  27 ; AVX2:       # %bb.0: # %entry
  28 ; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = mem[2,3,0,1]
  29 ; AVX2-NEXT:    retq
  30 entry:
  31   %a = load <8 x float>, <8 x float>* %pa
  32   %b = load <8 x float>, <8 x float>* %pb ; loaded but never selected by the mask below
  33   %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3> ; indices 4-7 then 0-3: high half of %a, then low half of %a
  34   ret <8 x float> %shuffle
  35 }
  36
  37 define <8 x float> @shuffle_v8f32_0123cdef(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp { ; low 128 bits of %a combined with high 128 bits of %b; both RUN lines expect a blend
  38 ; ALL-LABEL: shuffle_v8f32_0123cdef:
  39 ; ALL:       # %bb.0: # %entry
  40 ; ALL-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
  41 ; ALL-NEXT:    retq
  42 entry:
  43   %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15> ; indices 0-3 = %a[0..3], 12-15 = %b[4..7]
  44   ret <8 x float> %shuffle
  45 }
  46
  47 define <8 x float> @shuffle_v8f32_01230123(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp { ; duplicate the low 128-bit half of %a into both halves
  48 ; AVX1-LABEL: shuffle_v8f32_01230123:
  49 ; AVX1:       # %bb.0: # %entry
  50 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
  51 ; AVX1-NEXT:    retq
  52 ;
  53 ; AVX2-LABEL: shuffle_v8f32_01230123:
  54 ; AVX2:       # %bb.0: # %entry
  55 ; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,1]
  56 ; AVX2-NEXT:    retq
  57 entry:
  58   %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3> ; indices 0-3 twice; %b is never selected
  59   ret <8 x float> %shuffle
  60 }
  61
  62 define <8 x float> @shuffle_v8f32_01230123_mem(<8 x float>* %pa, <8 x float>* %pb) nounwind uwtable readonly ssp { ; low-half duplication of a vector loaded from %pa; readonly (not readnone) because the body loads through its pointer arguments
  63 ; AVX1-LABEL: shuffle_v8f32_01230123_mem:
  64 ; AVX1:       # %bb.0: # %entry
  65 ; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = mem[0,1,0,1]
  66 ; AVX1-NEXT:    retq
  67 ;
  68 ; AVX2-LABEL: shuffle_v8f32_01230123_mem:
  69 ; AVX2:       # %bb.0: # %entry
  70 ; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = mem[0,1,0,1]
  71 ; AVX2-NEXT:    retq
  72 entry:
  73   %a = load <8 x float>, <8 x float>* %pa
  74   %b = load <8 x float>, <8 x float>* %pb ; loaded but never selected by the mask below
  75   %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3> ; indices 0-3 twice: low half of %a in both halves
  76   ret <8 x float> %shuffle
  77 }
  78
  79 define <8 x float> @shuffle_v8f32_45674567(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp { ; duplicate the high 128-bit half of %a into both halves
  80 ; AVX1-LABEL: shuffle_v8f32_45674567:
  81 ; AVX1:       # %bb.0: # %entry
  82 ; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
  83 ; AVX1-NEXT:    retq
  84 ;
  85 ; AVX2-LABEL: shuffle_v8f32_45674567:
  86 ; AVX2:       # %bb.0: # %entry
  87 ; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,3,2,3]
  88 ; AVX2-NEXT:    retq
  89 entry:
  90   %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7> ; indices 4-7 twice; %b is never selected
  91   ret <8 x float> %shuffle
  92 }
  93
  94 define <8 x float> @shuffle_v8f32_45674567_mem(<8 x float>* %pa, <8 x float>* %pb) nounwind uwtable readonly ssp { ; high-half duplication of a vector loaded from %pa; readonly (not readnone) because the body loads through its pointer arguments
  95 ; AVX1-LABEL: shuffle_v8f32_45674567_mem:
  96 ; AVX1:       # %bb.0: # %entry
  97 ; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = mem[2,3,2,3]
  98 ; AVX1-NEXT:    retq
  99 ;
 100 ; AVX2-LABEL: shuffle_v8f32_45674567_mem:
 101 ; AVX2:       # %bb.0: # %entry
 102 ; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = mem[2,3,2,3]
 103 ; AVX2-NEXT:    retq
 104 entry:
 105   %a = load <8 x float>, <8 x float>* %pa
 106   %b = load <8 x float>, <8 x float>* %pb ; loaded but never selected by the mask below
 107   %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7> ; indices 4-7 twice: high half of %a in both halves
 108   ret <8 x float> %shuffle
 109 }
 110
 111 define <32 x i8> @shuffle_v32i8_2323(<32 x i8> %a, <32 x i8> %b) nounwind uwtable readnone ssp { ; byte-vector form of the high-half duplication: bytes 16-31 of %a in both halves
 112 ; AVX1-LABEL: shuffle_v32i8_2323:
 113 ; AVX1:       # %bb.0: # %entry
 114 ; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
 115 ; AVX1-NEXT:    retq
 116 ;
 117 ; AVX2-LABEL: shuffle_v32i8_2323:
 118 ; AVX2:       # %bb.0: # %entry
 119 ; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,3,2,3]
 120 ; AVX2-NEXT:    retq
 121 entry:
 122   %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
 123   ret <32 x i8> %shuffle
 124 }
 125
 126 define <32 x i8> @shuffle_v32i8_2323_domain(<32 x i8> %a, <32 x i8> %b) nounwind uwtable readnone ssp { ; same high-half duplication as shuffle_v32i8_2323, but applied to the result of an integer add
 127 ; AVX1-LABEL: shuffle_v32i8_2323_domain:
 128 ; AVX1:       # %bb.0: # %entry
 129 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
 130 ; AVX1-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
 131 ; AVX1-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
 132 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
 133 ; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
 134 ; AVX1-NEXT:    retq
 135 ;
 136 ; AVX2-LABEL: shuffle_v32i8_2323_domain:
 137 ; AVX2:       # %bb.0: # %entry
 138 ; AVX2-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
 139 ; AVX2-NEXT:    vpsubb %ymm1, %ymm0, %ymm0
 140 ; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3]
 141 ; AVX2-NEXT:    retq
 142 entry:
 143   ; The integer add forces the integer execution domain: the AVX2 run expects vpermq here, where the variant without the add expects vpermpd.
 144   %a2 = add <32 x i8> %a, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
 145   %shuffle = shufflevector <32 x i8> %a2, <32 x i8> %b, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
 146   ret <32 x i8> %shuffle
 147 }
 148
 149 define <4 x i64> @shuffle_v4i64_6701(<4 x i64> %a, <4 x i64> %b) nounwind uwtable readnone ssp { ; high 128 bits of %b followed by low 128 bits of %a
 150 ; ALL-LABEL: shuffle_v4i64_6701:
 151 ; ALL:       # %bb.0: # %entry
 152 ; ALL-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[0,1]
 153 ; ALL-NEXT:    retq
 154 entry:
 155   %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 6, i32 7, i32 0, i32 1> ; indices 6,7 = %b[2..3]; 0,1 = %a[0..1]
 156   ret <4 x i64> %shuffle
 157 }
 158
 159 define <4 x i64> @shuffle_v4i64_6701_domain(<4 x i64> %a, <4 x i64> %b) nounwind uwtable readnone ssp { ; same lane selection as shuffle_v4i64_6701, with %a first incremented by an integer add
 160 ; AVX1-LABEL: shuffle_v4i64_6701_domain:
 161 ; AVX1:       # %bb.0: # %entry
 162 ; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
 163 ; AVX1-NEXT:    vpsubq %xmm2, %xmm0, %xmm0
 164 ; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[0,1]
 165 ; AVX1-NEXT:    retq
 166 ;
 167 ; AVX2-LABEL: shuffle_v4i64_6701_domain:
 168 ; AVX2:       # %bb.0: # %entry
 169 ; AVX2-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
 170 ; AVX2-NEXT:    vpsubq %ymm2, %ymm0, %ymm0
 171 ; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[0,1]
 172 ; AVX2-NEXT:    retq
 173 entry:
 174   ; The integer add forces the integer execution domain: the AVX2 run expects vperm2i128 rather than vperm2f128.
 175   %a2 = add <4 x i64> %a, <i64 1, i64 1, i64 1, i64 1>
 176   %shuffle = shufflevector <4 x i64> %a2, <4 x i64> %b, <4 x i32> <i32 6, i32 7, i32 0, i32 1> ; indices 6,7 = %b[2..3]; 0,1 = %a2[0..1]
 177   ret <4 x i64> %shuffle
 178 }
 179
 180 define <8 x i32> @shuffle_v8i32_u5u7cdef(<8 x i32> %a, <8 x i32> %b) nounwind uwtable readnone ssp { ; partially-undef low half taken from the high half of %a + 1, high half taken from the high half of %b
 181 ; AVX1-LABEL: shuffle_v8i32_u5u7cdef:
 182 ; AVX1:       # %bb.0: # %entry
 183 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
 184 ; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
 185 ; AVX1-NEXT:    vpsubd %xmm2, %xmm0, %xmm0
 186 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
 187 ; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
 188 ; AVX1-NEXT:    retq
 189 ;
 190 ; AVX2-LABEL: shuffle_v8i32_u5u7cdef:
 191 ; AVX2:       # %bb.0: # %entry
 192 ; AVX2-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
 193 ; AVX2-NEXT:    vpsubd %ymm2, %ymm0, %ymm0
 194 ; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
 195 ; AVX2-NEXT:    retq
 196 entry:
 197   ; The integer add forces the integer execution domain: the AVX2 run expects vperm2i128 rather than vperm2f128.
 198   %a2 = add <8 x i32> %a, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
 199   %shuffle = shufflevector <8 x i32> %a2, <8 x i32> %b, <8 x i32> <i32 undef, i32 5, i32 undef, i32 7, i32 12, i32 13, i32 14, i32 15> ; result lanes 0 and 2 are undef; 5,7 = %a2[5],%a2[7]; 12-15 = %b[4..7]
 200   ret <8 x i32> %shuffle
 201 }
 202
 203 define <16 x i16> @shuffle_v16i16_4501(<16 x i16> %a, <16 x i16> %b) nounwind uwtable readnone ssp { ; low 128 bits of %b followed by low 128 bits of %a + 1
 204 ; AVX1-LABEL: shuffle_v16i16_4501:
 205 ; AVX1:       # %bb.0: # %entry
 206 ; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
 207 ; AVX1-NEXT:    vpsubw %xmm2, %xmm0, %xmm0
 208 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
 209 ; AVX1-NEXT:    retq
 210 ;
 211 ; AVX2-LABEL: shuffle_v16i16_4501:
 212 ; AVX2:       # %bb.0: # %entry
 213 ; AVX2-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
 214 ; AVX2-NEXT:    vpsubw %xmm2, %xmm0, %xmm0
 215 ; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
 216 ; AVX2-NEXT:    retq
 217 entry:
 218   ; The integer add forces the integer execution domain: the AVX2 run expects vinserti128 rather than vinsertf128.
 219   %a2 = add <16 x i16> %a, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
 220   %shuffle = shufflevector <16 x i16> %a2, <16 x i16> %b, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> ; indices 16-23 = %b[0..7]; 0-7 = %a2[0..7]
 221   ret <16 x i16> %shuffle
 222 }
 223
 224 define <16 x i16> @shuffle_v16i16_4501_mem(<16 x i16>* %a, <16 x i16>* %b) nounwind uwtable readonly ssp { ; memory-operand form of shuffle_v16i16_4501; readonly (not readnone) because the body loads through its pointer arguments
 225 ; AVX1-LABEL: shuffle_v16i16_4501_mem:
 226 ; AVX1:       # %bb.0: # %entry
 227 ; AVX1-NEXT:    vmovdqa (%rdi), %xmm0
 228 ; AVX1-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
 229 ; AVX1-NEXT:    vpsubw %xmm1, %xmm0, %xmm0
 230 ; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = mem[0,1],ymm0[0,1]
 231 ; AVX1-NEXT:    retq
 232 ;
 233 ; AVX2-LABEL: shuffle_v16i16_4501_mem:
 234 ; AVX2:       # %bb.0: # %entry
 235 ; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
 236 ; AVX2-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
 237 ; AVX2-NEXT:    vpsubw %ymm1, %ymm0, %ymm0
 238 ; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm0 = mem[0,1],ymm0[0,1]
 239 ; AVX2-NEXT:    retq
 240 entry:
 241   %c = load <16 x i16>, <16 x i16>* %a
 242   %d = load <16 x i16>, <16 x i16>* %b
 243   %c2 = add <16 x i16> %c, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1> ; add forces execution domain
 244   %shuffle = shufflevector <16 x i16> %c2, <16 x i16> %d, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> ; indices 16-23 = %d[0..7]; 0-7 = %c2[0..7]
 245   ret <16 x i16> %shuffle
 246 }
 247
 248 ;;;; Cases with undef indices mixed in the mask
 249
 250 define <8 x float> @shuffle_v8f32_uu67u9ub(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp { ; defined lanes fit "high half of %a, low half of %b"; the other lanes are undef
 251 ; ALL-LABEL: shuffle_v8f32_uu67u9ub:
 252 ; ALL:       # %bb.0: # %entry
 253 ; ALL-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
 254 ; ALL-NEXT:    retq
 255 entry:
 256   %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 undef, i32 undef, i32 6, i32 7, i32 undef, i32 9, i32 undef, i32 11> ; 6,7 = %a[6..7]; 9,11 = %b[1],%b[3]
 257   ret <8 x float> %shuffle
 258 }
 259
 260 define <8 x float> @shuffle_v8f32_uu67uu67(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp { ; only %a[6..7] is demanded, in the upper 64 bits of each 128-bit half
 261 ; AVX1-LABEL: shuffle_v8f32_uu67uu67:
 262 ; AVX1:       # %bb.0: # %entry
 263 ; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
 264 ; AVX1-NEXT:    retq
 265 ;
 266 ; AVX2-LABEL: shuffle_v8f32_uu67uu67:
 267 ; AVX2:       # %bb.0: # %entry
 268 ; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3]
 269 ; AVX2-NEXT:    retq
 270 entry:
 271   %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 undef, i32 undef, i32 6, i32 7, i32 undef, i32 undef, i32 6, i32 7> ; %b is never selected
 272   ret <8 x float> %shuffle
 273 }
 274
 275 define <8 x float> @shuffle_v8f32_uu67uuab(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp { ; defined lanes fit "high half of %a, low half of %b"
 276 ; ALL-LABEL: shuffle_v8f32_uu67uuab:
 277 ; ALL:       # %bb.0: # %entry
 278 ; ALL-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
 279 ; ALL-NEXT:    retq
 280 entry:
 281   %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 undef, i32 undef, i32 6, i32 7, i32 undef, i32 undef, i32 10, i32 11> ; 6,7 = %a[6..7]; 10,11 = %b[2..3]
 282   ret <8 x float> %shuffle
 283 }
 284
 285 define <8 x float> @shuffle_v8f32_uu67uuef(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp { ; defined lanes fit "high half of %a, high half of %b"
 286 ; ALL-LABEL: shuffle_v8f32_uu67uuef:
 287 ; ALL:       # %bb.0: # %entry
 288 ; ALL-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
 289 ; ALL-NEXT:    retq
 290 entry:
 291   %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 undef, i32 undef, i32 6, i32 7, i32 undef, i32 undef, i32 14, i32 15> ; 6,7 = %a[6..7]; 14,15 = %b[6..7]
 292   ret <8 x float> %shuffle
 293 }
 294
 295 define <8 x float> @shuffle_v8f32_uu674567(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp { ; high half of %a in both halves, with the first two result lanes undef
 296 ; AVX1-LABEL: shuffle_v8f32_uu674567:
 297 ; AVX1:       # %bb.0: # %entry
 298 ; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
 299 ; AVX1-NEXT:    retq
 300 ;
 301 ; AVX2-LABEL: shuffle_v8f32_uu674567:
 302 ; AVX2:       # %bb.0: # %entry
 303 ; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3]
 304 ; AVX2-NEXT:    retq
 305 entry:
 306   %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 undef, i32 undef, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7> ; %b is never selected
 307   ret <8 x float> %shuffle
 308 }
 309
 310 define <8 x float> @shuffle_v8f32_uu6789ab(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp { ; high half of %a (first two lanes undef) followed by the full low half of %b
 311 ; ALL-LABEL: shuffle_v8f32_uu6789ab:
 312 ; ALL:       # %bb.0: # %entry
 313 ; ALL-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
 314 ; ALL-NEXT:    retq
 315 entry:
 316   %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 undef, i32 undef, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11> ; 6,7 = %a[6..7]; 8-11 = %b[0..3]
 317   ret <8 x float> %shuffle
 318 }
 319
 320 define <8 x float> @shuffle_v8f32_4567uu67(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp { ; high half of %a in both halves, with result lanes 4 and 5 undef
 321 ; AVX1-LABEL: shuffle_v8f32_4567uu67:
 322 ; AVX1:       # %bb.0: # %entry
 323 ; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
 324 ; AVX1-NEXT:    retq
 325 ;
 326 ; AVX2-LABEL: shuffle_v8f32_4567uu67:
 327 ; AVX2:       # %bb.0: # %entry
 328 ; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,3,2,3]
 329 ; AVX2-NEXT:    retq
 330 entry:
 331   %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 6, i32 7> ; %b is never selected
 332   ret <8 x float> %shuffle
 333 }
 334
 335 define <8 x float> @shuffle_v8f32_4567uuef(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp { ; full high half of %a followed by the high half of %b (result lanes 4 and 5 undef)
 336 ; ALL-LABEL: shuffle_v8f32_4567uuef:
 337 ; ALL:       # %bb.0: # %entry
 338 ; ALL-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
 339 ; ALL-NEXT:    retq
 340 entry:
 341   %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 14, i32 15> ; 4-7 = %a[4..7]; 14,15 = %b[6..7]
 342   ret <8 x float> %shuffle
 343 }
 344
 345 ;;;; Cases that must not be lowered to a single vperm2f128 (an extra in-lane permute is required)
 346
 347 define <8 x float> @shuffle_v8f32_uu67ucuf(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp { ; %b[4] is demanded in result lane 5, so a 128-bit half select alone is not enough; the expected output adds a vpermilps
 348 ; ALL-LABEL: shuffle_v8f32_uu67ucuf:
 349 ; ALL:       # %bb.0: # %entry
 350 ; ALL-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
 351 ; ALL-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,0,2,3,4,4,6,7]
 352 ; ALL-NEXT:    retq
 353 entry:
 354   %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 undef, i32 undef, i32 6, i32 7, i32 undef, i32 12, i32 undef, i32 15> ; 6,7 = %a[6..7]; 12 = %b[4] in lane 5; 15 = %b[7] in lane 7
 355   ret <8 x float> %shuffle
 356 }
 357
 358 ;; Test zero mask generation.
 359 ;; PR22984: https://llvm.org/bugs/show_bug.cgi?id=22984
 360 ;; Prefer xor+blend (vblendps/vblendpd) over vperm2f128 because that has better performance.
 361 ;; TODO: When building for optsize we should use vperm2f128.
 362
 363 define <4 x double> @shuffle_v4f64_zz01(<4 x double> %a) { ; result = <0.0, 0.0, %a[0], %a[1]>
 364 ; ALL-LABEL: shuffle_v4f64_zz01:
 365 ; ALL:       # %bb.0:
 366 ; ALL-NEXT:    vperm2f128 {{.*#+}} ymm0 = zero,zero,ymm0[0,1]
 367 ; ALL-NEXT:    retq
 368   %s = shufflevector <4 x double> %a, <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x i32> <i32 4, i32 5, i32 0, i32 1> ; indices 4,5 pick the two zero elements of the constant operand
 369   ret <4 x double> %s
 370 }
 371 define <4 x double> @shuffle_v4f64_zz01_optsize(<4 x double> %a) optsize { ; same shuffle as shuffle_v4f64_zz01, compiled with the optsize attribute
 372 ; ALL-LABEL: shuffle_v4f64_zz01_optsize:
 373 ; ALL:       # %bb.0:
 374 ; ALL-NEXT:    vperm2f128 {{.*#+}} ymm0 = zero,zero,ymm0[0,1]
 375 ; ALL-NEXT:    retq
 376   %s = shufflevector <4 x double> %a, <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x i32> <i32 4, i32 5, i32 0, i32 1> ; result = <0.0, 0.0, %a[0], %a[1]>
 377   ret <4 x double> %s
 378 }
 379
 380 define <4 x double> @shuffle_v4f64_zz23(<4 x double> %a) { ; result = <0.0, 0.0, %a[2], %a[3]>; the high half stays in place, so a zeroing xor plus a blend is expected
 381 ; ALL-LABEL: shuffle_v4f64_zz23:
 382 ; ALL:       # %bb.0:
 383 ; ALL-NEXT:    vxorps %xmm1, %xmm1, %xmm1
 384 ; ALL-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
 385 ; ALL-NEXT:    retq
 386   %s = shufflevector <4 x double> %a, <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x i32> <i32 4, i32 5, i32 2, i32 3> ; indices 4,5 pick the two zero elements of the constant operand
 387   ret <4 x double> %s
 388 }
 389 define <4 x double> @shuffle_v4f64_zz23_optsize(<4 x double> %a) optsize { ; same shuffle as shuffle_v4f64_zz23, compiled with the optsize attribute
 390 ; ALL-LABEL: shuffle_v4f64_zz23_optsize:
 391 ; ALL:       # %bb.0:
 392 ; ALL-NEXT:    vxorps %xmm1, %xmm1, %xmm1
 393 ; ALL-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
 394 ; ALL-NEXT:    retq
 395   %s = shufflevector <4 x double> %a, <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x i32> <i32 4, i32 5, i32 2, i32 3> ; result = <0.0, 0.0, %a[2], %a[3]>
 396   ret <4 x double> %s
 397 }
 398
 399 define <4 x double> @shuffle_v4f64_zz45(<4 x double> %a) { ; result = <0.0, 0.0, %a[0], %a[1]>, written with the zero constant as the first shuffle operand
 400 ; ALL-LABEL: shuffle_v4f64_zz45:
 401 ; ALL:       # %bb.0:
 402 ; ALL-NEXT:    vperm2f128 {{.*#+}} ymm0 = zero,zero,ymm0[0,1]
 403 ; ALL-NEXT:    retq
 404   %s = shufflevector <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x double> %a, <4 x i32> <i32 0, i32 1, i32 4, i32 5> ; indices 0,1 pick the zeros; 4,5 = %a[0..1]
 405   ret <4 x double> %s
 406 }
 407 define <4 x double> @shuffle_v4f64_zz45_optsize(<4 x double> %a) optsize { ; same shuffle as shuffle_v4f64_zz45, compiled with the optsize attribute
 408 ; ALL-LABEL: shuffle_v4f64_zz45_optsize:
 409 ; ALL:       # %bb.0:
 410 ; ALL-NEXT:    vperm2f128 {{.*#+}} ymm0 = zero,zero,ymm0[0,1]
 411 ; ALL-NEXT:    retq
 412   %s = shufflevector <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x double> %a, <4 x i32> <i32 0, i32 1, i32 4, i32 5> ; result = <0.0, 0.0, %a[0], %a[1]>
 413   ret <4 x double> %s
 414 }
 415
 416 define <4 x double> @shuffle_v4f64_zz67(<4 x double> %a) { ; result = <0.0, 0.0, %a[2], %a[3]>, written with the zero constant as the first shuffle operand
 417 ; ALL-LABEL: shuffle_v4f64_zz67:
 418 ; ALL:       # %bb.0:
 419 ; ALL-NEXT:    vxorps %xmm1, %xmm1, %xmm1
 420 ; ALL-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
 421 ; ALL-NEXT:    retq
 422   %s = shufflevector <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x double> %a, <4 x i32> <i32 0, i32 1, i32 6, i32 7> ; indices 0,1 pick the zeros; 6,7 = %a[2..3]
 423   ret <4 x double> %s
 424 }
 425 define <4 x double> @shuffle_v4f64_zz67_optsize(<4 x double> %a) optsize { ; same shuffle as shuffle_v4f64_zz67, compiled with the optsize attribute
 426 ; ALL-LABEL: shuffle_v4f64_zz67_optsize:
 427 ; ALL:       # %bb.0:
 428 ; ALL-NEXT:    vxorps %xmm1, %xmm1, %xmm1
 429 ; ALL-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
 430 ; ALL-NEXT:    retq
 431   %s = shufflevector <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x double> %a, <4 x i32> <i32 0, i32 1, i32 6, i32 7> ; result = <0.0, 0.0, %a[2], %a[3]>
 432   ret <4 x double> %s
 433 }
 434
 435 define <4 x double> @shuffle_v4f64_01zz(<4 x double> %a) { ; result = <%a[0], %a[1], 0.0, 0.0>; the expected code is a single xmm-to-xmm vmovaps
 436 ; ALL-LABEL: shuffle_v4f64_01zz:
 437 ; ALL:       # %bb.0:
 438 ; ALL-NEXT:    vmovaps %xmm0, %xmm0
 439 ; ALL-NEXT:    retq
 440   %s = shufflevector <4 x double> %a, <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x i32> <i32 0, i32 1, i32 4, i32 5> ; indices 4,5 pick the two zero elements of the constant operand
 441   ret <4 x double> %s
 442 }
 443 define <4 x double> @shuffle_v4f64_01zz_optsize(<4 x double> %a) optsize { ; same shuffle as shuffle_v4f64_01zz, compiled with the optsize attribute
 444 ; ALL-LABEL: shuffle_v4f64_01zz_optsize:
 445 ; ALL:       # %bb.0:
 446 ; ALL-NEXT:    vmovaps %xmm0, %xmm0
 447 ; ALL-NEXT:    retq
 448   %s = shufflevector <4 x double> %a, <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x i32> <i32 0, i32 1, i32 4, i32 5> ; result = <%a[0], %a[1], 0.0, 0.0>
 449   ret <4 x double> %s
 450 }
 451
 452 define <4 x double> @shuffle_v4f64_23zz(<4 x double> %a) { ; result = <%a[2], %a[3], 0.0, 0.0>
 453 ; ALL-LABEL: shuffle_v4f64_23zz:
 454 ; ALL:       # %bb.0:
 455 ; ALL-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],zero,zero
 456 ; ALL-NEXT:    retq
 457   %s = shufflevector <4 x double> %a, <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x i32> <i32 2, i32 3, i32 4, i32 5> ; indices 4,5 pick the two zero elements of the constant operand
 458   ret <4 x double> %s
 459 }
 460 define <4 x double> @shuffle_v4f64_23zz_optsize(<4 x double> %a) optsize { ; same shuffle as shuffle_v4f64_23zz, compiled with the optsize attribute
 461 ; ALL-LABEL: shuffle_v4f64_23zz_optsize:
 462 ; ALL:       # %bb.0:
 463 ; ALL-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],zero,zero
 464 ; ALL-NEXT:    retq
 465   %s = shufflevector <4 x double> %a, <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x i32> <i32 2, i32 3, i32 4, i32 5> ; result = <%a[2], %a[3], 0.0, 0.0>
 466   ret <4 x double> %s
 467 }
 468
 469 define <4 x double> @shuffle_v4f64_45zz(<4 x double> %a) { ; result = <%a[0], %a[1], 0.0, 0.0>, written with the zero constant as the first shuffle operand
 470 ; ALL-LABEL: shuffle_v4f64_45zz:
 471 ; ALL:       # %bb.0:
 472 ; ALL-NEXT:    vmovaps %xmm0, %xmm0
 473 ; ALL-NEXT:    retq
 474   %s = shufflevector <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x double> %a, <4 x i32> <i32 4, i32 5, i32 0, i32 1> ; indices 4,5 = %a[0..1]; 0,1 pick the zeros
 475   ret <4 x double> %s
 476 }
 477 define <4 x double> @shuffle_v4f64_45zz_optsize(<4 x double> %a) optsize { ; same shuffle as shuffle_v4f64_45zz, compiled with the optsize attribute
 478 ; ALL-LABEL: shuffle_v4f64_45zz_optsize:
 479 ; ALL:       # %bb.0:
 480 ; ALL-NEXT:    vmovaps %xmm0, %xmm0
 481 ; ALL-NEXT:    retq
 482   %s = shufflevector <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x double> %a, <4 x i32> <i32 4, i32 5, i32 0, i32 1> ; result = <%a[0], %a[1], 0.0, 0.0>
 483   ret <4 x double> %s
 484 }
 485
 486 define <4 x double> @shuffle_v4f64_67zz(<4 x double> %a) { ; result = <%a[2], %a[3], 0.0, 0.0>, written with the zero constant as the first shuffle operand
 487 ; ALL-LABEL: shuffle_v4f64_67zz:
 488 ; ALL:       # %bb.0:
 489 ; ALL-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],zero,zero
 490 ; ALL-NEXT:    retq
 491   %s = shufflevector <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x double> %a, <4 x i32> <i32 6, i32 7, i32 0, i32 1> ; indices 6,7 = %a[2..3]; 0,1 pick the zeros
 492   ret <4 x double> %s
 493 }
 494 define <4 x double> @shuffle_v4f64_67zz_optsize(<4 x double> %a) optsize { ; same shuffle as shuffle_v4f64_67zz, compiled with the optsize attribute
 495 ; ALL-LABEL: shuffle_v4f64_67zz_optsize:
 496 ; ALL:       # %bb.0:
 497 ; ALL-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],zero,zero
 498 ; ALL-NEXT:    retq
 499   %s = shufflevector <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x double> %a, <4 x i32> <i32 6, i32 7, i32 0, i32 1> ; result = <%a[2], %a[3], 0.0, 0.0>
 500   ret <4 x double> %s
 501 }
 502
 503 ;; With AVX2 select the integer version of the instruction. Use an add to force the domain selection.
 504
 505 define <4 x i64> @shuffle_v4i64_67zz(<4 x i64> %a, <4 x i64> %b) { ; integer form of the 67zz shuffle; its result feeds an integer add with %b
 506 ; AVX1-LABEL: shuffle_v4i64_67zz:
 507 ; AVX1:       # %bb.0:
 508 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
 509 ; AVX1-NEXT:    vpaddq %xmm0, %xmm1, %xmm0
 510 ; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
 511 ; AVX1-NEXT:    retq
 512 ;
 513 ; AVX2-LABEL: shuffle_v4i64_67zz:
 514 ; AVX2:       # %bb.0:
 515 ; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],zero,zero
 516 ; AVX2-NEXT:    vpaddq %ymm0, %ymm1, %ymm0
 517 ; AVX2-NEXT:    retq
 518   %s = shufflevector <4 x i64> <i64 0, i64 0, i64 undef, i64 undef>, <4 x i64> %a, <4 x i32> <i32 6, i32 7, i32 0, i32 1> ; %s = <%a[2], %a[3], 0, 0>
 519   %c = add <4 x i64> %b, %s ; the add is what forces the integer domain (the AVX2 run expects vperm2i128)
 520   ret <4 x i64> %c
 521 }
 522
 523 ;;; Memory folding cases
 524
 525 define <4 x double> @ld0_hi0_lo1_4f64(<4 x double> * %pa, <4 x double> %b) nounwind uwtable readonly ssp { ; first shuffle operand comes from memory; readonly (not readnone) because the body loads through %pa
 526 ; AVX1-LABEL: ld0_hi0_lo1_4f64:
 527 ; AVX1:       # %bb.0: # %entry
 528 ; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm0[0,1]
 529 ; AVX1-NEXT:    vaddpd {{.*}}(%rip), %ymm0, %ymm0
 530 ; AVX1-NEXT:    retq
 531 ;
 532 ; AVX2-LABEL: ld0_hi0_lo1_4f64:
 533 ; AVX2:       # %bb.0: # %entry
 534 ; AVX2-NEXT:    vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm0[0,1]
 535 ; AVX2-NEXT:    vbroadcastsd {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 536 ; AVX2-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
 537 ; AVX2-NEXT:    retq
 538 entry:
 539   %a = load <4 x double>, <4 x double> * %pa
 540   %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 2, i32 3, i32 4, i32 5> ; high half of the loaded %a, then low half of %b
 541   %res = fadd <4 x double> %shuffle, <double 1.0, double 1.0, double 1.0, double 1.0> ; floating-point user of the shuffle result
 542   ret <4 x double> %res
 543 }
 544
 545 define <4 x double> @ld1_hi0_hi1_4f64(<4 x double> %a, <4 x double> * %pb) nounwind uwtable readonly ssp { ; second shuffle operand comes from memory; readonly (not readnone) because the body loads through %pb
 546 ; AVX1-LABEL: ld1_hi0_hi1_4f64:
 547 ; AVX1:       # %bb.0: # %entry
 548 ; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
 549 ; AVX1-NEXT:    vaddpd {{.*}}(%rip), %ymm0, %ymm0
 550 ; AVX1-NEXT:    retq
 551 ;
 552 ; AVX2-LABEL: ld1_hi0_hi1_4f64:
 553 ; AVX2:       # %bb.0: # %entry
 554 ; AVX2-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
 555 ; AVX2-NEXT:    vbroadcastsd {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 556 ; AVX2-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
 557 ; AVX2-NEXT:    retq
 558 entry:
 559   %b = load <4 x double>, <4 x double> * %pb
 560   %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 2, i32 3, i32 6, i32 7> ; high half of %a, then high half of the loaded %b
 561   %res = fadd <4 x double> %shuffle, <double 1.0, double 1.0, double 1.0, double 1.0> ; floating-point user of the shuffle result
 562   ret <4 x double> %res
 563 }
 564
 565 define <8 x float> @ld0_hi0_lo1_8f32(<8 x float> * %pa, <8 x float> %b) nounwind uwtable readonly ssp { ; first shuffle operand comes from memory; readonly (not readnone) because the body loads through %pa
 566 ; AVX1-LABEL: ld0_hi0_lo1_8f32:
 567 ; AVX1:       # %bb.0: # %entry
 568 ; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm0[0,1]
 569 ; AVX1-NEXT:    vaddps {{.*}}(%rip), %ymm0, %ymm0
 570 ; AVX1-NEXT:    retq
 571 ;
 572 ; AVX2-LABEL: ld0_hi0_lo1_8f32:
 573 ; AVX2:       # %bb.0: # %entry
 574 ; AVX2-NEXT:    vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm0[0,1]
 575 ; AVX2-NEXT:    vbroadcastss {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 576 ; AVX2-NEXT:    vaddps %ymm1, %ymm0, %ymm0
 577 ; AVX2-NEXT:    retq
 578 entry:
 579   %a = load <8 x float>, <8 x float> * %pa
 580   %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11> ; high half of the loaded %a, then low half of %b
 581   %res = fadd <8 x float> %shuffle, <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0> ; floating-point user of the shuffle result
 582   ret <8 x float> %res
 583 }
 584
 585 define <8 x float> @ld1_hi0_hi1_8f32(<8 x float> %a, <8 x float> * %pb) nounwind uwtable readonly ssp { ; second shuffle operand comes from memory; readonly (not readnone) because the body loads through %pb
 586 ; AVX1-LABEL: ld1_hi0_hi1_8f32:
 587 ; AVX1:       # %bb.0: # %entry
 588 ; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
 589 ; AVX1-NEXT:    vaddps {{.*}}(%rip), %ymm0, %ymm0
 590 ; AVX1-NEXT:    retq
 591 ;
 592 ; AVX2-LABEL: ld1_hi0_hi1_8f32:
 593 ; AVX2:       # %bb.0: # %entry
 594 ; AVX2-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
 595 ; AVX2-NEXT:    vbroadcastss {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 596 ; AVX2-NEXT:    vaddps %ymm1, %ymm0, %ymm0
 597 ; AVX2-NEXT:    retq
 598 entry:
 599   %b = load <8 x float>, <8 x float> * %pb
 600   %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15> ; high half of %a, then high half of the loaded %b
 601   %res = fadd <8 x float> %shuffle, <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0> ; floating-point user of the shuffle result
 602   ret <8 x float> %res
 603 }
 604
 605 define <4 x i64> @ld0_hi0_lo1_4i64(<4 x i64> * %pa, <4 x i64> %b) nounwind uwtable readonly ssp { ; first shuffle operand comes from memory; readonly (not readnone) because the body loads through %pa
 606 ; AVX1-LABEL: ld0_hi0_lo1_4i64:
 607 ; AVX1:       # %bb.0: # %entry
 608 ; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm0[0,1]
 609 ; AVX1-NEXT:    vpaddq {{.*}}(%rip), %xmm0, %xmm1
 610 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
 611 ; AVX1-NEXT:    vpaddq {{.*}}(%rip), %xmm0, %xmm0
 612 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
 613 ; AVX1-NEXT:    retq
 614 ;
 615 ; AVX2-LABEL: ld0_hi0_lo1_4i64:
 616 ; AVX2:       # %bb.0: # %entry
 617 ; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm0 = mem[2,3],ymm0[0,1]
 618 ; AVX2-NEXT:    vpaddq {{.*}}(%rip), %ymm0, %ymm0
 619 ; AVX2-NEXT:    retq
 620 entry:
 621   %a = load <4 x i64>, <4 x i64> * %pa
 622   %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 2, i32 3, i32 4, i32 5> ; high half of the loaded %a, then low half of %b
 623   %res = add <4 x i64> %shuffle, <i64 1, i64 2, i64 3, i64 4> ; integer user of the shuffle result
 624   ret <4 x i64> %res
 625 }
 626
 627 define <4 x i64> @ld1_hi0_hi1_4i64(<4 x i64> %a, <4 x i64> * %pb) nounwind uwtable readonly ssp { ; second shuffle operand comes from memory; readonly (not readnone) because the body loads through %pb
 628 ; AVX1-LABEL: ld1_hi0_hi1_4i64:
 629 ; AVX1:       # %bb.0: # %entry
 630 ; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
 631 ; AVX1-NEXT:    vpaddq {{.*}}(%rip), %xmm0, %xmm1
 632 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
 633 ; AVX1-NEXT:    vpaddq {{.*}}(%rip), %xmm0, %xmm0
 634 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
 635 ; AVX1-NEXT:    retq
 636 ;
 637 ; AVX2-LABEL: ld1_hi0_hi1_4i64:
 638 ; AVX2:       # %bb.0: # %entry
 639 ; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
 640 ; AVX2-NEXT:    vpaddq {{.*}}(%rip), %ymm0, %ymm0
 641 ; AVX2-NEXT:    retq
 642 entry:
 643   %b = load <4 x i64>, <4 x i64> * %pb
 644   %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 2, i32 3, i32 6, i32 7> ; high half of %a, then high half of the loaded %b
 645   %res = add <4 x i64> %shuffle, <i64 1, i64 2, i64 3, i64 4> ; integer user of the shuffle result
 646   ret <4 x i64> %res
 647 }
 648
 649 define <8 x i32> @ld0_hi0_lo1_8i32(<8 x i32> * %pa, <8 x i32> %b) nounwind uwtable readonly ssp { ; first shuffle operand comes from memory; readonly (not readnone) because the body loads through %pa
 650 ; AVX1-LABEL: ld0_hi0_lo1_8i32:
 651 ; AVX1:       # %bb.0: # %entry
 652 ; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm0[0,1]
 653 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
 654 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [1,2,3,4]
 655 ; AVX1-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
 656 ; AVX1-NEXT:    vpaddd %xmm2, %xmm0, %xmm0
 657 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 658 ; AVX1-NEXT:    retq
 659 ;
 660 ; AVX2-LABEL: ld0_hi0_lo1_8i32:
 661 ; AVX2:       # %bb.0: # %entry
 662 ; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm0 = mem[2,3],ymm0[0,1]
 663 ; AVX2-NEXT:    vpaddd {{.*}}(%rip), %ymm0, %ymm0
 664 ; AVX2-NEXT:    retq
 665 entry:
 666   %a = load <8 x i32>, <8 x i32> * %pa
 667   %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11> ; high half of the loaded %a, then low half of %b
 668   %res = add <8 x i32> %shuffle, <i32 1, i32 2, i32 3, i32 4, i32 1, i32 2, i32 3, i32 4> ; integer user of the shuffle result
 669   ret <8 x i32> %res
 670 }
 671
 672 define <8 x i32> @ld1_hi0_hi1_8i32(<8 x i32> %a, <8 x i32> * %pb) nounwind uwtable readonly ssp { ; second shuffle operand comes from memory; readonly (not readnone) because the body loads through %pb
 673 ; AVX1-LABEL: ld1_hi0_hi1_8i32:
 674 ; AVX1:       # %bb.0: # %entry
 675 ; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
 676 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
 677 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [1,2,3,4]
 678 ; AVX1-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
 679 ; AVX1-NEXT:    vpaddd %xmm2, %xmm0, %xmm0
 680 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 681 ; AVX1-NEXT:    retq
 682 ;
 683 ; AVX2-LABEL: ld1_hi0_hi1_8i32:
 684 ; AVX2:       # %bb.0: # %entry
 685 ; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
 686 ; AVX2-NEXT:    vpaddd {{.*}}(%rip), %ymm0, %ymm0
 687 ; AVX2-NEXT:    retq
 688 entry:
 689   %b = load <8 x i32>, <8 x i32> * %pb
 690   %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15> ; high half of %a, then high half of the loaded %b
 691   %res = add <8 x i32> %shuffle, <i32 1, i32 2, i32 3, i32 4, i32 1, i32 2, i32 3, i32 4> ; integer user of the shuffle result
 692   ret <8 x i32> %res
 693 }