llvm/test/CodeGen/X86/load-partial.ll

   1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
   2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2
   3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefixes=SSE,SSSE3
   4 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41
   5 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
   6 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
   7
   8 ;
   9 ; Partial Vector Loads - PR16739
  10 ;
  11
  12 define <4 x float> @load_float4_float3(ptr nocapture readonly dereferenceable(16)) nofree nosync {
  13 ; SSE-LABEL: load_float4_float3:
  14 ; SSE:       # %bb.0:
  15 ; SSE-NEXT:    movups (%rdi), %xmm0
  16 ; SSE-NEXT:    retq
  17 ;
  18 ; AVX-LABEL: load_float4_float3:
  19 ; AVX:       # %bb.0:
  20 ; AVX-NEXT:    vmovups (%rdi), %xmm0
  21 ; AVX-NEXT:    retq
       ; Builds lanes 0-2 of a <4 x float> from three scalar float loads taken at
       ; elements 0, 1 and 2 of the vector pointed to by %0; lane 3 is left undef.
       ; The checks above expect the three scalar loads to be replaced by one
       ; unaligned 16-byte vector load.
       ; NOTE(review): presumably dereferenceable(16) together with nofree/nosync is
       ; what makes reading the fourth, never-loaded element legal - confirm.
  22   %p1 = getelementptr inbounds <4 x float>, ptr %0, i64 0, i64 1
  23   %p2 = getelementptr inbounds <4 x float>, ptr %0, i64 0, i64 2
  24   %ld0 = load float, ptr %0, align 4
  25   %ld1 = load float, ptr %p1, align 4
  26   %ld2 = load float, ptr %p2, align 4
  27   %r0 = insertelement <4 x float> undef, float %ld0, i32 0
  28   %r1 = insertelement <4 x float> %r0,   float %ld1, i32 1
  29   %r2 = insertelement <4 x float> %r1,   float %ld2, i32 2
  30   ret <4 x float> %r2
  31 }
  32
  33 define <4 x float> @load_float4_float3_0122(ptr nocapture readonly dereferenceable(16)) nofree nosync {
  34 ; SSE-LABEL: load_float4_float3_0122:
  35 ; SSE:       # %bb.0:
  36 ; SSE-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
  37 ; SSE-NEXT:    movups (%rdi), %xmm0
  38 ; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0]
  39 ; SSE-NEXT:    retq
  40 ;
  41 ; AVX-LABEL: load_float4_float3_0122:
  42 ; AVX:       # %bb.0:
  43 ; AVX-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
  44 ; AVX-NEXT:    vmovups (%rdi), %xmm1
  45 ; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0,0]
  46 ; AVX-NEXT:    retq
       ; Same three scalar float loads as load_float4_float3, but %ld2 is inserted
       ; into both lane 2 and lane 3, giving the element pattern 0,1,2,2.
       ; The checks above expect a scalar load, a full unaligned vector load, and a
       ; shuffle that keeps lanes 0-1 of the vector load and fills lanes 2-3 with
       ; the scalar.
  47   %p1 = getelementptr inbounds <4 x float>, ptr %0, i64 0, i64 1
  48   %p2 = getelementptr inbounds <4 x float>, ptr %0, i64 0, i64 2
  49   %ld0 = load float, ptr %0, align 4
  50   %ld1 = load float, ptr %p1, align 4
  51   %ld2 = load float, ptr %p2, align 4
  52   %r0 = insertelement <4 x float> undef, float %ld0, i32 0
  53   %r1 = insertelement <4 x float> %r0,   float %ld1, i32 1
  54   %r2 = insertelement <4 x float> %r1,   float %ld2, i32 2
  55   %r3 = insertelement <4 x float> %r2,   float %ld2, i32 3
  56   ret <4 x float> %r3
  57 }
  58
  59 define <8 x float> @load_float8_float3(ptr nocapture readonly dereferenceable(16)) nofree nosync {
  60 ; SSE-LABEL: load_float8_float3:
  61 ; SSE:       # %bb.0:
  62 ; SSE-NEXT:    movups (%rdi), %xmm0
  63 ; SSE-NEXT:    retq
  64 ;
  65 ; AVX-LABEL: load_float8_float3:
  66 ; AVX:       # %bb.0:
  67 ; AVX-NEXT:    vmovups (%rdi), %xmm0
  68 ; AVX-NEXT:    retq
       ; 8-lane variant of load_float4_float3: the same three scalar float loads
       ; fill lanes 0-2 of an <8 x float>, and lanes 3-7 are left undef.
       ; The checks above expect a single unaligned load into xmm0 for both the
       ; SSE and AVX check groups.
  69   %p1 = getelementptr inbounds <4 x float>, ptr %0, i64 0, i64 1
  70   %p2 = getelementptr inbounds <4 x float>, ptr %0, i64 0, i64 2
  71   %ld0 = load float, ptr %0, align 4
  72   %ld1 = load float, ptr %p1, align 4
  73   %ld2 = load float, ptr %p2, align 4
  74   %r0 = insertelement <8 x float> undef, float %ld0, i32 0
  75   %r1 = insertelement <8 x float> %r0,   float %ld1, i32 1
  76   %r2 = insertelement <8 x float> %r1,   float %ld2, i32 2
  77   ret <8 x float> %r2
  78 }
  79
  80 define <8 x float> @load_float8_float3_0122(ptr nocapture readonly dereferenceable(16)) nofree nosync {
  81 ; SSE-LABEL: load_float8_float3_0122:
  82 ; SSE:       # %bb.0:
  83 ; SSE-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
  84 ; SSE-NEXT:    movups (%rdi), %xmm0
  85 ; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0]
  86 ; SSE-NEXT:    retq
  87 ;
  88 ; AVX-LABEL: load_float8_float3_0122:
  89 ; AVX:       # %bb.0:
  90 ; AVX-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
  91 ; AVX-NEXT:    vmovups (%rdi), %xmm1
  92 ; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0,0]
  93 ; AVX-NEXT:    retq
       ; 8-lane variant of load_float4_float3_0122: three scalar float loads fill
       ; lanes 0-2, %ld2 is repeated in lane 3, and lanes 4-7 are left undef.
       ; The checks above expect the same scalar load + vector load + shuffle
       ; sequence on xmm registers as the 4-lane test.
  94   %p1 = getelementptr inbounds <4 x float>, ptr %0, i64 0, i64 1
  95   %p2 = getelementptr inbounds <4 x float>, ptr %0, i64 0, i64 2
  96   %ld0 = load float, ptr %0, align 4
  97   %ld1 = load float, ptr %p1, align 4
  98   %ld2 = load float, ptr %p2, align 4
  99   %r0 = insertelement <8 x float> undef, float %ld0, i32 0
 100   %r1 = insertelement <8 x float> %r0,   float %ld1, i32 1
 101   %r2 = insertelement <8 x float> %r1,   float %ld2, i32 2
 102   %r3 = insertelement <8 x float> %r2,   float %ld2, i32 3
 103   ret <8 x float> %r3
 104 }
 105
 106 define <4 x float> @load_float4_float3_as_float2_float(ptr nocapture readonly dereferenceable(16)) nofree nosync {
 107 ; SSE-LABEL: load_float4_float3_as_float2_float:
 108 ; SSE:       # %bb.0:
 109 ; SSE-NEXT:    movups (%rdi), %xmm0
 110 ; SSE-NEXT:    retq
 111 ;
 112 ; AVX-LABEL: load_float4_float3_as_float2_float:
 113 ; AVX:       # %bb.0:
 114 ; AVX-NEXT:    vmovups (%rdi), %xmm0
 115 ; AVX-NEXT:    retq
       ; Lanes 0-1 come from one <2 x float> load at %0 (extracted element by
       ; element); lane 2 comes from a scalar float load at element 2; lane 3 is
       ; left undef.
       ; The checks above expect both loads to be merged into one unaligned
       ; 16-byte vector load.
 116   %2 = load <2 x float>, ptr %0, align 4
 117   %3 = extractelement <2 x float> %2, i32 0
 118   %4 = insertelement <4 x float> undef, float %3, i32 0
 119   %5 = extractelement <2 x float> %2, i32 1
 120   %6 = insertelement <4 x float> %4, float %5, i32 1
 121   %7 = getelementptr inbounds <4 x float>, ptr %0, i64 0, i64 2
 122   %8 = load float, ptr %7, align 4
 123   %9 = insertelement <4 x float> %6, float %8, i32 2
 124   ret <4 x float> %9
 125 }
 126
 127 define <4 x float> @load_float4_float3_as_float2_float_0122(ptr nocapture readonly dereferenceable(16)) nofree nosync {
 128 ; SSE-LABEL: load_float4_float3_as_float2_float_0122:
 129 ; SSE:       # %bb.0:
 130 ; SSE-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
 131 ; SSE-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
 132 ; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0]
 133 ; SSE-NEXT:    retq
 134 ;
 135 ; AVX-LABEL: load_float4_float3_as_float2_float_0122:
 136 ; AVX:       # %bb.0:
 137 ; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
 138 ; AVX-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
 139 ; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0]
 140 ; AVX-NEXT:    retq
       ; Same <2 x float> + scalar float loads as load_float4_float3_as_float2_float,
       ; but the scalar %8 is inserted into both lane 2 and lane 3 (pattern 0,1,2,2).
       ; The checks above expect the two loads to stay separate (an 8-byte load and
       ; a 4-byte load) and be combined with a single shuffle.
 141   %2 = load <2 x float>, ptr %0, align 4
 142   %3 = extractelement <2 x float> %2, i32 0
 143   %4 = insertelement <4 x float> undef, float %3, i32 0
 144   %5 = extractelement <2 x float> %2, i32 1
 145   %6 = insertelement <4 x float> %4, float %5, i32 1
 146   %7 = getelementptr inbounds <4 x float>, ptr %0, i64 0, i64 2
 147   %8 = load float, ptr %7, align 4
 148   %9 = insertelement <4 x float> %6, float %8, i32 2
 149   %10 = insertelement <4 x float> %9, float %8, i32 3
 150   ret <4 x float> %10
 151 }
 152
 153 define <4 x float> @load_float4_float3_trunc(ptr nocapture readonly dereferenceable(16)) nofree nosync {
 154 ; SSE-LABEL: load_float4_float3_trunc:
 155 ; SSE:       # %bb.0:
 156 ; SSE-NEXT:    movaps (%rdi), %xmm0
 157 ; SSE-NEXT:    retq
 158 ;
 159 ; AVX-LABEL: load_float4_float3_trunc:
 160 ; AVX:       # %bb.0:
 161 ; AVX-NEXT:    vmovaps (%rdi), %xmm0
 162 ; AVX-NEXT:    retq
       ; Builds lanes 0-2 of a <4 x float> from two i64 loads: %2 at %0 (align 16)
       ; and %4 at element 2 of the vector (align 8). Lane 0 is the truncated low
       ; 32 bits of %2, lane 1 is %2 shifted right by 32 and truncated, lane 2 is
       ; the truncated low 32 bits of %4; lane 3 is left undef.
       ; The checks above expect a single aligned 16-byte vector load.
       ; The function carries nofree nosync like the other dereferenceable(16)
       ; tests in this file, so every partial-load test runs under the same
       ; pointer assumptions.
 163   %2 = load i64, ptr %0, align 16
 164   %3 = getelementptr inbounds <4 x float>, ptr %0, i64 0, i64 2
 165   %4 = load i64, ptr %3, align 8
 166   %5 = trunc i64 %2 to i32
 167   %6 = bitcast i32 %5 to float
 168   %7 = insertelement <4 x float> undef, float %6, i32 0
 169   %8 = lshr i64 %2, 32
 170   %9 = trunc i64 %8 to i32
 171   %10 = bitcast i32 %9 to float
 172   %11 = insertelement <4 x float> %7, float %10, i32 1
 173   %12 = trunc i64 %4 to i32
 174   %13 = bitcast i32 %12 to float
 175   %14 = insertelement <4 x float> %11, float %13, i32 2
 176   ret <4 x float> %14
 177 }
 178
 179 define <4 x float> @load_float4_float3_trunc_0122(ptr nocapture readonly dereferenceable(16)) nofree nosync {
 180 ; SSE-LABEL: load_float4_float3_trunc_0122:
 181 ; SSE:       # %bb.0:
 182 ; SSE-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
 183 ; SSE-NEXT:    movaps (%rdi), %xmm0
 184 ; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0]
 185 ; SSE-NEXT:    retq
 186 ;
 187 ; AVX-LABEL: load_float4_float3_trunc_0122:
 188 ; AVX:       # %bb.0:
 189 ; AVX-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 190 ; AVX-NEXT:    vmovaps (%rdi), %xmm1
 191 ; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0,0]
 192 ; AVX-NEXT:    retq
       ; Two i64 loads (%2 at %0, align 16; %4 at element 2, align 8) are split into
       ; 32-bit pieces with trunc/lshr and bitcast to float. Lanes 0-1 are the low
       ; and high halves of %2; the low half of %4 (%13) is inserted into both
       ; lane 2 and lane 3 (pattern 0,1,2,2).
       ; The checks above expect a scalar load, an aligned vector load, and a
       ; shuffle combining them.
 193   %2 = load i64, ptr %0, align 16
 194   %3 = getelementptr inbounds <4 x float>, ptr %0, i64 0, i64 2
 195   %4 = load i64, ptr %3, align 8
 196   %5 = trunc i64 %2 to i32
 197   %6 = bitcast i32 %5 to float
 198   %7 = insertelement <4 x float> undef, float %6, i32 0
 199   %8 = lshr i64 %2, 32
 200   %9 = trunc i64 %8 to i32
 201   %10 = bitcast i32 %9 to float
 202   %11 = insertelement <4 x float> %7, float %10, i32 1
 203   %12 = trunc i64 %4 to i32
 204   %13 = bitcast i32 %12 to float
 205   %14 = insertelement <4 x float> %11, float %13, i32 2
 206   %15 = insertelement <4 x float> %14, float %13, i32 3
 207   ret <4 x float> %15
 208 }
 209
 210 define <4 x float> @load_float4_float3_trunc_0123(ptr nocapture readonly dereferenceable(16)) nofree nosync {
 211 ; SSE2-LABEL: load_float4_float3_trunc_0123:
 212 ; SSE2:       # %bb.0:
 213 ; SSE2-NEXT:    movaps (%rdi), %xmm0
 214 ; SSE2-NEXT:    movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
 215 ; SSE2-NEXT:    retq
 216 ;
 217 ; SSSE3-LABEL: load_float4_float3_trunc_0123:
 218 ; SSSE3:       # %bb.0:
 219 ; SSSE3-NEXT:    movaps (%rdi), %xmm0
 220 ; SSSE3-NEXT:    movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
 221 ; SSSE3-NEXT:    retq
 222 ;
 223 ; SSE41-LABEL: load_float4_float3_trunc_0123:
 224 ; SSE41:       # %bb.0:
 225 ; SSE41-NEXT:    movaps (%rdi), %xmm0
 226 ; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
 227 ; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
 228 ; SSE41-NEXT:    retq
 229 ;
 230 ; AVX-LABEL: load_float4_float3_trunc_0123:
 231 ; AVX:       # %bb.0:
 232 ; AVX-NEXT:    vmovaps (%rdi), %xmm0
 233 ; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
 234 ; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
 235 ; AVX-NEXT:    retq
       ; All four lanes are defined here: lanes 0-1 are the low and high 32-bit
       ; halves of the i64 loaded at %0 (align 16), lanes 2-3 are the low and high
       ; halves of the i64 loaded at element 2 (align 8).
       ; The checks above record an aligned vector load followed by extra memory
       ; inserts into the upper lanes (one movhps for the SSE2 and SSSE3 runs, two
       ; insertps for the SSE41 and AVX runs).
       ; NOTE(review): the two i64 loads appear to cover the full 16 bytes in lane
       ; order, so the vector load alone looks sufficient; the extra inserts are
       ; presumably a known missed optimisation - confirm.
 236   %2 = load i64, ptr %0, align 16
 237   %3 = getelementptr inbounds <4 x float>, ptr %0, i64 0, i64 2
 238   %4 = load i64, ptr %3, align 8
 239   %5 = trunc i64 %2 to i32
 240   %6 = bitcast i32 %5 to float
 241   %7 = insertelement <4 x float> undef, float %6, i32 0
 242   %8 = lshr i64 %2, 32
 243   %9 = trunc i64 %8 to i32
 244   %10 = bitcast i32 %9 to float
 245   %11 = insertelement <4 x float> %7, float %10, i32 1
 246   %12 = trunc i64 %4 to i32
 247   %13 = bitcast i32 %12 to float
 248   %14 = insertelement <4 x float> %11, float %13, i32 2
 249   %15 = lshr i64 %4, 32
 250   %16 = trunc i64 %15 to i32
 251   %17 = bitcast i32 %16 to float
 252   %18 = insertelement <4 x float> %14, float %17, i32 3
 253   ret <4 x float> %18
 254 }
 255
 256 define <4 x float> @load_float4_float3_trunc_0123_unaligned(ptr nocapture readonly dereferenceable(16)) nofree nosync {
 257 ; SSE2-LABEL: load_float4_float3_trunc_0123_unaligned:
 258 ; SSE2:       # %bb.0:
 259 ; SSE2-NEXT:    movups (%rdi), %xmm0
 260 ; SSE2-NEXT:    movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
 261 ; SSE2-NEXT:    retq
 262 ;
 263 ; SSSE3-LABEL: load_float4_float3_trunc_0123_unaligned:
 264 ; SSSE3:       # %bb.0:
 265 ; SSSE3-NEXT:    movups (%rdi), %xmm0
 266 ; SSSE3-NEXT:    movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
 267 ; SSSE3-NEXT:    retq
 268 ;
 269 ; SSE41-LABEL: load_float4_float3_trunc_0123_unaligned:
 270 ; SSE41:       # %bb.0:
 271 ; SSE41-NEXT:    movups (%rdi), %xmm0
 272 ; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
 273 ; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
 274 ; SSE41-NEXT:    retq
 275 ;
 276 ; AVX-LABEL: load_float4_float3_trunc_0123_unaligned:
 277 ; AVX:       # %bb.0:
 278 ; AVX-NEXT:    vmovups (%rdi), %xmm0
 279 ; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
 280 ; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
 281 ; AVX-NEXT:    retq
       ; Same IR as load_float4_float3_trunc_0123 except that both i64 loads are
       ; marked align 1. The checks above record the same instruction sequences
       ; with the aligned vector load replaced by its unaligned form.
 282   %2 = load i64, ptr %0, align 1
 283   %3 = getelementptr inbounds <4 x float>, ptr %0, i64 0, i64 2
 284   %4 = load i64, ptr %3, align 1
 285   %5 = trunc i64 %2 to i32
 286   %6 = bitcast i32 %5 to float
 287   %7 = insertelement <4 x float> undef, float %6, i32 0
 288   %8 = lshr i64 %2, 32
 289   %9 = trunc i64 %8 to i32
 290   %10 = bitcast i32 %9 to float
 291   %11 = insertelement <4 x float> %7, float %10, i32 1
 292   %12 = trunc i64 %4 to i32
 293   %13 = bitcast i32 %12 to float
 294   %14 = insertelement <4 x float> %11, float %13, i32 2
 295   %15 = lshr i64 %4, 32
 296   %16 = trunc i64 %15 to i32
 297   %17 = bitcast i32 %16 to float
 298   %18 = insertelement <4 x float> %14, float %17, i32 3
 299   ret <4 x float> %18
 300 }
 301
 302 ; PR21780
 303 define <4 x double> @load_double4_0u2u(ptr nocapture readonly dereferenceable(32)) nofree nosync {
 304 ; SSE2-LABEL: load_double4_0u2u:
 305 ; SSE2:       # %bb.0:
 306 ; SSE2-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
 307 ; SSE2-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
 308 ; SSE2-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0,0]
 309 ; SSE2-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0,0]
 310 ; SSE2-NEXT:    retq
 311 ;
 312 ; SSSE3-LABEL: load_double4_0u2u:
 313 ; SSSE3:       # %bb.0:
 314 ; SSSE3-NEXT:    movddup {{.*#+}} xmm0 = mem[0,0]
 315 ; SSSE3-NEXT:    movddup {{.*#+}} xmm1 = mem[0,0]
 316 ; SSSE3-NEXT:    retq
 317 ;
 318 ; SSE41-LABEL: load_double4_0u2u:
 319 ; SSE41:       # %bb.0:
 320 ; SSE41-NEXT:    movddup {{.*#+}} xmm0 = mem[0,0]
 321 ; SSE41-NEXT:    movddup {{.*#+}} xmm1 = mem[0,0]
 322 ; SSE41-NEXT:    retq
 323 ;
 324 ; AVX-LABEL: load_double4_0u2u:
 325 ; AVX:       # %bb.0:
 326 ; AVX-NEXT:    vmovddup {{.*#+}} ymm0 = mem[0,0,2,2]
 327 ; AVX-NEXT:    retq
       ; Loads the doubles at index 0 and index 2 from %0, inserts them into lanes
       ; 0 and 2 of a <4 x double>, then shuffles with mask <0,0,2,2> so each loaded
       ; value is duplicated into the following lane. The doubles at index 1 and 3
       ; are never loaded by the IR.
       ; The checks above expect two per-half load-and-duplicate sequences for the
       ; SSE-level runs and a single 256-bit duplicating load for the AVX runs.
 328   %2 = load double, ptr %0, align 8
 329   %3 = insertelement <4 x double> undef, double %2, i32 0
 330   %4 = getelementptr inbounds double, ptr %0, i64 2
 331   %5 = load double, ptr %4, align 8
 332   %6 = insertelement <4 x double> %3, double %5, i32 2
 333   %7 = shufflevector <4 x double> %6, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
 334   ret <4 x double> %7
 335 }
 336
 337 ; Test case identified in rL366501
 338 @h = dso_local local_unnamed_addr global i8 0, align 1
 339 define dso_local i32 @load_partial_illegal_type()  {
 340 ; SSE2-LABEL: load_partial_illegal_type:
 341 ; SSE2:       # %bb.0:
 342 ; SSE2-NEXT:    movzwl h(%rip), %eax
 343 ; SSE2-NEXT:    movd %eax, %xmm0
 344 ; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 345 ; SSE2-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 346 ; SSE2-NEXT:    movd %xmm0, %eax
 347 ; SSE2-NEXT:    retq
 348 ;
 349 ; SSSE3-LABEL: load_partial_illegal_type:
 350 ; SSSE3:       # %bb.0:
 351 ; SSSE3-NEXT:    movzwl h(%rip), %eax
 352 ; SSSE3-NEXT:    movd %eax, %xmm0
 353 ; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,1],zero,xmm0[3,u,u,u,u,u,u,u,u,u,u,u,u]
 354 ; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 355 ; SSSE3-NEXT:    movd %xmm0, %eax
 356 ; SSSE3-NEXT:    retq
 357 ;
 358 ; SSE41-LABEL: load_partial_illegal_type:
 359 ; SSE41:       # %bb.0:
 360 ; SSE41-NEXT:    movzwl h(%rip), %eax
 361 ; SSE41-NEXT:    movd %eax, %xmm0
 362 ; SSE41-NEXT:    movl $2, %eax
 363 ; SSE41-NEXT:    pinsrb $2, %eax, %xmm0
 364 ; SSE41-NEXT:    movd %xmm0, %eax
 365 ; SSE41-NEXT:    retq
 366 ;
 367 ; AVX-LABEL: load_partial_illegal_type:
 368 ; AVX:       # %bb.0:
 369 ; AVX-NEXT:    movzwl h(%rip), %eax
 370 ; AVX-NEXT:    vmovd %eax, %xmm0
 371 ; AVX-NEXT:    movl $2, %eax
 372 ; AVX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
 373 ; AVX-NEXT:    vmovd %xmm0, %eax
 374 ; AVX-NEXT:    retq
       ; Loads a <2 x i8> from the global @h (declared above as a single i8),
       ; widens it to <4 x i8> with undef upper elements, overwrites element 2
       ; with the constant 2, and returns the vector bitcast to i32. Element 3
       ; stays undef.
       ; The checks above expect a 16-bit zero-extending load of h followed by a
       ; per-feature-level way of placing the constant byte at index 2.
 375   %1 = load <2 x i8>, ptr @h, align 1
 376   %2 = shufflevector <2 x i8> %1, <2 x i8> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
 377   %3 = insertelement <4 x i8> %2, i8 2, i32 2
 378   %4 = bitcast <4 x i8> %3 to i32
 379   ret i32 %4
 380 }
 381
 382 define dso_local void @PR43227(ptr %explicit_0, ptr %explicit_1) {
 383 ; SSE-LABEL: PR43227:
 384 ; SSE:       # %bb.0:
 385 ; SSE-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
 386 ; SSE-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
 387 ; SSE-NEXT:    psrlq $32, %xmm0
 388 ; SSE-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 389 ; SSE-NEXT:    pxor %xmm1, %xmm1
 390 ; SSE-NEXT:    movdqa %xmm1, 672(%rsi)
 391 ; SSE-NEXT:    movdqa %xmm0, 688(%rsi)
 392 ; SSE-NEXT:    retq
 393 ;
 394 ; AVX1-LABEL: PR43227:
 395 ; AVX1:       # %bb.0:
 396 ; AVX1-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
 397 ; AVX1-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
 398 ; AVX1-NEXT:    vpsrlq $32, %xmm0, %xmm0
 399 ; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 400 ; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 401 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
 402 ; AVX1-NEXT:    vmovaps %ymm0, 672(%rsi)
 403 ; AVX1-NEXT:    vzeroupper
 404 ; AVX1-NEXT:    retq
 405 ;
 406 ; AVX2-LABEL: PR43227:
 407 ; AVX2:       # %bb.0:
 408 ; AVX2-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
 409 ; AVX2-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
 410 ; AVX2-NEXT:    vpsrlq $32, %xmm0, %xmm0
 411 ; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 412 ; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 413 ; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
 414 ; AVX2-NEXT:    vmovdqa %ymm0, 672(%rsi)
 415 ; AVX2-NEXT:    vzeroupper
 416 ; AVX2-NEXT:    retq
       ; Loads a <3 x i32> (align 1) starting at i32 index 63 of %explicit_0, keeps
       ; elements 1 and 2, and blends them into a mostly-zero <8 x i32>: the final
       ; shuffle places loaded element 1 in lane 4 and loaded element 2 in lane 6,
       ; and every other lane takes 0 from the constant operand.
       ; The result is stored (align 32) at <8 x i32> index 21 of %explicit_1,
       ; which is the byte offset 672 seen in the checks above.
       ; Neither pointer carries a dereferenceable attribute in this test.
 417   %1 = getelementptr i32, ptr %explicit_0, i64 63
 418   %2 = load <3 x i32>, ptr %1, align 1
 419   %3 = shufflevector <3 x i32> %2, <3 x i32> undef, <2 x i32> <i32 1, i32 2>
 420   %4 = shufflevector <2 x i32> %3, <2 x i32> undef, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
 421   %5 = shufflevector <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 undef, i32 0, i32 undef, i32 0>, <8 x i32> %4, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 5, i32 9, i32 7>
 422   %6 = getelementptr inbounds <8 x i32>, ptr %explicit_1, i64 21
 423   store <8 x i32> %5, ptr %6, align 32
 424   ret void
 425 }