test/Transforms/InterleavedAccess/ARM/interleaved-accesses.ll

   1 ; RUN: opt < %s -mattr=+neon -interleaved-access -S | FileCheck %s -check-prefixes=NEON,ALL
   2 ; RUN: opt < %s -interleaved-access -S | FileCheck %s -check-prefixes=NO_NEON,ALL
   3
   4 target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-n32-S64"
   5 target triple = "arm---eabi"
   6
   7 define void @load_factor2(<16 x i8>* %ptr) {
   8 ; NEON-LABEL:    @load_factor2(
   9 ; NEON-NEXT:       [[TMP1:%.*]] = bitcast <16 x i8>* %ptr to i8*
  10 ; NEON-NEXT:       [[VLDN:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.arm.neon.vld2.v8i8.p0i8(i8* [[TMP1]], i32 4)
  11 ; NEON-NEXT:       [[TMP2:%.*]] = extractvalue { <8 x i8>, <8 x i8> } [[VLDN]], 1
  12 ; NEON-NEXT:       [[TMP3:%.*]] = extractvalue { <8 x i8>, <8 x i8> } [[VLDN]], 0
  13 ; NEON-NEXT:       ret void
  14 ; NO_NEON-LABEL: @load_factor2(
  15 ; NO_NEON-NOT:     @llvm.arm.neon
  16 ; NO_NEON:         ret void
  17 ;
  18   %interleaved.vec = load <16 x i8>, <16 x i8>* %ptr, align 4
  19   %v0 = shufflevector <16 x i8> %interleaved.vec, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
  20   %v1 = shufflevector <16 x i8> %interleaved.vec, <16 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  21   ret void
  22 }
  23
  24 define void @load_factor3(<6 x i32>* %ptr) {
  25 ; NEON-LABEL:    @load_factor3(
  26 ; NEON-NEXT:       [[TMP1:%.*]] = bitcast <6 x i32>* %ptr to i8*
  27 ; NEON-NEXT:       [[VLDN:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld3.v2i32.p0i8(i8* [[TMP1]], i32 4)
  28 ; NEON-NEXT:       [[TMP2:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[VLDN]], 2
  29 ; NEON-NEXT:       [[TMP3:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[VLDN]], 1
  30 ; NEON-NEXT:       [[TMP4:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[VLDN]], 0
  31 ; NEON-NEXT:       ret void
  32 ; NO_NEON-LABEL: @load_factor3(
  33 ; NO_NEON-NOT:     @llvm.arm.neon
  34 ; NO_NEON:         ret void
  35 ;
  36   %interleaved.vec = load <6 x i32>, <6 x i32>* %ptr, align 4
  37   %v0 = shufflevector <6 x i32> %interleaved.vec, <6 x i32> undef, <2 x i32> <i32 0, i32 3>
  38   %v1 = shufflevector <6 x i32> %interleaved.vec, <6 x i32> undef, <2 x i32> <i32 1, i32 4>
  39   %v2 = shufflevector <6 x i32> %interleaved.vec, <6 x i32> undef, <2 x i32> <i32 2, i32 5>
  40   ret void
  41 }
  42
  43 define void @load_factor4(<16 x i32>* %ptr) {
  44 ; NEON-LABEL:    @load_factor4(
  45 ; NEON-NEXT:       [[TMP1:%.*]] = bitcast <16 x i32>* %ptr to i8*
  46 ; NEON-NEXT:       [[VLDN:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld4.v4i32.p0i8(i8* [[TMP1]], i32 4)
  47 ; NEON-NEXT:       [[TMP2:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN]], 3
  48 ; NEON-NEXT:       [[TMP3:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN]], 2
  49 ; NEON-NEXT:       [[TMP4:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN]], 1
  50 ; NEON-NEXT:       [[TMP5:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN]], 0
  51 ; NEON-NEXT:       ret void
  52 ; NO_NEON-LABEL: @load_factor4(
  53 ; NO_NEON-NOT:     @llvm.arm.neon
  54 ; NO_NEON:         ret void
  55 ;
  56   %interleaved.vec = load <16 x i32>, <16 x i32>* %ptr, align 4
  57   %v0 = shufflevector <16 x i32> %interleaved.vec, <16 x i32> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
  58   %v1 = shufflevector <16 x i32> %interleaved.vec, <16 x i32> undef, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
  59   %v2 = shufflevector <16 x i32> %interleaved.vec, <16 x i32> undef, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
  60   %v3 = shufflevector <16 x i32> %interleaved.vec, <16 x i32> undef, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
  61   ret void
  62 }
  63
  64 define void @store_factor2(<16 x i8>* %ptr, <8 x i8> %v0, <8 x i8> %v1) {
  65 ; NEON-LABEL:    @store_factor2(
  66 ; NEON-NEXT:       [[TMP1:%.*]] = bitcast <16 x i8>* %ptr to i8*
  67 ; NEON-NEXT:       [[TMP2:%.*]] = shufflevector <8 x i8> %v0, <8 x i8> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  68 ; NEON-NEXT:       [[TMP3:%.*]] = shufflevector <8 x i8> %v0, <8 x i8> %v1, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  69 ; NEON-NEXT:       call void @llvm.arm.neon.vst2.p0i8.v8i8(i8* [[TMP1]], <8 x i8> [[TMP2]], <8 x i8> [[TMP3]], i32 4)
  70 ; NEON-NEXT:       ret void
  71 ; NO_NEON-LABEL: @store_factor2(
  72 ; NO_NEON-NOT:     @llvm.arm.neon
  73 ; NO_NEON:         ret void
  74 ;
  75   %interleaved.vec = shufflevector <8 x i8> %v0, <8 x i8> %v1, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
  76   store <16 x i8> %interleaved.vec, <16 x i8>* %ptr, align 4
  77   ret void
  78 }
  79
  80 define void @store_factor3(<12 x i32>* %ptr, <4 x i32> %v0, <4 x i32> %v1, <4 x i32> %v2) {
  81 ; NEON-LABEL:    @store_factor3(
  82 ; NEON:            [[TMP1:%.*]] = bitcast <12 x i32>* %ptr to i8*
  83 ; NEON-NEXT:       [[TMP2:%.*]] = shufflevector <8 x i32> %s0, <8 x i32> %s1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  84 ; NEON-NEXT:       [[TMP3:%.*]] = shufflevector <8 x i32> %s0, <8 x i32> %s1, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  85 ; NEON-NEXT:       [[TMP4:%.*]] = shufflevector <8 x i32> %s0, <8 x i32> %s1, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
  86 ; NEON-NEXT:       call void @llvm.arm.neon.vst3.p0i8.v4i32(i8* [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], i32 4)
  87 ; NEON-NEXT:       ret void
  88 ; NO_NEON-LABEL: @store_factor3(
  89 ; NO_NEON-NOT:     @llvm.arm.neon
  90 ; NO_NEON:         ret void
  91 ;
  92   %s0 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  93   %s1 = shufflevector <4 x i32> %v2, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
  94   %interleaved.vec = shufflevector <8 x i32> %s0, <8 x i32> %s1, <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
  95   store <12 x i32> %interleaved.vec, <12 x i32>* %ptr, align 4
  96   ret void
  97 }
  98
  99 define void @store_factor4(<16 x i32>* %ptr, <4 x i32> %v0, <4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3) {
 100 ; NEON-LABEL:    @store_factor4(
 101 ; NEON:            [[TMP1:%.*]] = bitcast <16 x i32>* %ptr to i8*
 102 ; NEON-NEXT:       [[TMP2:%.*]] = shufflevector <8 x i32> %s0, <8 x i32> %s1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 103 ; NEON-NEXT:       [[TMP3:%.*]] = shufflevector <8 x i32> %s0, <8 x i32> %s1, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
 104 ; NEON-NEXT:       [[TMP4:%.*]] = shufflevector <8 x i32> %s0, <8 x i32> %s1, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
 105 ; NEON-NEXT:       [[TMP5:%.*]] = shufflevector <8 x i32> %s0, <8 x i32> %s1, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
 106 ; NEON-NEXT:       call void @llvm.arm.neon.vst4.p0i8.v4i32(i8* [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], i32 4)
 107 ; NEON-NEXT:       ret void
 108 ; NO_NEON-LABEL: @store_factor4(
 109 ; NO_NEON-NOT:     @llvm.arm.neon
 110 ; NO_NEON:         ret void
 111 ;
 112   %s0 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 113   %s1 = shufflevector <4 x i32> %v2, <4 x i32> %v3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 114   %interleaved.vec = shufflevector <8 x i32> %s0, <8 x i32> %s1, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
 115   store <16 x i32> %interleaved.vec, <16 x i32>* %ptr, align 4
 116   ret void
 117 }
 118
 119 define void @load_ptrvec_factor2(<4 x i32*>* %ptr) {
 120 ; NEON-LABEL:    @load_ptrvec_factor2(
 121 ; NEON-NEXT:       [[TMP1:%.*]] = bitcast <4 x i32*>* %ptr to i8*
 122 ; NEON-NEXT:       [[VLDN:%.*]] = call { <2 x i32>, <2 x i32> } @llvm.arm.neon.vld2.v2i32.p0i8(i8* [[TMP1]], i32 4)
 123 ; NEON-NEXT:       [[TMP2:%.*]] = extractvalue { <2 x i32>, <2 x i32> } [[VLDN]], 0
 124 ; NEON-NEXT:       [[TMP3:%.*]] = inttoptr <2 x i32> [[TMP2]] to <2 x i32*>
 125 ; NEON-NEXT:       ret void
 126 ; NO_NEON-LABEL: @load_ptrvec_factor2(
 127 ; NO_NEON-NOT:     @llvm.arm.neon
 128 ; NO_NEON:         ret void
 129 ;
 130   %interleaved.vec = load <4 x i32*>, <4 x i32*>* %ptr, align 4
 131   %v0 = shufflevector <4 x i32*> %interleaved.vec, <4 x i32*> undef, <2 x i32> <i32 0, i32 2>
 132   ret void
 133 }
 134
 135 define void @load_ptrvec_factor3(<6 x i32*>* %ptr) {
 136 ; NEON-LABEL:    @load_ptrvec_factor3(
 137 ; NEON-NEXT:       [[TMP1:%.*]] = bitcast <6 x i32*>* %ptr to i8*
 138 ; NEON-NEXT:       [[VLDN:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld3.v2i32.p0i8(i8* [[TMP1]], i32 4)
 139 ; NEON-NEXT:       [[TMP2:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[VLDN]], 2
 140 ; NEON-NEXT:       [[TMP3:%.*]] = inttoptr <2 x i32> [[TMP2]] to <2 x i32*>
 141 ; NEON-NEXT:       [[TMP4:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[VLDN]], 1
 142 ; NEON-NEXT:       [[TMP5:%.*]] = inttoptr <2 x i32> [[TMP4]] to <2 x i32*>
 143 ; NEON-NEXT:       [[TMP6:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[VLDN]], 0
 144 ; NEON-NEXT:       [[TMP7:%.*]] = inttoptr <2 x i32> [[TMP6]] to <2 x i32*>
 145 ; NEON-NEXT:       ret void
 146 ; NO_NEON-LABEL: @load_ptrvec_factor3(
 147 ; NO_NEON-NOT:     @llvm.arm.neon
 148 ; NO_NEON:         ret void
 149 ;
 150   %interleaved.vec = load <6 x i32*>, <6 x i32*>* %ptr, align 4
 151   %v0 = shufflevector <6 x i32*> %interleaved.vec, <6 x i32*> undef, <2 x i32> <i32 0, i32 3>
 152   %v1 = shufflevector <6 x i32*> %interleaved.vec, <6 x i32*> undef, <2 x i32> <i32 1, i32 4>
 153   %v2 = shufflevector <6 x i32*> %interleaved.vec, <6 x i32*> undef, <2 x i32> <i32 2, i32 5>
 154   ret void
 155 }
 156
 157 define void @load_ptrvec_factor4(<8 x i32*>* %ptr) {
 158 ; NEON-LABEL:    @load_ptrvec_factor4(
 159 ; NEON-NEXT:       [[TMP1:%.*]] = bitcast <8 x i32*>* %ptr to i8*
 160 ; NEON-NEXT:       [[VLDN:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld4.v2i32.p0i8(i8* [[TMP1]], i32 4)
 161 ; NEON-NEXT:       [[TMP2:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLDN]], 3
 162 ; NEON-NEXT:       [[TMP3:%.*]] = inttoptr <2 x i32> [[TMP2]] to <2 x i32*>
 163 ; NEON-NEXT:       [[TMP4:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLDN]], 2
 164 ; NEON-NEXT:       [[TMP5:%.*]] = inttoptr <2 x i32> [[TMP4]] to <2 x i32*>
 165 ; NEON-NEXT:       [[TMP6:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLDN]], 1
 166 ; NEON-NEXT:       [[TMP7:%.*]] = inttoptr <2 x i32> [[TMP6]] to <2 x i32*>
 167 ; NEON-NEXT:       [[TMP8:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLDN]], 0
 168 ; NEON-NEXT:       [[TMP9:%.*]] = inttoptr <2 x i32> [[TMP8]] to <2 x i32*>
 169 ; NEON-NEXT:       ret void
 170 ; NO_NEON-LABEL: @load_ptrvec_factor4(
 171 ; NO_NEON-NOT:     @llvm.arm.neon
 172 ; NO_NEON:         ret void
 173 ;
 174   %interleaved.vec = load <8 x i32*>, <8 x i32*>* %ptr, align 4
 175   %v0 = shufflevector <8 x i32*> %interleaved.vec, <8 x i32*> undef, <2 x i32> <i32 0, i32 4>
 176   %v1 = shufflevector <8 x i32*> %interleaved.vec, <8 x i32*> undef, <2 x i32> <i32 1, i32 5>
 177   %v2 = shufflevector <8 x i32*> %interleaved.vec, <8 x i32*> undef, <2 x i32> <i32 2, i32 6>
 178   %v3 = shufflevector <8 x i32*> %interleaved.vec, <8 x i32*> undef, <2 x i32> <i32 3, i32 7>
 179   ret void
 180 }
 181
 182 define void @store_ptrvec_factor2(<4 x i32*>* %ptr, <2 x i32*> %v0, <2 x i32*> %v1) {
 183 ; NEON-LABEL:    @store_ptrvec_factor2(
 184 ; NEON-NEXT:       [[TMP1:%.*]] = ptrtoint <2 x i32*> %v0 to <2 x i32>
 185 ; NEON-NEXT:       [[TMP2:%.*]] = ptrtoint <2 x i32*> %v1 to <2 x i32>
 186 ; NEON-NEXT:       [[TMP3:%.*]] = bitcast <4 x i32*>* %ptr to i8*
 187 ; NEON-NEXT:       [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> <i32 0, i32 1>
 188 ; NEON-NEXT:       [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> <i32 2, i32 3>
 189 ; NEON-NEXT:       call void @llvm.arm.neon.vst2.p0i8.v2i32(i8* [[TMP3]], <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], i32 4)
 190 ; NEON-NEXT:       ret void
 191 ; NO_NEON-LABEL: @store_ptrvec_factor2(
 192 ; NO_NEON-NOT:     @llvm.arm.neon
 193 ; NO_NEON:         ret void
 194 ;
 195   %interleaved.vec = shufflevector <2 x i32*> %v0, <2 x i32*> %v1, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
 196   store <4 x i32*> %interleaved.vec, <4 x i32*>* %ptr, align 4
 197   ret void
 198 }
 199
 200 define void @store_ptrvec_factor3(<6 x i32*>* %ptr, <2 x i32*> %v0, <2 x i32*> %v1, <2 x i32*> %v2) {
 201 ; NEON-LABEL:    @store_ptrvec_factor3(
 202 ; NEON:            [[TMP1:%.*]] = ptrtoint <4 x i32*> %s0 to <4 x i32>
 203 ; NEON-NEXT:       [[TMP2:%.*]] = ptrtoint <4 x i32*> %s1 to <4 x i32>
 204 ; NEON-NEXT:       [[TMP3:%.*]] = bitcast <6 x i32*>* %ptr to i8*
 205 ; NEON-NEXT:       [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <2 x i32> <i32 0, i32 1>
 206 ; NEON-NEXT:       [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <2 x i32> <i32 2, i32 3>
 207 ; NEON-NEXT:       [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <2 x i32> <i32 4, i32 5>
 208 ; NEON-NEXT:       call void @llvm.arm.neon.vst3.p0i8.v2i32(i8* [[TMP3]], <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], <2 x i32> [[TMP6]], i32 4)
 209 ; NEON-NEXT:       ret void
 210 ; NO_NEON-LABEL: @store_ptrvec_factor3(
 211 ; NO_NEON-NOT:     @llvm.arm.neon
 212 ; NO_NEON:         ret void
 213 ;
 214   %s0 = shufflevector <2 x i32*> %v0, <2 x i32*> %v1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 215   %s1 = shufflevector <2 x i32*> %v2, <2 x i32*> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
 216   %interleaved.vec = shufflevector <4 x i32*> %s0, <4 x i32*> %s1, <6 x i32> <i32 0, i32 2, i32 4, i32 1, i32 3, i32 5>
 217   store <6 x i32*> %interleaved.vec, <6 x i32*>* %ptr, align 4
 218   ret void
 219 }
 220
 221 define void @store_ptrvec_factor4(<8 x i32*>* %ptr, <2 x i32*> %v0, <2 x i32*> %v1, <2 x i32*> %v2, <2 x i32*> %v3) {
 222 ; NEON-LABEL:    @store_ptrvec_factor4(
 223 ; NEON:            [[TMP1:%.*]] = ptrtoint <4 x i32*> %s0 to <4 x i32>
 224 ; NEON-NEXT:       [[TMP2:%.*]] = ptrtoint <4 x i32*> %s1 to <4 x i32>
 225 ; NEON-NEXT:       [[TMP3:%.*]] = bitcast <8 x i32*>* %ptr to i8*
 226 ; NEON-NEXT:       [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <2 x i32> <i32 0, i32 1>
 227 ; NEON-NEXT:       [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <2 x i32> <i32 2, i32 3>
 228 ; NEON-NEXT:       [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <2 x i32> <i32 4, i32 5>
 229 ; NEON-NEXT:       [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <2 x i32> <i32 6, i32 7>
 230 ; NEON-NEXT:       call void @llvm.arm.neon.vst4.p0i8.v2i32(i8* [[TMP3]], <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], <2 x i32> [[TMP6]], <2 x i32> [[TMP7]], i32 4)
 231 ; NEON-NEXT:       ret void
 232 ; NO_NEON-LABEL: @store_ptrvec_factor4(
 233 ; NO_NEON-NOT:     @llvm.arm.neon
 234 ; NO_NEON:         ret void
 235 ;
 236   %s0 = shufflevector <2 x i32*> %v0, <2 x i32*> %v1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 237   %s1 = shufflevector <2 x i32*> %v2, <2 x i32*> %v3, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 238   %interleaved.vec = shufflevector <4 x i32*> %s0, <4 x i32*> %s1, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7>
 239   store <8 x i32*> %interleaved.vec, <8 x i32*>* %ptr, align 4
 240   ret void
 241 }
 242
 243 define void @load_undef_mask_factor2(<8 x i32>* %ptr) {
 244 ; NEON-LABEL:    @load_undef_mask_factor2(
 245 ; NEON-NEXT:       [[TMP1:%.*]] = bitcast <8 x i32>* %ptr to i8*
 246 ; NEON-NEXT:       [[VLDN:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2.v4i32.p0i8(i8* [[TMP1]], i32 4)
 247 ; NEON-NEXT:       [[TMP2:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLDN]], 1
 248 ; NEON-NEXT:       [[TMP3:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLDN]], 0
 249 ; NEON-NEXT:       ret void
 250 ; NO_NEON-LABEL: @load_undef_mask_factor2(
 251 ; NO_NEON-NOT:     @llvm.arm.neon
 252 ; NO_NEON:         ret void
 253 ;
 254   %interleaved.vec = load <8 x i32>, <8 x i32>* %ptr, align 4
 255   %v0 = shufflevector <8 x i32> %interleaved.vec, <8 x i32> undef, <4 x i32> <i32 undef, i32 2, i32 undef, i32 6>
 256   %v1 = shufflevector <8 x i32> %interleaved.vec, <8 x i32> undef, <4 x i32> <i32 undef, i32 3, i32 undef, i32 7>
 257   ret void
 258 }
 259
 260 define void @load_undef_mask_factor3(<12 x i32>* %ptr) {
 261 ; NEON-LABEL:    @load_undef_mask_factor3(
 262 ; NEON-NEXT:       [[TMP1:%.*]] = bitcast <12 x i32>* %ptr to i8*
 263 ; NEON-NEXT:       [[VLDN:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld3.v4i32.p0i8(i8* [[TMP1]], i32 4)
 264 ; NEON-NEXT:       [[TMP2:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN]], 2
 265 ; NEON-NEXT:       [[TMP3:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN]], 1
 266 ; NEON-NEXT:       [[TMP4:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN]], 0
 267 ; NEON-NEXT:       ret void
 268 ; NO_NEON-LABEL: @load_undef_mask_factor3(
 269 ; NO_NEON-NOT:     @llvm.arm.neon
 270 ; NO_NEON:         ret void
 271 ;
 272   %interleaved.vec = load <12 x i32>, <12 x i32>* %ptr, align 4
 273   %v0 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> undef, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
 274   %v1 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> undef, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
 275   %v2 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> undef, <4 x i32> <i32 2, i32 undef, i32 undef, i32 undef>
 276   ret void
 277 }
 278
 279 define void @load_undef_mask_factor4(<16 x i32>* %ptr) {
 280 ; NEON-LABEL:    @load_undef_mask_factor4(
 281 ; NEON-NEXT:       [[TMP1:%.*]] = bitcast <16 x i32>* %ptr to i8*
 282 ; NEON-NEXT:       [[VLDN:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld4.v4i32.p0i8(i8* [[TMP1]], i32 4)
 283 ; NEON-NEXT:       [[TMP2:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN]], 3
 284 ; NEON-NEXT:       [[TMP3:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN]], 2
 285 ; NEON-NEXT:       [[TMP4:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN]], 1
 286 ; NEON-NEXT:       [[TMP5:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN]], 0
 287 ; NEON-NEXT:       ret void
 288 ; NO_NEON-LABEL: @load_undef_mask_factor4(
 289 ; NO_NEON-NOT:     @llvm.arm.neon
 290 ; NO_NEON:         ret void
 291 ;
 292   %interleaved.vec = load <16 x i32>, <16 x i32>* %ptr, align 4
 293   %v0 = shufflevector <16 x i32> %interleaved.vec, <16 x i32> undef, <4 x i32> <i32 0, i32 4, i32 undef, i32 undef>
 294   %v1 = shufflevector <16 x i32> %interleaved.vec, <16 x i32> undef, <4 x i32> <i32 1, i32 5, i32 undef, i32 undef>
 295   %v2 = shufflevector <16 x i32> %interleaved.vec, <16 x i32> undef, <4 x i32> <i32 2, i32 6, i32 undef, i32 undef>
 296   %v3 = shufflevector <16 x i32> %interleaved.vec, <16 x i32> undef, <4 x i32> <i32 3, i32 7, i32 undef, i32 undef>
 297   ret void
 298 }
 299
 300 define void @store_undef_mask_factor2(<8 x i32>* %ptr, <4 x i32> %v0, <4 x i32> %v1) {
 301 ; NEON-LABEL:    @store_undef_mask_factor2(
 302 ; NEON-NEXT:       [[TMP1:%.*]] = bitcast <8 x i32>* %ptr to i8*
 303 ; NEON-NEXT:       [[TMP2:%.*]] = shufflevector <4 x i32> %v0, <4 x i32> %v1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 304 ; NEON-NEXT:       [[TMP3:%.*]] = shufflevector <4 x i32> %v0, <4 x i32> %v1, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
 305 ; NEON-NEXT:       call void @llvm.arm.neon.vst2.p0i8.v4i32(i8* [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], i32 4)
 306 ; NEON-NEXT:       ret void
 307 ; NO_NEON-LABEL: @store_undef_mask_factor2(
 308 ; NO_NEON-NOT:     @llvm.arm.neon
 309 ; NO_NEON:         ret void
 310 ;
 311   %interleaved.vec = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 2, i32 6, i32 3, i32 7>
 312   store <8 x i32> %interleaved.vec, <8 x i32>* %ptr, align 4
 313   ret void
 314 }
 315
 316 define void @store_undef_mask_factor3(<12 x i32>* %ptr, <4 x i32> %v0, <4 x i32> %v1, <4 x i32> %v2) {
 317 ; NEON-LABEL:    @store_undef_mask_factor3(
 318 ; NEON:            [[TMP1:%.*]] = bitcast <12 x i32>* %ptr to i8*
 319 ; NEON-NEXT:       [[TMP2:%.*]] = shufflevector <8 x i32> %s0, <8 x i32> %s1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 320 ; NEON-NEXT:       [[TMP3:%.*]] = shufflevector <8 x i32> %s0, <8 x i32> %s1, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
 321 ; NEON-NEXT:       [[TMP4:%.*]] = shufflevector <8 x i32> %s0, <8 x i32> %s1, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
 322 ; NEON-NEXT:       call void @llvm.arm.neon.vst3.p0i8.v4i32(i8* [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], i32 4)
 323 ; NEON-NEXT:       ret void
 324 ; NO_NEON-LABEL: @store_undef_mask_factor3(
 325 ; NO_NEON-NOT:     @llvm.arm.neon
 326 ; NO_NEON:         ret void
 327 ;
 328   %s0 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 329   %s1 = shufflevector <4 x i32> %v2, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
 330   %interleaved.vec = shufflevector <8 x i32> %s0, <8 x i32> %s1, <12 x i32> <i32 0, i32 4, i32 undef, i32 1, i32 undef, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
 331   store <12 x i32> %interleaved.vec, <12 x i32>* %ptr, align 4
 332   ret void
 333 }
 334
 335 define void @store_undef_mask_factor4(<16 x i32>* %ptr, <4 x i32> %v0, <4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3) {
 336 ; NEON-LABEL:    @store_undef_mask_factor4(
 337 ; NEON:            [[TMP1:%.*]] = bitcast <16 x i32>* %ptr to i8*
 338 ; NEON-NEXT:       [[TMP2:%.*]] = shufflevector <8 x i32> %s0, <8 x i32> %s1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 339 ; NEON-NEXT:       [[TMP3:%.*]] = shufflevector <8 x i32> %s0, <8 x i32> %s1, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
 340 ; NEON-NEXT:       [[TMP4:%.*]] = shufflevector <8 x i32> %s0, <8 x i32> %s1, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
 341 ; NEON-NEXT:       [[TMP5:%.*]] = shufflevector <8 x i32> %s0, <8 x i32> %s1, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
 342 ; NEON-NEXT:       call void @llvm.arm.neon.vst4.p0i8.v4i32(i8* [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], i32 4)
 343 ; NEON-NEXT:       ret void
 344 ; NO_NEON-LABEL: @store_undef_mask_factor4(
 345 ; NO_NEON-NOT:     @llvm.arm.neon
 346 ; NO_NEON:         ret void
 347 ;
 348   %s0 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 349   %s1 = shufflevector <4 x i32> %v2, <4 x i32> %v3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 350   %interleaved.vec = shufflevector <8 x i32> %s0, <8 x i32> %s1, <16 x i32> <i32 0, i32 4, i32 8, i32 undef, i32 undef, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
 351   store <16 x i32> %interleaved.vec, <16 x i32>* %ptr, align 4
 352   ret void
 353 }
 354
 355 define void @load_address_space(<4 x i32> addrspace(1)* %ptr) {
 356 ; NEON-LABEL:    @load_address_space(
 357 ; NEON-NEXT:       [[TMP1:%.*]] = bitcast <4 x i32> addrspace(1)* %ptr to i8 addrspace(1)*
 358 ; NEON-NEXT:       [[VLDN:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld3.v2i32.p1i8(i8 addrspace(1)* [[TMP1]], i32 0)
 359 ; NEON-NEXT:       [[TMP2:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[VLDN]], 2
 360 ; NEON-NEXT:       [[TMP3:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[VLDN]], 1
 361 ; NEON-NEXT:       [[TMP4:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[VLDN]], 0
 362 ; NEON-NEXT:       ret void
 363 ; NO_NEON-LABEL: @load_address_space(
 364 ; NO_NEON-NOT:     @llvm.arm.neon
 365 ; NO_NEON:         ret void
 366 ;
 367   %interleaved.vec = load <4 x i32>, <4 x i32> addrspace(1)* %ptr
 368   %v0 = shufflevector <4 x i32> %interleaved.vec, <4 x i32> undef, <2 x i32> <i32 0, i32 3>
 369   %v1 = shufflevector <4 x i32> %interleaved.vec, <4 x i32> undef, <2 x i32> <i32 1, i32 4>
 370   %v2 = shufflevector <4 x i32> %interleaved.vec, <4 x i32> undef, <2 x i32> <i32 2, i32 5>
 371   ret void
 372 }
 373
 374 define void @store_address_space(<4 x i32> addrspace(1)* %ptr, <2 x i32> %v0, <2 x i32> %v1) {
 375 ; NEON-LABEL:    @store_address_space(
 376 ; NEON-NEXT:       [[TMP1:%.*]] = bitcast <4 x i32> addrspace(1)* %ptr to i8 addrspace(1)*
 377 ; NEON-NEXT:       [[TMP2:%.*]] = shufflevector <2 x i32> %v0, <2 x i32> %v1, <2 x i32> <i32 0, i32 1>
 378 ; NEON-NEXT:       [[TMP3:%.*]] = shufflevector <2 x i32> %v0, <2 x i32> %v1, <2 x i32> <i32 2, i32 3>
 379 ; NEON-NEXT:       call void @llvm.arm.neon.vst2.p1i8.v2i32(i8 addrspace(1)* [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> [[TMP3]], i32 0)
 380 ; NEON-NEXT:       ret void
 381 ; NO_NEON-LABEL: @store_address_space(
 382 ; NO_NEON-NOT:     @llvm.arm.neon
 383 ; NO_NEON:         ret void
 384 ;
 385   %interleaved.vec = shufflevector <2 x i32> %v0, <2 x i32> %v1, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
 386   store <4 x i32> %interleaved.vec, <4 x i32> addrspace(1)* %ptr
 387   ret void
 388 }
 389
 390 define void @load_f16_factor2(<8 x half>* %ptr) {
 391 ; ALL-LABEL: @load_f16_factor2(
 392 ; ALL-NOT:     @llvm.arm.neon
 393 ; ALL:         ret void
 394 ;
 395   %interleaved.vec = load <8 x half>, <8 x half>* %ptr, align 4
 396   %v0 = shufflevector <8 x half> %interleaved.vec, <8 x half> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
 397   %v1 = shufflevector <8 x half> %interleaved.vec, <8 x half> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
 398   ret void
 399 }
 400
 401 define void @store_f16_factor2(<8 x half>* %ptr, <4 x half> %v0, <4 x half> %v1) {
 402 ; ALL-LABEL: @store_f16_factor2(
 403 ; ALL-NOT:     @llvm.arm.neon
 404 ; ALL:         ret void
 405 ;
 406   %interleaved.vec = shufflevector <4 x half> %v0, <4 x half> %v1, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
 407   store <8 x half> %interleaved.vec, <8 x half>* %ptr, align 4
 408   ret void
 409 }
 410
 411 define void @load_illegal_factor2(<3 x float>* %ptr) nounwind {
 412 ; ALL-LABEL:    @load_illegal_factor2(
 413 ; ALL-NOT:        @llvm.arm.neon
 414 ; ALL:            ret void
 415 ;
 416   %interleaved.vec = load <3 x float>, <3 x float>* %ptr, align 16
 417   %v0 = shufflevector <3 x float> %interleaved.vec, <3 x float> undef, <3 x i32> <i32 0, i32 2, i32 undef>
 418   ret void
 419 }
 420
 421 define void @store_illegal_factor2(<3 x float>* %ptr, <3 x float> %v0) nounwind {
 422 ; ALL-LABEL: @store_illegal_factor2(
 423 ; ALL-NOT:     @llvm.arm.neon
 424 ; ALL:         ret void
 425 ;
 426   %interleaved.vec = shufflevector <3 x float> %v0, <3 x float> undef, <3 x i32> <i32 0, i32 2, i32 undef>
 427   store <3 x float> %interleaved.vec, <3 x float>* %ptr, align 16
 428   ret void
 429 }
 430
 431 define void @store_general_mask_factor4(<8 x i32>* %ptr, <32 x i32> %v0, <32 x i32> %v1) {
 432 ; NEON-LABEL:    @store_general_mask_factor4(
 433 ; NEON-NEXT:       [[TMP1:%.*]] = bitcast <8 x i32>* %ptr to i8*
 434 ; NEON-NEXT:       [[TMP2:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <2 x i32> <i32 4, i32 5>
 435 ; NEON-NEXT:       [[TMP3:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <2 x i32> <i32 16, i32 17>
 436 ; NEON-NEXT:       [[TMP4:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <2 x i32> <i32 32, i32 33>
 437 ; NEON-NEXT:       [[TMP5:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <2 x i32> <i32 8, i32 9>
 438 ; NEON-NEXT:       call void @llvm.arm.neon.vst4.p0i8.v2i32(i8* [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> [[TMP3]], <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], i32 4)
 439 ; NEON-NEXT:       ret void
 440 ; NO_NEON-LABEL: @store_general_mask_factor4(
 441 ; NO_NEON-NOT:     @llvm.arm.neon
 442 ; NO_NEON:         ret void
 443 ;
 444   %interleaved.vec = shufflevector <32 x i32> %v0, <32 x i32> %v1, <8 x i32> <i32 4, i32 16, i32 32, i32 8, i32 5, i32 17, i32 33, i32 9>
 445   store <8 x i32> %interleaved.vec, <8 x i32>* %ptr, align 4
 446   ret void
 447 }
 448
 449 define void @store_general_mask_factor4_undefbeg(<8 x i32>* %ptr, <32 x i32> %v0, <32 x i32> %v1) {
 450 ; NEON-LABEL:    @store_general_mask_factor4_undefbeg(
 451 ; NEON-NEXT:       [[TMP1:%.*]] = bitcast <8 x i32>* %ptr to i8*
 452 ; NEON-NEXT:       [[TMP2:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <2 x i32> <i32 4, i32 5>
 453 ; NEON-NEXT:       [[TMP3:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <2 x i32> <i32 16, i32 17>
 454 ; NEON-NEXT:       [[TMP4:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <2 x i32> <i32 32, i32 33>
 455 ; NEON-NEXT:       [[TMP5:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <2 x i32> <i32 8, i32 9>
 456 ; NEON-NEXT:       call void @llvm.arm.neon.vst4.p0i8.v2i32(i8* [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> [[TMP3]], <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], i32 4)
 457 ; NEON-NEXT:       ret void
 458 ; NO_NEON-LABEL: @store_general_mask_factor4_undefbeg(
 459 ; NO_NEON-NOT:     @llvm.arm.neon
 460 ; NO_NEON:         ret void
 461 ;
 462   %interleaved.vec = shufflevector <32 x i32> %v0, <32 x i32> %v1, <8 x i32> <i32 undef, i32 16, i32 32, i32 8, i32 5, i32 17, i32 33, i32 9>
 463   store <8 x i32> %interleaved.vec, <8 x i32>* %ptr, align 4
 464   ret void
 465 }
 466
 467 define void @store_general_mask_factor4_undefend(<8 x i32>* %ptr, <32 x i32> %v0, <32 x i32> %v1) {
 468 ; NEON-LABEL:    @store_general_mask_factor4_undefend(
 469 ; NEON-NEXT:       [[TMP1:%.*]] = bitcast <8 x i32>* %ptr to i8*
 470 ; NEON-NEXT:       [[TMP2:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <2 x i32> <i32 4, i32 5>
 471 ; NEON-NEXT:       [[TMP3:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <2 x i32> <i32 16, i32 17>
 472 ; NEON-NEXT:       [[TMP4:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <2 x i32> <i32 32, i32 33>
 473 ; NEON-NEXT:       [[TMP5:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <2 x i32> <i32 8, i32 9>
 474 ; NEON-NEXT:       call void @llvm.arm.neon.vst4.p0i8.v2i32(i8* [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> [[TMP3]], <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], i32 4)
 475 ; NEON-NEXT:       ret void
 476 ; NO_NEON-LABEL: @store_general_mask_factor4_undefend(
 477 ; NO_NEON-NOT:     @llvm.arm.neon
 478 ; NO_NEON:         ret void
 479 ;
 480   %interleaved.vec = shufflevector <32 x i32> %v0, <32 x i32> %v1, <8 x i32> <i32 4, i32 16, i32 32, i32 8, i32 5, i32 17, i32 33, i32 undef>
 481   store <8 x i32> %interleaved.vec, <8 x i32>* %ptr, align 4
 482   ret void
 483 }
 484
 485 define void @store_general_mask_factor4_undefmid(<8 x i32>* %ptr, <32 x i32> %v0, <32 x i32> %v1) {
 486 ; NEON-LABEL:    @store_general_mask_factor4_undefmid(
 487 ; NEON-NEXT:       [[TMP1:%.*]] = bitcast <8 x i32>* %ptr to i8*
 488 ; NEON-NEXT:       [[TMP2:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <2 x i32> <i32 4, i32 5>
 489 ; NEON-NEXT:       [[TMP3:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <2 x i32> <i32 16, i32 17>
 490 ; NEON-NEXT:       [[TMP4:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <2 x i32> <i32 32, i32 33>
 491 ; NEON-NEXT:       [[TMP5:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <2 x i32> <i32 8, i32 9>
 492 ; NEON-NEXT:       call void @llvm.arm.neon.vst4.p0i8.v2i32(i8* [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> [[TMP3]], <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], i32 4)
 493 ; NEON-NEXT:       ret void
 494 ; NO_NEON-LABEL: @store_general_mask_factor4_undefmid(
 495 ; NO_NEON-NOT:     @llvm.arm.neon
 496 ; NO_NEON:         ret void
 497 ;
 498   %interleaved.vec = shufflevector <32 x i32> %v0, <32 x i32> %v1, <8 x i32> <i32 4, i32 undef, i32 32, i32 8, i32 5, i32 17, i32 undef, i32 9>
 499   store <8 x i32> %interleaved.vec, <8 x i32>* %ptr, align 4
 500   ret void
 501 }
 502
 503 define void @store_general_mask_factor4_undefmulti(<8 x i32>* %ptr, <32 x i32> %v0, <32 x i32> %v1) {
 504 ; NEON-LABEL:    @store_general_mask_factor4_undefmulti(
 505 ; NEON-NEXT:       [[TMP1:%.*]] = bitcast <8 x i32>* %ptr to i8*
 506 ; NEON-NEXT:       [[TMP2:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <2 x i32> <i32 4, i32 5>
 507 ; NEON-NEXT:       [[TMP3:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <2 x i32> <i32 0, i32 1>
 508 ; NEON-NEXT:       [[TMP4:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <2 x i32> <i32 0, i32 1>
 509 ; NEON-NEXT:       [[TMP5:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <2 x i32> <i32 8, i32 9>
 510 ; NEON-NEXT:       call void @llvm.arm.neon.vst4.p0i8.v2i32(i8* [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> [[TMP3]], <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], i32 4)
 511 ; NEON-NEXT:       ret void
 512 ; NO_NEON-LABEL: @store_general_mask_factor4_undefmulti(
 513 ; NO_NEON-NOT:     @llvm.arm.neon
 514 ; NO_NEON:         ret void
 515 ;
 516   %interleaved.vec = shufflevector <32 x i32> %v0, <32 x i32> %v1, <8 x i32> <i32 4, i32 undef, i32 undef, i32 8, i32 undef, i32 undef, i32 undef, i32 9>
 517   store <8 x i32> %interleaved.vec, <8 x i32>* %ptr, align 4
 518   ret void
 519 }
 520
 521 define void @store_general_mask_factor3(<12 x i32>* %ptr, <32 x i32> %v0, <32 x i32> %v1) {
 522 ; NEON-LABEL:    @store_general_mask_factor3(
 523 ; NEON-NEXT:       [[TMP1:%.*]] = bitcast <12 x i32>* %ptr to i8*
 524 ; NEON-NEXT:       [[TMP2:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
 525 ; NEON-NEXT:       [[TMP3:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <4 x i32> <i32 32, i32 33, i32 34, i32 35>
 526 ; NEON-NEXT:       [[TMP4:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
 527 ; NEON-NEXT:       call void @llvm.arm.neon.vst3.p0i8.v4i32(i8* [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], i32 4)
 528 ; NEON-NEXT:       ret void
 529 ; NO_NEON-LABEL: @store_general_mask_factor3(
 530 ; NO_NEON-NOT:     @llvm.arm.neon
 531 ; NO_NEON:         ret void
 532 ;
 533   %interleaved.vec = shufflevector <32 x i32> %v0, <32 x i32> %v1, <12 x i32> <i32 4, i32 32, i32 16, i32 5, i32 33, i32 17, i32 6, i32 34, i32 18, i32 7, i32 35, i32 19>
 534   store <12 x i32> %interleaved.vec, <12 x i32>* %ptr, align 4
 535   ret void
 536 }
 537
 538 define void @store_general_mask_factor3_undefmultimid(<12 x i32>* %ptr, <32 x i32> %v0, <32 x i32> %v1) {
 539 ; NEON-LABEL:    @store_general_mask_factor3_undefmultimid(
 540 ; NEON-NEXT:       [[TMP1:%.*]] = bitcast <12 x i32>* %ptr to i8*
 541 ; NEON-NEXT:       [[TMP2:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
 542 ; NEON-NEXT:       [[TMP3:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <4 x i32> <i32 32, i32 33, i32 34, i32 35>
 543 ; NEON-NEXT:       [[TMP4:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
 544 ; NEON-NEXT:       call void @llvm.arm.neon.vst3.p0i8.v4i32(i8* [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], i32 4)
 545 ; NEON-NEXT:       ret void
 546 ; NO_NEON-LABEL: @store_general_mask_factor3_undefmultimid(
 547 ; NO_NEON-NOT:     @llvm.arm.neon
 548 ; NO_NEON:         ret void
 549 ;
 550   %interleaved.vec = shufflevector <32 x i32> %v0, <32 x i32> %v1, <12 x i32> <i32 4, i32 32, i32 16, i32 undef, i32 33, i32 17, i32 undef, i32 34, i32 18, i32 7, i32 35, i32 19>
 551   store <12 x i32> %interleaved.vec, <12 x i32>* %ptr, align 4
 552   ret void
 553 }
 554
 555 define void @store_general_mask_factor3_undef_fail(<12 x i32>* %ptr, <32 x i32> %v0, <32 x i32> %v1) {
 556 ; ALL-LABEL: @store_general_mask_factor3_undef_fail(
 557 ; ALL-NOT:     @llvm.arm.neon
 558 ; ALL:         ret void
 559 ;
 560   %interleaved.vec = shufflevector <32 x i32> %v0, <32 x i32> %v1, <12 x i32> <i32 4, i32 32, i32 16, i32 undef, i32 33, i32 17, i32 undef, i32 34, i32 18, i32 8, i32 35, i32 19>
 561   store <12 x i32> %interleaved.vec, <12 x i32>* %ptr, align 4
 562   ret void
 563 }
 564
 565 define void @store_general_mask_factor3_undeflane(<12 x i32>* %ptr, <32 x i32> %v0, <32 x i32> %v1) {
 566 ; NEON-LABEL:    @store_general_mask_factor3_undeflane(
 567 ; NEON-NEXT:       [[TMP1:%.*]] = bitcast <12 x i32>* %ptr to i8*
 568 ; NEON-NEXT:       [[TMP2:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 569 ; NEON-NEXT:       [[TMP3:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <4 x i32> <i32 32, i32 33, i32 34, i32 35>
 570 ; NEON-NEXT:       [[TMP4:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
 571 ; NEON-NEXT:       call void @llvm.arm.neon.vst3.p0i8.v4i32(i8* [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], i32 4)
 572 ; NEON-NEXT:       ret void
 573 ; NO_NEON-LABEL: @store_general_mask_factor3_undeflane(
 574 ; NO_NEON-NOT:     @llvm.arm.neon
 575 ; NO_NEON:         ret void
 576 ;
 577   %interleaved.vec = shufflevector <32 x i32> %v0, <32 x i32> %v1, <12 x i32> <i32 undef, i32 32, i32 16, i32 undef, i32 33, i32 17, i32 undef, i32 34, i32 18, i32 undef, i32 35, i32 19>
 578   store <12 x i32> %interleaved.vec, <12 x i32>* %ptr, align 4
 579   ret void
 580 }
 581
 582 define void @store_general_mask_factor3_endstart_fail(<12 x i32>* %ptr, <32 x i32> %v0, <32 x i32> %v1) {
 583 ; ALL-LABEL:    @store_general_mask_factor3_endstart_fail(
 584 ; ALL-NOT:        @llvm.arm.neon
 585 ; ALL:            ret void
 586 ;
 587   %interleaved.vec = shufflevector <32 x i32> %v0, <32 x i32> %v1, <12 x i32> <i32 undef, i32 32, i32 16, i32 undef, i32 33, i32 17, i32 undef, i32 34, i32 18, i32 2, i32 35, i32 19>
 588   store <12 x i32> %interleaved.vec, <12 x i32>* %ptr, align 4
 589   ret void
 590 }
 591
 592 define void @store_general_mask_factor3_endstart_pass(<12 x i32>* %ptr, <32 x i32> %v0, <32 x i32> %v1) {
 593 ; NEON-LABEL:    @store_general_mask_factor3_endstart_pass(
 594 ; NEON-NEXT:       [[TMP1:%.*]] = bitcast <12 x i32>* %ptr to i8*
 595 ; NEON-NEXT:       [[TMP2:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
 596 ; NEON-NEXT:       [[TMP3:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <4 x i32> <i32 32, i32 33, i32 34, i32 35>
 597 ; NEON-NEXT:       [[TMP4:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
 598 ; NEON-NEXT:       call void @llvm.arm.neon.vst3.p0i8.v4i32(i8* [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], i32 4)
 599 ; NEON-NEXT:       ret void
 600 ; NO_NEON-LABEL: @store_general_mask_factor3_endstart_pass(
 601 ; NO_NEON-NOT:     @llvm.arm.neon
 602 ; NO_NEON:         ret void
 603 ;
 604   %interleaved.vec = shufflevector <32 x i32> %v0, <32 x i32> %v1, <12 x i32> <i32 undef, i32 32, i32 16, i32 undef, i32 33, i32 17, i32 undef, i32 34, i32 18, i32 7, i32 35, i32 19>
 605   store <12 x i32> %interleaved.vec, <12 x i32>* %ptr, align 4
 606   ret void
 607 }
 608
 609 define void @store_general_mask_factor3_midstart_fail(<12 x i32>* %ptr, <32 x i32> %v0, <32 x i32> %v1) {
 610 ; ALL-LABEL:    @store_general_mask_factor3_midstart_fail(
 611 ; ALL-NOT:        @llvm.arm.neon
 612 ; ALL:            ret void
 613 ;
 614   %interleaved.vec = shufflevector <32 x i32> %v0, <32 x i32> %v1, <12 x i32> <i32 undef, i32 32, i32 16, i32 0, i32 33, i32 17, i32 undef, i32 34, i32 18, i32 undef, i32 35, i32 19>
 615   store <12 x i32> %interleaved.vec, <12 x i32>* %ptr, align 4
 616   ret void
 617 }
 618
 619 define void @store_general_mask_factor3_midstart_pass(<12 x i32>* %ptr, <32 x i32> %v0, <32 x i32> %v1) {
 620 ; NEON-LABEL:    @store_general_mask_factor3_midstart_pass(
 621 ; NEON-NEXT:       [[TMP1:%.*]] = bitcast <12 x i32>* %ptr to i8*
 622 ; NEON-NEXT:       [[TMP2:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 623 ; NEON-NEXT:       [[TMP3:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <4 x i32> <i32 32, i32 33, i32 34, i32 35>
 624 ; NEON-NEXT:       [[TMP4:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
 625 ; NEON-NEXT:       call void @llvm.arm.neon.vst3.p0i8.v4i32(i8* [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], i32 4)
 626 ; NEON-NEXT:       ret void
 627 ; NO_NEON-LABEL: @store_general_mask_factor3_midstart_pass(
 628 ; NO_NEON-NOT:     @llvm.arm.neon
 629 ; NO_NEON:         ret void
 630 ;
 631   %interleaved.vec = shufflevector <32 x i32> %v0, <32 x i32> %v1, <12 x i32> <i32 undef, i32 32, i32 16, i32 1, i32 33, i32 17, i32 undef, i32 34, i32 18, i32 undef, i32 35, i32 19>
 632   store <12 x i32> %interleaved.vec, <12 x i32>* %ptr, align 4
 633   ret void
 634 }
 635
 636 @g = external global <4 x float>
 637
 638 ; The following does not give a valid interleaved store
 639 ; ALL-LABEL: define void @no_interleave
 640 ; ALL-NOT: call void @llvm.arm.neon.vst2
 641 ; ALL: shufflevector
 642 ; ALL: store
 643 ; ALL: ret void
 644 define void @no_interleave(<4 x float> %a0) {
 645   %v0 = shufflevector <4 x float> %a0, <4 x float> %a0, <4 x i32> <i32 0, i32 7, i32 1, i32 undef>
 646   store <4 x float> %v0, <4 x float>* @g, align 16
 647   ret void
 648 }
 649
 650 define void @load_factor2_wide2(<16 x i32>* %ptr) {
 651 ; NEON-LABEL:    @load_factor2_wide2(
 652 ; NEON-NEXT:       [[TMP1:%.*]] = bitcast <16 x i32>* %ptr to i32*
 653 ; NEON-NEXT:       [[TMP2:%.*]] = bitcast i32* [[TMP1]] to i8*
 654 ; NEON-NEXT:       [[VLDN:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2.v4i32.p0i8(i8* [[TMP2]], i32 4)
 655 ; NEON-NEXT:       [[TMP3:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLDN]], 1
 656 ; NEON-NEXT:       [[TMP4:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLDN]], 0
 657 ; NEON-NEXT:       [[TMP5:%.*]] = getelementptr i32, i32* [[TMP1]], i32 8
 658 ; NEON-NEXT:       [[TMP6:%.*]] = bitcast i32* [[TMP5]] to i8*
 659 ; NEON-NEXT:       [[VLDN1:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2.v4i32.p0i8(i8* [[TMP6]], i32 4)
 660 ; NEON-NEXT:       [[TMP7:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLDN1]], 1
 661 ; NEON-NEXT:       [[TMP8:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLDN1]], 0
 662 ; NEON-NEXT:       [[TMP9:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP7]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 663 ; NEON-NEXT:       [[TMP10:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP8]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 664 ; NEON-NEXT:       ret void
 665 ; NO_NEON-LABEL: @load_factor2_wide2(
 666 ; NO_NEON-NOT:     @llvm.arm.neon
 667 ; NO_NEON:         ret void
 668 ;
 669   %interleaved.vec = load <16 x i32>, <16 x i32>* %ptr, align 4
 670   %v0 = shufflevector <16 x i32> %interleaved.vec, <16 x i32> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
 671   %v1 = shufflevector <16 x i32> %interleaved.vec, <16 x i32> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
 672   ret void
 673 }
 674
 675 define void @load_factor2_wide3(<24 x i32>* %ptr) {
 676 ; NEON-LABEL:    @load_factor2_wide3(
 677 ; NEON-NEXT:       [[TMP1:%.*]] = bitcast <24 x i32>* [[PTR:%.*]] to i32*
 678 ; NEON-NEXT:       [[TMP2:%.*]] = bitcast i32* [[TMP1]] to i8*
 679 ; NEON-NEXT:       [[VLDN:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2.v4i32.p0i8(i8* [[TMP2]], i32 4)
 680 ; NEON-NEXT:       [[TMP3:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLDN]], 1
 681 ; NEON-NEXT:       [[TMP4:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLDN]], 0
 682 ; NEON-NEXT:       [[TMP5:%.*]] = getelementptr i32, i32* [[TMP1]], i32 8
 683 ; NEON-NEXT:       [[TMP6:%.*]] = bitcast i32* [[TMP5]] to i8*
 684 ; NEON-NEXT:       [[VLDN1:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2.v4i32.p0i8(i8* [[TMP6]], i32 4)
 685 ; NEON-NEXT:       [[TMP7:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLDN1]], 1
 686 ; NEON-NEXT:       [[TMP8:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLDN1]], 0
 687 ; NEON-NEXT:       [[TMP9:%.*]] = getelementptr i32, i32* [[TMP5]], i32 8
 688 ; NEON-NEXT:       [[TMP10:%.*]] = bitcast i32* [[TMP9]] to i8*
 689 ; NEON-NEXT:       [[VLDN2:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2.v4i32.p0i8(i8* [[TMP10]], i32 4)
 690 ; NEON-NEXT:       [[TMP11:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLDN2]], 1
 691 ; NEON-NEXT:       [[TMP12:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLDN2]], 0
 692 ; NEON-NEXT:       [[TMP13:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP7]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 693 ; NEON-NEXT:       [[TMP14:%.*]] = shufflevector <4 x i32> [[TMP11]], <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
 694 ; NEON-NEXT:       [[TMP15:%.*]] = shufflevector <8 x i32> [[TMP13]], <8 x i32> [[TMP14]], <12 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
 695 ; NEON-NEXT:       [[TMP16:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP8]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 696 ; NEON-NEXT:       [[TMP17:%.*]] = shufflevector <4 x i32> [[TMP12]], <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
 697 ; NEON-NEXT:       [[TMP18:%.*]] = shufflevector <8 x i32> [[TMP16]], <8 x i32> [[TMP17]], <12 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
 698 ; NEON-NEXT:       ret void
 699 ; NO_NEON-LABEL: @load_factor2_wide3(
 700 ; NO_NEON-NOT:     @llvm.arm.neon
 701 ; NO_NEON:         ret void
 702 ;
 703   %interleaved.vec = load <24 x i32>, <24 x i32>* %ptr, align 4
 704   %v0 = shufflevector <24 x i32> %interleaved.vec, <24 x i32> undef, <12 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22>
 705   %v1 = shufflevector <24 x i32> %interleaved.vec, <24 x i32> undef, <12 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23>
 706   ret void
 707 }
 708
 709 define void @load_factor3_wide(<24 x i32>* %ptr) {
 710 ; NEON-LABEL:    @load_factor3_wide(
 711 ; NEON-NEXT:       [[TMP1:%.*]] = bitcast <24 x i32>* %ptr to i32*
 712 ; NEON-NEXT:       [[TMP2:%.*]] = bitcast i32* [[TMP1]] to i8*
 713 ; NEON-NEXT:       [[VLDN:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld3.v4i32.p0i8(i8* [[TMP2]], i32 4)
 714 ; NEON-NEXT:       [[TMP3:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN]], 2
 715 ; NEON-NEXT:       [[TMP4:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN]], 1
 716 ; NEON-NEXT:       [[TMP5:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN]], 0
 717 ; NEON-NEXT:       [[TMP6:%.*]] = getelementptr i32, i32* [[TMP1]], i32 12
 718 ; NEON-NEXT:       [[TMP7:%.*]] = bitcast i32* [[TMP6]] to i8*
 719 ; NEON-NEXT:       [[VLDN1:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld3.v4i32.p0i8(i8* [[TMP7]], i32 4)
 720 ; NEON-NEXT:       [[TMP8:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN1]], 2
 721 ; NEON-NEXT:       [[TMP9:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN1]], 1
 722 ; NEON-NEXT:       [[TMP10:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN1]], 0
 723 ; NEON-NEXT:       [[TMP11:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP8]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 724 ; NEON-NEXT:       [[TMP12:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP9]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 725 ; NEON-NEXT:       [[TMP13:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> [[TMP10]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 726 ; NEON-NEXT:       ret void
 727 ; NO_NEON-LABEL: @load_factor3_wide(
 728 ; NO_NEON-NOT:     @llvm.arm.neon
 729 ; NO_NEON:         ret void
 730 ;
 731   %interleaved.vec = load <24 x i32>, <24 x i32>* %ptr, align 4
 732   %v0 = shufflevector <24 x i32> %interleaved.vec, <24 x i32> undef, <8 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21>
 733   %v1 = shufflevector <24 x i32> %interleaved.vec, <24 x i32> undef, <8 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22>
 734   %v2 = shufflevector <24 x i32> %interleaved.vec, <24 x i32> undef, <8 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23>
 735   ret void
 736 }
 737
 738 define void @load_factor4_wide(<32 x i32>* %ptr) {
 739 ; NEON-LABEL:    @load_factor4_wide(
 740 ; NEON-NEXT:       [[TMP1:%.*]] = bitcast <32 x i32>* %ptr to i32*
 741 ; NEON-NEXT:       [[TMP2:%.*]] = bitcast i32* [[TMP1]] to i8*
 742 ; NEON-NEXT:       [[VLDN:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld4.v4i32.p0i8(i8* [[TMP2]], i32 4)
 743 ; NEON-NEXT:       [[TMP3:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN]], 3
 744 ; NEON-NEXT:       [[TMP4:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN]], 2
 745 ; NEON-NEXT:       [[TMP5:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN]], 1
 746 ; NEON-NEXT:       [[TMP6:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN]], 0
 747 ; NEON-NEXT:       [[TMP7:%.*]] = getelementptr i32, i32* [[TMP1]], i32 16
 748 ; NEON-NEXT:       [[TMP8:%.*]] = bitcast i32* [[TMP7]] to i8*
 749 ; NEON-NEXT:       [[VLDN1:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld4.v4i32.p0i8(i8* [[TMP8]], i32 4)
 750 ; NEON-NEXT:       [[TMP9:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN1]], 3
 751 ; NEON-NEXT:       [[TMP10:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN1]], 2
 752 ; NEON-NEXT:       [[TMP11:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN1]], 1
 753 ; NEON-NEXT:       [[TMP12:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN1]], 0
 754 ; NEON-NEXT:       [[TMP13:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP9]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 755 ; NEON-NEXT:       [[TMP14:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP10]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 756 ; NEON-NEXT:       [[TMP15:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> [[TMP11]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 757 ; NEON-NEXT:       [[TMP16:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> [[TMP12]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 758 ; NEON-NEXT:       ret void
 759 ; NO_NEON-LABEL: @load_factor4_wide(
 760 ; NO_NEON-NOT:     @llvm.arm.neon
 761 ; NO_NEON:         ret void
 762 ;
 763   %interleaved.vec = load <32 x i32>, <32 x i32>* %ptr, align 4
 764   %v0 = shufflevector <32 x i32> %interleaved.vec, <32 x i32> undef, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28>
 765   %v1 = shufflevector <32 x i32> %interleaved.vec, <32 x i32> undef, <8 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29>
 766   %v2 = shufflevector <32 x i32> %interleaved.vec, <32 x i32> undef, <8 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30>
 767   %v3 = shufflevector <32 x i32> %interleaved.vec, <32 x i32> undef, <8 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31>
 768   ret void
 769 }
 770
 771 define void @store_factor2_wide(<16 x i32>* %ptr, <8 x i32> %v0, <8 x i32> %v1) {
 772 ; NEON-LABEL:    @store_factor2_wide(
 773 ; NEON-NEXT:       [[TMP1:%.*]] = bitcast <16 x i32>* %ptr to i32*
 774 ; NEON-NEXT:       [[TMP2:%.*]] = bitcast i32* [[TMP1]] to i8*
 775 ; NEON-NEXT:       [[TMP3:%.*]] = shufflevector <8 x i32> %v0, <8 x i32> %v1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 776 ; NEON-NEXT:       [[TMP4:%.*]] = shufflevector <8 x i32> %v0, <8 x i32> %v1, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
 777 ; NEON-NEXT:       call void @llvm.arm.neon.vst2.p0i8.v4i32(i8* [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], i32 4)
 778 ; NEON-NEXT:       [[TMP5:%.*]] = getelementptr i32, i32* [[TMP1]], i32 8
 779 ; NEON-NEXT:       [[TMP6:%.*]] = bitcast i32* [[TMP5]] to i8*
 780 ; NEON-NEXT:       [[TMP7:%.*]] = shufflevector <8 x i32> %v0, <8 x i32> %v1, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
 781 ; NEON-NEXT:       [[TMP8:%.*]] = shufflevector <8 x i32> %v0, <8 x i32> %v1, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
 782 ; NEON-NEXT:       call void @llvm.arm.neon.vst2.p0i8.v4i32(i8* [[TMP6]], <4 x i32> [[TMP7]], <4 x i32> [[TMP8]], i32 4)
 783 ; NEON-NEXT:       ret void
 784 ; NO_NEON-LABEL: @store_factor2_wide(
 785 ; NO_NEON-NOT:     @llvm.arm.neon
 786 ; NO_NEON:         ret void
 787 ;
 788   %interleaved.vec = shufflevector <8 x i32> %v0, <8 x i32> %v1, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
 789   store <16 x i32> %interleaved.vec, <16 x i32>* %ptr, align 4
 790   ret void
 791 }
 792
 793 define void @store_factor3_wide(<24 x i32>* %ptr, <8 x i32> %v0, <8 x i32> %v1, <8 x i32> %v2) {
 794 ; NEON-LABEL:    @store_factor3_wide(
 795 ; NEON:            [[TMP1:%.*]] = bitcast <24 x i32>* %ptr to i32*
 796 ; NEON-NEXT:       [[TMP2:%.*]] = bitcast i32* [[TMP1]] to i8*
 797 ; NEON-NEXT:       [[TMP3:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 798 ; NEON-NEXT:       [[TMP4:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
 799 ; NEON-NEXT:       [[TMP5:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
 800 ; NEON-NEXT:       call void @llvm.arm.neon.vst3.p0i8.v4i32(i8* [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], i32 4)
 801 ; NEON-NEXT:       [[TMP6:%.*]] = getelementptr i32, i32* [[TMP1]], i32 12
 802 ; NEON-NEXT:       [[TMP7:%.*]] = bitcast i32* [[TMP6]] to i8*
 803 ; NEON-NEXT:       [[TMP8:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
 804 ; NEON-NEXT:       [[TMP9:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
 805 ; NEON-NEXT:       [[TMP10:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> <i32 20, i32 21, i32 22, i32 23>
 806 ; NEON-NEXT:       call void @llvm.arm.neon.vst3.p0i8.v4i32(i8* [[TMP7]], <4 x i32> [[TMP8]], <4 x i32> [[TMP9]], <4 x i32> [[TMP10]], i32 4)
 807 ; NEON-NEXT:       ret void
 808 ; NO_NEON-LABEL: @store_factor3_wide(
 809 ; NO_NEON-NOT:     @llvm.arm.neon
 810 ; NO_NEON:         ret void
 811 ;
 812   %s0 = shufflevector <8 x i32> %v0, <8 x i32> %v1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 813   %s1 = shufflevector <8 x i32> %v2, <8 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
 814   %interleaved.vec = shufflevector <16 x i32> %s0, <16 x i32> %s1, <24 x i32> <i32 0, i32 8, i32 16, i32 1, i32 9, i32 17, i32 2, i32 10, i32 18, i32 3, i32 11, i32 19, i32 4, i32 12, i32 20, i32 5, i32 13, i32 21, i32 6, i32 14, i32 22, i32 7, i32 15, i32 23>
 815   store <24 x i32> %interleaved.vec, <24 x i32>* %ptr, align 4
 816   ret void
 817 }
 818
 819 define void @store_factor4_wide(<32 x i32>* %ptr, <8 x i32> %v0, <8 x i32> %v1, <8 x i32> %v2, <8 x i32> %v3) {
 820 ; NEON-LABEL:    @store_factor4_wide(
 821 ; NEON:            [[TMP1:%.*]] = bitcast <32 x i32>* %ptr to i32*
 822 ; NEON-NEXT:       [[TMP2:%.*]] = bitcast i32* [[TMP1]] to i8*
 823 ; NEON-NEXT:       [[TMP3:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 824 ; NEON-NEXT:       [[TMP4:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
 825 ; NEON-NEXT:       [[TMP5:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
 826 ; NEON-NEXT:       [[TMP6:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> <i32 24, i32 25, i32 26, i32 27>
 827 ; NEON-NEXT:       call void @llvm.arm.neon.vst4.p0i8.v4i32(i8* [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32> [[TMP6]], i32 4)
 828 ; NEON-NEXT:       [[TMP7:%.*]] = getelementptr i32, i32* [[TMP1]], i32 16
 829 ; NEON-NEXT:       [[TMP8:%.*]] = bitcast i32* [[TMP7]] to i8*
 830 ; NEON-NEXT:       [[TMP9:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
 831 ; NEON-NEXT:       [[TMP10:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
 832 ; NEON-NEXT:       [[TMP11:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> <i32 20, i32 21, i32 22, i32 23>
 833 ; NEON-NEXT:       [[TMP12:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> <i32 28, i32 29, i32 30, i32 31>
 834 ; NEON-NEXT:       call void @llvm.arm.neon.vst4.p0i8.v4i32(i8* [[TMP8]], <4 x i32> [[TMP9]], <4 x i32> [[TMP10]], <4 x i32> [[TMP11]], <4 x i32> [[TMP12]], i32 4)
 835 ; NEON-NEXT:       ret void
 836 ; NO_NEON-LABEL: @store_factor4_wide(
 837 ; NO_NEON-NOT:     @llvm.arm.neon
 838 ; NO_NEON:         ret void
 839 ;
 840   %s0 = shufflevector <8 x i32> %v0, <8 x i32> %v1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 841   %s1 = shufflevector <8 x i32> %v2, <8 x i32> %v3, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 842   %interleaved.vec = shufflevector <16 x i32> %s0, <16 x i32> %s1, <32 x i32> <i32 0, i32 8, i32 16, i32 24, i32 1, i32 9, i32 17, i32 25, i32 2, i32 10, i32 18, i32 26, i32 3, i32 11, i32 19, i32 27, i32 4, i32 12, i32 20, i32 28, i32 5, i32 13, i32 21, i32 29, i32 6, i32 14, i32 22, i32 30, i32 7, i32 15, i32 23, i32 31>
 843   store <32 x i32> %interleaved.vec, <32 x i32>* %ptr, align 4
 844   ret void
 845 }
 846
 847 define void @load_factor2_fp128(<4 x fp128>* %ptr) {
 848 ; ALL-LABEL: @load_factor2_fp128(
 849 ; ALL-NOT:     @llvm.arm.neon
 850 ; ALL:         ret void
 851 ;
 852   %interleaved.vec = load <4 x fp128>, <4 x fp128>* %ptr, align 16
 853   %v0 = shufflevector <4 x fp128> %interleaved.vec, <4 x fp128> undef, <2 x i32> <i32 0, i32 2>
 854   %v1 = shufflevector <4 x fp128> %interleaved.vec, <4 x fp128> undef, <2 x i32> <i32 1, i32 3>
 855   ret void
 856 }
 857
 858 define void @load_factor2_wide_pointer(<16 x i32*>* %ptr) {
 859 ; NEON-LABEL:    @load_factor2_wide_pointer(
 860 ; NEON-NEXT:       [[TMP1:%.*]] = bitcast <16 x i32*>* %ptr to i32*
 861 ; NEON-NEXT:       [[TMP2:%.*]] = bitcast i32* [[TMP1]] to i8*
 862 ; NEON-NEXT:       [[VLDN:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2.v4i32.p0i8(i8* [[TMP2]], i32 4)
 863 ; NEON-NEXT:       [[TMP3:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLDN]], 1
 864 ; NEON-NEXT:       [[TMP4:%.*]] = inttoptr <4 x i32> [[TMP3]] to <4 x i32*>
 865 ; NEON-NEXT:       [[TMP5:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLDN]], 0
 866 ; NEON-NEXT:       [[TMP6:%.*]] = inttoptr <4 x i32> [[TMP5]] to <4 x i32*>
 867 ; NEON-NEXT:       [[TMP7:%.*]] = getelementptr i32, i32* [[TMP1]], i32 8
 868 ; NEON-NEXT:       [[TMP8:%.*]] = bitcast i32* [[TMP7]] to i8*
 869 ; NEON-NEXT:       [[VLDN1:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2.v4i32.p0i8(i8* [[TMP8]], i32 4)
 870 ; NEON-NEXT:       [[TMP9:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLDN1]], 1
 871 ; NEON-NEXT:       [[TMP10:%.*]] = inttoptr <4 x i32> [[TMP9]] to <4 x i32*>
 872 ; NEON-NEXT:       [[TMP11:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLDN1]], 0
 873 ; NEON-NEXT:       [[TMP12:%.*]] = inttoptr <4 x i32> [[TMP11]] to <4 x i32*>
 874 ; NEON-NEXT:       [[TMP13:%.*]] = shufflevector <4 x i32*> [[TMP4]], <4 x i32*> [[TMP10]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 875 ; NEON-NEXT:       [[TMP14:%.*]] = shufflevector <4 x i32*> [[TMP6]], <4 x i32*> [[TMP12]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 876 ; NEON-NEXT:       ret void
 877 ; NO_NEON-LABEL: @load_factor2_wide_pointer(
 878 ; NO_NEON-NOT:     @llvm.arm.neon
 879 ; NO_NEON:         ret void
 880 ;
 881   %interleaved.vec = load <16 x i32*>, <16 x i32*>* %ptr, align 4
 882   %v0 = shufflevector <16 x i32*> %interleaved.vec, <16 x i32*> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
 883   %v1 = shufflevector <16 x i32*> %interleaved.vec, <16 x i32*> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
 884   ret void
 885 }