test/Transforms/InterleavedAccess/AArch64/interleaved-accesses.ll

   1 ; RUN: opt < %s -interleaved-access -S | FileCheck %s -check-prefix=NEON
   2 ; RUN: opt < %s -mattr=-neon -interleaved-access -S | FileCheck %s -check-prefix=NO_NEON
   3
   4 target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
   5 target triple = "aarch64--linux-gnu"
   6
   7 define void @load_factor2(<16 x i8>* %ptr) {
   8 ; NEON-LABEL:    @load_factor2(
   9 ; NEON-NEXT:       [[TMP1:%.*]] = bitcast <16 x i8>* %ptr to <8 x i8>*
  10 ; NEON-NEXT:       [[LDN:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2.v8i8.p0v8i8(<8 x i8>* [[TMP1]])
  11 ; NEON-NEXT:       [[TMP2:%.*]] = extractvalue { <8 x i8>, <8 x i8> } [[LDN]], 1
  12 ; NEON-NEXT:       [[TMP3:%.*]] = extractvalue { <8 x i8>, <8 x i8> } [[LDN]], 0
  13 ; NEON-NEXT:       ret void
  14 ; NO_NEON-LABEL: @load_factor2(
  15 ; NO_NEON-NOT:     @llvm.aarch64.neon
  16 ; NO_NEON:         ret void
  17 ;
  18   %interleaved.vec = load <16 x i8>, <16 x i8>* %ptr, align 4
  19   %v0 = shufflevector <16 x i8> %interleaved.vec, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
  20   %v1 = shufflevector <16 x i8> %interleaved.vec, <16 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  21   ret void
  22 }
  23
  24 define void @load_factor3(<12 x i32>* %ptr) {
  25 ; NEON-LABEL:    @load_factor3(
  26 ; NEON-NEXT:       [[TMP1:%.*]] = bitcast <12 x i32>* %ptr to <4 x i32>*
  27 ; NEON-NEXT:       [[LDN:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3.v4i32.p0v4i32(<4 x i32>* [[TMP1]])
  28 ; NEON-NEXT:       [[TMP2:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[LDN]], 2
  29 ; NEON-NEXT:       [[TMP3:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[LDN]], 1
  30 ; NEON-NEXT:       [[TMP4:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[LDN]], 0
  31 ; NEON-NEXT:       ret void
  32 ; NO_NEON-LABEL: @load_factor3(
  33 ; NO_NEON-NOT:     @llvm.aarch64.neon
  34 ; NO_NEON:         ret void
  35 ;
  36   %interleaved.vec = load <12 x i32>, <12 x i32>* %ptr, align 4
  37   %v0 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> undef, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
  38   %v1 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> undef, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
  39   %v2 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> undef, <4 x i32> <i32 2, i32 5, i32 8, i32 11>
  40   ret void
  41 }
  42
  43 define void @load_factor4(<16 x i32>* %ptr) {
  44 ; NEON-LABEL:    @load_factor4(
  45 ; NEON-NEXT:       [[TMP1:%.*]] = bitcast <16 x i32>* %ptr to <4 x i32>*
  46 ; NEON-NEXT:       [[LDN:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld4.v4i32.p0v4i32(<4 x i32>* [[TMP1]])
  47 ; NEON-NEXT:       [[TMP2:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[LDN]], 3
  48 ; NEON-NEXT:       [[TMP3:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[LDN]], 2
  49 ; NEON-NEXT:       [[TMP4:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[LDN]], 1
  50 ; NEON-NEXT:       [[TMP5:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[LDN]], 0
  51 ; NEON-NEXT:       ret void
  52 ; NO_NEON-LABEL: @load_factor4(
  53 ; NO_NEON-NOT:     @llvm.aarch64.neon
  54 ; NO_NEON:         ret void
  55 ;
  56   %interleaved.vec = load <16 x i32>, <16 x i32>* %ptr, align 4
  57   %v0 = shufflevector <16 x i32> %interleaved.vec, <16 x i32> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
  58   %v1 = shufflevector <16 x i32> %interleaved.vec, <16 x i32> undef, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
  59   %v2 = shufflevector <16 x i32> %interleaved.vec, <16 x i32> undef, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
  60   %v3 = shufflevector <16 x i32> %interleaved.vec, <16 x i32> undef, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
  61   ret void
  62 }
  63
  64 define void @store_factor2(<16 x i8>* %ptr, <8 x i8> %v0, <8 x i8> %v1) {
  65 ; NEON-LABEL:    @store_factor2(
  66 ; NEON-NEXT:       [[TMP1:%.*]] = shufflevector <8 x i8> %v0, <8 x i8> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  67 ; NEON-NEXT:       [[TMP2:%.*]] = shufflevector <8 x i8> %v0, <8 x i8> %v1, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  68 ; NEON-NEXT:       [[TMP3:%.*]] = bitcast <16 x i8>* %ptr to <8 x i8>*
  69 ; NEON-NEXT:       call void @llvm.aarch64.neon.st2.v8i8.p0v8i8(<8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <8 x i8>* [[TMP3]])
  70 ; NEON-NEXT:       ret void
  71 ; NO_NEON-LABEL: @store_factor2(
  72 ; NO_NEON-NOT:     @llvm.aarch64.neon
  73 ; NO_NEON:         ret void
  74 ;
  75   %interleaved.vec = shufflevector <8 x i8> %v0, <8 x i8> %v1, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
  76   store <16 x i8> %interleaved.vec, <16 x i8>* %ptr, align 4
  77   ret void
  78 }
  79
  80 define void @store_factor3(<12 x i32>* %ptr, <4 x i32> %v0, <4 x i32> %v1, <4 x i32> %v2) {
  81 ; NEON-LABEL:    @store_factor3(
  82 ; NEON:            [[TMP1:%.*]] = shufflevector <8 x i32> %s0, <8 x i32> %s1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  83 ; NEON-NEXT:       [[TMP2:%.*]] = shufflevector <8 x i32> %s0, <8 x i32> %s1, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  84 ; NEON-NEXT:       [[TMP3:%.*]] = shufflevector <8 x i32> %s0, <8 x i32> %s1, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
  85 ; NEON-NEXT:       [[TMP4:%.*]] = bitcast <12 x i32>* %ptr to <4 x i32>*
  86 ; NEON-NEXT:       call void @llvm.aarch64.neon.st3.v4i32.p0v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32>* [[TMP4]])
  87 ; NEON-NEXT:       ret void
  88 ; NO_NEON-LABEL: @store_factor3(
  89 ; NO_NEON-NOT:     @llvm.aarch64.neon
  90 ; NO_NEON:         ret void
  91 ;
  92   %s0 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  93   %s1 = shufflevector <4 x i32> %v2, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
  94   %interleaved.vec = shufflevector <8 x i32> %s0, <8 x i32> %s1, <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
  95   store <12 x i32> %interleaved.vec, <12 x i32>* %ptr, align 4
  96   ret void
  97 }
  98
  99 define void @store_factor4(<16 x i32>* %ptr, <4 x i32> %v0, <4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3) {
 100 ; NEON-LABEL:    @store_factor4(
 101 ; NEON:            [[TMP1:%.*]] = shufflevector <8 x i32> %s0, <8 x i32> %s1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 102 ; NEON-NEXT:       [[TMP2:%.*]] = shufflevector <8 x i32> %s0, <8 x i32> %s1, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
 103 ; NEON-NEXT:       [[TMP3:%.*]] = shufflevector <8 x i32> %s0, <8 x i32> %s1, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
 104 ; NEON-NEXT:       [[TMP4:%.*]] = shufflevector <8 x i32> %s0, <8 x i32> %s1, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
 105 ; NEON-NEXT:       [[TMP5:%.*]] = bitcast <16 x i32>* %ptr to <4 x i32>*
 106 ; NEON-NEXT:       call void @llvm.aarch64.neon.st4.v4i32.p0v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32>* [[TMP5]])
 107 ; NEON-NEXT:       ret void
 108 ; NO_NEON-LABEL: @store_factor4(
 109 ; NO_NEON-NOT:     @llvm.aarch64.neon
 110 ; NO_NEON:         ret void
 111 ;
 112   %s0 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 113   %s1 = shufflevector <4 x i32> %v2, <4 x i32> %v3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 114   %interleaved.vec = shufflevector <8 x i32> %s0, <8 x i32> %s1, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
 115   store <16 x i32> %interleaved.vec, <16 x i32>* %ptr, align 4
 116   ret void
 117 }
 118
 119 define void @load_ptrvec_factor2(<4 x i32*>* %ptr) {
 120 ; NEON-LABEL:    @load_ptrvec_factor2(
 121 ; NEON-NEXT:       [[TMP1:%.*]] = bitcast <4 x i32*>* %ptr to <2 x i64>*
 122 ; NEON-NEXT:       [[LDN:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld2.v2i64.p0v2i64(<2 x i64>* [[TMP1]])
 123 ; NEON-NEXT:       [[TMP2:%.*]] = extractvalue { <2 x i64>, <2 x i64> } [[LDN]], 1
 124 ; NEON-NEXT:       [[TMP3:%.*]] = inttoptr <2 x i64> [[TMP2]] to <2 x i32*>
 125 ; NEON-NEXT:       [[TMP4:%.*]] = extractvalue { <2 x i64>, <2 x i64> } [[LDN]], 0
 126 ; NEON-NEXT:       [[TMP5:%.*]] = inttoptr <2 x i64> [[TMP4]] to <2 x i32*>
 127 ; NEON-NEXT:       ret void
 128 ; NO_NEON-LABEL: @load_ptrvec_factor2(
 129 ; NO_NEON-NOT:     @llvm.aarch64.neon
 130 ; NO_NEON:         ret void
 131 ;
 132   %interleaved.vec = load <4 x i32*>, <4 x i32*>* %ptr, align 4
 133   %v0 = shufflevector <4 x i32*> %interleaved.vec, <4 x i32*> undef, <2 x i32> <i32 0, i32 2>
 134   %v1 = shufflevector <4 x i32*> %interleaved.vec, <4 x i32*> undef, <2 x i32> <i32 1, i32 3>
 135   ret void
 136 }
 137
 138 define void @load_ptrvec_factor3(<6 x i32*>* %ptr) {
 139 ; NEON-LABEL:    @load_ptrvec_factor3(
 140 ; NEON-NEXT:       [[TMP1:%.*]] = bitcast <6 x i32*>* %ptr to <2 x i64>*
 141 ; NEON-NEXT:       [[LDN:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld3.v2i64.p0v2i64(<2 x i64>* [[TMP1]])
 142 ; NEON-NEXT:       [[TMP2:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } [[LDN]], 2
 143 ; NEON-NEXT:       [[TMP3:%.*]] = inttoptr <2 x i64> [[TMP2]] to <2 x i32*>
 144 ; NEON-NEXT:       [[TMP4:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } [[LDN]], 1
 145 ; NEON-NEXT:       [[TMP5:%.*]] = inttoptr <2 x i64> [[TMP4]] to <2 x i32*>
 146 ; NEON-NEXT:       [[TMP6:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } [[LDN]], 0
 147 ; NEON-NEXT:       [[TMP7:%.*]] = inttoptr <2 x i64> [[TMP6]] to <2 x i32*>
 148 ; NEON-NEXT:       ret void
 149 ; NO_NEON-LABEL: @load_ptrvec_factor3(
 150 ; NO_NEON-NOT:     @llvm.aarch64.neon
 151 ; NO_NEON:         ret void
 152 ;
 153   %interleaved.vec = load <6 x i32*>, <6 x i32*>* %ptr, align 4
 154   %v0 = shufflevector <6 x i32*> %interleaved.vec, <6 x i32*> undef, <2 x i32> <i32 0, i32 3>
 155   %v1 = shufflevector <6 x i32*> %interleaved.vec, <6 x i32*> undef, <2 x i32> <i32 1, i32 4>
 156   %v2 = shufflevector <6 x i32*> %interleaved.vec, <6 x i32*> undef, <2 x i32> <i32 2, i32 5>
 157   ret void
 158 }
 159
 160 define void @load_ptrvec_factor4(<8 x i32*>* %ptr) {
 161 ; NEON-LABEL:    @load_ptrvec_factor4(
 162 ; NEON-NEXT:       [[TMP1:%.*]] = bitcast <8 x i32*>* %ptr to <2 x i64>*
 163 ; NEON-NEXT:       [[LDN:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld4.v2i64.p0v2i64(<2 x i64>* [[TMP1]])
 164 ; NEON-NEXT:       [[TMP2:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[LDN]], 3
 165 ; NEON-NEXT:       [[TMP3:%.*]] = inttoptr <2 x i64> [[TMP2]] to <2 x i32*>
 166 ; NEON-NEXT:       [[TMP4:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[LDN]], 2
 167 ; NEON-NEXT:       [[TMP5:%.*]] = inttoptr <2 x i64> [[TMP4]] to <2 x i32*>
 168 ; NEON-NEXT:       [[TMP6:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[LDN]], 1
 169 ; NEON-NEXT:       [[TMP7:%.*]] = inttoptr <2 x i64> [[TMP6]] to <2 x i32*>
 170 ; NEON-NEXT:       [[TMP8:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[LDN]], 0
 171 ; NEON-NEXT:       [[TMP9:%.*]] = inttoptr <2 x i64> [[TMP8]] to <2 x i32*>
 172 ; NEON-NEXT:       ret void
 173 ; NO_NEON-LABEL: @load_ptrvec_factor4(
 174 ; NO_NEON-NOT:     @llvm.aarch64.neon
 175 ; NO_NEON:         ret void
 176 ;
 177   %interleaved.vec = load <8 x i32*>, <8 x i32*>* %ptr, align 4
 178   %v0 = shufflevector <8 x i32*> %interleaved.vec, <8 x i32*> undef, <2 x i32> <i32 0, i32 4>
 179   %v1 = shufflevector <8 x i32*> %interleaved.vec, <8 x i32*> undef, <2 x i32> <i32 1, i32 5>
 180   %v2 = shufflevector <8 x i32*> %interleaved.vec, <8 x i32*> undef, <2 x i32> <i32 2, i32 6>
 181   %v3 = shufflevector <8 x i32*> %interleaved.vec, <8 x i32*> undef, <2 x i32> <i32 3, i32 7>
 182   ret void
 183 }
 184
 185 define void @store_ptrvec_factor2(<4 x i32*>* %ptr, <2 x i32*> %v0, <2 x i32*> %v1) {
 186 ; NEON-LABEL:    @store_ptrvec_factor2(
 187 ; NEON-NEXT:       [[TMP1:%.*]] = ptrtoint <2 x i32*> %v0 to <2 x i64>
 188 ; NEON-NEXT:       [[TMP2:%.*]] = ptrtoint <2 x i32*> %v1 to <2 x i64>
 189 ; NEON-NEXT:       [[TMP3:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> [[TMP2]], <2 x i32> <i32 0, i32 1>
 190 ; NEON-NEXT:       [[TMP4:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> [[TMP2]], <2 x i32> <i32 2, i32 3>
 191 ; NEON-NEXT:       [[TMP5:%.*]] = bitcast <4 x i32*>* %ptr to <2 x i64>*
 192 ; NEON-NEXT:       call void @llvm.aarch64.neon.st2.v2i64.p0v2i64(<2 x i64> [[TMP3]], <2 x i64> [[TMP4]], <2 x i64>* [[TMP5]])
 193 ; NEON-NEXT:       ret void
 194 ; NO_NEON-LABEL: @store_ptrvec_factor2(
 195 ; NO_NEON-NOT:     @llvm.aarch64.neon
 196 ; NO_NEON:         ret void
 197 ;
 198   %interleaved.vec = shufflevector <2 x i32*> %v0, <2 x i32*> %v1, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
 199   store <4 x i32*> %interleaved.vec, <4 x i32*>* %ptr, align 4
 200   ret void
 201 }
 202
 203 define void @store_ptrvec_factor3(<6 x i32*>* %ptr, <2 x i32*> %v0, <2 x i32*> %v1, <2 x i32*> %v2) {
 204 ; NEON-LABEL:    @store_ptrvec_factor3(
 205 ; NEON:            [[TMP1:%.*]] = ptrtoint <4 x i32*> %s0 to <4 x i64>
 206 ; NEON-NEXT:       [[TMP2:%.*]] = ptrtoint <4 x i32*> %s1 to <4 x i64>
 207 ; NEON-NEXT:       [[TMP3:%.*]] = shufflevector <4 x i64> [[TMP1]], <4 x i64> [[TMP2]], <2 x i32> <i32 0, i32 1>
 208 ; NEON-NEXT:       [[TMP4:%.*]] = shufflevector <4 x i64> [[TMP1]], <4 x i64> [[TMP2]], <2 x i32> <i32 2, i32 3>
 209 ; NEON-NEXT:       [[TMP5:%.*]] = shufflevector <4 x i64> [[TMP1]], <4 x i64> [[TMP2]], <2 x i32> <i32 4, i32 5>
 210 ; NEON-NEXT:       [[TMP6:%.*]] = bitcast <6 x i32*>* %ptr to <2 x i64>*
 211 ; NEON-NEXT:       call void @llvm.aarch64.neon.st3.v2i64.p0v2i64(<2 x i64> [[TMP3]], <2 x i64> [[TMP4]], <2 x i64> [[TMP5]], <2 x i64>* [[TMP6]])
 212 ; NEON-NEXT:       ret void
 213 ; NO_NEON-LABEL: @store_ptrvec_factor3(
 214 ; NO_NEON-NOT:     @llvm.aarch64.neon
 215 ; NO_NEON:         ret void
 216 ;
 217   %s0 = shufflevector <2 x i32*> %v0, <2 x i32*> %v1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 218   %s1 = shufflevector <2 x i32*> %v2, <2 x i32*> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
 219   %interleaved.vec = shufflevector <4 x i32*> %s0, <4 x i32*> %s1, <6 x i32> <i32 0, i32 2, i32 4, i32 1, i32 3, i32 5>
 220   store <6 x i32*> %interleaved.vec, <6 x i32*>* %ptr, align 4
 221   ret void
 222 }
 223
 224 define void @store_ptrvec_factor4(<8 x i32*>* %ptr, <2 x i32*> %v0, <2 x i32*> %v1, <2 x i32*> %v2, <2 x i32*> %v3) {
 225 ; NEON-LABEL:    @store_ptrvec_factor4(
 226 ; NEON:            [[TMP1:%.*]] = ptrtoint <4 x i32*> %s0 to <4 x i64>
 227 ; NEON-NEXT:       [[TMP2:%.*]] = ptrtoint <4 x i32*> %s1 to <4 x i64>
 228 ; NEON-NEXT:       [[TMP3:%.*]] = shufflevector <4 x i64> [[TMP1]], <4 x i64> [[TMP2]], <2 x i32> <i32 0, i32 1>
 229 ; NEON-NEXT:       [[TMP4:%.*]] = shufflevector <4 x i64> [[TMP1]], <4 x i64> [[TMP2]], <2 x i32> <i32 2, i32 3>
 230 ; NEON-NEXT:       [[TMP5:%.*]] = shufflevector <4 x i64> [[TMP1]], <4 x i64> [[TMP2]], <2 x i32> <i32 4, i32 5>
 231 ; NEON-NEXT:       [[TMP6:%.*]] = shufflevector <4 x i64> [[TMP1]], <4 x i64> [[TMP2]], <2 x i32> <i32 6, i32 7>
 232 ; NEON-NEXT:       [[TMP7:%.*]] = bitcast <8 x i32*>* %ptr to <2 x i64>*
 233 ; NEON-NEXT:       call void @llvm.aarch64.neon.st4.v2i64.p0v2i64(<2 x i64> [[TMP3]], <2 x i64> [[TMP4]], <2 x i64> [[TMP5]], <2 x i64> [[TMP6]], <2 x i64>* [[TMP7]])
 234 ; NEON-NEXT:       ret void
 235 ; NO_NEON-LABEL: @store_ptrvec_factor4(
 236 ; NO_NEON-NOT:     @llvm.aarch64.neon
 237 ; NO_NEON:         ret void
 238 ;
 239   %s0 = shufflevector <2 x i32*> %v0, <2 x i32*> %v1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 240   %s1 = shufflevector <2 x i32*> %v2, <2 x i32*> %v3, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 241   %interleaved.vec = shufflevector <4 x i32*> %s0, <4 x i32*> %s1, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7>
 242   store <8 x i32*> %interleaved.vec, <8 x i32*>* %ptr, align 4
 243   ret void
 244 }
 245
 246 define void @load_undef_mask_factor2(<8 x i32>* %ptr) {
 247 ; NEON-LABEL:    @load_undef_mask_factor2(
 248 ; NEON-NEXT:       [[TMP1:%.*]] = bitcast <8 x i32>* %ptr to <4 x i32>*
 249 ; NEON-NEXT:       [[LDN:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0v4i32(<4 x i32>* [[TMP1]])
 250 ; NEON-NEXT:       [[TMP2:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[LDN]], 1
 251 ; NEON-NEXT:       [[TMP3:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[LDN]], 0
 252 ; NEON-NEXT:       ret void
 253 ; NO_NEON-LABEL: @load_undef_mask_factor2(
 254 ; NO_NEON-NOT:     @llvm.aarch64.neon
 255 ; NO_NEON:         ret void
 256 ;
 257   %interleaved.vec = load <8 x i32>, <8 x i32>* %ptr, align 4
 258   %v0 = shufflevector <8 x i32> %interleaved.vec, <8 x i32> undef, <4 x i32> <i32 undef, i32 2, i32 undef, i32 6>
 259   %v1 = shufflevector <8 x i32> %interleaved.vec, <8 x i32> undef, <4 x i32> <i32 undef, i32 3, i32 undef, i32 7>
 260   ret void
 261 }
 262
 263 define void @load_undef_mask_factor3(<12 x i32>* %ptr) {
 264 ; NEON-LABEL:    @load_undef_mask_factor3(
 265 ; NEON-NEXT:       [[TMP1:%.*]] = bitcast <12 x i32>* %ptr to <4 x i32>*
 266 ; NEON-NEXT:       [[LDN:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3.v4i32.p0v4i32(<4 x i32>* [[TMP1]])
 267 ; NEON-NEXT:       [[TMP2:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[LDN]], 2
 268 ; NEON-NEXT:       [[TMP3:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[LDN]], 1
 269 ; NEON-NEXT:       [[TMP4:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[LDN]], 0
 270 ; NEON-NEXT:       ret void
 271 ; NO_NEON-LABEL: @load_undef_mask_factor3(
 272 ; NO_NEON-NOT:     @llvm.aarch64.neon
 273 ; NO_NEON:         ret void
 274 ;
 275   %interleaved.vec = load <12 x i32>, <12 x i32>* %ptr, align 4
 276   %v0 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> undef, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
 277   %v1 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> undef, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
 278   %v2 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> undef, <4 x i32> <i32 2, i32 undef, i32 undef, i32 undef>
 279   ret void
 280 }
 281
 282 define void @load_undef_mask_factor4(<16 x i32>* %ptr) {
 283 ; NEON-LABEL:    @load_undef_mask_factor4(
 284 ; NEON-NEXT:       [[TMP1:%.*]] = bitcast <16 x i32>* %ptr to <4 x i32>*
 285 ; NEON-NEXT:       [[LDN:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld4.v4i32.p0v4i32(<4 x i32>* [[TMP1]])
 286 ; NEON-NEXT:       [[TMP2:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[LDN]], 3
 287 ; NEON-NEXT:       [[TMP3:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[LDN]], 2
 288 ; NEON-NEXT:       [[TMP4:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[LDN]], 1
 289 ; NEON-NEXT:       [[TMP5:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[LDN]], 0
 290 ; NEON-NEXT:       ret void
 291 ; NO_NEON-LABEL: @load_undef_mask_factor4(
 292 ; NO_NEON-NOT:     @llvm.aarch64.neon
 293 ; NO_NEON:         ret void
 294 ;
 295   %interleaved.vec = load <16 x i32>, <16 x i32>* %ptr, align 4
 296   %v0 = shufflevector <16 x i32> %interleaved.vec, <16 x i32> undef, <4 x i32> <i32 0, i32 4, i32 undef, i32 undef>
 297   %v1 = shufflevector <16 x i32> %interleaved.vec, <16 x i32> undef, <4 x i32> <i32 1, i32 5, i32 undef, i32 undef>
 298   %v2 = shufflevector <16 x i32> %interleaved.vec, <16 x i32> undef, <4 x i32> <i32 2, i32 6, i32 undef, i32 undef>
 299   %v3 = shufflevector <16 x i32> %interleaved.vec, <16 x i32> undef, <4 x i32> <i32 3, i32 7, i32 undef, i32 undef>
 300   ret void
 301 }
 302
 303 define void @store_undef_mask_factor2(<8 x i32>* %ptr, <4 x i32> %v0, <4 x i32> %v1) {
 304 ; NEON-LABEL:    @store_undef_mask_factor2(
 305 ; NEON-NEXT:       [[TMP1:%.*]] = shufflevector <4 x i32> %v0, <4 x i32> %v1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 306 ; NEON-NEXT:       [[TMP2:%.*]] = shufflevector <4 x i32> %v0, <4 x i32> %v1, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
 307 ; NEON-NEXT:       [[TMP3:%.*]] = bitcast <8 x i32>* %ptr to <4 x i32>*
 308 ; NEON-NEXT:       call void @llvm.aarch64.neon.st2.v4i32.p0v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32>* [[TMP3]])
 309 ; NEON-NEXT:       ret void
 310 ; NO_NEON-LABEL: @store_undef_mask_factor2(
 311 ; NO_NEON-NOT:     @llvm.aarch64.neon
 312 ; NO_NEON:         ret void
 313 ;
 314   %interleaved.vec = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 2, i32 6, i32 3, i32 7>
 315   store <8 x i32> %interleaved.vec, <8 x i32>* %ptr, align 4
 316   ret void
 317 }
 318
 319 define void @store_undef_mask_factor3(<12 x i32>* %ptr, <4 x i32> %v0, <4 x i32> %v1, <4 x i32> %v2) {
 320 ; NEON-LABEL:    @store_undef_mask_factor3(
 321 ; NEON:            [[TMP1:%.*]] = shufflevector <8 x i32> %s0, <8 x i32> %s1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 322 ; NEON-NEXT:       [[TMP2:%.*]] = shufflevector <8 x i32> %s0, <8 x i32> %s1, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
 323 ; NEON-NEXT:       [[TMP3:%.*]] = shufflevector <8 x i32> %s0, <8 x i32> %s1, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
 324 ; NEON-NEXT:       [[TMP4:%.*]] = bitcast <12 x i32>* %ptr to <4 x i32>*
 325 ; NEON-NEXT:       call void @llvm.aarch64.neon.st3.v4i32.p0v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32>* [[TMP4]])
 326 ; NEON-NEXT:       ret void
 327 ; NO_NEON-LABEL: @store_undef_mask_factor3(
 328 ; NO_NEON-NOT:     @llvm.aarch64.neon
 329 ; NO_NEON:         ret void
 330 ;
 331   %s0 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 332   %s1 = shufflevector <4 x i32> %v2, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
 333   %interleaved.vec = shufflevector <8 x i32> %s0, <8 x i32> %s1, <12 x i32> <i32 0, i32 4, i32 undef, i32 1, i32 undef, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
 334   store <12 x i32> %interleaved.vec, <12 x i32>* %ptr, align 4
 335   ret void
 336 }
 337
 338 define void @store_undef_mask_factor4(<16 x i32>* %ptr, <4 x i32> %v0, <4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3) {
 339 ; NEON-LABEL:    @store_undef_mask_factor4(
 340 ; NEON:            [[TMP1:%.*]] = shufflevector <8 x i32> %s0, <8 x i32> %s1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 341 ; NEON-NEXT:       [[TMP2:%.*]] = shufflevector <8 x i32> %s0, <8 x i32> %s1, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
 342 ; NEON-NEXT:       [[TMP3:%.*]] = shufflevector <8 x i32> %s0, <8 x i32> %s1, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
 343 ; NEON-NEXT:       [[TMP4:%.*]] = shufflevector <8 x i32> %s0, <8 x i32> %s1, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
 344 ; NEON-NEXT:       [[TMP5:%.*]] = bitcast <16 x i32>* %ptr to <4 x i32>*
 345 ; NEON-NEXT:       call void @llvm.aarch64.neon.st4.v4i32.p0v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32>* [[TMP5]])
 346 ; NEON-NEXT:       ret void
 347 ; NO_NEON-LABEL: @store_undef_mask_factor4(
 348 ; NO_NEON-NOT:     @llvm.aarch64.neon
 349 ; NO_NEON:         ret void
 350 ;
 351   %s0 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 352   %s1 = shufflevector <4 x i32> %v2, <4 x i32> %v3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 353   %interleaved.vec = shufflevector <8 x i32> %s0, <8 x i32> %s1, <16 x i32> <i32 0, i32 4, i32 8, i32 undef, i32 undef, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
 354   store <16 x i32> %interleaved.vec, <16 x i32>* %ptr, align 4
 355   ret void
 356 }
 357
 358 define void @load_illegal_factor2(<3 x float>* %ptr) nounwind {
 359 ; NEON-LABEL:    @load_illegal_factor2(
 360 ; NEON-NOT:        @llvm.aarch64.neon
 361 ; NEON:            ret void
 362 ; NO_NEON-LABEL: @load_illegal_factor2(
 363 ; NO_NEON-NOT:     @llvm.aarch64.neon
 364 ; NO_NEON:         ret void
 365 ;
 366   %interleaved.vec = load <3 x float>, <3 x float>* %ptr, align 16
 367   %v0 = shufflevector <3 x float> %interleaved.vec, <3 x float> undef, <3 x i32> <i32 0, i32 2, i32 undef>
 368   ret void
 369 }
 370
 371 define void @store_illegal_factor2(<3 x float>* %ptr, <3 x float> %v0) nounwind {
 372 ; NEON-LABEL:    @store_illegal_factor2(
 373 ; NEON-NOT:        @llvm.aarch64.neon
 374 ; NEON:            ret void
 375 ; NO_NEON-LABEL: @store_illegal_factor2(
 376 ; NO_NEON-NOT:     @llvm.aarch64.neon
 377 ; NO_NEON:         ret void
 378 ;
 379   %interleaved.vec = shufflevector <3 x float> %v0, <3 x float> undef, <3 x i32> <i32 0, i32 2, i32 undef>
 380   store <3 x float> %interleaved.vec, <3 x float>* %ptr, align 16
 381   ret void
 382 }
 383
 384 define void @store_general_mask_factor4(<8 x i32>* %ptr, <32 x i32> %v0, <32 x i32> %v1) {
 385 ; NEON-LABEL:    @store_general_mask_factor4(
 386 ; NEON-NEXT:       [[TMP1:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <2 x i32> <i32 4, i32 5>
 387 ; NEON-NEXT:       [[TMP2:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <2 x i32> <i32 16, i32 17>
 388 ; NEON-NEXT:       [[TMP3:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <2 x i32> <i32 32, i32 33>
 389 ; NEON-NEXT:       [[TMP4:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <2 x i32> <i32 8, i32 9>
 390 ; NEON-NEXT:       [[TMP5:%.*]] = bitcast <8 x i32>* %ptr to <2 x i32>*
 391 ; NEON-NEXT:       call void @llvm.aarch64.neon.st4.v2i32.p0v2i32(<2 x i32> [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> [[TMP3]], <2 x i32> [[TMP4]], <2 x i32>* [[TMP5]])
 392 ; NEON-NEXT:       ret void
 393 ; NO_NEON-LABEL: @store_general_mask_factor4(
 394 ; NO_NEON-NOT:     @llvm.aarch64.neon
 395 ; NO_NEON:         ret void
 396 ;
 397   %interleaved.vec = shufflevector <32 x i32> %v0, <32 x i32> %v1, <8 x i32> <i32 4, i32 16, i32 32, i32 8, i32 5, i32 17, i32 33, i32 9>
 398   store <8 x i32> %interleaved.vec, <8 x i32>* %ptr, align 4
 399   ret void
 400 }
 401
 402 define void @store_general_mask_factor4_undefbeg(<8 x i32>* %ptr, <32 x i32> %v0, <32 x i32> %v1) {
 403 ; NEON-LABEL:    @store_general_mask_factor4_undefbeg(
 404 ; NEON-NEXT:       [[TMP1:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <2 x i32> <i32 4, i32 5>
 405 ; NEON-NEXT:       [[TMP2:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <2 x i32> <i32 16, i32 17>
 406 ; NEON-NEXT:       [[TMP3:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <2 x i32> <i32 32, i32 33>
 407 ; NEON-NEXT:       [[TMP4:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <2 x i32> <i32 8, i32 9>
 408 ; NEON-NEXT:       [[TMP5:%.*]] = bitcast <8 x i32>* %ptr to <2 x i32>*
 409 ; NEON-NEXT:       call void @llvm.aarch64.neon.st4.v2i32.p0v2i32(<2 x i32> [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> [[TMP3]], <2 x i32> [[TMP4]], <2 x i32>* [[TMP5]])
 410 ; NEON-NEXT:       ret void
 411 ; NO_NEON-LABEL: @store_general_mask_factor4_undefbeg(
 412 ; NO_NEON-NOT:     @llvm.aarch64.neon
 413 ; NO_NEON:         ret void
 414 ;
 415   %interleaved.vec = shufflevector <32 x i32> %v0, <32 x i32> %v1, <8 x i32> <i32 undef, i32 16, i32 32, i32 8, i32 5, i32 17, i32 33, i32 9>
 416   store <8 x i32> %interleaved.vec, <8 x i32>* %ptr, align 4
 417   ret void
 418 }
 419
 420 define void @store_general_mask_factor4_undefend(<8 x i32>* %ptr, <32 x i32> %v0, <32 x i32> %v1) {
 421 ; NEON-LABEL:    @store_general_mask_factor4_undefend(
 422 ; NEON-NEXT:       [[TMP1:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <2 x i32> <i32 4, i32 5>
 423 ; NEON-NEXT:       [[TMP2:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <2 x i32> <i32 16, i32 17>
 424 ; NEON-NEXT:       [[TMP3:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <2 x i32> <i32 32, i32 33>
 425 ; NEON-NEXT:       [[TMP4:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <2 x i32> <i32 8, i32 9>
 426 ; NEON-NEXT:       [[TMP5:%.*]] = bitcast <8 x i32>* %ptr to <2 x i32>*
 427 ; NEON-NEXT:       call void @llvm.aarch64.neon.st4.v2i32.p0v2i32(<2 x i32> [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> [[TMP3]], <2 x i32> [[TMP4]], <2 x i32>* [[TMP5]])
 428 ; NEON-NEXT:       ret void
 429 ; NO_NEON-LABEL: @store_general_mask_factor4_undefend(
 430 ; NO_NEON-NOT:     @llvm.aarch64.neon
 431 ; NO_NEON:         ret void
 432 ;
 433   %interleaved.vec = shufflevector <32 x i32> %v0, <32 x i32> %v1, <8 x i32> <i32 4, i32 16, i32 32, i32 8, i32 5, i32 17, i32 33, i32 undef>
 434   store <8 x i32> %interleaved.vec, <8 x i32>* %ptr, align 4
 435   ret void
 436 }
 437
 438 define void @store_general_mask_factor4_undefmid(<8 x i32>* %ptr, <32 x i32> %v0, <32 x i32> %v1) {
 439 ; NEON-LABEL:    @store_general_mask_factor4_undefmid(
 440 ; NEON-NEXT:       [[TMP1:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <2 x i32> <i32 4, i32 5>
 441 ; NEON-NEXT:       [[TMP2:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <2 x i32> <i32 16, i32 17>
 442 ; NEON-NEXT:       [[TMP3:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <2 x i32> <i32 32, i32 33>
 443 ; NEON-NEXT:       [[TMP4:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <2 x i32> <i32 8, i32 9>
 444 ; NEON-NEXT:       [[TMP5:%.*]] = bitcast <8 x i32>* %ptr to <2 x i32>*
 445 ; NEON-NEXT:       call void @llvm.aarch64.neon.st4.v2i32.p0v2i32(<2 x i32> [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> [[TMP3]], <2 x i32> [[TMP4]], <2 x i32>* [[TMP5]])
 446 ; NEON-NEXT:       ret void
 447 ; NO_NEON-LABEL: @store_general_mask_factor4_undefmid(
 448 ; NO_NEON-NOT:     @llvm.aarch64.neon
 449 ; NO_NEON:         ret void
 450 ;
 451   %interleaved.vec = shufflevector <32 x i32> %v0, <32 x i32> %v1, <8 x i32> <i32 4, i32 undef, i32 32, i32 8, i32 5, i32 17, i32 undef, i32 9>
 452   store <8 x i32> %interleaved.vec, <8 x i32>* %ptr, align 4
 453   ret void
 454 }
 455
 456 define void @store_general_mask_factor4_undefmulti(<8 x i32>* %ptr, <32 x i32> %v0, <32 x i32> %v1) {
 457 ; NEON-LABEL:    @store_general_mask_factor4_undefmulti(
 458 ; NEON-NEXT:       [[TMP1:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <2 x i32> <i32 4, i32 5>
 459 ; NEON-NEXT:       [[TMP2:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <2 x i32> <i32 0, i32 1>
 460 ; NEON-NEXT:       [[TMP3:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <2 x i32> <i32 0, i32 1>
 461 ; NEON-NEXT:       [[TMP4:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <2 x i32> <i32 8, i32 9>
 462 ; NEON-NEXT:       [[TMP5:%.*]] = bitcast <8 x i32>* %ptr to <2 x i32>*
 463 ; NEON-NEXT:       call void @llvm.aarch64.neon.st4.v2i32.p0v2i32(<2 x i32> [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> [[TMP3]], <2 x i32> [[TMP4]], <2 x i32>* [[TMP5]])
 464 ; NEON-NEXT:       ret void
 465 ; NO_NEON-LABEL: @store_general_mask_factor4_undefmulti(
 466 ; NO_NEON-NOT:     @llvm.aarch64.neon
 467 ; NO_NEON:         ret void
 468 ;
 469   %interleaved.vec = shufflevector <32 x i32> %v0, <32 x i32> %v1, <8 x i32> <i32 4, i32 undef, i32 undef, i32 8, i32 undef, i32 undef, i32 undef, i32 9>
 470   store <8 x i32> %interleaved.vec, <8 x i32>* %ptr, align 4
 471   ret void
 472 }
 473
 474 define void @store_general_mask_factor3(<12 x i32>* %ptr, <32 x i32> %v0, <32 x i32> %v1) {
 475 ; NEON-LABEL:    @store_general_mask_factor3(
 476 ; NEON-NEXT:       [[TMP1:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
 477 ; NEON-NEXT:       [[TMP2:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <4 x i32> <i32 32, i32 33, i32 34, i32 35>
 478 ; NEON-NEXT:       [[TMP3:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
 479 ; NEON-NEXT:       [[TMP4:%.*]] = bitcast <12 x i32>* %ptr to <4 x i32>*
 480 ; NEON-NEXT:       call void @llvm.aarch64.neon.st3.v4i32.p0v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32>* [[TMP4]])
 481 ; NEON-NEXT:       ret void
 482 ; NO_NEON-LABEL: @store_general_mask_factor3(
 483 ; NO_NEON-NOT:     @llvm.aarch64.neon
 484 ; NO_NEON:         ret void
 485 ;
 486   %interleaved.vec = shufflevector <32 x i32> %v0, <32 x i32> %v1, <12 x i32> <i32 4, i32 32, i32 16, i32 5, i32 33, i32 17, i32 6, i32 34, i32 18, i32 7, i32 35, i32 19>
 487   store <12 x i32> %interleaved.vec, <12 x i32>* %ptr, align 4
 488   ret void
 489 }
 490
 491 define void @store_general_mask_factor3_undefmultimid(<12 x i32>* %ptr, <32 x i32> %v0, <32 x i32> %v1) {
 492 ; NEON-LABEL:    @store_general_mask_factor3_undefmultimid(
 493 ; NEON-NEXT:       [[TMP1:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
 494 ; NEON-NEXT:       [[TMP2:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <4 x i32> <i32 32, i32 33, i32 34, i32 35>
 495 ; NEON-NEXT:       [[TMP3:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
 496 ; NEON-NEXT:       [[TMP4:%.*]] = bitcast <12 x i32>* %ptr to <4 x i32>*
 497 ; NEON-NEXT:       call void @llvm.aarch64.neon.st3.v4i32.p0v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32>* [[TMP4]])
 498 ; NEON-NEXT:       ret void
 499 ; NO_NEON-LABEL: @store_general_mask_factor3_undefmultimid(
 500 ; NO_NEON-NOT:     @llvm.aarch64.neon
 501 ; NO_NEON:         ret void
 502 ;
 503   %interleaved.vec = shufflevector <32 x i32> %v0, <32 x i32> %v1, <12 x i32> <i32 4, i32 32, i32 16, i32 undef, i32 33, i32 17, i32 undef, i32 34, i32 18, i32 7, i32 35, i32 19>
 504   store <12 x i32> %interleaved.vec, <12 x i32>* %ptr, align 4
 505   ret void
 506 }
 507
 508 define void @store_general_mask_factor3_undef_fail(<12 x i32>* %ptr, <32 x i32> %v0, <32 x i32> %v1) {
 509 ; NEON-LABEL:    @store_general_mask_factor3_undef_fail(
 510 ; NEON-NOT:        @llvm.aarch64.neon
 511 ; NEON:            ret void
 512 ; NO_NEON-LABEL: @store_general_mask_factor3_undef_fail(
 513 ; NO_NEON-NOT:     @llvm.aarch64.neon
 514 ; NO_NEON:         ret void
 515 ;
 516   %interleaved.vec = shufflevector <32 x i32> %v0, <32 x i32> %v1, <12 x i32> <i32 4, i32 32, i32 16, i32 undef, i32 33, i32 17, i32 undef, i32 34, i32 18, i32 8, i32 35, i32 19>
 517   store <12 x i32> %interleaved.vec, <12 x i32>* %ptr, align 4
 518   ret void
 519 }
 520
 521 define void @store_general_mask_factor3_undeflane(<12 x i32>* %ptr, <32 x i32> %v0, <32 x i32> %v1) {
 522 ; NEON-LABEL:    @store_general_mask_factor3_undeflane(
 523 ; NEON-NEXT:       [[TMP1:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 524 ; NEON-NEXT:       [[TMP2:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <4 x i32> <i32 32, i32 33, i32 34, i32 35>
 525 ; NEON-NEXT:       [[TMP3:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
 526 ; NEON-NEXT:       [[TMP4:%.*]] = bitcast <12 x i32>* %ptr to <4 x i32>*
 527 ; NEON-NEXT:       call void @llvm.aarch64.neon.st3.v4i32.p0v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32>* [[TMP4]])
 528 ; NEON-NEXT:       ret void
 529 ; NO_NEON-LABEL: @store_general_mask_factor3_undeflane(
 530 ; NO_NEON-NOT:     @llvm.aarch64.neon
 531 ; NO_NEON:         ret void
 532 ;
 533   %interleaved.vec = shufflevector <32 x i32> %v0, <32 x i32> %v1, <12 x i32> <i32 undef, i32 32, i32 16, i32 undef, i32 33, i32 17, i32 undef, i32 34, i32 18, i32 undef, i32 35, i32 19>
 534   store <12 x i32> %interleaved.vec, <12 x i32>* %ptr, align 4
 535   ret void
 536 }
 537
 538 define void @store_general_mask_factor3_negativestart(<12 x i32>* %ptr, <32 x i32> %v0, <32 x i32> %v1) {
 539 ; NEON-LABEL:    @store_general_mask_factor3_negativestart(
 540 ; NEON-NOT:        @llvm.aarch64.neon
 541 ; NEON:            ret void
 542 ; NO_NEON-LABEL: @store_general_mask_factor3_negativestart(
 543 ; NO_NEON-NOT:     @llvm.aarch64.neon
 544 ; NO_NEON:         ret void
 545 ;
 546   %interleaved.vec = shufflevector <32 x i32> %v0, <32 x i32> %v1, <12 x i32> <i32 undef, i32 32, i32 16, i32 undef, i32 33, i32 17, i32 undef, i32 34, i32 18, i32 2, i32 35, i32 19>
 547   store <12 x i32> %interleaved.vec, <12 x i32>* %ptr, align 4
 548   ret void
 549 }
 550
 551 @g = external global <4 x float>
 552
 553 ; The following does not give a valid interleaved store
 554 ; NEON-LABEL: define void @no_interleave
 555 ; NEON-NOT: call void @llvm.aarch64.neon.st2
 556 ; NEON: shufflevector
 557 ; NEON: store
 558 ; NEON: ret void
 559 ; NO_NEON-LABEL: define void @no_interleave
 560 ; NO_NEON: shufflevector
 561 ; NO_NEON: store
 562 ; NO_NEON: ret void
 563 define void @no_interleave(<4 x float> %a0) {
 564   %v0 = shufflevector <4 x float> %a0, <4 x float> %a0, <4 x i32> <i32 0, i32 3, i32 7, i32 undef>
 565   store <4 x float> %v0, <4 x float>* @g, align 16
 566   ret void
 567 }
 568
 569 define void @load_factor2_wide2(<16 x i32>* %ptr) {
 570 ; NEON-LABEL:    @load_factor2_wide2(
 571 ; NEON-NEXT:       [[TMP1:%.*]] = bitcast <16 x i32>* %ptr to i32*
 572 ; NEON-NEXT:       [[TMP2:%.*]] = bitcast i32* [[TMP1]] to <4 x i32>*
 573 ; NEON-NEXT:       [[LDN:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0v4i32(<4 x i32>* [[TMP2]])
 574 ; NEON-NEXT:       [[TMP3:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[LDN]], 1
 575 ; NEON-NEXT:       [[TMP4:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[LDN]], 0
 576 ; NEON-NEXT:       [[TMP5:%.*]] = getelementptr i32, i32* [[TMP1]], i32 8
 577 ; NEON-NEXT:       [[TMP6:%.*]] = bitcast i32* [[TMP5]] to <4 x i32>*
 578 ; NEON-NEXT:       [[LDN1:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0v4i32(<4 x i32>* [[TMP6]])
 579 ; NEON-NEXT:       [[TMP7:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[LDN1]], 1
 580 ; NEON-NEXT:       [[TMP8:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[LDN1]], 0
 581 ; NEON-NEXT:       [[TMP9:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP7]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 582 ; NEON-NEXT:       [[TMP10:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP8]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 583 ; NEON-NEXT:       ret void
 584 ; NO_NEON-LABEL: @load_factor2_wide2(
 585 ; NO_NEON-NOT:     @llvm.aarch64.neon
 586 ; NO_NEON:         ret void
 587 ;
 588   %interleaved.vec = load <16 x i32>, <16 x i32>* %ptr, align 4
 589   %v0 = shufflevector <16 x i32> %interleaved.vec, <16 x i32> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
 590   %v1 = shufflevector <16 x i32> %interleaved.vec, <16 x i32> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
 591   ret void
 592 }
 593
 594 define void @load_factor2_wide3(<24 x i32>* %ptr) {
 595 ; NEON-LABEL:    @load_factor2_wide3(
 596 ; NEON-NEXT:       [[TMP1:%.*]] = bitcast <24 x i32>* [[PTR:%.*]] to i32*
 597 ; NEON-NEXT:       [[TMP2:%.*]] = bitcast i32* [[TMP1]] to <4 x i32>*
 598 ; NEON-NEXT:       [[LDN:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0v4i32(<4 x i32>* [[TMP2]])
 599 ; NEON-NEXT:       [[TMP3:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[LDN]], 1
 600 ; NEON-NEXT:       [[TMP4:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[LDN]], 0
 601 ; NEON-NEXT:       [[TMP5:%.*]] = getelementptr i32, i32* [[TMP1]], i32 8
 602 ; NEON-NEXT:       [[TMP6:%.*]] = bitcast i32* [[TMP5]] to <4 x i32>*
 603 ; NEON-NEXT:       [[LDN1:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0v4i32(<4 x i32>* [[TMP6]])
 604 ; NEON-NEXT:       [[TMP7:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[LDN1]], 1
 605 ; NEON-NEXT:       [[TMP8:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[LDN1]], 0
 606 ; NEON-NEXT:       [[TMP9:%.*]] = getelementptr i32, i32* [[TMP5]], i32 8
 607 ; NEON-NEXT:       [[TMP10:%.*]] = bitcast i32* [[TMP9]] to <4 x i32>*
 608 ; NEON-NEXT:       [[LDN2:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0v4i32(<4 x i32>* [[TMP10]])
 609 ; NEON-NEXT:       [[TMP11:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[LDN2]], 1
 610 ; NEON-NEXT:       [[TMP12:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[LDN2]], 0
 611 ; NEON-NEXT:       [[TMP13:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP7]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 612 ; NEON-NEXT:       [[TMP14:%.*]] = shufflevector <4 x i32> [[TMP11]], <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
 613 ; NEON-NEXT:       [[TMP15:%.*]] = shufflevector <8 x i32> [[TMP13]], <8 x i32> [[TMP14]], <12 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
 614 ; NEON-NEXT:       [[TMP16:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP8]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 615 ; NEON-NEXT:       [[TMP17:%.*]] = shufflevector <4 x i32> [[TMP12]], <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
 616 ; NEON-NEXT:       [[TMP18:%.*]] = shufflevector <8 x i32> [[TMP16]], <8 x i32> [[TMP17]], <12 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
 617 ; NEON-NEXT:       ret void
 618 ; NO_NEON-LABEL: @load_factor2_wide3(
 619 ; NO_NEON-NOT:     @llvm.aarch64.neon
 620 ; NO_NEON:         ret void
 621 ;
 622   %interleaved.vec = load <24 x i32>, <24 x i32>* %ptr, align 4
 623   %v0 = shufflevector <24 x i32> %interleaved.vec, <24 x i32> undef, <12 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22>
 624   %v1 = shufflevector <24 x i32> %interleaved.vec, <24 x i32> undef, <12 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23>
 625   ret void
 626 }
 627
 628 define void @load_factor3_wide(<24 x i32>* %ptr) {
 629 ; NEON-LABEL: @load_factor3_wide(
 630 ; NEON-NEXT:       [[TMP1:%.*]] = bitcast <24 x i32>* %ptr to i32*
 631 ; NEON-NEXT:       [[TMP2:%.*]] = bitcast i32* [[TMP1]] to <4 x i32>*
 632 ; NEON-NEXT:       [[LDN:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3.v4i32.p0v4i32(<4 x i32>* [[TMP2]])
 633 ; NEON-NEXT:       [[TMP3:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[LDN]], 2
 634 ; NEON-NEXT:       [[TMP4:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[LDN]], 1
 635 ; NEON-NEXT:       [[TMP5:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[LDN]], 0
 636 ; NEON-NEXT:       [[TMP6:%.*]] = getelementptr i32, i32* [[TMP1]], i32 12
 637 ; NEON-NEXT:       [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>*
 638 ; NEON-NEXT:       [[LDN1:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3.v4i32.p0v4i32(<4 x i32>* [[TMP7]])
 639 ; NEON-NEXT:       [[TMP8:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[LDN1]], 2
 640 ; NEON-NEXT:       [[TMP9:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[LDN1]], 1
 641 ; NEON-NEXT:       [[TMP10:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[LDN1]], 0
 642 ; NEON-NEXT:       [[TMP11:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP8]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 643 ; NEON-NEXT:       [[TMP12:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP9]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 644 ; NEON-NEXT:       [[TMP13:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> [[TMP10]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 645 ; NEON-NEXT:       ret void
 646 ; NO_NEON-LABEL: @load_factor3_wide(
 647 ; NO_NEON-NOT:     @llvm.aarch64.neon
 648 ; NO_NEON:         ret void
 649 ;
 650   %interleaved.vec = load <24 x i32>, <24 x i32>* %ptr, align 4
 651   %v0 = shufflevector <24 x i32> %interleaved.vec, <24 x i32> undef, <8 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21>
 652   %v1 = shufflevector <24 x i32> %interleaved.vec, <24 x i32> undef, <8 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22>
 653   %v2 = shufflevector <24 x i32> %interleaved.vec, <24 x i32> undef, <8 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23>
 654   ret void
 655 }
 656
 657 define void @load_factor4_wide(<32 x i32>* %ptr) {
 658 ; NEON-LABEL: @load_factor4_wide(
 659 ; NEON-NEXT:       [[TMP1:%.*]] = bitcast <32 x i32>* %ptr to i32*
 660 ; NEON-NEXT:       [[TMP2:%.*]] = bitcast i32* [[TMP1]] to <4 x i32>*
 661 ; NEON-NEXT:       [[LDN:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld4.v4i32.p0v4i32(<4 x i32>* [[TMP2]])
 662 ; NEON-NEXT:       [[TMP3:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[LDN]], 3
 663 ; NEON-NEXT:       [[TMP4:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[LDN]], 2
 664 ; NEON-NEXT:       [[TMP5:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[LDN]], 1
 665 ; NEON-NEXT:       [[TMP6:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[LDN]], 0
 666 ; NEON-NEXT:       [[TMP7:%.*]] = getelementptr i32, i32* [[TMP1]], i32 16
 667 ; NEON-NEXT:       [[TMP8:%.*]] = bitcast i32* [[TMP7]] to <4 x i32>*
 668 ; NEON-NEXT:       [[LDN1:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld4.v4i32.p0v4i32(<4 x i32>* [[TMP8]])
 669 ; NEON-NEXT:       [[TMP9:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[LDN1]], 3
 670 ; NEON-NEXT:       [[TMP10:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[LDN1]], 2
 671 ; NEON-NEXT:       [[TMP11:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[LDN1]], 1
 672 ; NEON-NEXT:       [[TMP12:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[LDN1]], 0
 673 ; NEON-NEXT:       [[TMP13:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP9]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 674 ; NEON-NEXT:       [[TMP14:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP10]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 675 ; NEON-NEXT:       [[TMP15:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> [[TMP11]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 676 ; NEON-NEXT:       [[TMP16:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> [[TMP12]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 677 ; NEON-NEXT:       ret void
 678 ; NO_NEON-LABEL: @load_factor4_wide(
 679 ; NO_NEON-NOT:     @llvm.aarch64.neon
 680 ; NO_NEON:         ret void
 681 ;
 682   %interleaved.vec = load <32 x i32>, <32 x i32>* %ptr, align 4
 683   %v0 = shufflevector <32 x i32> %interleaved.vec, <32 x i32> undef, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28>
 684   %v1 = shufflevector <32 x i32> %interleaved.vec, <32 x i32> undef, <8 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29>
 685   %v2 = shufflevector <32 x i32> %interleaved.vec, <32 x i32> undef, <8 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30>
 686   %v3 = shufflevector <32 x i32> %interleaved.vec, <32 x i32> undef, <8 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31>
 687   ret void
 688 }
 689
 690 define void @store_factor2_wide(<16 x i32>* %ptr, <8 x i32> %v0, <8 x i32> %v1) {
 691 ; NEON-LABEL:    @store_factor2_wide(
 692 ; NEON-NEXT:       [[TMP1:%.*]] = bitcast <16 x i32>* %ptr to i32*
 693 ; NEON-NEXT:       [[TMP2:%.*]] = shufflevector <8 x i32> %v0, <8 x i32> %v1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 694 ; NEON-NEXT:       [[TMP3:%.*]] = shufflevector <8 x i32> %v0, <8 x i32> %v1, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
 695 ; NEON-NEXT:       [[TMP4:%.*]] = bitcast i32* [[TMP1]] to <4 x i32>*
 696 ; NEON-NEXT:       call void @llvm.aarch64.neon.st2.v4i32.p0v4i32(<4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32>* [[TMP4]])
 697 ; NEON-NEXT:       [[TMP5:%.*]] = shufflevector <8 x i32> %v0, <8 x i32> %v1, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
 698 ; NEON-NEXT:       [[TMP6:%.*]] = shufflevector <8 x i32> %v0, <8 x i32> %v1, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
 699 ; NEON-NEXT:       [[TMP7:%.*]] = getelementptr i32, i32* [[TMP1]], i32 8
 700 ; NEON-NEXT:       [[TMP8:%.*]] = bitcast i32* [[TMP7]] to <4 x i32>*
 701 ; NEON-NEXT:       call void @llvm.aarch64.neon.st2.v4i32.p0v4i32(<4 x i32> [[TMP5]], <4 x i32> [[TMP6]], <4 x i32>* [[TMP8]])
 702 ; NEON-NEXT:       ret void
 703 ; NO_NEON-LABEL: @store_factor2_wide(
 704 ; NO_NEON:         ret void
 705 ;
 706   %interleaved.vec = shufflevector <8 x i32> %v0, <8 x i32> %v1, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
 707   store <16 x i32> %interleaved.vec, <16 x i32>* %ptr, align 4
 708   ret void
 709 }
 710
 711 define void @store_factor3_wide(<24 x i32>* %ptr, <8 x i32> %v0, <8 x i32> %v1, <8 x i32> %v2) {
 712 ; NEON-LABEL:    @store_factor3_wide(
 713 ; NEON:            [[TMP1:%.*]] = bitcast <24 x i32>* %ptr to i32*
 714 ; NEON-NEXT:       [[TMP2:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 715 ; NEON-NEXT:       [[TMP3:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
 716 ; NEON-NEXT:       [[TMP4:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
 717 ; NEON-NEXT:       [[TMP5:%.*]] = bitcast i32* [[TMP1]] to <4 x i32>*
 718 ; NEON-NEXT:       call void @llvm.aarch64.neon.st3.v4i32.p0v4i32(<4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32>* [[TMP5]])
 719 ; NEON-NEXT:       [[TMP6:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
 720 ; NEON-NEXT:       [[TMP7:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
 721 ; NEON-NEXT:       [[TMP8:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> <i32 20, i32 21, i32 22, i32 23>
 722 ; NEON-NEXT:       [[TMP9:%.*]] = getelementptr i32, i32* [[TMP1]], i32 12
 723 ; NEON-NEXT:       [[TMP10:%.*]] = bitcast i32* [[TMP9]] to <4 x i32>*
 724 ; NEON-NEXT:       call void @llvm.aarch64.neon.st3.v4i32.p0v4i32(<4 x i32> [[TMP6]], <4 x i32> [[TMP7]], <4 x i32> [[TMP8]], <4 x i32>* [[TMP10]])
 725 ; NEON-NEXT:       ret void
 726 ; NO_NEON-LABEL: @store_factor3_wide(
 727 ; NO_NEON:         ret void
 728 ;
 729   %s0 = shufflevector <8 x i32> %v0, <8 x i32> %v1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 730   %s1 = shufflevector <8 x i32> %v2, <8 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
 731   %interleaved.vec = shufflevector <16 x i32> %s0, <16 x i32> %s1, <24 x i32> <i32 0, i32 8, i32 16, i32 1, i32 9, i32 17, i32 2, i32 10, i32 18, i32 3, i32 11, i32 19, i32 4, i32 12, i32 20, i32 5, i32 13, i32 21, i32 6, i32 14, i32 22, i32 7, i32 15, i32 23>
 732   store <24 x i32> %interleaved.vec, <24 x i32>* %ptr, align 4
 733   ret void
 734 }
 735
 736 define void @store_factor4_wide(<32 x i32>* %ptr, <8 x i32> %v0, <8 x i32> %v1, <8 x i32> %v2, <8 x i32> %v3) {
 737 ; NEON-LABEL:    @store_factor4_wide(
 738 ; NEON:            [[TMP1:%.*]] = bitcast <32 x i32>* %ptr to i32*
 739 ; NEON-NEXT:       [[TMP2:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 740 ; NEON-NEXT:       [[TMP3:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
 741 ; NEON-NEXT:       [[TMP4:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
 742 ; NEON-NEXT:       [[TMP5:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> <i32 24, i32 25, i32 26, i32 27>
 743 ; NEON-NEXT:       [[TMP6:%.*]] = bitcast i32* [[TMP1]] to <4 x i32>*
 744 ; NEON-NEXT:       call void @llvm.aarch64.neon.st4.v4i32.p0v4i32(<4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32>* [[TMP6]])
 745 ; NEON-NEXT:       [[TMP7:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
 746 ; NEON-NEXT:       [[TMP8:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
 747 ; NEON-NEXT:       [[TMP9:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> <i32 20, i32 21, i32 22, i32 23>
 748 ; NEON-NEXT:       [[TMP10:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> <i32 28, i32 29, i32 30, i32 31>
 749 ; NEON-NEXT:       [[TMP11:%.*]] = getelementptr i32, i32* [[TMP1]], i32 16
 750 ; NEON-NEXT:       [[TMP12:%.*]] = bitcast i32* [[TMP11]] to <4 x i32>*
 751 ; NEON-NEXT:       call void @llvm.aarch64.neon.st4.v4i32.p0v4i32(<4 x i32> [[TMP7]], <4 x i32> [[TMP8]], <4 x i32> [[TMP9]], <4 x i32> [[TMP10]], <4 x i32>* [[TMP12]])
 752 ; NEON-NEXT:       ret void
 753 ; NO_NEON-LABEL: @store_factor4_wide(
 754 ; NO_NEON-NOT:     @llvm.aarch64.neon
 755 ; NO_NEON:         ret void
 756 ;
 757   %s0 = shufflevector <8 x i32> %v0, <8 x i32> %v1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 758   %s1 = shufflevector <8 x i32> %v2, <8 x i32> %v3, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 759   %interleaved.vec = shufflevector <16 x i32> %s0, <16 x i32> %s1, <32 x i32> <i32 0, i32 8, i32 16, i32 24, i32 1, i32 9, i32 17, i32 25, i32 2, i32 10, i32 18, i32 26, i32 3, i32 11, i32 19, i32 27, i32 4, i32 12, i32 20, i32 28, i32 5, i32 13, i32 21, i32 29, i32 6, i32 14, i32 22, i32 30, i32 7, i32 15, i32 23, i32 31>
 760   store <32 x i32> %interleaved.vec, <32 x i32>* %ptr, align 4
 761   ret void
 762 }
 763
 764 define void @load_factor2_fp128(<4 x fp128>* %ptr) {
 765 ; NEON-LABEL:    @load_factor2_fp128(
 766 ; NEON-NOT:        @llvm.aarch64.neon
 767 ; NEON:            ret void
 768 ; NO_NEON-LABEL: @load_factor2_fp128(
 769 ; NO_NEON-NOT:     @llvm.aarch64.neon
 770 ; NO_NEON:         ret void
 771 ;
 772   %interleaved.vec = load <4 x fp128>, <4 x fp128>* %ptr, align 16
 773   %v0 = shufflevector <4 x fp128> %interleaved.vec, <4 x fp128> undef, <2 x i32> <i32 0, i32 2>
 774   %v1 = shufflevector <4 x fp128> %interleaved.vec, <4 x fp128> undef, <2 x i32> <i32 1, i32 3>
 775   ret void
 776 }
 777
 778 define <4 x i1> @load_large_vector(<12 x i64 *>* %p) {
 779 ; NEON-LABEL:    @load_large_vector(
 780 ; NEON:            [[LDN:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld3.v2i64.p0v2i64(<2 x i64>*
 781 ; NEON-NEXT:       [[TMP1:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } [[LDN]], 1
 782 ; NEON-NEXT:       [[TMP2:%.*]] = inttoptr <2 x i64> [[TMP1]] to <2 x i64*>
 783 ; NEON-NEXT:       [[TMP3:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } [[LDN]], 0
 784 ; NEON-NEXT:       [[TMP4:%.*]] = inttoptr <2 x i64> [[TMP3]] to <2 x i64*>
 785 ; NEON:            [[LDN1:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld3.v2i64.p0v2i64(<2 x i64>*
 786 ; NEON-NEXT:       [[TMP5:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } [[LDN1]], 1
 787 ; NEON-NEXT:       [[TMP6:%.*]] = inttoptr <2 x i64> [[TMP5]] to <2 x i64*>
 788 ; NEON-NEXT:       [[TMP7:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } [[LDN1]], 0
 789 ; NEON-NEXT:       [[TMP8:%.*]] = inttoptr <2 x i64> [[TMP7]] to <2 x i64*>
 790 ; NEON-NEXT:       shufflevector <2 x i64*> [[TMP2]], <2 x i64*> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 791 ; NEON-NEXT:       shufflevector <2 x i64*> [[TMP4]], <2 x i64*> [[TMP8]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 792 ; NO_NEON-LABEL: @load_large_vector(
 793 ; NO_NEON-NOT:     @llvm.aarch64.neon
 794 ; NO_NEON:         ret
 795 ;
 796   %l = load <12 x i64 *>, <12 x i64 *>* %p
 797   %s1 = shufflevector <12 x i64 *> %l, <12 x i64 *> undef, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
 798   %s2 = shufflevector <12 x i64 *> %l, <12 x i64 *> undef, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
 799   %ret = icmp ne <4 x i64 *> %s1, %s2
 800   ret <4 x i1> %ret
 801 }