llvm/test/Transforms/LoopVectorize/interleaved-accesses.ll

   1 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
   2 ; RUN: opt -S -passes=loop-vectorize,instcombine -force-vector-width=4 -force-vector-interleave=1 -enable-interleaved-mem-accesses=true -runtime-memory-check-threshold=24 < %s | FileCheck %s
   3
   4 target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
   5
   6 ; Check vectorization on an interleaved load group of factor 2 and an interleaved
   7 ; store group of factor 2.
   8
   9 ; int AB[1024];
  10 ; int CD[1024];
  11 ;  void test_array_load2_store2(int C, int D) {
  12 ;   for (int i = 0; i < 1024; i+=2) {
  13 ;     int A = AB[i];
  14 ;     int B = AB[i+1];
  15 ;     CD[i] = A + C;
  16 ;     CD[i+1] = B * D;
  17 ;   }
  18 ; }
  19
  20
  21 @AB = common global [1024 x i32] zeroinitializer, align 4
  22 @CD = common global [1024 x i32] zeroinitializer, align 4
  23
  24 define void @test_array_load2_store2(i32 %C, i32 %D) {
  25 ; CHECK-LABEL: @test_array_load2_store2(
  26 ; CHECK-NEXT:  entry:
  27 ; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
  28 ; CHECK:       vector.ph:
  29 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[C:%.*]], i64 0
  30 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
  31 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <4 x i32> poison, i32 [[D:%.*]], i64 0
  32 ; CHECK-NEXT:    [[BROADCAST_SPLAT3:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT2]], <4 x i32> poison, <4 x i32> zeroinitializer
  33 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
  34 ; CHECK:       vector.body:
  35 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
  36 ; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = shl i64 [[INDEX]], 1
  37 ; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [1024 x i32], ptr @AB, i64 0, i64 [[OFFSET_IDX]]
  38 ; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <8 x i32>, ptr [[TMP0]], align 4
  39 ; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  40 ; CHECK-NEXT:    [[STRIDED_VEC1:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  41 ; CHECK-NEXT:    [[TMP1:%.*]] = or i64 [[OFFSET_IDX]], 1
  42 ; CHECK-NEXT:    [[TMP2:%.*]] = add nsw <4 x i32> [[STRIDED_VEC]], [[BROADCAST_SPLAT]]
  43 ; CHECK-NEXT:    [[TMP3:%.*]] = mul nsw <4 x i32> [[STRIDED_VEC1]], [[BROADCAST_SPLAT3]]
  44 ; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1024 x i32], ptr @CD, i64 0, i64 [[TMP1]]
  45 ; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i64 -1
  46 ; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
  47 ; CHECK-NEXT:    store <8 x i32> [[INTERLEAVED_VEC]], ptr [[TMP5]], align 4
  48 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
  49 ; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 512
  50 ; CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
  51 ; CHECK:       middle.block:
  52 ; CHECK-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
  53 ; CHECK:       scalar.ph:
  54 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
  55 ; CHECK:       for.body:
  56 ; CHECK-NEXT:    br i1 poison, label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP3:![0-9]+]]
  57 ; CHECK:       for.end:
  58 ; CHECK-NEXT:    ret void
  59 ;
  60 entry:
  61   br label %for.body
  62
  63 for.body:                                         ; preds = %for.body, %entry
  64   %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
  65   %arrayidx0 = getelementptr inbounds [1024 x i32], ptr @AB, i64 0, i64 %indvars.iv
  66   %tmp = load i32, ptr %arrayidx0, align 4
  67   %tmp1 = or i64 %indvars.iv, 1
  68   %arrayidx1 = getelementptr inbounds [1024 x i32], ptr @AB, i64 0, i64 %tmp1
  69   %tmp2 = load i32, ptr %arrayidx1, align 4
  70   %add = add nsw i32 %tmp, %C
  71   %mul = mul nsw i32 %tmp2, %D
  72   %arrayidx2 = getelementptr inbounds [1024 x i32], ptr @CD, i64 0, i64 %indvars.iv
  73   store i32 %add, ptr %arrayidx2, align 4
  74   %arrayidx3 = getelementptr inbounds [1024 x i32], ptr @CD, i64 0, i64 %tmp1
  75   store i32 %mul, ptr %arrayidx3, align 4
  76   %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2
  77   %cmp = icmp slt i64 %indvars.iv.next, 1024
  78   br i1 %cmp, label %for.body, label %for.end
  79
  80 for.end:                                          ; preds = %for.body
  81   ret void
  82 }
  83
  84 ; int A[3072];
  85 ; struct ST S[1024];
  86 ; void test_struct_st3() {
  87 ;   int *ptr = A;
  88 ;   for (int i = 0; i < 1024; i++) {
  89 ;     int X1 = *ptr++;
  90 ;     int X2 = *ptr++;
  91 ;     int X3 = *ptr++;
  92 ;     T[i].x = X1 + 1;
  93 ;     T[i].y = X2 + 2;
  94 ;     T[i].z = X3 + 3;
  95 ;   }
  96 ; }
  97
  98
  99 %struct.ST3 = type { i32, i32, i32 }
 100 @A = common global [3072 x i32] zeroinitializer, align 4
 101 @S = common global [1024 x %struct.ST3] zeroinitializer, align 4
 102
 103 define void @test_struct_array_load3_store3() {
 104 ; CHECK-LABEL: @test_struct_array_load3_store3(
 105 ; CHECK-NEXT:  entry:
 106 ; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 107 ; CHECK:       vector.ph:
 108 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 109 ; CHECK:       vector.body:
 110 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 111 ; CHECK-NEXT:    [[TMP0:%.*]] = mul i64 [[INDEX]], 12
 112 ; CHECK-NEXT:    [[NEXT_GEP:%.*]] = getelementptr i8, ptr @A, i64 [[TMP0]]
 113 ; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <12 x i32>, ptr [[NEXT_GEP]], align 4
 114 ; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <12 x i32> [[WIDE_VEC]], <12 x i32> poison, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
 115 ; CHECK-NEXT:    [[STRIDED_VEC2:%.*]] = shufflevector <12 x i32> [[WIDE_VEC]], <12 x i32> poison, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
 116 ; CHECK-NEXT:    [[STRIDED_VEC3:%.*]] = shufflevector <12 x i32> [[WIDE_VEC]], <12 x i32> poison, <4 x i32> <i32 2, i32 5, i32 8, i32 11>
 117 ; CHECK-NEXT:    [[TMP1:%.*]] = add nsw <4 x i32> [[STRIDED_VEC]], <i32 1, i32 1, i32 1, i32 1>
 118 ; CHECK-NEXT:    [[TMP2:%.*]] = add nsw <4 x i32> [[STRIDED_VEC2]], <i32 2, i32 2, i32 2, i32 2>
 119 ; CHECK-NEXT:    [[TMP3:%.*]] = add nsw <4 x i32> [[STRIDED_VEC3]], <i32 3, i32 3, i32 3, i32 3>
 120 ; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1024 x %struct.ST3], ptr @S, i64 0, i64 [[INDEX]]
 121 ; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 122 ; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
 123 ; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i32> [[TMP5]], <8 x i32> [[TMP6]], <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
 124 ; CHECK-NEXT:    store <12 x i32> [[INTERLEAVED_VEC]], ptr [[TMP4]], align 4
 125 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 126 ; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
 127 ; CHECK-NEXT:    br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 128 ; CHECK:       middle.block:
 129 ; CHECK-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
 130 ; CHECK:       scalar.ph:
 131 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 132 ; CHECK:       for.body:
 133 ; CHECK-NEXT:    br i1 poison, label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
 134 ; CHECK:       for.end:
 135 ; CHECK-NEXT:    ret void
 136 ;
 137 entry:
 138   br label %for.body
 139
 140 for.body:                                         ; preds = %for.body, %entry
 141   %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
 142   %ptr.016 = phi ptr [ @A, %entry ], [ %incdec.ptr2, %for.body ]
 143   %incdec.ptr = getelementptr inbounds i32, ptr %ptr.016, i64 1
 144   %tmp = load i32, ptr %ptr.016, align 4
 145   %incdec.ptr1 = getelementptr inbounds i32, ptr %ptr.016, i64 2
 146   %tmp1 = load i32, ptr %incdec.ptr, align 4
 147   %incdec.ptr2 = getelementptr inbounds i32, ptr %ptr.016, i64 3
 148   %tmp2 = load i32, ptr %incdec.ptr1, align 4
 149   %add = add nsw i32 %tmp, 1
 150   %x = getelementptr inbounds [1024 x %struct.ST3], ptr @S, i64 0, i64 %indvars.iv, i32 0
 151   store i32 %add, ptr %x, align 4
 152   %add3 = add nsw i32 %tmp1, 2
 153   %y = getelementptr inbounds [1024 x %struct.ST3], ptr @S, i64 0, i64 %indvars.iv, i32 1
 154   store i32 %add3, ptr %y, align 4
 155   %add6 = add nsw i32 %tmp2, 3
 156   %z = getelementptr inbounds [1024 x %struct.ST3], ptr @S, i64 0, i64 %indvars.iv, i32 2
 157   store i32 %add6, ptr %z, align 4
 158   %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
 159   %exitcond = icmp eq i64 %indvars.iv.next, 1024
 160   br i1 %exitcond, label %for.end, label %for.body
 161
 162 for.end:                                          ; preds = %for.body
 163   ret void
 164 }
 165
 166 ; Check vectorization on an interleaved load group of factor 4.
 167
 168 ; struct ST4{
 169 ;   int x;
 170 ;   int y;
 171 ;   int z;
 172 ;   int w;
 173 ; };
 174 ; int test_struct_load4(struct ST4 *S) {
 175 ;   int r = 0;
 176 ;   for (int i = 0; i < 1024; i++) {
 177 ;      r += S[i].x;
 178 ;      r -= S[i].y;
 179 ;      r += S[i].z;
 180 ;      r -= S[i].w;
 181 ;   }
 182 ;   return r;
 183 ; }
 184
 185 %struct.ST4 = type { i32, i32, i32, i32 }
 186
 187 define i32 @test_struct_load4(ptr nocapture readonly %S) {
 188 ;
 189 ; CHECK-LABEL: @test_struct_load4(
 190 ; CHECK-NEXT:  entry:
 191 ; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 192 ; CHECK:       vector.ph:
 193 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 194 ; CHECK:       vector.body:
 195 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 196 ; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
 197 ; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT_ST4:%.*]], ptr [[S:%.*]], i64 [[INDEX]], i32 0
 198 ; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <16 x i32>, ptr [[TMP0]], align 4
 199 ; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> poison, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
 200 ; CHECK-NEXT:    [[STRIDED_VEC1:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> poison, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
 201 ; CHECK-NEXT:    [[STRIDED_VEC2:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> poison, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
 202 ; CHECK-NEXT:    [[STRIDED_VEC3:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> poison, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
 203 ; CHECK-NEXT:    [[TMP1:%.*]] = add <4 x i32> [[STRIDED_VEC]], [[VEC_PHI]]
 204 ; CHECK-NEXT:    [[TMP2:%.*]] = add <4 x i32> [[TMP1]], [[STRIDED_VEC2]]
 205 ; CHECK-NEXT:    [[TMP3:%.*]] = add <4 x i32> [[STRIDED_VEC1]], [[STRIDED_VEC3]]
 206 ; CHECK-NEXT:    [[TMP4]] = sub <4 x i32> [[TMP2]], [[TMP3]]
 207 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 208 ; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
 209 ; CHECK-NEXT:    br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
 210 ; CHECK:       middle.block:
 211 ; CHECK-NEXT:    [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP4]])
 212 ; CHECK-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
 213 ; CHECK:       scalar.ph:
 214 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 215 ; CHECK:       for.body:
 216 ; CHECK-NEXT:    br i1 poison, label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
 217 ; CHECK:       for.end:
 218 ; CHECK-NEXT:    [[SUB8_LCSSA:%.*]] = phi i32 [ poison, [[FOR_BODY]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ]
 219 ; CHECK-NEXT:    ret i32 [[SUB8_LCSSA]]
 220 ;
 221 entry:
 222   br label %for.body
 223
 224 for.body:                                         ; preds = %for.body, %entry
 225   %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
 226   %r.022 = phi i32 [ 0, %entry ], [ %sub8, %for.body ]
 227   %x = getelementptr inbounds %struct.ST4, ptr %S, i64 %indvars.iv, i32 0
 228   %tmp = load i32, ptr %x, align 4
 229   %add = add nsw i32 %tmp, %r.022
 230   %y = getelementptr inbounds %struct.ST4, ptr %S, i64 %indvars.iv, i32 1
 231   %tmp1 = load i32, ptr %y, align 4
 232   %sub = sub i32 %add, %tmp1
 233   %z = getelementptr inbounds %struct.ST4, ptr %S, i64 %indvars.iv, i32 2
 234   %tmp2 = load i32, ptr %z, align 4
 235   %add5 = add nsw i32 %sub, %tmp2
 236   %w = getelementptr inbounds %struct.ST4, ptr %S, i64 %indvars.iv, i32 3
 237   %tmp3 = load i32, ptr %w, align 4
 238   %sub8 = sub i32 %add5, %tmp3
 239   %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
 240   %exitcond = icmp eq i64 %indvars.iv.next, 1024
 241   br i1 %exitcond, label %for.end, label %for.body
 242
 243 for.end:                                          ; preds = %for.body
 244   ret i32 %sub8
 245 }
 246
 247 ; Check vectorization on an interleaved store group of factor 4.
 248
 249 ; void test_struct_store4(int *A, struct ST4 *B) {
 250 ;   int *ptr = A;
 251 ;   for (int i = 0; i < 1024; i++) {
 252 ;     int X = *ptr++;
 253 ;     B[i].x = X + 1;
 254 ;     B[i].y = X * 2;
 255 ;     B[i].z = X + 3;
 256 ;     B[i].w = X + 4;
 257 ;   }
 258 ; }
 259
 260
 261 define void @test_struct_store4(ptr noalias nocapture readonly %A, ptr noalias nocapture %B) {
 262 ; CHECK-LABEL: @test_struct_store4(
 263 ; CHECK-NEXT:  entry:
 264 ; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 265 ; CHECK:       vector.ph:
 266 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 267 ; CHECK:       vector.body:
 268 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 269 ; CHECK-NEXT:    [[TMP0:%.*]] = shl i64 [[INDEX]], 2
 270 ; CHECK-NEXT:    [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 [[TMP0]]
 271 ; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[NEXT_GEP]], align 4
 272 ; CHECK-NEXT:    [[TMP1:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], <i32 1, i32 1, i32 1, i32 1>
 273 ; CHECK-NEXT:    [[TMP2:%.*]] = shl nsw <4 x i32> [[WIDE_LOAD]], <i32 1, i32 1, i32 1, i32 1>
 274 ; CHECK-NEXT:    [[TMP3:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], <i32 3, i32 3, i32 3, i32 3>
 275 ; CHECK-NEXT:    [[TMP4:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], <i32 4, i32 4, i32 4, i32 4>
 276 ; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT_ST4:%.*]], ptr [[B:%.*]], i64 [[INDEX]]
 277 ; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 278 ; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 279 ; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i32> [[TMP6]], <8 x i32> [[TMP7]], <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
 280 ; CHECK-NEXT:    store <16 x i32> [[INTERLEAVED_VEC]], ptr [[TMP5]], align 4
 281 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 282 ; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
 283 ; CHECK-NEXT:    br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
 284 ; CHECK:       middle.block:
 285 ; CHECK-NEXT:    br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
 286 ; CHECK:       scalar.ph:
 287 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 288 ; CHECK:       for.cond.cleanup:
 289 ; CHECK-NEXT:    ret void
 290 ; CHECK:       for.body:
 291 ; CHECK-NEXT:    br i1 poison, label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
 292 ;
 293 entry:
 294   br label %for.body
 295
 296 for.cond.cleanup:                                 ; preds = %for.body
 297   ret void
 298
 299 for.body:                                         ; preds = %for.body, %entry
 300   %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
 301   %ptr.024 = phi ptr [ %A, %entry ], [ %incdec.ptr, %for.body ]
 302   %incdec.ptr = getelementptr inbounds i32, ptr %ptr.024, i64 1
 303   %tmp = load i32, ptr %ptr.024, align 4
 304   %add = add nsw i32 %tmp, 1
 305   %x = getelementptr inbounds %struct.ST4, ptr %B, i64 %indvars.iv, i32 0
 306   store i32 %add, ptr %x, align 4
 307   %mul = shl nsw i32 %tmp, 1
 308   %y = getelementptr inbounds %struct.ST4, ptr %B, i64 %indvars.iv, i32 1
 309   store i32 %mul, ptr %y, align 4
 310   %add3 = add nsw i32 %tmp, 3
 311   %z = getelementptr inbounds %struct.ST4, ptr %B, i64 %indvars.iv, i32 2
 312   store i32 %add3, ptr %z, align 4
 313   %add6 = add nsw i32 %tmp, 4
 314   %w = getelementptr inbounds %struct.ST4, ptr %B, i64 %indvars.iv, i32 3
 315   store i32 %add6, ptr %w, align 4
 316   %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
 317   %exitcond = icmp eq i64 %indvars.iv.next, 1024
 318   br i1 %exitcond, label %for.cond.cleanup, label %for.body
 319 }
 320
 321 ; Check vectorization on a reverse interleaved load group of factor 2 and
 322 ; a reverse interleaved store group of factor 2.
 323
 324 ; struct ST2 {
 325 ;  int x;
 326 ;  int y;
 327 ; };
 328 ;
 329 ; void test_reversed_load2_store2(struct ST2 *A, struct ST2 *B) {
 330 ;   for (int i = 1023; i >= 0; i--) {
 331 ;     int a = A[i].x + i;  // interleaved load of index 0
 332 ;     int b = A[i].y - i;  // interleaved load of index 1
 333 ;     B[i].x = a;          // interleaved store of index 0
 334 ;     B[i].y = b;          // interleaved store of index 1
 335 ;   }
 336 ; }
 337
 338
 339 %struct.ST2 = type { i32, i32 }
 340
 341 define void @test_reversed_load2_store2(ptr noalias nocapture readonly %A, ptr noalias nocapture %B) {
 342 ; CHECK-LABEL: @test_reversed_load2_store2(
 343 ; CHECK-NEXT:  entry:
 344 ; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 345 ; CHECK:       vector.ph:
 346 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 347 ; CHECK:       vector.body:
 348 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 349 ; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 1023, i32 1022, i32 1021, i32 1020>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
 350 ; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = sub i64 1023, [[INDEX]]
 351 ; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT_ST2:%.*]], ptr [[A:%.*]], i64 [[OFFSET_IDX]], i32 0
 352 ; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 -6
 353 ; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <8 x i32>, ptr [[TMP1]], align 4
 354 ; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
 355 ; CHECK-NEXT:    [[REVERSE:%.*]] = shufflevector <4 x i32> [[STRIDED_VEC]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
 356 ; CHECK-NEXT:    [[STRIDED_VEC1:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
 357 ; CHECK-NEXT:    [[REVERSE2:%.*]] = shufflevector <4 x i32> [[STRIDED_VEC1]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
 358 ; CHECK-NEXT:    [[TMP2:%.*]] = add nsw <4 x i32> [[REVERSE]], [[VEC_IND]]
 359 ; CHECK-NEXT:    [[TMP3:%.*]] = sub nsw <4 x i32> [[REVERSE2]], [[VEC_IND]]
 360 ; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [[STRUCT_ST2]], ptr [[B:%.*]], i64 [[OFFSET_IDX]], i32 1
 361 ; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i64 -7
 362 ; CHECK-NEXT:    [[REVERSE3:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
 363 ; CHECK-NEXT:    [[REVERSE4:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
 364 ; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i32> [[REVERSE3]], <4 x i32> [[REVERSE4]], <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
 365 ; CHECK-NEXT:    store <8 x i32> [[INTERLEAVED_VEC]], ptr [[TMP5]], align 4
 366 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 367 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 -4, i32 -4, i32 -4, i32 -4>
 368 ; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
 369 ; CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
 370 ; CHECK:       middle.block:
 371 ; CHECK-NEXT:    br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
 372 ; CHECK:       scalar.ph:
 373 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 374 ; CHECK:       for.cond.cleanup:
 375 ; CHECK-NEXT:    ret void
 376 ; CHECK:       for.body:
 377 ; CHECK-NEXT:    br i1 poison, label [[FOR_BODY]], label [[FOR_COND_CLEANUP]], !llvm.loop [[LOOP11:![0-9]+]]
 378 ;
 379 entry:
 380   br label %for.body
 381
 382 for.cond.cleanup:                                 ; preds = %for.body
 383   ret void
 384
 385 for.body:                                         ; preds = %for.body, %entry
 386   %indvars.iv = phi i64 [ 1023, %entry ], [ %indvars.iv.next, %for.body ]
 387   %x = getelementptr inbounds %struct.ST2, ptr %A, i64 %indvars.iv, i32 0
 388   %tmp = load i32, ptr %x, align 4
 389   %tmp1 = trunc i64 %indvars.iv to i32
 390   %add = add nsw i32 %tmp, %tmp1
 391   %y = getelementptr inbounds %struct.ST2, ptr %A, i64 %indvars.iv, i32 1
 392   %tmp2 = load i32, ptr %y, align 4
 393   %sub = sub nsw i32 %tmp2, %tmp1
 394   %x5 = getelementptr inbounds %struct.ST2, ptr %B, i64 %indvars.iv, i32 0
 395   store i32 %add, ptr %x5, align 4
 396   %y8 = getelementptr inbounds %struct.ST2, ptr %B, i64 %indvars.iv, i32 1
 397   store i32 %sub, ptr %y8, align 4
 398   %indvars.iv.next = add nsw i64 %indvars.iv, -1
 399   %cmp = icmp sgt i64 %indvars.iv, 0
 400   br i1 %cmp, label %for.body, label %for.cond.cleanup
 401 }
 402
 403 ; Check vectorization on an interleaved load group of factor 2 with 1 gap
 404 ; (missing the load of odd elements). Because the vectorized loop would
 405 ; speculatively access memory out-of-bounds, we must execute at least one
 406 ; iteration of the scalar loop.
 407
 408 ; void even_load_static_tc(int *A, int *B) {
 409 ;  for (unsigned i = 0; i < 1024; i+=2)
 410 ;     B[i/2] = A[i] * 2;
 411 ; }
 412
 413
 414 define void @even_load_static_tc(ptr noalias nocapture readonly %A, ptr noalias nocapture %B) {
 415 ; CHECK-LABEL: @even_load_static_tc(
 416 ; CHECK-NEXT:  entry:
 417 ; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 418 ; CHECK:       vector.ph:
 419 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 420 ; CHECK:       vector.body:
 421 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 422 ; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = shl i64 [[INDEX]], 1
 423 ; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[OFFSET_IDX]]
 424 ; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <8 x i32>, ptr [[TMP0]], align 4
 425 ; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
 426 ; CHECK-NEXT:    [[TMP1:%.*]] = shl nsw <4 x i32> [[STRIDED_VEC]], <i32 1, i32 1, i32 1, i32 1>
 427 ; CHECK-NEXT:    [[TMP2:%.*]] = and i64 [[INDEX]], 9223372036854775804
 428 ; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[TMP2]]
 429 ; CHECK-NEXT:    store <4 x i32> [[TMP1]], ptr [[TMP3]], align 4
 430 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 431 ; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], 508
 432 ; CHECK-NEXT:    br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
 433 ; CHECK:       middle.block:
 434 ; CHECK-NEXT:    br label [[SCALAR_PH]]
 435 ; CHECK:       scalar.ph:
 436 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 437 ; CHECK:       for.cond.cleanup:
 438 ; CHECK-NEXT:    ret void
 439 ; CHECK:       for.body:
 440 ; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 1016, [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
 441 ; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDVARS_IV]]
 442 ; CHECK-NEXT:    [[TMP:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
 443 ; CHECK-NEXT:    [[MUL:%.*]] = shl nsw i32 [[TMP]], 1
 444 ; CHECK-NEXT:    [[TMP1:%.*]] = lshr exact i64 [[INDVARS_IV]], 1
 445 ; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP1]]
 446 ; CHECK-NEXT:    store i32 [[MUL]], ptr [[ARRAYIDX2]], align 4
 447 ; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 2
 448 ; CHECK-NEXT:    [[CMP:%.*]] = icmp ult i64 [[INDVARS_IV]], 1022
 449 ; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP:%.*]], !llvm.loop [[LOOP13:![0-9]+]]
 450 ;
 451 entry:
 452   br label %for.body
 453
 454 for.cond.cleanup:                                 ; preds = %for.body
 455   ret void
 456
 457 for.body:                                         ; preds = %for.body, %entry
 458   %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
 459   %arrayidx = getelementptr inbounds i32, ptr %A, i64 %indvars.iv
 460   %tmp = load i32, ptr %arrayidx, align 4
 461   %mul = shl nsw i32 %tmp, 1
 462   %tmp1 = lshr exact i64 %indvars.iv, 1
 463   %arrayidx2 = getelementptr inbounds i32, ptr %B, i64 %tmp1
 464   store i32 %mul, ptr %arrayidx2, align 4
 465   %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2
 466   %cmp = icmp ult i64 %indvars.iv.next, 1024
 467   br i1 %cmp, label %for.body, label %for.cond.cleanup
 468 }
 469
 470 ; Check vectorization on an interleaved load group of factor 2 with 1 gap
 471 ; (missing the load of odd elements). Because the vectorized loop would
 472 ; speculatively access memory out-of-bounds, we must execute at least one
 473 ; iteration of the scalar loop.
 474
 475 ; void even_load_dynamic_tc(int *A, int *B, unsigned N) {
 476 ;  for (unsigned i = 0; i < N; i+=2)
 477 ;     B[i/2] = A[i] * 2;
 478 ; }
 479
 480
 481 define void @even_load_dynamic_tc(ptr noalias nocapture readonly %A, ptr noalias nocapture %B, i64 %N) {
 482 ; CHECK-LABEL: @even_load_dynamic_tc(
 483 ; CHECK-NEXT:  entry:
 484 ; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 9
 485 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 486 ; CHECK:       vector.ph:
 487 ; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[N]], -1
 488 ; CHECK-NEXT:    [[TMP1:%.*]] = lshr i64 [[TMP0]], 1
 489 ; CHECK-NEXT:    [[TMP2:%.*]] = add nuw i64 [[TMP1]], 1
 490 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = and i64 [[TMP2]], 3
 491 ; CHECK-NEXT:    [[TMP3:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
 492 ; CHECK-NEXT:    [[TMP4:%.*]] = select i1 [[TMP3]], i64 4, i64 [[N_MOD_VF]]
 493 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP2]], [[TMP4]]
 494 ; CHECK-NEXT:    [[IND_END:%.*]] = shl i64 [[N_VEC]], 1
 495 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 496 ; CHECK:       vector.body:
 497 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 498 ; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = shl i64 [[INDEX]], 1
 499 ; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[OFFSET_IDX]]
 500 ; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <8 x i32>, ptr [[TMP5]], align 4
 501 ; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
 502 ; CHECK-NEXT:    [[TMP6:%.*]] = shl nsw <4 x i32> [[STRIDED_VEC]], <i32 1, i32 1, i32 1, i32 1>
 503 ; CHECK-NEXT:    [[TMP7:%.*]] = and i64 [[INDEX]], 9223372036854775804
 504 ; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[TMP7]]
 505 ; CHECK-NEXT:    store <4 x i32> [[TMP6]], ptr [[TMP8]], align 4
 506 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 507 ; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 508 ; CHECK-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
 509 ; CHECK:       middle.block:
 510 ; CHECK-NEXT:    br label [[SCALAR_PH]]
 511 ; CHECK:       scalar.ph:
 512 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
 513 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 514 ; CHECK:       for.cond.cleanup:
 515 ; CHECK-NEXT:    ret void
 516 ; CHECK:       for.body:
 517 ; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
 518 ; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDVARS_IV]]
 519 ; CHECK-NEXT:    [[TMP:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
 520 ; CHECK-NEXT:    [[MUL:%.*]] = shl nsw i32 [[TMP]], 1
 521 ; CHECK-NEXT:    [[TMP1:%.*]] = lshr exact i64 [[INDVARS_IV]], 1
 522 ; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP1]]
 523 ; CHECK-NEXT:    store i32 [[MUL]], ptr [[ARRAYIDX2]], align 4
 524 ; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 2
 525 ; CHECK-NEXT:    [[CMP:%.*]] = icmp ult i64 [[INDVARS_IV_NEXT]], [[N]]
 526 ; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP:%.*]], !llvm.loop [[LOOP15:![0-9]+]]
 527 ;
 528 entry:
 529   br label %for.body
 530
 531 for.cond.cleanup:                                 ; preds = %for.body
 532   ret void
 533
 534 for.body:                                         ; preds = %for.body, %entry
 535   %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
 536   %arrayidx = getelementptr inbounds i32, ptr %A, i64 %indvars.iv
 537   %tmp = load i32, ptr %arrayidx, align 4
 538   %mul = shl nsw i32 %tmp, 1
 539   %tmp1 = lshr exact i64 %indvars.iv, 1
 540   %arrayidx2 = getelementptr inbounds i32, ptr %B, i64 %tmp1
 541   store i32 %mul, ptr %arrayidx2, align 4
 542   %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2
 543   %cmp = icmp ult i64 %indvars.iv.next, %N
 544   br i1 %cmp, label %for.body, label %for.cond.cleanup
 545 }
 546
 547 ; Check vectorization on a reverse interleaved load group of factor 2 with 1
 548 ; gap and a reverse interleaved store group of factor 2. The interleaved load
 549 ; group should be removed since it has a gap and is reverse.
 550
 551 ; struct pair {
 552 ;  int x;
 553 ;  int y;
 554 ; };
 555 ;
 556 ; void load_gap_reverse(struct pair *P1, struct pair *P2, int X) {
 557 ;   for (int i = 1023; i >= 0; i--) {
 558 ;     int a = X + i;
 559 ;     int b = B[i].y - i;
 560 ;     A[i].x = a;
 561 ;     B[i].y = b;
 562 ;   }
 563 ; }
 564
 565
 566 %pair = type { i64, i64 }
 567 define void @load_gap_reverse(ptr noalias nocapture %P1, ptr noalias nocapture %P2, i64 %X) {
 568 ; CHECK-LABEL: @load_gap_reverse(
 569 ; CHECK-NEXT:  entry:
 570 ; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 571 ; CHECK:       vector.ph:
 572 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[X:%.*]], i64 0
 573 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
 574 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 575 ; CHECK:       vector.body:
 576 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 577 ; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 1023, i64 1022, i64 1021, i64 1020>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
 578 ; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = sub i64 1023, [[INDEX]]
 579 ; CHECK-NEXT:    [[TMP0:%.*]] = sub i64 1022, [[INDEX]]
 580 ; CHECK-NEXT:    [[TMP1:%.*]] = sub i64 1021, [[INDEX]]
 581 ; CHECK-NEXT:    [[TMP2:%.*]] = sub i64 1020, [[INDEX]]
 582 ; CHECK-NEXT:    [[TMP3:%.*]] = add nsw <4 x i64> [[BROADCAST_SPLAT]], [[VEC_IND]]
 583 ; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [[PAIR:%.*]], ptr [[P1:%.*]], i64 [[OFFSET_IDX]], i32 0
 584 ; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[PAIR]], ptr [[P1]], i64 [[TMP0]], i32 0
 585 ; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[PAIR]], ptr [[P1]], i64 [[TMP1]], i32 0
 586 ; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[PAIR]], ptr [[P1]], i64 [[TMP2]], i32 0
 587 ; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [[PAIR]], ptr [[P2:%.*]], i64 [[OFFSET_IDX]], i32 1
 588 ; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [[PAIR]], ptr [[P2]], i64 [[TMP0]], i32 1
 589 ; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [[PAIR]], ptr [[P2]], i64 [[TMP1]], i32 1
 590 ; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [[PAIR]], ptr [[P2]], i64 [[TMP2]], i32 1
 591 ; CHECK-NEXT:    [[TMP12:%.*]] = load i64, ptr [[TMP8]], align 8
 592 ; CHECK-NEXT:    [[TMP13:%.*]] = load i64, ptr [[TMP9]], align 8
 593 ; CHECK-NEXT:    [[TMP14:%.*]] = load i64, ptr [[TMP10]], align 8
 594 ; CHECK-NEXT:    [[TMP15:%.*]] = load i64, ptr [[TMP11]], align 8
 595 ; CHECK-NEXT:    [[TMP16:%.*]] = insertelement <4 x i64> poison, i64 [[TMP12]], i64 0
 596 ; CHECK-NEXT:    [[TMP17:%.*]] = insertelement <4 x i64> [[TMP16]], i64 [[TMP13]], i64 1
 597 ; CHECK-NEXT:    [[TMP18:%.*]] = insertelement <4 x i64> [[TMP17]], i64 [[TMP14]], i64 2
 598 ; CHECK-NEXT:    [[TMP19:%.*]] = insertelement <4 x i64> [[TMP18]], i64 [[TMP15]], i64 3
 599 ; CHECK-NEXT:    [[TMP20:%.*]] = sub nsw <4 x i64> [[TMP19]], [[VEC_IND]]
 600 ; CHECK-NEXT:    [[TMP21:%.*]] = extractelement <4 x i64> [[TMP3]], i64 0
 601 ; CHECK-NEXT:    store i64 [[TMP21]], ptr [[TMP4]], align 8
 602 ; CHECK-NEXT:    [[TMP22:%.*]] = extractelement <4 x i64> [[TMP3]], i64 1
 603 ; CHECK-NEXT:    store i64 [[TMP22]], ptr [[TMP5]], align 8
 604 ; CHECK-NEXT:    [[TMP23:%.*]] = extractelement <4 x i64> [[TMP3]], i64 2
 605 ; CHECK-NEXT:    store i64 [[TMP23]], ptr [[TMP6]], align 8
 606 ; CHECK-NEXT:    [[TMP24:%.*]] = extractelement <4 x i64> [[TMP3]], i64 3
 607 ; CHECK-NEXT:    store i64 [[TMP24]], ptr [[TMP7]], align 8
 608 ; CHECK-NEXT:    [[TMP25:%.*]] = extractelement <4 x i64> [[TMP20]], i64 0
 609 ; CHECK-NEXT:    store i64 [[TMP25]], ptr [[TMP8]], align 8
 610 ; CHECK-NEXT:    [[TMP26:%.*]] = extractelement <4 x i64> [[TMP20]], i64 1
 611 ; CHECK-NEXT:    store i64 [[TMP26]], ptr [[TMP9]], align 8
 612 ; CHECK-NEXT:    [[TMP27:%.*]] = extractelement <4 x i64> [[TMP20]], i64 2
 613 ; CHECK-NEXT:    store i64 [[TMP27]], ptr [[TMP10]], align 8
 614 ; CHECK-NEXT:    [[TMP28:%.*]] = extractelement <4 x i64> [[TMP20]], i64 3
 615 ; CHECK-NEXT:    store i64 [[TMP28]], ptr [[TMP11]], align 8
 616 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 617 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], <i64 -4, i64 -4, i64 -4, i64 -4>
 618 ; CHECK-NEXT:    [[TMP29:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
 619 ; CHECK-NEXT:    br i1 [[TMP29]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
 620 ; CHECK:       middle.block:
 621 ; CHECK-NEXT:    br i1 true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
 622 ; CHECK:       scalar.ph:
 623 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 624 ; CHECK:       for.body:
 625 ; CHECK-NEXT:    br i1 poison, label [[FOR_BODY]], label [[FOR_EXIT]], !llvm.loop [[LOOP17:![0-9]+]]
 626 ; CHECK:       for.exit:
 627 ; CHECK-NEXT:    ret void
 628 ;
 629 entry:
 630   br label %for.body
 631
 632 for.body:
 633   %i = phi i64 [ 1023, %entry ], [ %i.next, %for.body ]
 634   %0 = add nsw i64 %X, %i
 635   %1 = getelementptr inbounds %pair, ptr %P1, i64 %i, i32 0
 636   %2 = getelementptr inbounds %pair, ptr %P2, i64 %i, i32 1
 637   %3 = load i64, ptr %2, align 8
 638   %4 = sub nsw i64 %3, %i
 639   store i64 %0, ptr %1, align 8
 640   store i64 %4, ptr %2, align 8
 641   %i.next = add nsw i64 %i, -1
 642   %cond = icmp sgt i64 %i, 0
 643   br i1 %cond, label %for.body, label %for.exit
 644
 645 for.exit:
 646   ret void
 647 }
 648
 649 ; Check vectorization on interleaved access groups identified from mixed
 650 ; loads/stores.
 651 ; void mixed_load2_store2(int *A, int *B) {
 652 ;   for (unsigned i = 0; i < 1024; i+=2)  {
 653 ;     B[i] = A[i] * A[i+1];
 654 ;     B[i+1] = A[i] + A[i+1];
 655 ;   }
 656 ; }
 657
 658
 659 define void @mixed_load2_store2(ptr noalias nocapture readonly %A, ptr noalias nocapture %B) {
 660 ; CHECK-LABEL: @mixed_load2_store2(
 661 ; CHECK-NEXT:  entry:
 662 ; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 663 ; CHECK:       vector.ph:
 664 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 665 ; CHECK:       vector.body:
 666 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 667 ; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = shl i64 [[INDEX]], 1
 668 ; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[OFFSET_IDX]]
 669 ; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <8 x i32>, ptr [[TMP0]], align 4
 670 ; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
 671 ; CHECK-NEXT:    [[STRIDED_VEC1:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
 672 ; CHECK-NEXT:    [[TMP1:%.*]] = or i64 [[OFFSET_IDX]], 1
 673 ; CHECK-NEXT:    [[TMP2:%.*]] = mul nsw <4 x i32> [[STRIDED_VEC1]], [[STRIDED_VEC]]
 674 ; CHECK-NEXT:    [[STRIDED_VEC3:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
 675 ; CHECK-NEXT:    [[STRIDED_VEC4:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
 676 ; CHECK-NEXT:    [[TMP3:%.*]] = add nsw <4 x i32> [[STRIDED_VEC4]], [[STRIDED_VEC3]]
 677 ; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[TMP1]]
 678 ; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i64 -1
 679 ; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
 680 ; CHECK-NEXT:    store <8 x i32> [[INTERLEAVED_VEC]], ptr [[TMP5]], align 4
 681 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 682 ; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 512
 683 ; CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
 684 ; CHECK:       middle.block:
 685 ; CHECK-NEXT:    br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
 686 ; CHECK:       scalar.ph:
 687 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 688 ; CHECK:       for.cond.cleanup:
 689 ; CHECK-NEXT:    ret void
 690 ; CHECK:       for.body:
 691 ; CHECK-NEXT:    br i1 poison, label [[FOR_BODY]], label [[FOR_COND_CLEANUP]], !llvm.loop [[LOOP19:![0-9]+]]
 692 ;
 693 entry:
 694   br label %for.body
 695
 696 for.cond.cleanup:                                 ; preds = %for.body
 697   ret void
 698
 699 for.body:                                         ; preds = %for.body, %entry
 700   %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
 701   %arrayidx = getelementptr inbounds i32, ptr %A, i64 %indvars.iv
 702   %tmp = load i32, ptr %arrayidx, align 4
 703   %tmp1 = or i64 %indvars.iv, 1
 704   %arrayidx2 = getelementptr inbounds i32, ptr %A, i64 %tmp1
 705   %tmp2 = load i32, ptr %arrayidx2, align 4
 706   %mul = mul nsw i32 %tmp2, %tmp
 707   %arrayidx4 = getelementptr inbounds i32, ptr %B, i64 %indvars.iv
 708   store i32 %mul, ptr %arrayidx4, align 4
 709   %tmp3 = load i32, ptr %arrayidx, align 4
 710   %tmp4 = load i32, ptr %arrayidx2, align 4
 711   %add10 = add nsw i32 %tmp4, %tmp3
 712   %arrayidx13 = getelementptr inbounds i32, ptr %B, i64 %tmp1
 713   store i32 %add10, ptr %arrayidx13, align 4
 714   %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2
 715   %cmp = icmp ult i64 %indvars.iv.next, 1024
 716   br i1 %cmp, label %for.body, label %for.cond.cleanup
 717 }
 718
 719 ; Check vectorization on interleaved access groups identified from mixed
 720 ; loads/stores.
 721 ; void mixed_load3_store3(int *A) {
 722 ;   for (unsigned i = 0; i < 1024; i++)  {
 723 ;     *A++ += i;
 724 ;     *A++ += i;
 725 ;     *A++ += i;
 726 ;   }
 727 ; }
 728
 729
 730 define void @mixed_load3_store3(ptr nocapture %A) {
 731 ; CHECK-LABEL: @mixed_load3_store3(
 732 ; CHECK-NEXT:  entry:
 733 ; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 734 ; CHECK:       vector.ph:
 735 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 736 ; CHECK:       vector.body:
 737 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 738 ; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
 739 ; CHECK-NEXT:    [[TMP0:%.*]] = mul i64 [[INDEX]], 12
 740 ; CHECK-NEXT:    [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 [[TMP0]]
 741 ; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <12 x i32>, ptr [[NEXT_GEP]], align 4
 742 ; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <12 x i32> [[WIDE_VEC]], <12 x i32> poison, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
 743 ; CHECK-NEXT:    [[STRIDED_VEC2:%.*]] = shufflevector <12 x i32> [[WIDE_VEC]], <12 x i32> poison, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
 744 ; CHECK-NEXT:    [[STRIDED_VEC3:%.*]] = shufflevector <12 x i32> [[WIDE_VEC]], <12 x i32> poison, <4 x i32> <i32 2, i32 5, i32 8, i32 11>
 745 ; CHECK-NEXT:    [[TMP1:%.*]] = add <4 x i32> [[STRIDED_VEC]], [[VEC_IND]]
 746 ; CHECK-NEXT:    [[TMP2:%.*]] = add <4 x i32> [[STRIDED_VEC2]], [[VEC_IND]]
 747 ; CHECK-NEXT:    [[TMP3:%.*]] = add <4 x i32> [[STRIDED_VEC3]], [[VEC_IND]]
 748 ; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 749 ; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
 750 ; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i32> [[TMP4]], <8 x i32> [[TMP5]], <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
 751 ; CHECK-NEXT:    store <12 x i32> [[INTERLEAVED_VEC]], ptr [[NEXT_GEP]], align 4
 752 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 753 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
 754 ; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
 755 ; CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
 756 ; CHECK:       middle.block:
 757 ; CHECK-NEXT:    br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
 758 ; CHECK:       scalar.ph:
 759 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 760 ; CHECK:       for.cond.cleanup:
 761 ; CHECK-NEXT:    ret void
 762 ; CHECK:       for.body:
 763 ; CHECK-NEXT:    br i1 poison, label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]]
 764 ;
 765 entry:
 766   br label %for.body
 767
 768 for.cond.cleanup:                                 ; preds = %for.body
 769   ret void
 770
 771 for.body:                                         ; preds = %for.body, %entry
 772   %i.013 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
 773   %A.addr.012 = phi ptr [ %A, %entry ], [ %incdec.ptr3, %for.body ]
 774   %incdec.ptr = getelementptr inbounds i32, ptr %A.addr.012, i64 1
 775   %tmp = load i32, ptr %A.addr.012, align 4
 776   %add = add i32 %tmp, %i.013
 777   store i32 %add, ptr %A.addr.012, align 4
 778   %incdec.ptr1 = getelementptr inbounds i32, ptr %A.addr.012, i64 2
 779   %tmp1 = load i32, ptr %incdec.ptr, align 4
 780   %add2 = add i32 %tmp1, %i.013
 781   store i32 %add2, ptr %incdec.ptr, align 4
 782   %incdec.ptr3 = getelementptr inbounds i32, ptr %A.addr.012, i64 3
 783   %tmp2 = load i32, ptr %incdec.ptr1, align 4
 784   %add4 = add i32 %tmp2, %i.013
 785   store i32 %add4, ptr %incdec.ptr1, align 4
 786   %inc = add nuw nsw i32 %i.013, 1
 787   %exitcond = icmp eq i32 %inc, 1024
 788   br i1 %exitcond, label %for.cond.cleanup, label %for.body
 789 }
 790
 791 ; Check vectorization on interleaved access groups with members having different
 792 ; kinds of type.
 793
 794 ; struct IntFloat {
 795 ;   int a;
 796 ;   float b;
 797 ; };
 798 ;
 799 ; int SA;
 800 ; float SB;
 801 ;
 802 ; void int_float_struct(struct IntFloat *A) {
 803 ;   int SumA;
 804 ;   float SumB;
 805 ;   for (unsigned i = 0; i < 1024; i++)  {
 806 ;     SumA += A[i].a;
 807 ;     SumB += A[i].b;
 808 ;   }
 809 ;   SA = SumA;
 810 ;   SB = SumB;
 811 ; }
 812
 813
 814 %struct.IntFloat = type { i32, float }
 815
 816 @SA = common global i32 0, align 4
 817 @SB = common global float 0.000000e+00, align 4
 818
 819 define void @int_float_struct(ptr nocapture readonly %A) #0 {
 820 ; CHECK-LABEL: @int_float_struct(
 821 ; CHECK-NEXT:  entry:
 822 ; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 823 ; CHECK:       vector.ph:
 824 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 825 ; CHECK:       vector.body:
 826 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 827 ; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x float> [ <float undef, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>, [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ]
 828 ; CHECK-NEXT:    [[VEC_PHI1:%.*]] = phi <4 x i32> [ <i32 undef, i32 0, i32 0, i32 0>, [[VECTOR_PH]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ]
 829 ; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT_INTFLOAT:%.*]], ptr [[A:%.*]], i64 [[INDEX]], i32 0
 830 ; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <8 x i32>, ptr [[TMP0]], align 4
 831 ; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
 832 ; CHECK-NEXT:    [[STRIDED_VEC2:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
 833 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i32> [[STRIDED_VEC2]] to <4 x float>
 834 ; CHECK-NEXT:    [[TMP2]] = add <4 x i32> [[STRIDED_VEC]], [[VEC_PHI1]]
 835 ; CHECK-NEXT:    [[TMP3]] = fadd fast <4 x float> [[VEC_PHI]], [[TMP1]]
 836 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 837 ; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
 838 ; CHECK-NEXT:    br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
 839 ; CHECK:       middle.block:
 840 ; CHECK-NEXT:    [[TMP5:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP2]])
 841 ; CHECK-NEXT:    [[TMP6:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP3]])
 842 ; CHECK-NEXT:    br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
 843 ; CHECK:       scalar.ph:
 844 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 845 ; CHECK:       for.cond.cleanup:
 846 ; CHECK-NEXT:    [[ADD_LCSSA:%.*]] = phi i32 [ poison, [[FOR_BODY]] ], [ [[TMP5]], [[MIDDLE_BLOCK]] ]
 847 ; CHECK-NEXT:    [[ADD3_LCSSA:%.*]] = phi float [ poison, [[FOR_BODY]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ]
 848 ; CHECK-NEXT:    store i32 [[ADD_LCSSA]], ptr @SA, align 4
 849 ; CHECK-NEXT:    store float [[ADD3_LCSSA]], ptr @SB, align 4
 850 ; CHECK-NEXT:    ret void
 851 ; CHECK:       for.body:
 852 ; CHECK-NEXT:    br i1 poison, label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]]
 853 ;
 854 entry:
 855   br label %for.body
 856
 857 for.cond.cleanup:                                 ; preds = %for.body
 858   store i32 %add, ptr @SA, align 4
 859   store float %add3, ptr @SB, align 4
 860   ret void
 861
 862 for.body:                                         ; preds = %for.body, %entry
 863   %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
 864   %SumB.014 = phi float [ undef, %entry ], [ %add3, %for.body ]
 865   %SumA.013 = phi i32 [ undef, %entry ], [ %add, %for.body ]
 866   %a = getelementptr inbounds %struct.IntFloat, ptr %A, i64 %indvars.iv, i32 0
 867   %tmp = load i32, ptr %a, align 4
 868   %add = add nsw i32 %tmp, %SumA.013
 869   %b = getelementptr inbounds %struct.IntFloat, ptr %A, i64 %indvars.iv, i32 1
 870   %tmp1 = load float, ptr %b, align 4
 871   %add3 = fadd fast float %SumB.014, %tmp1
 872   %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
 873   %exitcond = icmp eq i64 %indvars.iv.next, 1024
 874   br i1 %exitcond, label %for.cond.cleanup, label %for.body
 875 }
 876
 877 ; Check vectorization of interleaved access groups in the presence of
 878 ; dependences (PR27626). The following tests check that we don't reorder
 879 ; dependent loads and stores when generating code for interleaved access
 880 ; groups. Stores should be scalarized because the required code motion would
 881 ; break dependences, and the remaining interleaved load groups should have
 882 ; gaps.
 883
 884 ; PR27626_0: Ensure a strided store is not moved after a dependent (zero
 885 ;            distance) strided load.
 886
 887 ; void PR27626_0(struct pair *p, int z, int n) {
 888 ;   for (int i = 0; i < n; i++) {
 889 ;     p[i].x = z;
 890 ;     p[i].y = p[i].x;
 891 ;   }
 892 ; }
 893
 894
 895 %pair.i32 = type { i32, i32 }
 896 define void @PR27626_0(ptr %p, i32 %z, i64 %n) {
 897 ; CHECK-LABEL: @PR27626_0(
 898 ; CHECK-NEXT:  entry:
 899 ; CHECK-NEXT:    [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N:%.*]], i64 1)
 900 ; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SMAX]], 5
 901 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 902 ; CHECK:       vector.ph:
 903 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = and i64 [[SMAX]], 3
 904 ; CHECK-NEXT:    [[TMP0:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
 905 ; CHECK-NEXT:    [[TMP1:%.*]] = select i1 [[TMP0]], i64 4, i64 [[N_MOD_VF]]
 906 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub nsw i64 [[SMAX]], [[TMP1]]
 907 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 908 ; CHECK:       vector.body:
 909 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 910 ; CHECK-NEXT:    [[TMP2:%.*]] = or i64 [[INDEX]], 1
 911 ; CHECK-NEXT:    [[TMP3:%.*]] = or i64 [[INDEX]], 2
 912 ; CHECK-NEXT:    [[TMP4:%.*]] = or i64 [[INDEX]], 3
 913 ; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[PAIR_I32:%.*]], ptr [[P:%.*]], i64 [[INDEX]], i32 0
 914 ; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP2]], i32 0
 915 ; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP3]], i32 0
 916 ; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP4]], i32 0
 917 ; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[INDEX]], i32 1
 918 ; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP2]], i32 1
 919 ; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP3]], i32 1
 920 ; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP4]], i32 1
 921 ; CHECK-NEXT:    store i32 [[Z:%.*]], ptr [[TMP5]], align 4
 922 ; CHECK-NEXT:    store i32 [[Z]], ptr [[TMP6]], align 4
 923 ; CHECK-NEXT:    store i32 [[Z]], ptr [[TMP7]], align 4
 924 ; CHECK-NEXT:    store i32 [[Z]], ptr [[TMP8]], align 4
 925 ; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <8 x i32>, ptr [[TMP5]], align 4
 926 ; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i64 0
 927 ; CHECK-NEXT:    store i32 [[TMP13]], ptr [[TMP9]], align 4
 928 ; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i64 2
 929 ; CHECK-NEXT:    store i32 [[TMP14]], ptr [[TMP10]], align 4
 930 ; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i64 4
 931 ; CHECK-NEXT:    store i32 [[TMP15]], ptr [[TMP11]], align 4
 932 ; CHECK-NEXT:    [[TMP16:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i64 6
 933 ; CHECK-NEXT:    store i32 [[TMP16]], ptr [[TMP12]], align 4
 934 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 935 ; CHECK-NEXT:    [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 936 ; CHECK-NEXT:    br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]]
 937 ; CHECK:       middle.block:
 938 ; CHECK-NEXT:    br label [[SCALAR_PH]]
 939 ; CHECK:       scalar.ph:
 940 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
 941 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 942 ; CHECK:       for.body:
 943 ; CHECK-NEXT:    [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
 944 ; CHECK-NEXT:    [[P_I_X:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[I]], i32 0
 945 ; CHECK-NEXT:    [[P_I_Y:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[I]], i32 1
 946 ; CHECK-NEXT:    store i32 [[Z]], ptr [[P_I_X]], align 4
 947 ; CHECK-NEXT:    store i32 [[Z]], ptr [[P_I_Y]], align 4
 948 ; CHECK-NEXT:    [[I_NEXT]] = add nuw nsw i64 [[I]], 1
 949 ; CHECK-NEXT:    [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]]
 950 ; CHECK-NEXT:    br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END:%.*]], !llvm.loop [[LOOP25:![0-9]+]]
 951 ; CHECK:       for.end:
 952 ; CHECK-NEXT:    ret void
 953 ;
 954 entry:
 955   br label %for.body
 956
 957 for.body:
 958   %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
 959   %p_i.x = getelementptr inbounds %pair.i32, ptr %p, i64 %i, i32 0
 960   %p_i.y = getelementptr inbounds %pair.i32, ptr %p, i64 %i, i32 1
 961   store i32 %z, ptr %p_i.x, align 4
 962   %0 = load i32, ptr %p_i.x, align 4
 963   store i32 %0, ptr %p_i.y, align 4
 964   %i.next = add nuw nsw i64 %i, 1
 965   %cond = icmp slt i64 %i.next, %n
 966   br i1 %cond, label %for.body, label %for.end
 967
 968 for.end:
 969   ret void
 970 }
 971
 972 ; PR27626_1: Ensure a strided load is not moved before a dependent (zero
 973 ;            distance) strided store.
 974
 975 ; void PR27626_1(struct pair *p, int n) {
 976 ;   int s = 0;
 977 ;   for (int i = 0; i < n; i++) {
 978 ;     p[i].y = p[i].x;
 979 ;     s += p[i].y
 980 ;   }
 981 ; }
 982
 983
 984 define i32 @PR27626_1(ptr %p, i64 %n) {
 985 ; CHECK-LABEL: @PR27626_1(
 986 ; CHECK-NEXT:  entry:
 987 ; CHECK-NEXT:    [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N:%.*]], i64 1)
 988 ; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SMAX]], 5
 989 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 990 ; CHECK:       vector.ph:
 991 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = and i64 [[SMAX]], 3
 992 ; CHECK-NEXT:    [[TMP0:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
 993 ; CHECK-NEXT:    [[TMP1:%.*]] = select i1 [[TMP0]], i64 4, i64 [[N_MOD_VF]]
 994 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub nsw i64 [[SMAX]], [[TMP1]]
 995 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 996 ; CHECK:       vector.body:
 997 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 998 ; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP14:%.*]], [[VECTOR_BODY]] ]
 999 ; CHECK-NEXT:    [[TMP2:%.*]] = or i64 [[INDEX]], 1
1000 ; CHECK-NEXT:    [[TMP3:%.*]] = or i64 [[INDEX]], 2
1001 ; CHECK-NEXT:    [[TMP4:%.*]] = or i64 [[INDEX]], 3
1002 ; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[PAIR_I32:%.*]], ptr [[P:%.*]], i64 [[INDEX]], i32 0
1003 ; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[INDEX]], i32 1
1004 ; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP2]], i32 1
1005 ; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP3]], i32 1
1006 ; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP4]], i32 1
1007 ; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <8 x i32>, ptr [[TMP5]], align 4
1008 ; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i64 0
1009 ; CHECK-NEXT:    store i32 [[TMP10]], ptr [[TMP6]], align 4
1010 ; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i64 2
1011 ; CHECK-NEXT:    store i32 [[TMP11]], ptr [[TMP7]], align 4
1012 ; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i64 4
1013 ; CHECK-NEXT:    store i32 [[TMP12]], ptr [[TMP8]], align 4
1014 ; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i64 6
1015 ; CHECK-NEXT:    store i32 [[TMP13]], ptr [[TMP9]], align 4
1016 ; CHECK-NEXT:    [[WIDE_VEC1:%.*]] = load <8 x i32>, ptr [[TMP6]], align 4
1017 ; CHECK-NEXT:    [[STRIDED_VEC2:%.*]] = shufflevector <8 x i32> [[WIDE_VEC1]], <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
1018 ; CHECK-NEXT:    [[TMP14]] = add <4 x i32> [[STRIDED_VEC2]], [[VEC_PHI]]
1019 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
1020 ; CHECK-NEXT:    [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
1021 ; CHECK-NEXT:    br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]]
1022 ; CHECK:       middle.block:
1023 ; CHECK-NEXT:    [[TMP16:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP14]])
1024 ; CHECK-NEXT:    br label [[SCALAR_PH]]
1025 ; CHECK:       scalar.ph:
1026 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
1027 ; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP16]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
1028 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
1029 ; CHECK:       for.body:
1030 ; CHECK-NEXT:    [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
1031 ; CHECK-NEXT:    [[S:%.*]] = phi i32 [ [[TMP18:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
1032 ; CHECK-NEXT:    [[P_I_X:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[I]], i32 0
1033 ; CHECK-NEXT:    [[P_I_Y:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[I]], i32 1
1034 ; CHECK-NEXT:    [[TMP17:%.*]] = load i32, ptr [[P_I_X]], align 4
1035 ; CHECK-NEXT:    store i32 [[TMP17]], ptr [[P_I_Y]], align 4
1036 ; CHECK-NEXT:    [[TMP18]] = add nsw i32 [[TMP17]], [[S]]
1037 ; CHECK-NEXT:    [[I_NEXT]] = add nuw nsw i64 [[I]], 1
1038 ; CHECK-NEXT:    [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]]
1039 ; CHECK-NEXT:    br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END:%.*]], !llvm.loop [[LOOP27:![0-9]+]]
1040 ; CHECK:       for.end:
1041 ; CHECK-NEXT:    ret i32 [[TMP18]]
1042 ;
1043 entry:
1044   br label %for.body
1045
1046 for.body:
1047   %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
1048   %s = phi i32 [ %2, %for.body ], [ 0, %entry ]
1049   %p_i.x = getelementptr inbounds %pair.i32, ptr %p, i64 %i, i32 0
1050   %p_i.y = getelementptr inbounds %pair.i32, ptr %p, i64 %i, i32 1
1051   %0 = load i32, ptr %p_i.x, align 4
1052   store i32 %0, ptr %p_i.y, align 4
1053   %1 = load i32, ptr %p_i.y, align 4
1054   %2 = add nsw i32 %1, %s
1055   %i.next = add nuw nsw i64 %i, 1
1056   %cond = icmp slt i64 %i.next, %n
1057   br i1 %cond, label %for.body, label %for.end
1058
1059 for.end:
1060   %3 = phi i32 [ %2, %for.body ]
1061   ret i32 %3
1062 }
1063
1064 ; PR27626_2: Ensure a strided store is not moved after a dependent (negative
1065 ;            distance) strided load.
1066
1067 ; void PR27626_2(struct pair *p, int z, int n) {
1068 ;   for (int i = 0; i < n; i++) {
1069 ;     p[i].x = z;
1070 ;     p[i].y = p[i - 1].x;
1071 ;   }
1072 ; }
1073
1074
1075 define void @PR27626_2(ptr %p, i64 %n, i32 %z) {
1076 ; CHECK-LABEL: @PR27626_2(
1077 ; CHECK-NEXT:  entry:
1078 ; CHECK-NEXT:    [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N:%.*]], i64 1)
1079 ; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SMAX]], 5
1080 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
1081 ; CHECK:       vector.ph:
1082 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = and i64 [[SMAX]], 3
1083 ; CHECK-NEXT:    [[TMP0:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
1084 ; CHECK-NEXT:    [[TMP1:%.*]] = select i1 [[TMP0]], i64 4, i64 [[N_MOD_VF]]
1085 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub nsw i64 [[SMAX]], [[TMP1]]
1086 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
1087 ; CHECK:       vector.body:
1088 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
1089 ; CHECK-NEXT:    [[TMP2:%.*]] = or i64 [[INDEX]], 1
1090 ; CHECK-NEXT:    [[TMP3:%.*]] = or i64 [[INDEX]], 2
1091 ; CHECK-NEXT:    [[TMP4:%.*]] = or i64 [[INDEX]], 3
1092 ; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[PAIR_I32:%.*]], ptr [[P:%.*]], i64 [[INDEX]], i32 0
1093 ; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP2]], i32 0
1094 ; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP3]], i32 0
1095 ; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP4]], i32 0
1096 ; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 -1, i32 0
1097 ; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[INDEX]], i32 1
1098 ; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP2]], i32 1
1099 ; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP3]], i32 1
1100 ; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP4]], i32 1
1101 ; CHECK-NEXT:    store i32 [[Z:%.*]], ptr [[TMP5]], align 4
1102 ; CHECK-NEXT:    store i32 [[Z]], ptr [[TMP6]], align 4
1103 ; CHECK-NEXT:    store i32 [[Z]], ptr [[TMP7]], align 4
1104 ; CHECK-NEXT:    store i32 [[Z]], ptr [[TMP8]], align 4
1105 ; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <8 x i32>, ptr [[TMP9]], align 4
1106 ; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i64 0
1107 ; CHECK-NEXT:    store i32 [[TMP14]], ptr [[TMP10]], align 4
1108 ; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i64 2
1109 ; CHECK-NEXT:    store i32 [[TMP15]], ptr [[TMP11]], align 4
1110 ; CHECK-NEXT:    [[TMP16:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i64 4
1111 ; CHECK-NEXT:    store i32 [[TMP16]], ptr [[TMP12]], align 4
1112 ; CHECK-NEXT:    [[TMP17:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i64 6
1113 ; CHECK-NEXT:    store i32 [[TMP17]], ptr [[TMP13]], align 4
1114 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
1115 ; CHECK-NEXT:    [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
1116 ; CHECK-NEXT:    br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]]
1117 ; CHECK:       middle.block:
1118 ; CHECK-NEXT:    br label [[SCALAR_PH]]
1119 ; CHECK:       scalar.ph:
1120 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
1121 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
1122 ; CHECK:       for.body:
1123 ; CHECK-NEXT:    [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
1124 ; CHECK-NEXT:    [[P_I_X:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[I]], i32 0
1125 ; CHECK-NEXT:    [[P_I_MINUS_1_X:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 -1, i32 0
1126 ; CHECK-NEXT:    [[P_I_Y:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[I]], i32 1
1127 ; CHECK-NEXT:    store i32 [[Z]], ptr [[P_I_X]], align 4
1128 ; CHECK-NEXT:    [[TMP19:%.*]] = load i32, ptr [[P_I_MINUS_1_X]], align 4
1129 ; CHECK-NEXT:    store i32 [[TMP19]], ptr [[P_I_Y]], align 4
1130 ; CHECK-NEXT:    [[I_NEXT]] = add nuw nsw i64 [[I]], 1
1131 ; CHECK-NEXT:    [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]]
1132 ; CHECK-NEXT:    br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END:%.*]], !llvm.loop [[LOOP29:![0-9]+]]
1133 ; CHECK:       for.end:
1134 ; CHECK-NEXT:    ret void
1135 ;
1136 entry:
1137   br label %for.body
1138
1139 for.body:
1140   %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
1141   %i_minus_1 = add nuw nsw i64 %i, -1
1142   %p_i.x = getelementptr inbounds %pair.i32, ptr %p, i64 %i, i32 0
1143   %p_i_minus_1.x = getelementptr inbounds %pair.i32, ptr %p, i64 %i_minus_1, i32 0
1144   %p_i.y = getelementptr inbounds %pair.i32, ptr %p, i64 %i, i32 1
1145   store i32 %z, ptr %p_i.x, align 4
1146   %0 = load i32, ptr %p_i_minus_1.x, align 4
1147   store i32 %0, ptr %p_i.y, align 4
1148   %i.next = add nuw nsw i64 %i, 1
1149   %cond = icmp slt i64 %i.next, %n
1150   br i1 %cond, label %for.body, label %for.end
1151
1152 for.end:
1153   ret void
1154 }
1155
1156 ; PR27626_3: Ensure a strided load is not moved before a dependent (negative
1157 ;            distance) strided store.
1158
1159 ; void PR27626_3(struct pair *p, int z, int n) {
1160 ;   for (int i = 0; i < n; i++) {
1161 ;     p[i + 1].y = p[i].x;
1162 ;     s += p[i].y;
1163 ;   }
1164 ; }
1165
1166
1167 define i32 @PR27626_3(ptr %p, i64 %n, i32 %z) {
1168 ; CHECK-LABEL: @PR27626_3(
1169 ; CHECK-NEXT:  entry:
1170 ; CHECK-NEXT:    [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N:%.*]], i64 1)
1171 ; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SMAX]], 5
1172 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
1173 ; CHECK:       vector.ph:
1174 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = and i64 [[SMAX]], 3
1175 ; CHECK-NEXT:    [[TMP0:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
1176 ; CHECK-NEXT:    [[TMP1:%.*]] = select i1 [[TMP0]], i64 4, i64 [[N_MOD_VF]]
1177 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub nsw i64 [[SMAX]], [[TMP1]]
1178 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
1179 ; CHECK:       vector.body:
1180 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
1181 ; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
1182 ; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP17:%.*]], [[VECTOR_BODY]] ]
1183 ; CHECK-NEXT:    [[TMP2:%.*]] = add nuw nsw <4 x i64> [[VEC_IND]], <i64 1, i64 1, i64 1, i64 1>
1184 ; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [[PAIR_I32:%.*]], ptr [[P:%.*]], i64 [[INDEX]], i32 0
1185 ; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[INDEX]], i32 1
1186 ; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x i64> [[TMP2]], i64 0
1187 ; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP5]], i32 1
1188 ; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <4 x i64> [[TMP2]], i64 1
1189 ; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP7]], i32 1
1190 ; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <4 x i64> [[TMP2]], i64 2
1191 ; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP9]], i32 1
1192 ; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <4 x i64> [[TMP2]], i64 3
1193 ; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP11]], i32 1
1194 ; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <8 x i32>, ptr [[TMP3]], align 4
1195 ; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i64 0
1196 ; CHECK-NEXT:    store i32 [[TMP13]], ptr [[TMP6]], align 4
1197 ; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i64 2
1198 ; CHECK-NEXT:    store i32 [[TMP14]], ptr [[TMP8]], align 4
1199 ; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i64 4
1200 ; CHECK-NEXT:    store i32 [[TMP15]], ptr [[TMP10]], align 4
1201 ; CHECK-NEXT:    [[TMP16:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i64 6
1202 ; CHECK-NEXT:    store i32 [[TMP16]], ptr [[TMP12]], align 4
1203 ; CHECK-NEXT:    [[WIDE_VEC1:%.*]] = load <8 x i32>, ptr [[TMP4]], align 4
1204 ; CHECK-NEXT:    [[STRIDED_VEC2:%.*]] = shufflevector <8 x i32> [[WIDE_VEC1]], <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
1205 ; CHECK-NEXT:    [[TMP17]] = add <4 x i32> [[STRIDED_VEC2]], [[VEC_PHI]]
1206 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
1207 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], <i64 4, i64 4, i64 4, i64 4>
1208 ; CHECK-NEXT:    [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
1209 ; CHECK-NEXT:    br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP30:![0-9]+]]
1210 ; CHECK:       middle.block:
1211 ; CHECK-NEXT:    [[TMP19:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP17]])
1212 ; CHECK-NEXT:    br label [[SCALAR_PH]]
1213 ; CHECK:       scalar.ph:
1214 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
1215 ; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP19]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
1216 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
1217 ; CHECK:       for.body:
1218 ; CHECK-NEXT:    [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
1219 ; CHECK-NEXT:    [[S:%.*]] = phi i32 [ [[TMP22:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
1220 ; CHECK-NEXT:    [[I_PLUS_1:%.*]] = add nuw nsw i64 [[I]], 1
1221 ; CHECK-NEXT:    [[P_I_X:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[I]], i32 0
1222 ; CHECK-NEXT:    [[P_I_Y:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[I]], i32 1
1223 ; CHECK-NEXT:    [[P_I_PLUS_1_Y:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[I_PLUS_1]], i32 1
1224 ; CHECK-NEXT:    [[TMP20:%.*]] = load i32, ptr [[P_I_X]], align 4
1225 ; CHECK-NEXT:    store i32 [[TMP20]], ptr [[P_I_PLUS_1_Y]], align 4
1226 ; CHECK-NEXT:    [[TMP21:%.*]] = load i32, ptr [[P_I_Y]], align 4
1227 ; CHECK-NEXT:    [[TMP22]] = add nsw i32 [[TMP21]], [[S]]
1228 ; CHECK-NEXT:    [[I_NEXT]] = add nuw nsw i64 [[I]], 1
1229 ; CHECK-NEXT:    [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]]
1230 ; CHECK-NEXT:    br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END:%.*]], !llvm.loop [[LOOP31:![0-9]+]]
1231 ; CHECK:       for.end:
1232 ; CHECK-NEXT:    ret i32 [[TMP22]]
1233 ;
1234 entry:
1235   br label %for.body
1236
1237 for.body:
1238   %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
1239   %s = phi i32 [ %2, %for.body ], [ 0, %entry ]
1240   %i_plus_1 = add nuw nsw i64 %i, 1
1241   %p_i.x = getelementptr inbounds %pair.i32, ptr %p, i64 %i, i32 0
1242   %p_i.y = getelementptr inbounds %pair.i32, ptr %p, i64 %i, i32 1
1243   %p_i_plus_1.y = getelementptr inbounds %pair.i32, ptr %p, i64 %i_plus_1, i32 1
1244   %0 = load i32, ptr %p_i.x, align 4
1245   store i32 %0, ptr %p_i_plus_1.y, align 4
1246   %1 = load i32, ptr %p_i.y, align 4
1247   %2 = add nsw i32 %1, %s
1248   %i.next = add nuw nsw i64 %i, 1
1249   %cond = icmp slt i64 %i.next, %n
1250   br i1 %cond, label %for.body, label %for.end
1251
1252 for.end:
1253   %3 = phi i32 [ %2, %for.body ]
1254   ret i32 %3
1255 }
1256
1257 ; PR27626_4: Ensure we form an interleaved group for strided stores in the
1258 ;            presence of a write-after-write dependence. We create a group for
1259 ;            (2) and (3) while excluding (1).
1260
1261 ; void PR27626_4(int *a, int x, int y, int z, int n) {
1262 ;   for (int i = 0; i < n; i += 2) {
1263 ;     a[i] = x;      // (1)
1264 ;     a[i] = y;      // (2)
1265 ;     a[i + 1] = z;  // (3)
1266 ;   }
1267 ; }
1268
1269
1270 define void @PR27626_4(ptr %a, i32 %x, i32 %y, i32 %z, i64 %n) {
1271 ; CHECK-LABEL: @PR27626_4(
1272 ; CHECK-NEXT:  entry:
1273 ; CHECK-NEXT:    [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N:%.*]], i64 2)
1274 ; CHECK-NEXT:    [[TMP0:%.*]] = add nsw i64 [[SMAX]], -1
1275 ; CHECK-NEXT:    [[TMP1:%.*]] = lshr i64 [[TMP0]], 1
1276 ; CHECK-NEXT:    [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1
1277 ; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SMAX]], 7
1278 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
1279 ; CHECK:       vector.ph:
1280 ; CHECK-NEXT:    [[N_VEC:%.*]] = and i64 [[TMP2]], 9223372036854775804
1281 ; CHECK-NEXT:    [[IND_END:%.*]] = shl nuw i64 [[N_VEC]], 1
1282 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[Y:%.*]], i64 0
1283 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
1284 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i32> poison, i32 [[Z:%.*]], i64 0
1285 ; CHECK-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT1]], <4 x i32> poison, <4 x i32> zeroinitializer
1286 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
1287 ; CHECK:       vector.body:
1288 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
1289 ; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = shl i64 [[INDEX]], 1
1290 ; CHECK-NEXT:    [[TMP3:%.*]] = or i64 [[OFFSET_IDX]], 2
1291 ; CHECK-NEXT:    [[TMP4:%.*]] = or i64 [[OFFSET_IDX]], 4
1292 ; CHECK-NEXT:    [[TMP5:%.*]] = or i64 [[OFFSET_IDX]], 6
1293 ; CHECK-NEXT:    [[TMP6:%.*]] = or i64 [[OFFSET_IDX]], 1
1294 ; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[OFFSET_IDX]]
1295 ; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP3]]
1296 ; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP4]]
1297 ; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP5]]
1298 ; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP6]]
1299 ; CHECK-NEXT:    store i32 [[X:%.*]], ptr [[TMP7]], align 4
1300 ; CHECK-NEXT:    store i32 [[X]], ptr [[TMP8]], align 4
1301 ; CHECK-NEXT:    store i32 [[X]], ptr [[TMP9]], align 4
1302 ; CHECK-NEXT:    store i32 [[X]], ptr [[TMP10]], align 4
1303 ; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP11]], i64 -1
1304 ; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLAT]], <4 x i32> [[BROADCAST_SPLAT2]], <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
1305 ; CHECK-NEXT:    store <8 x i32> [[INTERLEAVED_VEC]], ptr [[TMP12]], align 4
1306 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
1307 ; CHECK-NEXT:    [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
1308 ; CHECK-NEXT:    br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP32:![0-9]+]]
1309 ; CHECK:       middle.block:
1310 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
1311 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
1312 ; CHECK:       scalar.ph:
1313 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
1314 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
1315 ; CHECK:       for.body:
1316 ; CHECK-NEXT:    [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
1317 ; CHECK-NEXT:    [[I_PLUS_1:%.*]] = or i64 [[I]], 1
1318 ; CHECK-NEXT:    [[A_I:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[I]]
1319 ; CHECK-NEXT:    [[A_I_PLUS_1:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[I_PLUS_1]]
1320 ; CHECK-NEXT:    store i32 [[Y]], ptr [[A_I]], align 4
1321 ; CHECK-NEXT:    store i32 [[Z]], ptr [[A_I_PLUS_1]], align 4
1322 ; CHECK-NEXT:    [[I_NEXT]] = add nuw nsw i64 [[I]], 2
1323 ; CHECK-NEXT:    [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]]
1324 ; CHECK-NEXT:    br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP33:![0-9]+]]
1325 ; CHECK:       for.end:
1326 ; CHECK-NEXT:    ret void
1327 ;
1328 entry:
1329   br label %for.body
1330
1331 for.body:
1332   %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
1333   %i_plus_1 = add i64 %i, 1
1334   %a_i = getelementptr inbounds i32, ptr %a, i64 %i
1335   %a_i_plus_1 = getelementptr inbounds i32, ptr %a, i64 %i_plus_1
1336   store i32 %x, ptr %a_i, align 4
1337   store i32 %y, ptr %a_i, align 4
1338   store i32 %z, ptr %a_i_plus_1, align 4
1339   %i.next = add nuw nsw i64 %i, 2
1340   %cond = icmp slt i64 %i.next, %n
1341   br i1 %cond, label %for.body, label %for.end
1342
1343 for.end:
1344   ret void
1345 }
1346
1347 ; PR27626_5: Ensure we do not form an interleaved group for strided stores in
1348 ;            the presence of a write-after-write dependence.
1349
1350 ; void PR27626_5(int *a, int x, int y, int z, int n) {
1351 ;   for (int i = 3; i < n; i += 2) {
1352 ;     a[i - 1] = x;
1353 ;     a[i - 3] = y;
1354 ;     a[i] = z;
1355 ;   }
1356 ; }
1357
1358
1359 define void @PR27626_5(ptr %a, i32 %x, i32 %y, i32 %z, i64 %n) {
1360 ; CHECK-LABEL: @PR27626_5(
1361 ; CHECK-NEXT:  entry:
1362 ; CHECK-NEXT:    [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N:%.*]], i64 5)
1363 ; CHECK-NEXT:    [[TMP0:%.*]] = add nsw i64 [[SMAX]], -4
1364 ; CHECK-NEXT:    [[TMP1:%.*]] = lshr i64 [[TMP0]], 1
1365 ; CHECK-NEXT:    [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1
1366 ; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 6
1367 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
1368 ; CHECK:       vector.ph:
1369 ; CHECK-NEXT:    [[N_VEC:%.*]] = and i64 [[TMP2]], 9223372036854775804
1370 ; CHECK-NEXT:    [[TMP3:%.*]] = shl nuw i64 [[N_VEC]], 1
1371 ; CHECK-NEXT:    [[IND_END:%.*]] = or i64 [[TMP3]], 3
1372 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
1373 ; CHECK:       vector.body:
1374 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
1375 ; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 3, i64 5, i64 7, i64 9>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
1376 ; CHECK-NEXT:    [[TMP4:%.*]] = shl i64 [[INDEX]], 1
1377 ; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = or i64 [[TMP4]], 3
1378 ; CHECK-NEXT:    [[TMP5:%.*]] = or i64 [[TMP4]], 5
1379 ; CHECK-NEXT:    [[TMP6:%.*]] = or i64 [[TMP4]], 7
1380 ; CHECK-NEXT:    [[TMP7:%.*]] = add i64 [[TMP4]], 9
1381 ; CHECK-NEXT:    [[TMP8:%.*]] = add <4 x i64> [[VEC_IND]], <i64 -1, i64 -1, i64 -1, i64 -1>
1382 ; CHECK-NEXT:    [[TMP9:%.*]] = add <4 x i64> [[VEC_IND]], <i64 -3, i64 -3, i64 -3, i64 -3>
1383 ; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[OFFSET_IDX]]
1384 ; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP5]]
1385 ; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP6]]
1386 ; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP7]]
1387 ; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <4 x i64> [[TMP8]], i64 0
1388 ; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP14]]
1389 ; CHECK-NEXT:    [[TMP16:%.*]] = extractelement <4 x i64> [[TMP8]], i64 1
1390 ; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP16]]
1391 ; CHECK-NEXT:    [[TMP18:%.*]] = extractelement <4 x i64> [[TMP8]], i64 2
1392 ; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP18]]
1393 ; CHECK-NEXT:    [[TMP20:%.*]] = extractelement <4 x i64> [[TMP8]], i64 3
1394 ; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP20]]
1395 ; CHECK-NEXT:    [[TMP22:%.*]] = extractelement <4 x i64> [[TMP9]], i64 0
1396 ; CHECK-NEXT:    [[TMP23:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP22]]
1397 ; CHECK-NEXT:    [[TMP24:%.*]] = extractelement <4 x i64> [[TMP9]], i64 1
1398 ; CHECK-NEXT:    [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP24]]
1399 ; CHECK-NEXT:    [[TMP26:%.*]] = extractelement <4 x i64> [[TMP9]], i64 2
1400 ; CHECK-NEXT:    [[TMP27:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP26]]
1401 ; CHECK-NEXT:    [[TMP28:%.*]] = extractelement <4 x i64> [[TMP9]], i64 3
1402 ; CHECK-NEXT:    [[TMP29:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP28]]
1403 ; CHECK-NEXT:    store i32 [[X:%.*]], ptr [[TMP15]], align 4
1404 ; CHECK-NEXT:    store i32 [[X]], ptr [[TMP17]], align 4
1405 ; CHECK-NEXT:    store i32 [[X]], ptr [[TMP19]], align 4
1406 ; CHECK-NEXT:    store i32 [[X]], ptr [[TMP21]], align 4
1407 ; CHECK-NEXT:    store i32 [[Y:%.*]], ptr [[TMP23]], align 4
1408 ; CHECK-NEXT:    store i32 [[Y]], ptr [[TMP25]], align 4
1409 ; CHECK-NEXT:    store i32 [[Y]], ptr [[TMP27]], align 4
1410 ; CHECK-NEXT:    store i32 [[Y]], ptr [[TMP29]], align 4
1411 ; CHECK-NEXT:    store i32 [[Z:%.*]], ptr [[TMP10]], align 4
1412 ; CHECK-NEXT:    store i32 [[Z]], ptr [[TMP11]], align 4
1413 ; CHECK-NEXT:    store i32 [[Z]], ptr [[TMP12]], align 4
1414 ; CHECK-NEXT:    store i32 [[Z]], ptr [[TMP13]], align 4
1415 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
1416 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], <i64 8, i64 8, i64 8, i64 8>
1417 ; CHECK-NEXT:    [[TMP30:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
1418 ; CHECK-NEXT:    br i1 [[TMP30]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP34:![0-9]+]]
1419 ; CHECK:       middle.block:
1420 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
1421 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
1422 ; CHECK:       scalar.ph:
1423 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 3, [[ENTRY:%.*]] ]
1424 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
1425 ; CHECK:       for.body:
1426 ; CHECK-NEXT:    [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
1427 ; CHECK-NEXT:    [[I_MINUS_1:%.*]] = add i64 [[I]], -1
1428 ; CHECK-NEXT:    [[I_MINUS_3:%.*]] = add i64 [[I]], -3
1429 ; CHECK-NEXT:    [[A_I:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[I]]
1430 ; CHECK-NEXT:    [[A_I_MINUS_1:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[I_MINUS_1]]
1431 ; CHECK-NEXT:    [[A_I_MINUS_3:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[I_MINUS_3]]
1432 ; CHECK-NEXT:    store i32 [[X]], ptr [[A_I_MINUS_1]], align 4
1433 ; CHECK-NEXT:    store i32 [[Y]], ptr [[A_I_MINUS_3]], align 4
1434 ; CHECK-NEXT:    store i32 [[Z]], ptr [[A_I]], align 4
1435 ; CHECK-NEXT:    [[I_NEXT]] = add nuw nsw i64 [[I]], 2
1436 ; CHECK-NEXT:    [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]]
1437 ; CHECK-NEXT:    br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP35:![0-9]+]]
1438 ; CHECK:       for.end:
1439 ; CHECK-NEXT:    ret void
1440 ;
1441 entry:
1442   br label %for.body
1443
1444 for.body:
1445   %i = phi i64 [ %i.next, %for.body ], [ 3, %entry ]
1446   %i_minus_1 = sub i64 %i, 1
1447   %i_minus_3 = sub i64 %i_minus_1, 2
1448   %a_i = getelementptr inbounds i32, ptr %a, i64 %i
1449   %a_i_minus_1 = getelementptr inbounds i32, ptr %a, i64 %i_minus_1
1450   %a_i_minus_3 = getelementptr inbounds i32, ptr %a, i64 %i_minus_3
1451   store i32 %x, ptr %a_i_minus_1, align 4
1452   store i32 %y, ptr %a_i_minus_3, align 4
1453   store i32 %z, ptr %a_i, align 4
1454   %i.next = add nuw nsw i64 %i, 2
1455   %cond = icmp slt i64 %i.next, %n
1456   br i1 %cond, label %for.body, label %for.end
1457
1458 for.end:
1459   ret void
1460 }
1461
1462 ; PR34743: Ensure that a cast which needs to sink after a load that belongs to
1463 ; an interleaved group, indeeded gets sunk.
1464
1465 ; void PR34743(short *a, int *b, int n) {
1466 ;   for (int i = 0, iv = 0; iv < n; i++, iv += 2) {
1467 ;     b[i] = a[iv] * a[iv+1] * a[iv+2];
1468 ;   }
1469 ; }
1470
1471
1472 define void @PR34743(ptr %a, ptr %b, i64 %n) {
1473 ; CHECK-LABEL: @PR34743(
1474 ; CHECK-NEXT:  entry:
1475 ; CHECK-NEXT:    [[DOTPRE:%.*]] = load i16, ptr [[A:%.*]], align 2
1476 ; CHECK-NEXT:    [[TMP0:%.*]] = lshr i64 [[N:%.*]], 1
1477 ; CHECK-NEXT:    [[TMP1:%.*]] = add nuw i64 [[TMP0]], 1
1478 ; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 6
1479 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
1480 ; CHECK:       vector.memcheck:
1481 ; CHECK-NEXT:    [[TMP2:%.*]] = shl i64 [[N]], 1
1482 ; CHECK-NEXT:    [[TMP3:%.*]] = and i64 [[TMP2]], -4
1483 ; CHECK-NEXT:    [[TMP4:%.*]] = add i64 [[TMP3]], 4
1484 ; CHECK-NEXT:    [[SCEVGEP:%.*]] = getelementptr i8, ptr [[B:%.*]], i64 [[TMP4]]
1485 ; CHECK-NEXT:    [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[A]], i64 2
1486 ; CHECK-NEXT:    [[TMP5:%.*]] = add i64 [[TMP3]], 6
1487 ; CHECK-NEXT:    [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP5]]
1488 ; CHECK-NEXT:    [[BOUND0:%.*]] = icmp ugt ptr [[SCEVGEP2]], [[B]]
1489 ; CHECK-NEXT:    [[BOUND1:%.*]] = icmp ult ptr [[SCEVGEP1]], [[SCEVGEP]]
1490 ; CHECK-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
1491 ; CHECK-NEXT:    br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
1492 ; CHECK:       vector.ph:
1493 ; CHECK-NEXT:    [[N_VEC:%.*]] = and i64 [[TMP1]], -4
1494 ; CHECK-NEXT:    [[IND_END:%.*]] = shl i64 [[N_VEC]], 1
1495 ; CHECK-NEXT:    [[VECTOR_RECUR_INIT:%.*]] = insertelement <4 x i16> poison, i16 [[DOTPRE]], i64 3
1496 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
1497 ; CHECK:       vector.body:
1498 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
1499 ; CHECK-NEXT:    [[VECTOR_RECUR:%.*]] = phi <4 x i16> [ [[VECTOR_RECUR_INIT]], [[VECTOR_PH]] ], [ [[STRIDED_VEC4:%.*]], [[VECTOR_BODY]] ]
1500 ; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = shl i64 [[INDEX]], 1
1501 ; CHECK-NEXT:    [[TMP6:%.*]] = or i64 [[OFFSET_IDX]], 1
1502 ; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i16, ptr [[A]], i64 [[TMP6]]
1503 ; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <8 x i16>, ptr [[TMP7]], align 4
1504 ; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <8 x i16> [[WIDE_VEC]], <8 x i16> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
1505 ; CHECK-NEXT:    [[STRIDED_VEC4]] = shufflevector <8 x i16> [[WIDE_VEC]], <8 x i16> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
1506 ; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <4 x i16> [[VECTOR_RECUR]], <4 x i16> [[STRIDED_VEC4]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
1507 ; CHECK-NEXT:    [[TMP9:%.*]] = sext <4 x i16> [[TMP8]] to <4 x i32>
1508 ; CHECK-NEXT:    [[TMP10:%.*]] = sext <4 x i16> [[STRIDED_VEC]] to <4 x i32>
1509 ; CHECK-NEXT:    [[TMP11:%.*]] = sext <4 x i16> [[STRIDED_VEC4]] to <4 x i32>
1510 ; CHECK-NEXT:    [[TMP12:%.*]] = mul nsw <4 x i32> [[TMP9]], [[TMP10]]
1511 ; CHECK-NEXT:    [[TMP13:%.*]] = mul nsw <4 x i32> [[TMP12]], [[TMP11]]
1512 ; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDEX]]
1513 ; CHECK-NEXT:    store <4 x i32> [[TMP13]], ptr [[TMP14]], align 4, !alias.scope !36, !noalias !39
1514 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
1515 ; CHECK-NEXT:    [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
1516 ; CHECK-NEXT:    br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP41:![0-9]+]]
1517 ; CHECK:       middle.block:
1518 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP1]], [[N_VEC]]
1519 ; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <8 x i16> [[WIDE_VEC]], i64 7
1520 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[END:%.*]], label [[SCALAR_PH]]
1521 ; CHECK:       scalar.ph:
1522 ; CHECK-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[DOTPRE]], [[VECTOR_MEMCHECK]] ], [ [[DOTPRE]], [[ENTRY:%.*]] ], [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ]
1523 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ENTRY]] ], [ [[IND_END]], [[MIDDLE_BLOCK]] ]
1524 ; CHECK-NEXT:    [[BC_RESUME_VAL3:%.*]] = phi i64 [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ENTRY]] ], [ [[N_VEC]], [[MIDDLE_BLOCK]] ]
1525 ; CHECK-NEXT:    br label [[LOOP:%.*]]
1526 ; CHECK:       loop:
1527 ; CHECK-NEXT:    [[SCALAR_RECUR:%.*]] = phi i16 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[LOAD2:%.*]], [[LOOP]] ]
1528 ; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV2:%.*]], [[LOOP]] ]
1529 ; CHECK-NEXT:    [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL3]], [[SCALAR_PH]] ], [ [[I1:%.*]], [[LOOP]] ]
1530 ; CHECK-NEXT:    [[CONV:%.*]] = sext i16 [[SCALAR_RECUR]] to i32
1531 ; CHECK-NEXT:    [[I1]] = add nuw nsw i64 [[I]], 1
1532 ; CHECK-NEXT:    [[IV1:%.*]] = or i64 [[IV]], 1
1533 ; CHECK-NEXT:    [[IV2]] = add nuw nsw i64 [[IV]], 2
1534 ; CHECK-NEXT:    [[GEP1:%.*]] = getelementptr inbounds i16, ptr [[A]], i64 [[IV1]]
1535 ; CHECK-NEXT:    [[LOAD1:%.*]] = load i16, ptr [[GEP1]], align 4
1536 ; CHECK-NEXT:    [[CONV1:%.*]] = sext i16 [[LOAD1]] to i32
1537 ; CHECK-NEXT:    [[GEP2:%.*]] = getelementptr inbounds i16, ptr [[A]], i64 [[IV2]]
1538 ; CHECK-NEXT:    [[LOAD2]] = load i16, ptr [[GEP2]], align 4
1539 ; CHECK-NEXT:    [[CONV2:%.*]] = sext i16 [[LOAD2]] to i32
1540 ; CHECK-NEXT:    [[MUL01:%.*]] = mul nsw i32 [[CONV]], [[CONV1]]
1541 ; CHECK-NEXT:    [[MUL012:%.*]] = mul nsw i32 [[MUL01]], [[CONV2]]
1542 ; CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[I]]
1543 ; CHECK-NEXT:    store i32 [[MUL012]], ptr [[ARRAYIDX5]], align 4
1544 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV]], [[N]]
1545 ; CHECK-NEXT:    br i1 [[EXITCOND]], label [[END]], label [[LOOP]], !llvm.loop [[LOOP42:![0-9]+]]
1546 ; CHECK:       end:
1547 ; CHECK-NEXT:    ret void
1548 ;
1549 entry:
1550   %.pre = load i16, ptr %a
1551   br label %loop
1552
1553 loop:
1554   %0 = phi i16 [ %.pre, %entry ], [ %load2, %loop ]
1555   %iv = phi i64 [ 0, %entry ], [ %iv2, %loop ]
1556   %i = phi i64 [ 0, %entry ], [ %i1, %loop ]
1557   %conv = sext i16 %0 to i32
1558   %i1 = add nuw nsw i64 %i, 1
1559   %iv1 = add nuw nsw i64 %iv, 1
1560   %iv2 = add nuw nsw i64 %iv, 2
1561   %gep1 = getelementptr inbounds i16, ptr %a, i64 %iv1
1562   %load1 = load i16, ptr %gep1, align 4
1563   %conv1 = sext i16 %load1 to i32
1564   %gep2 = getelementptr inbounds i16, ptr %a, i64 %iv2
1565   %load2 = load i16, ptr %gep2, align 4
1566   %conv2 = sext i16 %load2 to i32
1567   %mul01 = mul nsw i32 %conv, %conv1
1568   %mul012 = mul nsw i32 %mul01, %conv2
1569   %arrayidx5 = getelementptr inbounds i32, ptr %b, i64 %i
1570   store i32 %mul012, ptr %arrayidx5
1571   %exitcond = icmp eq i64 %iv, %n
1572   br i1 %exitcond, label %end, label %loop
1573
1574 end:
1575   ret void
1576 }
1577
1578 attributes #0 = { "unsafe-fp-math"="true" }