llvm/test/Transforms/LoopVectorize/interleaved-accesses.ll

   1 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
   2 ; RUN: opt -S -passes=loop-vectorize,instcombine -force-vector-width=4 -force-vector-interleave=1 -enable-interleaved-mem-accesses=true -runtime-memory-check-threshold=24 < %s | FileCheck %s
   3
   4 target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
   5
   6 ; Check vectorization on an interleaved load group of factor 2 and an interleaved
   7 ; store group of factor 2.
   8
   9 ; int AB[1024];
  10 ; int CD[1024];
  11 ;  void test_array_load2_store2(int C, int D) {
  12 ;   for (int i = 0; i < 1024; i+=2) {
  13 ;     int A = AB[i];
  14 ;     int B = AB[i+1];
  15 ;     CD[i] = A + C;
  16 ;     CD[i+1] = B * D;
  17 ;   }
  18 ; }
  19
  20
  ; Input (@AB) and output (@CD) arrays of 1024 i32 each, accessed by
  ; @test_array_load2_store2.
  21 @AB = common global [1024 x i32] zeroinitializer, align 4
  22 @CD = common global [1024 x i32] zeroinitializer, align 4
  23
  ; Scalar loop under test: %indvars.iv advances by 2 from 0 while the next
  ; value is < 1024. Each iteration loads the pair AB[iv], AB[iv|1] and stores
  ; AB[iv] + %C to CD[iv] and AB[iv|1] * %D to CD[iv|1].
  ; The CHECK lines expect the two loads to become one <8 x i32> load split by
  ; two stride-2 shufflevectors, and the two stores to become one <8 x i32>
  ; store of an interleaving shufflevector.
  24 define void @test_array_load2_store2(i32 %C, i32 %D) {
  25 ; CHECK-LABEL: @test_array_load2_store2(
  26 ; CHECK-NEXT:  entry:
  27 ; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
  28 ; CHECK:       vector.ph:
  29 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[C:%.*]], i64 0
  30 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
  31 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <4 x i32> poison, i32 [[D:%.*]], i64 0
  32 ; CHECK-NEXT:    [[BROADCAST_SPLAT3:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT2]], <4 x i32> poison, <4 x i32> zeroinitializer
  33 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
  34 ; CHECK:       vector.body:
  35 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
  36 ; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = shl i64 [[INDEX]], 1
  37 ; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [1024 x i32], ptr @AB, i64 0, i64 [[OFFSET_IDX]]
  38 ; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <8 x i32>, ptr [[TMP0]], align 4
  39 ; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  40 ; CHECK-NEXT:    [[STRIDED_VEC1:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  41 ; CHECK-NEXT:    [[TMP2:%.*]] = add nsw <4 x i32> [[STRIDED_VEC]], [[BROADCAST_SPLAT]]
  42 ; CHECK-NEXT:    [[TMP3:%.*]] = mul nsw <4 x i32> [[STRIDED_VEC1]], [[BROADCAST_SPLAT3]]
  43 ; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1024 x i32], ptr @CD, i64 0, i64 [[OFFSET_IDX]]
  44 ; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
  45 ; CHECK-NEXT:    store <8 x i32> [[INTERLEAVED_VEC]], ptr [[TMP4]], align 4
  46 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
  47 ; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 512
  48 ; CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
  49 ; CHECK:       middle.block:
  50 ; CHECK-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
  51 ; CHECK:       scalar.ph:
  52 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
  53 ; CHECK:       for.body:
  54 ; CHECK-NEXT:    br i1 poison, label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP3:![0-9]+]]
  55 ; CHECK:       for.end:
  56 ; CHECK-NEXT:    ret void
  57 ;
  58 entry:
  59   br label %for.body
  60
  61 for.body:                                         ; preds = %for.body, %entry
  62   %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
  63   %arrayidx0 = getelementptr inbounds [1024 x i32], ptr @AB, i64 0, i64 %indvars.iv
  64   %tmp = load i32, ptr %arrayidx0, align 4
       ; %indvars.iv starts at 0 and steps by 2, so it is always even; the
       ; `or disjoint` with 1 therefore addresses the element right after it.
  65   %tmp1 = or disjoint i64 %indvars.iv, 1
  66   %arrayidx1 = getelementptr inbounds [1024 x i32], ptr @AB, i64 0, i64 %tmp1
  67   %tmp2 = load i32, ptr %arrayidx1, align 4
  68   %add = add nsw i32 %tmp, %C
  69   %mul = mul nsw i32 %tmp2, %D
  70   %arrayidx2 = getelementptr inbounds [1024 x i32], ptr @CD, i64 0, i64 %indvars.iv
  71   store i32 %add, ptr %arrayidx2, align 4
  72   %arrayidx3 = getelementptr inbounds [1024 x i32], ptr @CD, i64 0, i64 %tmp1
  73   store i32 %mul, ptr %arrayidx3, align 4
  74   %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2
  75   %cmp = icmp slt i64 %indvars.iv.next, 1024
  76   br i1 %cmp, label %for.body, label %for.end
  77
  78 for.end:                                          ; preds = %for.body
  79   ret void
  80 }
  81
  82 ; int A[3072];
  83 ; struct ST3 S[1024];
  84 ; void test_struct_array_load3_store3() {
  85 ;   int *ptr = A;
  86 ;   for (int i = 0; i < 1024; i++) {
  87 ;     int X1 = *ptr++;
  88 ;     int X2 = *ptr++;
  89 ;     int X3 = *ptr++;
  90 ;     S[i].x = X1 + 1;
  91 ;     S[i].y = X2 + 2;
  92 ;     S[i].z = X3 + 3;
  93 ;   }
  94 ; }
  95
  96
  ; Struct of three i32 fields, the flat 3072-element i32 input @A, and the
  ; 1024-element array of %struct.ST3 (@S) written by
  ; @test_struct_array_load3_store3.
  97 %struct.ST3 = type { i32, i32, i32 }
  98 @A = common global [3072 x i32] zeroinitializer, align 4
  99 @S = common global [1024 x %struct.ST3] zeroinitializer, align 4
 100
 ; Scalar loop under test: walks @A three i32 elements per iteration through
 ; the pointer phi %ptr.016 and stores the three loaded values, plus 1, 2 and
 ; 3 respectively, into fields 0, 1 and 2 of @S[%indvars.iv], for 1024
 ; iterations.
 ; The CHECK lines expect one <12 x i32> load split by three stride-3
 ; shufflevectors and one <12 x i32> store of an interleaving shufflevector.
 101 define void @test_struct_array_load3_store3() {
 102 ; CHECK-LABEL: @test_struct_array_load3_store3(
 103 ; CHECK-NEXT:  entry:
 104 ; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 105 ; CHECK:       vector.ph:
 106 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 107 ; CHECK:       vector.body:
 108 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 109 ; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 12
 110 ; CHECK-NEXT:    [[NEXT_GEP:%.*]] = getelementptr i8, ptr @A, i64 [[OFFSET_IDX]]
 111 ; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <12 x i32>, ptr [[NEXT_GEP]], align 4
 112 ; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <12 x i32> [[WIDE_VEC]], <12 x i32> poison, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
 113 ; CHECK-NEXT:    [[STRIDED_VEC2:%.*]] = shufflevector <12 x i32> [[WIDE_VEC]], <12 x i32> poison, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
 114 ; CHECK-NEXT:    [[STRIDED_VEC3:%.*]] = shufflevector <12 x i32> [[WIDE_VEC]], <12 x i32> poison, <4 x i32> <i32 2, i32 5, i32 8, i32 11>
 115 ; CHECK-NEXT:    [[TMP0:%.*]] = add nsw <4 x i32> [[STRIDED_VEC]], splat (i32 1)
 116 ; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1024 x %struct.ST3], ptr @S, i64 0, i64 [[INDEX]], i32 0
 117 ; CHECK-NEXT:    [[TMP1:%.*]] = add nsw <4 x i32> [[STRIDED_VEC2]], splat (i32 2)
 118 ; CHECK-NEXT:    [[TMP2:%.*]] = add nsw <4 x i32> [[STRIDED_VEC3]], splat (i32 3)
 119 ; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> [[TMP1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 120 ; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
 121 ; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i32> [[TMP4]], <8 x i32> [[TMP5]], <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
 122 ; CHECK-NEXT:    store <12 x i32> [[INTERLEAVED_VEC]], ptr [[TMP3]], align 4
 123 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 124 ; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
 125 ; CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 126 ; CHECK:       middle.block:
 127 ; CHECK-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
 128 ; CHECK:       scalar.ph:
 129 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 130 ; CHECK:       for.body:
 131 ; CHECK-NEXT:    br i1 poison, label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
 132 ; CHECK:       for.end:
 133 ; CHECK-NEXT:    ret void
 134 ;
 135 entry:
 136   br label %for.body
 137
 138 for.body:                                         ; preds = %for.body, %entry
 139   %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
 140   %ptr.016 = phi ptr [ @A, %entry ], [ %incdec.ptr2, %for.body ]
       ; The three loads read %ptr.016 + 0, + 1 and + 2 elements.
 141   %incdec.ptr = getelementptr inbounds i32, ptr %ptr.016, i64 1
 142   %tmp = load i32, ptr %ptr.016, align 4
 143   %incdec.ptr1 = getelementptr inbounds i32, ptr %ptr.016, i64 2
 144   %tmp1 = load i32, ptr %incdec.ptr, align 4
       ; %incdec.ptr2 (%ptr.016 + 3 elements) is not loaded from here; it only
       ; feeds the pointer phi for the next iteration.
 145   %incdec.ptr2 = getelementptr inbounds i32, ptr %ptr.016, i64 3
 146   %tmp2 = load i32, ptr %incdec.ptr1, align 4
 147   %add = add nsw i32 %tmp, 1
 148   %x = getelementptr inbounds [1024 x %struct.ST3], ptr @S, i64 0, i64 %indvars.iv, i32 0
 149   store i32 %add, ptr %x, align 4
 150   %add3 = add nsw i32 %tmp1, 2
 151   %y = getelementptr inbounds [1024 x %struct.ST3], ptr @S, i64 0, i64 %indvars.iv, i32 1
 152   store i32 %add3, ptr %y, align 4
 153   %add6 = add nsw i32 %tmp2, 3
 154   %z = getelementptr inbounds [1024 x %struct.ST3], ptr @S, i64 0, i64 %indvars.iv, i32 2
 155   store i32 %add6, ptr %z, align 4
 156   %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
 157   %exitcond = icmp eq i64 %indvars.iv.next, 1024
 158   br i1 %exitcond, label %for.end, label %for.body
 159
 160 for.end:                                          ; preds = %for.body
 161   ret void
 162 }
 163
 164 ; Check vectorization on an interleaved load group of factor 4.
 165
 166 ; struct ST4{
 167 ;   int x;
 168 ;   int y;
 169 ;   int z;
 170 ;   int w;
 171 ; };
 172 ; int test_struct_load4(struct ST4 *S) {
 173 ;   int r = 0;
 174 ;   for (int i = 0; i < 1024; i++) {
 175 ;      r += S[i].x;
 176 ;      r -= S[i].y;
 177 ;      r += S[i].z;
 178 ;      r -= S[i].w;
 179 ;   }
 180 ;   return r;
 181 ; }
 182
 ; Struct of four i32 fields, indexed by @test_struct_load4 and
 ; @test_struct_store4.
 183 %struct.ST4 = type { i32, i32, i32, i32 }
 184
 ; Scalar loop under test: a reduction over %S[0..1023] that adds fields 0 and
 ; 2 to the accumulator %r.022 and subtracts fields 1 and 3; the final value
 ; %sub8 is returned.
 ; The CHECK lines expect one <16 x i32> load split by four stride-4
 ; shufflevectors, a <4 x i32> accumulator phi, and an
 ; llvm.vector.reduce.add of that accumulator in middle.block.
 185 define i32 @test_struct_load4(ptr nocapture readonly %S) {
 186 ;
 187 ; CHECK-LABEL: @test_struct_load4(
 188 ; CHECK-NEXT:  entry:
 189 ; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 190 ; CHECK:       vector.ph:
 191 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 192 ; CHECK:       vector.body:
 193 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 194 ; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
 195 ; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT_ST4:%.*]], ptr [[S:%.*]], i64 [[INDEX]], i32 0
 196 ; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <16 x i32>, ptr [[TMP0]], align 4
 197 ; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> poison, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
 198 ; CHECK-NEXT:    [[STRIDED_VEC1:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> poison, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
 199 ; CHECK-NEXT:    [[STRIDED_VEC2:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> poison, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
 200 ; CHECK-NEXT:    [[STRIDED_VEC3:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> poison, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
 201 ; CHECK-NEXT:    [[TMP1:%.*]] = add <4 x i32> [[STRIDED_VEC]], [[VEC_PHI]]
 202 ; CHECK-NEXT:    [[TMP2:%.*]] = add <4 x i32> [[TMP1]], [[STRIDED_VEC2]]
 203 ; CHECK-NEXT:    [[TMP3:%.*]] = add <4 x i32> [[STRIDED_VEC1]], [[STRIDED_VEC3]]
 204 ; CHECK-NEXT:    [[TMP4]] = sub <4 x i32> [[TMP2]], [[TMP3]]
 205 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 206 ; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
 207 ; CHECK-NEXT:    br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
 208 ; CHECK:       middle.block:
 209 ; CHECK-NEXT:    [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP4]])
 210 ; CHECK-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
 211 ; CHECK:       scalar.ph:
 212 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 213 ; CHECK:       for.body:
 214 ; CHECK-NEXT:    br i1 poison, label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
 215 ; CHECK:       for.end:
 216 ; CHECK-NEXT:    [[SUB8_LCSSA:%.*]] = phi i32 [ poison, [[FOR_BODY]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ]
 217 ; CHECK-NEXT:    ret i32 [[SUB8_LCSSA]]
 218 ;
 219 entry:
 220   br label %for.body
 221
 222 for.body:                                         ; preds = %for.body, %entry
 223   %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
       ; Running reduction value, carried across iterations through %sub8.
 224   %r.022 = phi i32 [ 0, %entry ], [ %sub8, %for.body ]
 225   %x = getelementptr inbounds %struct.ST4, ptr %S, i64 %indvars.iv, i32 0
 226   %tmp = load i32, ptr %x, align 4
 227   %add = add nsw i32 %tmp, %r.022
 228   %y = getelementptr inbounds %struct.ST4, ptr %S, i64 %indvars.iv, i32 1
 229   %tmp1 = load i32, ptr %y, align 4
 230   %sub = sub i32 %add, %tmp1
 231   %z = getelementptr inbounds %struct.ST4, ptr %S, i64 %indvars.iv, i32 2
 232   %tmp2 = load i32, ptr %z, align 4
 233   %add5 = add nsw i32 %sub, %tmp2
 234   %w = getelementptr inbounds %struct.ST4, ptr %S, i64 %indvars.iv, i32 3
 235   %tmp3 = load i32, ptr %w, align 4
 236   %sub8 = sub i32 %add5, %tmp3
 237   %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
 238   %exitcond = icmp eq i64 %indvars.iv.next, 1024
 239   br i1 %exitcond, label %for.end, label %for.body
 240
 241 for.end:                                          ; preds = %for.body
 242   ret i32 %sub8
 243 }
 244
 245 ; Check vectorization on an interleaved store group of factor 4.
 246
 247 ; void test_struct_store4(int *A, struct ST4 *B) {
 248 ;   int *ptr = A;
 249 ;   for (int i = 0; i < 1024; i++) {
 250 ;     int X = *ptr++;
 251 ;     B[i].x = X + 1;
 252 ;     B[i].y = X * 2;
 253 ;     B[i].z = X + 3;
 254 ;     B[i].w = X + 4;
 255 ;   }
 256 ; }
 257
 258
 ; Scalar loop under test: loads one i32 per iteration through the pointer phi
 ; %ptr.024 (starting at %A) and stores tmp + 1, tmp << 1, tmp + 3 and tmp + 4
 ; into fields 0..3 of %B[%indvars.iv], for 1024 iterations.
 ; The CHECK lines expect a plain <4 x i32> load and one <16 x i32> store of
 ; an interleaving shufflevector built from the four result vectors.
 259 define void @test_struct_store4(ptr noalias nocapture readonly %A, ptr noalias nocapture %B) {
 260 ; CHECK-LABEL: @test_struct_store4(
 261 ; CHECK-NEXT:  entry:
 262 ; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 263 ; CHECK:       vector.ph:
 264 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 265 ; CHECK:       vector.body:
 266 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 267 ; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = shl i64 [[INDEX]], 2
 268 ; CHECK-NEXT:    [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 [[OFFSET_IDX]]
 269 ; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[NEXT_GEP]], align 4
 270 ; CHECK-NEXT:    [[TMP0:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], splat (i32 1)
 271 ; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [[STRUCT_ST4:%.*]], ptr [[B:%.*]], i64 [[INDEX]], i32 0
 272 ; CHECK-NEXT:    [[TMP1:%.*]] = shl nsw <4 x i32> [[WIDE_LOAD]], splat (i32 1)
 273 ; CHECK-NEXT:    [[TMP2:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], splat (i32 3)
 274 ; CHECK-NEXT:    [[TMP3:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], splat (i32 4)
 275 ; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> [[TMP1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 276 ; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 277 ; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i32> [[TMP5]], <8 x i32> [[TMP6]], <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
 278 ; CHECK-NEXT:    store <16 x i32> [[INTERLEAVED_VEC]], ptr [[TMP4]], align 4
 279 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 280 ; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
 281 ; CHECK-NEXT:    br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
 282 ; CHECK:       middle.block:
 283 ; CHECK-NEXT:    br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
 284 ; CHECK:       scalar.ph:
 285 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 286 ; CHECK:       for.cond.cleanup:
 287 ; CHECK-NEXT:    ret void
 288 ; CHECK:       for.body:
 289 ; CHECK-NEXT:    br i1 poison, label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
 290 ;
 291 entry:
 292   br label %for.body
 293
 294 for.cond.cleanup:                                 ; preds = %for.body
 295   ret void
 296
 297 for.body:                                         ; preds = %for.body, %entry
 298   %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
 299   %ptr.024 = phi ptr [ %A, %entry ], [ %incdec.ptr, %for.body ]
 300   %incdec.ptr = getelementptr inbounds i32, ptr %ptr.024, i64 1
 301   %tmp = load i32, ptr %ptr.024, align 4
 302   %add = add nsw i32 %tmp, 1
 303   %x = getelementptr inbounds %struct.ST4, ptr %B, i64 %indvars.iv, i32 0
 304   store i32 %add, ptr %x, align 4
       ; Doubling of the loaded value, written as a left shift by 1.
 305   %mul = shl nsw i32 %tmp, 1
 306   %y = getelementptr inbounds %struct.ST4, ptr %B, i64 %indvars.iv, i32 1
 307   store i32 %mul, ptr %y, align 4
 308   %add3 = add nsw i32 %tmp, 3
 309   %z = getelementptr inbounds %struct.ST4, ptr %B, i64 %indvars.iv, i32 2
 310   store i32 %add3, ptr %z, align 4
 311   %add6 = add nsw i32 %tmp, 4
 312   %w = getelementptr inbounds %struct.ST4, ptr %B, i64 %indvars.iv, i32 3
 313   store i32 %add6, ptr %w, align 4
 314   %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
 315   %exitcond = icmp eq i64 %indvars.iv.next, 1024
 316   br i1 %exitcond, label %for.cond.cleanup, label %for.body
 317 }
 318
 319 ; Check vectorization on a reverse interleaved load group of factor 2 and
 320 ; a reverse interleaved store group of factor 2.
 321
 322 ; struct ST2 {
 323 ;  int x;
 324 ;  int y;
 325 ; };
 326 ;
 327 ; void test_reversed_load2_store2(struct ST2 *A, struct ST2 *B) {
 328 ;   for (int i = 1023; i >= 0; i--) {
 329 ;     int a = A[i].x + i;  // interleaved load of index 0
 330 ;     int b = A[i].y - i;  // interleaved load of index 1
 331 ;     B[i].x = a;          // interleaved store of index 0
 332 ;     B[i].y = b;          // interleaved store of index 1
 333 ;   }
 334 ; }
 335
 336
 ; Struct of two i32 fields, indexed by @test_reversed_load2_store2.
 337 %struct.ST2 = type { i32, i32 }
 338
 ; Scalar loop under test: %indvars.iv counts down from 1023. Each iteration
 ; loads both fields of %A[iv], adds trunc(iv) to field 0 and subtracts it from
 ; field 1, and stores the results to the matching fields of %B[iv].
 ; The CHECK lines expect one <8 x i32> load and one <8 x i32> store, each
 ; addressed 24 bytes below the element at OFFSET_IDX, with `reverse`
 ; shufflevectors applied to the de-interleaved loads and again before the
 ; values are interleaved for the store.
 339 define void @test_reversed_load2_store2(ptr noalias nocapture readonly %A, ptr noalias nocapture %B) {
 340 ; CHECK-LABEL: @test_reversed_load2_store2(
 341 ; CHECK-NEXT:  entry:
 342 ; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 343 ; CHECK:       vector.ph:
 344 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 345 ; CHECK:       vector.body:
 346 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 347 ; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 1023, i32 1022, i32 1021, i32 1020>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
 348 ; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = sub i64 1023, [[INDEX]]
 349 ; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT_ST2:%.*]], ptr [[A:%.*]], i64 [[OFFSET_IDX]], i32 0
 350 ; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 -24
 351 ; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <8 x i32>, ptr [[TMP1]], align 4
 352 ; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
 353 ; CHECK-NEXT:    [[REVERSE:%.*]] = shufflevector <4 x i32> [[STRIDED_VEC]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
 354 ; CHECK-NEXT:    [[STRIDED_VEC1:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
 355 ; CHECK-NEXT:    [[REVERSE2:%.*]] = shufflevector <4 x i32> [[STRIDED_VEC1]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
 356 ; CHECK-NEXT:    [[TMP2:%.*]] = add nsw <4 x i32> [[REVERSE]], [[VEC_IND]]
 357 ; CHECK-NEXT:    [[TMP3:%.*]] = sub nsw <4 x i32> [[REVERSE2]], [[VEC_IND]]
 358 ; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [[STRUCT_ST2]], ptr [[B:%.*]], i64 [[OFFSET_IDX]], i32 0
 359 ; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i64 -24
 360 ; CHECK-NEXT:    [[REVERSE3:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
 361 ; CHECK-NEXT:    [[REVERSE4:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
 362 ; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i32> [[REVERSE3]], <4 x i32> [[REVERSE4]], <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
 363 ; CHECK-NEXT:    store <8 x i32> [[INTERLEAVED_VEC]], ptr [[TMP5]], align 4
 364 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 365 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 -4)
 366 ; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
 367 ; CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
 368 ; CHECK:       middle.block:
 369 ; CHECK-NEXT:    br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
 370 ; CHECK:       scalar.ph:
 371 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 372 ; CHECK:       for.cond.cleanup:
 373 ; CHECK-NEXT:    ret void
 374 ; CHECK:       for.body:
 375 ; CHECK-NEXT:    br i1 poison, label [[FOR_BODY]], label [[FOR_COND_CLEANUP]], !llvm.loop [[LOOP11:![0-9]+]]
 376 ;
 377 entry:
 378   br label %for.body
 379
 380 for.cond.cleanup:                                 ; preds = %for.body
 381   ret void
 382
 383 for.body:                                         ; preds = %for.body, %entry
 384   %indvars.iv = phi i64 [ 1023, %entry ], [ %indvars.iv.next, %for.body ]
 385   %x = getelementptr inbounds %struct.ST2, ptr %A, i64 %indvars.iv, i32 0
 386   %tmp = load i32, ptr %x, align 4
 387   %tmp1 = trunc i64 %indvars.iv to i32
 388   %add = add nsw i32 %tmp, %tmp1
 389   %y = getelementptr inbounds %struct.ST2, ptr %A, i64 %indvars.iv, i32 1
 390   %tmp2 = load i32, ptr %y, align 4
 391   %sub = sub nsw i32 %tmp2, %tmp1
 392   %x5 = getelementptr inbounds %struct.ST2, ptr %B, i64 %indvars.iv, i32 0
 393   store i32 %add, ptr %x5, align 4
 394   %y8 = getelementptr inbounds %struct.ST2, ptr %B, i64 %indvars.iv, i32 1
 395   store i32 %sub, ptr %y8, align 4
 396   %indvars.iv.next = add nsw i64 %indvars.iv, -1
       ; The exit test uses the current index, not the decremented one, so the
       ; iteration with index 0 still runs before the loop exits.
 397   %cmp = icmp sgt i64 %indvars.iv, 0
 398   br i1 %cmp, label %for.body, label %for.cond.cleanup
 399 }
 400
 401 ; Check vectorization on an interleaved load group of factor 2 with 1 gap
 402 ; (missing the load of odd elements). Because the vectorized loop would
 403 ; speculatively access memory out-of-bounds, we must execute at least one
 404 ; iteration of the scalar loop.
 405
 406 ; void even_load_static_tc(int *A, int *B) {
 407 ;  for (unsigned i = 0; i < 1024; i+=2)
 408 ;     B[i/2] = A[i] * 2;
 409 ; }
 410
 411
 ; Scalar loop under test: %indvars.iv advances by 2 from 0 while the next
 ; value is < 1024. Each iteration loads A[iv] (even elements only), doubles
 ; it with a left shift, and stores it to B[iv >> 1].
 ; The CHECK lines expect the vector loop to stop at INDEX_NEXT == 508,
 ; middle.block to branch unconditionally to scalar.ph, and the scalar loop to
 ; resume at induction value 1016, so the scalar loop always runs.
 412 define void @even_load_static_tc(ptr noalias nocapture readonly %A, ptr noalias nocapture %B) {
 413 ; CHECK-LABEL: @even_load_static_tc(
 414 ; CHECK-NEXT:  entry:
 415 ; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 416 ; CHECK:       vector.ph:
 417 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 418 ; CHECK:       vector.body:
 419 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 420 ; CHECK-NEXT:    [[DOTIDX:%.*]] = shl i64 [[INDEX]], 3
 421 ; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[A:%.*]], i64 [[DOTIDX]]
 422 ; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <8 x i32>, ptr [[TMP0]], align 4
 423 ; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
 424 ; CHECK-NEXT:    [[TMP1:%.*]] = shl nsw <4 x i32> [[STRIDED_VEC]], splat (i32 1)
 425 ; CHECK-NEXT:    [[TMP2:%.*]] = and i64 [[INDEX]], 9223372036854775804
 426 ; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[TMP2]]
 427 ; CHECK-NEXT:    store <4 x i32> [[TMP1]], ptr [[TMP3]], align 4
 428 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 429 ; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], 508
 430 ; CHECK-NEXT:    br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
 431 ; CHECK:       middle.block:
 432 ; CHECK-NEXT:    br label [[SCALAR_PH]]
 433 ; CHECK:       scalar.ph:
 434 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 435 ; CHECK:       for.cond.cleanup:
 436 ; CHECK-NEXT:    ret void
 437 ; CHECK:       for.body:
 438 ; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 1016, [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
 439 ; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDVARS_IV]]
 440 ; CHECK-NEXT:    [[TMP:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
 441 ; CHECK-NEXT:    [[MUL:%.*]] = shl nsw i32 [[TMP]], 1
 442 ; CHECK-NEXT:    [[TMP1:%.*]] = lshr exact i64 [[INDVARS_IV]], 1
 443 ; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP1]]
 444 ; CHECK-NEXT:    store i32 [[MUL]], ptr [[ARRAYIDX2]], align 4
 445 ; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 2
 446 ; CHECK-NEXT:    [[CMP:%.*]] = icmp samesign ult i64 [[INDVARS_IV]], 1022
 447 ; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP:%.*]], !llvm.loop [[LOOP13:![0-9]+]]
 448 ;
 449 entry:
 450   br label %for.body
 451
 452 for.cond.cleanup:                                 ; preds = %for.body
 453   ret void
 454
 455 for.body:                                         ; preds = %for.body, %entry
 456   %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
 457   %arrayidx = getelementptr inbounds i32, ptr %A, i64 %indvars.iv
 458   %tmp = load i32, ptr %arrayidx, align 4
 459   %mul = shl nsw i32 %tmp, 1
       ; Output index is half the (always even) input index.
 460   %tmp1 = lshr exact i64 %indvars.iv, 1
 461   %arrayidx2 = getelementptr inbounds i32, ptr %B, i64 %tmp1
 462   store i32 %mul, ptr %arrayidx2, align 4
 463   %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2
 464   %cmp = icmp ult i64 %indvars.iv.next, 1024
 465   br i1 %cmp, label %for.body, label %for.cond.cleanup
 466 }
 467
 468 ; Check vectorization on an interleaved load group of factor 2 with 1 gap
 469 ; (missing the load of odd elements). Because the vectorized loop would
 470 ; speculatively access memory out-of-bounds, we must execute at least one
 471 ; iteration of the scalar loop.
 472
 473 ; void even_load_dynamic_tc(int *A, int *B, unsigned N) {
 474 ;  for (unsigned i = 0; i < N; i+=2)
 475 ;     B[i/2] = A[i] * 2;
 476 ; }
 477
 478
 ; Scalar loop under test: %indvars.iv advances by 2 from 0 while the next
 ; value is unsigned-less-than %N. Each iteration loads A[iv] (even elements
 ; only), doubles it with a left shift, and stores it to B[iv >> 1].
 ; The CHECK lines expect a minimum-iteration check `%N ult 9` in entry, a
 ; vector trip count N_VEC computed by subtracting the remainder mod 4 with a
 ; zero remainder replaced by 4 (so N_VEC is always below the full trip
 ; count), and an unconditional branch from middle.block to scalar.ph.
 479 define void @even_load_dynamic_tc(ptr noalias nocapture readonly %A, ptr noalias nocapture %B, i64 %N) {
 480 ; CHECK-LABEL: @even_load_dynamic_tc(
 481 ; CHECK-NEXT:  entry:
 482 ; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 9
 483 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 484 ; CHECK:       vector.ph:
 485 ; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[N]], -1
 486 ; CHECK-NEXT:    [[TMP1:%.*]] = lshr i64 [[TMP0]], 1
 487 ; CHECK-NEXT:    [[TMP2:%.*]] = add nuw i64 [[TMP1]], 1
 488 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = and i64 [[TMP2]], 3
 489 ; CHECK-NEXT:    [[TMP3:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
 490 ; CHECK-NEXT:    [[TMP4:%.*]] = select i1 [[TMP3]], i64 4, i64 [[N_MOD_VF]]
 491 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP2]], [[TMP4]]
 492 ; CHECK-NEXT:    [[IND_END:%.*]] = shl i64 [[N_VEC]], 1
 493 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 494 ; CHECK:       vector.body:
 495 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 496 ; CHECK-NEXT:    [[DOTIDX:%.*]] = shl i64 [[INDEX]], 3
 497 ; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[A:%.*]], i64 [[DOTIDX]]
 498 ; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <8 x i32>, ptr [[TMP5]], align 4
 499 ; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
 500 ; CHECK-NEXT:    [[TMP6:%.*]] = shl nsw <4 x i32> [[STRIDED_VEC]], splat (i32 1)
 501 ; CHECK-NEXT:    [[TMP7:%.*]] = and i64 [[INDEX]], 9223372036854775804
 502 ; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[TMP7]]
 503 ; CHECK-NEXT:    store <4 x i32> [[TMP6]], ptr [[TMP8]], align 4
 504 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 505 ; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 506 ; CHECK-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
 507 ; CHECK:       middle.block:
 508 ; CHECK-NEXT:    br label [[SCALAR_PH]]
 509 ; CHECK:       scalar.ph:
 510 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
 511 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 512 ; CHECK:       for.cond.cleanup:
 513 ; CHECK-NEXT:    ret void
 514 ; CHECK:       for.body:
 515 ; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
 516 ; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDVARS_IV]]
 517 ; CHECK-NEXT:    [[TMP:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
 518 ; CHECK-NEXT:    [[MUL:%.*]] = shl nsw i32 [[TMP]], 1
 519 ; CHECK-NEXT:    [[TMP1:%.*]] = lshr exact i64 [[INDVARS_IV]], 1
 520 ; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP1]]
 521 ; CHECK-NEXT:    store i32 [[MUL]], ptr [[ARRAYIDX2]], align 4
 522 ; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 2
 523 ; CHECK-NEXT:    [[CMP:%.*]] = icmp ult i64 [[INDVARS_IV_NEXT]], [[N]]
 524 ; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP:%.*]], !llvm.loop [[LOOP15:![0-9]+]]
 525 ;
 526 entry:
 527   br label %for.body
 528
 529 for.cond.cleanup:                                 ; preds = %for.body
 530   ret void
 531
 532 for.body:                                         ; preds = %for.body, %entry
 533   %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
 534   %arrayidx = getelementptr inbounds i32, ptr %A, i64 %indvars.iv
 535   %tmp = load i32, ptr %arrayidx, align 4
 536   %mul = shl nsw i32 %tmp, 1
       ; Output index is half the (always even) input index.
 537   %tmp1 = lshr exact i64 %indvars.iv, 1
 538   %arrayidx2 = getelementptr inbounds i32, ptr %B, i64 %tmp1
 539   store i32 %mul, ptr %arrayidx2, align 4
 540   %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2
 541   %cmp = icmp ult i64 %indvars.iv.next, %N
 542   br i1 %cmp, label %for.body, label %for.cond.cleanup
 543 }
 544
 545 ; Check vectorization on a reverse interleaved load group of factor 2 with 1
 546 ; gap and a reverse interleaved store group of factor 2. The interleaved load
 547 ; group should be removed since it has a gap and is reverse.
 548
 549 ; struct pair {
 550 ;  int x;
 551 ;  int y;
 552 ; };
 553 ;
 554 ; void load_gap_reverse(struct pair *P1, struct pair *P2, int X) {
 555 ;   for (int i = 1023; i >= 0; i--) {
 556 ;     int a = X + i;
 557 ;     int b = P2[i].y - i;
 558 ;     P1[i].x = a;
 559 ;     P2[i].y = b;
 560 ;   }
 561 ; }
 562
 563
 ; Struct of two i64 fields. NOTE(review): presumably the element type behind
 ; %P1/%P2 in @load_gap_reverse; its scalar body is outside this excerpt, so
 ; confirm there.
 564 %pair = type { i64, i64 }
 565 define void @load_gap_reverse(ptr noalias nocapture %P1, ptr noalias nocapture %P2, i64 %X) {
 566 ; CHECK-LABEL: @load_gap_reverse(
 567 ; CHECK-NEXT:  entry:
 568 ; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 569 ; CHECK:       vector.ph:
 570 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[X:%.*]], i64 0
 571 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
 572 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 573 ; CHECK:       vector.body:
 574 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 575 ; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 1023, i64 1022, i64 1021, i64 1020>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
 576 ; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = sub i64 1023, [[INDEX]]
 577 ; CHECK-NEXT:    [[TMP0:%.*]] = sub i64 1022, [[INDEX]]
 578 ; CHECK-NEXT:    [[TMP1:%.*]] = sub i64 1021, [[INDEX]]
 579 ; CHECK-NEXT:    [[TMP2:%.*]] = sub i64 1020, [[INDEX]]
 580 ; CHECK-NEXT:    [[TMP3:%.*]] = add nsw <4 x i64> [[BROADCAST_SPLAT]], [[VEC_IND]]
 581 ; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [[PAIR:%.*]], ptr [[P1:%.*]], i64 [[OFFSET_IDX]], i32 0
 582 ; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[PAIR]], ptr [[P1]], i64 [[TMP0]], i32 0
 583 ; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[PAIR]], ptr [[P1]], i64 [[TMP1]], i32 0
 584 ; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[PAIR]], ptr [[P1]], i64 [[TMP2]], i32 0
 585 ; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [[PAIR]], ptr [[P2:%.*]], i64 [[OFFSET_IDX]], i32 1
 586 ; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [[PAIR]], ptr [[P2]], i64 [[TMP0]], i32 1
 587 ; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [[PAIR]], ptr [[P2]], i64 [[TMP1]], i32 1
 588 ; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [[PAIR]], ptr [[P2]], i64 [[TMP2]], i32 1
 589 ; CHECK-NEXT:    [[TMP12:%.*]] = load i64, ptr [[TMP8]], align 8
 590 ; CHECK-NEXT:    [[TMP13:%.*]] = load i64, ptr [[TMP9]], align 8
 591 ; CHECK-NEXT:    [[TMP14:%.*]] = load i64, ptr [[TMP10]], align 8
 592 ; CHECK-NEXT:    [[TMP15:%.*]] = load i64, ptr [[TMP11]], align 8
 593 ; CHECK-NEXT:    [[TMP16:%.*]] = insertelement <4 x i64> poison, i64 [[TMP12]], i64 0
 594 ; CHECK-NEXT:    [[TMP17:%.*]] = insertelement <4 x i64> [[TMP16]], i64 [[TMP13]], i64 1
 595 ; CHECK-NEXT:    [[TMP18:%.*]] = insertelement <4 x i64> [[TMP17]], i64 [[TMP14]], i64 2
 596 ; CHECK-NEXT:    [[TMP19:%.*]] = insertelement <4 x i64> [[TMP18]], i64 [[TMP15]], i64 3
 597 ; CHECK-NEXT:    [[TMP20:%.*]] = sub nsw <4 x i64> [[TMP19]], [[VEC_IND]]
 598 ; CHECK-NEXT:    [[TMP21:%.*]] = extractelement <4 x i64> [[TMP3]], i64 0
 599 ; CHECK-NEXT:    store i64 [[TMP21]], ptr [[TMP4]], align 8
 600 ; CHECK-NEXT:    [[TMP22:%.*]] = extractelement <4 x i64> [[TMP3]], i64 1
 601 ; CHECK-NEXT:    store i64 [[TMP22]], ptr [[TMP5]], align 8
 602 ; CHECK-NEXT:    [[TMP23:%.*]] = extractelement <4 x i64> [[TMP3]], i64 2
 603 ; CHECK-NEXT:    store i64 [[TMP23]], ptr [[TMP6]], align 8
 604 ; CHECK-NEXT:    [[TMP24:%.*]] = extractelement <4 x i64> [[TMP3]], i64 3
 605 ; CHECK-NEXT:    store i64 [[TMP24]], ptr [[TMP7]], align 8
 606 ; CHECK-NEXT:    [[TMP25:%.*]] = extractelement <4 x i64> [[TMP20]], i64 0
 607 ; CHECK-NEXT:    store i64 [[TMP25]], ptr [[TMP8]], align 8
 608 ; CHECK-NEXT:    [[TMP26:%.*]] = extractelement <4 x i64> [[TMP20]], i64 1
 609 ; CHECK-NEXT:    store i64 [[TMP26]], ptr [[TMP9]], align 8
 610 ; CHECK-NEXT:    [[TMP27:%.*]] = extractelement <4 x i64> [[TMP20]], i64 2
 611 ; CHECK-NEXT:    store i64 [[TMP27]], ptr [[TMP10]], align 8
 612 ; CHECK-NEXT:    [[TMP28:%.*]] = extractelement <4 x i64> [[TMP20]], i64 3
 613 ; CHECK-NEXT:    store i64 [[TMP28]], ptr [[TMP11]], align 8
 614 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 615 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 -4)
 616 ; CHECK-NEXT:    [[TMP29:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
 617 ; CHECK-NEXT:    br i1 [[TMP29]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
 618 ; CHECK:       middle.block:
 619 ; CHECK-NEXT:    br i1 true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
 620 ; CHECK:       scalar.ph:
 621 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 622 ; CHECK:       for.body:
 623 ; CHECK-NEXT:    br i1 poison, label [[FOR_BODY]], label [[FOR_EXIT]], !llvm.loop [[LOOP17:![0-9]+]]
 624 ; CHECK:       for.exit:
 625 ; CHECK-NEXT:    ret void
 626 ;
 627 entry:
 628   br label %for.body
 629
 630 for.body:
 631   %i = phi i64 [ 1023, %entry ], [ %i.next, %for.body ]
 632   %0 = add nsw i64 %X, %i
 633   %1 = getelementptr inbounds %pair, ptr %P1, i64 %i, i32 0
 634   %2 = getelementptr inbounds %pair, ptr %P2, i64 %i, i32 1
 635   %3 = load i64, ptr %2, align 8
 636   %4 = sub nsw i64 %3, %i
 637   store i64 %0, ptr %1, align 8
 638   store i64 %4, ptr %2, align 8
 639   %i.next = add nsw i64 %i, -1
 640   %cond = icmp sgt i64 %i, 0
 641   br i1 %cond, label %for.body, label %for.exit
 642
 643 for.exit:
 644   ret void
 645 }
 646
 647 ; Check vectorization on interleaved access groups identified from mixed
 648 ; loads/stores.
 649 ; void mixed_load2_store2(int *A, int *B) {
 650 ;   for (unsigned i = 0; i < 1024; i+=2)  {
 651 ;     B[i] = A[i] * A[i+1];
 652 ;     B[i+1] = A[i] + A[i+1];
 653 ;   }
 654 ; }
 655
 656
 657 define void @mixed_load2_store2(ptr noalias nocapture readonly %A, ptr noalias nocapture %B) {
 658 ; CHECK-LABEL: @mixed_load2_store2(
 659 ; CHECK-NEXT:  entry:
 660 ; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 661 ; CHECK:       vector.ph:
 662 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 663 ; CHECK:       vector.body:
 664 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 665 ; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = shl i64 [[INDEX]], 1
 666 ; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[OFFSET_IDX]]
 667 ; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <8 x i32>, ptr [[TMP0]], align 4
 668 ; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
 669 ; CHECK-NEXT:    [[STRIDED_VEC1:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
 670 ; CHECK-NEXT:    [[TMP2:%.*]] = mul nsw <4 x i32> [[STRIDED_VEC1]], [[STRIDED_VEC]]
 671 ; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[OFFSET_IDX]]
 672 ; CHECK-NEXT:    [[STRIDED_VEC3:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
 673 ; CHECK-NEXT:    [[STRIDED_VEC4:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
 674 ; CHECK-NEXT:    [[TMP3:%.*]] = add nsw <4 x i32> [[STRIDED_VEC4]], [[STRIDED_VEC3]]
 675 ; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
 676 ; CHECK-NEXT:    store <8 x i32> [[INTERLEAVED_VEC]], ptr [[TMP4]], align 4
 677 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 678 ; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 512
 679 ; CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
 680 ; CHECK:       middle.block:
 681 ; CHECK-NEXT:    br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
 682 ; CHECK:       scalar.ph:
 683 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 684 ; CHECK:       for.cond.cleanup:
 685 ; CHECK-NEXT:    ret void
 686 ; CHECK:       for.body:
 687 ; CHECK-NEXT:    br i1 poison, label [[FOR_BODY]], label [[FOR_COND_CLEANUP]], !llvm.loop [[LOOP19:![0-9]+]]
 688 ;
 689 entry:
 690   br label %for.body
 691
 692 for.cond.cleanup:                                 ; preds = %for.body
 693   ret void
 694
 695 for.body:                                         ; preds = %for.body, %entry
 696   %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] ; i starts at 0 and advances by 2
 697   %arrayidx = getelementptr inbounds i32, ptr %A, i64 %indvars.iv ; &A[i]
 698   %tmp = load i32, ptr %arrayidx, align 4 ; A[i]
 699   %tmp1 = or disjoint i64 %indvars.iv, 1 ; i | 1; the disjoint flag makes this equal to i + 1
 700   %arrayidx2 = getelementptr inbounds i32, ptr %A, i64 %tmp1 ; &A[i+1]
 701   %tmp2 = load i32, ptr %arrayidx2, align 4 ; A[i+1]
 702   %mul = mul nsw i32 %tmp2, %tmp ; A[i+1] * A[i]
 703   %arrayidx4 = getelementptr inbounds i32, ptr %B, i64 %indvars.iv ; &B[i]
 704   store i32 %mul, ptr %arrayidx4, align 4 ; B[i] = product
 705   %tmp3 = load i32, ptr %arrayidx, align 4 ; second load of A[i], issued after the store to B[i]
 706   %tmp4 = load i32, ptr %arrayidx2, align 4 ; second load of A[i+1], issued after the store to B[i]
 707   %add10 = add nsw i32 %tmp4, %tmp3 ; A[i+1] + A[i]
 708   %arrayidx13 = getelementptr inbounds i32, ptr %B, i64 %tmp1 ; &B[i+1]
 709   store i32 %add10, ptr %arrayidx13, align 4 ; B[i+1] = sum
 710   %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2 ; i += 2
 711   %cmp = icmp ult i64 %indvars.iv.next, 1024 ; continue while the next i is below 1024
 712   br i1 %cmp, label %for.body, label %for.cond.cleanup
 713 }
 714
 715 ; Check vectorization on interleaved access groups identified from mixed
 716 ; loads/stores.
 717 ; void mixed_load3_store3(int *A) {
 718 ;   for (unsigned i = 0; i < 1024; i++)  {
 719 ;     *A++ += i;
 720 ;     *A++ += i;
 721 ;     *A++ += i;
 722 ;   }
 723 ; }
 724
 725
 726 define void @mixed_load3_store3(ptr nocapture %A) {
 727 ; CHECK-LABEL: @mixed_load3_store3(
 728 ; CHECK-NEXT:  entry:
 729 ; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 730 ; CHECK:       vector.ph:
 731 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 732 ; CHECK:       vector.body:
 733 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 734 ; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
 735 ; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 12
 736 ; CHECK-NEXT:    [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 [[OFFSET_IDX]]
 737 ; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <12 x i32>, ptr [[NEXT_GEP]], align 4
 738 ; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <12 x i32> [[WIDE_VEC]], <12 x i32> poison, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
 739 ; CHECK-NEXT:    [[STRIDED_VEC2:%.*]] = shufflevector <12 x i32> [[WIDE_VEC]], <12 x i32> poison, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
 740 ; CHECK-NEXT:    [[STRIDED_VEC3:%.*]] = shufflevector <12 x i32> [[WIDE_VEC]], <12 x i32> poison, <4 x i32> <i32 2, i32 5, i32 8, i32 11>
 741 ; CHECK-NEXT:    [[TMP0:%.*]] = add <4 x i32> [[STRIDED_VEC]], [[VEC_IND]]
 742 ; CHECK-NEXT:    [[TMP1:%.*]] = add <4 x i32> [[STRIDED_VEC2]], [[VEC_IND]]
 743 ; CHECK-NEXT:    [[TMP2:%.*]] = add <4 x i32> [[STRIDED_VEC3]], [[VEC_IND]]
 744 ; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> [[TMP1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 745 ; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
 746 ; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i32> [[TMP3]], <8 x i32> [[TMP4]], <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
 747 ; CHECK-NEXT:    store <12 x i32> [[INTERLEAVED_VEC]], ptr [[NEXT_GEP]], align 4
 748 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 749 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4)
 750 ; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
 751 ; CHECK-NEXT:    br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
 752 ; CHECK:       middle.block:
 753 ; CHECK-NEXT:    br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
 754 ; CHECK:       scalar.ph:
 755 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 756 ; CHECK:       for.cond.cleanup:
 757 ; CHECK-NEXT:    ret void
 758 ; CHECK:       for.body:
 759 ; CHECK-NEXT:    br i1 poison, label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]]
 760 ;
 761 entry:
 762   br label %for.body
 763
 764 for.cond.cleanup:                                 ; preds = %for.body
 765   ret void
 766
 767 for.body:                                         ; preds = %for.body, %entry
 768   %i.013 = phi i32 [ 0, %entry ], [ %inc, %for.body ] ; i32 counter i, starts at 0
 769   %A.addr.012 = phi ptr [ %A, %entry ], [ %incdec.ptr3, %for.body ] ; running pointer, moved forward by 3 i32 elements per iteration
 770   %incdec.ptr = getelementptr inbounds i32, ptr %A.addr.012, i64 1 ; address of element 1 of the current triple
 771   %tmp = load i32, ptr %A.addr.012, align 4 ; element 0
 772   %add = add i32 %tmp, %i.013 ; element 0 + i
 773   store i32 %add, ptr %A.addr.012, align 4 ; written back to the address it was loaded from
 774   %incdec.ptr1 = getelementptr inbounds i32, ptr %A.addr.012, i64 2 ; address of element 2 of the current triple
 775   %tmp1 = load i32, ptr %incdec.ptr, align 4 ; element 1
 776   %add2 = add i32 %tmp1, %i.013 ; element 1 + i
 777   store i32 %add2, ptr %incdec.ptr, align 4 ; written back in place
 778   %incdec.ptr3 = getelementptr inbounds i32, ptr %A.addr.012, i64 3 ; start of the next triple; feeds the pointer phi
 779   %tmp2 = load i32, ptr %incdec.ptr1, align 4 ; element 2
 780   %add4 = add i32 %tmp2, %i.013 ; element 2 + i
 781   store i32 %add4, ptr %incdec.ptr1, align 4 ; written back in place
 782   %inc = add nuw nsw i32 %i.013, 1 ; i + 1
 783   %exitcond = icmp eq i32 %inc, 1024 ; true once 1024 iterations have run
 784   br i1 %exitcond, label %for.cond.cleanup, label %for.body
 785 }
 786
 787 ; Check vectorization on interleaved access groups whose members have
 788 ; different types (here an i32 field and a float field).
 789
 790 ; struct IntFloat {
 791 ;   int a;
 792 ;   float b;
 793 ; };
 794 ;
 795 ; int SA;
 796 ; float SB;
 797 ;
 798 ; void int_float_struct(struct IntFloat *A) {
 799 ;   int SumA;
 800 ;   float SumB;
 801 ;   for (unsigned i = 0; i < 1024; i++)  {
 802 ;     SumA += A[i].a;
 803 ;     SumB += A[i].b;
 804 ;   }
 805 ;   SA = SumA;
 806 ;   SB = SumB;
 807 ; }
 808
 809
 810 %struct.IntFloat = type { i32, float } ; field 0 is the i32 member, field 1 the float member
 811
 812 @SA = common global i32 0, align 4 ; receives the final integer sum
 813 @SB = common global float 0.000000e+00, align 4 ; receives the final float sum
 814
 815 define void @int_float_struct(ptr nocapture readonly %A) #0 {
 816 ; CHECK-LABEL: @int_float_struct(
 817 ; CHECK-NEXT:  entry:
 818 ; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 819 ; CHECK:       vector.ph:
 820 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 821 ; CHECK:       vector.body:
 822 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 823 ; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x float> [ <float undef, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>, [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ]
 824 ; CHECK-NEXT:    [[VEC_PHI1:%.*]] = phi <4 x i32> [ <i32 undef, i32 0, i32 0, i32 0>, [[VECTOR_PH]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ]
 825 ; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT_INTFLOAT:%.*]], ptr [[A:%.*]], i64 [[INDEX]], i32 0
 826 ; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <8 x i32>, ptr [[TMP0]], align 4
 827 ; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
 828 ; CHECK-NEXT:    [[STRIDED_VEC2:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
 829 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i32> [[STRIDED_VEC2]] to <4 x float>
 830 ; CHECK-NEXT:    [[TMP2]] = add <4 x i32> [[STRIDED_VEC]], [[VEC_PHI1]]
 831 ; CHECK-NEXT:    [[TMP3]] = fadd fast <4 x float> [[VEC_PHI]], [[TMP1]]
 832 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 833 ; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
 834 ; CHECK-NEXT:    br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
 835 ; CHECK:       middle.block:
 836 ; CHECK-NEXT:    [[TMP5:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[TMP3]])
 837 ; CHECK-NEXT:    [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP2]])
 838 ; CHECK-NEXT:    br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
 839 ; CHECK:       scalar.ph:
 840 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 841 ; CHECK:       for.cond.cleanup:
 842 ; CHECK-NEXT:    [[ADD_LCSSA:%.*]] = phi i32 [ poison, [[FOR_BODY]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ]
 843 ; CHECK-NEXT:    [[ADD3_LCSSA:%.*]] = phi float [ poison, [[FOR_BODY]] ], [ [[TMP5]], [[MIDDLE_BLOCK]] ]
 844 ; CHECK-NEXT:    store i32 [[ADD_LCSSA]], ptr @SA, align 4
 845 ; CHECK-NEXT:    store float [[ADD3_LCSSA]], ptr @SB, align 4
 846 ; CHECK-NEXT:    ret void
 847 ; CHECK:       for.body:
 848 ; CHECK-NEXT:    br i1 poison, label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]]
 849 ;
 850 entry:
 851   br label %for.body
 852
 853 for.cond.cleanup:                                 ; preds = %for.body
 854   store i32 %add, ptr @SA, align 4 ; SA = final integer sum
 855   store float %add3, ptr @SB, align 4 ; SB = final float sum
 856   ret void
 857
 858 for.body:                                         ; preds = %for.body, %entry
 859   %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] ; i starts at 0
 860   %SumB.014 = phi float [ undef, %entry ], [ %add3, %for.body ] ; running float sum; its initial value is undef
 861   %SumA.013 = phi i32 [ undef, %entry ], [ %add, %for.body ] ; running integer sum; its initial value is undef
 862   %a = getelementptr inbounds %struct.IntFloat, ptr %A, i64 %indvars.iv, i32 0 ; address of the i32 field of A[i]
 863   %tmp = load i32, ptr %a, align 4 ; i32 field of A[i]
 864   %add = add nsw i32 %tmp, %SumA.013 ; integer reduction step
 865   %b = getelementptr inbounds %struct.IntFloat, ptr %A, i64 %indvars.iv, i32 1 ; address of the float field of A[i]
 866   %tmp1 = load float, ptr %b, align 4 ; float field of A[i]
 867   %add3 = fadd fast float %SumB.014, %tmp1 ; float reduction step, carrying the fast flag
 868   %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 ; i + 1
 869   %exitcond = icmp eq i64 %indvars.iv.next, 1024 ; true once 1024 iterations have run
 870   br i1 %exitcond, label %for.cond.cleanup, label %for.body
 871 }
 872
 873 ; Check vectorization of interleaved access groups in the presence of
 874 ; dependences (PR27626). The following tests check that we don't reorder
 875 ; dependent loads and stores when generating code for interleaved access
 876 ; groups. Stores should be scalarized because the required code motion would
 877 ; break dependences, and the remaining interleaved load groups should have
 878 ; gaps.
 879
 880 ; PR27626_0: Ensure a strided store is not moved after a dependent (zero
 881 ;            distance) strided load.
 882
 883 ; void PR27626_0(struct pair *p, int z, int n) {
 884 ;   for (int i = 0; i < n; i++) {
 885 ;     p[i].x = z;
 886 ;     p[i].y = p[i].x;
 887 ;   }
 888 ; }
 889
 890
 891 %pair.i32 = type { i32, i32 } ; two i32 fields; field 0 is accessed as x and field 1 as y below
 892 define void @PR27626_0(ptr %p, i32 %z, i64 %n) {
 893 ; CHECK-LABEL: @PR27626_0(
 894 ; CHECK-NEXT:  entry:
 895 ; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp slt i64 [[N:%.*]], 5
 896 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 897 ; CHECK:       vector.ph:
 898 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = and i64 [[N]], 3
 899 ; CHECK-NEXT:    [[TMP0:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
 900 ; CHECK-NEXT:    [[TMP1:%.*]] = select i1 [[TMP0]], i64 4, i64 [[N_MOD_VF]]
 901 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub nsw i64 [[N]], [[TMP1]]
 902 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 903 ; CHECK:       vector.body:
 904 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 905 ; CHECK-NEXT:    [[TMP2:%.*]] = or disjoint i64 [[INDEX]], 1
 906 ; CHECK-NEXT:    [[TMP3:%.*]] = or disjoint i64 [[INDEX]], 2
 907 ; CHECK-NEXT:    [[TMP4:%.*]] = or disjoint i64 [[INDEX]], 3
 908 ; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[PAIR_I32:%.*]], ptr [[P:%.*]], i64 [[INDEX]], i32 0
 909 ; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP2]], i32 0
 910 ; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP3]], i32 0
 911 ; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP4]], i32 0
 912 ; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[INDEX]], i32 1
 913 ; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP2]], i32 1
 914 ; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP3]], i32 1
 915 ; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP4]], i32 1
 916 ; CHECK-NEXT:    store i32 [[Z:%.*]], ptr [[TMP5]], align 4
 917 ; CHECK-NEXT:    store i32 [[Z]], ptr [[TMP6]], align 4
 918 ; CHECK-NEXT:    store i32 [[Z]], ptr [[TMP7]], align 4
 919 ; CHECK-NEXT:    store i32 [[Z]], ptr [[TMP8]], align 4
 920 ; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <8 x i32>, ptr [[TMP5]], align 4
 921 ; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i64 0
 922 ; CHECK-NEXT:    store i32 [[TMP13]], ptr [[TMP9]], align 4
 923 ; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i64 2
 924 ; CHECK-NEXT:    store i32 [[TMP14]], ptr [[TMP10]], align 4
 925 ; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i64 4
 926 ; CHECK-NEXT:    store i32 [[TMP15]], ptr [[TMP11]], align 4
 927 ; CHECK-NEXT:    [[TMP16:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i64 6
 928 ; CHECK-NEXT:    store i32 [[TMP16]], ptr [[TMP12]], align 4
 929 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 930 ; CHECK-NEXT:    [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 931 ; CHECK-NEXT:    br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]]
 932 ; CHECK:       middle.block:
 933 ; CHECK-NEXT:    br label [[SCALAR_PH]]
 934 ; CHECK:       scalar.ph:
 935 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
 936 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 937 ; CHECK:       for.body:
 938 ; CHECK-NEXT:    [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
 939 ; CHECK-NEXT:    [[P_I_X:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[I]], i32 0
 940 ; CHECK-NEXT:    [[P_I_Y:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[I]], i32 1
 941 ; CHECK-NEXT:    store i32 [[Z]], ptr [[P_I_X]], align 4
 942 ; CHECK-NEXT:    store i32 [[Z]], ptr [[P_I_Y]], align 4
 943 ; CHECK-NEXT:    [[I_NEXT]] = add nuw nsw i64 [[I]], 1
 944 ; CHECK-NEXT:    [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]]
 945 ; CHECK-NEXT:    br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END:%.*]], !llvm.loop [[LOOP25:![0-9]+]]
 946 ; CHECK:       for.end:
 947 ; CHECK-NEXT:    ret void
 948 ;
 949 entry:
 950   br label %for.body
 951
 952 for.body:
 953   %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ] ; i starts at 0
 954   %p_i.x = getelementptr inbounds %pair.i32, ptr %p, i64 %i, i32 0 ; &p[i].x
 955   %p_i.y = getelementptr inbounds %pair.i32, ptr %p, i64 %i, i32 1 ; &p[i].y
 956   store i32 %z, ptr %p_i.x, align 4 ; p[i].x = z
 957   %0 = load i32, ptr %p_i.x, align 4 ; reads back the address just stored to, so it must stay after that store
 958   store i32 %0, ptr %p_i.y, align 4 ; p[i].y = value read from p[i].x
 959   %i.next = add nuw nsw i64 %i, 1 ; i + 1
 960   %cond = icmp slt i64 %i.next, %n ; signed compare of i + 1 against n, evaluated after the body has run
 961   br i1 %cond, label %for.body, label %for.end
 962
 963 for.end:
 964   ret void
 965 }
 966
 967 ; PR27626_1: Ensure a strided load is not moved before a dependent (zero
 968 ;            distance) strided store.
 969
 970 ; void PR27626_1(struct pair *p, int n) {
 971 ;   int s = 0;
 972 ;   for (int i = 0; i < n; i++) {
 973 ;     p[i].y = p[i].x;
 974 ;     s += p[i].y;
 975 ;   }
 976 ; }
 977
 978
 979 define i32 @PR27626_1(ptr %p, i64 %n) {
 980 ; CHECK-LABEL: @PR27626_1(
 981 ; CHECK-NEXT:  entry:
 982 ; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp slt i64 [[N:%.*]], 5
 983 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 984 ; CHECK:       vector.ph:
 985 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = and i64 [[N]], 3
 986 ; CHECK-NEXT:    [[TMP0:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
 987 ; CHECK-NEXT:    [[TMP1:%.*]] = select i1 [[TMP0]], i64 4, i64 [[N_MOD_VF]]
 988 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub nsw i64 [[N]], [[TMP1]]
 989 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 990 ; CHECK:       vector.body:
 991 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 992 ; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP14:%.*]], [[VECTOR_BODY]] ]
 993 ; CHECK-NEXT:    [[TMP2:%.*]] = or disjoint i64 [[INDEX]], 1
 994 ; CHECK-NEXT:    [[TMP3:%.*]] = or disjoint i64 [[INDEX]], 2
 995 ; CHECK-NEXT:    [[TMP4:%.*]] = or disjoint i64 [[INDEX]], 3
 996 ; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[PAIR_I32:%.*]], ptr [[P:%.*]], i64 [[INDEX]], i32 0
 997 ; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[INDEX]], i32 1
 998 ; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP2]], i32 1
 999 ; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP3]], i32 1
1000 ; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP4]], i32 1
1001 ; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <8 x i32>, ptr [[TMP5]], align 4
1002 ; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i64 0
1003 ; CHECK-NEXT:    store i32 [[TMP10]], ptr [[TMP6]], align 4
1004 ; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i64 2
1005 ; CHECK-NEXT:    store i32 [[TMP11]], ptr [[TMP7]], align 4
1006 ; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i64 4
1007 ; CHECK-NEXT:    store i32 [[TMP12]], ptr [[TMP8]], align 4
1008 ; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i64 6
1009 ; CHECK-NEXT:    store i32 [[TMP13]], ptr [[TMP9]], align 4
1010 ; CHECK-NEXT:    [[WIDE_VEC1:%.*]] = load <8 x i32>, ptr [[TMP6]], align 4
1011 ; CHECK-NEXT:    [[STRIDED_VEC2:%.*]] = shufflevector <8 x i32> [[WIDE_VEC1]], <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
1012 ; CHECK-NEXT:    [[TMP14]] = add <4 x i32> [[STRIDED_VEC2]], [[VEC_PHI]]
1013 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
1014 ; CHECK-NEXT:    [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
1015 ; CHECK-NEXT:    br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]]
1016 ; CHECK:       middle.block:
1017 ; CHECK-NEXT:    [[TMP16:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP14]])
1018 ; CHECK-NEXT:    br label [[SCALAR_PH]]
1019 ; CHECK:       scalar.ph:
1020 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
1021 ; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP16]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
1022 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
1023 ; CHECK:       for.body:
1024 ; CHECK-NEXT:    [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
1025 ; CHECK-NEXT:    [[S:%.*]] = phi i32 [ [[TMP18:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
1026 ; CHECK-NEXT:    [[P_I_X:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[I]], i32 0
1027 ; CHECK-NEXT:    [[P_I_Y:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[I]], i32 1
1028 ; CHECK-NEXT:    [[TMP17:%.*]] = load i32, ptr [[P_I_X]], align 4
1029 ; CHECK-NEXT:    store i32 [[TMP17]], ptr [[P_I_Y]], align 4
1030 ; CHECK-NEXT:    [[TMP18]] = add nsw i32 [[TMP17]], [[S]]
1031 ; CHECK-NEXT:    [[I_NEXT]] = add nuw nsw i64 [[I]], 1
1032 ; CHECK-NEXT:    [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]]
1033 ; CHECK-NEXT:    br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END:%.*]], !llvm.loop [[LOOP27:![0-9]+]]
1034 ; CHECK:       for.end:
1035 ; CHECK-NEXT:    ret i32 [[TMP18]]
1036 ;
1037 entry:
1038   br label %for.body
1039
1040 for.body:
1041   %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ] ; i starts at 0
1042   %s = phi i32 [ %2, %for.body ], [ 0, %entry ] ; running sum s, starts at 0
1043   %p_i.x = getelementptr inbounds %pair.i32, ptr %p, i64 %i, i32 0 ; &p[i].x
1044   %p_i.y = getelementptr inbounds %pair.i32, ptr %p, i64 %i, i32 1 ; &p[i].y
1045   %0 = load i32, ptr %p_i.x, align 4 ; p[i].x
1046   store i32 %0, ptr %p_i.y, align 4 ; p[i].y = p[i].x
1047   %1 = load i32, ptr %p_i.y, align 4 ; reads back the address just stored to, so it must stay after that store
1048   %2 = add nsw i32 %1, %s ; s + p[i].y
1049   %i.next = add nuw nsw i64 %i, 1 ; i + 1
1050   %cond = icmp slt i64 %i.next, %n ; signed compare of i + 1 against n, evaluated after the body has run
1051   br i1 %cond, label %for.body, label %for.end
1052
1053 for.end:
1054   %3 = phi i32 [ %2, %for.body ] ; single-incoming phi carrying the final sum out of the loop
1055   ret i32 %3
1056 }
1057
1058 ; PR27626_2: Ensure a strided store is not moved after a dependent (negative
1059 ;            distance) strided load.
1060
1061 ; void PR27626_2(struct pair *p, int z, int n) {
1062 ;   for (int i = 0; i < n; i++) {
1063 ;     p[i].x = z;
1064 ;     p[i].y = p[i - 1].x;
1065 ;   }
1066 ; }
1067
1068
1069 define void @PR27626_2(ptr %p, i64 %n, i32 %z) {
1070 ; CHECK-LABEL: @PR27626_2(
1071 ; CHECK-NEXT:  entry:
1072 ; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp slt i64 [[N:%.*]], 5
1073 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
1074 ; CHECK:       vector.ph:
1075 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = and i64 [[N]], 3
1076 ; CHECK-NEXT:    [[TMP0:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
1077 ; CHECK-NEXT:    [[TMP1:%.*]] = select i1 [[TMP0]], i64 4, i64 [[N_MOD_VF]]
1078 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub nsw i64 [[N]], [[TMP1]]
1079 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
1080 ; CHECK:       vector.body:
1081 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
1082 ; CHECK-NEXT:    [[TMP2:%.*]] = or disjoint i64 [[INDEX]], 1
1083 ; CHECK-NEXT:    [[TMP3:%.*]] = or disjoint i64 [[INDEX]], 2
1084 ; CHECK-NEXT:    [[TMP4:%.*]] = or disjoint i64 [[INDEX]], 3
1085 ; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[PAIR_I32:%.*]], ptr [[P:%.*]], i64 [[INDEX]], i32 0
1086 ; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP2]], i32 0
1087 ; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP3]], i32 0
1088 ; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP4]], i32 0
1089 ; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 -8
1090 ; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[INDEX]], i32 1
1091 ; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP2]], i32 1
1092 ; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP3]], i32 1
1093 ; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP4]], i32 1
1094 ; CHECK-NEXT:    store i32 [[Z:%.*]], ptr [[TMP5]], align 4
1095 ; CHECK-NEXT:    store i32 [[Z]], ptr [[TMP6]], align 4
1096 ; CHECK-NEXT:    store i32 [[Z]], ptr [[TMP7]], align 4
1097 ; CHECK-NEXT:    store i32 [[Z]], ptr [[TMP8]], align 4
1098 ; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <8 x i32>, ptr [[TMP9]], align 4
1099 ; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i64 0
1100 ; CHECK-NEXT:    store i32 [[TMP14]], ptr [[TMP10]], align 4
1101 ; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i64 2
1102 ; CHECK-NEXT:    store i32 [[TMP15]], ptr [[TMP11]], align 4
1103 ; CHECK-NEXT:    [[TMP16:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i64 4
1104 ; CHECK-NEXT:    store i32 [[TMP16]], ptr [[TMP12]], align 4
1105 ; CHECK-NEXT:    [[TMP17:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i64 6
1106 ; CHECK-NEXT:    store i32 [[TMP17]], ptr [[TMP13]], align 4
1107 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
1108 ; CHECK-NEXT:    [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
1109 ; CHECK-NEXT:    br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]]
1110 ; CHECK:       middle.block:
1111 ; CHECK-NEXT:    br label [[SCALAR_PH]]
1112 ; CHECK:       scalar.ph:
1113 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
1114 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
1115 ; CHECK:       for.body:
1116 ; CHECK-NEXT:    [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
1117 ; CHECK-NEXT:    [[P_I_X:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[I]], i32 0
1118 ; CHECK-NEXT:    [[P_I_MINUS_1_X:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 -8
1119 ; CHECK-NEXT:    [[P_I_Y:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[I]], i32 1
1120 ; CHECK-NEXT:    store i32 [[Z]], ptr [[P_I_X]], align 4
1121 ; CHECK-NEXT:    [[TMP19:%.*]] = load i32, ptr [[P_I_MINUS_1_X]], align 4
1122 ; CHECK-NEXT:    store i32 [[TMP19]], ptr [[P_I_Y]], align 4
1123 ; CHECK-NEXT:    [[I_NEXT]] = add nuw nsw i64 [[I]], 1
1124 ; CHECK-NEXT:    [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]]
1125 ; CHECK-NEXT:    br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END:%.*]], !llvm.loop [[LOOP29:![0-9]+]]
1126 ; CHECK:       for.end:
1127 ; CHECK-NEXT:    ret void
1128 ;
1129 entry:
1130   br label %for.body
1131
1132 for.body:
1133   %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ] ; i starts at 0
1134   %i_minus_1 = add nuw nsw i64 %i, -1 ; i - 1. NOTE(review): with nuw, adding -1 is poison unless %i == 0, and the expected output above addresses this load as p - 8 bytes on every iteration -- confirm the nuw flag is intended
1135   %p_i.x = getelementptr inbounds %pair.i32, ptr %p, i64 %i, i32 0 ; &p[i].x
1136   %p_i_minus_1.x = getelementptr inbounds %pair.i32, ptr %p, i64 %i_minus_1, i32 0 ; &p[i-1].x
1137   %p_i.y = getelementptr inbounds %pair.i32, ptr %p, i64 %i, i32 1 ; &p[i].y
1138   store i32 %z, ptr %p_i.x, align 4 ; p[i].x = z
1139   %0 = load i32, ptr %p_i_minus_1.x, align 4 ; p[i-1].x, the field stored to by the previous iteration
1140   store i32 %0, ptr %p_i.y, align 4 ; p[i].y = p[i-1].x
1141   %i.next = add nuw nsw i64 %i, 1 ; i + 1
1142   %cond = icmp slt i64 %i.next, %n ; signed compare of i + 1 against n, evaluated after the body has run
1143   br i1 %cond, label %for.body, label %for.end
1144
1145 for.end:
1146   ret void
1147 }
1148
1149 ; PR27626_3: Ensure a strided load is not moved before a dependent (negative
1150 ;            distance) strided store.
1151
1152 ; int PR27626_3(struct pair *p, int z, int n) {
1153 ;   int s = 0;
1154 ;   for (int i = 0; i < n; i++) {
1155 ;     p[i + 1].y = p[i].x;
1156 ;     s += p[i].y;
1157 ;   }
1158 ;   return s;
1159 ; }
1160 define i32 @PR27626_3(ptr %p, i64 %n, i32 %z) { ; %z is not referenced in the body
1161 ; CHECK-LABEL: @PR27626_3(
1162 ; CHECK-NEXT:  entry:
1163 ; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp slt i64 [[N:%.*]], 5
1164 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
1165 ; CHECK:       vector.ph:
1166 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = and i64 [[N]], 3
1167 ; CHECK-NEXT:    [[TMP0:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
1168 ; CHECK-NEXT:    [[TMP1:%.*]] = select i1 [[TMP0]], i64 4, i64 [[N_MOD_VF]]
1169 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub nsw i64 [[N]], [[TMP1]]
1170 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
1171 ; CHECK:       vector.body:
1172 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
1173 ; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
1174 ; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP17:%.*]], [[VECTOR_BODY]] ]
1175 ; CHECK-NEXT:    [[TMP2:%.*]] = add nuw nsw <4 x i64> [[VEC_IND]], splat (i64 1)
1176 ; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [[PAIR_I32:%.*]], ptr [[P:%.*]], i64 [[INDEX]], i32 0
1177 ; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[INDEX]], i32 1
1178 ; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x i64> [[TMP2]], i64 0
1179 ; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP5]], i32 1
1180 ; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <4 x i64> [[TMP2]], i64 1
1181 ; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP7]], i32 1
1182 ; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <4 x i64> [[TMP2]], i64 2
1183 ; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP9]], i32 1
1184 ; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <4 x i64> [[TMP2]], i64 3
1185 ; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP11]], i32 1
1186 ; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <8 x i32>, ptr [[TMP3]], align 4
1187 ; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i64 0
1188 ; CHECK-NEXT:    store i32 [[TMP13]], ptr [[TMP6]], align 4
1189 ; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i64 2
1190 ; CHECK-NEXT:    store i32 [[TMP14]], ptr [[TMP8]], align 4
1191 ; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i64 4
1192 ; CHECK-NEXT:    store i32 [[TMP15]], ptr [[TMP10]], align 4
1193 ; CHECK-NEXT:    [[TMP16:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i64 6
1194 ; CHECK-NEXT:    store i32 [[TMP16]], ptr [[TMP12]], align 4
1195 ; CHECK-NEXT:    [[WIDE_VEC1:%.*]] = load <8 x i32>, ptr [[TMP4]], align 4
1196 ; CHECK-NEXT:    [[STRIDED_VEC2:%.*]] = shufflevector <8 x i32> [[WIDE_VEC1]], <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
1197 ; CHECK-NEXT:    [[TMP17]] = add <4 x i32> [[STRIDED_VEC2]], [[VEC_PHI]]
1198 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
1199 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
1200 ; CHECK-NEXT:    [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
1201 ; CHECK-NEXT:    br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP30:![0-9]+]]
1202 ; CHECK:       middle.block:
1203 ; CHECK-NEXT:    [[TMP19:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP17]])
1204 ; CHECK-NEXT:    br label [[SCALAR_PH]]
1205 ; CHECK:       scalar.ph:
1206 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
1207 ; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP19]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
1208 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
1209 ; CHECK:       for.body:
1210 ; CHECK-NEXT:    [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
1211 ; CHECK-NEXT:    [[S:%.*]] = phi i32 [ [[TMP22:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
1212 ; CHECK-NEXT:    [[I_PLUS_1:%.*]] = add nuw nsw i64 [[I]], 1
1213 ; CHECK-NEXT:    [[P_I_X:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[I]], i32 0
1214 ; CHECK-NEXT:    [[P_I_Y:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[I]], i32 1
1215 ; CHECK-NEXT:    [[P_I_PLUS_1_Y:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[I_PLUS_1]], i32 1
1216 ; CHECK-NEXT:    [[TMP20:%.*]] = load i32, ptr [[P_I_X]], align 4
1217 ; CHECK-NEXT:    store i32 [[TMP20]], ptr [[P_I_PLUS_1_Y]], align 4
1218 ; CHECK-NEXT:    [[TMP21:%.*]] = load i32, ptr [[P_I_Y]], align 4
1219 ; CHECK-NEXT:    [[TMP22]] = add nsw i32 [[TMP21]], [[S]]
1220 ; CHECK-NEXT:    [[I_NEXT]] = add nuw nsw i64 [[I]], 1
1221 ; CHECK-NEXT:    [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]]
1222 ; CHECK-NEXT:    br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END:%.*]], !llvm.loop [[LOOP31:![0-9]+]]
1223 ; CHECK:       for.end:
1224 ; CHECK-NEXT:    ret i32 [[TMP22]]
1225 ;
1226 entry:
1227   br label %for.body
1228
1229 for.body:
1230   %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ] ; i: starts at 0, advanced by %i.next
1231   %s = phi i32 [ %2, %for.body ], [ 0, %entry ] ; s: running sum, starts at 0
1232   %i_plus_1 = add nuw nsw i64 %i, 1
1233   %p_i.x = getelementptr inbounds %pair.i32, ptr %p, i64 %i, i32 0
1234   %p_i.y = getelementptr inbounds %pair.i32, ptr %p, i64 %i, i32 1
1235   %p_i_plus_1.y = getelementptr inbounds %pair.i32, ptr %p, i64 %i_plus_1, i32 1
1236   %0 = load i32, ptr %p_i.x, align 4 ; read p[i].x
1237   store i32 %0, ptr %p_i_plus_1.y, align 4 ; p[i + 1].y = p[i].x
1238   %1 = load i32, ptr %p_i.y, align 4 ; read p[i].y, the field the previous iteration's store wrote
1239   %2 = add nsw i32 %1, %s ; s += p[i].y
1240   %i.next = add nuw nsw i64 %i, 1
1241   %cond = icmp slt i64 %i.next, %n
1242   br i1 %cond, label %for.body, label %for.end ; keep looping while i + 1 < n (signed)
1243
1244 for.end:
1245   %3 = phi i32 [ %2, %for.body ] ; final value of the sum on loop exit
1246   ret i32 %3
1247 }
1248
1249 ; PR27626_4: Ensure we form an interleaved group for strided stores in the
1250 ;            presence of a write-after-write dependence. We create a group for
1251 ;            (2) and (3) while excluding (1).
1252
1253 ; void PR27626_4(int *a, int x, int y, int z, int n) {
1254 ;   for (int i = 0; i < n; i += 2) {
1255 ;     a[i] = x;      // (1)
1256 ;     a[i] = y;      // (2)
1257 ;     a[i + 1] = z;  // (3)
1258 ;   }
1259 ; }
1260
1261
1262 define void @PR27626_4(ptr %a, i32 %x, i32 %y, i32 %z, i64 %n) {
1263 ; CHECK-LABEL: @PR27626_4(
1264 ; CHECK-NEXT:  entry:
1265 ; CHECK-NEXT:    [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N:%.*]], i64 2)
1266 ; CHECK-NEXT:    [[TMP0:%.*]] = add nsw i64 [[SMAX]], -1
1267 ; CHECK-NEXT:    [[TMP1:%.*]] = lshr i64 [[TMP0]], 1
1268 ; CHECK-NEXT:    [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1
1269 ; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp slt i64 [[N]], 7
1270 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
1271 ; CHECK:       vector.ph:
1272 ; CHECK-NEXT:    [[N_VEC:%.*]] = and i64 [[TMP2]], 9223372036854775804
1273 ; CHECK-NEXT:    [[IND_END:%.*]] = shl nuw i64 [[N_VEC]], 1
1274 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[Y:%.*]], i64 0
1275 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
1276 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i32> poison, i32 [[Z:%.*]], i64 0
1277 ; CHECK-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT1]], <4 x i32> poison, <4 x i32> zeroinitializer
1278 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
1279 ; CHECK:       vector.body:
1280 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
1281 ; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = shl i64 [[INDEX]], 1
1282 ; CHECK-NEXT:    [[TMP3:%.*]] = or disjoint i64 [[OFFSET_IDX]], 2
1283 ; CHECK-NEXT:    [[TMP4:%.*]] = or disjoint i64 [[OFFSET_IDX]], 4
1284 ; CHECK-NEXT:    [[TMP5:%.*]] = or disjoint i64 [[OFFSET_IDX]], 6
1285 ; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[OFFSET_IDX]]
1286 ; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP3]]
1287 ; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP4]]
1288 ; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP5]]
1289 ; CHECK-NEXT:    store i32 [[X:%.*]], ptr [[TMP7]], align 4
1290 ; CHECK-NEXT:    store i32 [[X]], ptr [[TMP8]], align 4
1291 ; CHECK-NEXT:    store i32 [[X]], ptr [[TMP9]], align 4
1292 ; CHECK-NEXT:    store i32 [[X]], ptr [[TMP10]], align 4
1293 ; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLAT]], <4 x i32> [[BROADCAST_SPLAT2]], <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
1294 ; CHECK-NEXT:    store <8 x i32> [[INTERLEAVED_VEC]], ptr [[TMP7]], align 4
1295 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
1296 ; CHECK-NEXT:    [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
1297 ; CHECK-NEXT:    br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP32:![0-9]+]]
1298 ; CHECK:       middle.block:
1299 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
1300 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
1301 ; CHECK:       scalar.ph:
1302 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
1303 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
1304 ; CHECK:       for.body:
1305 ; CHECK-NEXT:    [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
1306 ; CHECK-NEXT:    [[I_PLUS_1:%.*]] = or disjoint i64 [[I]], 1
1307 ; CHECK-NEXT:    [[A_I:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[I]]
1308 ; CHECK-NEXT:    [[A_I_PLUS_1:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[I_PLUS_1]]
1309 ; CHECK-NEXT:    store i32 [[Y]], ptr [[A_I]], align 4
1310 ; CHECK-NEXT:    store i32 [[Z]], ptr [[A_I_PLUS_1]], align 4
1311 ; CHECK-NEXT:    [[I_NEXT]] = add nuw nsw i64 [[I]], 2
1312 ; CHECK-NEXT:    [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]]
1313 ; CHECK-NEXT:    br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP33:![0-9]+]]
1314 ; CHECK:       for.end:
1315 ; CHECK-NEXT:    ret void
1316 ;
1317 entry:
1318   br label %for.body
1319
1320 for.body:
1321   %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ] ; i: starts at 0, stepped by 2
1322   %i_plus_1 = add i64 %i, 1
1323   %a_i = getelementptr inbounds i32, ptr %a, i64 %i
1324   %a_i_plus_1 = getelementptr inbounds i32, ptr %a, i64 %i_plus_1
1325   store i32 %x, ptr %a_i, align 4 ; (1) a[i] = x
1326   store i32 %y, ptr %a_i, align 4 ; (2) a[i] = y, same address as (1)
1327   store i32 %z, ptr %a_i_plus_1, align 4 ; (3) a[i + 1] = z
1328   %i.next = add nuw nsw i64 %i, 2
1329   %cond = icmp slt i64 %i.next, %n
1330   br i1 %cond, label %for.body, label %for.end ; keep looping while i + 2 < n (signed)
1331
1332 for.end:
1333   ret void
1334 }
1335
1336 ; PR27626_5: Ensure we do not form an interleaved group for strided stores in
1337 ;            the presence of a write-after-write dependence.
1338
1339 ; void PR27626_5(int *a, int x, int y, int z, int n) {
1340 ;   for (int i = 3; i < n; i += 2) {
1341 ;     a[i - 1] = x;
1342 ;     a[i - 3] = y;
1343 ;     a[i] = z;
1344 ;   }
1345 ; }
1346
1347
1348 define void @PR27626_5(ptr %a, i32 %x, i32 %y, i32 %z, i64 %n) {
1349 ; CHECK-LABEL: @PR27626_5(
1350 ; CHECK-NEXT:  entry:
1351 ; CHECK-NEXT:    [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N:%.*]], i64 5)
1352 ; CHECK-NEXT:    [[TMP0:%.*]] = add nsw i64 [[SMAX]], -4
1353 ; CHECK-NEXT:    [[TMP1:%.*]] = lshr i64 [[TMP0]], 1
1354 ; CHECK-NEXT:    [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1
1355 ; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp slt i64 [[N]], 10
1356 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
1357 ; CHECK:       vector.ph:
1358 ; CHECK-NEXT:    [[N_VEC:%.*]] = and i64 [[TMP2]], 9223372036854775804
1359 ; CHECK-NEXT:    [[TMP3:%.*]] = shl nuw i64 [[N_VEC]], 1
1360 ; CHECK-NEXT:    [[IND_END:%.*]] = or disjoint i64 [[TMP3]], 3
1361 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
1362 ; CHECK:       vector.body:
1363 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
1364 ; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 3, i64 5, i64 7, i64 9>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
1365 ; CHECK-NEXT:    [[TMP4:%.*]] = shl i64 [[INDEX]], 1
1366 ; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = or disjoint i64 [[TMP4]], 3
1367 ; CHECK-NEXT:    [[TMP5:%.*]] = or disjoint i64 [[TMP4]], 5
1368 ; CHECK-NEXT:    [[TMP6:%.*]] = or disjoint i64 [[TMP4]], 7
1369 ; CHECK-NEXT:    [[TMP7:%.*]] = add <4 x i64> [[VEC_IND]], splat (i64 -1)
1370 ; CHECK-NEXT:    [[TMP8:%.*]] = add <4 x i64> [[VEC_IND]], splat (i64 -3)
1371 ; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[OFFSET_IDX]]
1372 ; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP5]]
1373 ; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP6]]
1374 ; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr i32, ptr [[A]], i64 [[TMP4]]
1375 ; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr i8, ptr [[TMP12]], i64 36
1376 ; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <4 x i64> [[TMP7]], i64 0
1377 ; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP14]]
1378 ; CHECK-NEXT:    [[TMP16:%.*]] = extractelement <4 x i64> [[TMP7]], i64 1
1379 ; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP16]]
1380 ; CHECK-NEXT:    [[TMP18:%.*]] = extractelement <4 x i64> [[TMP7]], i64 2
1381 ; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP18]]
1382 ; CHECK-NEXT:    [[TMP20:%.*]] = extractelement <4 x i64> [[TMP7]], i64 3
1383 ; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP20]]
1384 ; CHECK-NEXT:    [[TMP22:%.*]] = extractelement <4 x i64> [[TMP8]], i64 0
1385 ; CHECK-NEXT:    [[TMP23:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP22]]
1386 ; CHECK-NEXT:    [[TMP24:%.*]] = extractelement <4 x i64> [[TMP8]], i64 1
1387 ; CHECK-NEXT:    [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP24]]
1388 ; CHECK-NEXT:    [[TMP26:%.*]] = extractelement <4 x i64> [[TMP8]], i64 2
1389 ; CHECK-NEXT:    [[TMP27:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP26]]
1390 ; CHECK-NEXT:    [[TMP28:%.*]] = extractelement <4 x i64> [[TMP8]], i64 3
1391 ; CHECK-NEXT:    [[TMP29:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP28]]
1392 ; CHECK-NEXT:    store i32 [[X:%.*]], ptr [[TMP15]], align 4
1393 ; CHECK-NEXT:    store i32 [[X]], ptr [[TMP17]], align 4
1394 ; CHECK-NEXT:    store i32 [[X]], ptr [[TMP19]], align 4
1395 ; CHECK-NEXT:    store i32 [[X]], ptr [[TMP21]], align 4
1396 ; CHECK-NEXT:    store i32 [[Y:%.*]], ptr [[TMP23]], align 4
1397 ; CHECK-NEXT:    store i32 [[Y]], ptr [[TMP25]], align 4
1398 ; CHECK-NEXT:    store i32 [[Y]], ptr [[TMP27]], align 4
1399 ; CHECK-NEXT:    store i32 [[Y]], ptr [[TMP29]], align 4
1400 ; CHECK-NEXT:    store i32 [[Z:%.*]], ptr [[TMP9]], align 4
1401 ; CHECK-NEXT:    store i32 [[Z]], ptr [[TMP10]], align 4
1402 ; CHECK-NEXT:    store i32 [[Z]], ptr [[TMP11]], align 4
1403 ; CHECK-NEXT:    store i32 [[Z]], ptr [[TMP13]], align 4
1404 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
1405 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 8)
1406 ; CHECK-NEXT:    [[TMP30:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
1407 ; CHECK-NEXT:    br i1 [[TMP30]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP34:![0-9]+]]
1408 ; CHECK:       middle.block:
1409 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
1410 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
1411 ; CHECK:       scalar.ph:
1412 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 3, [[ENTRY:%.*]] ]
1413 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
1414 ; CHECK:       for.body:
1415 ; CHECK-NEXT:    [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
1416 ; CHECK-NEXT:    [[A_I:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[I]]
1417 ; CHECK-NEXT:    [[TMP31:%.*]] = getelementptr i32, ptr [[A]], i64 [[I]]
1418 ; CHECK-NEXT:    [[A_I_MINUS_1:%.*]] = getelementptr i8, ptr [[TMP31]], i64 -4
1419 ; CHECK-NEXT:    [[TMP32:%.*]] = getelementptr i32, ptr [[A]], i64 [[I]]
1420 ; CHECK-NEXT:    [[A_I_MINUS_3:%.*]] = getelementptr i8, ptr [[TMP32]], i64 -12
1421 ; CHECK-NEXT:    store i32 [[X]], ptr [[A_I_MINUS_1]], align 4
1422 ; CHECK-NEXT:    store i32 [[Y]], ptr [[A_I_MINUS_3]], align 4
1423 ; CHECK-NEXT:    store i32 [[Z]], ptr [[A_I]], align 4
1424 ; CHECK-NEXT:    [[I_NEXT]] = add nuw nsw i64 [[I]], 2
1425 ; CHECK-NEXT:    [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]]
1426 ; CHECK-NEXT:    br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP35:![0-9]+]]
1427 ; CHECK:       for.end:
1428 ; CHECK-NEXT:    ret void
1429 ;
1430 entry:
1431   br label %for.body
1432
1433 for.body:
1434   %i = phi i64 [ %i.next, %for.body ], [ 3, %entry ] ; i: starts at 3, stepped by 2
1435   %i_minus_1 = sub i64 %i, 1
1436   %i_minus_3 = sub i64 %i_minus_1, 2 ; (i - 1) - 2, i.e. i - 3
1437   %a_i = getelementptr inbounds i32, ptr %a, i64 %i
1438   %a_i_minus_1 = getelementptr inbounds i32, ptr %a, i64 %i_minus_1
1439   %a_i_minus_3 = getelementptr inbounds i32, ptr %a, i64 %i_minus_3
1440   store i32 %x, ptr %a_i_minus_1, align 4 ; a[i - 1] = x
1441   store i32 %y, ptr %a_i_minus_3, align 4 ; a[i - 3] = y, the slot written as a[i - 1] one iteration earlier
1442   store i32 %z, ptr %a_i, align 4 ; a[i] = z
1443   %i.next = add nuw nsw i64 %i, 2
1444   %cond = icmp slt i64 %i.next, %n
1445   br i1 %cond, label %for.body, label %for.end ; keep looping while i + 2 < n (signed)
1446
1447 for.end:
1448   ret void
1449 }
1450
1451 ; PR34743: Ensure that a cast which needs to sink after a load that belongs to
1452 ; an interleaved group, indeed gets sunk.
1453
1454 ; void PR34743(short *a, int *b, int n) {
1455 ;   for (int i = 0, iv = 0; iv < n; i++, iv += 2) {
1456 ;     b[i] = a[iv] * a[iv+1] * a[iv+2];
1457 ;   }
1458 ; }
1459
1460
1461 define void @PR34743(ptr %a, ptr %b, i64 %n) {
1462 ; CHECK-LABEL: @PR34743(
1463 ; CHECK-NEXT:  entry:
1464 ; CHECK-NEXT:    [[DOTPRE:%.*]] = load i16, ptr [[A:%.*]], align 2
1465 ; CHECK-NEXT:    [[TMP0:%.*]] = lshr i64 [[N:%.*]], 1
1466 ; CHECK-NEXT:    [[TMP1:%.*]] = add nuw i64 [[TMP0]], 1
1467 ; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 6
1468 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
1469 ; CHECK:       vector.memcheck:
1470 ; CHECK-NEXT:    [[TMP2:%.*]] = shl i64 [[N]], 1
1471 ; CHECK-NEXT:    [[TMP3:%.*]] = and i64 [[TMP2]], -4
1472 ; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[B:%.*]], i64 [[TMP3]]
1473 ; CHECK-NEXT:    [[SCEVGEP:%.*]] = getelementptr i8, ptr [[TMP4]], i64 4
1474 ; CHECK-NEXT:    [[SCEVGEP1:%.*]] = getelementptr nuw i8, ptr [[A]], i64 2
1475 ; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP3]]
1476 ; CHECK-NEXT:    [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[TMP5]], i64 6
1477 ; CHECK-NEXT:    [[BOUND0:%.*]] = icmp ult ptr [[B]], [[SCEVGEP2]]
1478 ; CHECK-NEXT:    [[BOUND1:%.*]] = icmp ult ptr [[SCEVGEP1]], [[SCEVGEP]]
1479 ; CHECK-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
1480 ; CHECK-NEXT:    br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
1481 ; CHECK:       vector.ph:
1482 ; CHECK-NEXT:    [[N_VEC:%.*]] = and i64 [[TMP1]], -4
1483 ; CHECK-NEXT:    [[IND_END:%.*]] = shl i64 [[N_VEC]], 1
1484 ; CHECK-NEXT:    [[VECTOR_RECUR_INIT:%.*]] = insertelement <4 x i16> poison, i16 [[DOTPRE]], i64 3
1485 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
1486 ; CHECK:       vector.body:
1487 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
1488 ; CHECK-NEXT:    [[VECTOR_RECUR:%.*]] = phi <4 x i16> [ [[VECTOR_RECUR_INIT]], [[VECTOR_PH]] ], [ [[STRIDED_VEC4:%.*]], [[VECTOR_BODY]] ]
1489 ; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = shl i64 [[INDEX]], 1
1490 ; CHECK-NEXT:    [[TMP6:%.*]] = or disjoint i64 [[OFFSET_IDX]], 1
1491 ; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i16, ptr [[A]], i64 [[TMP6]]
1492 ; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <8 x i16>, ptr [[TMP7]], align 4
1493 ; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <8 x i16> [[WIDE_VEC]], <8 x i16> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
1494 ; CHECK-NEXT:    [[STRIDED_VEC4]] = shufflevector <8 x i16> [[WIDE_VEC]], <8 x i16> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
1495 ; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <4 x i16> [[VECTOR_RECUR]], <4 x i16> [[STRIDED_VEC4]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
1496 ; CHECK-NEXT:    [[TMP9:%.*]] = sext <4 x i16> [[TMP8]] to <4 x i32>
1497 ; CHECK-NEXT:    [[TMP10:%.*]] = sext <4 x i16> [[STRIDED_VEC]] to <4 x i32>
1498 ; CHECK-NEXT:    [[TMP11:%.*]] = sext <4 x i16> [[STRIDED_VEC4]] to <4 x i32>
1499 ; CHECK-NEXT:    [[TMP12:%.*]] = mul nsw <4 x i32> [[TMP9]], [[TMP10]]
1500 ; CHECK-NEXT:    [[TMP13:%.*]] = mul nsw <4 x i32> [[TMP12]], [[TMP11]]
1501 ; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDEX]]
1502 ; CHECK-NEXT:    store <4 x i32> [[TMP13]], ptr [[TMP14]], align 4, !alias.scope [[META36:![0-9]+]], !noalias [[META39:![0-9]+]]
1503 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
1504 ; CHECK-NEXT:    [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
1505 ; CHECK-NEXT:    br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP41:![0-9]+]]
1506 ; CHECK:       middle.block:
1507 ; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <8 x i16> [[WIDE_VEC]], i64 7
1508 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP1]], [[N_VEC]]
1509 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[END:%.*]], label [[SCALAR_PH]]
1510 ; CHECK:       scalar.ph:
1511 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ]
1512 ; CHECK-NEXT:    [[BC_RESUME_VAL3:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ], [ 0, [[VECTOR_MEMCHECK]] ]
1513 ; CHECK-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ [[DOTPRE]], [[ENTRY]] ], [ [[DOTPRE]], [[VECTOR_MEMCHECK]] ]
1514 ; CHECK-NEXT:    br label [[LOOP:%.*]]
1515 ; CHECK:       loop:
1516 ; CHECK-NEXT:    [[TMP16:%.*]] = phi i16 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[LOAD2:%.*]], [[LOOP]] ]
1517 ; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV2:%.*]], [[LOOP]] ]
1518 ; CHECK-NEXT:    [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL3]], [[SCALAR_PH]] ], [ [[I1:%.*]], [[LOOP]] ]
1519 ; CHECK-NEXT:    [[CONV:%.*]] = sext i16 [[TMP16]] to i32
1520 ; CHECK-NEXT:    [[I1]] = add nuw nsw i64 [[I]], 1
1521 ; CHECK-NEXT:    [[IV1:%.*]] = or disjoint i64 [[IV]], 1
1522 ; CHECK-NEXT:    [[IV2]] = add nuw nsw i64 [[IV]], 2
1523 ; CHECK-NEXT:    [[GEP1:%.*]] = getelementptr inbounds i16, ptr [[A]], i64 [[IV1]]
1524 ; CHECK-NEXT:    [[LOAD1:%.*]] = load i16, ptr [[GEP1]], align 4
1525 ; CHECK-NEXT:    [[CONV1:%.*]] = sext i16 [[LOAD1]] to i32
1526 ; CHECK-NEXT:    [[GEP2:%.*]] = getelementptr inbounds i16, ptr [[A]], i64 [[IV2]]
1527 ; CHECK-NEXT:    [[LOAD2]] = load i16, ptr [[GEP2]], align 4
1528 ; CHECK-NEXT:    [[CONV2:%.*]] = sext i16 [[LOAD2]] to i32
1529 ; CHECK-NEXT:    [[MUL01:%.*]] = mul nsw i32 [[CONV]], [[CONV1]]
1530 ; CHECK-NEXT:    [[MUL012:%.*]] = mul nsw i32 [[MUL01]], [[CONV2]]
1531 ; CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[I]]
1532 ; CHECK-NEXT:    store i32 [[MUL012]], ptr [[ARRAYIDX5]], align 4
1533 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV]], [[N]]
1534 ; CHECK-NEXT:    br i1 [[EXITCOND]], label [[END]], label [[LOOP]], !llvm.loop [[LOOP42:![0-9]+]]
1535 ; CHECK:       end:
1536 ; CHECK-NEXT:    ret void
1537 ;
1538 entry:
1539   %.pre = load i16, ptr %a ; a[0], loaded once before the loop
1540   br label %loop
1541
1542 loop:
1543   %0 = phi i16 [ %.pre, %entry ], [ %load2, %loop ] ; %.pre on entry, then the previous iteration's %load2
1544   %iv = phi i64 [ 0, %entry ], [ %iv2, %loop ] ; iv: index into a, stepped by 2
1545   %i = phi i64 [ 0, %entry ], [ %i1, %loop ] ; i: index into b, stepped by 1
1546   %conv = sext i16 %0 to i32 ; cast of the carried value; it is defined before the loads below
1547   %i1 = add nuw nsw i64 %i, 1
1548   %iv1 = add nuw nsw i64 %iv, 1
1549   %iv2 = add nuw nsw i64 %iv, 2
1550   %gep1 = getelementptr inbounds i16, ptr %a, i64 %iv1
1551   %load1 = load i16, ptr %gep1, align 4 ; a[iv + 1]
1552   %conv1 = sext i16 %load1 to i32
1553   %gep2 = getelementptr inbounds i16, ptr %a, i64 %iv2
1554   %load2 = load i16, ptr %gep2, align 4 ; a[iv + 2], fed back through %0 on the next iteration
1555   %conv2 = sext i16 %load2 to i32
1556   %mul01 = mul nsw i32 %conv, %conv1
1557   %mul012 = mul nsw i32 %mul01, %conv2
1558   %arrayidx5 = getelementptr inbounds i32, ptr %b, i64 %i
1559   store i32 %mul012, ptr %arrayidx5 ; b[i] = product of the three sign-extended values
1560   %exitcond = icmp eq i64 %iv, %n ; compares the pre-increment %iv against %n
1561   br i1 %exitcond, label %end, label %loop
1562
1563 end:
1564   ret void
1565 }
1566
1567 attributes #0 = { "unsafe-fp-math"="true" } ; attribute group #0: sets the string attribute "unsafe-fp-math" to "true"