llvm/test/Transforms/LoopVectorize/interleaved-accesses.ll

   1 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
   2 ; RUN: opt -S -loop-vectorize -instcombine -force-vector-width=4 -force-vector-interleave=1 -enable-interleaved-mem-accesses=true -runtime-memory-check-threshold=24 < %s | FileCheck %s
   3
   4 target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"  ; little-endian, ELF mangling; i64 is 64-bit aligned, i128 is 128-bit aligned; native integer widths 32 and 64; 128-bit stack alignment
   5
   6 ; Check vectorization on an interleaved load group of factor 2 and an interleaved
   7 ; store group of factor 2.
   8
   9 ; int AB[1024];
  10 ; int CD[1024];
  11 ;  void test_array_load2_store2(int C, int D) {
  12 ;   for (int i = 0; i < 1024; i+=2) {
  13 ;     int A = AB[i];
  14 ;     int B = AB[i+1];
  15 ;     CD[i] = A + C;
  16 ;     CD[i+1] = B * D;
  17 ;   }
  18 ; }
  19
  20
  21 @AB = common global [1024 x i32] zeroinitializer, align 4  ; source array, read by @test_array_load2_store2
  22 @CD = common global [1024 x i32] zeroinitializer, align 4  ; destination array, written by @test_array_load2_store2
  23
  24 define void @test_array_load2_store2(i32 %C, i32 %D) {  ; for i = 0, 2, ..., 1022: CD[i] = AB[i] + C; CD[i+1] = AB[i+1] * D
  25 ; CHECK-LABEL: @test_array_load2_store2(
  26 ; CHECK-NEXT:  entry:
  27 ; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
  28 ; CHECK:       vector.ph:
  29 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[C:%.*]], i32 0
  30 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
  31 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <4 x i32> poison, i32 [[D:%.*]], i32 0
  32 ; CHECK-NEXT:    [[BROADCAST_SPLAT3:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT2]], <4 x i32> poison, <4 x i32> zeroinitializer
  33 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
  34 ; CHECK:       vector.body:
  35 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
  36 ; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = shl i64 [[INDEX]], 1
  37 ; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* @AB, i64 0, i64 [[OFFSET_IDX]]
  38 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <8 x i32>*
  39 ; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <8 x i32>, <8 x i32>* [[TMP1]], align 4
  40 ; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  41 ; CHECK-NEXT:    [[STRIDED_VEC1:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  42 ; CHECK-NEXT:    [[TMP2:%.*]] = or i64 [[OFFSET_IDX]], 1
  43 ; CHECK-NEXT:    [[TMP3:%.*]] = add nsw <4 x i32> [[STRIDED_VEC]], [[BROADCAST_SPLAT]]
  44 ; CHECK-NEXT:    [[TMP4:%.*]] = mul nsw <4 x i32> [[STRIDED_VEC1]], [[BROADCAST_SPLAT3]]
  45 ; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* @CD, i64 0, i64 [[TMP2]]
  46 ; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP5]], i64 -1
  47 ; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <8 x i32>*
  48 ; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
  49 ; CHECK-NEXT:    store <8 x i32> [[INTERLEAVED_VEC]], <8 x i32>* [[TMP7]], align 4
  50 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
  51 ; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 512
  52 ; CHECK-NEXT:    br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
  53 ; CHECK:       middle.block:
  54 ; CHECK-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
  55 ; CHECK:       scalar.ph:
  56 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
  57 ; CHECK:       for.body:
  58 ; CHECK-NEXT:    br i1 undef, label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP2:![0-9]+]]
  59 ; CHECK:       for.end:
  60 ; CHECK-NEXT:    ret void
  61 ;
  62 entry:
  63   br label %for.body
  64
  65 for.body:                                         ; preds = %for.body, %entry
  66   %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]  ; i: starts at 0, advances by 2
  67   %arrayidx0 = getelementptr inbounds [1024 x i32], [1024 x i32]* @AB, i64 0, i64 %indvars.iv  ; &AB[i]
  68   %tmp = load i32, i32* %arrayidx0, align 4  ; A = AB[i]
  69   %tmp1 = or i64 %indvars.iv, 1  ; i + 1: i is always even here (starts at 0, step 2), so OR-ing in bit 0 adds 1
  70   %arrayidx1 = getelementptr inbounds [1024 x i32], [1024 x i32]* @AB, i64 0, i64 %tmp1  ; &AB[i+1]
  71   %tmp2 = load i32, i32* %arrayidx1, align 4  ; B = AB[i+1]
  72   %add = add nsw i32 %tmp, %C  ; A + C
  73   %mul = mul nsw i32 %tmp2, %D  ; B * D
  74   %arrayidx2 = getelementptr inbounds [1024 x i32], [1024 x i32]* @CD, i64 0, i64 %indvars.iv  ; &CD[i]
  75   store i32 %add, i32* %arrayidx2, align 4  ; CD[i] = A + C
  76   %arrayidx3 = getelementptr inbounds [1024 x i32], [1024 x i32]* @CD, i64 0, i64 %tmp1  ; &CD[i+1]
  77   store i32 %mul, i32* %arrayidx3, align 4  ; CD[i+1] = B * D
  78   %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2  ; i += 2
  79   %cmp = icmp slt i64 %indvars.iv.next, 1024  ; keep looping while the incremented i is below 1024
  80   br i1 %cmp, label %for.body, label %for.end
  81
  82 for.end:                                          ; preds = %for.body
  83   ret void
  84 }
  85
  86 ; int A[3072];
  87 ; struct ST3 S[1024];
  88 ; void test_struct_array_load3_store3() {
  89 ;   int *ptr = A;
  90 ;   for (int i = 0; i < 1024; i++) {
  91 ;     int X1 = *ptr++;
  92 ;     int X2 = *ptr++;
  93 ;     int X3 = *ptr++;
  94 ;     S[i].x = X1 + 1;
  95 ;     S[i].y = X2 + 2;
  96 ;     S[i].z = X3 + 3;
  97 ;   }
  98 ; }
  99
 100
 101 %struct.ST3 = type { i32, i32, i32 }  ; fields x, y, z (GEP field indices 0, 1, 2)
 102 @A = common global [3072 x i32] zeroinitializer, align 4  ; source array: 3 x 1024 ints, three consecutive ints read per iteration
 103 @S = common global [1024 x %struct.ST3] zeroinitializer, align 4  ; destination array: one struct written per iteration
 104
 105 define void @test_struct_array_load3_store3() {  ; 1024 iterations: read three consecutive ints X1, X2, X3 from @A; write S[i] = { X1 + 1, X2 + 2, X3 + 3 }
 106 ; CHECK-LABEL: @test_struct_array_load3_store3(
 107 ; CHECK-NEXT:  entry:
 108 ; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 109 ; CHECK:       vector.ph:
 110 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 111 ; CHECK:       vector.body:
 112 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 113 ; CHECK-NEXT:    [[TMP0:%.*]] = mul i64 [[INDEX]], 3
 114 ; CHECK-NEXT:    [[NEXT_GEP:%.*]] = getelementptr [3072 x i32], [3072 x i32]* @A, i64 0, i64 [[TMP0]]
 115 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[NEXT_GEP]] to <12 x i32>*
 116 ; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <12 x i32>, <12 x i32>* [[TMP1]], align 4
 117 ; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <12 x i32> [[WIDE_VEC]], <12 x i32> poison, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
 118 ; CHECK-NEXT:    [[STRIDED_VEC2:%.*]] = shufflevector <12 x i32> [[WIDE_VEC]], <12 x i32> poison, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
 119 ; CHECK-NEXT:    [[STRIDED_VEC3:%.*]] = shufflevector <12 x i32> [[WIDE_VEC]], <12 x i32> poison, <4 x i32> <i32 2, i32 5, i32 8, i32 11>
 120 ; CHECK-NEXT:    [[TMP2:%.*]] = add nsw <4 x i32> [[STRIDED_VEC]], <i32 1, i32 1, i32 1, i32 1>
 121 ; CHECK-NEXT:    [[TMP3:%.*]] = add nsw <4 x i32> [[STRIDED_VEC2]], <i32 2, i32 2, i32 2, i32 2>
 122 ; CHECK-NEXT:    [[TMP4:%.*]] = add nsw <4 x i32> [[STRIDED_VEC3]], <i32 3, i32 3, i32 3, i32 3>
 123 ; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [1024 x %struct.ST3], [1024 x %struct.ST3]* @S, i64 0, i64 [[INDEX]], i32 2
 124 ; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP5]], i64 -2
 125 ; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <12 x i32>*
 126 ; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 127 ; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
 128 ; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i32> [[TMP8]], <8 x i32> [[TMP9]], <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
 129 ; CHECK-NEXT:    store <12 x i32> [[INTERLEAVED_VEC]], <12 x i32>* [[TMP7]], align 4
 130 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 131 ; CHECK-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
 132 ; CHECK-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 133 ; CHECK:       middle.block:
 134 ; CHECK-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
 135 ; CHECK:       scalar.ph:
 136 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 137 ; CHECK:       for.body:
 138 ; CHECK-NEXT:    br i1 undef, label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
 139 ; CHECK:       for.end:
 140 ; CHECK-NEXT:    ret void
 141 ;
 142 entry:
 143   br label %for.body
 144
 145 for.body:                                         ; preds = %for.body, %entry
 146   %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]  ; i: starts at 0, advances by 1
 147   %ptr.016 = phi i32* [ getelementptr inbounds ([3072 x i32], [3072 x i32]* @A, i64 0, i64 0), %entry ], [ %incdec.ptr2, %for.body ]  ; ptr: starts at &A[0], advances by 3 ints per iteration
 148   %incdec.ptr = getelementptr inbounds i32, i32* %ptr.016, i64 1  ; ptr + 1
 149   %tmp = load i32, i32* %ptr.016, align 4  ; X1 = ptr[0]
 150   %incdec.ptr1 = getelementptr inbounds i32, i32* %ptr.016, i64 2  ; ptr + 2
 151   %tmp1 = load i32, i32* %incdec.ptr, align 4  ; X2 = ptr[1]
 152   %incdec.ptr2 = getelementptr inbounds i32, i32* %ptr.016, i64 3  ; ptr + 3, fed back as the next iteration's ptr
 153   %tmp2 = load i32, i32* %incdec.ptr1, align 4  ; X3 = ptr[2]
 154   %add = add nsw i32 %tmp, 1  ; X1 + 1
 155   %x = getelementptr inbounds [1024 x %struct.ST3], [1024 x %struct.ST3]* @S, i64 0, i64 %indvars.iv, i32 0  ; &S[i].x
 156   store i32 %add, i32* %x, align 4  ; S[i].x = X1 + 1
 157   %add3 = add nsw i32 %tmp1, 2  ; X2 + 2
 158   %y = getelementptr inbounds [1024 x %struct.ST3], [1024 x %struct.ST3]* @S, i64 0, i64 %indvars.iv, i32 1  ; &S[i].y
 159   store i32 %add3, i32* %y, align 4  ; S[i].y = X2 + 2
 160   %add6 = add nsw i32 %tmp2, 3  ; X3 + 3
 161   %z = getelementptr inbounds [1024 x %struct.ST3], [1024 x %struct.ST3]* @S, i64 0, i64 %indvars.iv, i32 2  ; &S[i].z
 162   store i32 %add6, i32* %z, align 4  ; S[i].z = X3 + 3
 163   %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1  ; i += 1
 164   %exitcond = icmp eq i64 %indvars.iv.next, 1024  ; exit once the incremented i reaches 1024
 165   br i1 %exitcond, label %for.end, label %for.body
 166
 167 for.end:                                          ; preds = %for.body
 168   ret void
 169 }
 170
 171 ; Check vectorization on an interleaved load group of factor 4.
 172
 173 ; struct ST4{
 174 ;   int x;
 175 ;   int y;
 176 ;   int z;
 177 ;   int w;
 178 ; };
 179 ; int test_struct_load4(struct ST4 *S) {
 180 ;   int r = 0;
 181 ;   for (int i = 0; i < 1024; i++) {
 182 ;      r += S[i].x;
 183 ;      r -= S[i].y;
 184 ;      r += S[i].z;
 185 ;      r -= S[i].w;
 186 ;   }
 187 ;   return r;
 188 ; }
 189
 190 %struct.ST4 = type { i32, i32, i32, i32 }  ; fields x, y, z, w (GEP field indices 0, 1, 2, 3)
 191
 192 define i32 @test_struct_load4(%struct.ST4* nocapture readonly %S) {  ; returns r after 1024 iterations of r = r + S[i].x - S[i].y + S[i].z - S[i].w, starting from r = 0
 193 ;
 194 ; CHECK-LABEL: @test_struct_load4(
 195 ; CHECK-NEXT:  entry:
 196 ; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 197 ; CHECK:       vector.ph:
 198 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 199 ; CHECK:       vector.body:
 200 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 201 ; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
 202 ; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT_ST4:%.*]], %struct.ST4* [[S:%.*]], i64 [[INDEX]], i32 0
 203 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <16 x i32>*
 204 ; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <16 x i32>, <16 x i32>* [[TMP1]], align 4
 205 ; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> poison, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
 206 ; CHECK-NEXT:    [[STRIDED_VEC1:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> poison, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
 207 ; CHECK-NEXT:    [[STRIDED_VEC2:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> poison, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
 208 ; CHECK-NEXT:    [[STRIDED_VEC3:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> poison, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
 209 ; CHECK-NEXT:    [[TMP2:%.*]] = add <4 x i32> [[STRIDED_VEC]], [[VEC_PHI]]
 210 ; CHECK-NEXT:    [[TMP3:%.*]] = add <4 x i32> [[TMP2]], [[STRIDED_VEC2]]
 211 ; CHECK-NEXT:    [[TMP4:%.*]] = add <4 x i32> [[STRIDED_VEC1]], [[STRIDED_VEC3]]
 212 ; CHECK-NEXT:    [[TMP5]] = sub <4 x i32> [[TMP3]], [[TMP4]]
 213 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 214 ; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
 215 ; CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
 216 ; CHECK:       middle.block:
 217 ; CHECK-NEXT:    [[TMP7:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP5]])
 218 ; CHECK-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
 219 ; CHECK:       scalar.ph:
 220 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 221 ; CHECK:       for.body:
 222 ; CHECK-NEXT:    br i1 undef, label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
 223 ; CHECK:       for.end:
 224 ; CHECK-NEXT:    [[SUB8_LCSSA:%.*]] = phi i32 [ undef, [[FOR_BODY]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ]
 225 ; CHECK-NEXT:    ret i32 [[SUB8_LCSSA]]
 226 ;
 227 entry:
 228   br label %for.body
 229
 230 for.body:                                         ; preds = %for.body, %entry
 231   %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]  ; i: starts at 0, advances by 1
 232   %r.022 = phi i32 [ 0, %entry ], [ %sub8, %for.body ]  ; r: running result, starts at 0
 233   %x = getelementptr inbounds %struct.ST4, %struct.ST4* %S, i64 %indvars.iv, i32 0  ; &S[i].x
 234   %tmp = load i32, i32* %x, align 4  ; S[i].x
 235   %add = add nsw i32 %tmp, %r.022  ; r + S[i].x
 236   %y = getelementptr inbounds %struct.ST4, %struct.ST4* %S, i64 %indvars.iv, i32 1  ; &S[i].y
 237   %tmp1 = load i32, i32* %y, align 4  ; S[i].y
 238   %sub = sub i32 %add, %tmp1  ; ... - S[i].y
 239   %z = getelementptr inbounds %struct.ST4, %struct.ST4* %S, i64 %indvars.iv, i32 2  ; &S[i].z
 240   %tmp2 = load i32, i32* %z, align 4  ; S[i].z
 241   %add5 = add nsw i32 %sub, %tmp2  ; ... + S[i].z
 242   %w = getelementptr inbounds %struct.ST4, %struct.ST4* %S, i64 %indvars.iv, i32 3  ; &S[i].w
 243   %tmp3 = load i32, i32* %w, align 4  ; S[i].w
 244   %sub8 = sub i32 %add5, %tmp3  ; ... - S[i].w; becomes r for the next iteration and the return value
 245   %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1  ; i += 1
 246   %exitcond = icmp eq i64 %indvars.iv.next, 1024  ; exit once the incremented i reaches 1024
 247   br i1 %exitcond, label %for.end, label %for.body
 248
 249 for.end:                                          ; preds = %for.body
 250   ret i32 %sub8
 251 }
 252
 253 ; Check vectorization on an interleaved store group of factor 4.
 254
 255 ; void test_struct_store4(int *A, struct ST4 *B) {
 256 ;   int *ptr = A;
 257 ;   for (int i = 0; i < 1024; i++) {
 258 ;     int X = *ptr++;
 259 ;     B[i].x = X + 1;
 260 ;     B[i].y = X * 2;
 261 ;     B[i].z = X + 3;
 262 ;     B[i].w = X + 4;
 263 ;   }
 264 ; }
 265
 266
 267 define void @test_struct_store4(i32* noalias nocapture readonly %A, %struct.ST4* noalias nocapture %B) {  ; 1024 iterations: X = A[i]; B[i] = { X + 1, X * 2, X + 3, X + 4 }
 268 ; CHECK-LABEL: @test_struct_store4(
 269 ; CHECK-NEXT:  entry:
 270 ; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 271 ; CHECK:       vector.ph:
 272 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 273 ; CHECK:       vector.body:
 274 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 275 ; CHECK-NEXT:    [[NEXT_GEP:%.*]] = getelementptr i32, i32* [[A:%.*]], i64 [[INDEX]]
 276 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* [[NEXT_GEP]] to <4 x i32>*
 277 ; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
 278 ; CHECK-NEXT:    [[TMP1:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], <i32 1, i32 1, i32 1, i32 1>
 279 ; CHECK-NEXT:    [[TMP2:%.*]] = shl nsw <4 x i32> [[WIDE_LOAD]], <i32 1, i32 1, i32 1, i32 1>
 280 ; CHECK-NEXT:    [[TMP3:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], <i32 3, i32 3, i32 3, i32 3>
 281 ; CHECK-NEXT:    [[TMP4:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], <i32 4, i32 4, i32 4, i32 4>
 282 ; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT_ST4:%.*]], %struct.ST4* [[B:%.*]], i64 [[INDEX]], i32 3
 283 ; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP5]], i64 -3
 284 ; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <16 x i32>*
 285 ; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 286 ; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 287 ; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i32> [[TMP8]], <8 x i32> [[TMP9]], <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
 288 ; CHECK-NEXT:    store <16 x i32> [[INTERLEAVED_VEC]], <16 x i32>* [[TMP7]], align 4
 289 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 290 ; CHECK-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
 291 ; CHECK-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
 292 ; CHECK:       middle.block:
 293 ; CHECK-NEXT:    br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
 294 ; CHECK:       scalar.ph:
 295 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 296 ; CHECK:       for.cond.cleanup:
 297 ; CHECK-NEXT:    ret void
 298 ; CHECK:       for.body:
 299 ; CHECK-NEXT:    br i1 undef, label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
 300 ;
 301 entry:
 302   br label %for.body
 303
 304 for.cond.cleanup:                                 ; preds = %for.body
 305   ret void
 306
 307 for.body:                                         ; preds = %for.body, %entry
 308   %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]  ; i: starts at 0, advances by 1
 309   %ptr.024 = phi i32* [ %A, %entry ], [ %incdec.ptr, %for.body ]  ; ptr: starts at A, advances by 1 int per iteration
 310   %incdec.ptr = getelementptr inbounds i32, i32* %ptr.024, i64 1  ; ptr + 1, fed back as the next iteration's ptr
 311   %tmp = load i32, i32* %ptr.024, align 4  ; X = *ptr
 312   %add = add nsw i32 %tmp, 1  ; X + 1
 313   %x = getelementptr inbounds %struct.ST4, %struct.ST4* %B, i64 %indvars.iv, i32 0  ; &B[i].x
 314   store i32 %add, i32* %x, align 4  ; B[i].x = X + 1
 315   %mul = shl nsw i32 %tmp, 1  ; X * 2, written as a left shift by 1
 316   %y = getelementptr inbounds %struct.ST4, %struct.ST4* %B, i64 %indvars.iv, i32 1  ; &B[i].y
 317   store i32 %mul, i32* %y, align 4  ; B[i].y = X * 2
 318   %add3 = add nsw i32 %tmp, 3  ; X + 3
 319   %z = getelementptr inbounds %struct.ST4, %struct.ST4* %B, i64 %indvars.iv, i32 2  ; &B[i].z
 320   store i32 %add3, i32* %z, align 4  ; B[i].z = X + 3
 321   %add6 = add nsw i32 %tmp, 4  ; X + 4
 322   %w = getelementptr inbounds %struct.ST4, %struct.ST4* %B, i64 %indvars.iv, i32 3  ; &B[i].w
 323   store i32 %add6, i32* %w, align 4  ; B[i].w = X + 4
 324   %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1  ; i += 1
 325   %exitcond = icmp eq i64 %indvars.iv.next, 1024  ; exit once the incremented i reaches 1024
 326   br i1 %exitcond, label %for.cond.cleanup, label %for.body
 327 }
 328
 329 ; Check vectorization on a reverse interleaved load group of factor 2 and
 330 ; a reverse interleaved store group of factor 2.
 331
 332 ; struct ST2 {
 333 ;  int x;
 334 ;  int y;
 335 ; };
 336 ;
 337 ; void test_reversed_load2_store2(struct ST2 *A, struct ST2 *B) {
 338 ;   for (int i = 1023; i >= 0; i--) {
 339 ;     int a = A[i].x + i;  // interleaved load of index 0
 340 ;     int b = A[i].y - i;  // interleaved load of index 1
 341 ;     B[i].x = a;          // interleaved store of index 0
 342 ;     B[i].y = b;          // interleaved store of index 1
 343 ;   }
 344 ; }
 345
 346
 347 %struct.ST2 = type { i32, i32 }  ; fields x, y (GEP field indices 0, 1)
 348
 349 define void @test_reversed_load2_store2(%struct.ST2* noalias nocapture readonly %A, %struct.ST2* noalias nocapture %B) {  ; for i = 1023 down to 0: B[i].x = A[i].x + i; B[i].y = A[i].y - i
 350 ; CHECK-LABEL: @test_reversed_load2_store2(
 351 ; CHECK-NEXT:  entry:
 352 ; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 353 ; CHECK:       vector.ph:
 354 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 355 ; CHECK:       vector.body:
 356 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 357 ; CHECK-NEXT:    [[VEC_IND3:%.*]] = phi <4 x i32> [ <i32 1023, i32 1022, i32 1021, i32 1020>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT4:%.*]], [[VECTOR_BODY]] ]
 358 ; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = sub i64 1023, [[INDEX]]
 359 ; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT_ST2:%.*]], %struct.ST2* [[A:%.*]], i64 [[OFFSET_IDX]], i32 0
 360 ; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 -6
 361 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i32* [[TMP1]] to <8 x i32>*
 362 ; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <8 x i32>, <8 x i32>* [[TMP2]], align 4
 363 ; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
 364 ; CHECK-NEXT:    [[REVERSE:%.*]] = shufflevector <4 x i32> [[STRIDED_VEC]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
 365 ; CHECK-NEXT:    [[STRIDED_VEC1:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
 366 ; CHECK-NEXT:    [[REVERSE2:%.*]] = shufflevector <4 x i32> [[STRIDED_VEC1]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
 367 ; CHECK-NEXT:    [[TMP3:%.*]] = add nsw <4 x i32> [[REVERSE]], [[VEC_IND3]]
 368 ; CHECK-NEXT:    [[TMP4:%.*]] = sub nsw <4 x i32> [[REVERSE2]], [[VEC_IND3]]
 369 ; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT_ST2]], %struct.ST2* [[B:%.*]], i64 [[OFFSET_IDX]], i32 1
 370 ; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP5]], i64 -7
 371 ; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <8 x i32>*
 372 ; CHECK-NEXT:    [[REVERSE5:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
 373 ; CHECK-NEXT:    [[REVERSE6:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
 374 ; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i32> [[REVERSE5]], <4 x i32> [[REVERSE6]], <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
 375 ; CHECK-NEXT:    store <8 x i32> [[INTERLEAVED_VEC]], <8 x i32>* [[TMP7]], align 4
 376 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 377 ; CHECK-NEXT:    [[VEC_IND_NEXT4]] = add <4 x i32> [[VEC_IND3]], <i32 -4, i32 -4, i32 -4, i32 -4>
 378 ; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
 379 ; CHECK-NEXT:    br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
 380 ; CHECK:       middle.block:
 381 ; CHECK-NEXT:    br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
 382 ; CHECK:       scalar.ph:
 383 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 384 ; CHECK:       for.cond.cleanup:
 385 ; CHECK-NEXT:    ret void
 386 ; CHECK:       for.body:
 387 ; CHECK-NEXT:    br i1 undef, label [[FOR_BODY]], label [[FOR_COND_CLEANUP]], !llvm.loop [[LOOP11:![0-9]+]]
 388 ;
 389 entry:
 390   br label %for.body
 391
 392 for.cond.cleanup:                                 ; preds = %for.body
 393   ret void
 394
 395 for.body:                                         ; preds = %for.body, %entry
 396   %indvars.iv = phi i64 [ 1023, %entry ], [ %indvars.iv.next, %for.body ]  ; i: starts at 1023, counts down by 1
 397   %x = getelementptr inbounds %struct.ST2, %struct.ST2* %A, i64 %indvars.iv, i32 0  ; &A[i].x
 398   %tmp = load i32, i32* %x, align 4  ; A[i].x
 399   %tmp1 = trunc i64 %indvars.iv to i32  ; i narrowed to i32 for the arithmetic below
 400   %add = add nsw i32 %tmp, %tmp1  ; a = A[i].x + i
 401   %y = getelementptr inbounds %struct.ST2, %struct.ST2* %A, i64 %indvars.iv, i32 1  ; &A[i].y
 402   %tmp2 = load i32, i32* %y, align 4  ; A[i].y
 403   %sub = sub nsw i32 %tmp2, %tmp1  ; b = A[i].y - i
 404   %x5 = getelementptr inbounds %struct.ST2, %struct.ST2* %B, i64 %indvars.iv, i32 0  ; &B[i].x
 405   store i32 %add, i32* %x5, align 4  ; B[i].x = a
 406   %y8 = getelementptr inbounds %struct.ST2, %struct.ST2* %B, i64 %indvars.iv, i32 1  ; &B[i].y
 407   store i32 %sub, i32* %y8, align 4  ; B[i].y = b
 408   %indvars.iv.next = add nsw i64 %indvars.iv, -1  ; i - 1
 409   %cmp = icmp sgt i64 %indvars.iv, 0  ; compares the pre-decrement i, so the i == 0 iteration is the last one executed
 410   br i1 %cmp, label %for.body, label %for.cond.cleanup
 411 }
 412
 413 ; Check vectorization on an interleaved load group of factor 2 with 1 gap
 414 ; (missing the load of odd elements). Because the vectorized loop would
 415 ; speculatively access memory out-of-bounds, we must execute at least one
 416 ; iteration of the scalar loop.
 417
 418 ; void even_load_static_tc(int *A, int *B) {
 419 ;  for (unsigned i = 0; i < 1024; i+=2)
 420 ;     B[i/2] = A[i] * 2;
 421 ; }
 422
 423
 424 define void @even_load_static_tc(i32* noalias nocapture readonly %A, i32* noalias nocapture %B) {  ; for i = 0, 2, ..., 1022: B[i/2] = A[i] * 2; only even elements of A are read
 425 ; CHECK-LABEL: @even_load_static_tc(
 426 ; CHECK-NEXT:  entry:
 427 ; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 428 ; CHECK:       vector.ph:
 429 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 430 ; CHECK:       vector.body:
 431 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 432 ; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = shl i64 [[INDEX]], 1
 433 ; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[OFFSET_IDX]]
 434 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <8 x i32>*
 435 ; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <8 x i32>, <8 x i32>* [[TMP1]], align 4
 436 ; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
 437 ; CHECK-NEXT:    [[TMP2:%.*]] = shl nsw <4 x i32> [[STRIDED_VEC]], <i32 1, i32 1, i32 1, i32 1>
 438 ; CHECK-NEXT:    [[TMP3:%.*]] = and i64 [[INDEX]], 9223372036854775804
 439 ; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[TMP3]]
 440 ; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i32* [[TMP4]] to <4 x i32>*
 441 ; CHECK-NEXT:    store <4 x i32> [[TMP2]], <4 x i32>* [[TMP5]], align 4
 442 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 443 ; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 508
 444 ; CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
 445 ; CHECK:       middle.block:
 446 ; CHECK-NEXT:    br label [[SCALAR_PH]]
 447 ; CHECK:       scalar.ph:
 448 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1016, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
 449 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 450 ; CHECK:       for.cond.cleanup:
 451 ; CHECK-NEXT:    ret void
 452 ; CHECK:       for.body:
 453 ; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
 454 ; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV]]
 455 ; CHECK-NEXT:    [[TMP:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
 456 ; CHECK-NEXT:    [[MUL:%.*]] = shl nsw i32 [[TMP]], 1
 457 ; CHECK-NEXT:    [[TMP1:%.*]] = lshr exact i64 [[INDVARS_IV]], 1
 458 ; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[TMP1]]
 459 ; CHECK-NEXT:    store i32 [[MUL]], i32* [[ARRAYIDX2]], align 4
 460 ; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 2
 461 ; CHECK-NEXT:    [[CMP:%.*]] = icmp ult i64 [[INDVARS_IV]], 1022
 462 ; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP:%.*]], !llvm.loop [[LOOP13:![0-9]+]]
 463 ;
 464 entry:  ; the expected output above stops the vector loop at index 508 and resumes the scalar loop at i = 1016, leaving the last 4 iterations scalar
 465   br label %for.body
 466
 467 for.cond.cleanup:                                 ; preds = %for.body
 468   ret void
 469
 470 for.body:                                         ; preds = %for.body, %entry
 471   %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]  ; i: starts at 0, advances by 2
 472   %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv  ; &A[i]
 473   %tmp = load i32, i32* %arrayidx, align 4  ; A[i]; A[i+1] is never loaded (the gap in the load group)
 474   %mul = shl nsw i32 %tmp, 1  ; A[i] * 2, written as a left shift by 1
 475   %tmp1 = lshr exact i64 %indvars.iv, 1  ; i / 2
 476   %arrayidx2 = getelementptr inbounds i32, i32* %B, i64 %tmp1  ; &B[i/2]
 477   store i32 %mul, i32* %arrayidx2, align 4  ; B[i/2] = A[i] * 2
 478   %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2  ; i += 2
 479   %cmp = icmp ult i64 %indvars.iv.next, 1024  ; keep looping while the incremented i is below 1024
 480   br i1 %cmp, label %for.body, label %for.cond.cleanup
 481 }
 482
 483 ; Check vectorization on an interleaved load group of factor 2 with 1 gap
 484 ; (missing the load of odd elements). Because the vectorized loop would
 485 ; speculatively access memory out-of-bounds, we must execute at least one
 486 ; iteration of the scalar loop.
 487
 488 ; void even_load_dynamic_tc(int *A, int *B, unsigned N) {
 489 ;  for (unsigned i = 0; i < N; i+=2)
 490 ;     B[i/2] = A[i] * 2;
 491 ; }
 492
 493
 494 define void @even_load_dynamic_tc(i32* noalias nocapture readonly %A, i32* noalias nocapture %B, i64 %N) {  ; for i = 0, 2, ... while i < %N: B[i/2] = A[i] * 2; only even elements of A are read
 495 ; CHECK-LABEL: @even_load_dynamic_tc(
 496 ; CHECK-NEXT:  entry:
 497 ; CHECK-NEXT:    [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[N:%.*]], i64 2)
 498 ; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[UMAX]], -1
 499 ; CHECK-NEXT:    [[TMP1:%.*]] = lshr i64 [[TMP0]], 1
 500 ; CHECK-NEXT:    [[TMP2:%.*]] = add nuw i64 [[TMP1]], 1
 501 ; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 8
 502 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 503 ; CHECK:       vector.ph:
 504 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = and i64 [[TMP2]], 3
 505 ; CHECK-NEXT:    [[TMP3:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
 506 ; CHECK-NEXT:    [[TMP4:%.*]] = select i1 [[TMP3]], i64 4, i64 [[N_MOD_VF]]
 507 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP2]], [[TMP4]]
 508 ; CHECK-NEXT:    [[IND_END:%.*]] = shl i64 [[N_VEC]], 1
 509 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 510 ; CHECK:       vector.body:
 511 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 512 ; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = shl i64 [[INDEX]], 1
 513 ; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[OFFSET_IDX]]
 514 ; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i32* [[TMP5]] to <8 x i32>*
 515 ; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <8 x i32>, <8 x i32>* [[TMP6]], align 4
 516 ; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
 517 ; CHECK-NEXT:    [[TMP7:%.*]] = shl nsw <4 x i32> [[STRIDED_VEC]], <i32 1, i32 1, i32 1, i32 1>
 518 ; CHECK-NEXT:    [[TMP8:%.*]] = and i64 [[INDEX]], 9223372036854775804
 519 ; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[TMP8]]
 520 ; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i32* [[TMP9]] to <4 x i32>*
 521 ; CHECK-NEXT:    store <4 x i32> [[TMP7]], <4 x i32>* [[TMP10]], align 4
 522 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 523 ; CHECK-NEXT:    [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 524 ; CHECK-NEXT:    br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
 525 ; CHECK:       middle.block:
 526 ; CHECK-NEXT:    br label [[SCALAR_PH]]
 527 ; CHECK:       scalar.ph:
 528 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
 529 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 530 ; CHECK:       for.cond.cleanup:
 531 ; CHECK-NEXT:    ret void
 532 ; CHECK:       for.body:
 533 ; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
 534 ; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV]]
 535 ; CHECK-NEXT:    [[TMP:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
 536 ; CHECK-NEXT:    [[MUL:%.*]] = shl nsw i32 [[TMP]], 1
 537 ; CHECK-NEXT:    [[TMP1:%.*]] = lshr exact i64 [[INDVARS_IV]], 1
 538 ; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[TMP1]]
 539 ; CHECK-NEXT:    store i32 [[MUL]], i32* [[ARRAYIDX2]], align 4
 540 ; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 2
 541 ; CHECK-NEXT:    [[CMP:%.*]] = icmp ult i64 [[INDVARS_IV_NEXT]], [[N]]
 542 ; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP:%.*]], !llvm.loop [[LOOP15:![0-9]+]]
 543 ;
 544 entry:
 545   br label %for.body
 546
 547 for.cond.cleanup:                                 ; preds = %for.body
 548   ret void
 549
 550 for.body:                                         ; preds = %for.body, %entry
 551   %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]  ; i: starts at 0, advances by 2
 552   %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv  ; &A[i]
 553   %tmp = load i32, i32* %arrayidx, align 4  ; A[i]; A[i+1] is never loaded (the gap in the load group)
 554   %mul = shl nsw i32 %tmp, 1  ; A[i] * 2, written as a left shift by 1
 555   %tmp1 = lshr exact i64 %indvars.iv, 1  ; i / 2
 556   %arrayidx2 = getelementptr inbounds i32, i32* %B, i64 %tmp1  ; &B[i/2]
 557   store i32 %mul, i32* %arrayidx2, align 4  ; B[i/2] = A[i] * 2
 558   %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2  ; i += 2
 559   %cmp = icmp ult i64 %indvars.iv.next, %N  ; tested after the body, so the body runs at least once regardless of %N
 560   br i1 %cmp, label %for.body, label %for.cond.cleanup
 561 }
 562
 563 ; Check vectorization on a reverse interleaved load group of factor 2 with 1
 564 ; gap and a reverse interleaved store group of factor 2. The interleaved load
 565 ; group should be removed since it has a gap and is reverse.
 566
 567 ; struct pair {
 568 ;  int x;
 569 ;  int y;
 570 ; };
 571 ;
 572 ; void load_gap_reverse(struct pair *P1, struct pair *P2, int X) {
 573 ;   for (int i = 1023; i >= 0; i--) {
 574 ;     int a = X + i;
 575 ;     int b = P2[i].y - i;
 576 ;     P1[i].x = a;
 577 ;     P2[i].y = b;
 578 ;   }
 579 ; }
 580
 581
 582 %pair = type { i64, i64 }
 583 define void @load_gap_reverse(%pair* noalias nocapture readonly %P1, %pair* noalias nocapture readonly %P2, i64 %X) {
 584 ; CHECK-LABEL: @load_gap_reverse(
 585 ; CHECK-NEXT:  entry:
 586 ; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 587 ; CHECK:       vector.ph:
 588 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[X:%.*]], i32 0
 589 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
 590 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 591 ; CHECK:       vector.body:
 592 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 593 ; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 1023, i64 1022, i64 1021, i64 1020>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
 594 ; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = sub i64 1023, [[INDEX]]
 595 ; CHECK-NEXT:    [[TMP0:%.*]] = sub i64 1022, [[INDEX]]
 596 ; CHECK-NEXT:    [[TMP1:%.*]] = sub i64 1021, [[INDEX]]
 597 ; CHECK-NEXT:    [[TMP2:%.*]] = sub i64 1020, [[INDEX]]
 598 ; CHECK-NEXT:    [[TMP3:%.*]] = add nsw <4 x i64> [[BROADCAST_SPLAT]], [[VEC_IND]]
 599 ; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [[PAIR:%.*]], %pair* [[P1:%.*]], i64 [[OFFSET_IDX]], i32 0
 600 ; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[PAIR]], %pair* [[P1]], i64 [[TMP0]], i32 0
 601 ; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[PAIR]], %pair* [[P1]], i64 [[TMP1]], i32 0
 602 ; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[PAIR]], %pair* [[P1]], i64 [[TMP2]], i32 0
 603 ; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [[PAIR]], %pair* [[P2:%.*]], i64 [[OFFSET_IDX]], i32 1
 604 ; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [[PAIR]], %pair* [[P2]], i64 [[TMP0]], i32 1
 605 ; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [[PAIR]], %pair* [[P2]], i64 [[TMP1]], i32 1
 606 ; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [[PAIR]], %pair* [[P2]], i64 [[TMP2]], i32 1
 607 ; CHECK-NEXT:    [[TMP12:%.*]] = load i64, i64* [[TMP8]], align 8
 608 ; CHECK-NEXT:    [[TMP13:%.*]] = load i64, i64* [[TMP9]], align 8
 609 ; CHECK-NEXT:    [[TMP14:%.*]] = load i64, i64* [[TMP10]], align 8
 610 ; CHECK-NEXT:    [[TMP15:%.*]] = load i64, i64* [[TMP11]], align 8
 611 ; CHECK-NEXT:    [[TMP16:%.*]] = insertelement <4 x i64> poison, i64 [[TMP12]], i32 0
 612 ; CHECK-NEXT:    [[TMP17:%.*]] = insertelement <4 x i64> [[TMP16]], i64 [[TMP13]], i32 1
 613 ; CHECK-NEXT:    [[TMP18:%.*]] = insertelement <4 x i64> [[TMP17]], i64 [[TMP14]], i32 2
 614 ; CHECK-NEXT:    [[TMP19:%.*]] = insertelement <4 x i64> [[TMP18]], i64 [[TMP15]], i32 3
 615 ; CHECK-NEXT:    [[TMP20:%.*]] = sub nsw <4 x i64> [[TMP19]], [[VEC_IND]]
 616 ; CHECK-NEXT:    [[TMP21:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0
 617 ; CHECK-NEXT:    store i64 [[TMP21]], i64* [[TMP4]], align 8
 618 ; CHECK-NEXT:    [[TMP22:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1
 619 ; CHECK-NEXT:    store i64 [[TMP22]], i64* [[TMP5]], align 8
 620 ; CHECK-NEXT:    [[TMP23:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2
 621 ; CHECK-NEXT:    store i64 [[TMP23]], i64* [[TMP6]], align 8
 622 ; CHECK-NEXT:    [[TMP24:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3
 623 ; CHECK-NEXT:    store i64 [[TMP24]], i64* [[TMP7]], align 8
 624 ; CHECK-NEXT:    [[TMP25:%.*]] = extractelement <4 x i64> [[TMP20]], i32 0
 625 ; CHECK-NEXT:    store i64 [[TMP25]], i64* [[TMP8]], align 8
 626 ; CHECK-NEXT:    [[TMP26:%.*]] = extractelement <4 x i64> [[TMP20]], i32 1
 627 ; CHECK-NEXT:    store i64 [[TMP26]], i64* [[TMP9]], align 8
 628 ; CHECK-NEXT:    [[TMP27:%.*]] = extractelement <4 x i64> [[TMP20]], i32 2
 629 ; CHECK-NEXT:    store i64 [[TMP27]], i64* [[TMP10]], align 8
 630 ; CHECK-NEXT:    [[TMP28:%.*]] = extractelement <4 x i64> [[TMP20]], i32 3
 631 ; CHECK-NEXT:    store i64 [[TMP28]], i64* [[TMP11]], align 8
 632 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 633 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], <i64 -4, i64 -4, i64 -4, i64 -4>
 634 ; CHECK-NEXT:    [[TMP29:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
 635 ; CHECK-NEXT:    br i1 [[TMP29]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
 636 ; CHECK:       middle.block:
 637 ; CHECK-NEXT:    br i1 true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
 638 ; CHECK:       scalar.ph:
 639 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 640 ; CHECK:       for.body:
 641 ; CHECK-NEXT:    br i1 undef, label [[FOR_BODY]], label [[FOR_EXIT]], !llvm.loop [[LOOP17:![0-9]+]]
 642 ; CHECK:       for.exit:
 643 ; CHECK-NEXT:    ret void
 644 ;
     ; Input IR: a single-block loop in which %i counts down from 1023 to 0.
     ; Each iteration stores X + i to field 0 of P1[i] and overwrites field 1 of
     ; P2[i] with its previous value minus i. The assertions above expect every
     ; one of these accesses to stay scalar in vector.body (four i64 loads and
     ; eight i64 stores per vector iteration), with no wide load or store.
     ; NOTE(review): %P1 and %P2 are both marked readonly although the loop
     ; stores through them -- confirm this is intentional for the test.
 645 entry:
 646   br label %for.body
 647
 648 for.body:
 649   %i = phi i64 [ 1023, %entry ], [ %i.next, %for.body ]  ; i starts at 1023 and is decremented each iteration
 650   %0 = add nsw i64 %X, %i  ; a = X + i
 651   %1 = getelementptr inbounds %pair, %pair* %P1, i64 %i, i32 0  ; address of field 0 of P1[i]
 652   %2 = getelementptr inbounds %pair, %pair* %P2, i64 %i, i32 1  ; address of field 1 of P2[i]
 653   %3 = load i64, i64* %2, align 8  ; old value of field 1 of P2[i]
 654   %4 = sub nsw i64 %3, %i  ; b = old value - i
 655   store i64 %0, i64* %1, align 8  ; field 0 of P1[i] = a
 656   store i64 %4, i64* %2, align 8  ; field 1 of P2[i] = b (same address the load above read)
 657   %i.next = add nsw i64 %i, -1
 658   %cond = icmp sgt i64 %i, 0  ; tests the current i, so the final iteration runs with i == 0
 659   br i1 %cond, label %for.body, label %for.exit
 660
 661 for.exit:
 662   ret void
 663 }
 664
 665 ; Check vectorization on interleaved access groups identified from mixed
 666 ; loads/stores.
 667 ; void mixed_load2_store2(int *A, int *B) {
 668 ;   for (unsigned i = 0; i < 1024; i+=2)  {
 669 ;     B[i] = A[i] * A[i+1];
 670 ;     B[i+1] = A[i] + A[i+1];
 671 ;   }
 672 ; }
 673
 674
 675 define void @mixed_load2_store2(i32* noalias nocapture readonly %A, i32* noalias nocapture %B) {
 676 ; CHECK-LABEL: @mixed_load2_store2(
 677 ; CHECK-NEXT:  entry:
 678 ; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 679 ; CHECK:       vector.ph:
 680 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 681 ; CHECK:       vector.body:
 682 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 683 ; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = shl i64 [[INDEX]], 1
 684 ; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[OFFSET_IDX]]
 685 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <8 x i32>*
 686 ; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <8 x i32>, <8 x i32>* [[TMP1]], align 4
 687 ; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
 688 ; CHECK-NEXT:    [[STRIDED_VEC1:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
 689 ; CHECK-NEXT:    [[TMP2:%.*]] = mul nsw <4 x i32> [[STRIDED_VEC1]], [[STRIDED_VEC]]
 690 ; CHECK-NEXT:    [[STRIDED_VEC3:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
 691 ; CHECK-NEXT:    [[STRIDED_VEC4:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
 692 ; CHECK-NEXT:    [[TMP3:%.*]] = add nsw <4 x i32> [[STRIDED_VEC4]], [[STRIDED_VEC3]]
 693 ; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[OFFSET_IDX]]
 694 ; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i32* [[TMP4]] to <8 x i32>*
 695 ; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
 696 ; CHECK-NEXT:    store <8 x i32> [[INTERLEAVED_VEC]], <8 x i32>* [[TMP5]], align 4
 697 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 698 ; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 512
 699 ; CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
 700 ; CHECK:       middle.block:
 701 ; CHECK-NEXT:    br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
 702 ; CHECK:       scalar.ph:
 703 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 704 ; CHECK:       for.cond.cleanup:
 705 ; CHECK-NEXT:    ret void
 706 ; CHECK:       for.body:
 707 ; CHECK-NEXT:    br i1 undef, label [[FOR_BODY]], label [[FOR_COND_CLEANUP]], !llvm.loop [[LOOP19:![0-9]+]]
 708 ;
     ; Input IR: the index takes the even values 0, 2, ..., 1022. A[i] and A[i+1]
     ; are each loaded twice: once for the product stored to B[i] and once, after
     ; that store, for the sum stored to B[i+1]. The assertions above expect one
     ; <8 x i32> load of A feeding both computations and one interleaved
     ; <8 x i32> store to B.
 709 entry:
 710   br label %for.body
 711
 712 for.cond.cleanup:                                 ; preds = %for.body
 713   ret void
 714
 715 for.body:                                         ; preds = %for.body, %entry
 716   %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]  ; i, stepped by 2
 717   %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv  ; &A[i]
 718   %tmp = load i32, i32* %arrayidx, align 4  ; first load of A[i]
 719   %tmp1 = or i64 %indvars.iv, 1  ; i is always even here (starts at 0, steps by 2), so i | 1 == i + 1
 720   %arrayidx2 = getelementptr inbounds i32, i32* %A, i64 %tmp1  ; &A[i+1]
 721   %tmp2 = load i32, i32* %arrayidx2, align 4  ; first load of A[i+1]
 722   %mul = mul nsw i32 %tmp2, %tmp
 723   %arrayidx4 = getelementptr inbounds i32, i32* %B, i64 %indvars.iv  ; &B[i]
 724   store i32 %mul, i32* %arrayidx4, align 4  ; B[i] = A[i+1] * A[i]
 725   %tmp3 = load i32, i32* %arrayidx, align 4  ; second load of A[i], after the store to B[i]
 726   %tmp4 = load i32, i32* %arrayidx2, align 4  ; second load of A[i+1]
 727   %add10 = add nsw i32 %tmp4, %tmp3
 728   %arrayidx13 = getelementptr inbounds i32, i32* %B, i64 %tmp1  ; &B[i+1]
 729   store i32 %add10, i32* %arrayidx13, align 4  ; B[i+1] = A[i+1] + A[i]
 730   %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2
 731   %cmp = icmp ult i64 %indvars.iv.next, 1024  ; continue while i + 2 < 1024
 732   br i1 %cmp, label %for.body, label %for.cond.cleanup
 733 }
 734
 735 ; Check vectorization on interleaved access groups identified from mixed
 736 ; loads/stores.
 737 ; void mixed_load3_store3(int *A) {
 738 ;   for (unsigned i = 0; i < 1024; i++)  {
 739 ;     *A++ += i;
 740 ;     *A++ += i;
 741 ;     *A++ += i;
 742 ;   }
 743 ; }
 744
 745
 746 define void @mixed_load3_store3(i32* nocapture %A) {
 747 ; CHECK-LABEL: @mixed_load3_store3(
 748 ; CHECK-NEXT:  entry:
 749 ; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 750 ; CHECK:       vector.ph:
 751 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 752 ; CHECK:       vector.body:
 753 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 754 ; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
 755 ; CHECK-NEXT:    [[TMP0:%.*]] = mul i64 [[INDEX]], 3
 756 ; CHECK-NEXT:    [[NEXT_GEP:%.*]] = getelementptr i32, i32* [[A:%.*]], i64 [[TMP0]]
 757 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[NEXT_GEP]] to <12 x i32>*
 758 ; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <12 x i32>, <12 x i32>* [[TMP1]], align 4
 759 ; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <12 x i32> [[WIDE_VEC]], <12 x i32> poison, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
 760 ; CHECK-NEXT:    [[STRIDED_VEC2:%.*]] = shufflevector <12 x i32> [[WIDE_VEC]], <12 x i32> poison, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
 761 ; CHECK-NEXT:    [[STRIDED_VEC3:%.*]] = shufflevector <12 x i32> [[WIDE_VEC]], <12 x i32> poison, <4 x i32> <i32 2, i32 5, i32 8, i32 11>
 762 ; CHECK-NEXT:    [[TMP2:%.*]] = add <4 x i32> [[STRIDED_VEC]], [[VEC_IND]]
 763 ; CHECK-NEXT:    [[TMP3:%.*]] = add <4 x i32> [[STRIDED_VEC2]], [[VEC_IND]]
 764 ; CHECK-NEXT:    [[TMP4:%.*]] = add <4 x i32> [[STRIDED_VEC3]], [[VEC_IND]]
 765 ; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i32* [[NEXT_GEP]] to <12 x i32>*
 766 ; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 767 ; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
 768 ; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i32> [[TMP6]], <8 x i32> [[TMP7]], <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
 769 ; CHECK-NEXT:    store <12 x i32> [[INTERLEAVED_VEC]], <12 x i32>* [[TMP5]], align 4
 770 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 771 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
 772 ; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
 773 ; CHECK-NEXT:    br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
 774 ; CHECK:       middle.block:
 775 ; CHECK-NEXT:    br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
 776 ; CHECK:       scalar.ph:
 777 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 778 ; CHECK:       for.cond.cleanup:
 779 ; CHECK-NEXT:    ret void
 780 ; CHECK:       for.body:
 781 ; CHECK-NEXT:    br i1 undef, label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]]
 782 ;
     ; Input IR: a pointer phi walks A three i32 elements per iteration while an
     ; i32 counter runs from 0 to 1023. Each of the three elements is loaded,
     ; increased by the counter, and stored back before the next one is loaded.
     ; The assertions above expect one <12 x i32> load, three stride-3 shuffles,
     ; and one interleaved <12 x i32> store through the same pointer.
 783 entry:
 784   br label %for.body
 785
 786 for.cond.cleanup:                                 ; preds = %for.body
 787   ret void
 788
 789 for.body:                                         ; preds = %for.body, %entry
 790   %i.013 = phi i32 [ 0, %entry ], [ %inc, %for.body ]  ; loop counter i
 791   %A.addr.012 = phi i32* [ %A, %entry ], [ %incdec.ptr3, %for.body ]  ; current pointer; advanced by 3 elements per iteration
 792   %incdec.ptr = getelementptr inbounds i32, i32* %A.addr.012, i64 1  ; pointer to element 1 of the current triple
 793   %tmp = load i32, i32* %A.addr.012, align 4
 794   %add = add i32 %tmp, %i.013
 795   store i32 %add, i32* %A.addr.012, align 4  ; element 0 += i
 796   %incdec.ptr1 = getelementptr inbounds i32, i32* %A.addr.012, i64 2  ; pointer to element 2 of the current triple
 797   %tmp1 = load i32, i32* %incdec.ptr, align 4
 798   %add2 = add i32 %tmp1, %i.013
 799   store i32 %add2, i32* %incdec.ptr, align 4  ; element 1 += i
 800   %incdec.ptr3 = getelementptr inbounds i32, i32* %A.addr.012, i64 3  ; start of the next triple (fed back into the pointer phi)
 801   %tmp2 = load i32, i32* %incdec.ptr1, align 4
 802   %add4 = add i32 %tmp2, %i.013
 803   store i32 %add4, i32* %incdec.ptr1, align 4  ; element 2 += i
 804   %inc = add nuw nsw i32 %i.013, 1
 805   %exitcond = icmp eq i32 %inc, 1024  ; exit once the counter reaches 1024
 806   br i1 %exitcond, label %for.cond.cleanup, label %for.body
 807 }
 808
 809 ; Check vectorization on interleaved access groups whose members have
 810 ; different types.
 811
 812 ; struct IntFloat {
 813 ;   int a;
 814 ;   float b;
 815 ; };
 816 ;
 817 ; int SA;
 818 ; float SB;
 819 ;
 820 ; void int_float_struct(struct IntFloat *A) {
 821 ;   int SumA;
 822 ;   float SumB;
 823 ;   for (unsigned i = 0; i < 1024; i++)  {
 824 ;     SumA += A[i].a;
 825 ;     SumB += A[i].b;
 826 ;   }
 827 ;   SA = SumA;
 828 ;   SB = SumB;
 829 ; }
 830
 831
 832 %struct.IntFloat = type { i32, float }
 833
 834 @SA = common global i32 0, align 4
 835 @SB = common global float 0.000000e+00, align 4
 836
 837 define void @int_float_struct(%struct.IntFloat* nocapture readonly %A) #0 {
 838 ; CHECK-LABEL: @int_float_struct(
 839 ; CHECK-NEXT:  entry:
 840 ; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 841 ; CHECK:       vector.ph:
 842 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 843 ; CHECK:       vector.body:
 844 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 845 ; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x float> [ <float undef, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
 846 ; CHECK-NEXT:    [[VEC_PHI1:%.*]] = phi <4 x i32> [ <i32 undef, i32 0, i32 0, i32 0>, [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ]
 847 ; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT_INTFLOAT:%.*]], %struct.IntFloat* [[A:%.*]], i64 [[INDEX]], i32 0
 848 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <8 x i32>*
 849 ; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <8 x i32>, <8 x i32>* [[TMP1]], align 4
 850 ; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
 851 ; CHECK-NEXT:    [[STRIDED_VEC2:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
 852 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i32> [[STRIDED_VEC2]] to <4 x float>
 853 ; CHECK-NEXT:    [[TMP3]] = add <4 x i32> [[STRIDED_VEC]], [[VEC_PHI1]]
 854 ; CHECK-NEXT:    [[TMP4]] = fadd fast <4 x float> [[VEC_PHI]], [[TMP2]]
 855 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 856 ; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
 857 ; CHECK-NEXT:    br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
 858 ; CHECK:       middle.block:
 859 ; CHECK-NEXT:    [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP3]])
 860 ; CHECK-NEXT:    [[TMP7:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP4]])
 861 ; CHECK-NEXT:    br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
 862 ; CHECK:       scalar.ph:
 863 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 864 ; CHECK:       for.cond.cleanup:
 865 ; CHECK-NEXT:    [[ADD_LCSSA:%.*]] = phi i32 [ undef, [[FOR_BODY]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ]
 866 ; CHECK-NEXT:    [[ADD3_LCSSA:%.*]] = phi float [ undef, [[FOR_BODY]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ]
 867 ; CHECK-NEXT:    store i32 [[ADD_LCSSA]], i32* @SA, align 4
 868 ; CHECK-NEXT:    store float [[ADD3_LCSSA]], float* @SB, align 4
 869 ; CHECK-NEXT:    ret void
 870 ; CHECK:       for.body:
 871 ; CHECK-NEXT:    br i1 undef, label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]]
 872 ;
     ; Input IR: two reductions over 1024 elements of an array of { i32, float }
     ; structs: an integer sum of field 0 and a fast-math float sum of field 1.
     ; Both accumulators start from undef. The assertions above expect a single
     ; <8 x i32> load per vector iteration, with the odd lanes bitcast to
     ; <4 x float> for the float reduction.
 873 entry:
 874   br label %for.body
 875
 876 for.cond.cleanup:                                 ; preds = %for.body
 877   store i32 %add, i32* @SA, align 4  ; publish the final integer sum
 878   store float %add3, float* @SB, align 4  ; publish the final float sum
 879   ret void
 880
 881 for.body:                                         ; preds = %for.body, %entry
 882   %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]  ; element index i
 883   %SumB.014 = phi float [ undef, %entry ], [ %add3, %for.body ]  ; float accumulator; initial value is undef
 884   %SumA.013 = phi i32 [ undef, %entry ], [ %add, %for.body ]  ; integer accumulator; initial value is undef
 885   %a = getelementptr inbounds %struct.IntFloat, %struct.IntFloat* %A, i64 %indvars.iv, i32 0  ; address of the i32 field of A[i]
 886   %tmp = load i32, i32* %a, align 4
 887   %add = add nsw i32 %tmp, %SumA.013
 888   %b = getelementptr inbounds %struct.IntFloat, %struct.IntFloat* %A, i64 %indvars.iv, i32 1  ; address of the float field of A[i]
 889   %tmp1 = load float, float* %b, align 4
 890   %add3 = fadd fast float %SumB.014, %tmp1
 891   %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
 892   %exitcond = icmp eq i64 %indvars.iv.next, 1024  ; exit after 1024 iterations
 893   br i1 %exitcond, label %for.cond.cleanup, label %for.body
 894 }
 895
 896 ; Check vectorization of interleaved access groups in the presence of
 897 ; dependences (PR27626). The following tests check that we don't reorder
 898 ; dependent loads and stores when generating code for interleaved access
 899 ; groups. Stores should be scalarized because the required code motion would
 900 ; break dependences, and the remaining interleaved load groups should have
 901 ; gaps.
 902
 903 ; PR27626_0: Ensure a strided store is not moved after a dependent (zero
 904 ;            distance) strided load.
 905
 906 ; void PR27626_0(struct pair *p, int z, int n) {
 907 ;   for (int i = 0; i < n; i++) {
 908 ;     p[i].x = z;
 909 ;     p[i].y = p[i].x;
 910 ;   }
 911 ; }
 912
 913
 914 %pair.i32 = type { i32, i32 }
 915 define void @PR27626_0(%pair.i32 *%p, i32 %z, i64 %n) {
 916 ; CHECK-LABEL: @PR27626_0(
 917 ; CHECK-NEXT:  entry:
 918 ; CHECK-NEXT:    [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N:%.*]], i64 1)
 919 ; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SMAX]], 5
 920 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 921 ; CHECK:       vector.ph:
 922 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = and i64 [[SMAX]], 3
 923 ; CHECK-NEXT:    [[TMP0:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
 924 ; CHECK-NEXT:    [[TMP1:%.*]] = select i1 [[TMP0]], i64 4, i64 [[N_MOD_VF]]
 925 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub nsw i64 [[SMAX]], [[TMP1]]
 926 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 927 ; CHECK:       vector.body:
 928 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 929 ; CHECK-NEXT:    [[TMP2:%.*]] = or i64 [[INDEX]], 1
 930 ; CHECK-NEXT:    [[TMP3:%.*]] = or i64 [[INDEX]], 2
 931 ; CHECK-NEXT:    [[TMP4:%.*]] = or i64 [[INDEX]], 3
 932 ; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[PAIR_I32:%.*]], %pair.i32* [[P:%.*]], i64 [[INDEX]], i32 0
 933 ; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[TMP2]], i32 0
 934 ; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[TMP3]], i32 0
 935 ; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[TMP4]], i32 0
 936 ; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[INDEX]], i32 1
 937 ; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[TMP2]], i32 1
 938 ; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[TMP3]], i32 1
 939 ; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[TMP4]], i32 1
 940 ; CHECK-NEXT:    store i32 [[Z:%.*]], i32* [[TMP5]], align 4
 941 ; CHECK-NEXT:    store i32 [[Z]], i32* [[TMP6]], align 4
 942 ; CHECK-NEXT:    store i32 [[Z]], i32* [[TMP7]], align 4
 943 ; CHECK-NEXT:    store i32 [[Z]], i32* [[TMP8]], align 4
 944 ; CHECK-NEXT:    [[TMP13:%.*]] = bitcast i32* [[TMP5]] to <8 x i32>*
 945 ; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <8 x i32>, <8 x i32>* [[TMP13]], align 4
 946 ; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i32 0
 947 ; CHECK-NEXT:    store i32 [[TMP14]], i32* [[TMP9]], align 4
 948 ; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i32 2
 949 ; CHECK-NEXT:    store i32 [[TMP15]], i32* [[TMP10]], align 4
 950 ; CHECK-NEXT:    [[TMP16:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i32 4
 951 ; CHECK-NEXT:    store i32 [[TMP16]], i32* [[TMP11]], align 4
 952 ; CHECK-NEXT:    [[TMP17:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i32 6
 953 ; CHECK-NEXT:    store i32 [[TMP17]], i32* [[TMP12]], align 4
 954 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 955 ; CHECK-NEXT:    [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 956 ; CHECK-NEXT:    br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]]
 957 ; CHECK:       middle.block:
 958 ; CHECK-NEXT:    br label [[SCALAR_PH]]
 959 ; CHECK:       scalar.ph:
 960 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
 961 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 962 ; CHECK:       for.body:
 963 ; CHECK-NEXT:    [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
 964 ; CHECK-NEXT:    [[P_I_X:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[I]], i32 0
 965 ; CHECK-NEXT:    [[P_I_Y:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[I]], i32 1
 966 ; CHECK-NEXT:    store i32 [[Z]], i32* [[P_I_X]], align 4
 967 ; CHECK-NEXT:    store i32 [[Z]], i32* [[P_I_Y]], align 4
 968 ; CHECK-NEXT:    [[I_NEXT]] = add nuw nsw i64 [[I]], 1
 969 ; CHECK-NEXT:    [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]]
 970 ; CHECK-NEXT:    br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END:%.*]], !llvm.loop [[LOOP25:![0-9]+]]
 971 ; CHECK:       for.end:
 972 ; CHECK-NEXT:    ret void
 973 ;
     ; Input IR: each iteration stores z to field 0 of p[i], immediately reloads
     ; that same address, and stores the reloaded value to field 1 of p[i]. The
     ; assertions above expect, in vector.body, the four scalar stores of z to
     ; come before the <8 x i32> load, and the stores to field 1 to remain
     ; scalar (fed by extractelement of lanes 0, 2, 4 and 6).
 974 entry:
 975   br label %for.body
 976
 977 for.body:
 978   %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
 979   %p_i.x = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i, i32 0  ; address of field 0 of p[i]
 980   %p_i.y = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i, i32 1  ; address of field 1 of p[i]
 981   store i32 %z, i32* %p_i.x, align 4  ; field 0 of p[i] = z
 982   %0 = load i32, i32* %p_i.x, align 4  ; reads back the address just stored to
 983   store i32 %0, i32 *%p_i.y, align 4  ; field 1 of p[i] = reloaded value
 984   %i.next = add nuw nsw i64 %i, 1
 985   %cond = icmp slt i64 %i.next, %n  ; signed compare: continue while i + 1 < n
 986   br i1 %cond, label %for.body, label %for.end
 987
 988 for.end:
 989   ret void
 990 }
 991
 992 ; PR27626_1: Ensure a strided load is not moved before a dependent (zero
 993 ;            distance) strided store.
 994
 995 ; int PR27626_1(struct pair *p, int n) {
 996 ;   int s = 0;
 997 ;   for (int i = 0; i < n; i++) {
 998 ;     p[i].y = p[i].x;
 999 ;     s += p[i].y;
1000 ;   }
     ;   return s;
1001 ; }
1002
1003
1004 define i32 @PR27626_1(%pair.i32 *%p, i64 %n) {
1005 ; CHECK-LABEL: @PR27626_1(
1006 ; CHECK-NEXT:  entry:
1007 ; CHECK-NEXT:    [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N:%.*]], i64 1)
1008 ; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SMAX]], 5
1009 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
1010 ; CHECK:       vector.ph:
1011 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = and i64 [[SMAX]], 3
1012 ; CHECK-NEXT:    [[TMP0:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
1013 ; CHECK-NEXT:    [[TMP1:%.*]] = select i1 [[TMP0]], i64 4, i64 [[N_MOD_VF]]
1014 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub nsw i64 [[SMAX]], [[TMP1]]
1015 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
1016 ; CHECK:       vector.body:
1017 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
1018 ; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP16:%.*]], [[VECTOR_BODY]] ]
1019 ; CHECK-NEXT:    [[TMP2:%.*]] = or i64 [[INDEX]], 1
1020 ; CHECK-NEXT:    [[TMP3:%.*]] = or i64 [[INDEX]], 2
1021 ; CHECK-NEXT:    [[TMP4:%.*]] = or i64 [[INDEX]], 3
1022 ; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[PAIR_I32:%.*]], %pair.i32* [[P:%.*]], i64 [[INDEX]], i32 0
1023 ; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[INDEX]], i32 1
1024 ; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[TMP2]], i32 1
1025 ; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[TMP3]], i32 1
1026 ; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[TMP4]], i32 1
1027 ; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i32* [[TMP5]] to <8 x i32>*
1028 ; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <8 x i32>, <8 x i32>* [[TMP10]], align 4
1029 ; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i32 0
1030 ; CHECK-NEXT:    store i32 [[TMP11]], i32* [[TMP6]], align 4
1031 ; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i32 2
1032 ; CHECK-NEXT:    store i32 [[TMP12]], i32* [[TMP7]], align 4
1033 ; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i32 4
1034 ; CHECK-NEXT:    store i32 [[TMP13]], i32* [[TMP8]], align 4
1035 ; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i32 6
1036 ; CHECK-NEXT:    store i32 [[TMP14]], i32* [[TMP9]], align 4
1037 ; CHECK-NEXT:    [[TMP15:%.*]] = bitcast i32* [[TMP6]] to <8 x i32>*
1038 ; CHECK-NEXT:    [[WIDE_VEC1:%.*]] = load <8 x i32>, <8 x i32>* [[TMP15]], align 4
1039 ; CHECK-NEXT:    [[STRIDED_VEC2:%.*]] = shufflevector <8 x i32> [[WIDE_VEC1]], <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
1040 ; CHECK-NEXT:    [[TMP16]] = add <4 x i32> [[STRIDED_VEC2]], [[VEC_PHI]]
1041 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
1042 ; CHECK-NEXT:    [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
1043 ; CHECK-NEXT:    br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]]
1044 ; CHECK:       middle.block:
1045 ; CHECK-NEXT:    [[TMP18:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP16]])
1046 ; CHECK-NEXT:    br label [[SCALAR_PH]]
1047 ; CHECK:       scalar.ph:
1048 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
1049 ; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP18]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
1050 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
1051 ; CHECK:       for.body:
1052 ; CHECK-NEXT:    [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
1053 ; CHECK-NEXT:    [[S:%.*]] = phi i32 [ [[TMP20:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
1054 ; CHECK-NEXT:    [[P_I_X:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[I]], i32 0
1055 ; CHECK-NEXT:    [[P_I_Y:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[I]], i32 1
1056 ; CHECK-NEXT:    [[TMP19:%.*]] = load i32, i32* [[P_I_X]], align 4
1057 ; CHECK-NEXT:    store i32 [[TMP19]], i32* [[P_I_Y]], align 4
1058 ; CHECK-NEXT:    [[TMP20]] = add nsw i32 [[TMP19]], [[S]]
1059 ; CHECK-NEXT:    [[I_NEXT]] = add nuw nsw i64 [[I]], 1
1060 ; CHECK-NEXT:    [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]]
1061 ; CHECK-NEXT:    br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END:%.*]], !llvm.loop [[LOOP27:![0-9]+]]
1062 ; CHECK:       for.end:
1063 ; CHECK-NEXT:    ret i32 [[TMP20]]
1064 ;
     ; Input IR: each iteration copies field 0 of p[i] to field 1 of p[i], then
     ; reloads field 1 and adds it to a running i32 sum that starts at 0; the sum
     ; is returned. The assertions above expect, in vector.body, an <8 x i32>
     ; load starting at field 0, four scalar stores to field 1, and only then a
     ; second <8 x i32> load starting at field 1 whose lanes 0, 2, 4 and 6 feed
     ; the vector reduction.
1065 entry:
1066   br label %for.body
1067
1068 for.body:
1069   %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
1070   %s = phi i32 [ %2, %for.body ], [ 0, %entry ]  ; running sum, starts at 0
1071   %p_i.x = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i, i32 0  ; address of field 0 of p[i]
1072   %p_i.y = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i, i32 1  ; address of field 1 of p[i]
1073   %0 = load i32, i32* %p_i.x, align 4
1074   store i32 %0, i32* %p_i.y, align 4  ; field 1 of p[i] = field 0 of p[i]
1075   %1 = load i32, i32* %p_i.y, align 4  ; reads back the address just stored to
1076   %2 = add nsw i32 %1, %s  ; sum += reloaded value
1077   %i.next = add nuw nsw i64 %i, 1
1078   %cond = icmp slt i64 %i.next, %n  ; signed compare: continue while i + 1 < n
1079   br i1 %cond, label %for.body, label %for.end
1080
1081 for.end:
1082   %3 = phi i32 [ %2, %for.body ]  ; final sum carried out of the loop
1083   ret i32 %3
1084 }
1085
1086 ; PR27626_2: Ensure a strided store is not moved after a dependent (negative
1087 ;            distance) strided load.
1088
1089 ; void PR27626_2(struct pair *p, int n, int z) {
1090 ;   for (int i = 0; i < n; i++) {
1091 ;     p[i].x = z;
1092 ;     p[i].y = p[i - 1].x;
1093 ;   }
1094 ; }
1095
1096
1097 define void @PR27626_2(%pair.i32 *%p, i64 %n, i32 %z) { ; Loop under test: store %z to p[i].x, then copy the x field addressed via %i_minus_1 into p[i].y.
1098 ; CHECK-LABEL: @PR27626_2(
1099 ; CHECK-NEXT:  entry:
1100 ; CHECK-NEXT:    [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N:%.*]], i64 1)
1101 ; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SMAX]], 5
1102 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
1103 ; CHECK:       vector.ph:
1104 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = and i64 [[SMAX]], 3
1105 ; CHECK-NEXT:    [[TMP0:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
1106 ; CHECK-NEXT:    [[TMP1:%.*]] = select i1 [[TMP0]], i64 4, i64 [[N_MOD_VF]]
1107 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub nsw i64 [[SMAX]], [[TMP1]]
1108 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
1109 ; CHECK:       vector.body:
1110 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
1111 ; CHECK-NEXT:    [[TMP2:%.*]] = or i64 [[INDEX]], 1
1112 ; CHECK-NEXT:    [[TMP3:%.*]] = or i64 [[INDEX]], 2
1113 ; CHECK-NEXT:    [[TMP4:%.*]] = or i64 [[INDEX]], 3
1114 ; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[PAIR_I32:%.*]], %pair.i32* [[P:%.*]], i64 [[INDEX]], i32 0
1115 ; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[TMP2]], i32 0
1116 ; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[TMP3]], i32 0
1117 ; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[TMP4]], i32 0
1118 ; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 -1, i32 0
1119 ; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[INDEX]], i32 1
1120 ; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[TMP2]], i32 1
1121 ; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[TMP3]], i32 1
1122 ; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[TMP4]], i32 1
1123 ; CHECK-NEXT:    store i32 [[Z:%.*]], i32* [[TMP5]], align 4
1124 ; CHECK-NEXT:    store i32 [[Z]], i32* [[TMP6]], align 4
1125 ; CHECK-NEXT:    store i32 [[Z]], i32* [[TMP7]], align 4
1126 ; CHECK-NEXT:    store i32 [[Z]], i32* [[TMP8]], align 4
1127 ; CHECK-NEXT:    [[TMP14:%.*]] = bitcast i32* [[TMP9]] to <8 x i32>*
1128 ; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <8 x i32>, <8 x i32>* [[TMP14]], align 4
1129 ; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i32 0
1130 ; CHECK-NEXT:    store i32 [[TMP15]], i32* [[TMP10]], align 4
1131 ; CHECK-NEXT:    [[TMP16:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i32 2
1132 ; CHECK-NEXT:    store i32 [[TMP16]], i32* [[TMP11]], align 4
1133 ; CHECK-NEXT:    [[TMP17:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i32 4
1134 ; CHECK-NEXT:    store i32 [[TMP17]], i32* [[TMP12]], align 4
1135 ; CHECK-NEXT:    [[TMP18:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i32 6
1136 ; CHECK-NEXT:    store i32 [[TMP18]], i32* [[TMP13]], align 4
1137 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
1138 ; CHECK-NEXT:    [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
1139 ; CHECK-NEXT:    br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]]
1140 ; CHECK:       middle.block:
1141 ; CHECK-NEXT:    br label [[SCALAR_PH]]
1142 ; CHECK:       scalar.ph:
1143 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
1144 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
1145 ; CHECK:       for.body:
1146 ; CHECK-NEXT:    [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
1147 ; CHECK-NEXT:    [[P_I_X:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[I]], i32 0
1148 ; CHECK-NEXT:    [[P_I_MINUS_1_X:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 -1, i32 0
1149 ; CHECK-NEXT:    [[P_I_Y:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[I]], i32 1
1150 ; CHECK-NEXT:    store i32 [[Z]], i32* [[P_I_X]], align 4
1151 ; CHECK-NEXT:    [[TMP20:%.*]] = load i32, i32* [[P_I_MINUS_1_X]], align 4
1152 ; CHECK-NEXT:    store i32 [[TMP20]], i32* [[P_I_Y]], align 4
1153 ; CHECK-NEXT:    [[I_NEXT]] = add nuw nsw i64 [[I]], 1
1154 ; CHECK-NEXT:    [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]]
1155 ; CHECK-NEXT:    br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END:%.*]], !llvm.loop [[LOOP29:![0-9]+]]
1156 ; CHECK:       for.end:
1157 ; CHECK-NEXT:    ret void
1158 ;
1159 entry:
1160   br label %for.body
1161
1162 for.body:
1163   %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ] ; induction variable: starts at 0, advances by 1
1164   %i_minus_1 = add nuw nsw i64 %i, -1 ; NOTE(review): the expected output above addresses this element with a constant index of -1 instead of i - 1; presumably nuw on an add of -1 lets the optimizer treat %i as 0 here - confirm the flag is intended
1165   %p_i.x = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i, i32 0 ; address of field 0 (x) of p[i]
1166   %p_i_minus_1.x = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i_minus_1, i32 0 ; address of field 0 (x) of the element indexed by %i_minus_1
1167   %p_i.y = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i, i32 1 ; address of field 1 (y) of p[i]
1168   store i32 %z, i32* %p_i.x, align 4 ; p[i].x = z
1169   %0 = load i32, i32* %p_i_minus_1.x, align 4 ; strided load that follows the store to p[i].x in program order
1170   store i32 %0, i32 *%p_i.y, align 4 ; p[i].y = value just loaded
1171   %i.next = add nuw nsw i64 %i, 1
1172   %cond = icmp slt i64 %i.next, %n ; signed compare: keep looping while i + 1 < n
1173   br i1 %cond, label %for.body, label %for.end
1174
1175 for.end:
1176   ret void
1177 }
1178
1179 ; PR27626_3: Ensure a strided load is not moved before a dependent (negative
1180 ;            distance) strided store.
1181
1182 ; int PR27626_3(struct pair *p, int z, int n) {
1183 ;   int s = 0;
1184 ;   for (int i = 0; i < n; i++) {
1185 ;     p[i + 1].y = p[i].x;
1186 ;     s += p[i].y;
1187 ;   }
1188 ;   return s;
1189 ; }
1190 define i32 @PR27626_3(%pair.i32 *%p, i64 %n, i32 %z) { ; Loop under test: p[i + 1].y = p[i].x, then add p[i].y to the running sum %s; returns the final sum. %z is not referenced in the body.
1191 ; CHECK-LABEL: @PR27626_3(
1192 ; CHECK-NEXT:  entry:
1193 ; CHECK-NEXT:    [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N:%.*]], i64 1)
1194 ; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SMAX]], 5
1195 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
1196 ; CHECK:       vector.ph:
1197 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = and i64 [[SMAX]], 3
1198 ; CHECK-NEXT:    [[TMP0:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
1199 ; CHECK-NEXT:    [[TMP1:%.*]] = select i1 [[TMP0]], i64 4, i64 [[N_MOD_VF]]
1200 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub nsw i64 [[SMAX]], [[TMP1]]
1201 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
1202 ; CHECK:       vector.body:
1203 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
1204 ; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
1205 ; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ]
1206 ; CHECK-NEXT:    [[TMP2:%.*]] = add nuw nsw <4 x i64> [[VEC_IND]], <i64 1, i64 1, i64 1, i64 1>
1207 ; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [[PAIR_I32:%.*]], %pair.i32* [[P:%.*]], i64 [[INDEX]], i32 0
1208 ; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[INDEX]], i32 1
1209 ; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x i64> [[TMP2]], i32 0
1210 ; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[TMP5]], i32 1
1211 ; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <4 x i64> [[TMP2]], i32 1
1212 ; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[TMP7]], i32 1
1213 ; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <4 x i64> [[TMP2]], i32 2
1214 ; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[TMP9]], i32 1
1215 ; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <4 x i64> [[TMP2]], i32 3
1216 ; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[TMP11]], i32 1
1217 ; CHECK-NEXT:    [[TMP13:%.*]] = bitcast i32* [[TMP3]] to <8 x i32>*
1218 ; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <8 x i32>, <8 x i32>* [[TMP13]], align 4
1219 ; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i32 0
1220 ; CHECK-NEXT:    store i32 [[TMP14]], i32* [[TMP6]], align 4
1221 ; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i32 2
1222 ; CHECK-NEXT:    store i32 [[TMP15]], i32* [[TMP8]], align 4
1223 ; CHECK-NEXT:    [[TMP16:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i32 4
1224 ; CHECK-NEXT:    store i32 [[TMP16]], i32* [[TMP10]], align 4
1225 ; CHECK-NEXT:    [[TMP17:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i32 6
1226 ; CHECK-NEXT:    store i32 [[TMP17]], i32* [[TMP12]], align 4
1227 ; CHECK-NEXT:    [[TMP18:%.*]] = bitcast i32* [[TMP4]] to <8 x i32>*
1228 ; CHECK-NEXT:    [[WIDE_VEC1:%.*]] = load <8 x i32>, <8 x i32>* [[TMP18]], align 4
1229 ; CHECK-NEXT:    [[STRIDED_VEC2:%.*]] = shufflevector <8 x i32> [[WIDE_VEC1]], <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
1230 ; CHECK-NEXT:    [[TMP19]] = add <4 x i32> [[STRIDED_VEC2]], [[VEC_PHI]]
1231 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
1232 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], <i64 4, i64 4, i64 4, i64 4>
1233 ; CHECK-NEXT:    [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
1234 ; CHECK-NEXT:    br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP30:![0-9]+]]
1235 ; CHECK:       middle.block:
1236 ; CHECK-NEXT:    [[TMP21:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP19]])
1237 ; CHECK-NEXT:    br label [[SCALAR_PH]]
1238 ; CHECK:       scalar.ph:
1239 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
1240 ; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP21]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
1241 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
1242 ; CHECK:       for.body:
1243 ; CHECK-NEXT:    [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
1244 ; CHECK-NEXT:    [[S:%.*]] = phi i32 [ [[TMP24:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
1245 ; CHECK-NEXT:    [[I_PLUS_1:%.*]] = add nuw nsw i64 [[I]], 1
1246 ; CHECK-NEXT:    [[P_I_X:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[I]], i32 0
1247 ; CHECK-NEXT:    [[P_I_Y:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[I]], i32 1
1248 ; CHECK-NEXT:    [[P_I_PLUS_1_Y:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[I_PLUS_1]], i32 1
1249 ; CHECK-NEXT:    [[TMP22:%.*]] = load i32, i32* [[P_I_X]], align 4
1250 ; CHECK-NEXT:    store i32 [[TMP22]], i32* [[P_I_PLUS_1_Y]], align 4
1251 ; CHECK-NEXT:    [[TMP23:%.*]] = load i32, i32* [[P_I_Y]], align 4
1252 ; CHECK-NEXT:    [[TMP24]] = add nsw i32 [[TMP23]], [[S]]
1253 ; CHECK-NEXT:    [[I_NEXT]] = add nuw nsw i64 [[I]], 1
1254 ; CHECK-NEXT:    [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]]
1255 ; CHECK-NEXT:    br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END:%.*]], !llvm.loop [[LOOP31:![0-9]+]]
1256 ; CHECK:       for.end:
1257 ; CHECK-NEXT:    ret i32 [[TMP24]]
1258 ;
1259 entry:
1260   br label %for.body
1261
1262 for.body:
1263   %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ] ; induction variable: starts at 0, advances by 1
1264   %s = phi i32 [ %2, %for.body ], [ 0, %entry ] ; running sum, starts at 0
1265   %i_plus_1 = add nuw nsw i64 %i, 1 ; index of the next element
1266   %p_i.x = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i, i32 0 ; address of field 0 (x) of p[i]
1267   %p_i.y = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i, i32 1 ; address of field 1 (y) of p[i]
1268   %p_i_plus_1.y = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i_plus_1, i32 1 ; address of field 1 (y) of p[i + 1]
1269   %0 = load i32, i32* %p_i.x, align 4 ; read p[i].x
1270   store i32 %0, i32* %p_i_plus_1.y, align 4 ; p[i + 1].y = p[i].x
1271   %1 = load i32, i32* %p_i.y, align 4 ; strided load of p[i].y, after the store above in program order
1272   %2 = add nsw i32 %1, %s ; s += p[i].y
1273   %i.next = add nuw nsw i64 %i, 1
1274   %cond = icmp slt i64 %i.next, %n ; signed compare: keep looping while i + 1 < n
1275   br i1 %cond, label %for.body, label %for.end
1276
1277 for.end:
1278   %3 = phi i32 [ %2, %for.body ] ; single-predecessor phi carrying the final sum out of the loop
1279   ret i32 %3
1280 }
1281
1282 ; PR27626_4: Ensure we form an interleaved group for strided stores in the
1283 ;            presence of a write-after-write dependence. We create a group for
1284 ;            (2) and (3) while excluding (1).
1285
1286 ; void PR27626_4(int *a, int x, int y, int z, int n) {
1287 ;   for (int i = 0; i < n; i += 2) {
1288 ;     a[i] = x;      // (1)
1289 ;     a[i] = y;      // (2)
1290 ;     a[i + 1] = z;  // (3)
1291 ;   }
1292 ; }
1293
1294
1295 define void @PR27626_4(i32 *%a, i32 %x, i32 %y, i32 %z, i64 %n) { ; Loop under test (step 2): two stores to a[i] (%x, then %y) followed by a store of %z to a[i + 1].
1296 ; CHECK-LABEL: @PR27626_4(
1297 ; CHECK-NEXT:  entry:
1298 ; CHECK-NEXT:    [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N:%.*]], i64 2)
1299 ; CHECK-NEXT:    [[TMP0:%.*]] = add nsw i64 [[SMAX]], -1
1300 ; CHECK-NEXT:    [[TMP1:%.*]] = lshr i64 [[TMP0]], 1
1301 ; CHECK-NEXT:    [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1
1302 ; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 6
1303 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
1304 ; CHECK:       vector.ph:
1305 ; CHECK-NEXT:    [[N_VEC:%.*]] = and i64 [[TMP2]], 9223372036854775804
1306 ; CHECK-NEXT:    [[IND_END:%.*]] = shl nuw i64 [[N_VEC]], 1
1307 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[Y:%.*]], i32 0
1308 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
1309 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i32> poison, i32 [[Z:%.*]], i32 0
1310 ; CHECK-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT1]], <4 x i32> poison, <4 x i32> zeroinitializer
1311 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
1312 ; CHECK:       vector.body:
1313 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
1314 ; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = shl i64 [[INDEX]], 1
1315 ; CHECK-NEXT:    [[TMP3:%.*]] = or i64 [[OFFSET_IDX]], 2
1316 ; CHECK-NEXT:    [[TMP4:%.*]] = or i64 [[OFFSET_IDX]], 4
1317 ; CHECK-NEXT:    [[TMP5:%.*]] = or i64 [[OFFSET_IDX]], 6
1318 ; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[OFFSET_IDX]]
1319 ; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP3]]
1320 ; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP4]]
1321 ; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP5]]
1322 ; CHECK-NEXT:    store i32 [[X:%.*]], i32* [[TMP6]], align 4
1323 ; CHECK-NEXT:    store i32 [[X]], i32* [[TMP7]], align 4
1324 ; CHECK-NEXT:    store i32 [[X]], i32* [[TMP8]], align 4
1325 ; CHECK-NEXT:    store i32 [[X]], i32* [[TMP9]], align 4
1326 ; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[OFFSET_IDX]]
1327 ; CHECK-NEXT:    [[TMP11:%.*]] = bitcast i32* [[TMP10]] to <8 x i32>*
1328 ; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLAT]], <4 x i32> [[BROADCAST_SPLAT2]], <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
1329 ; CHECK-NEXT:    store <8 x i32> [[INTERLEAVED_VEC]], <8 x i32>* [[TMP11]], align 4
1330 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
1331 ; CHECK-NEXT:    [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
1332 ; CHECK-NEXT:    br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP32:![0-9]+]]
1333 ; CHECK:       middle.block:
1334 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
1335 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
1336 ; CHECK:       scalar.ph:
1337 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
1338 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
1339 ; CHECK:       for.body:
1340 ; CHECK-NEXT:    [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
1341 ; CHECK-NEXT:    [[I_PLUS_1:%.*]] = or i64 [[I]], 1
1342 ; CHECK-NEXT:    [[A_I:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[I]]
1343 ; CHECK-NEXT:    [[A_I_PLUS_1:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[I_PLUS_1]]
1344 ; CHECK-NEXT:    store i32 [[Y]], i32* [[A_I]], align 4
1345 ; CHECK-NEXT:    store i32 [[Z]], i32* [[A_I_PLUS_1]], align 4
1346 ; CHECK-NEXT:    [[I_NEXT]] = add nuw nsw i64 [[I]], 2
1347 ; CHECK-NEXT:    [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]]
1348 ; CHECK-NEXT:    br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP33:![0-9]+]]
1349 ; CHECK:       for.end:
1350 ; CHECK-NEXT:    ret void
1351 ;
1352 entry:
1353   br label %for.body
1354
1355 for.body:
1356   %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ] ; induction variable: starts at 0, advances by 2
1357   %i_plus_1 = add i64 %i, 1 ; index of the odd neighbour of a[i]
1358   %a_i = getelementptr inbounds i32, i32* %a, i64 %i ; &a[i]
1359   %a_i_plus_1 = getelementptr inbounds i32, i32* %a, i64 %i_plus_1 ; &a[i + 1]
1360   store i32 %x, i32* %a_i, align 4 ; store (1): a[i] = x, overwritten by store (2) to the same address
1361   store i32 %y, i32* %a_i, align 4 ; store (2): a[i] = y
1362   store i32 %z, i32* %a_i_plus_1, align 4 ; store (3): a[i + 1] = z
1363   %i.next = add nuw nsw i64 %i, 2
1364   %cond = icmp slt i64 %i.next, %n ; signed compare: keep looping while i + 2 < n
1365   br i1 %cond, label %for.body, label %for.end
1366
1367 for.end:
1368   ret void
1369 }
1370
1371 ; PR27626_5: Ensure we do not form an interleaved group for strided stores in
1372 ;            the presence of a write-after-write dependence.
1373
1374 ; void PR27626_5(int *a, int x, int y, int z, int n) {
1375 ;   for (int i = 3; i < n; i += 2) {
1376 ;     a[i - 1] = x;
1377 ;     a[i - 3] = y;
1378 ;     a[i] = z;
1379 ;   }
1380 ; }
1381
1382
1383 define void @PR27626_5(i32 *%a, i32 %x, i32 %y, i32 %z, i64 %n) { ; Loop under test (i starts at 3, step 2): stores %x to a[i - 1], %y to a[i - 3] and %z to a[i].
1384 ; CHECK-LABEL: @PR27626_5(
1385 ; CHECK-NEXT:  entry:
1386 ; CHECK-NEXT:    [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N:%.*]], i64 5)
1387 ; CHECK-NEXT:    [[TMP0:%.*]] = add nsw i64 [[SMAX]], -4
1388 ; CHECK-NEXT:    [[TMP1:%.*]] = lshr i64 [[TMP0]], 1
1389 ; CHECK-NEXT:    [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1
1390 ; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 6
1391 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
1392 ; CHECK:       vector.ph:
1393 ; CHECK-NEXT:    [[N_VEC:%.*]] = and i64 [[TMP2]], 9223372036854775804
1394 ; CHECK-NEXT:    [[TMP3:%.*]] = shl nuw i64 [[N_VEC]], 1
1395 ; CHECK-NEXT:    [[IND_END:%.*]] = or i64 [[TMP3]], 3
1396 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
1397 ; CHECK:       vector.body:
1398 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
1399 ; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 3, i64 5, i64 7, i64 9>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
1400 ; CHECK-NEXT:    [[TMP4:%.*]] = shl i64 [[INDEX]], 1
1401 ; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = or i64 [[TMP4]], 3
1402 ; CHECK-NEXT:    [[TMP5:%.*]] = or i64 [[TMP4]], 5
1403 ; CHECK-NEXT:    [[TMP6:%.*]] = or i64 [[TMP4]], 7
1404 ; CHECK-NEXT:    [[TMP7:%.*]] = add i64 [[TMP4]], 9
1405 ; CHECK-NEXT:    [[TMP8:%.*]] = add <4 x i64> [[VEC_IND]], <i64 -1, i64 -1, i64 -1, i64 -1>
1406 ; CHECK-NEXT:    [[TMP9:%.*]] = add <4 x i64> [[VEC_IND]], <i64 -3, i64 -3, i64 -3, i64 -3>
1407 ; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[OFFSET_IDX]]
1408 ; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP5]]
1409 ; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP6]]
1410 ; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP7]]
1411 ; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <4 x i64> [[TMP8]], i32 0
1412 ; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP14]]
1413 ; CHECK-NEXT:    [[TMP16:%.*]] = extractelement <4 x i64> [[TMP8]], i32 1
1414 ; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP16]]
1415 ; CHECK-NEXT:    [[TMP18:%.*]] = extractelement <4 x i64> [[TMP8]], i32 2
1416 ; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP18]]
1417 ; CHECK-NEXT:    [[TMP20:%.*]] = extractelement <4 x i64> [[TMP8]], i32 3
1418 ; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP20]]
1419 ; CHECK-NEXT:    [[TMP22:%.*]] = extractelement <4 x i64> [[TMP9]], i32 0
1420 ; CHECK-NEXT:    [[TMP23:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP22]]
1421 ; CHECK-NEXT:    [[TMP24:%.*]] = extractelement <4 x i64> [[TMP9]], i32 1
1422 ; CHECK-NEXT:    [[TMP25:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP24]]
1423 ; CHECK-NEXT:    [[TMP26:%.*]] = extractelement <4 x i64> [[TMP9]], i32 2
1424 ; CHECK-NEXT:    [[TMP27:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP26]]
1425 ; CHECK-NEXT:    [[TMP28:%.*]] = extractelement <4 x i64> [[TMP9]], i32 3
1426 ; CHECK-NEXT:    [[TMP29:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP28]]
1427 ; CHECK-NEXT:    store i32 [[X:%.*]], i32* [[TMP15]], align 4
1428 ; CHECK-NEXT:    store i32 [[X]], i32* [[TMP17]], align 4
1429 ; CHECK-NEXT:    store i32 [[X]], i32* [[TMP19]], align 4
1430 ; CHECK-NEXT:    store i32 [[X]], i32* [[TMP21]], align 4
1431 ; CHECK-NEXT:    store i32 [[Y:%.*]], i32* [[TMP23]], align 4
1432 ; CHECK-NEXT:    store i32 [[Y]], i32* [[TMP25]], align 4
1433 ; CHECK-NEXT:    store i32 [[Y]], i32* [[TMP27]], align 4
1434 ; CHECK-NEXT:    store i32 [[Y]], i32* [[TMP29]], align 4
1435 ; CHECK-NEXT:    store i32 [[Z:%.*]], i32* [[TMP10]], align 4
1436 ; CHECK-NEXT:    store i32 [[Z]], i32* [[TMP11]], align 4
1437 ; CHECK-NEXT:    store i32 [[Z]], i32* [[TMP12]], align 4
1438 ; CHECK-NEXT:    store i32 [[Z]], i32* [[TMP13]], align 4
1439 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
1440 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], <i64 8, i64 8, i64 8, i64 8>
1441 ; CHECK-NEXT:    [[TMP30:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
1442 ; CHECK-NEXT:    br i1 [[TMP30]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP34:![0-9]+]]
1443 ; CHECK:       middle.block:
1444 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
1445 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
1446 ; CHECK:       scalar.ph:
1447 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 3, [[ENTRY:%.*]] ]
1448 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
1449 ; CHECK:       for.body:
1450 ; CHECK-NEXT:    [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
1451 ; CHECK-NEXT:    [[I_MINUS_1:%.*]] = add i64 [[I]], -1
1452 ; CHECK-NEXT:    [[I_MINUS_3:%.*]] = add i64 [[I]], -3
1453 ; CHECK-NEXT:    [[A_I:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[I]]
1454 ; CHECK-NEXT:    [[A_I_MINUS_1:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[I_MINUS_1]]
1455 ; CHECK-NEXT:    [[A_I_MINUS_3:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[I_MINUS_3]]
1456 ; CHECK-NEXT:    store i32 [[X]], i32* [[A_I_MINUS_1]], align 4
1457 ; CHECK-NEXT:    store i32 [[Y]], i32* [[A_I_MINUS_3]], align 4
1458 ; CHECK-NEXT:    store i32 [[Z]], i32* [[A_I]], align 4
1459 ; CHECK-NEXT:    [[I_NEXT]] = add nuw nsw i64 [[I]], 2
1460 ; CHECK-NEXT:    [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]]
1461 ; CHECK-NEXT:    br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP35:![0-9]+]]
1462 ; CHECK:       for.end:
1463 ; CHECK-NEXT:    ret void
1464 ;
1465 entry:
1466   br label %for.body
1467
1468 for.body:
1469   %i = phi i64 [ %i.next, %for.body ], [ 3, %entry ] ; induction variable: starts at 3, advances by 2
1470   %i_minus_1 = sub i64 %i, 1 ; i - 1
1471   %i_minus_3 = sub i64 %i_minus_1, 2 ; i - 3, derived from %i_minus_1
1472   %a_i = getelementptr inbounds i32, i32* %a, i64 %i ; &a[i]
1473   %a_i_minus_1 = getelementptr inbounds i32, i32* %a, i64 %i_minus_1 ; &a[i - 1]
1474   %a_i_minus_3 = getelementptr inbounds i32, i32* %a, i64 %i_minus_3 ; &a[i - 3]
1475   store i32 %x, i32* %a_i_minus_1, align 4 ; a[i - 1] = x
1476   store i32 %y, i32* %a_i_minus_3, align 4 ; a[i - 3] = y; same address as the previous iteration's a[i - 1] store
1477   store i32 %z, i32* %a_i, align 4 ; a[i] = z
1478   %i.next = add nuw nsw i64 %i, 2
1479   %cond = icmp slt i64 %i.next, %n ; signed compare: keep looping while i + 2 < n
1480   br i1 %cond, label %for.body, label %for.end
1481
1482 for.end:
1483   ret void
1484 }
1485
1486 ; PR34743: Ensure that a cast which needs to sink after a load that belongs to
1487 ; an interleaved group indeed gets sunk.
1488
1489 ; void PR34743(short *a, int *b, int n) {
1490 ;   for (int i = 0, iv = 0; iv < n; i++, iv += 2) {
1491 ;     b[i] = a[iv] * a[iv+1] * a[iv+2];
1492 ;   }
1493 ; }
1494
1495
1496 define void @PR34743(i16* %a, i32* %b, i64 %n) { ; Loop under test: b[i] = sext(prev) * sext(a[iv + 1]) * sext(a[iv + 2]), where prev is the previous iteration's a[iv + 2] load (a[0] on entry).
1497 ; CHECK-LABEL: @PR34743(
1498 ; CHECK-NEXT:  entry:
1499 ; CHECK-NEXT:    [[DOTPRE:%.*]] = load i16, i16* [[A:%.*]], align 2
1500 ; CHECK-NEXT:    [[TMP0:%.*]] = lshr i64 [[N:%.*]], 1
1501 ; CHECK-NEXT:    [[TMP1:%.*]] = add nuw i64 [[TMP0]], 1
1502 ; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 6
1503 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
1504 ; CHECK:       vector.memcheck:
1505 ; CHECK-NEXT:    [[TMP2:%.*]] = lshr i64 [[N]], 1
1506 ; CHECK-NEXT:    [[TMP3:%.*]] = add nuw i64 [[TMP2]], 1
1507 ; CHECK-NEXT:    [[SCEVGEP:%.*]] = getelementptr i32, i32* [[B:%.*]], i64 [[TMP3]]
1508 ; CHECK-NEXT:    [[SCEVGEP3:%.*]] = getelementptr i16, i16* [[A]], i64 1
1509 ; CHECK-NEXT:    [[TMP4:%.*]] = and i64 [[N]], -2
1510 ; CHECK-NEXT:    [[TMP5:%.*]] = add i64 [[TMP4]], 3
1511 ; CHECK-NEXT:    [[SCEVGEP5:%.*]] = getelementptr i16, i16* [[A]], i64 [[TMP5]]
1512 ; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i16* [[SCEVGEP5]] to i32*
1513 ; CHECK-NEXT:    [[BOUND0:%.*]] = icmp ugt i32* [[TMP6]], [[B]]
1514 ; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i32* [[SCEVGEP]] to i16*
1515 ; CHECK-NEXT:    [[BOUND1:%.*]] = icmp ult i16* [[SCEVGEP3]], [[TMP7]]
1516 ; CHECK-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
1517 ; CHECK-NEXT:    br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
1518 ; CHECK:       vector.ph:
1519 ; CHECK-NEXT:    [[N_VEC:%.*]] = and i64 [[TMP1]], -4
1520 ; CHECK-NEXT:    [[IND_END:%.*]] = shl i64 [[N_VEC]], 1
1521 ; CHECK-NEXT:    [[VECTOR_RECUR_INIT:%.*]] = insertelement <4 x i16> poison, i16 [[DOTPRE]], i32 3
1522 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
1523 ; CHECK:       vector.body:
1524 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
1525 ; CHECK-NEXT:    [[VECTOR_RECUR:%.*]] = phi <4 x i16> [ [[VECTOR_RECUR_INIT]], [[VECTOR_PH]] ], [ [[STRIDED_VEC8:%.*]], [[VECTOR_BODY]] ]
1526 ; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = shl i64 [[INDEX]], 1
1527 ; CHECK-NEXT:    [[TMP8:%.*]] = or i64 [[OFFSET_IDX]], 1
1528 ; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i16, i16* [[A]], i64 [[TMP8]]
1529 ; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i16* [[TMP9]] to <8 x i16>*
1530 ; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <8 x i16>, <8 x i16>* [[TMP10]], align 4
1531 ; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <8 x i16> [[WIDE_VEC]], <8 x i16> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
1532 ; CHECK-NEXT:    [[STRIDED_VEC8]] = shufflevector <8 x i16> [[WIDE_VEC]], <8 x i16> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
1533 ; CHECK-NEXT:    [[TMP11:%.*]] = sext <4 x i16> [[STRIDED_VEC]] to <4 x i32>
1534 ; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <4 x i16> [[VECTOR_RECUR]], <4 x i16> [[STRIDED_VEC8]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
1535 ; CHECK-NEXT:    [[TMP13:%.*]] = sext <4 x i16> [[TMP12]] to <4 x i32>
1536 ; CHECK-NEXT:    [[TMP14:%.*]] = sext <4 x i16> [[STRIDED_VEC8]] to <4 x i32>
1537 ; CHECK-NEXT:    [[TMP15:%.*]] = mul nsw <4 x i32> [[TMP13]], [[TMP11]]
1538 ; CHECK-NEXT:    [[TMP16:%.*]] = mul nsw <4 x i32> [[TMP15]], [[TMP14]]
1539 ; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDEX]]
1540 ; CHECK-NEXT:    [[TMP18:%.*]] = bitcast i32* [[TMP17]] to <4 x i32>*
1541 ; CHECK-NEXT:    store <4 x i32> [[TMP16]], <4 x i32>* [[TMP18]], align 4, !alias.scope !36, !noalias !39
1542 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
1543 ; CHECK-NEXT:    [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
1544 ; CHECK-NEXT:    br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP41:![0-9]+]]
1545 ; CHECK:       middle.block:
1546 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP1]], [[N_VEC]]
1547 ; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <8 x i16> [[WIDE_VEC]], i32 7
1548 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[END:%.*]], label [[SCALAR_PH]]
1549 ; CHECK:       scalar.ph:
1550 ; CHECK-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[DOTPRE]], [[VECTOR_MEMCHECK]] ], [ [[DOTPRE]], [[ENTRY:%.*]] ], [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ]
1551 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ENTRY]] ], [ [[IND_END]], [[MIDDLE_BLOCK]] ]
1552 ; CHECK-NEXT:    [[BC_RESUME_VAL7:%.*]] = phi i64 [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ENTRY]] ], [ [[N_VEC]], [[MIDDLE_BLOCK]] ]
1553 ; CHECK-NEXT:    br label [[LOOP:%.*]]
1554 ; CHECK:       loop:
1555 ; CHECK-NEXT:    [[SCALAR_RECUR:%.*]] = phi i16 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[LOAD2:%.*]], [[LOOP]] ]
1556 ; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV2:%.*]], [[LOOP]] ]
1557 ; CHECK-NEXT:    [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL7]], [[SCALAR_PH]] ], [ [[I1:%.*]], [[LOOP]] ]
1558 ; CHECK-NEXT:    [[CONV:%.*]] = sext i16 [[SCALAR_RECUR]] to i32
1559 ; CHECK-NEXT:    [[I1]] = add nuw nsw i64 [[I]], 1
1560 ; CHECK-NEXT:    [[IV1:%.*]] = or i64 [[IV]], 1
1561 ; CHECK-NEXT:    [[IV2]] = add nuw nsw i64 [[IV]], 2
1562 ; CHECK-NEXT:    [[GEP1:%.*]] = getelementptr inbounds i16, i16* [[A]], i64 [[IV1]]
1563 ; CHECK-NEXT:    [[LOAD1:%.*]] = load i16, i16* [[GEP1]], align 4
1564 ; CHECK-NEXT:    [[CONV1:%.*]] = sext i16 [[LOAD1]] to i32
1565 ; CHECK-NEXT:    [[GEP2:%.*]] = getelementptr inbounds i16, i16* [[A]], i64 [[IV2]]
1566 ; CHECK-NEXT:    [[LOAD2]] = load i16, i16* [[GEP2]], align 4
1567 ; CHECK-NEXT:    [[CONV2:%.*]] = sext i16 [[LOAD2]] to i32
1568 ; CHECK-NEXT:    [[MUL01:%.*]] = mul nsw i32 [[CONV]], [[CONV1]]
1569 ; CHECK-NEXT:    [[MUL012:%.*]] = mul nsw i32 [[MUL01]], [[CONV2]]
1570 ; CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[I]]
1571 ; CHECK-NEXT:    store i32 [[MUL012]], i32* [[ARRAYIDX5]], align 4
1572 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV]], [[N]]
1573 ; CHECK-NEXT:    br i1 [[EXITCOND]], label [[END]], label [[LOOP]], !llvm.loop [[LOOP42:![0-9]+]]
1574 ; CHECK:       end:
1575 ; CHECK-NEXT:    ret void
1576 ;
1577 entry:
1578   %.pre = load i16, i16* %a ; a[0], loaded once before the loop; seeds the recurrence phi %0
1579   br label %loop
1580
1581 loop:
1582   %0 = phi i16 [ %.pre, %entry ], [ %load2, %loop ] ; recurrence: %.pre on entry, afterwards the previous iteration's %load2
1583   %iv = phi i64 [ 0, %entry ], [ %iv2, %loop ] ; index into a, advances by 2
1584   %i = phi i64 [ 0, %entry ], [ %i1, %loop ] ; index into b, advances by 1
1585   %conv = sext i16 %0 to i32 ; the cast of the recurrence value that the header comment refers to
1586   %i1 = add nuw nsw i64 %i, 1
1587   %iv1 = add nuw nsw i64 %iv, 1
1588   %iv2 = add nuw nsw i64 %iv, 2
1589   %gep1 = getelementptr inbounds i16, i16* %a, i64 %iv1 ; &a[iv + 1]
1590   %load1 = load i16, i16* %gep1, align 4 ; a[iv + 1]
1591   %conv1 = sext i16 %load1 to i32
1592   %gep2 = getelementptr inbounds i16, i16* %a, i64 %iv2 ; &a[iv + 2]
1593   %load2 = load i16, i16* %gep2, align 4 ; a[iv + 2]; also becomes %0 in the next iteration
1594   %conv2 = sext i16 %load2 to i32
1595   %mul01 = mul nsw i32 %conv, %conv1
1596   %mul012 = mul nsw i32 %mul01, %conv2 ; product of the three sign-extended values
1597   %arrayidx5 = getelementptr inbounds i32, i32* %b, i64 %i ; &b[i]
1598   store i32 %mul012, i32* %arrayidx5 ; b[i] = product
1599   %exitcond = icmp eq i64 %iv, %n ; exit test uses the current %iv (not %iv2), evaluated after the body
1600   br i1 %exitcond, label %end, label %loop
1601
1602 end:
1603   ret void
1604 }
1605
1606 attributes #0 = { "unsafe-fp-math"="true" } ; NOTE(review): no function in this part of the file references #0 - confirm it is still used by an earlier test