test/Transforms/LoopVectorize/interleaved-accesses.ll

   1 ; RUN: opt -S -loop-vectorize -instcombine -force-vector-width=4 -force-vector-interleave=1 -enable-interleaved-mem-accesses=true -runtime-memory-check-threshold=24 < %s | FileCheck %s
   2
   3 target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
   4
   5 ; Check vectorization on an interleaved load group of factor 2 and an interleaved
   6 ; store group of factor 2.
   7
   8 ; int AB[1024];
   9 ; int CD[1024];
  10 ;  void test_array_load2_store2(int C, int D) {
  11 ;   for (int i = 0; i < 1024; i+=2) {
  12 ;     int A = AB[i];
  13 ;     int B = AB[i+1];
  14 ;     CD[i] = A + C;
  15 ;     CD[i+1] = B * D;
  16 ;   }
  17 ; }
  18
  19 ; CHECK-LABEL: @test_array_load2_store2(
  20 ; CHECK: %wide.vec = load <8 x i32>, <8 x i32>* %{{.*}}, align 4
  21 ; CHECK: shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  22 ; CHECK: shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  23 ; CHECK: add nsw <4 x i32>
  24 ; CHECK: mul nsw <4 x i32>
  25 ; CHECK: %interleaved.vec = shufflevector <4 x i32> {{.*}}, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
  26 ; CHECK: store <8 x i32> %interleaved.vec, <8 x i32>* %{{.*}}, align 4
  27
  28 @AB = common global [1024 x i32] zeroinitializer, align 4
  29 @CD = common global [1024 x i32] zeroinitializer, align 4
  30
  31 define void @test_array_load2_store2(i32 %C, i32 %D) { ; per iteration: CD[i] = AB[i] + C and CD[i+1] = AB[i+1] * D
  32 entry:
  33   br label %for.body
  34
  35 for.body:                                         ; preds = %for.body, %entry
  36   %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
  37   %arrayidx0 = getelementptr inbounds [1024 x i32], [1024 x i32]* @AB, i64 0, i64 %indvars.iv
  38   %tmp = load i32, i32* %arrayidx0, align 4
  39   %tmp1 = or i64 %indvars.iv, 1 ; index i+1: %indvars.iv starts at 0 and steps by 2, so setting bit 0 adds 1
  40   %arrayidx1 = getelementptr inbounds [1024 x i32], [1024 x i32]* @AB, i64 0, i64 %tmp1
  41   %tmp2 = load i32, i32* %arrayidx1, align 4
  42   %add = add nsw i32 %tmp, %C
  43   %mul = mul nsw i32 %tmp2, %D
  44   %arrayidx2 = getelementptr inbounds [1024 x i32], [1024 x i32]* @CD, i64 0, i64 %indvars.iv
  45   store i32 %add, i32* %arrayidx2, align 4
  46   %arrayidx3 = getelementptr inbounds [1024 x i32], [1024 x i32]* @CD, i64 0, i64 %tmp1
  47   store i32 %mul, i32* %arrayidx3, align 4
  48   %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2
  49   %cmp = icmp slt i64 %indvars.iv.next, 1024 ; keep looping while the next even index is below 1024
  50   br i1 %cmp, label %for.body, label %for.end
  51
  52 for.end:                                          ; preds = %for.body
  53   ret void
  54 }
  55
  56 ; int A[3072];
  57 ; struct ST3 S[1024];
  58 ; void test_struct_array_load3_store3() {
  59 ;   int *ptr = A;
  60 ;   for (int i = 0; i < 1024; i++) {
  61 ;     int X1 = *ptr++;
  62 ;     int X2 = *ptr++;
  63 ;     int X3 = *ptr++;
  64 ;     S[i].x = X1 + 1;
  65 ;     S[i].y = X2 + 2;
  66 ;     S[i].z = X3 + 3;
  67 ;   }
  68 ; }
  69
  70 ; CHECK-LABEL: @test_struct_array_load3_store3(
  71 ; CHECK: %wide.vec = load <12 x i32>, <12 x i32>* {{.*}}, align 4
  72 ; CHECK: shufflevector <12 x i32> %wide.vec, <12 x i32> undef, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
  73 ; CHECK: shufflevector <12 x i32> %wide.vec, <12 x i32> undef, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
  74 ; CHECK: shufflevector <12 x i32> %wide.vec, <12 x i32> undef, <4 x i32> <i32 2, i32 5, i32 8, i32 11>
  75 ; CHECK: add nsw <4 x i32> {{.*}}, <i32 1, i32 1, i32 1, i32 1>
  76 ; CHECK: add nsw <4 x i32> {{.*}}, <i32 2, i32 2, i32 2, i32 2>
  77 ; CHECK: add nsw <4 x i32> {{.*}}, <i32 3, i32 3, i32 3, i32 3>
  78 ; CHECK: shufflevector <4 x i32> {{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  79 ; CHECK: shufflevector <4 x i32> {{.*}}, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
  80 ; CHECK: %interleaved.vec = shufflevector <8 x i32> {{.*}}, <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
  81 ; CHECK: store <12 x i32> %interleaved.vec, <12 x i32>* {{.*}}, align 4
  82
  83 %struct.ST3 = type { i32, i32, i32 }
  84 @A = common global [3072 x i32] zeroinitializer, align 4
  85 @S = common global [1024 x %struct.ST3] zeroinitializer, align 4
  86
  87 define void @test_struct_array_load3_store3() { ; per iteration: read three consecutive i32 from @A, store them plus 1/2/3 into fields 0/1/2 of @S[i]
  88 entry:
  89   br label %for.body
  90
  91 for.body:                                         ; preds = %for.body, %entry
  92   %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
  93   %ptr.016 = phi i32* [ getelementptr inbounds ([3072 x i32], [3072 x i32]* @A, i64 0, i64 0), %entry ], [ %incdec.ptr2, %for.body ]
  94   %incdec.ptr = getelementptr inbounds i32, i32* %ptr.016, i64 1
  95   %tmp = load i32, i32* %ptr.016, align 4
  96   %incdec.ptr1 = getelementptr inbounds i32, i32* %ptr.016, i64 2
  97   %tmp1 = load i32, i32* %incdec.ptr, align 4
  98   %incdec.ptr2 = getelementptr inbounds i32, i32* %ptr.016, i64 3 ; next iteration's %ptr.016: the read pointer advances by 3 elements
  99   %tmp2 = load i32, i32* %incdec.ptr1, align 4
 100   %add = add nsw i32 %tmp, 1
 101   %x = getelementptr inbounds [1024 x %struct.ST3], [1024 x %struct.ST3]* @S, i64 0, i64 %indvars.iv, i32 0
 102   store i32 %add, i32* %x, align 4
 103   %add3 = add nsw i32 %tmp1, 2
 104   %y = getelementptr inbounds [1024 x %struct.ST3], [1024 x %struct.ST3]* @S, i64 0, i64 %indvars.iv, i32 1
 105   store i32 %add3, i32* %y, align 4
 106   %add6 = add nsw i32 %tmp2, 3
 107   %z = getelementptr inbounds [1024 x %struct.ST3], [1024 x %struct.ST3]* @S, i64 0, i64 %indvars.iv, i32 2
 108   store i32 %add6, i32* %z, align 4
 109   %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
 110   %exitcond = icmp eq i64 %indvars.iv.next, 1024
 111   br i1 %exitcond, label %for.end, label %for.body
 112
 113 for.end:                                          ; preds = %for.body
 114   ret void
 115 }
 116
 117 ; Check vectorization on an interleaved load group of factor 4.
 118
 119 ; struct ST4{
 120 ;   int x;
 121 ;   int y;
 122 ;   int z;
 123 ;   int w;
 124 ; };
 125 ; int test_struct_load4(struct ST4 *S) {
 126 ;   int r = 0;
 127 ;   for (int i = 0; i < 1024; i++) {
 128 ;      r += S[i].x;
 129 ;      r -= S[i].y;
 130 ;      r += S[i].z;
 131 ;      r -= S[i].w;
 132 ;   }
 133 ;   return r;
 134 ; }
 135
 136 ; CHECK-LABEL: @test_struct_load4(
 137 ; CHECK: %wide.vec = load <16 x i32>, <16 x i32>* {{.*}}, align 4
 138 ; CHECK: shufflevector <16 x i32> %wide.vec, <16 x i32> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
 139 ; CHECK: shufflevector <16 x i32> %wide.vec, <16 x i32> undef, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
 140 ; CHECK: shufflevector <16 x i32> %wide.vec, <16 x i32> undef, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
 141 ; CHECK: shufflevector <16 x i32> %wide.vec, <16 x i32> undef, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
 142 ; CHECK: add nsw <4 x i32>
 143 ; CHECK: sub <4 x i32>
 144 ; CHECK: add nsw <4 x i32>
 145 ; CHECK: sub <4 x i32>
 146
 147 %struct.ST4 = type { i32, i32, i32, i32 }
 148
 149 define i32 @test_struct_load4(%struct.ST4* nocapture readonly %S) { ; returns the running value r after r += S[i].x; r -= S[i].y; r += S[i].z; r -= S[i].w for i in [0, 1024)
 150 entry:
 151   br label %for.body
 152
 153 for.body:                                         ; preds = %for.body, %entry
 154   %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
 155   %r.022 = phi i32 [ 0, %entry ], [ %sub8, %for.body ] ; running value r: 0 on entry, %sub8 from the previous iteration afterwards
 156   %x = getelementptr inbounds %struct.ST4, %struct.ST4* %S, i64 %indvars.iv, i32 0
 157   %tmp = load i32, i32* %x, align 4
 158   %add = add nsw i32 %tmp, %r.022
 159   %y = getelementptr inbounds %struct.ST4, %struct.ST4* %S, i64 %indvars.iv, i32 1
 160   %tmp1 = load i32, i32* %y, align 4
 161   %sub = sub i32 %add, %tmp1
 162   %z = getelementptr inbounds %struct.ST4, %struct.ST4* %S, i64 %indvars.iv, i32 2
 163   %tmp2 = load i32, i32* %z, align 4
 164   %add5 = add nsw i32 %sub, %tmp2
 165   %w = getelementptr inbounds %struct.ST4, %struct.ST4* %S, i64 %indvars.iv, i32 3
 166   %tmp3 = load i32, i32* %w, align 4
 167   %sub8 = sub i32 %add5, %tmp3
 168   %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
 169   %exitcond = icmp eq i64 %indvars.iv.next, 1024
 170   br i1 %exitcond, label %for.end, label %for.body
 171
 172 for.end:                                          ; preds = %for.body
 173   ret i32 %sub8
 174 }
 175
 176 ; Check vectorization on an interleaved store group of factor 4.
 177
 178 ; void test_struct_store4(int *A, struct ST4 *B) {
 179 ;   int *ptr = A;
 180 ;   for (int i = 0; i < 1024; i++) {
 181 ;     int X = *ptr++;
 182 ;     B[i].x = X + 1;
 183 ;     B[i].y = X * 2;
 184 ;     B[i].z = X + 3;
 185 ;     B[i].w = X + 4;
 186 ;   }
 187 ; }
 188
 189 ; CHECK-LABEL: @test_struct_store4(
 190 ; CHECK: %[[LD:.*]] = load <4 x i32>, <4 x i32>*
 191 ; CHECK: add nsw <4 x i32> %[[LD]], <i32 1, i32 1, i32 1, i32 1>
 192 ; CHECK: shl nsw <4 x i32> %[[LD]], <i32 1, i32 1, i32 1, i32 1>
 193 ; CHECK: add nsw <4 x i32> %[[LD]], <i32 3, i32 3, i32 3, i32 3>
 194 ; CHECK: add nsw <4 x i32> %[[LD]], <i32 4, i32 4, i32 4, i32 4>
 195 ; CHECK: shufflevector <4 x i32> {{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 196 ; CHECK: shufflevector <4 x i32> {{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 197 ; CHECK: %interleaved.vec = shufflevector <8 x i32> {{.*}}, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
 198 ; CHECK: store <16 x i32> %interleaved.vec, <16 x i32>* {{.*}}, align 4
 199
 200 define void @test_struct_store4(i32* noalias nocapture readonly %A, %struct.ST4* noalias nocapture %B) { ; per iteration: X = *ptr++; B[i] = { X + 1, X << 1, X + 3, X + 4 }
 201 entry:
 202   br label %for.body
 203
 204 for.cond.cleanup:                                 ; preds = %for.body
 205   ret void
 206
 207 for.body:                                         ; preds = %for.body, %entry
 208   %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
 209   %ptr.024 = phi i32* [ %A, %entry ], [ %incdec.ptr, %for.body ]
 210   %incdec.ptr = getelementptr inbounds i32, i32* %ptr.024, i64 1 ; read pointer advances by one element per iteration
 211   %tmp = load i32, i32* %ptr.024, align 4
 212   %add = add nsw i32 %tmp, 1
 213   %x = getelementptr inbounds %struct.ST4, %struct.ST4* %B, i64 %indvars.iv, i32 0
 214   store i32 %add, i32* %x, align 4
 215   %mul = shl nsw i32 %tmp, 1 ; the X * 2 of the C source, already expressed as a shift
 216   %y = getelementptr inbounds %struct.ST4, %struct.ST4* %B, i64 %indvars.iv, i32 1
 217   store i32 %mul, i32* %y, align 4
 218   %add3 = add nsw i32 %tmp, 3
 219   %z = getelementptr inbounds %struct.ST4, %struct.ST4* %B, i64 %indvars.iv, i32 2
 220   store i32 %add3, i32* %z, align 4
 221   %add6 = add nsw i32 %tmp, 4
 222   %w = getelementptr inbounds %struct.ST4, %struct.ST4* %B, i64 %indvars.iv, i32 3
 223   store i32 %add6, i32* %w, align 4
 224   %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
 225   %exitcond = icmp eq i64 %indvars.iv.next, 1024
 226   br i1 %exitcond, label %for.cond.cleanup, label %for.body
 227 }
 228
 229 ; Check vectorization on a reverse interleaved load group of factor 2 and
 230 ; a reverse interleaved store group of factor 2.
 231
 232 ; struct ST2 {
 233 ;  int x;
 234 ;  int y;
 235 ; };
 236 ;
 237 ; void test_reversed_load2_store2(struct ST2 *A, struct ST2 *B) {
 238 ;   for (int i = 1023; i >= 0; i--) {
 239 ;     int a = A[i].x + i;  // interleaved load of index 0
 240 ;     int b = A[i].y - i;  // interleaved load of index 1
 241 ;     B[i].x = a;          // interleaved store of index 0
 242 ;     B[i].y = b;          // interleaved store of index 1
 243 ;   }
 244 ; }
 245
 246 ; CHECK-LABEL: @test_reversed_load2_store2(
 247 ; CHECK: %[[G0:.+]] = getelementptr inbounds %struct.ST2, %struct.ST2* %A, i64 %offset.idx, i32 0
 248 ; CHECK: %[[G1:.+]] = getelementptr inbounds i32, i32* %[[G0]], i64 -6
 249 ; CHECK: %[[B0:.+]] = bitcast i32* %[[G1]] to <8 x i32>*
 250 ; CHECK: %wide.vec = load <8 x i32>, <8 x i32>* %[[B0]], align 4
 251 ; CHECK: shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
 252 ; CHECK: shufflevector <4 x i32> {{.*}}, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
 253 ; CHECK: shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
 254 ; CHECK: shufflevector <4 x i32> {{.*}}, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
 255 ; CHECK: add nsw <4 x i32>
 256 ; CHECK: sub nsw <4 x i32>
 257 ; CHECK: %[[G2:.+]] = getelementptr inbounds %struct.ST2, %struct.ST2* %B, i64 %offset.idx, i32 1
 258 ; CHECK: %[[G3:.+]] = getelementptr inbounds i32, i32* %[[G2]], i64 -7
 259 ; CHECK: %[[B1:.+]] = bitcast i32* %[[G3]] to <8 x i32>*
 260 ; CHECK: shufflevector <4 x i32> {{.*}}, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
 261 ; CHECK: shufflevector <4 x i32> {{.*}}, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
 262 ; CHECK: %interleaved.vec = shufflevector <4 x i32> {{.*}}, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
 263 ; CHECK: store <8 x i32> %interleaved.vec, <8 x i32>* %[[B1]], align 4
 264
 265 %struct.ST2 = type { i32, i32 }
 266
 267 define void @test_reversed_load2_store2(%struct.ST2* noalias nocapture readonly %A, %struct.ST2* noalias nocapture %B) { ; for i = 1023 down to 0: B[i].x = A[i].x + i; B[i].y = A[i].y - i
 268 entry:
 269   br label %for.body
 270
 271 for.cond.cleanup:                                 ; preds = %for.body
 272   ret void
 273
 274 for.body:                                         ; preds = %for.body, %entry
 275   %indvars.iv = phi i64 [ 1023, %entry ], [ %indvars.iv.next, %for.body ]
 276   %x = getelementptr inbounds %struct.ST2, %struct.ST2* %A, i64 %indvars.iv, i32 0
 277   %tmp = load i32, i32* %x, align 4
 278   %tmp1 = trunc i64 %indvars.iv to i32 ; i as a 32-bit value for the add/sub below
 279   %add = add nsw i32 %tmp, %tmp1
 280   %y = getelementptr inbounds %struct.ST2, %struct.ST2* %A, i64 %indvars.iv, i32 1
 281   %tmp2 = load i32, i32* %y, align 4
 282   %sub = sub nsw i32 %tmp2, %tmp1
 283   %x5 = getelementptr inbounds %struct.ST2, %struct.ST2* %B, i64 %indvars.iv, i32 0
 284   store i32 %add, i32* %x5, align 4
 285   %y8 = getelementptr inbounds %struct.ST2, %struct.ST2* %B, i64 %indvars.iv, i32 1
 286   store i32 %sub, i32* %y8, align 4
 287   %indvars.iv.next = add nsw i64 %indvars.iv, -1
 288   %cmp = icmp sgt i64 %indvars.iv, 0 ; tests the pre-decrement value, so the body also runs for i == 0
 289   br i1 %cmp, label %for.body, label %for.cond.cleanup
 290 }
 291
 292 ; Check vectorization on an interleaved load group of factor 2 with 1 gap
 293 ; (missing the load of odd elements). Because the vectorized loop would
 294 ; speculatively access memory out-of-bounds, we must execute at least one
 295 ; iteration of the scalar loop.
 296
 297 ; void even_load_static_tc(int *A, int *B) {
 298 ;  for (unsigned i = 0; i < 1024; i+=2)
 299 ;     B[i/2] = A[i] * 2;
 300 ; }
 301
 302 ; CHECK-LABEL: @even_load_static_tc(
 303 ; CHECK: vector.body:
 304 ; CHECK:   %wide.vec = load <8 x i32>, <8 x i32>* %{{.*}}, align 4
 305 ; CHECK:   %strided.vec = shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
 306 ; CHECK:   icmp eq i64 %index.next, 508
 307 ; CHECK: middle.block:
 308 ; CHECK:   br i1 false, label %for.cond.cleanup, label %scalar.ph
 309
 310 define void @even_load_static_tc(i32* noalias nocapture readonly %A, i32* noalias nocapture %B) { ; B[i/2] = A[i] << 1 for even i in [0, 1024); odd elements of A are never read
 311 entry:
 312   br label %for.body
 313
 314 for.cond.cleanup:                                 ; preds = %for.body
 315   ret void
 316
 317 for.body:                                         ; preds = %for.body, %entry
 318   %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
 319   %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
 320   %tmp = load i32, i32* %arrayidx, align 4
 321   %mul = shl nsw i32 %tmp, 1
 322   %tmp1 = lshr exact i64 %indvars.iv, 1 ; i/2, the store index into %B
 323   %arrayidx2 = getelementptr inbounds i32, i32* %B, i64 %tmp1
 324   store i32 %mul, i32* %arrayidx2, align 4
 325   %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2
 326   %cmp = icmp ult i64 %indvars.iv.next, 1024 ; constant bound: the trip count is known statically
 327   br i1 %cmp, label %for.body, label %for.cond.cleanup
 328 }
 329
 330 ; Check vectorization on an interleaved load group of factor 2 with 1 gap
 331 ; (missing the load of odd elements). Because the vectorized loop would
 332 ; speculatively access memory out-of-bounds, we must execute at least one
 333 ; iteration of the scalar loop.
 334
 335 ; void even_load_dynamic_tc(int *A, int *B, unsigned N) {
 336 ;  for (unsigned i = 0; i < N; i+=2)
 337 ;     B[i/2] = A[i] * 2;
 338 ; }
 339
 340 ; CHECK-LABEL: @even_load_dynamic_tc(
 341 ; CHECK: vector.ph:
 342 ; CHECK:   %n.mod.vf = and i64 %[[N:[a-zA-Z0-9]+]], 3
 343 ; CHECK:   %[[IsZero:[a-zA-Z0-9]+]] = icmp eq i64 %n.mod.vf, 0
 344 ; CHECK:   %[[R:[a-zA-Z0-9]+]] = select i1 %[[IsZero]], i64 4, i64 %n.mod.vf
 345 ; CHECK:   %n.vec = sub i64 %[[N]], %[[R]]
 346 ; CHECK: vector.body:
 347 ; CHECK:   %wide.vec = load <8 x i32>, <8 x i32>* %{{.*}}, align 4
 348 ; CHECK:   %strided.vec = shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
 349 ; CHECK:   icmp eq i64 %index.next, %n.vec
 350 ; CHECK: middle.block:
 351 ; CHECK:   br i1 false, label %for.cond.cleanup, label %scalar.ph
 352
 353 define void @even_load_dynamic_tc(i32* noalias nocapture readonly %A, i32* noalias nocapture %B, i64 %N) { ; B[i/2] = A[i] << 1 for even i, looping while the next index is below %N
 354 entry:
 355   br label %for.body
 356
 357 for.cond.cleanup:                                 ; preds = %for.body
 358   ret void
 359
 360 for.body:                                         ; preds = %for.body, %entry
 361   %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
 362   %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
 363   %tmp = load i32, i32* %arrayidx, align 4
 364   %mul = shl nsw i32 %tmp, 1
 365   %tmp1 = lshr exact i64 %indvars.iv, 1 ; i/2, the store index into %B
 366   %arrayidx2 = getelementptr inbounds i32, i32* %B, i64 %tmp1
 367   store i32 %mul, i32* %arrayidx2, align 4
 368   %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2
 369   %cmp = icmp ult i64 %indvars.iv.next, %N ; bound is the runtime argument %N, unlike the constant 1024 in @even_load_static_tc
 370   br i1 %cmp, label %for.body, label %for.cond.cleanup
 371 }
 372
 373 ; Check vectorization on a reverse interleaved load group of factor 2 with 1
 374 ; gap and a reverse interleaved store group of factor 2. The interleaved load
 375 ; group should be removed since it has a gap and is reverse.
 376
 377 ; struct pair {
 378 ;  int x;
 379 ;  int y;
 380 ; };
 381 ;
 382 ; void load_gap_reverse(struct pair *P1, struct pair *P2, int X) {
 383 ;   for (int i = 1023; i >= 0; i--) {
 384 ;     int a = X + i;
 385 ;     int b = P2[i].y - i;
 386 ;     P1[i].x = a;
 387 ;     P2[i].y = b;
 388 ;   }
 389 ; }
 390
 391 ; CHECK-LABEL: @load_gap_reverse(
 392 ; CHECK-NOT: %wide.vec = load <8 x i64>, <8 x i64>* %{{.*}}, align 8
 393 ; CHECK-NOT: %strided.vec = shufflevector <8 x i64> %wide.vec, <8 x i64> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
 394
 395 %pair = type { i64, i64 }
 396 define void @load_gap_reverse(%pair* noalias nocapture readonly %P1, %pair* noalias nocapture readonly %P2, i64 %X) { ; NOTE(review): %P1 and %P2 are both marked readonly yet both are stored through below - confirm this is intended
 397 entry:
 398   br label %for.body
 399
 400 for.body:
 401   %i = phi i64 [ 1023, %entry ], [ %i.next, %for.body ]
 402   %0 = add nsw i64 %X, %i
 403   %1 = getelementptr inbounds %pair, %pair* %P1, i64 %i, i32 0 ; address of field 0 of P1[i]; only stored to
 404   %2 = getelementptr inbounds %pair, %pair* %P2, i64 %i, i32 1 ; address of field 1 of P2[i]; loaded, then stored to
 405   %3 = load i64, i64* %2, align 8
 406   %4 = sub nsw i64 %3, %i
 407   store i64 %0, i64* %1, align 8
 408   store i64 %4, i64* %2, align 8
 409   %i.next = add nsw i64 %i, -1
 410   %cond = icmp sgt i64 %i, 0 ; tests the pre-decrement value, so the body also runs for i == 0
 411   br i1 %cond, label %for.body, label %for.exit
 412
 413 for.exit:
 414   ret void
 415 }
 416
 417 ; Check vectorization on interleaved access groups identified from mixed
 418 ; loads/stores.
 419 ; void mixed_load2_store2(int *A, int *B) {
 420 ;   for (unsigned i = 0; i < 1024; i+=2)  {
 421 ;     B[i] = A[i] * A[i+1];
 422 ;     B[i+1] = A[i] + A[i+1];
 423 ;   }
 424 ; }
 425
 426 ; CHECK-LABEL: @mixed_load2_store2(
 427 ; CHECK: %wide.vec = load <8 x i32>, <8 x i32>* {{.*}}, align 4
 428 ; CHECK: shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
 429 ; CHECK: shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
 430 ; CHECK: %interleaved.vec = shufflevector <4 x i32> %{{.*}}, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
 431 ; CHECK: store <8 x i32> %interleaved.vec
 432
 433 define void @mixed_load2_store2(i32* noalias nocapture readonly %A, i32* noalias nocapture %B) { ; per iteration: B[i] = A[i] * A[i+1]; B[i+1] = A[i] + A[i+1], with i stepping by 2
 434 entry:
 435   br label %for.body
 436
 437 for.cond.cleanup:                                 ; preds = %for.body
 438   ret void
 439
 440 for.body:                                         ; preds = %for.body, %entry
 441   %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
 442   %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
 443   %tmp = load i32, i32* %arrayidx, align 4
 444   %tmp1 = or i64 %indvars.iv, 1 ; index i+1: %indvars.iv starts at 0 and steps by 2, so setting bit 0 adds 1
 445   %arrayidx2 = getelementptr inbounds i32, i32* %A, i64 %tmp1
 446   %tmp2 = load i32, i32* %arrayidx2, align 4
 447   %mul = mul nsw i32 %tmp2, %tmp
 448   %arrayidx4 = getelementptr inbounds i32, i32* %B, i64 %indvars.iv
 449   store i32 %mul, i32* %arrayidx4, align 4
 450   %tmp3 = load i32, i32* %arrayidx, align 4 ; A[i] is loaded a second time, after the store to B[i]
 451   %tmp4 = load i32, i32* %arrayidx2, align 4 ; A[i+1] is loaded a second time as well
 452   %add10 = add nsw i32 %tmp4, %tmp3
 453   %arrayidx13 = getelementptr inbounds i32, i32* %B, i64 %tmp1
 454   store i32 %add10, i32* %arrayidx13, align 4
 455   %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2
 456   %cmp = icmp ult i64 %indvars.iv.next, 1024
 457   br i1 %cmp, label %for.body, label %for.cond.cleanup
 458 }
 459
 460 ; Check vectorization on interleaved access groups identified from mixed
 461 ; loads/stores.
 462 ; void mixed_load3_store3(int *A) {
 463 ;   for (unsigned i = 0; i < 1024; i++)  {
 464 ;     *A++ += i;
 465 ;     *A++ += i;
 466 ;     *A++ += i;
 467 ;   }
 468 ; }
 469
 470 ; CHECK-LABEL: @mixed_load3_store3(
 471 ; CHECK: %wide.vec = load <12 x i32>, <12 x i32>* {{.*}}, align 4
 472 ; CHECK: shufflevector <12 x i32> %wide.vec, <12 x i32> undef, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
 473 ; CHECK: shufflevector <12 x i32> %wide.vec, <12 x i32> undef, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
 474 ; CHECK: shufflevector <12 x i32> %wide.vec, <12 x i32> undef, <4 x i32> <i32 2, i32 5, i32 8, i32 11>
 475 ; CHECK: %interleaved.vec = shufflevector <8 x i32> %{{.*}}, <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
 476 ; CHECK: store <12 x i32> %interleaved.vec, <12 x i32>* %{{.*}}, align 4
 477
 478 define void @mixed_load3_store3(i32* nocapture %A) { ; per iteration: add the loop counter i to three consecutive elements of A in place, then advance by 3 elements
 479 entry:
 480   br label %for.body
 481
 482 for.cond.cleanup:                                 ; preds = %for.body
 483   ret void
 484
 485 for.body:                                         ; preds = %for.body, %entry
 486   %i.013 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
 487   %A.addr.012 = phi i32* [ %A, %entry ], [ %incdec.ptr3, %for.body ]
 488   %incdec.ptr = getelementptr inbounds i32, i32* %A.addr.012, i64 1
 489   %tmp = load i32, i32* %A.addr.012, align 4
 490   %add = add i32 %tmp, %i.013
 491   store i32 %add, i32* %A.addr.012, align 4 ; written back to the same address it was loaded from
 492   %incdec.ptr1 = getelementptr inbounds i32, i32* %A.addr.012, i64 2
 493   %tmp1 = load i32, i32* %incdec.ptr, align 4
 494   %add2 = add i32 %tmp1, %i.013
 495   store i32 %add2, i32* %incdec.ptr, align 4
 496   %incdec.ptr3 = getelementptr inbounds i32, i32* %A.addr.012, i64 3 ; next iteration's %A.addr.012
 497   %tmp2 = load i32, i32* %incdec.ptr1, align 4
 498   %add4 = add i32 %tmp2, %i.013
 499   store i32 %add4, i32* %incdec.ptr1, align 4
 500   %inc = add nuw nsw i32 %i.013, 1
 501   %exitcond = icmp eq i32 %inc, 1024
 502   br i1 %exitcond, label %for.cond.cleanup, label %for.body
 503 }
 504
 505 ; Check vectorization on interleaved access groups with members having different
 506 ; kinds of type.
 507
 508 ; struct IntFloat {
 509 ;   int a;
 510 ;   float b;
 511 ; };
 512 ;
 513 ; int SA;
 514 ; float SB;
 515 ;
 516 ; void int_float_struct(struct IntFloat *A) {
 517 ;   int SumA;
 518 ;   float SumB;
 519 ;   for (unsigned i = 0; i < 1024; i++)  {
 520 ;     SumA += A[i].a;
 521 ;     SumB += A[i].b;
 522 ;   }
 523 ;   SA = SumA;
 524 ;   SB = SumB;
 525 ; }
 526
 527 ; CHECK-LABEL: @int_float_struct(
 528 ; CHECK: %wide.vec = load <8 x i32>, <8 x i32>* %{{.*}}, align 4
 529 ; CHECK: %[[V0:.*]] = shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
 530 ; CHECK: %[[V1:.*]] = shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
 531 ; CHECK: bitcast <4 x i32> %[[V1]] to <4 x float>
 532 ; CHECK: add nsw <4 x i32>
 533 ; CHECK: fadd fast <4 x float>
 534
 535 %struct.IntFloat = type { i32, float }
 536
 537 @SA = common global i32 0, align 4
 538 @SB = common global float 0.000000e+00, align 4
 539
 540 define void @int_float_struct(%struct.IntFloat* nocapture readonly %A) #0 { ; accumulates the i32 field and the float field of A[0..1023] separately, then stores the totals to @SA and @SB
 541 entry:
 542   br label %for.body
 543
 544 for.cond.cleanup:                                 ; preds = %for.body
 545   store i32 %add, i32* @SA, align 4
 546   store float %add3, float* @SB, align 4
 547   ret void
 548
 549 for.body:                                         ; preds = %for.body, %entry
 550   %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
 551   %SumB.014 = phi float [ undef, %entry ], [ %add3, %for.body ] ; float accumulator; its entry value is undef (SumB has no initializer in the C sketch)
 552   %SumA.013 = phi i32 [ undef, %entry ], [ %add, %for.body ] ; i32 accumulator; its entry value is undef (SumA has no initializer in the C sketch)
 553   %a = getelementptr inbounds %struct.IntFloat, %struct.IntFloat* %A, i64 %indvars.iv, i32 0
 554   %tmp = load i32, i32* %a, align 4
 555   %add = add nsw i32 %tmp, %SumA.013
 556   %b = getelementptr inbounds %struct.IntFloat, %struct.IntFloat* %A, i64 %indvars.iv, i32 1
 557   %tmp1 = load float, float* %b, align 4
 558   %add3 = fadd fast float %SumB.014, %tmp1
 559   %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
 560   %exitcond = icmp eq i64 %indvars.iv.next, 1024
 561   br i1 %exitcond, label %for.cond.cleanup, label %for.body
 562 }
 563
 564 ; Check vectorization of interleaved access groups in the presence of
 565 ; dependences (PR27626). The following tests check that we don't reorder
 566 ; dependent loads and stores when generating code for interleaved access
 567 ; groups. Stores should be scalarized because the required code motion would
 568 ; break dependences, and the remaining interleaved load groups should have
 569 ; gaps.
 570
 571 ; PR27626_0: Ensure a strided store is not moved after a dependent (zero
 572 ;            distance) strided load.
 573
 574 ; void PR27626_0(struct pair *p, int z, int n) {
 575 ;   for (int i = 0; i < n; i++) {
 576 ;     p[i].x = z;
 577 ;     p[i].y = p[i].x;
 578 ;   }
 579 ; }
 580
 581 ; CHECK-LABEL: @PR27626_0(
 582 ; CHECK: vector.ph:
 583 ; CHECK:   %n.mod.vf = and i64 %[[N:.+]], 3
 584 ; CHECK:   %[[IsZero:[a-zA-Z0-9]+]] = icmp eq i64 %n.mod.vf, 0
 585 ; CHECK:   %[[R:[a-zA-Z0-9]+]] = select i1 %[[IsZero]], i64 4, i64 %n.mod.vf
 586 ; CHECK:   %n.vec = sub nsw i64 %[[N]], %[[R]]
 587 ; CHECK: vector.body:
 588 ; CHECK:   %[[L1:.+]] = load <8 x i32>, <8 x i32>* {{.*}}
 589 ; CHECK:   %[[X1:.+]] = extractelement <8 x i32> %[[L1]], i32 0
 590 ; CHECK:   store i32 %[[X1]], {{.*}}
 591 ; CHECK:   %[[X2:.+]] = extractelement <8 x i32> %[[L1]], i32 2
 592 ; CHECK:   store i32 %[[X2]], {{.*}}
 593 ; CHECK:   %[[X3:.+]] = extractelement <8 x i32> %[[L1]], i32 4
 594 ; CHECK:   store i32 %[[X3]], {{.*}}
 595 ; CHECK:   %[[X4:.+]] = extractelement <8 x i32> %[[L1]], i32 6
 596 ; CHECK:   store i32 %[[X4]], {{.*}}
 597
 598 %pair.i32 = type { i32, i32 }
 599 define void @PR27626_0(%pair.i32 *%p, i32 %z, i64 %n) { ; per iteration: p[i].x = z; p[i].y = p[i].x
 600 entry:
 601   br label %for.body
 602
 603 for.body:
 604   %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
 605   %p_i.x = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i, i32 0
 606   %p_i.y = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i, i32 1
 607   store i32 %z, i32* %p_i.x, align 4
 608   %0 = load i32, i32* %p_i.x, align 4 ; reads back the address stored to on the previous line (same iteration)
 609   store i32 %0, i32 *%p_i.y, align 4
 610   %i.next = add nuw nsw i64 %i, 1
 611   %cond = icmp slt i64 %i.next, %n
 612   br i1 %cond, label %for.body, label %for.end
 613
 614 for.end:
 615   ret void
 616 }
 617
 618 ; PR27626_1: Ensure a strided load is not moved before a dependent (zero
 619 ;            distance) strided store.
 620
 621 ; void PR27626_1(struct pair *p, int n) {
 622 ;   int s = 0;
 623 ;   for (int i = 0; i < n; i++) {
 624 ;     p[i].y = p[i].x;
 625 ;     s += p[i].y;
 626 ;   }
 627 ; }
 628
 629 ; CHECK-LABEL: @PR27626_1(
 630 ; CHECK: vector.ph:
 631 ; CHECK:   %n.mod.vf = and i64 %[[N:.+]], 3
 632 ; CHECK:   %[[IsZero:[a-zA-Z0-9]+]] = icmp eq i64 %n.mod.vf, 0
 633 ; CHECK:   %[[R:[a-zA-Z0-9]+]] = select i1 %[[IsZero]], i64 4, i64 %n.mod.vf
 634 ; CHECK:   %n.vec = sub nsw i64 %[[N]], %[[R]]
 635 ; CHECK: vector.body:
 636 ; CHECK:   %[[Phi:.+]] = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ {{.*}}, %vector.body ]
 637 ; CHECK:   %[[L1:.+]] = load <8 x i32>, <8 x i32>* {{.*}}
 638 ; CHECK:   %[[X1:.+]] = extractelement <8 x i32> %[[L1]], i32 0
 639 ; CHECK:   store i32 %[[X1]], {{.*}}
 640 ; CHECK:   %[[X2:.+]] = extractelement <8 x i32> %[[L1]], i32 2
 641 ; CHECK:   store i32 %[[X2]], {{.*}}
 642 ; CHECK:   %[[X3:.+]] = extractelement <8 x i32> %[[L1]], i32 4
 643 ; CHECK:   store i32 %[[X3]], {{.*}}
 644 ; CHECK:   %[[X4:.+]] = extractelement <8 x i32> %[[L1]], i32 6
 645 ; CHECK:   store i32 %[[X4]], {{.*}}
 646 ; CHECK:   %[[L2:.+]] = load <8 x i32>, <8 x i32>* {{.*}}
 647 ; CHECK:   %[[S1:.+]] = shufflevector <8 x i32> %[[L2]], <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
 648 ; CHECK:   add nsw <4 x i32> %[[S1]], %[[Phi]]
 649
 650 define i32 @PR27626_1(%pair.i32 *%p, i64 %n) { ; per iteration: p[i].y = p[i].x; s += p[i].y; returns s. The patterns above tie every extractelement to the first wide load and every scalar store to its own extract.
 651 entry:
 652   br label %for.body
 653
 654 for.body:
 655   %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
 656   %s = phi i32 [ %2, %for.body ], [ 0, %entry ] ; running sum s, 0 on entry
 657   %p_i.x = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i, i32 0
 658   %p_i.y = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i, i32 1
 659   %0 = load i32, i32* %p_i.x, align 4
 660   store i32 %0, i32* %p_i.y, align 4
 661   %1 = load i32, i32* %p_i.y, align 4 ; reads back the address stored to on the previous line (same iteration)
 662   %2 = add nsw i32 %1, %s
 663   %i.next = add nuw nsw i64 %i, 1
 664   %cond = icmp slt i64 %i.next, %n
 665   br i1 %cond, label %for.body, label %for.end
 666
 667 for.end:
 668   %3 = phi i32 [ %2, %for.body ] ; single-input phi carrying the final sum out of the loop
 669   ret i32 %3
 670 }
 671
 672 ; PR27626_2: Ensure a strided store is not moved after a dependent (negative
 673 ;            distance) strided load.
 674
 675 ; void PR27626_2(struct pair *p, int z, int n) {
 676 ;   for (int i = 0; i < n; i++) {
 677 ;     p[i].x = z;
 678 ;     p[i].y = p[i - 1].x;
 679 ;   }
 680 ; }
 681
 682 ; CHECK-LABEL: @PR27626_2(
 683 ; CHECK: vector.ph:
 684 ; CHECK:   %n.mod.vf = and i64 %[[N:.+]], 3
 685 ; CHECK:   %[[IsZero:[a-zA-Z0-9]+]] = icmp eq i64 %n.mod.vf, 0
 686 ; CHECK:   %[[R:[a-zA-Z0-9]+]] = select i1 %[[IsZero]], i64 4, i64 %n.mod.vf
 687 ; CHECK:   %n.vec = sub nsw i64 %[[N]], %[[R]]
 688 ; CHECK: vector.body:
 689 ; CHECK:   %[[L1:.+]] = load <8 x i32>, <8 x i32>* {{.*}}
 690 ; CHECK:   %[[X1:.+]] = extractelement <8 x i32> %[[L1]], i32 0
 691 ; CHECK:   store i32 %[[X1]], {{.*}}
 692 ; CHECK:   %[[X2:.+]] = extractelement <8 x i32> %[[L1]], i32 2
 693 ; CHECK:   store i32 %[[X2]], {{.*}}
 694 ; CHECK:   %[[X3:.+]] = extractelement <8 x i32> %[[L1]], i32 4
 695 ; CHECK:   store i32 %[[X3]], {{.*}}
 696 ; CHECK:   %[[X4:.+]] = extractelement <8 x i32> %[[L1]], i32 6
 697 ; CHECK:   store i32 %[[X4]], {{.*}}
 698
 699 define void @PR27626_2(%pair.i32 *%p, i64 %n, i32 %z) { ; per iteration: p[i].x = z; p[i].y = p[i - 1].x
 700 entry:
 701   br label %for.body
 702
 703 for.body:
 704   %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
 705   %i_minus_1 = add nuw nsw i64 %i, -1 ; i - 1. NOTE(review): an unsigned add of -1 wraps for every %i >= 1, which nuw makes poison - confirm the flag is intended
 706   %p_i.x = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i, i32 0
 707   %p_i_minus_1.x = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i_minus_1, i32 0
 708   %p_i.y = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i, i32 1
 709   store i32 %z, i32* %p_i.x, align 4
 710   %0 = load i32, i32* %p_i_minus_1.x, align 4 ; reads field 0 of the element before the one just stored to
 711   store i32 %0, i32 *%p_i.y, align 4
 712   %i.next = add nuw nsw i64 %i, 1
 713   %cond = icmp slt i64 %i.next, %n
 714   br i1 %cond, label %for.body, label %for.end
 715
 716 for.end:
 717   ret void
 718 }
 719
 720 ; PR27626_3: Ensure a strided load is not moved before a dependent (negative
 721 ;            distance) strided store.
 722
 723 ; int PR27626_3(struct pair *p, int z, int n) {
     ;   int s = 0;
 724 ;   for (int i = 0; i < n; i++) {
 725 ;     p[i + 1].y = p[i].x;
 726 ;     s += p[i].y;
 727 ;   }
     ;   return s;
 728 ; }
 729
 730 ; CHECK-LABEL: @PR27626_3(
 731 ; CHECK: vector.ph:
 732 ; CHECK:   %n.mod.vf = and i64 %[[N:.+]], 3
 733 ; CHECK:   %[[IsZero:[a-zA-Z0-9]+]] = icmp eq i64 %n.mod.vf, 0
 734 ; CHECK:   %[[R:[a-zA-Z0-9]+]] = select i1 %[[IsZero]], i64 4, i64 %n.mod.vf
 735 ; CHECK:   %n.vec = sub nsw i64 %[[N]], %[[R]]
 736 ; CHECK: vector.body:
 737 ; CHECK:   %[[Phi:.+]] = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ {{.*}}, %vector.body ]
 738 ; CHECK:   %[[L1:.+]] = load <8 x i32>, <8 x i32>* {{.*}}
 739 ; CHECK:   %[[X1:.+]] = extractelement <8 x i32> %[[L1]], i32 0
 740 ; CHECK:   store i32 %[[X1]], {{.*}}
 741 ; CHECK:   %[[X2:.+]] = extractelement <8 x i32> %[[L1]], i32 2
 742 ; CHECK:   store i32 %[[X2]], {{.*}}
 743 ; CHECK:   %[[X3:.+]] = extractelement <8 x i32> %[[L1]], i32 4
 744 ; CHECK:   store i32 %[[X3]], {{.*}}
 745 ; CHECK:   %[[X4:.+]] = extractelement <8 x i32> %[[L1]], i32 6
 746 ; CHECK:   store i32 %[[X4]], {{.*}}
 747 ; CHECK:   %[[L2:.+]] = load <8 x i32>, <8 x i32>* {{.*}}
 748 ; CHECK:   %[[S1:.+]] = shufflevector <8 x i32> %[[L2]], <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
 749 ; CHECK:   add nsw <4 x i32> %[[S1]], %[[Phi]]
 750
 751 define i32 @PR27626_3(%pair.i32 *%p, i64 %n, i32 %z) {  ; scalar input loop: copy field 0 of p[i] into field 1 of p[i+1] and sum field 1 of p[i]; %z is not used in the body
 752 entry:
 753   br label %for.body
 754
 755 for.body:
 756   %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]  ; loop counter: 0 on entry, %i.next on the back edge
 757   %s = phi i32 [ %2, %for.body ], [ 0, %entry ]  ; running sum: 0 on entry, %2 on the back edge
 758   %i_plus_1 = add nuw nsw i64 %i, 1  ; i + 1, used only to address p[i+1]
 759   %p_i.x = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i, i32 0  ; address of field 0 of p[i] (x in the pseudo-code above)
 760   %p_i.y = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i, i32 1  ; address of field 1 of p[i] (y in the pseudo-code above)
 761   %p_i_plus_1.y = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i_plus_1, i32 1  ; address of field 1 of p[i+1]
 762   %0 = load i32, i32* %p_i.x, align 4  ; strided load of p[i].x
 763   store i32 %0, i32* %p_i_plus_1.y, align 4  ; strided store: p[i+1].y = p[i].x
 764   %1 = load i32, i32* %p_i.y, align 4  ; strided load of p[i].y, the element the store above wrote one iteration earlier
 765   %2 = add nsw i32 %1, %s  ; s += p[i].y
 766   %i.next = add nuw nsw i64 %i, 1  ; step the counter by 1
 767   %cond = icmp slt i64 %i.next, %n  ; signed compare: keep looping while i + 1 < n
 768   br i1 %cond, label %for.body, label %for.end
 769
 770 for.end:
 771   %3 = phi i32 [ %2, %for.body ]  ; single-entry phi carrying the final sum out of the loop
 772   ret i32 %3
 773 }
 774
 775 ; PR27626_4: Ensure we form an interleaved group for strided stores in the
 776 ;            presence of a write-after-write dependence. We create a group for
 777 ;            (2) and (3) while excluding (1).
 778
 779 ; void PR27626_4(int *a, int x, int y, int z, int n) {
 780 ;   for (int i = 0; i < n; i += 2) {
 781 ;     a[i] = x;      // (1)
 782 ;     a[i] = y;      // (2)
 783 ;     a[i + 1] = z;  // (3)
 784 ;   }
 785 ; }
 786
 787 ; CHECK-LABEL: @PR27626_4(
 788 ; CHECK: vector.ph:
 789 ; CHECK:   %[[INS_Y:.+]] = insertelement <4 x i32> undef, i32 %y, i32 0
 790 ; CHECK:   %[[SPLAT_Y:.+]] = shufflevector <4 x i32> %[[INS_Y]], <4 x i32> undef, <4 x i32> zeroinitializer
 791 ; CHECK:   %[[INS_Z:.+]] = insertelement <4 x i32> undef, i32 %z, i32 0
 792 ; CHECK:   %[[SPLAT_Z:.+]] = shufflevector <4 x i32> %[[INS_Z]], <4 x i32> undef, <4 x i32> zeroinitializer
 793 ; CHECK: vector.body:
 794 ; CHECK:   store i32 %x, {{.*}}
 795 ; CHECK:   store i32 %x, {{.*}}
 796 ; CHECK:   store i32 %x, {{.*}}
 797 ; CHECK:   store i32 %x, {{.*}}
 798 ; CHECK:   %[[VEC:.+]] = shufflevector <4 x i32> %[[SPLAT_Y]], <4 x i32> %[[SPLAT_Z]], <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
 799 ; CHECK:   store <8 x i32> %[[VEC]], {{.*}}
 800
 801 define void @PR27626_4(i32 *%a, i32 %x, i32 %y, i32 %z, i64 %n) {  ; scalar input loop: two stores to a[i] followed by one store to a[i+1], with the counter stepping by 2
 802 entry:
 803   br label %for.body
 804
 805 for.body:
 806   %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]  ; loop counter: 0 on entry, %i.next on the back edge
 807   %i_plus_1 = add i64 %i, 1  ; i + 1 (no wrap flags on this add)
 808   %a_i = getelementptr inbounds i32, i32* %a, i64 %i  ; address of a[i]
 809   %a_i_plus_1 = getelementptr inbounds i32, i32* %a, i64 %i_plus_1  ; address of a[i+1]
 810   store i32 %x, i32* %a_i, align 4  ; store (1): a[i] = x
 811   store i32 %y, i32* %a_i, align 4  ; store (2): a[i] = y, same address as store (1)
 812   store i32 %z, i32* %a_i_plus_1, align 4  ; store (3): a[i+1] = z
 813   %i.next = add nuw nsw i64 %i, 2  ; step the counter by 2
 814   %cond = icmp slt i64 %i.next, %n  ; signed compare: keep looping while i + 2 < n
 815   br i1 %cond, label %for.body, label %for.end
 816
 817 for.end:
 818   ret void
 819 }
 820
 821 ; PR27626_5: Ensure we do not form an interleaved group for strided stores in
 822 ;            the presence of a write-after-write dependence.
 823
 824 ; void PR27626_5(int *a, int x, int y, int z, int n) {
 825 ;   for (int i = 3; i < n; i += 2) {
 826 ;     a[i - 1] = x;
 827 ;     a[i - 3] = y;
 828 ;     a[i] = z;
 829 ;   }
 830 ; }
 831
 832 ; CHECK-LABEL: @PR27626_5(
 833 ; CHECK: vector.body:
 834 ; CHECK:   store i32 %x, {{.*}}
 835 ; CHECK:   store i32 %x, {{.*}}
 836 ; CHECK:   store i32 %x, {{.*}}
 837 ; CHECK:   store i32 %x, {{.*}}
 838 ; CHECK:   store i32 %y, {{.*}}
 839 ; CHECK:   store i32 %y, {{.*}}
 840 ; CHECK:   store i32 %y, {{.*}}
 841 ; CHECK:   store i32 %y, {{.*}}
 842 ; CHECK:   store i32 %z, {{.*}}
 843 ; CHECK:   store i32 %z, {{.*}}
 844 ; CHECK:   store i32 %z, {{.*}}
 845 ; CHECK:   store i32 %z, {{.*}}
 846
 847 define void @PR27626_5(i32 *%a, i32 %x, i32 %y, i32 %z, i64 %n) {  ; scalar input loop: stores to a[i-1], a[i-3] and a[i], with the counter starting at 3 and stepping by 2
 848 entry:
 849   br label %for.body
 850
 851 for.body:
 852   %i = phi i64 [ %i.next, %for.body ], [ 3, %entry ]  ; loop counter: 3 on entry, %i.next on the back edge
 853   %i_minus_1 = sub i64 %i, 1  ; i - 1
 854   %i_minus_3 = sub i64 %i_minus_1, 2  ; i - 3, computed as (i - 1) - 2
 855   %a_i = getelementptr inbounds i32, i32* %a, i64 %i  ; address of a[i]
 856   %a_i_minus_1 = getelementptr inbounds i32, i32* %a, i64 %i_minus_1  ; address of a[i-1]
 857   %a_i_minus_3 = getelementptr inbounds i32, i32* %a, i64 %i_minus_3  ; address of a[i-3]
 858   store i32 %x, i32* %a_i_minus_1, align 4  ; a[i-1] = x
 859   store i32 %y, i32* %a_i_minus_3, align 4  ; a[i-3] = y; with a step of 2 this is the element the %x store wrote one iteration earlier
 860   store i32 %z, i32* %a_i, align 4  ; a[i] = z
 861   %i.next = add nuw nsw i64 %i, 2  ; step the counter by 2
 862   %cond = icmp slt i64 %i.next, %n  ; signed compare: keep looping while i + 2 < n
 863   br i1 %cond, label %for.body, label %for.end
 864
 865 for.end:
 866   ret void
 867 }
 868
 869 ; PR34743: Ensure that a cast which needs to sink after a load that belongs to
 870 ; an interleaved group, indeed gets sunk.
 871
 872 ; void PR34743(short *a, int *b, int n) {
 873 ;   for (int i = 0, iv = 0; iv < n; i++, iv += 2) {
 874 ;     b[i] = a[iv] * a[iv+1] * a[iv+2];
 875 ;   }
 876 ; }
 877
 878 ; CHECK-LABEL: @PR34743(
 879 ; CHECK: vector.body:
 880 ; CHECK:   %vector.recur = phi <4 x i16> [ %vector.recur.init, %vector.ph ], [ %[[VSHUF1:.+]], %vector.body ]
 881 ; CHECK:   %wide.vec = load <8 x i16>
 882 ; CHECK:   %[[VSHUF0:.+]] = shufflevector <8 x i16> %wide.vec, <8 x i16> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
 883 ; CHECK:   %[[VSHUF1:.+]] = shufflevector <8 x i16> %wide.vec, <8 x i16> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
 884 ; CHECK:   %[[VSHUF:.+]] = shufflevector <4 x i16> %vector.recur, <4 x i16> %[[VSHUF1]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
 885 ; CHECK:   sext <4 x i16> %[[VSHUF0]] to <4 x i32>
 886 ; CHECK:   sext <4 x i16> %[[VSHUF]] to <4 x i32>
 887 ; CHECK:   sext <4 x i16> %[[VSHUF1]] to <4 x i32>
 888 ; CHECK:   mul nsw <4 x i32>
 889 ; CHECK:   mul nsw <4 x i32>
 890
 891 define void @PR34743(i16* %a, i32* %b, i64 %n) {  ; scalar input loop: b[i] = sext(carried value) * sext(a[iv+1]) * sext(a[iv+2]), where the carried value is a[0] at first and the previous %load2 afterwards
 892 entry:
 893   %.pre = load i16, i16* %a  ; a[0], loaded once before the loop
 894   br label %loop
 895
 896 loop:
 897   %0 = phi i16 [ %.pre, %entry ], [ %load2, %loop ]  ; %.pre on entry, %load2 of the previous iteration on the back edge
 898   %iv = phi i64 [ 0, %entry ], [ %iv2, %loop ]  ; index into a: 0 on entry, +2 per iteration
 899   %i = phi i64 [ 0, %entry ], [ %i1, %loop ]  ; index into b: 0 on entry, +1 per iteration
 900   %conv = sext i16 %0 to i32  ; widens the carried value; it appears before the loads that produce the next carried value -- presumably the cast the header comment says must be sunk
 901   %i1 = add nuw nsw i64 %i, 1  ; next index into b
 902   %iv1 = add nuw nsw i64 %iv, 1  ; iv + 1
 903   %iv2 = add nuw nsw i64 %iv, 2  ; iv + 2, also the next value of %iv
 904   %gep1 = getelementptr inbounds i16, i16* %a, i64 %iv1  ; address of a[iv+1]
 905   %load1 = load i16, i16* %gep1, align 4  ; a[iv+1]
 906   %conv1 = sext i16 %load1 to i32
 907   %gep2 = getelementptr inbounds i16, i16* %a, i64 %iv2  ; address of a[iv+2]
 908   %load2 = load i16, i16* %gep2, align 4  ; a[iv+2]; fed back into %0 for the next iteration
 909   %conv2 = sext i16 %load2 to i32
 910   %mul01 = mul nsw i32 %conv, %conv1  ; carried value * a[iv+1]
 911   %mul012 = mul nsw i32 %mul01, %conv2  ; ... * a[iv+2]
 912   %arrayidx5 = getelementptr inbounds i32, i32* %b, i64 %i  ; address of b[i]
 913   store i32 %mul012, i32* %arrayidx5  ; b[i] = product (no explicit alignment on this store)
 914   %exitcond = icmp eq i64 %iv, %n  ; exit test compares the pre-increment %iv with %n for equality
 915   br i1 %exitcond, label %end, label %loop
 916
 917 end:
 918   ret void
 919 }
 920
 921 attributes #0 = { "unsafe-fp-math"="true" }  ; attribute group #0; the definitions of PR27626_2 through PR27626_5 and PR34743 above do not reference it