; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -S -passes=loop-vectorize,instcombine -force-vector-width=4 -force-vector-interleave=1 -enable-interleaved-mem-accesses=true -runtime-memory-check-threshold=24 < %s | FileCheck %s

target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
; Check vectorization on an interleaved load group of factor 2 and an interleaved
; store group of factor 2.
;
; void test_array_load2_store2(int C, int D) {
;   for (int i = 0; i < 1024; i+=2) {
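;     // Loop body below is a sketch inferred from the IR that follows (the
;     // original source comment is abridged here); AB and CD are the global
;     // i32 arrays declared next.
;     int A = AB[i];     // even element  -> interleaved load group, index 0
;     int B = AB[i+1];   // odd element   -> interleaved load group, index 1
;     CD[i]   = A + C;   // even element  -> interleaved store group, index 0
;     CD[i+1] = B * D;   // odd element   -> interleaved store group, index 1
;   }
; }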
@AB = common global [1024 x i32] zeroinitializer, align 4
@CD = common global [1024 x i32] zeroinitializer, align 4

define void @test_array_load2_store2(i32 %C, i32 %D) {
; CHECK-LABEL: @test_array_load2_store2(
; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[C:%.*]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
; CHECK-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <4 x i32> poison, i32 [[D:%.*]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT2]], <4 x i32> poison, <4 x i32> zeroinitializer
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = shl i64 [[INDEX]], 1
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds [1024 x i32], ptr @AB, i64 0, i64 [[OFFSET_IDX]]
; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <8 x i32>, ptr [[TMP0]], align 4
; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
; CHECK-NEXT: [[TMP1:%.*]] = or i64 [[OFFSET_IDX]], 1
; CHECK-NEXT: [[TMP2:%.*]] = add nsw <4 x i32> [[STRIDED_VEC]], [[BROADCAST_SPLAT]]
; CHECK-NEXT: [[TMP3:%.*]] = mul nsw <4 x i32> [[STRIDED_VEC1]], [[BROADCAST_SPLAT3]]
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds [1024 x i32], ptr @CD, i64 0, i64 [[TMP1]]
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i64 -1
; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
; CHECK-NEXT: store <8 x i32> [[INTERLEAVED_VEC]], ptr [[TMP5]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 512
; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK-NEXT: br i1 poison, label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP3:![0-9]+]]
; CHECK-NEXT: ret void

for.body:                                         ; preds = %for.body, %entry
  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
  %arrayidx0 = getelementptr inbounds [1024 x i32], ptr @AB, i64 0, i64 %indvars.iv
  %tmp = load i32, ptr %arrayidx0, align 4
  %tmp1 = or i64 %indvars.iv, 1
  %arrayidx1 = getelementptr inbounds [1024 x i32], ptr @AB, i64 0, i64 %tmp1
  %tmp2 = load i32, ptr %arrayidx1, align 4
  %add = add nsw i32 %tmp, %C
  %mul = mul nsw i32 %tmp2, %D
  %arrayidx2 = getelementptr inbounds [1024 x i32], ptr @CD, i64 0, i64 %indvars.iv
  store i32 %add, ptr %arrayidx2, align 4
  %arrayidx3 = getelementptr inbounds [1024 x i32], ptr @CD, i64 0, i64 %tmp1
  store i32 %mul, ptr %arrayidx3, align 4
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2
  %cmp = icmp slt i64 %indvars.iv.next, 1024
  br i1 %cmp, label %for.body, label %for.end

for.end:                                          ; preds = %for.body
; void test_struct_st3() {
;   for (int i = 0; i < 1024; i++) {
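;     // Body sketched from the IR below; 'ptr' stands for a pointer that walks
;     // the flat global array A three ints per iteration.
;     S[i].x = *ptr++ + 1;
;     S[i].y = *ptr++ + 2;
;     S[i].z = *ptr++ + 3;
;   }
; }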
%struct.ST3 = type { i32, i32, i32 }
@A = common global [3072 x i32] zeroinitializer, align 4
@S = common global [1024 x %struct.ST3] zeroinitializer, align 4

define void @test_struct_array_load3_store3() {
; CHECK-LABEL: @test_struct_array_load3_store3(
; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP0:%.*]] = mul i64 [[INDEX]], 12
; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr @A, i64 [[TMP0]]
; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <12 x i32>, ptr [[NEXT_GEP]], align 4
; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <12 x i32> [[WIDE_VEC]], <12 x i32> poison, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
; CHECK-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <12 x i32> [[WIDE_VEC]], <12 x i32> poison, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
; CHECK-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <12 x i32> [[WIDE_VEC]], <12 x i32> poison, <4 x i32> <i32 2, i32 5, i32 8, i32 11>
; CHECK-NEXT: [[TMP1:%.*]] = add nsw <4 x i32> [[STRIDED_VEC]], <i32 1, i32 1, i32 1, i32 1>
; CHECK-NEXT: [[TMP2:%.*]] = add nsw <4 x i32> [[STRIDED_VEC2]], <i32 2, i32 2, i32 2, i32 2>
; CHECK-NEXT: [[TMP3:%.*]] = add nsw <4 x i32> [[STRIDED_VEC3]], <i32 3, i32 3, i32 3, i32 3>
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds [1024 x %struct.ST3], ptr @S, i64 0, i64 [[INDEX]]
; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i32> [[TMP5]], <8 x i32> [[TMP6]], <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
; CHECK-NEXT: store <12 x i32> [[INTERLEAVED_VEC]], ptr [[TMP4]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK-NEXT: br i1 poison, label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
; CHECK-NEXT: ret void

for.body:                                         ; preds = %for.body, %entry
  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
  %ptr.016 = phi ptr [ @A, %entry ], [ %incdec.ptr2, %for.body ]
  %incdec.ptr = getelementptr inbounds i32, ptr %ptr.016, i64 1
  %tmp = load i32, ptr %ptr.016, align 4
  %incdec.ptr1 = getelementptr inbounds i32, ptr %ptr.016, i64 2
  %tmp1 = load i32, ptr %incdec.ptr, align 4
  %incdec.ptr2 = getelementptr inbounds i32, ptr %ptr.016, i64 3
  %tmp2 = load i32, ptr %incdec.ptr1, align 4
  %add = add nsw i32 %tmp, 1
  %x = getelementptr inbounds [1024 x %struct.ST3], ptr @S, i64 0, i64 %indvars.iv, i32 0
  store i32 %add, ptr %x, align 4
  %add3 = add nsw i32 %tmp1, 2
  %y = getelementptr inbounds [1024 x %struct.ST3], ptr @S, i64 0, i64 %indvars.iv, i32 1
  store i32 %add3, ptr %y, align 4
  %add6 = add nsw i32 %tmp2, 3
  %z = getelementptr inbounds [1024 x %struct.ST3], ptr @S, i64 0, i64 %indvars.iv, i32 2
  store i32 %add6, ptr %z, align 4
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %exitcond = icmp eq i64 %indvars.iv.next, 1024
  br i1 %exitcond, label %for.end, label %for.body

for.end:                                          ; preds = %for.body
; Check vectorization on an interleaved load group of factor 4.
;
; int test_struct_load4(struct ST4 *S) {
;   for (int i = 0; i < 1024; i++) {
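;     // Body sketched from the IR below; r is the scalar reduction and starts
;     // at 0 (see the zeroinitializer in the vector PHI).
;     r += S[i].x;
;     r -= S[i].y;
;     r += S[i].z;
;     r -= S[i].w;
;   }
;   return r;
; }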
%struct.ST4 = type { i32, i32, i32, i32 }

define i32 @test_struct_load4(ptr nocapture readonly %S) {
; CHECK-LABEL: @test_struct_load4(
; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds [[STRUCT_ST4:%.*]], ptr [[S:%.*]], i64 [[INDEX]], i32 0
; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <16 x i32>, ptr [[TMP0]], align 4
; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> poison, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> poison, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
; CHECK-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> poison, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
; CHECK-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> poison, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
; CHECK-NEXT: [[TMP1:%.*]] = add <4 x i32> [[STRIDED_VEC]], [[VEC_PHI]]
; CHECK-NEXT: [[TMP2:%.*]] = add <4 x i32> [[TMP1]], [[STRIDED_VEC2]]
; CHECK-NEXT: [[TMP3:%.*]] = add <4 x i32> [[STRIDED_VEC1]], [[STRIDED_VEC3]]
; CHECK-NEXT: [[TMP4]] = sub <4 x i32> [[TMP2]], [[TMP3]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP4]])
; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK-NEXT: br i1 poison, label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
; CHECK-NEXT: [[SUB8_LCSSA:%.*]] = phi i32 [ poison, [[FOR_BODY]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ]
; CHECK-NEXT: ret i32 [[SUB8_LCSSA]]

for.body:                                         ; preds = %for.body, %entry
  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
  %r.022 = phi i32 [ 0, %entry ], [ %sub8, %for.body ]
  %x = getelementptr inbounds %struct.ST4, ptr %S, i64 %indvars.iv, i32 0
  %tmp = load i32, ptr %x, align 4
  %add = add nsw i32 %tmp, %r.022
  %y = getelementptr inbounds %struct.ST4, ptr %S, i64 %indvars.iv, i32 1
  %tmp1 = load i32, ptr %y, align 4
  %sub = sub i32 %add, %tmp1
  %z = getelementptr inbounds %struct.ST4, ptr %S, i64 %indvars.iv, i32 2
  %tmp2 = load i32, ptr %z, align 4
  %add5 = add nsw i32 %sub, %tmp2
  %w = getelementptr inbounds %struct.ST4, ptr %S, i64 %indvars.iv, i32 3
  %tmp3 = load i32, ptr %w, align 4
  %sub8 = sub i32 %add5, %tmp3
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %exitcond = icmp eq i64 %indvars.iv.next, 1024
  br i1 %exitcond, label %for.end, label %for.body

for.end:                                          ; preds = %for.body
; Check vectorization on an interleaved store group of factor 4.
;
; void test_struct_store4(int *A, struct ST4 *B) {
;   for (int i = 0; i < 1024; i++) {
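;     // Body sketched from the IR below; A is read sequentially, one int per
;     // iteration, and fans out into a factor-4 interleaved store group
;     // ('t' is just an illustrative local name).
;     int t = *A++;
;     B[i].x = t + 1;
;     B[i].y = t * 2;
;     B[i].z = t + 3;
;     B[i].w = t + 4;
;   }
; }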
define void @test_struct_store4(ptr noalias nocapture readonly %A, ptr noalias nocapture %B) {
; CHECK-LABEL: @test_struct_store4(
; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP0:%.*]] = shl i64 [[INDEX]], 2
; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 [[TMP0]]
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[NEXT_GEP]], align 4
; CHECK-NEXT: [[TMP1:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], <i32 1, i32 1, i32 1, i32 1>
; CHECK-NEXT: [[TMP2:%.*]] = shl nsw <4 x i32> [[WIDE_LOAD]], <i32 1, i32 1, i32 1, i32 1>
; CHECK-NEXT: [[TMP3:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], <i32 3, i32 3, i32 3, i32 3>
; CHECK-NEXT: [[TMP4:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], <i32 4, i32 4, i32 4, i32 4>
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[STRUCT_ST4:%.*]], ptr [[B:%.*]], i64 [[INDEX]]
; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i32> [[TMP6]], <8 x i32> [[TMP7]], <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
; CHECK-NEXT: store <16 x i32> [[INTERLEAVED_VEC]], ptr [[TMP5]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.cond.cleanup:
; CHECK-NEXT: ret void
; CHECK-NEXT: br i1 poison, label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]

for.cond.cleanup:                                 ; preds = %for.body

for.body:                                         ; preds = %for.body, %entry
  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
  %ptr.024 = phi ptr [ %A, %entry ], [ %incdec.ptr, %for.body ]
  %incdec.ptr = getelementptr inbounds i32, ptr %ptr.024, i64 1
  %tmp = load i32, ptr %ptr.024, align 4
  %add = add nsw i32 %tmp, 1
  %x = getelementptr inbounds %struct.ST4, ptr %B, i64 %indvars.iv, i32 0
  store i32 %add, ptr %x, align 4
  %mul = shl nsw i32 %tmp, 1
  %y = getelementptr inbounds %struct.ST4, ptr %B, i64 %indvars.iv, i32 1
  store i32 %mul, ptr %y, align 4
  %add3 = add nsw i32 %tmp, 3
  %z = getelementptr inbounds %struct.ST4, ptr %B, i64 %indvars.iv, i32 2
  store i32 %add3, ptr %z, align 4
  %add6 = add nsw i32 %tmp, 4
  %w = getelementptr inbounds %struct.ST4, ptr %B, i64 %indvars.iv, i32 3
  store i32 %add6, ptr %w, align 4
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %exitcond = icmp eq i64 %indvars.iv.next, 1024
  br i1 %exitcond, label %for.cond.cleanup, label %for.body
; Check vectorization on a reverse interleaved load group of factor 2 and
; a reverse interleaved store group of factor 2.
;
; void test_reversed_load2_store2(struct ST2 *A, struct ST2 *B) {
;   for (int i = 1023; i >= 0; i--) {
;     int a = A[i].x + i;  // interleaved load of index 0
;     int b = A[i].y - i;  // interleaved load of index 1
;     B[i].x = a;          // interleaved store of index 0
;     B[i].y = b;          // interleaved store of index 1
%struct.ST2 = type { i32, i32 }

define void @test_reversed_load2_store2(ptr noalias nocapture readonly %A, ptr noalias nocapture %B) {
; CHECK-LABEL: @test_reversed_load2_store2(
; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 1023, i32 1022, i32 1021, i32 1020>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = sub i64 1023, [[INDEX]]
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds [[STRUCT_ST2:%.*]], ptr [[A:%.*]], i64 [[OFFSET_IDX]], i32 0
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 -6
; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <8 x i32>, ptr [[TMP1]], align 4
; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
; CHECK-NEXT: [[REVERSE:%.*]] = shufflevector <4 x i32> [[STRIDED_VEC]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
; CHECK-NEXT: [[REVERSE2:%.*]] = shufflevector <4 x i32> [[STRIDED_VEC1]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; CHECK-NEXT: [[TMP2:%.*]] = add nsw <4 x i32> [[REVERSE]], [[VEC_IND]]
; CHECK-NEXT: [[TMP3:%.*]] = sub nsw <4 x i32> [[REVERSE2]], [[VEC_IND]]
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[STRUCT_ST2]], ptr [[B:%.*]], i64 [[OFFSET_IDX]], i32 1
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i64 -7
; CHECK-NEXT: [[REVERSE3:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; CHECK-NEXT: [[REVERSE4:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i32> [[REVERSE3]], <4 x i32> [[REVERSE4]], <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
; CHECK-NEXT: store <8 x i32> [[INTERLEAVED_VEC]], ptr [[TMP5]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 -4, i32 -4, i32 -4, i32 -4>
; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.cond.cleanup:
; CHECK-NEXT: ret void
; CHECK-NEXT: br i1 poison, label [[FOR_BODY]], label [[FOR_COND_CLEANUP]], !llvm.loop [[LOOP11:![0-9]+]]

for.cond.cleanup:                                 ; preds = %for.body

for.body:                                         ; preds = %for.body, %entry
  %indvars.iv = phi i64 [ 1023, %entry ], [ %indvars.iv.next, %for.body ]
  %x = getelementptr inbounds %struct.ST2, ptr %A, i64 %indvars.iv, i32 0
  %tmp = load i32, ptr %x, align 4
  %tmp1 = trunc i64 %indvars.iv to i32
  %add = add nsw i32 %tmp, %tmp1
  %y = getelementptr inbounds %struct.ST2, ptr %A, i64 %indvars.iv, i32 1
  %tmp2 = load i32, ptr %y, align 4
  %sub = sub nsw i32 %tmp2, %tmp1
  %x5 = getelementptr inbounds %struct.ST2, ptr %B, i64 %indvars.iv, i32 0
  store i32 %add, ptr %x5, align 4
  %y8 = getelementptr inbounds %struct.ST2, ptr %B, i64 %indvars.iv, i32 1
  store i32 %sub, ptr %y8, align 4
  %indvars.iv.next = add nsw i64 %indvars.iv, -1
  %cmp = icmp sgt i64 %indvars.iv, 0
  br i1 %cmp, label %for.body, label %for.cond.cleanup
; Check vectorization on an interleaved load group of factor 2 with 1 gap
; (missing the load of odd elements). Because the vectorized loop would
; speculatively access memory out-of-bounds, we must execute at least one
; iteration of the scalar loop.
;
; void even_load_static_tc(int *A, int *B) {
;   for (unsigned i = 0; i < 1024; i+=2)
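;     // Body sketched from the IR below: only the even elements of A are read,
;     // so the interleaved load group has a gap at the odd elements.
;     B[i/2] = A[i] * 2;
; }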
define void @even_load_static_tc(ptr noalias nocapture readonly %A, ptr noalias nocapture %B) {
; CHECK-LABEL: @even_load_static_tc(
; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = shl i64 [[INDEX]], 1
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[OFFSET_IDX]]
; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <8 x i32>, ptr [[TMP0]], align 4
; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
; CHECK-NEXT: [[TMP1:%.*]] = shl nsw <4 x i32> [[STRIDED_VEC]], <i32 1, i32 1, i32 1, i32 1>
; CHECK-NEXT: [[TMP2:%.*]] = and i64 [[INDEX]], 9223372036854775804
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[TMP2]]
; CHECK-NEXT: store <4 x i32> [[TMP1]], ptr [[TMP3]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], 508
; CHECK-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: br label [[SCALAR_PH]]
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.cond.cleanup:
; CHECK-NEXT: ret void
; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 1016, [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDVARS_IV]]
; CHECK-NEXT: [[TMP:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
; CHECK-NEXT: [[MUL:%.*]] = shl nsw i32 [[TMP]], 1
; CHECK-NEXT: [[TMP1:%.*]] = lshr exact i64 [[INDVARS_IV]], 1
; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP1]]
; CHECK-NEXT: store i32 [[MUL]], ptr [[ARRAYIDX2]], align 4
; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 2
; CHECK-NEXT: [[CMP:%.*]] = icmp ult i64 [[INDVARS_IV]], 1022
; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP:%.*]], !llvm.loop [[LOOP13:![0-9]+]]

for.cond.cleanup:                                 ; preds = %for.body

for.body:                                         ; preds = %for.body, %entry
  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
  %arrayidx = getelementptr inbounds i32, ptr %A, i64 %indvars.iv
  %tmp = load i32, ptr %arrayidx, align 4
  %mul = shl nsw i32 %tmp, 1
  %tmp1 = lshr exact i64 %indvars.iv, 1
  %arrayidx2 = getelementptr inbounds i32, ptr %B, i64 %tmp1
  store i32 %mul, ptr %arrayidx2, align 4
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2
  %cmp = icmp ult i64 %indvars.iv.next, 1024
  br i1 %cmp, label %for.body, label %for.cond.cleanup
; Check vectorization on an interleaved load group of factor 2 with 1 gap
; (missing the load of odd elements). Because the vectorized loop would
; speculatively access memory out-of-bounds, we must execute at least one
; iteration of the scalar loop.
;
; void even_load_dynamic_tc(int *A, int *B, unsigned N) {
;   for (unsigned i = 0; i < N; i+=2)
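;     // Body sketched from the IR below; same access pattern as the static
;     // trip-count case, but N is only known at run time.
;     B[i/2] = A[i] * 2;
; }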
define void @even_load_dynamic_tc(ptr noalias nocapture readonly %A, ptr noalias nocapture %B, i64 %N) {
; CHECK-LABEL: @even_load_dynamic_tc(
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 9
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[N]], -1
; CHECK-NEXT: [[TMP1:%.*]] = lshr i64 [[TMP0]], 1
; CHECK-NEXT: [[TMP2:%.*]] = add nuw i64 [[TMP1]], 1
; CHECK-NEXT: [[N_MOD_VF:%.*]] = and i64 [[TMP2]], 3
; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
; CHECK-NEXT: [[TMP4:%.*]] = select i1 [[TMP3]], i64 4, i64 [[N_MOD_VF]]
; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP2]], [[TMP4]]
; CHECK-NEXT: [[IND_END:%.*]] = shl i64 [[N_VEC]], 1
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = shl i64 [[INDEX]], 1
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[OFFSET_IDX]]
; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <8 x i32>, ptr [[TMP5]], align 4
; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
; CHECK-NEXT: [[TMP6:%.*]] = shl nsw <4 x i32> [[STRIDED_VEC]], <i32 1, i32 1, i32 1, i32 1>
; CHECK-NEXT: [[TMP7:%.*]] = and i64 [[INDEX]], 9223372036854775804
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[TMP7]]
; CHECK-NEXT: store <4 x i32> [[TMP6]], ptr [[TMP8]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: br label [[SCALAR_PH]]
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.cond.cleanup:
; CHECK-NEXT: ret void
; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDVARS_IV]]
; CHECK-NEXT: [[TMP:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
; CHECK-NEXT: [[MUL:%.*]] = shl nsw i32 [[TMP]], 1
; CHECK-NEXT: [[TMP1:%.*]] = lshr exact i64 [[INDVARS_IV]], 1
; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP1]]
; CHECK-NEXT: store i32 [[MUL]], ptr [[ARRAYIDX2]], align 4
; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 2
; CHECK-NEXT: [[CMP:%.*]] = icmp ult i64 [[INDVARS_IV_NEXT]], [[N]]
; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP:%.*]], !llvm.loop [[LOOP15:![0-9]+]]

for.cond.cleanup:                                 ; preds = %for.body

for.body:                                         ; preds = %for.body, %entry
  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
  %arrayidx = getelementptr inbounds i32, ptr %A, i64 %indvars.iv
  %tmp = load i32, ptr %arrayidx, align 4
  %mul = shl nsw i32 %tmp, 1
  %tmp1 = lshr exact i64 %indvars.iv, 1
  %arrayidx2 = getelementptr inbounds i32, ptr %B, i64 %tmp1
  store i32 %mul, ptr %arrayidx2, align 4
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2
  %cmp = icmp ult i64 %indvars.iv.next, %N
  br i1 %cmp, label %for.body, label %for.cond.cleanup
; Check vectorization on a reverse interleaved load group of factor 2 with 1
; gap and a reverse interleaved store group of factor 2. The interleaved load
; group should be removed since it has a gap and is reverse.
;
; void load_gap_reverse(struct pair *P1, struct pair *P2, int X) {
;   for (int i = 1023; i >= 0; i--) {
;     int a = X + i;
;     int b = B[i].y - i;
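;     // Stores sketched from the IR below, which writes P1[i].x and P2[i].y;
;     // the A/B names follow the original comment's wording.
;     A[i].x = a;
;     B[i].y = b;
;   }
; }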
%pair = type { i64, i64 }
define void @load_gap_reverse(ptr noalias nocapture %P1, ptr noalias nocapture %P2, i64 %X) {
; CHECK-LABEL: @load_gap_reverse(
; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[X:%.*]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 1023, i64 1022, i64 1021, i64 1020>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = sub i64 1023, [[INDEX]]
; CHECK-NEXT: [[TMP0:%.*]] = sub i64 1022, [[INDEX]]
; CHECK-NEXT: [[TMP1:%.*]] = sub i64 1021, [[INDEX]]
; CHECK-NEXT: [[TMP2:%.*]] = sub i64 1020, [[INDEX]]
; CHECK-NEXT: [[TMP3:%.*]] = add nsw <4 x i64> [[BROADCAST_SPLAT]], [[VEC_IND]]
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[PAIR:%.*]], ptr [[P1:%.*]], i64 [[OFFSET_IDX]], i32 0
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[PAIR]], ptr [[P1]], i64 [[TMP0]], i32 0
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[PAIR]], ptr [[P1]], i64 [[TMP1]], i32 0
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[PAIR]], ptr [[P1]], i64 [[TMP2]], i32 0
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[PAIR]], ptr [[P2:%.*]], i64 [[OFFSET_IDX]], i32 1
; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[PAIR]], ptr [[P2]], i64 [[TMP0]], i32 1
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[PAIR]], ptr [[P2]], i64 [[TMP1]], i32 1
; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds [[PAIR]], ptr [[P2]], i64 [[TMP2]], i32 1
; CHECK-NEXT: [[TMP12:%.*]] = load i64, ptr [[TMP8]], align 8
; CHECK-NEXT: [[TMP13:%.*]] = load i64, ptr [[TMP9]], align 8
; CHECK-NEXT: [[TMP14:%.*]] = load i64, ptr [[TMP10]], align 8
; CHECK-NEXT: [[TMP15:%.*]] = load i64, ptr [[TMP11]], align 8
; CHECK-NEXT: [[TMP16:%.*]] = insertelement <4 x i64> poison, i64 [[TMP12]], i64 0
; CHECK-NEXT: [[TMP17:%.*]] = insertelement <4 x i64> [[TMP16]], i64 [[TMP13]], i64 1
; CHECK-NEXT: [[TMP18:%.*]] = insertelement <4 x i64> [[TMP17]], i64 [[TMP14]], i64 2
; CHECK-NEXT: [[TMP19:%.*]] = insertelement <4 x i64> [[TMP18]], i64 [[TMP15]], i64 3
; CHECK-NEXT: [[TMP20:%.*]] = sub nsw <4 x i64> [[TMP19]], [[VEC_IND]]
; CHECK-NEXT: [[TMP21:%.*]] = extractelement <4 x i64> [[TMP3]], i64 0
; CHECK-NEXT: store i64 [[TMP21]], ptr [[TMP4]], align 8
; CHECK-NEXT: [[TMP22:%.*]] = extractelement <4 x i64> [[TMP3]], i64 1
; CHECK-NEXT: store i64 [[TMP22]], ptr [[TMP5]], align 8
; CHECK-NEXT: [[TMP23:%.*]] = extractelement <4 x i64> [[TMP3]], i64 2
; CHECK-NEXT: store i64 [[TMP23]], ptr [[TMP6]], align 8
; CHECK-NEXT: [[TMP24:%.*]] = extractelement <4 x i64> [[TMP3]], i64 3
; CHECK-NEXT: store i64 [[TMP24]], ptr [[TMP7]], align 8
; CHECK-NEXT: [[TMP25:%.*]] = extractelement <4 x i64> [[TMP20]], i64 0
; CHECK-NEXT: store i64 [[TMP25]], ptr [[TMP8]], align 8
; CHECK-NEXT: [[TMP26:%.*]] = extractelement <4 x i64> [[TMP20]], i64 1
; CHECK-NEXT: store i64 [[TMP26]], ptr [[TMP9]], align 8
; CHECK-NEXT: [[TMP27:%.*]] = extractelement <4 x i64> [[TMP20]], i64 2
; CHECK-NEXT: store i64 [[TMP27]], ptr [[TMP10]], align 8
; CHECK-NEXT: [[TMP28:%.*]] = extractelement <4 x i64> [[TMP20]], i64 3
; CHECK-NEXT: store i64 [[TMP28]], ptr [[TMP11]], align 8
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], <i64 -4, i64 -4, i64 -4, i64 -4>
; CHECK-NEXT: [[TMP29:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
; CHECK-NEXT: br i1 [[TMP29]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: br i1 true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK-NEXT: br i1 poison, label [[FOR_BODY]], label [[FOR_EXIT]], !llvm.loop [[LOOP17:![0-9]+]]
; CHECK-NEXT: ret void
for.body:                                         ; preds = %for.body, %entry
  %i = phi i64 [ 1023, %entry ], [ %i.next, %for.body ]
  %0 = add nsw i64 %X, %i
  %1 = getelementptr inbounds %pair, ptr %P1, i64 %i, i32 0
  %2 = getelementptr inbounds %pair, ptr %P2, i64 %i, i32 1
  %3 = load i64, ptr %2, align 8
  %4 = sub nsw i64 %3, %i
  store i64 %0, ptr %1, align 8
  store i64 %4, ptr %2, align 8
  %i.next = add nsw i64 %i, -1
  %cond = icmp sgt i64 %i, 0
  br i1 %cond, label %for.body, label %for.exit
; Check vectorization on interleaved access groups identified from mixed
; loads/stores.
;
; void mixed_load2_store2(int *A, int *B) {
;   for (unsigned i = 0; i < 1024; i+=2) {
;     B[i] = A[i] * A[i+1];
;     B[i+1] = A[i] + A[i+1];
define void @mixed_load2_store2(ptr noalias nocapture readonly %A, ptr noalias nocapture %B) {
; CHECK-LABEL: @mixed_load2_store2(
; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = shl i64 [[INDEX]], 1
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[OFFSET_IDX]]
; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <8 x i32>, ptr [[TMP0]], align 4
; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
; CHECK-NEXT: [[TMP1:%.*]] = or i64 [[OFFSET_IDX]], 1
; CHECK-NEXT: [[TMP2:%.*]] = mul nsw <4 x i32> [[STRIDED_VEC1]], [[STRIDED_VEC]]
; CHECK-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
; CHECK-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
; CHECK-NEXT: [[TMP3:%.*]] = add nsw <4 x i32> [[STRIDED_VEC4]], [[STRIDED_VEC3]]
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[TMP1]]
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i64 -1
; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
; CHECK-NEXT: store <8 x i32> [[INTERLEAVED_VEC]], ptr [[TMP5]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 512
; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.cond.cleanup:
; CHECK-NEXT: ret void
; CHECK-NEXT: br i1 poison, label [[FOR_BODY]], label [[FOR_COND_CLEANUP]], !llvm.loop [[LOOP19:![0-9]+]]

for.cond.cleanup:                                 ; preds = %for.body

for.body:                                         ; preds = %for.body, %entry
  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
  %arrayidx = getelementptr inbounds i32, ptr %A, i64 %indvars.iv
  %tmp = load i32, ptr %arrayidx, align 4
  %tmp1 = or i64 %indvars.iv, 1
  %arrayidx2 = getelementptr inbounds i32, ptr %A, i64 %tmp1
  %tmp2 = load i32, ptr %arrayidx2, align 4
  %mul = mul nsw i32 %tmp2, %tmp
  %arrayidx4 = getelementptr inbounds i32, ptr %B, i64 %indvars.iv
  store i32 %mul, ptr %arrayidx4, align 4
  %tmp3 = load i32, ptr %arrayidx, align 4
  %tmp4 = load i32, ptr %arrayidx2, align 4
  %add10 = add nsw i32 %tmp4, %tmp3
  %arrayidx13 = getelementptr inbounds i32, ptr %B, i64 %tmp1
  store i32 %add10, ptr %arrayidx13, align 4
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2
  %cmp = icmp ult i64 %indvars.iv.next, 1024
  br i1 %cmp, label %for.body, label %for.cond.cleanup
; Check vectorization on interleaved access groups identified from mixed
; loads/stores.
;
; void mixed_load3_store3(int *A) {
;   for (unsigned i = 0; i < 1024; i++) {
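;     // Body sketched from the IR below; A advances by three ints per
;     // iteration, giving factor-3 interleaved load and store groups.
;     *A++ += i;
;     *A++ += i;
;     *A++ += i;
;   }
; }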
define void @mixed_load3_store3(ptr nocapture %A) {
; CHECK-LABEL: @mixed_load3_store3(
; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP0:%.*]] = mul i64 [[INDEX]], 12
; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 [[TMP0]]
; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <12 x i32>, ptr [[NEXT_GEP]], align 4
; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <12 x i32> [[WIDE_VEC]], <12 x i32> poison, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
; CHECK-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <12 x i32> [[WIDE_VEC]], <12 x i32> poison, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
; CHECK-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <12 x i32> [[WIDE_VEC]], <12 x i32> poison, <4 x i32> <i32 2, i32 5, i32 8, i32 11>
; CHECK-NEXT: [[TMP1:%.*]] = add <4 x i32> [[STRIDED_VEC]], [[VEC_IND]]
; CHECK-NEXT: [[TMP2:%.*]] = add <4 x i32> [[STRIDED_VEC2]], [[VEC_IND]]
; CHECK-NEXT: [[TMP3:%.*]] = add <4 x i32> [[STRIDED_VEC3]], [[VEC_IND]]
; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i32> [[TMP4]], <8 x i32> [[TMP5]], <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
; CHECK-NEXT: store <12 x i32> [[INTERLEAVED_VEC]], ptr [[NEXT_GEP]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.cond.cleanup:
; CHECK-NEXT: ret void
; CHECK-NEXT: br i1 poison, label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]]

for.cond.cleanup:                                 ; preds = %for.body

for.body:                                         ; preds = %for.body, %entry
  %i.013 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
  %A.addr.012 = phi ptr [ %A, %entry ], [ %incdec.ptr3, %for.body ]
  %incdec.ptr = getelementptr inbounds i32, ptr %A.addr.012, i64 1
  %tmp = load i32, ptr %A.addr.012, align 4
  %add = add i32 %tmp, %i.013
  store i32 %add, ptr %A.addr.012, align 4
  %incdec.ptr1 = getelementptr inbounds i32, ptr %A.addr.012, i64 2
  %tmp1 = load i32, ptr %incdec.ptr, align 4
  %add2 = add i32 %tmp1, %i.013
  store i32 %add2, ptr %incdec.ptr, align 4
  %incdec.ptr3 = getelementptr inbounds i32, ptr %A.addr.012, i64 3
  %tmp2 = load i32, ptr %incdec.ptr1, align 4
  %add4 = add i32 %tmp2, %i.013
  store i32 %add4, ptr %incdec.ptr1, align 4
  %inc = add nuw nsw i32 %i.013, 1
  %exitcond = icmp eq i32 %inc, 1024
  br i1 %exitcond, label %for.cond.cleanup, label %for.body
; Check vectorization on interleaved access groups with members having different
; types.
;
; void int_float_struct(struct IntFloat *A) {
;   for (unsigned i = 0; i < 1024; i++) {
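;     // Body sketched from the IR below; SumA/SumB are the i32 and float
;     // running sums that are stored to the globals SA and SB after the loop.
;     SumA += A[i].a;
;     SumB += A[i].b;
;   }
;   SA = SumA;
;   SB = SumB;
; }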
%struct.IntFloat = type { i32, float }

@SA = common global i32 0, align 4
@SB = common global float 0.000000e+00, align 4

define void @int_float_struct(ptr nocapture readonly %A) #0 {
; CHECK-LABEL: @int_float_struct(
; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x float> [ <float undef, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>, [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ <i32 undef, i32 0, i32 0, i32 0>, [[VECTOR_PH]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds [[STRUCT_INTFLOAT:%.*]], ptr [[A:%.*]], i64 [[INDEX]], i32 0
; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <8 x i32>, ptr [[TMP0]], align 4
; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
; CHECK-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
; CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[STRIDED_VEC2]] to <4 x float>
; CHECK-NEXT: [[TMP2]] = add <4 x i32> [[STRIDED_VEC]], [[VEC_PHI1]]
; CHECK-NEXT: [[TMP3]] = fadd fast <4 x float> [[VEC_PHI]], [[TMP1]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
; CHECK-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP2]])
; CHECK-NEXT: [[TMP6:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP3]])
; CHECK-NEXT: br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.cond.cleanup:
; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ poison, [[FOR_BODY]] ], [ [[TMP5]], [[MIDDLE_BLOCK]] ]
; CHECK-NEXT: [[ADD3_LCSSA:%.*]] = phi float [ poison, [[FOR_BODY]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ]
; CHECK-NEXT: store i32 [[ADD_LCSSA]], ptr @SA, align 4
; CHECK-NEXT: store float [[ADD3_LCSSA]], ptr @SB, align 4
; CHECK-NEXT: ret void
; CHECK-NEXT: br i1 poison, label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]]

for.cond.cleanup:                                 ; preds = %for.body
  store i32 %add, ptr @SA, align 4
  store float %add3, ptr @SB, align 4

for.body:                                         ; preds = %for.body, %entry
  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
  %SumB.014 = phi float [ undef, %entry ], [ %add3, %for.body ]
  %SumA.013 = phi i32 [ undef, %entry ], [ %add, %for.body ]
  %a = getelementptr inbounds %struct.IntFloat, ptr %A, i64 %indvars.iv, i32 0
  %tmp = load i32, ptr %a, align 4
  %add = add nsw i32 %tmp, %SumA.013
  %b = getelementptr inbounds %struct.IntFloat, ptr %A, i64 %indvars.iv, i32 1
  %tmp1 = load float, ptr %b, align 4
  %add3 = fadd fast float %SumB.014, %tmp1
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %exitcond = icmp eq i64 %indvars.iv.next, 1024
  br i1 %exitcond, label %for.cond.cleanup, label %for.body
; Check vectorization of interleaved access groups in the presence of
; dependences (PR27626). The following tests check that we don't reorder
; dependent loads and stores when generating code for interleaved access
; groups. Stores should be scalarized because the required code motion would
; break dependences, and the remaining interleaved load groups should have
; gaps.
;
; PR27626_0: Ensure a strided store is not moved after a dependent (zero
; distance) strided load.
;
; void PR27626_0(struct pair *p, int z, int n) {
;   for (int i = 0; i < n; i++) {
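;     // Body sketched from the IR below: the store to p[i].x must stay before
;     // the load of p[i].x that feeds the store to p[i].y.
;     p[i].x = z;
;     p[i].y = p[i].x;
;   }
; }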
%pair.i32 = type { i32, i32 }
define void @PR27626_0(ptr %p, i32 %z, i64 %n) {
; CHECK-LABEL: @PR27626_0(
; CHECK-NEXT: [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N:%.*]], i64 1)
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SMAX]], 5
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK-NEXT: [[N_MOD_VF:%.*]] = and i64 [[SMAX]], 3
; CHECK-NEXT: [[TMP0:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
; CHECK-NEXT: [[TMP1:%.*]] = select i1 [[TMP0]], i64 4, i64 [[N_MOD_VF]]
; CHECK-NEXT: [[N_VEC:%.*]] = sub nsw i64 [[SMAX]], [[TMP1]]
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP2:%.*]] = or i64 [[INDEX]], 1
; CHECK-NEXT: [[TMP3:%.*]] = or i64 [[INDEX]], 2
; CHECK-NEXT: [[TMP4:%.*]] = or i64 [[INDEX]], 3
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[PAIR_I32:%.*]], ptr [[P:%.*]], i64 [[INDEX]], i32 0
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP2]], i32 0
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP3]], i32 0
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP4]], i32 0
; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[INDEX]], i32 1
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP2]], i32 1
; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP3]], i32 1
; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP4]], i32 1
; CHECK-NEXT: store i32 [[Z:%.*]], ptr [[TMP5]], align 4
; CHECK-NEXT: store i32 [[Z]], ptr [[TMP6]], align 4
; CHECK-NEXT: store i32 [[Z]], ptr [[TMP7]], align 4
; CHECK-NEXT: store i32 [[Z]], ptr [[TMP8]], align 4
; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <8 x i32>, ptr [[TMP5]], align 4
; CHECK-NEXT: [[TMP13:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i64 0
; CHECK-NEXT: store i32 [[TMP13]], ptr [[TMP9]], align 4
; CHECK-NEXT: [[TMP14:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i64 2
; CHECK-NEXT: store i32 [[TMP14]], ptr [[TMP10]], align 4
; CHECK-NEXT: [[TMP15:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i64 4
; CHECK-NEXT: store i32 [[TMP15]], ptr [[TMP11]], align 4
; CHECK-NEXT: [[TMP16:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i64 6
; CHECK-NEXT: store i32 [[TMP16]], ptr [[TMP12]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: br label [[SCALAR_PH]]
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
; CHECK-NEXT: [[P_I_X:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[I]], i32 0
; CHECK-NEXT: [[P_I_Y:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[I]], i32 1
; CHECK-NEXT: store i32 [[Z]], ptr [[P_I_X]], align 4
; CHECK-NEXT: store i32 [[Z]], ptr [[P_I_Y]], align 4
; CHECK-NEXT: [[I_NEXT]] = add nuw nsw i64 [[I]], 1
; CHECK-NEXT: [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]]
; CHECK-NEXT: br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END:%.*]], !llvm.loop [[LOOP25:![0-9]+]]
; CHECK-NEXT: ret void
for.body:                                         ; preds = %for.body, %entry
  %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
  %p_i.x = getelementptr inbounds %pair.i32, ptr %p, i64 %i, i32 0
  %p_i.y = getelementptr inbounds %pair.i32, ptr %p, i64 %i, i32 1
  store i32 %z, ptr %p_i.x, align 4
  %0 = load i32, ptr %p_i.x, align 4
  store i32 %0, ptr %p_i.y, align 4
  %i.next = add nuw nsw i64 %i, 1
  %cond = icmp slt i64 %i.next, %n
  br i1 %cond, label %for.body, label %for.end
; PR27626_1: Ensure a strided load is not moved before a dependent (zero
; distance) strided store.
;
; int PR27626_1(struct pair *p, int n) {
;   int s = 0;
;   for (int i = 0; i < n; i++) {
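;     // Body sketched from the IR below: the load of p[i].x feeds the store to
;     // p[i].y, and p[i].y is then re-loaded into the reduction s.
;     p[i].y = p[i].x;
;     s += p[i].y;
;   }
;   return s;
; }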
define i32 @PR27626_1(ptr %p, i64 %n) {
; CHECK-LABEL: @PR27626_1(
; CHECK-NEXT: [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N:%.*]], i64 1)
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SMAX]], 5
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK-NEXT: [[N_MOD_VF:%.*]] = and i64 [[SMAX]], 3
; CHECK-NEXT: [[TMP0:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
; CHECK-NEXT: [[TMP1:%.*]] = select i1 [[TMP0]], i64 4, i64 [[N_MOD_VF]]
; CHECK-NEXT: [[N_VEC:%.*]] = sub nsw i64 [[SMAX]], [[TMP1]]
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP14:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP2:%.*]] = or i64 [[INDEX]], 1
; CHECK-NEXT: [[TMP3:%.*]] = or i64 [[INDEX]], 2
; CHECK-NEXT: [[TMP4:%.*]] = or i64 [[INDEX]], 3
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[PAIR_I32:%.*]], ptr [[P:%.*]], i64 [[INDEX]], i32 0
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[INDEX]], i32 1
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP2]], i32 1
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP3]], i32 1
; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP4]], i32 1
; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <8 x i32>, ptr [[TMP5]], align 4
; CHECK-NEXT: [[TMP10:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i64 0
; CHECK-NEXT: store i32 [[TMP10]], ptr [[TMP6]], align 4
; CHECK-NEXT: [[TMP11:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i64 2
; CHECK-NEXT: store i32 [[TMP11]], ptr [[TMP7]], align 4
; CHECK-NEXT: [[TMP12:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i64 4
; CHECK-NEXT: store i32 [[TMP12]], ptr [[TMP8]], align 4
; CHECK-NEXT: [[TMP13:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i64 6
; CHECK-NEXT: store i32 [[TMP13]], ptr [[TMP9]], align 4
; CHECK-NEXT: [[WIDE_VEC1:%.*]] = load <8 x i32>, ptr [[TMP6]], align 4
; CHECK-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <8 x i32> [[WIDE_VEC1]], <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
; CHECK-NEXT: [[TMP14]] = add <4 x i32> [[STRIDED_VEC2]], [[VEC_PHI]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[TMP16:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP14]])
; CHECK-NEXT: br label [[SCALAR_PH]]
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP16]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
; CHECK-NEXT: [[S:%.*]] = phi i32 [ [[TMP18:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
; CHECK-NEXT: [[P_I_X:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[I]], i32 0
; CHECK-NEXT: [[P_I_Y:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[I]], i32 1
; CHECK-NEXT: [[TMP17:%.*]] = load i32, ptr [[P_I_X]], align 4
; CHECK-NEXT: store i32 [[TMP17]], ptr [[P_I_Y]], align 4
; CHECK-NEXT: [[TMP18]] = add nsw i32 [[TMP17]], [[S]]
; CHECK-NEXT: [[I_NEXT]] = add nuw nsw i64 [[I]], 1
; CHECK-NEXT: [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]]
; CHECK-NEXT: br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END:%.*]], !llvm.loop [[LOOP27:![0-9]+]]
; CHECK-NEXT: ret i32 [[TMP18]]
for.body:                                         ; preds = %for.body, %entry
  %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
  %s = phi i32 [ %2, %for.body ], [ 0, %entry ]
  %p_i.x = getelementptr inbounds %pair.i32, ptr %p, i64 %i, i32 0
  %p_i.y = getelementptr inbounds %pair.i32, ptr %p, i64 %i, i32 1
  %0 = load i32, ptr %p_i.x, align 4
  store i32 %0, ptr %p_i.y, align 4
  %1 = load i32, ptr %p_i.y, align 4
  %2 = add nsw i32 %1, %s
  %i.next = add nuw nsw i64 %i, 1
  %cond = icmp slt i64 %i.next, %n
  br i1 %cond, label %for.body, label %for.end

for.end:                                          ; preds = %for.body
  %3 = phi i32 [ %2, %for.body ]
; PR27626_2: Ensure a strided store is not moved after a dependent (negative
; distance) strided load.
;
; void PR27626_2(struct pair *p, int z, int n) {
;   for (int i = 0; i < n; i++) {
;     p[i].x = z;
;     p[i].y = p[i - 1].x;
;   }
; }
define void @PR27626_2(ptr %p, i64 %n, i32 %z) {
; CHECK-LABEL: @PR27626_2(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N:%.*]], i64 1)
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SMAX]], 5
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK-NEXT: [[N_MOD_VF:%.*]] = and i64 [[SMAX]], 3
; CHECK-NEXT: [[TMP0:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
; CHECK-NEXT: [[TMP1:%.*]] = select i1 [[TMP0]], i64 4, i64 [[N_MOD_VF]]
; CHECK-NEXT: [[N_VEC:%.*]] = sub nsw i64 [[SMAX]], [[TMP1]]
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP2:%.*]] = or i64 [[INDEX]], 1
; CHECK-NEXT: [[TMP3:%.*]] = or i64 [[INDEX]], 2
; CHECK-NEXT: [[TMP4:%.*]] = or i64 [[INDEX]], 3
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[PAIR_I32:%.*]], ptr [[P:%.*]], i64 [[INDEX]], i32 0
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP2]], i32 0
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP3]], i32 0
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP4]], i32 0
; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 -1, i32 0
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[INDEX]], i32 1
; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP2]], i32 1
; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP3]], i32 1
; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP4]], i32 1
; CHECK-NEXT: store i32 [[Z:%.*]], ptr [[TMP5]], align 4
; CHECK-NEXT: store i32 [[Z]], ptr [[TMP6]], align 4
; CHECK-NEXT: store i32 [[Z]], ptr [[TMP7]], align 4
; CHECK-NEXT: store i32 [[Z]], ptr [[TMP8]], align 4
; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <8 x i32>, ptr [[TMP9]], align 4
; CHECK-NEXT: [[TMP14:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i64 0
; CHECK-NEXT: store i32 [[TMP14]], ptr [[TMP10]], align 4
; CHECK-NEXT: [[TMP15:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i64 2
; CHECK-NEXT: store i32 [[TMP15]], ptr [[TMP11]], align 4
; CHECK-NEXT: [[TMP16:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i64 4
; CHECK-NEXT: store i32 [[TMP16]], ptr [[TMP12]], align 4
; CHECK-NEXT: [[TMP17:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i64 6
; CHECK-NEXT: store i32 [[TMP17]], ptr [[TMP13]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: br label [[SCALAR_PH]]
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
; CHECK-NEXT: [[P_I_X:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[I]], i32 0
; CHECK-NEXT: [[P_I_MINUS_1_X:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 -1, i32 0
; CHECK-NEXT: [[P_I_Y:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[I]], i32 1
; CHECK-NEXT: store i32 [[Z]], ptr [[P_I_X]], align 4
; CHECK-NEXT: [[TMP19:%.*]] = load i32, ptr [[P_I_MINUS_1_X]], align 4
; CHECK-NEXT: store i32 [[TMP19]], ptr [[P_I_Y]], align 4
; CHECK-NEXT: [[I_NEXT]] = add nuw nsw i64 [[I]], 1
; CHECK-NEXT: [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]]
; CHECK-NEXT: br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END:%.*]], !llvm.loop [[LOOP29:![0-9]+]]
; CHECK-NEXT: ret void
1140 %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
1141 %i_minus_1 = add nuw nsw i64 %i, -1
1142 %p_i.x = getelementptr inbounds %pair.i32, ptr %p, i64 %i, i32 0
1143 %p_i_minus_1.x = getelementptr inbounds %pair.i32, ptr %p, i64 %i_minus_1, i32 0
1144 %p_i.y = getelementptr inbounds %pair.i32, ptr %p, i64 %i, i32 1
1145 store i32 %z, ptr %p_i.x, align 4
1146 %0 = load i32, ptr %p_i_minus_1.x, align 4
1147 store i32 %0, ptr %p_i.y, align 4
1148 %i.next = add nuw nsw i64 %i, 1
1149 %cond = icmp slt i64 %i.next, %n
1150 br i1 %cond, label %for.body, label %for.end

for.end:
  ret void
}
1156 ; PR27626_3: Ensure a strided load is not moved before a dependent (negative
1157 ; distance) strided store.
1159 ; void PR27626_3(struct pair *p, int z, int n) {
1160 ;   for (int i = 0; i < n; i++) {
1161 ;     p[i + 1].y = p[i].x;
;     s += p[i].y;
;   }
; }
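;
; For illustration only, a minimal two-iteration trace of the loop above (a
; sketch inferred from the scalar IR below, not part of the original test):
; the strided load of p[i].y must not be hoisted above the store to
; p[i + 1].y, because the next iteration reads the element just written.
;
;   i = 0: p[1].y = p[0].x;  s += p[0].y;
;   i = 1: p[2].y = p[1].x;  s += p[1].y;   // reads the value stored when i = 0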
1167 define i32 @PR27626_3(ptr %p, i64 %n, i32 %z) {
1168 ; CHECK-LABEL: @PR27626_3(
1169 ; CHECK-NEXT: entry:
1170 ; CHECK-NEXT: [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N:%.*]], i64 1)
1171 ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SMAX]], 5
1172 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
1174 ; CHECK-NEXT: [[N_MOD_VF:%.*]] = and i64 [[SMAX]], 3
1175 ; CHECK-NEXT: [[TMP0:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
1176 ; CHECK-NEXT: [[TMP1:%.*]] = select i1 [[TMP0]], i64 4, i64 [[N_MOD_VF]]
1177 ; CHECK-NEXT: [[N_VEC:%.*]] = sub nsw i64 [[SMAX]], [[TMP1]]
1178 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
1179 ; CHECK: vector.body:
1180 ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
1181 ; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
1182 ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP17:%.*]], [[VECTOR_BODY]] ]
1183 ; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw <4 x i64> [[VEC_IND]], <i64 1, i64 1, i64 1, i64 1>
1184 ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds [[PAIR_I32:%.*]], ptr [[P:%.*]], i64 [[INDEX]], i32 0
1185 ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[INDEX]], i32 1
1186 ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP2]], i64 0
1187 ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP5]], i32 1
1188 ; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP2]], i64 1
1189 ; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP7]], i32 1
1190 ; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x i64> [[TMP2]], i64 2
1191 ; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP9]], i32 1
1192 ; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x i64> [[TMP2]], i64 3
1193 ; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP11]], i32 1
1194 ; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <8 x i32>, ptr [[TMP3]], align 4
1195 ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i64 0
1196 ; CHECK-NEXT: store i32 [[TMP13]], ptr [[TMP6]], align 4
1197 ; CHECK-NEXT: [[TMP14:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i64 2
1198 ; CHECK-NEXT: store i32 [[TMP14]], ptr [[TMP8]], align 4
1199 ; CHECK-NEXT: [[TMP15:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i64 4
1200 ; CHECK-NEXT: store i32 [[TMP15]], ptr [[TMP10]], align 4
1201 ; CHECK-NEXT: [[TMP16:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i64 6
1202 ; CHECK-NEXT: store i32 [[TMP16]], ptr [[TMP12]], align 4
1203 ; CHECK-NEXT: [[WIDE_VEC1:%.*]] = load <8 x i32>, ptr [[TMP4]], align 4
1204 ; CHECK-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <8 x i32> [[WIDE_VEC1]], <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
1205 ; CHECK-NEXT: [[TMP17]] = add <4 x i32> [[STRIDED_VEC2]], [[VEC_PHI]]
1206 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
1207 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], <i64 4, i64 4, i64 4, i64 4>
1208 ; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
1209 ; CHECK-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP30:![0-9]+]]
1210 ; CHECK: middle.block:
1211 ; CHECK-NEXT: [[TMP19:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP17]])
1212 ; CHECK-NEXT: br label [[SCALAR_PH]]
1214 ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
1215 ; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP19]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
1216 ; CHECK-NEXT: br label [[FOR_BODY:%.*]]
1218 ; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
1219 ; CHECK-NEXT: [[S:%.*]] = phi i32 [ [[TMP22:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
1220 ; CHECK-NEXT: [[I_PLUS_1:%.*]] = add nuw nsw i64 [[I]], 1
1221 ; CHECK-NEXT: [[P_I_X:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[I]], i32 0
1222 ; CHECK-NEXT: [[P_I_Y:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[I]], i32 1
1223 ; CHECK-NEXT: [[P_I_PLUS_1_Y:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[I_PLUS_1]], i32 1
1224 ; CHECK-NEXT: [[TMP20:%.*]] = load i32, ptr [[P_I_X]], align 4
1225 ; CHECK-NEXT: store i32 [[TMP20]], ptr [[P_I_PLUS_1_Y]], align 4
1226 ; CHECK-NEXT: [[TMP21:%.*]] = load i32, ptr [[P_I_Y]], align 4
1227 ; CHECK-NEXT: [[TMP22]] = add nsw i32 [[TMP21]], [[S]]
1228 ; CHECK-NEXT: [[I_NEXT]] = add nuw nsw i64 [[I]], 1
1229 ; CHECK-NEXT: [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]]
1230 ; CHECK-NEXT: br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END:%.*]], !llvm.loop [[LOOP31:![0-9]+]]
1232 ; CHECK-NEXT: ret i32 [[TMP22]]
;
entry:
  br label %for.body

for.body:
1238 %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
1239 %s = phi i32 [ %2, %for.body ], [ 0, %entry ]
1240 %i_plus_1 = add nuw nsw i64 %i, 1
1241 %p_i.x = getelementptr inbounds %pair.i32, ptr %p, i64 %i, i32 0
1242 %p_i.y = getelementptr inbounds %pair.i32, ptr %p, i64 %i, i32 1
1243 %p_i_plus_1.y = getelementptr inbounds %pair.i32, ptr %p, i64 %i_plus_1, i32 1
1244 %0 = load i32, ptr %p_i.x, align 4
1245 store i32 %0, ptr %p_i_plus_1.y, align 4
1246 %1 = load i32, ptr %p_i.y, align 4
1247 %2 = add nsw i32 %1, %s
1248 %i.next = add nuw nsw i64 %i, 1
1249 %cond = icmp slt i64 %i.next, %n
1250 br i1 %cond, label %for.body, label %for.end

for.end:
1253 %3 = phi i32 [ %2, %for.body ]
  ret i32 %3
}
1257 ; PR27626_4: Ensure we form an interleaved group for strided stores in the
1258 ; presence of a write-after-write dependence. We create a group for
1259 ; (2) and (3) while excluding (1).
1261 ; void PR27626_4(int *a, int x, int y, int z, int n) {
1262 ;   for (int i = 0; i < n; i += 2) {
;     a[i] = x; // (1)
;     a[i] = y; // (2)
1265 ;     a[i + 1] = z; // (3)
;   }
; }
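;
; A minimal sketch of the per-iteration memory effect (assuming the loop above;
; not part of the original test): (2) overwrites (1) within the same iteration,
; so the group formed from (2) and (3) must leave memory as if only these two
; stores ran back to back:
;
;   a[i]     = y;  // from (2); (1) stays outside the group
;   a[i + 1] = z;  // from (3)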
1270 define void @PR27626_4(ptr %a, i32 %x, i32 %y, i32 %z, i64 %n) {
1271 ; CHECK-LABEL: @PR27626_4(
1272 ; CHECK-NEXT: entry:
1273 ; CHECK-NEXT: [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N:%.*]], i64 2)
1274 ; CHECK-NEXT: [[TMP0:%.*]] = add nsw i64 [[SMAX]], -1
1275 ; CHECK-NEXT: [[TMP1:%.*]] = lshr i64 [[TMP0]], 1
1276 ; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1
1277 ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SMAX]], 7
1278 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
1280 ; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[TMP2]], 9223372036854775804
1281 ; CHECK-NEXT: [[IND_END:%.*]] = shl nuw i64 [[N_VEC]], 1
1282 ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[Y:%.*]], i64 0
1283 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
1284 ; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i32> poison, i32 [[Z:%.*]], i64 0
1285 ; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT1]], <4 x i32> poison, <4 x i32> zeroinitializer
1286 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
1287 ; CHECK: vector.body:
1288 ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
1289 ; CHECK-NEXT: [[OFFSET_IDX:%.*]] = shl i64 [[INDEX]], 1
1290 ; CHECK-NEXT: [[TMP3:%.*]] = or i64 [[OFFSET_IDX]], 2
1291 ; CHECK-NEXT: [[TMP4:%.*]] = or i64 [[OFFSET_IDX]], 4
1292 ; CHECK-NEXT: [[TMP5:%.*]] = or i64 [[OFFSET_IDX]], 6
1293 ; CHECK-NEXT: [[TMP6:%.*]] = or i64 [[OFFSET_IDX]], 1
1294 ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[OFFSET_IDX]]
1295 ; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP3]]
1296 ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP4]]
1297 ; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP5]]
1298 ; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP6]]
1299 ; CHECK-NEXT: store i32 [[X:%.*]], ptr [[TMP7]], align 4
1300 ; CHECK-NEXT: store i32 [[X]], ptr [[TMP8]], align 4
1301 ; CHECK-NEXT: store i32 [[X]], ptr [[TMP9]], align 4
1302 ; CHECK-NEXT: store i32 [[X]], ptr [[TMP10]], align 4
1303 ; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP11]], i64 -1
1304 ; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLAT]], <4 x i32> [[BROADCAST_SPLAT2]], <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
1305 ; CHECK-NEXT: store <8 x i32> [[INTERLEAVED_VEC]], ptr [[TMP12]], align 4
1306 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
1307 ; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
1308 ; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP32:![0-9]+]]
1309 ; CHECK: middle.block:
1310 ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
1311 ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
1313 ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
1314 ; CHECK-NEXT: br label [[FOR_BODY:%.*]]
1316 ; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
1317 ; CHECK-NEXT: [[I_PLUS_1:%.*]] = or i64 [[I]], 1
1318 ; CHECK-NEXT: [[A_I:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[I]]
1319 ; CHECK-NEXT: [[A_I_PLUS_1:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[I_PLUS_1]]
1320 ; CHECK-NEXT: store i32 [[Y]], ptr [[A_I]], align 4
1321 ; CHECK-NEXT: store i32 [[Z]], ptr [[A_I_PLUS_1]], align 4
1322 ; CHECK-NEXT: [[I_NEXT]] = add nuw nsw i64 [[I]], 2
1323 ; CHECK-NEXT: [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]]
1324 ; CHECK-NEXT: br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP33:![0-9]+]]
1326 ; CHECK-NEXT: ret void
;
entry:
  br label %for.body

for.body:
1332 %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
1333 %i_plus_1 = add i64 %i, 1
1334 %a_i = getelementptr inbounds i32, ptr %a, i64 %i
1335 %a_i_plus_1 = getelementptr inbounds i32, ptr %a, i64 %i_plus_1
1336 store i32 %x, ptr %a_i, align 4
1337 store i32 %y, ptr %a_i, align 4
1338 store i32 %z, ptr %a_i_plus_1, align 4
1339 %i.next = add nuw nsw i64 %i, 2
1340 %cond = icmp slt i64 %i.next, %n
1341 br i1 %cond, label %for.body, label %for.end

for.end:
  ret void
}
1347 ; PR27626_5: Ensure we do not form an interleaved group for strided stores in
1348 ; the presence of a write-after-write dependence.
1350 ; void PR27626_5(int *a, int x, int y, int z, int n) {
1351 ;   for (int i = 3; i < n; i += 2) {
;     a[i - 1] = x;
;     a[i - 3] = y;
;     a[i] = z;
;   }
; }
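;
; A minimal two-iteration sketch (assuming the loop above; not part of the
; original test) of the cross-iteration write-after-write that keeps these
; strided stores out of a single interleaved group:
;
;   i = 3: a[2] = x;  a[0] = y;  a[3] = z;
;   i = 5: a[4] = x;  a[2] = y;  a[5] = z;   // a[2] is written again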
1359 define void @PR27626_5(ptr %a, i32 %x, i32 %y, i32 %z, i64 %n) {
1360 ; CHECK-LABEL: @PR27626_5(
1361 ; CHECK-NEXT: entry:
1362 ; CHECK-NEXT: [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N:%.*]], i64 5)
1363 ; CHECK-NEXT: [[TMP0:%.*]] = add nsw i64 [[SMAX]], -4
1364 ; CHECK-NEXT: [[TMP1:%.*]] = lshr i64 [[TMP0]], 1
1365 ; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1
1366 ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 6
1367 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
1369 ; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[TMP2]], 9223372036854775804
1370 ; CHECK-NEXT: [[TMP3:%.*]] = shl nuw i64 [[N_VEC]], 1
1371 ; CHECK-NEXT: [[IND_END:%.*]] = or i64 [[TMP3]], 3
1372 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
1373 ; CHECK: vector.body:
1374 ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
1375 ; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 3, i64 5, i64 7, i64 9>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
1376 ; CHECK-NEXT: [[TMP4:%.*]] = shl i64 [[INDEX]], 1
1377 ; CHECK-NEXT: [[OFFSET_IDX:%.*]] = or i64 [[TMP4]], 3
1378 ; CHECK-NEXT: [[TMP5:%.*]] = or i64 [[TMP4]], 5
1379 ; CHECK-NEXT: [[TMP6:%.*]] = or i64 [[TMP4]], 7
1380 ; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[TMP4]], 9
1381 ; CHECK-NEXT: [[TMP8:%.*]] = add <4 x i64> [[VEC_IND]], <i64 -1, i64 -1, i64 -1, i64 -1>
1382 ; CHECK-NEXT: [[TMP9:%.*]] = add <4 x i64> [[VEC_IND]], <i64 -3, i64 -3, i64 -3, i64 -3>
1383 ; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[OFFSET_IDX]]
1384 ; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP5]]
1385 ; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP6]]
1386 ; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP7]]
1387 ; CHECK-NEXT: [[TMP14:%.*]] = extractelement <4 x i64> [[TMP8]], i64 0
1388 ; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP14]]
1389 ; CHECK-NEXT: [[TMP16:%.*]] = extractelement <4 x i64> [[TMP8]], i64 1
1390 ; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP16]]
1391 ; CHECK-NEXT: [[TMP18:%.*]] = extractelement <4 x i64> [[TMP8]], i64 2
1392 ; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP18]]
1393 ; CHECK-NEXT: [[TMP20:%.*]] = extractelement <4 x i64> [[TMP8]], i64 3
1394 ; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP20]]
1395 ; CHECK-NEXT: [[TMP22:%.*]] = extractelement <4 x i64> [[TMP9]], i64 0
1396 ; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP22]]
1397 ; CHECK-NEXT: [[TMP24:%.*]] = extractelement <4 x i64> [[TMP9]], i64 1
1398 ; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP24]]
1399 ; CHECK-NEXT: [[TMP26:%.*]] = extractelement <4 x i64> [[TMP9]], i64 2
1400 ; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP26]]
1401 ; CHECK-NEXT: [[TMP28:%.*]] = extractelement <4 x i64> [[TMP9]], i64 3
1402 ; CHECK-NEXT: [[TMP29:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP28]]
1403 ; CHECK-NEXT: store i32 [[X:%.*]], ptr [[TMP15]], align 4
1404 ; CHECK-NEXT: store i32 [[X]], ptr [[TMP17]], align 4
1405 ; CHECK-NEXT: store i32 [[X]], ptr [[TMP19]], align 4
1406 ; CHECK-NEXT: store i32 [[X]], ptr [[TMP21]], align 4
1407 ; CHECK-NEXT: store i32 [[Y:%.*]], ptr [[TMP23]], align 4
1408 ; CHECK-NEXT: store i32 [[Y]], ptr [[TMP25]], align 4
1409 ; CHECK-NEXT: store i32 [[Y]], ptr [[TMP27]], align 4
1410 ; CHECK-NEXT: store i32 [[Y]], ptr [[TMP29]], align 4
1411 ; CHECK-NEXT: store i32 [[Z:%.*]], ptr [[TMP10]], align 4
1412 ; CHECK-NEXT: store i32 [[Z]], ptr [[TMP11]], align 4
1413 ; CHECK-NEXT: store i32 [[Z]], ptr [[TMP12]], align 4
1414 ; CHECK-NEXT: store i32 [[Z]], ptr [[TMP13]], align 4
1415 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
1416 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], <i64 8, i64 8, i64 8, i64 8>
1417 ; CHECK-NEXT: [[TMP30:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
1418 ; CHECK-NEXT: br i1 [[TMP30]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP34:![0-9]+]]
1419 ; CHECK: middle.block:
1420 ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
1421 ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
1423 ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 3, [[ENTRY:%.*]] ]
1424 ; CHECK-NEXT: br label [[FOR_BODY:%.*]]
1426 ; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
1427 ; CHECK-NEXT: [[I_MINUS_1:%.*]] = add i64 [[I]], -1
1428 ; CHECK-NEXT: [[I_MINUS_3:%.*]] = add i64 [[I]], -3
1429 ; CHECK-NEXT: [[A_I:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[I]]
1430 ; CHECK-NEXT: [[A_I_MINUS_1:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[I_MINUS_1]]
1431 ; CHECK-NEXT: [[A_I_MINUS_3:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[I_MINUS_3]]
1432 ; CHECK-NEXT: store i32 [[X]], ptr [[A_I_MINUS_1]], align 4
1433 ; CHECK-NEXT: store i32 [[Y]], ptr [[A_I_MINUS_3]], align 4
1434 ; CHECK-NEXT: store i32 [[Z]], ptr [[A_I]], align 4
1435 ; CHECK-NEXT: [[I_NEXT]] = add nuw nsw i64 [[I]], 2
1436 ; CHECK-NEXT: [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]]
1437 ; CHECK-NEXT: br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP35:![0-9]+]]
1439 ; CHECK-NEXT: ret void
;
entry:
  br label %for.body

for.body:
1445 %i = phi i64 [ %i.next, %for.body ], [ 3, %entry ]
1446 %i_minus_1 = sub i64 %i, 1
1447 %i_minus_3 = sub i64 %i_minus_1, 2
1448 %a_i = getelementptr inbounds i32, ptr %a, i64 %i
1449 %a_i_minus_1 = getelementptr inbounds i32, ptr %a, i64 %i_minus_1
1450 %a_i_minus_3 = getelementptr inbounds i32, ptr %a, i64 %i_minus_3
1451 store i32 %x, ptr %a_i_minus_1, align 4
1452 store i32 %y, ptr %a_i_minus_3, align 4
1453 store i32 %z, ptr %a_i, align 4
1454 %i.next = add nuw nsw i64 %i, 2
1455 %cond = icmp slt i64 %i.next, %n
1456 br i1 %cond, label %for.body, label %for.end

for.end:
  ret void
}
1462 ; PR34743: Ensure that a cast which needs to sink after a load that belongs to
1463 ;          an interleaved group indeed gets sunk.
1465 ; void PR34743(short *a, int *b, int n) {
1466 ;   for (int i = 0, iv = 0; iv < n; i++, iv += 2) {
1467 ;     b[i] = a[iv] * a[iv+1] * a[iv+2];
;   }
; }
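;
; A minimal sketch of the first-order recurrence (assuming the loop above; not
; part of the original test): the a[iv + 2] value loaded in one iteration is
; the first factor of the next, so its sign-extension has to sink below the
; interleaved load rather than stay with the recurrence phi:
;
;   iv = 0: b[0] = a[0] * a[1] * a[2];
;   iv = 2: b[1] = a[2] * a[3] * a[4];   // a[2] carried over from the previous iteration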
1472 define void @PR34743(ptr %a, ptr %b, i64 %n) {
1473 ; CHECK-LABEL: @PR34743(
1474 ; CHECK-NEXT: entry:
1475 ; CHECK-NEXT: [[DOTPRE:%.*]] = load i16, ptr [[A:%.*]], align 2
1476 ; CHECK-NEXT: [[TMP0:%.*]] = lshr i64 [[N:%.*]], 1
1477 ; CHECK-NEXT: [[TMP1:%.*]] = add nuw i64 [[TMP0]], 1
1478 ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 6
1479 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
1480 ; CHECK: vector.memcheck:
1481 ; CHECK-NEXT: [[TMP2:%.*]] = shl i64 [[N]], 1
1482 ; CHECK-NEXT: [[TMP3:%.*]] = and i64 [[TMP2]], -4
1483 ; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[TMP3]], 4
1484 ; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[B:%.*]], i64 [[TMP4]]
1485 ; CHECK-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[A]], i64 2
1486 ; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[TMP3]], 6
1487 ; CHECK-NEXT: [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP5]]
1488 ; CHECK-NEXT: [[BOUND0:%.*]] = icmp ugt ptr [[SCEVGEP2]], [[B]]
1489 ; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[SCEVGEP1]], [[SCEVGEP]]
1490 ; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
1491 ; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
1493 ; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[TMP1]], -4
1494 ; CHECK-NEXT: [[IND_END:%.*]] = shl i64 [[N_VEC]], 1
1495 ; CHECK-NEXT: [[VECTOR_RECUR_INIT:%.*]] = insertelement <4 x i16> poison, i16 [[DOTPRE]], i64 3
1496 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
1497 ; CHECK: vector.body:
1498 ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
1499 ; CHECK-NEXT: [[VECTOR_RECUR:%.*]] = phi <4 x i16> [ [[VECTOR_RECUR_INIT]], [[VECTOR_PH]] ], [ [[STRIDED_VEC4:%.*]], [[VECTOR_BODY]] ]
1500 ; CHECK-NEXT: [[OFFSET_IDX:%.*]] = shl i64 [[INDEX]], 1
1501 ; CHECK-NEXT: [[TMP6:%.*]] = or i64 [[OFFSET_IDX]], 1
1502 ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i16, ptr [[A]], i64 [[TMP6]]
1503 ; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <8 x i16>, ptr [[TMP7]], align 4
1504 ; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <8 x i16> [[WIDE_VEC]], <8 x i16> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
1505 ; CHECK-NEXT: [[STRIDED_VEC4]] = shufflevector <8 x i16> [[WIDE_VEC]], <8 x i16> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
1506 ; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x i16> [[VECTOR_RECUR]], <4 x i16> [[STRIDED_VEC4]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
1507 ; CHECK-NEXT: [[TMP9:%.*]] = sext <4 x i16> [[TMP8]] to <4 x i32>
1508 ; CHECK-NEXT: [[TMP10:%.*]] = sext <4 x i16> [[STRIDED_VEC]] to <4 x i32>
1509 ; CHECK-NEXT: [[TMP11:%.*]] = sext <4 x i16> [[STRIDED_VEC4]] to <4 x i32>
1510 ; CHECK-NEXT: [[TMP12:%.*]] = mul nsw <4 x i32> [[TMP9]], [[TMP10]]
1511 ; CHECK-NEXT: [[TMP13:%.*]] = mul nsw <4 x i32> [[TMP12]], [[TMP11]]
1512 ; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDEX]]
1513 ; CHECK-NEXT: store <4 x i32> [[TMP13]], ptr [[TMP14]], align 4, !alias.scope !36, !noalias !39
1514 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
1515 ; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
1516 ; CHECK-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP41:![0-9]+]]
1517 ; CHECK: middle.block:
1518 ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP1]], [[N_VEC]]
1519 ; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <8 x i16> [[WIDE_VEC]], i64 7
1520 ; CHECK-NEXT: br i1 [[CMP_N]], label [[END:%.*]], label [[SCALAR_PH]]
1522 ; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[DOTPRE]], [[VECTOR_MEMCHECK]] ], [ [[DOTPRE]], [[ENTRY:%.*]] ], [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ]
1523 ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ENTRY]] ], [ [[IND_END]], [[MIDDLE_BLOCK]] ]
1524 ; CHECK-NEXT: [[BC_RESUME_VAL3:%.*]] = phi i64 [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ENTRY]] ], [ [[N_VEC]], [[MIDDLE_BLOCK]] ]
1525 ; CHECK-NEXT: br label [[LOOP:%.*]]
1527 ; CHECK-NEXT: [[SCALAR_RECUR:%.*]] = phi i16 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[LOAD2:%.*]], [[LOOP]] ]
1528 ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV2:%.*]], [[LOOP]] ]
1529 ; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL3]], [[SCALAR_PH]] ], [ [[I1:%.*]], [[LOOP]] ]
1530 ; CHECK-NEXT: [[CONV:%.*]] = sext i16 [[SCALAR_RECUR]] to i32
1531 ; CHECK-NEXT: [[I1]] = add nuw nsw i64 [[I]], 1
1532 ; CHECK-NEXT: [[IV1:%.*]] = or i64 [[IV]], 1
1533 ; CHECK-NEXT: [[IV2]] = add nuw nsw i64 [[IV]], 2
1534 ; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds i16, ptr [[A]], i64 [[IV1]]
1535 ; CHECK-NEXT: [[LOAD1:%.*]] = load i16, ptr [[GEP1]], align 4
1536 ; CHECK-NEXT: [[CONV1:%.*]] = sext i16 [[LOAD1]] to i32
1537 ; CHECK-NEXT: [[GEP2:%.*]] = getelementptr inbounds i16, ptr [[A]], i64 [[IV2]]
1538 ; CHECK-NEXT: [[LOAD2]] = load i16, ptr [[GEP2]], align 4
1539 ; CHECK-NEXT: [[CONV2:%.*]] = sext i16 [[LOAD2]] to i32
1540 ; CHECK-NEXT: [[MUL01:%.*]] = mul nsw i32 [[CONV]], [[CONV1]]
1541 ; CHECK-NEXT: [[MUL012:%.*]] = mul nsw i32 [[MUL01]], [[CONV2]]
1542 ; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[I]]
1543 ; CHECK-NEXT: store i32 [[MUL012]], ptr [[ARRAYIDX5]], align 4
1544 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV]], [[N]]
1545 ; CHECK-NEXT: br i1 [[EXITCOND]], label [[END]], label [[LOOP]], !llvm.loop [[LOOP42:![0-9]+]]
1547 ; CHECK-NEXT: ret void
;
entry:
1550 %.pre = load i16, ptr %a
  br label %loop

loop:
1554 %0 = phi i16 [ %.pre, %entry ], [ %load2, %loop ]
1555 %iv = phi i64 [ 0, %entry ], [ %iv2, %loop ]
1556 %i = phi i64 [ 0, %entry ], [ %i1, %loop ]
1557 %conv = sext i16 %0 to i32
1558 %i1 = add nuw nsw i64 %i, 1
1559 %iv1 = add nuw nsw i64 %iv, 1
1560 %iv2 = add nuw nsw i64 %iv, 2
1561 %gep1 = getelementptr inbounds i16, ptr %a, i64 %iv1
1562 %load1 = load i16, ptr %gep1, align 4
1563 %conv1 = sext i16 %load1 to i32
1564 %gep2 = getelementptr inbounds i16, ptr %a, i64 %iv2
1565 %load2 = load i16, ptr %gep2, align 4
1566 %conv2 = sext i16 %load2 to i32
1567 %mul01 = mul nsw i32 %conv, %conv1
1568 %mul012 = mul nsw i32 %mul01, %conv2
1569 %arrayidx5 = getelementptr inbounds i32, ptr %b, i64 %i
1570 store i32 %mul012, ptr %arrayidx5
1571 %exitcond = icmp eq i64 %iv, %n
1572 br i1 %exitcond, label %end, label %loop

end:
  ret void
}
1578 attributes #0 = { "unsafe-fp-math"="true" }