; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -S -loop-vectorize -instcombine -force-vector-width=4 -force-vector-interleave=1 -enable-interleaved-mem-accesses=true -runtime-memory-check-threshold=24 < %s | FileCheck %s
target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
; Check vectorization on an interleaved load group of factor 2 and an interleaved
; store group of factor 2.
; void test_array_load2_store2(int C, int D) {
; for (int i = 0; i < 1024; i+=2) {
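; (The loop body is elided above; the following is a sketch inferred from the
;  scalar IR below, with the local names a and b assumed:)
;     int a = AB[i];
;     int b = AB[i + 1];
;     CD[i] = a + C;
;     CD[i + 1] = b * D;
;   }
; }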
@AB = common global [1024 x i32] zeroinitializer, align 4
@CD = common global [1024 x i32] zeroinitializer, align 4
define void @test_array_load2_store2(i32 %C, i32 %D) {
; CHECK-LABEL: @test_array_load2_store2(
; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[C:%.*]], i32 0
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
; CHECK-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <4 x i32> poison, i32 [[D:%.*]], i32 0
; CHECK-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT2]], <4 x i32> poison, <4 x i32> zeroinitializer
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = shl i64 [[INDEX]], 1
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* @AB, i64 0, i64 [[OFFSET_IDX]]
; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <8 x i32>*
; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <8 x i32>, <8 x i32>* [[TMP1]], align 4
; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
; CHECK-NEXT: [[TMP2:%.*]] = or i64 [[OFFSET_IDX]], 1
; CHECK-NEXT: [[TMP3:%.*]] = add nsw <4 x i32> [[STRIDED_VEC]], [[BROADCAST_SPLAT]]
; CHECK-NEXT: [[TMP4:%.*]] = mul nsw <4 x i32> [[STRIDED_VEC1]], [[BROADCAST_SPLAT3]]
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* @CD, i64 0, i64 [[TMP2]]
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP5]], i64 -1
; CHECK-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <8 x i32>*
; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
; CHECK-NEXT: store <8 x i32> [[INTERLEAVED_VEC]], <8 x i32>* [[TMP7]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 512
; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK-NEXT: br i1 undef, label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP2:![0-9]+]]
; CHECK-NEXT: ret void
for.body: ; preds = %for.body, %entry
%indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
%arrayidx0 = getelementptr inbounds [1024 x i32], [1024 x i32]* @AB, i64 0, i64 %indvars.iv
%tmp = load i32, i32* %arrayidx0, align 4
%tmp1 = or i64 %indvars.iv, 1
%arrayidx1 = getelementptr inbounds [1024 x i32], [1024 x i32]* @AB, i64 0, i64 %tmp1
%tmp2 = load i32, i32* %arrayidx1, align 4
%add = add nsw i32 %tmp, %C
%mul = mul nsw i32 %tmp2, %D
%arrayidx2 = getelementptr inbounds [1024 x i32], [1024 x i32]* @CD, i64 0, i64 %indvars.iv
store i32 %add, i32* %arrayidx2, align 4
%arrayidx3 = getelementptr inbounds [1024 x i32], [1024 x i32]* @CD, i64 0, i64 %tmp1
store i32 %mul, i32* %arrayidx3, align 4
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 2
%cmp = icmp slt i64 %indvars.iv.next, 1024
br i1 %cmp, label %for.body, label %for.end
for.end: ; preds = %for.body
; void test_struct_st3() {
; for (int i = 0; i < 1024; i++) {
%struct.ST3 = type { i32, i32, i32 }
@A = common global [3072 x i32] zeroinitializer, align 4
@S = common global [1024 x %struct.ST3] zeroinitializer, align 4
define void @test_struct_array_load3_store3() {
; CHECK-LABEL: @test_struct_array_load3_store3(
; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP0:%.*]] = mul i64 [[INDEX]], 3
; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr [3072 x i32], [3072 x i32]* @A, i64 0, i64 [[TMP0]]
; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[NEXT_GEP]] to <12 x i32>*
; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <12 x i32>, <12 x i32>* [[TMP1]], align 4
; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <12 x i32> [[WIDE_VEC]], <12 x i32> poison, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
; CHECK-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <12 x i32> [[WIDE_VEC]], <12 x i32> poison, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
; CHECK-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <12 x i32> [[WIDE_VEC]], <12 x i32> poison, <4 x i32> <i32 2, i32 5, i32 8, i32 11>
; CHECK-NEXT: [[TMP2:%.*]] = add nsw <4 x i32> [[STRIDED_VEC]], <i32 1, i32 1, i32 1, i32 1>
; CHECK-NEXT: [[TMP3:%.*]] = add nsw <4 x i32> [[STRIDED_VEC2]], <i32 2, i32 2, i32 2, i32 2>
; CHECK-NEXT: [[TMP4:%.*]] = add nsw <4 x i32> [[STRIDED_VEC3]], <i32 3, i32 3, i32 3, i32 3>
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [1024 x %struct.ST3], [1024 x %struct.ST3]* @S, i64 0, i64 [[INDEX]], i32 2
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP5]], i64 -2
; CHECK-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <12 x i32>*
; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i32> [[TMP8]], <8 x i32> [[TMP9]], <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
; CHECK-NEXT: store <12 x i32> [[INTERLEAVED_VEC]], <12 x i32>* [[TMP7]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK-NEXT: br i1 undef, label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
; CHECK-NEXT: ret void
for.body: ; preds = %for.body, %entry
%indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
%ptr.016 = phi i32* [ getelementptr inbounds ([3072 x i32], [3072 x i32]* @A, i64 0, i64 0), %entry ], [ %incdec.ptr2, %for.body ]
%incdec.ptr = getelementptr inbounds i32, i32* %ptr.016, i64 1
%tmp = load i32, i32* %ptr.016, align 4
%incdec.ptr1 = getelementptr inbounds i32, i32* %ptr.016, i64 2
%tmp1 = load i32, i32* %incdec.ptr, align 4
%incdec.ptr2 = getelementptr inbounds i32, i32* %ptr.016, i64 3
%tmp2 = load i32, i32* %incdec.ptr1, align 4
%add = add nsw i32 %tmp, 1
%x = getelementptr inbounds [1024 x %struct.ST3], [1024 x %struct.ST3]* @S, i64 0, i64 %indvars.iv, i32 0
store i32 %add, i32* %x, align 4
%add3 = add nsw i32 %tmp1, 2
%y = getelementptr inbounds [1024 x %struct.ST3], [1024 x %struct.ST3]* @S, i64 0, i64 %indvars.iv, i32 1
store i32 %add3, i32* %y, align 4
%add6 = add nsw i32 %tmp2, 3
%z = getelementptr inbounds [1024 x %struct.ST3], [1024 x %struct.ST3]* @S, i64 0, i64 %indvars.iv, i32 2
store i32 %add6, i32* %z, align 4
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%exitcond = icmp eq i64 %indvars.iv.next, 1024
br i1 %exitcond, label %for.end, label %for.body
for.end: ; preds = %for.body
; Check vectorization on an interleaved load group of factor 4.
; int test_struct_load4(struct ST4 *S) {
; for (int i = 0; i < 1024; i++) {
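; (The loop body is elided above; a sketch inferred from the scalar IR below,
;  with the accumulator name r assumed:)
;     r += S[i].x - S[i].y + S[i].z - S[i].w;
;   }
;   return r;
; }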
%struct.ST4 = type { i32, i32, i32, i32 }
define i32 @test_struct_load4(%struct.ST4* nocapture readonly %S) {
; CHECK-LABEL: @test_struct_load4(
; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds [[STRUCT_ST4:%.*]], %struct.ST4* [[S:%.*]], i64 [[INDEX]], i32 0
; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <16 x i32>*
; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <16 x i32>, <16 x i32>* [[TMP1]], align 4
; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> poison, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> poison, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
; CHECK-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> poison, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
; CHECK-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> poison, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
; CHECK-NEXT: [[TMP2:%.*]] = add <4 x i32> [[STRIDED_VEC]], [[VEC_PHI]]
; CHECK-NEXT: [[TMP3:%.*]] = add <4 x i32> [[TMP2]], [[STRIDED_VEC2]]
; CHECK-NEXT: [[TMP4:%.*]] = add <4 x i32> [[STRIDED_VEC1]], [[STRIDED_VEC3]]
; CHECK-NEXT: [[TMP5]] = sub <4 x i32> [[TMP3]], [[TMP4]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP5]])
; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK-NEXT: br i1 undef, label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
; CHECK-NEXT: [[SUB8_LCSSA:%.*]] = phi i32 [ undef, [[FOR_BODY]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ]
; CHECK-NEXT: ret i32 [[SUB8_LCSSA]]
for.body: ; preds = %for.body, %entry
%indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
%r.022 = phi i32 [ 0, %entry ], [ %sub8, %for.body ]
%x = getelementptr inbounds %struct.ST4, %struct.ST4* %S, i64 %indvars.iv, i32 0
%tmp = load i32, i32* %x, align 4
%add = add nsw i32 %tmp, %r.022
%y = getelementptr inbounds %struct.ST4, %struct.ST4* %S, i64 %indvars.iv, i32 1
%tmp1 = load i32, i32* %y, align 4
%sub = sub i32 %add, %tmp1
%z = getelementptr inbounds %struct.ST4, %struct.ST4* %S, i64 %indvars.iv, i32 2
%tmp2 = load i32, i32* %z, align 4
%add5 = add nsw i32 %sub, %tmp2
%w = getelementptr inbounds %struct.ST4, %struct.ST4* %S, i64 %indvars.iv, i32 3
%tmp3 = load i32, i32* %w, align 4
%sub8 = sub i32 %add5, %tmp3
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%exitcond = icmp eq i64 %indvars.iv.next, 1024
br i1 %exitcond, label %for.end, label %for.body
for.end: ; preds = %for.body
; Check vectorization on an interleaved store group of factor 4.
; void test_struct_store4(int *A, struct ST4 *B) {
; for (int i = 0; i < 1024; i++) {
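; (The loop body is elided above; a sketch inferred from the scalar IR below,
;  where A is read through an incrementing pointer:)
;     int t = *A++;
;     B[i].x = t + 1;
;     B[i].y = t * 2;
;     B[i].z = t + 3;
;     B[i].w = t + 4;
;   }
; }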
define void @test_struct_store4(i32* noalias nocapture readonly %A, %struct.ST4* noalias nocapture %B) {
; CHECK-LABEL: @test_struct_store4(
; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i32, i32* [[A:%.*]], i64 [[INDEX]]
; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[NEXT_GEP]] to <4 x i32>*
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
; CHECK-NEXT: [[TMP1:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], <i32 1, i32 1, i32 1, i32 1>
; CHECK-NEXT: [[TMP2:%.*]] = shl nsw <4 x i32> [[WIDE_LOAD]], <i32 1, i32 1, i32 1, i32 1>
; CHECK-NEXT: [[TMP3:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], <i32 3, i32 3, i32 3, i32 3>
; CHECK-NEXT: [[TMP4:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], <i32 4, i32 4, i32 4, i32 4>
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[STRUCT_ST4:%.*]], %struct.ST4* [[B:%.*]], i64 [[INDEX]], i32 3
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP5]], i64 -3
; CHECK-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <16 x i32>*
; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i32> [[TMP8]], <8 x i32> [[TMP9]], <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
; CHECK-NEXT: store <16 x i32> [[INTERLEAVED_VEC]], <16 x i32>* [[TMP7]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.cond.cleanup:
; CHECK-NEXT: ret void
; CHECK-NEXT: br i1 undef, label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
for.cond.cleanup: ; preds = %for.body
for.body: ; preds = %for.body, %entry
%indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
%ptr.024 = phi i32* [ %A, %entry ], [ %incdec.ptr, %for.body ]
%incdec.ptr = getelementptr inbounds i32, i32* %ptr.024, i64 1
%tmp = load i32, i32* %ptr.024, align 4
%add = add nsw i32 %tmp, 1
%x = getelementptr inbounds %struct.ST4, %struct.ST4* %B, i64 %indvars.iv, i32 0
store i32 %add, i32* %x, align 4
%mul = shl nsw i32 %tmp, 1
%y = getelementptr inbounds %struct.ST4, %struct.ST4* %B, i64 %indvars.iv, i32 1
store i32 %mul, i32* %y, align 4
%add3 = add nsw i32 %tmp, 3
%z = getelementptr inbounds %struct.ST4, %struct.ST4* %B, i64 %indvars.iv, i32 2
store i32 %add3, i32* %z, align 4
%add6 = add nsw i32 %tmp, 4
%w = getelementptr inbounds %struct.ST4, %struct.ST4* %B, i64 %indvars.iv, i32 3
store i32 %add6, i32* %w, align 4
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%exitcond = icmp eq i64 %indvars.iv.next, 1024
br i1 %exitcond, label %for.cond.cleanup, label %for.body
; Check vectorization on a reverse interleaved load group of factor 2 and
; a reverse interleaved store group of factor 2.
; void test_reversed_load2_store2(struct ST2 *A, struct ST2 *B) {
; for (int i = 1023; i >= 0; i--) {
; int a = A[i].x + i; // interleaved load of index 0
; int b = A[i].y - i; // interleaved load of index 1
; B[i].x = a; // interleaved store of index 0
; B[i].y = b; // interleaved store of index 1
%struct.ST2 = type { i32, i32 }
define void @test_reversed_load2_store2(%struct.ST2* noalias nocapture readonly %A, %struct.ST2* noalias nocapture %B) {
; CHECK-LABEL: @test_reversed_load2_store2(
; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_IND3:%.*]] = phi <4 x i32> [ <i32 1023, i32 1022, i32 1021, i32 1020>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT4:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = sub i64 1023, [[INDEX]]
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds [[STRUCT_ST2:%.*]], %struct.ST2* [[A:%.*]], i64 [[OFFSET_IDX]], i32 0
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 -6
; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32* [[TMP1]] to <8 x i32>*
; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <8 x i32>, <8 x i32>* [[TMP2]], align 4
; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
; CHECK-NEXT: [[REVERSE:%.*]] = shufflevector <4 x i32> [[STRIDED_VEC]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
; CHECK-NEXT: [[REVERSE2:%.*]] = shufflevector <4 x i32> [[STRIDED_VEC1]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; CHECK-NEXT: [[TMP3:%.*]] = add nsw <4 x i32> [[REVERSE]], [[VEC_IND3]]
; CHECK-NEXT: [[TMP4:%.*]] = sub nsw <4 x i32> [[REVERSE2]], [[VEC_IND3]]
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[STRUCT_ST2]], %struct.ST2* [[B:%.*]], i64 [[OFFSET_IDX]], i32 1
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP5]], i64 -7
; CHECK-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <8 x i32>*
; CHECK-NEXT: [[REVERSE5:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; CHECK-NEXT: [[REVERSE6:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i32> [[REVERSE5]], <4 x i32> [[REVERSE6]], <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
; CHECK-NEXT: store <8 x i32> [[INTERLEAVED_VEC]], <8 x i32>* [[TMP7]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[VEC_IND_NEXT4]] = add <4 x i32> [[VEC_IND3]], <i32 -4, i32 -4, i32 -4, i32 -4>
; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.cond.cleanup:
; CHECK-NEXT: ret void
; CHECK-NEXT: br i1 undef, label [[FOR_BODY]], label [[FOR_COND_CLEANUP]], !llvm.loop [[LOOP11:![0-9]+]]
for.cond.cleanup: ; preds = %for.body
for.body: ; preds = %for.body, %entry
%indvars.iv = phi i64 [ 1023, %entry ], [ %indvars.iv.next, %for.body ]
%x = getelementptr inbounds %struct.ST2, %struct.ST2* %A, i64 %indvars.iv, i32 0
%tmp = load i32, i32* %x, align 4
%tmp1 = trunc i64 %indvars.iv to i32
%add = add nsw i32 %tmp, %tmp1
%y = getelementptr inbounds %struct.ST2, %struct.ST2* %A, i64 %indvars.iv, i32 1
%tmp2 = load i32, i32* %y, align 4
%sub = sub nsw i32 %tmp2, %tmp1
%x5 = getelementptr inbounds %struct.ST2, %struct.ST2* %B, i64 %indvars.iv, i32 0
store i32 %add, i32* %x5, align 4
%y8 = getelementptr inbounds %struct.ST2, %struct.ST2* %B, i64 %indvars.iv, i32 1
store i32 %sub, i32* %y8, align 4
%indvars.iv.next = add nsw i64 %indvars.iv, -1
%cmp = icmp sgt i64 %indvars.iv, 0
br i1 %cmp, label %for.body, label %for.cond.cleanup
; Check vectorization on an interleaved load group of factor 2 with 1 gap
; (missing the load of odd elements). Because the vectorized loop would
; speculatively access memory out-of-bounds, we must execute at least one
; iteration of the scalar loop.
; void even_load_static_tc(int *A, int *B) {
; for (unsigned i = 0; i < 1024; i+=2)
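; (The loop body is elided above; inferred from the scalar IR below:)
;     B[i / 2] = A[i] * 2;
; }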
define void @even_load_static_tc(i32* noalias nocapture readonly %A, i32* noalias nocapture %B) {
; CHECK-LABEL: @even_load_static_tc(
; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = shl i64 [[INDEX]], 1
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[OFFSET_IDX]]
; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <8 x i32>*
; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <8 x i32>, <8 x i32>* [[TMP1]], align 4
; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
; CHECK-NEXT: [[TMP2:%.*]] = shl nsw <4 x i32> [[STRIDED_VEC]], <i32 1, i32 1, i32 1, i32 1>
; CHECK-NEXT: [[TMP3:%.*]] = and i64 [[INDEX]], 9223372036854775804
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[TMP3]]
; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32* [[TMP4]] to <4 x i32>*
; CHECK-NEXT: store <4 x i32> [[TMP2]], <4 x i32>* [[TMP5]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 508
; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: br label [[SCALAR_PH]]
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1016, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.cond.cleanup:
; CHECK-NEXT: ret void
; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV]]
; CHECK-NEXT: [[TMP:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
; CHECK-NEXT: [[MUL:%.*]] = shl nsw i32 [[TMP]], 1
; CHECK-NEXT: [[TMP1:%.*]] = lshr exact i64 [[INDVARS_IV]], 1
; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[TMP1]]
; CHECK-NEXT: store i32 [[MUL]], i32* [[ARRAYIDX2]], align 4
; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 2
; CHECK-NEXT: [[CMP:%.*]] = icmp ult i64 [[INDVARS_IV]], 1022
; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP:%.*]], !llvm.loop [[LOOP13:![0-9]+]]
for.cond.cleanup: ; preds = %for.body
for.body: ; preds = %for.body, %entry
%indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
%arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
%tmp = load i32, i32* %arrayidx, align 4
%mul = shl nsw i32 %tmp, 1
%tmp1 = lshr exact i64 %indvars.iv, 1
%arrayidx2 = getelementptr inbounds i32, i32* %B, i64 %tmp1
store i32 %mul, i32* %arrayidx2, align 4
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 2
%cmp = icmp ult i64 %indvars.iv.next, 1024
br i1 %cmp, label %for.body, label %for.cond.cleanup
; Check vectorization on an interleaved load group of factor 2 with 1 gap
; (missing the load of odd elements). Because the vectorized loop would
; speculatively access memory out-of-bounds, we must execute at least one
; iteration of the scalar loop.
; void even_load_dynamic_tc(int *A, int *B, unsigned N) {
; for (unsigned i = 0; i < N; i+=2)
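; (The loop body is elided above; inferred from the scalar IR below:)
;     B[i / 2] = A[i] * 2;
; }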
define void @even_load_dynamic_tc(i32* noalias nocapture readonly %A, i32* noalias nocapture %B, i64 %N) {
; CHECK-LABEL: @even_load_dynamic_tc(
; CHECK-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[N:%.*]], i64 2)
; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[UMAX]], -1
; CHECK-NEXT: [[TMP1:%.*]] = lshr i64 [[TMP0]], 1
; CHECK-NEXT: [[TMP2:%.*]] = add nuw i64 [[TMP1]], 1
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 8
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK-NEXT: [[N_MOD_VF:%.*]] = and i64 [[TMP2]], 3
; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
; CHECK-NEXT: [[TMP4:%.*]] = select i1 [[TMP3]], i64 4, i64 [[N_MOD_VF]]
; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP2]], [[TMP4]]
; CHECK-NEXT: [[IND_END:%.*]] = shl i64 [[N_VEC]], 1
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = shl i64 [[INDEX]], 1
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[OFFSET_IDX]]
; CHECK-NEXT: [[TMP6:%.*]] = bitcast i32* [[TMP5]] to <8 x i32>*
; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <8 x i32>, <8 x i32>* [[TMP6]], align 4
; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
; CHECK-NEXT: [[TMP7:%.*]] = shl nsw <4 x i32> [[STRIDED_VEC]], <i32 1, i32 1, i32 1, i32 1>
; CHECK-NEXT: [[TMP8:%.*]] = and i64 [[INDEX]], 9223372036854775804
; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[TMP8]]
; CHECK-NEXT: [[TMP10:%.*]] = bitcast i32* [[TMP9]] to <4 x i32>*
; CHECK-NEXT: store <4 x i32> [[TMP7]], <4 x i32>* [[TMP10]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: br label [[SCALAR_PH]]
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.cond.cleanup:
; CHECK-NEXT: ret void
; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV]]
; CHECK-NEXT: [[TMP:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
; CHECK-NEXT: [[MUL:%.*]] = shl nsw i32 [[TMP]], 1
; CHECK-NEXT: [[TMP1:%.*]] = lshr exact i64 [[INDVARS_IV]], 1
; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[TMP1]]
; CHECK-NEXT: store i32 [[MUL]], i32* [[ARRAYIDX2]], align 4
; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 2
; CHECK-NEXT: [[CMP:%.*]] = icmp ult i64 [[INDVARS_IV_NEXT]], [[N]]
; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP:%.*]], !llvm.loop [[LOOP15:![0-9]+]]
for.cond.cleanup: ; preds = %for.body
for.body: ; preds = %for.body, %entry
%indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
%arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
%tmp = load i32, i32* %arrayidx, align 4
%mul = shl nsw i32 %tmp, 1
%tmp1 = lshr exact i64 %indvars.iv, 1
%arrayidx2 = getelementptr inbounds i32, i32* %B, i64 %tmp1
store i32 %mul, i32* %arrayidx2, align 4
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 2
%cmp = icmp ult i64 %indvars.iv.next, %N
br i1 %cmp, label %for.body, label %for.cond.cleanup
; Check vectorization on a reverse interleaved load group of factor 2 with 1
; gap and a reverse interleaved store group of factor 2. The interleaved load
; group should be removed since it has a gap and is reverse.
; void load_gap_reverse(struct pair *P1, struct pair *P2, int X) {
; for (int i = 1023; i >= 0; i--) {
; int b = A[i].y - i;
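; (The remaining statements are elided above; a sketch inferred from the scalar
;  IR below, with the local names a and b assumed:)
;     long a = X + i;
;     long b = P2[i].y - i;
;     P1[i].x = a;
;     P2[i].y = b;
;   }
; }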
%pair = type { i64, i64 }
define void @load_gap_reverse(%pair* noalias nocapture readonly %P1, %pair* noalias nocapture readonly %P2, i64 %X) {
; CHECK-LABEL: @load_gap_reverse(
; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[X:%.*]], i32 0
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 1023, i64 1022, i64 1021, i64 1020>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = sub i64 1023, [[INDEX]]
; CHECK-NEXT: [[TMP0:%.*]] = sub i64 1022, [[INDEX]]
; CHECK-NEXT: [[TMP1:%.*]] = sub i64 1021, [[INDEX]]
; CHECK-NEXT: [[TMP2:%.*]] = sub i64 1020, [[INDEX]]
; CHECK-NEXT: [[TMP3:%.*]] = add nsw <4 x i64> [[BROADCAST_SPLAT]], [[VEC_IND]]
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[PAIR:%.*]], %pair* [[P1:%.*]], i64 [[OFFSET_IDX]], i32 0
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[PAIR]], %pair* [[P1]], i64 [[TMP0]], i32 0
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[PAIR]], %pair* [[P1]], i64 [[TMP1]], i32 0
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[PAIR]], %pair* [[P1]], i64 [[TMP2]], i32 0
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[PAIR]], %pair* [[P2:%.*]], i64 [[OFFSET_IDX]], i32 1
; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[PAIR]], %pair* [[P2]], i64 [[TMP0]], i32 1
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[PAIR]], %pair* [[P2]], i64 [[TMP1]], i32 1
; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds [[PAIR]], %pair* [[P2]], i64 [[TMP2]], i32 1
; CHECK-NEXT: [[TMP12:%.*]] = load i64, i64* [[TMP8]], align 8
; CHECK-NEXT: [[TMP13:%.*]] = load i64, i64* [[TMP9]], align 8
; CHECK-NEXT: [[TMP14:%.*]] = load i64, i64* [[TMP10]], align 8
; CHECK-NEXT: [[TMP15:%.*]] = load i64, i64* [[TMP11]], align 8
; CHECK-NEXT: [[TMP16:%.*]] = insertelement <4 x i64> poison, i64 [[TMP12]], i32 0
; CHECK-NEXT: [[TMP17:%.*]] = insertelement <4 x i64> [[TMP16]], i64 [[TMP13]], i32 1
; CHECK-NEXT: [[TMP18:%.*]] = insertelement <4 x i64> [[TMP17]], i64 [[TMP14]], i32 2
; CHECK-NEXT: [[TMP19:%.*]] = insertelement <4 x i64> [[TMP18]], i64 [[TMP15]], i32 3
; CHECK-NEXT: [[TMP20:%.*]] = sub nsw <4 x i64> [[TMP19]], [[VEC_IND]]
; CHECK-NEXT: [[TMP21:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0
; CHECK-NEXT: store i64 [[TMP21]], i64* [[TMP4]], align 8
; CHECK-NEXT: [[TMP22:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1
; CHECK-NEXT: store i64 [[TMP22]], i64* [[TMP5]], align 8
; CHECK-NEXT: [[TMP23:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2
; CHECK-NEXT: store i64 [[TMP23]], i64* [[TMP6]], align 8
; CHECK-NEXT: [[TMP24:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3
; CHECK-NEXT: store i64 [[TMP24]], i64* [[TMP7]], align 8
; CHECK-NEXT: [[TMP25:%.*]] = extractelement <4 x i64> [[TMP20]], i32 0
; CHECK-NEXT: store i64 [[TMP25]], i64* [[TMP8]], align 8
; CHECK-NEXT: [[TMP26:%.*]] = extractelement <4 x i64> [[TMP20]], i32 1
; CHECK-NEXT: store i64 [[TMP26]], i64* [[TMP9]], align 8
; CHECK-NEXT: [[TMP27:%.*]] = extractelement <4 x i64> [[TMP20]], i32 2
; CHECK-NEXT: store i64 [[TMP27]], i64* [[TMP10]], align 8
; CHECK-NEXT: [[TMP28:%.*]] = extractelement <4 x i64> [[TMP20]], i32 3
; CHECK-NEXT: store i64 [[TMP28]], i64* [[TMP11]], align 8
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], <i64 -4, i64 -4, i64 -4, i64 -4>
; CHECK-NEXT: [[TMP29:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
; CHECK-NEXT: br i1 [[TMP29]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: br i1 true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK-NEXT: br i1 undef, label [[FOR_BODY]], label [[FOR_EXIT]], !llvm.loop [[LOOP17:![0-9]+]]
; CHECK-NEXT: ret void
%i = phi i64 [ 1023, %entry ], [ %i.next, %for.body ]
%0 = add nsw i64 %X, %i
%1 = getelementptr inbounds %pair, %pair* %P1, i64 %i, i32 0
%2 = getelementptr inbounds %pair, %pair* %P2, i64 %i, i32 1
%3 = load i64, i64* %2, align 8
%4 = sub nsw i64 %3, %i
store i64 %0, i64* %1, align 8
store i64 %4, i64* %2, align 8
%i.next = add nsw i64 %i, -1
%cond = icmp sgt i64 %i, 0
br i1 %cond, label %for.body, label %for.exit
; Check vectorization on interleaved access groups identified from mixed
; loads and stores.
; void mixed_load2_store2(int *A, int *B) {
; for (unsigned i = 0; i < 1024; i+=2) {
; B[i] = A[i] * A[i+1];
; B[i+1] = A[i] + A[i+1];
define void @mixed_load2_store2(i32* noalias nocapture readonly %A, i32* noalias nocapture %B) {
; CHECK-LABEL: @mixed_load2_store2(
; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = shl i64 [[INDEX]], 1
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[OFFSET_IDX]]
; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <8 x i32>*
; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <8 x i32>, <8 x i32>* [[TMP1]], align 4
; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
; CHECK-NEXT: [[TMP2:%.*]] = mul nsw <4 x i32> [[STRIDED_VEC1]], [[STRIDED_VEC]]
; CHECK-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
; CHECK-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
; CHECK-NEXT: [[TMP3:%.*]] = add nsw <4 x i32> [[STRIDED_VEC4]], [[STRIDED_VEC3]]
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[OFFSET_IDX]]
; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32* [[TMP4]] to <8 x i32>*
; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
; CHECK-NEXT: store <8 x i32> [[INTERLEAVED_VEC]], <8 x i32>* [[TMP5]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 512
; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.cond.cleanup:
; CHECK-NEXT: ret void
; CHECK-NEXT: br i1 undef, label [[FOR_BODY]], label [[FOR_COND_CLEANUP]], !llvm.loop [[LOOP19:![0-9]+]]
for.cond.cleanup: ; preds = %for.body
for.body: ; preds = %for.body, %entry
%indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
%arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
%tmp = load i32, i32* %arrayidx, align 4
%tmp1 = or i64 %indvars.iv, 1
%arrayidx2 = getelementptr inbounds i32, i32* %A, i64 %tmp1
%tmp2 = load i32, i32* %arrayidx2, align 4
%mul = mul nsw i32 %tmp2, %tmp
%arrayidx4 = getelementptr inbounds i32, i32* %B, i64 %indvars.iv
store i32 %mul, i32* %arrayidx4, align 4
%tmp3 = load i32, i32* %arrayidx, align 4
%tmp4 = load i32, i32* %arrayidx2, align 4
%add10 = add nsw i32 %tmp4, %tmp3
%arrayidx13 = getelementptr inbounds i32, i32* %B, i64 %tmp1
store i32 %add10, i32* %arrayidx13, align 4
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 2
%cmp = icmp ult i64 %indvars.iv.next, 1024
br i1 %cmp, label %for.body, label %for.cond.cleanup
; Check vectorization on interleaved access groups identified from mixed
; loads and stores.
; void mixed_load3_store3(int *A) {
; for (unsigned i = 0; i < 1024; i++) {
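; (The loop body is elided above; a sketch inferred from the scalar IR below,
;  where A is advanced by three elements per iteration:)
;     *A++ += i;
;     *A++ += i;
;     *A++ += i;
;   }
; }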
define void @mixed_load3_store3(i32* nocapture %A) {
; CHECK-LABEL: @mixed_load3_store3(
; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP0:%.*]] = mul i64 [[INDEX]], 3
; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i32, i32* [[A:%.*]], i64 [[TMP0]]
; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[NEXT_GEP]] to <12 x i32>*
; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <12 x i32>, <12 x i32>* [[TMP1]], align 4
; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <12 x i32> [[WIDE_VEC]], <12 x i32> poison, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
; CHECK-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <12 x i32> [[WIDE_VEC]], <12 x i32> poison, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
; CHECK-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <12 x i32> [[WIDE_VEC]], <12 x i32> poison, <4 x i32> <i32 2, i32 5, i32 8, i32 11>
; CHECK-NEXT: [[TMP2:%.*]] = add <4 x i32> [[STRIDED_VEC]], [[VEC_IND]]
; CHECK-NEXT: [[TMP3:%.*]] = add <4 x i32> [[STRIDED_VEC2]], [[VEC_IND]]
; CHECK-NEXT: [[TMP4:%.*]] = add <4 x i32> [[STRIDED_VEC3]], [[VEC_IND]]
; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32* [[NEXT_GEP]] to <12 x i32>*
; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i32> [[TMP6]], <8 x i32> [[TMP7]], <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
; CHECK-NEXT: store <12 x i32> [[INTERLEAVED_VEC]], <12 x i32>* [[TMP5]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.cond.cleanup:
; CHECK-NEXT: ret void
; CHECK-NEXT: br i1 undef, label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]]
for.cond.cleanup: ; preds = %for.body
for.body: ; preds = %for.body, %entry
%i.013 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
%A.addr.012 = phi i32* [ %A, %entry ], [ %incdec.ptr3, %for.body ]
%incdec.ptr = getelementptr inbounds i32, i32* %A.addr.012, i64 1
%tmp = load i32, i32* %A.addr.012, align 4
%add = add i32 %tmp, %i.013
store i32 %add, i32* %A.addr.012, align 4
%incdec.ptr1 = getelementptr inbounds i32, i32* %A.addr.012, i64 2
%tmp1 = load i32, i32* %incdec.ptr, align 4
%add2 = add i32 %tmp1, %i.013
store i32 %add2, i32* %incdec.ptr, align 4
%incdec.ptr3 = getelementptr inbounds i32, i32* %A.addr.012, i64 3
%tmp2 = load i32, i32* %incdec.ptr1, align 4
%add4 = add i32 %tmp2, %i.013
store i32 %add4, i32* %incdec.ptr1, align 4
%inc = add nuw nsw i32 %i.013, 1
%exitcond = icmp eq i32 %inc, 1024
br i1 %exitcond, label %for.cond.cleanup, label %for.body
; Check vectorization on interleaved access groups with members having different
; types (here i32 and float).
; void int_float_struct(struct IntFloat *A) {
; for (unsigned i = 0; i < 1024; i++) {
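; (The loop body is elided above; a sketch inferred from the scalar IR below,
;  with the accumulator names SumA and SumB assumed:)
;     SumA += A[i].a;
;     SumB += A[i].b;
;   }
;   SA = SumA;
;   SB = SumB;
; }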
%struct.IntFloat = type { i32, float }
@SA = common global i32 0, align 4
@SB = common global float 0.000000e+00, align 4
define void @int_float_struct(%struct.IntFloat* nocapture readonly %A) #0 {
; CHECK-LABEL: @int_float_struct(
; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x float> [ <float undef, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ <i32 undef, i32 0, i32 0, i32 0>, [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds [[STRUCT_INTFLOAT:%.*]], %struct.IntFloat* [[A:%.*]], i64 [[INDEX]], i32 0
; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <8 x i32>*
; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <8 x i32>, <8 x i32>* [[TMP1]], align 4
; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
; CHECK-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
; CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[STRIDED_VEC2]] to <4 x float>
; CHECK-NEXT: [[TMP3]] = add <4 x i32> [[STRIDED_VEC]], [[VEC_PHI1]]
; CHECK-NEXT: [[TMP4]] = fadd fast <4 x float> [[VEC_PHI]], [[TMP2]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP3]])
; CHECK-NEXT: [[TMP7:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP4]])
; CHECK-NEXT: br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.cond.cleanup:
; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ undef, [[FOR_BODY]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ]
; CHECK-NEXT: [[ADD3_LCSSA:%.*]] = phi float [ undef, [[FOR_BODY]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ]
; CHECK-NEXT: store i32 [[ADD_LCSSA]], i32* @SA, align 4
; CHECK-NEXT: store float [[ADD3_LCSSA]], float* @SB, align 4
; CHECK-NEXT: ret void
; CHECK-NEXT: br i1 undef, label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]]
for.cond.cleanup: ; preds = %for.body
store i32 %add, i32* @SA, align 4
store float %add3, float* @SB, align 4
for.body: ; preds = %for.body, %entry
%indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
%SumB.014 = phi float [ undef, %entry ], [ %add3, %for.body ]
%SumA.013 = phi i32 [ undef, %entry ], [ %add, %for.body ]
%a = getelementptr inbounds %struct.IntFloat, %struct.IntFloat* %A, i64 %indvars.iv, i32 0
%tmp = load i32, i32* %a, align 4
%add = add nsw i32 %tmp, %SumA.013
%b = getelementptr inbounds %struct.IntFloat, %struct.IntFloat* %A, i64 %indvars.iv, i32 1
%tmp1 = load float, float* %b, align 4
%add3 = fadd fast float %SumB.014, %tmp1
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%exitcond = icmp eq i64 %indvars.iv.next, 1024
br i1 %exitcond, label %for.cond.cleanup, label %for.body
; Check vectorization of interleaved access groups in the presence of
; dependences (PR27626). The following tests check that we don't reorder
; dependent loads and stores when generating code for interleaved access
; groups. Stores should be scalarized because the required code motion would
; break dependences, and the remaining interleaved load groups should have
; gaps.
; PR27626_0: Ensure a strided store is not moved after a dependent (zero
; distance) strided load.
; void PR27626_0(struct pair *p, int z, int n) {
; for (int i = 0; i < n; i++) {
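; (The loop body is elided above; inferred from the scalar IR below:)
;     p[i].x = z;
;     p[i].y = p[i].x;
;   }
; }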
%pair.i32 = type { i32, i32 }
define void @PR27626_0(%pair.i32 *%p, i32 %z, i64 %n) {
; CHECK-LABEL: @PR27626_0(
; CHECK-NEXT: [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N:%.*]], i64 1)
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SMAX]], 5
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK-NEXT: [[N_MOD_VF:%.*]] = and i64 [[SMAX]], 3
; CHECK-NEXT: [[TMP0:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
; CHECK-NEXT: [[TMP1:%.*]] = select i1 [[TMP0]], i64 4, i64 [[N_MOD_VF]]
; CHECK-NEXT: [[N_VEC:%.*]] = sub nsw i64 [[SMAX]], [[TMP1]]
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP2:%.*]] = or i64 [[INDEX]], 1
; CHECK-NEXT: [[TMP3:%.*]] = or i64 [[INDEX]], 2
; CHECK-NEXT: [[TMP4:%.*]] = or i64 [[INDEX]], 3
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[PAIR_I32:%.*]], %pair.i32* [[P:%.*]], i64 [[INDEX]], i32 0
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[TMP2]], i32 0
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[TMP3]], i32 0
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[TMP4]], i32 0
; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[INDEX]], i32 1
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[TMP2]], i32 1
; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[TMP3]], i32 1
; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[TMP4]], i32 1
; CHECK-NEXT: store i32 [[Z:%.*]], i32* [[TMP5]], align 4
; CHECK-NEXT: store i32 [[Z]], i32* [[TMP6]], align 4
; CHECK-NEXT: store i32 [[Z]], i32* [[TMP7]], align 4
; CHECK-NEXT: store i32 [[Z]], i32* [[TMP8]], align 4
; CHECK-NEXT: [[TMP13:%.*]] = bitcast i32* [[TMP5]] to <8 x i32>*
; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <8 x i32>, <8 x i32>* [[TMP13]], align 4
; CHECK-NEXT: [[TMP14:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i32 0
; CHECK-NEXT: store i32 [[TMP14]], i32* [[TMP9]], align 4
; CHECK-NEXT: [[TMP15:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i32 2
; CHECK-NEXT: store i32 [[TMP15]], i32* [[TMP10]], align 4
; CHECK-NEXT: [[TMP16:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i32 4
; CHECK-NEXT: store i32 [[TMP16]], i32* [[TMP11]], align 4
; CHECK-NEXT: [[TMP17:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i32 6
; CHECK-NEXT: store i32 [[TMP17]], i32* [[TMP12]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: br label [[SCALAR_PH]]
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
; CHECK-NEXT: [[P_I_X:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[I]], i32 0
; CHECK-NEXT: [[P_I_Y:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[I]], i32 1
; CHECK-NEXT: store i32 [[Z]], i32* [[P_I_X]], align 4
; CHECK-NEXT: store i32 [[Z]], i32* [[P_I_Y]], align 4
; CHECK-NEXT: [[I_NEXT]] = add nuw nsw i64 [[I]], 1
; CHECK-NEXT: [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]]
; CHECK-NEXT: br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END:%.*]], !llvm.loop [[LOOP25:![0-9]+]]
; CHECK-NEXT: ret void
%i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
%p_i.x = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i, i32 0
%p_i.y = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i, i32 1
store i32 %z, i32* %p_i.x, align 4
%0 = load i32, i32* %p_i.x, align 4
store i32 %0, i32 *%p_i.y, align 4
%i.next = add nuw nsw i64 %i, 1
%cond = icmp slt i64 %i.next, %n
br i1 %cond, label %for.body, label %for.end
; PR27626_1: Ensure a strided load is not moved before a dependent (zero
; distance) strided store.
; void PR27626_1(struct pair *p, int n) {
; for (int i = 0; i < n; i++) {
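; (The loop body is elided above; a sketch inferred from the scalar IR below,
;  with the accumulator name s assumed:)
;     p[i].y = p[i].x;
;     s += p[i].y;
;   }
;   return s;
; }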
define i32 @PR27626_1(%pair.i32 *%p, i64 %n) {
; CHECK-LABEL: @PR27626_1(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N:%.*]], i64 1)
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SMAX]], 5
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK-NEXT: [[N_MOD_VF:%.*]] = and i64 [[SMAX]], 3
; CHECK-NEXT: [[TMP0:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
; CHECK-NEXT: [[TMP1:%.*]] = select i1 [[TMP0]], i64 4, i64 [[N_MOD_VF]]
; CHECK-NEXT: [[N_VEC:%.*]] = sub nsw i64 [[SMAX]], [[TMP1]]
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP16:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP2:%.*]] = or i64 [[INDEX]], 1
; CHECK-NEXT: [[TMP3:%.*]] = or i64 [[INDEX]], 2
; CHECK-NEXT: [[TMP4:%.*]] = or i64 [[INDEX]], 3
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[PAIR_I32:%.*]], %pair.i32* [[P:%.*]], i64 [[INDEX]], i32 0
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[INDEX]], i32 1
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[TMP2]], i32 1
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[TMP3]], i32 1
; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[TMP4]], i32 1
; CHECK-NEXT: [[TMP10:%.*]] = bitcast i32* [[TMP5]] to <8 x i32>*
; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <8 x i32>, <8 x i32>* [[TMP10]], align 4
; CHECK-NEXT: [[TMP11:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i32 0
; CHECK-NEXT: store i32 [[TMP11]], i32* [[TMP6]], align 4
; CHECK-NEXT: [[TMP12:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i32 2
; CHECK-NEXT: store i32 [[TMP12]], i32* [[TMP7]], align 4
; CHECK-NEXT: [[TMP13:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i32 4
; CHECK-NEXT: store i32 [[TMP13]], i32* [[TMP8]], align 4
; CHECK-NEXT: [[TMP14:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i32 6
; CHECK-NEXT: store i32 [[TMP14]], i32* [[TMP9]], align 4
; CHECK-NEXT: [[TMP15:%.*]] = bitcast i32* [[TMP6]] to <8 x i32>*
; CHECK-NEXT: [[WIDE_VEC1:%.*]] = load <8 x i32>, <8 x i32>* [[TMP15]], align 4
; CHECK-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <8 x i32> [[WIDE_VEC1]], <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
; CHECK-NEXT: [[TMP16]] = add <4 x i32> [[STRIDED_VEC2]], [[VEC_PHI]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[TMP18:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP16]])
; CHECK-NEXT: br label [[SCALAR_PH]]
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP18]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
; CHECK-NEXT: [[S:%.*]] = phi i32 [ [[TMP20:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
; CHECK-NEXT: [[P_I_X:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[I]], i32 0
; CHECK-NEXT: [[P_I_Y:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[I]], i32 1
; CHECK-NEXT: [[TMP19:%.*]] = load i32, i32* [[P_I_X]], align 4
; CHECK-NEXT: store i32 [[TMP19]], i32* [[P_I_Y]], align 4
; CHECK-NEXT: [[TMP20]] = add nsw i32 [[TMP19]], [[S]]
; CHECK-NEXT: [[I_NEXT]] = add nuw nsw i64 [[I]], 1
; CHECK-NEXT: [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]]
; CHECK-NEXT: br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END:%.*]], !llvm.loop [[LOOP27:![0-9]+]]
; CHECK-NEXT: ret i32 [[TMP20]]
%i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
%s = phi i32 [ %2, %for.body ], [ 0, %entry ]
%p_i.x = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i, i32 0
%p_i.y = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i, i32 1
%0 = load i32, i32* %p_i.x, align 4
store i32 %0, i32* %p_i.y, align 4
%1 = load i32, i32* %p_i.y, align 4
%2 = add nsw i32 %1, %s
%i.next = add nuw nsw i64 %i, 1
%cond = icmp slt i64 %i.next, %n
br i1 %cond, label %for.body, label %for.end
%3 = phi i32 [ %2, %for.body ]
1086 ; PR27626_2: Ensure a strided store is not moved after a dependent (negative
1087 ; distance) strided load.
1089 ; void PR27626_2(struct pair *p, int z, int n) {
1090 ; for (int i = 0; i < n; i++) {
1092 ; p[i].y = p[i - 1].x;
1097 define void @PR27626_2(%pair.i32 *%p, i64 %n, i32 %z) {
1098 ; CHECK-LABEL: @PR27626_2(
1099 ; CHECK-NEXT: entry:
1100 ; CHECK-NEXT: [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N:%.*]], i64 1)
1101 ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SMAX]], 5
1102 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
1104 ; CHECK-NEXT: [[N_MOD_VF:%.*]] = and i64 [[SMAX]], 3
1105 ; CHECK-NEXT: [[TMP0:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
1106 ; CHECK-NEXT: [[TMP1:%.*]] = select i1 [[TMP0]], i64 4, i64 [[N_MOD_VF]]
1107 ; CHECK-NEXT: [[N_VEC:%.*]] = sub nsw i64 [[SMAX]], [[TMP1]]
1108 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
1109 ; CHECK: vector.body:
1110 ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
1111 ; CHECK-NEXT: [[TMP2:%.*]] = or i64 [[INDEX]], 1
1112 ; CHECK-NEXT: [[TMP3:%.*]] = or i64 [[INDEX]], 2
1113 ; CHECK-NEXT: [[TMP4:%.*]] = or i64 [[INDEX]], 3
1114 ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[PAIR_I32:%.*]], %pair.i32* [[P:%.*]], i64 [[INDEX]], i32 0
1115 ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[TMP2]], i32 0
1116 ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[TMP3]], i32 0
1117 ; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[TMP4]], i32 0
1118 ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 -1, i32 0
1119 ; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[INDEX]], i32 1
1120 ; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[TMP2]], i32 1
1121 ; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[TMP3]], i32 1
1122 ; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[TMP4]], i32 1
1123 ; CHECK-NEXT: store i32 [[Z:%.*]], i32* [[TMP5]], align 4
1124 ; CHECK-NEXT: store i32 [[Z]], i32* [[TMP6]], align 4
1125 ; CHECK-NEXT: store i32 [[Z]], i32* [[TMP7]], align 4
1126 ; CHECK-NEXT: store i32 [[Z]], i32* [[TMP8]], align 4
1127 ; CHECK-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP9]] to <8 x i32>*
1128 ; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <8 x i32>, <8 x i32>* [[TMP14]], align 4
1129 ; CHECK-NEXT: [[TMP15:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i32 0
1130 ; CHECK-NEXT: store i32 [[TMP15]], i32* [[TMP10]], align 4
1131 ; CHECK-NEXT: [[TMP16:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i32 2
1132 ; CHECK-NEXT: store i32 [[TMP16]], i32* [[TMP11]], align 4
1133 ; CHECK-NEXT: [[TMP17:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i32 4
1134 ; CHECK-NEXT: store i32 [[TMP17]], i32* [[TMP12]], align 4
1135 ; CHECK-NEXT: [[TMP18:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i32 6
1136 ; CHECK-NEXT: store i32 [[TMP18]], i32* [[TMP13]], align 4
1137 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
1138 ; CHECK-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
1139 ; CHECK-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]]
1140 ; CHECK: middle.block:
1141 ; CHECK-NEXT: br label [[SCALAR_PH]]
1143 ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
1144 ; CHECK-NEXT: br label [[FOR_BODY:%.*]]
1146 ; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
1147 ; CHECK-NEXT: [[P_I_X:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[I]], i32 0
1148 ; CHECK-NEXT: [[P_I_MINUS_1_X:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 -1, i32 0
1149 ; CHECK-NEXT: [[P_I_Y:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[I]], i32 1
1150 ; CHECK-NEXT: store i32 [[Z]], i32* [[P_I_X]], align 4
1151 ; CHECK-NEXT: [[TMP20:%.*]] = load i32, i32* [[P_I_MINUS_1_X]], align 4
1152 ; CHECK-NEXT: store i32 [[TMP20]], i32* [[P_I_Y]], align 4
1153 ; CHECK-NEXT: [[I_NEXT]] = add nuw nsw i64 [[I]], 1
1154 ; CHECK-NEXT: [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]]
1155 ; CHECK-NEXT: br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END:%.*]], !llvm.loop [[LOOP29:![0-9]+]]
1157 ; CHECK-NEXT: ret void
1163 %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
1164 %i_minus_1 = add nuw nsw i64 %i, -1
1165 %p_i.x = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i, i32 0
1166 %p_i_minus_1.x = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i_minus_1, i32 0
1167 %p_i.y = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i, i32 1
1168 store i32 %z, i32* %p_i.x, align 4
1169 %0 = load i32, i32* %p_i_minus_1.x, align 4
1170 store i32 %0, i32* %p_i.y, align 4
1171 %i.next = add nuw nsw i64 %i, 1
1172 %cond = icmp slt i64 %i.next, %n
1173 br i1 %cond, label %for.body, label %for.end
1179 ; PR27626_3: Ensure a strided load is not moved before a dependent (negative
1180 ; distance) strided store.
1182 ; void PR27626_3(struct pair *p, int z, int n) {
1183 ; for (int i = 0; i < n; i++) {
1184 ; p[i + 1].y = p[i].x;
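;
; The snippet above is abbreviated; a self-contained C sketch reconstructed from
; the scalar IR below (same assumed struct layout; the _sketch name is
; illustrative only):
;
;   struct pair { int x, y; };
;   int PR27626_3_sketch(struct pair *p, int z, int n) {
;     // z mirrors the IR signature but is not used in the body shown
;     int s = 0;
;     for (int i = 0; i < n; i++) {
;       p[i + 1].y = p[i].x;  // strided store, one pair ahead
;       s += p[i].y;          // strided load of the element stored last iteration
;     }
;     return s;
;   }
;
; Hoisting the wide load that feeds s above the store to p[i + 1].y would read a
; stale p[i].y, so the interleaved load group has to stay below the store.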
1190 define i32 @PR27626_3(%pair.i32 *%p, i64 %n, i32 %z) {
1191 ; CHECK-LABEL: @PR27626_3(
1192 ; CHECK-NEXT: entry:
1193 ; CHECK-NEXT: [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N:%.*]], i64 1)
1194 ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SMAX]], 5
1195 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
1197 ; CHECK-NEXT: [[N_MOD_VF:%.*]] = and i64 [[SMAX]], 3
1198 ; CHECK-NEXT: [[TMP0:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
1199 ; CHECK-NEXT: [[TMP1:%.*]] = select i1 [[TMP0]], i64 4, i64 [[N_MOD_VF]]
1200 ; CHECK-NEXT: [[N_VEC:%.*]] = sub nsw i64 [[SMAX]], [[TMP1]]
1201 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
1202 ; CHECK: vector.body:
1203 ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
1204 ; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
1205 ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ]
1206 ; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw <4 x i64> [[VEC_IND]], <i64 1, i64 1, i64 1, i64 1>
1207 ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds [[PAIR_I32:%.*]], %pair.i32* [[P:%.*]], i64 [[INDEX]], i32 0
1208 ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[INDEX]], i32 1
1209 ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP2]], i32 0
1210 ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[TMP5]], i32 1
1211 ; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP2]], i32 1
1212 ; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[TMP7]], i32 1
1213 ; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x i64> [[TMP2]], i32 2
1214 ; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[TMP9]], i32 1
1215 ; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x i64> [[TMP2]], i32 3
1216 ; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[TMP11]], i32 1
1217 ; CHECK-NEXT: [[TMP13:%.*]] = bitcast i32* [[TMP3]] to <8 x i32>*
1218 ; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <8 x i32>, <8 x i32>* [[TMP13]], align 4
1219 ; CHECK-NEXT: [[TMP14:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i32 0
1220 ; CHECK-NEXT: store i32 [[TMP14]], i32* [[TMP6]], align 4
1221 ; CHECK-NEXT: [[TMP15:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i32 2
1222 ; CHECK-NEXT: store i32 [[TMP15]], i32* [[TMP8]], align 4
1223 ; CHECK-NEXT: [[TMP16:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i32 4
1224 ; CHECK-NEXT: store i32 [[TMP16]], i32* [[TMP10]], align 4
1225 ; CHECK-NEXT: [[TMP17:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i32 6
1226 ; CHECK-NEXT: store i32 [[TMP17]], i32* [[TMP12]], align 4
1227 ; CHECK-NEXT: [[TMP18:%.*]] = bitcast i32* [[TMP4]] to <8 x i32>*
1228 ; CHECK-NEXT: [[WIDE_VEC1:%.*]] = load <8 x i32>, <8 x i32>* [[TMP18]], align 4
1229 ; CHECK-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <8 x i32> [[WIDE_VEC1]], <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
1230 ; CHECK-NEXT: [[TMP19]] = add <4 x i32> [[STRIDED_VEC2]], [[VEC_PHI]]
1231 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
1232 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], <i64 4, i64 4, i64 4, i64 4>
1233 ; CHECK-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
1234 ; CHECK-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP30:![0-9]+]]
1235 ; CHECK: middle.block:
1236 ; CHECK-NEXT: [[TMP21:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP19]])
1237 ; CHECK-NEXT: br label [[SCALAR_PH]]
1239 ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
1240 ; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP21]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
1241 ; CHECK-NEXT: br label [[FOR_BODY:%.*]]
1243 ; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
1244 ; CHECK-NEXT: [[S:%.*]] = phi i32 [ [[TMP24:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
1245 ; CHECK-NEXT: [[I_PLUS_1:%.*]] = add nuw nsw i64 [[I]], 1
1246 ; CHECK-NEXT: [[P_I_X:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[I]], i32 0
1247 ; CHECK-NEXT: [[P_I_Y:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[I]], i32 1
1248 ; CHECK-NEXT: [[P_I_PLUS_1_Y:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[I_PLUS_1]], i32 1
1249 ; CHECK-NEXT: [[TMP22:%.*]] = load i32, i32* [[P_I_X]], align 4
1250 ; CHECK-NEXT: store i32 [[TMP22]], i32* [[P_I_PLUS_1_Y]], align 4
1251 ; CHECK-NEXT: [[TMP23:%.*]] = load i32, i32* [[P_I_Y]], align 4
1252 ; CHECK-NEXT: [[TMP24]] = add nsw i32 [[TMP23]], [[S]]
1253 ; CHECK-NEXT: [[I_NEXT]] = add nuw nsw i64 [[I]], 1
1254 ; CHECK-NEXT: [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]]
1255 ; CHECK-NEXT: br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END:%.*]], !llvm.loop [[LOOP31:![0-9]+]]
1257 ; CHECK-NEXT: ret i32 [[TMP24]]
1263 %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
1264 %s = phi i32 [ %2, %for.body ], [ 0, %entry ]
1265 %i_plus_1 = add nuw nsw i64 %i, 1
1266 %p_i.x = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i, i32 0
1267 %p_i.y = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i, i32 1
1268 %p_i_plus_1.y = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i_plus_1, i32 1
1269 %0 = load i32, i32* %p_i.x, align 4
1270 store i32 %0, i32* %p_i_plus_1.y, align 4
1271 %1 = load i32, i32* %p_i.y, align 4
1272 %2 = add nsw i32 %1, %s
1273 %i.next = add nuw nsw i64 %i, 1
1274 %cond = icmp slt i64 %i.next, %n
1275 br i1 %cond, label %for.body, label %for.end
1278 %3 = phi i32 [ %2, %for.body ]
1282 ; PR27626_4: Ensure we form an interleaved group for strided stores in the
1283 ; presence of a write-after-write dependence. We create a group for
1284 ; (2) and (3) while excluding (1).
1286 ; void PR27626_4(int *a, int x, int y, int z, int n) {
1287 ; for (int i = 0; i < n; i += 2) {
1290 ; a[i + 1] = z; // (3)
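;
; Stores (1) and (2) are elided from the snippet above; reconstructed from the
; scalar IR below, the loop body is (a sketch):
;
;   for (int i = 0; i < n; i += 2) {
;     a[i] = x;      // (1)
;     a[i] = y;      // (2)
;     a[i + 1] = z;  // (3)
;   }
;
; (1) and (2) write the same location (a write-after-write dependence), so only
; the last store to each location may join the group: (2) and (3) become one
; interleaved wide store while (1) stays as scalar stores, as checked below.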
1295 define void @PR27626_4(i32 *%a, i32 %x, i32 %y, i32 %z, i64 %n) {
1296 ; CHECK-LABEL: @PR27626_4(
1297 ; CHECK-NEXT: entry:
1298 ; CHECK-NEXT: [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N:%.*]], i64 2)
1299 ; CHECK-NEXT: [[TMP0:%.*]] = add nsw i64 [[SMAX]], -1
1300 ; CHECK-NEXT: [[TMP1:%.*]] = lshr i64 [[TMP0]], 1
1301 ; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1
1302 ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 6
1303 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
1305 ; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[TMP2]], 9223372036854775804
1306 ; CHECK-NEXT: [[IND_END:%.*]] = shl nuw i64 [[N_VEC]], 1
1307 ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[Y:%.*]], i32 0
1308 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
1309 ; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i32> poison, i32 [[Z:%.*]], i32 0
1310 ; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT1]], <4 x i32> poison, <4 x i32> zeroinitializer
1311 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
1312 ; CHECK: vector.body:
1313 ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
1314 ; CHECK-NEXT: [[OFFSET_IDX:%.*]] = shl i64 [[INDEX]], 1
1315 ; CHECK-NEXT: [[TMP3:%.*]] = or i64 [[OFFSET_IDX]], 2
1316 ; CHECK-NEXT: [[TMP4:%.*]] = or i64 [[OFFSET_IDX]], 4
1317 ; CHECK-NEXT: [[TMP5:%.*]] = or i64 [[OFFSET_IDX]], 6
1318 ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[OFFSET_IDX]]
1319 ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP3]]
1320 ; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP4]]
1321 ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP5]]
1322 ; CHECK-NEXT: store i32 [[X:%.*]], i32* [[TMP6]], align 4
1323 ; CHECK-NEXT: store i32 [[X]], i32* [[TMP7]], align 4
1324 ; CHECK-NEXT: store i32 [[X]], i32* [[TMP8]], align 4
1325 ; CHECK-NEXT: store i32 [[X]], i32* [[TMP9]], align 4
1326 ; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[OFFSET_IDX]]
1327 ; CHECK-NEXT: [[TMP11:%.*]] = bitcast i32* [[TMP10]] to <8 x i32>*
1328 ; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLAT]], <4 x i32> [[BROADCAST_SPLAT2]], <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
1329 ; CHECK-NEXT: store <8 x i32> [[INTERLEAVED_VEC]], <8 x i32>* [[TMP11]], align 4
1330 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
1331 ; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
1332 ; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP32:![0-9]+]]
1333 ; CHECK: middle.block:
1334 ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
1335 ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
1337 ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
1338 ; CHECK-NEXT: br label [[FOR_BODY:%.*]]
1340 ; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
1341 ; CHECK-NEXT: [[I_PLUS_1:%.*]] = or i64 [[I]], 1
1342 ; CHECK-NEXT: [[A_I:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[I]]
1343 ; CHECK-NEXT: [[A_I_PLUS_1:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[I_PLUS_1]]
1344 ; CHECK-NEXT: store i32 [[Y]], i32* [[A_I]], align 4
1345 ; CHECK-NEXT: store i32 [[Z]], i32* [[A_I_PLUS_1]], align 4
1346 ; CHECK-NEXT: [[I_NEXT]] = add nuw nsw i64 [[I]], 2
1347 ; CHECK-NEXT: [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]]
1348 ; CHECK-NEXT: br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP33:![0-9]+]]
1350 ; CHECK-NEXT: ret void
1356 %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
1357 %i_plus_1 = add i64 %i, 1
1358 %a_i = getelementptr inbounds i32, i32* %a, i64 %i
1359 %a_i_plus_1 = getelementptr inbounds i32, i32* %a, i64 %i_plus_1
1360 store i32 %x, i32* %a_i, align 4
1361 store i32 %y, i32* %a_i, align 4
1362 store i32 %z, i32* %a_i_plus_1, align 4
1363 %i.next = add nuw nsw i64 %i, 2
1364 %cond = icmp slt i64 %i.next, %n
1365 br i1 %cond, label %for.body, label %for.end
1371 ; PR27626_5: Ensure we do not form an interleaved group for strided stores in
1372 ; the presence of a write-after-write dependence.
1374 ; void PR27626_5(int *a, int x, int y, int z, int n) {
1375 ; for (int i = 3; i < n; i += 2) {
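;
; The loop body is elided above; reconstructed from the scalar IR below it is
; (a sketch):
;
;   for (int i = 3; i < n; i += 2) {
;     a[i - 1] = x;
;     a[i - 3] = y;
;     a[i] = z;
;   }
;
; With i advancing by 2, the a[i - 3] store two iterations later hits the same
; location as the current a[i - 1] store (i + 2 - 3 == i - 1). That
; write-after-write dependence prevents forming an interleaved store group, so
; the checks below expect all three stores to remain scalarized.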
1383 define void @PR27626_5(i32 *%a, i32 %x, i32 %y, i32 %z, i64 %n) {
1384 ; CHECK-LABEL: @PR27626_5(
1385 ; CHECK-NEXT: entry:
1386 ; CHECK-NEXT: [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N:%.*]], i64 5)
1387 ; CHECK-NEXT: [[TMP0:%.*]] = add nsw i64 [[SMAX]], -4
1388 ; CHECK-NEXT: [[TMP1:%.*]] = lshr i64 [[TMP0]], 1
1389 ; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1
1390 ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 6
1391 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
1393 ; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[TMP2]], 9223372036854775804
1394 ; CHECK-NEXT: [[TMP3:%.*]] = shl nuw i64 [[N_VEC]], 1
1395 ; CHECK-NEXT: [[IND_END:%.*]] = or i64 [[TMP3]], 3
1396 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
1397 ; CHECK: vector.body:
1398 ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
1399 ; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 3, i64 5, i64 7, i64 9>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
1400 ; CHECK-NEXT: [[TMP4:%.*]] = shl i64 [[INDEX]], 1
1401 ; CHECK-NEXT: [[OFFSET_IDX:%.*]] = or i64 [[TMP4]], 3
1402 ; CHECK-NEXT: [[TMP5:%.*]] = or i64 [[TMP4]], 5
1403 ; CHECK-NEXT: [[TMP6:%.*]] = or i64 [[TMP4]], 7
1404 ; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[TMP4]], 9
1405 ; CHECK-NEXT: [[TMP8:%.*]] = add <4 x i64> [[VEC_IND]], <i64 -1, i64 -1, i64 -1, i64 -1>
1406 ; CHECK-NEXT: [[TMP9:%.*]] = add <4 x i64> [[VEC_IND]], <i64 -3, i64 -3, i64 -3, i64 -3>
1407 ; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[OFFSET_IDX]]
1408 ; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP5]]
1409 ; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP6]]
1410 ; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP7]]
1411 ; CHECK-NEXT: [[TMP14:%.*]] = extractelement <4 x i64> [[TMP8]], i32 0
1412 ; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP14]]
1413 ; CHECK-NEXT: [[TMP16:%.*]] = extractelement <4 x i64> [[TMP8]], i32 1
1414 ; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP16]]
1415 ; CHECK-NEXT: [[TMP18:%.*]] = extractelement <4 x i64> [[TMP8]], i32 2
1416 ; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP18]]
1417 ; CHECK-NEXT: [[TMP20:%.*]] = extractelement <4 x i64> [[TMP8]], i32 3
1418 ; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP20]]
1419 ; CHECK-NEXT: [[TMP22:%.*]] = extractelement <4 x i64> [[TMP9]], i32 0
1420 ; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP22]]
1421 ; CHECK-NEXT: [[TMP24:%.*]] = extractelement <4 x i64> [[TMP9]], i32 1
1422 ; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP24]]
1423 ; CHECK-NEXT: [[TMP26:%.*]] = extractelement <4 x i64> [[TMP9]], i32 2
1424 ; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP26]]
1425 ; CHECK-NEXT: [[TMP28:%.*]] = extractelement <4 x i64> [[TMP9]], i32 3
1426 ; CHECK-NEXT: [[TMP29:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP28]]
1427 ; CHECK-NEXT: store i32 [[X:%.*]], i32* [[TMP15]], align 4
1428 ; CHECK-NEXT: store i32 [[X]], i32* [[TMP17]], align 4
1429 ; CHECK-NEXT: store i32 [[X]], i32* [[TMP19]], align 4
1430 ; CHECK-NEXT: store i32 [[X]], i32* [[TMP21]], align 4
1431 ; CHECK-NEXT: store i32 [[Y:%.*]], i32* [[TMP23]], align 4
1432 ; CHECK-NEXT: store i32 [[Y]], i32* [[TMP25]], align 4
1433 ; CHECK-NEXT: store i32 [[Y]], i32* [[TMP27]], align 4
1434 ; CHECK-NEXT: store i32 [[Y]], i32* [[TMP29]], align 4
1435 ; CHECK-NEXT: store i32 [[Z:%.*]], i32* [[TMP10]], align 4
1436 ; CHECK-NEXT: store i32 [[Z]], i32* [[TMP11]], align 4
1437 ; CHECK-NEXT: store i32 [[Z]], i32* [[TMP12]], align 4
1438 ; CHECK-NEXT: store i32 [[Z]], i32* [[TMP13]], align 4
1439 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
1440 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], <i64 8, i64 8, i64 8, i64 8>
1441 ; CHECK-NEXT: [[TMP30:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
1442 ; CHECK-NEXT: br i1 [[TMP30]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP34:![0-9]+]]
1443 ; CHECK: middle.block:
1444 ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
1445 ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
1447 ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 3, [[ENTRY:%.*]] ]
1448 ; CHECK-NEXT: br label [[FOR_BODY:%.*]]
1450 ; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
1451 ; CHECK-NEXT: [[I_MINUS_1:%.*]] = add i64 [[I]], -1
1452 ; CHECK-NEXT: [[I_MINUS_3:%.*]] = add i64 [[I]], -3
1453 ; CHECK-NEXT: [[A_I:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[I]]
1454 ; CHECK-NEXT: [[A_I_MINUS_1:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[I_MINUS_1]]
1455 ; CHECK-NEXT: [[A_I_MINUS_3:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[I_MINUS_3]]
1456 ; CHECK-NEXT: store i32 [[X]], i32* [[A_I_MINUS_1]], align 4
1457 ; CHECK-NEXT: store i32 [[Y]], i32* [[A_I_MINUS_3]], align 4
1458 ; CHECK-NEXT: store i32 [[Z]], i32* [[A_I]], align 4
1459 ; CHECK-NEXT: [[I_NEXT]] = add nuw nsw i64 [[I]], 2
1460 ; CHECK-NEXT: [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]]
1461 ; CHECK-NEXT: br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP35:![0-9]+]]
1463 ; CHECK-NEXT: ret void
1469 %i = phi i64 [ %i.next, %for.body ], [ 3, %entry ]
1470 %i_minus_1 = sub i64 %i, 1
1471 %i_minus_3 = sub i64 %i_minus_1, 2
1472 %a_i = getelementptr inbounds i32, i32* %a, i64 %i
1473 %a_i_minus_1 = getelementptr inbounds i32, i32* %a, i64 %i_minus_1
1474 %a_i_minus_3 = getelementptr inbounds i32, i32* %a, i64 %i_minus_3
1475 store i32 %x, i32* %a_i_minus_1, align 4
1476 store i32 %y, i32* %a_i_minus_3, align 4
1477 store i32 %z, i32* %a_i, align 4
1478 %i.next = add nuw nsw i64 %i, 2
1479 %cond = icmp slt i64 %i.next, %n
1480 br i1 %cond, label %for.body, label %for.end
1486 ; PR34743: Ensure that a cast which needs to sink after a load that belongs to
1487 ; an interleaved group indeed gets sunk.
1489 ; void PR34743(short *a, int *b, int n) {
1490 ; for (int i = 0, iv = 0; iv < n; i++, iv += 2) {
1491 ; b[i] = a[iv] * a[iv+1] * a[iv+2];
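;
; Because iv advances by 2, the a[iv + 2] loaded on one iteration is the a[iv]
; of the next, so the scalar IR below carries that load in a PHI instead of
; reloading it. A C sketch that makes the recurrence explicit (the _sketch name
; is illustrative; assumes n is even and the trailing accesses stay in bounds):
;
;   void PR34743_sketch(short *a, int *b, long n) {
;     short prev = a[0];
;     for (long i = 0, iv = 0; iv < n; i++, iv += 2) {
;       short cur1 = a[iv + 1];
;       short cur2 = a[iv + 2];
;       b[i] = (int)prev * (int)cur1 * (int)cur2;
;       prev = cur2;   // first-order recurrence
;     }
;   }
;
; The sext of prev is defined before the loads of the current iteration, so once
; a[iv + 1] and a[iv + 2] form an interleaved load the cast has to be sunk below
; the wide load in the vector body, which is what the checks below verify.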
1496 define void @PR34743(i16* %a, i32* %b, i64 %n) {
1497 ; CHECK-LABEL: @PR34743(
1498 ; CHECK-NEXT: entry:
1499 ; CHECK-NEXT: [[DOTPRE:%.*]] = load i16, i16* [[A:%.*]], align 2
1500 ; CHECK-NEXT: [[TMP0:%.*]] = lshr i64 [[N:%.*]], 1
1501 ; CHECK-NEXT: [[TMP1:%.*]] = add nuw i64 [[TMP0]], 1
1502 ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 6
1503 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
1504 ; CHECK: vector.memcheck:
1505 ; CHECK-NEXT: [[TMP2:%.*]] = lshr i64 [[N]], 1
1506 ; CHECK-NEXT: [[TMP3:%.*]] = add nuw i64 [[TMP2]], 1
1507 ; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32* [[B:%.*]], i64 [[TMP3]]
1508 ; CHECK-NEXT: [[SCEVGEP3:%.*]] = getelementptr i16, i16* [[A]], i64 1
1509 ; CHECK-NEXT: [[TMP4:%.*]] = and i64 [[N]], -2
1510 ; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[TMP4]], 3
1511 ; CHECK-NEXT: [[SCEVGEP5:%.*]] = getelementptr i16, i16* [[A]], i64 [[TMP5]]
1512 ; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16* [[SCEVGEP5]] to i32*
1513 ; CHECK-NEXT: [[BOUND0:%.*]] = icmp ugt i32* [[TMP6]], [[B]]
1514 ; CHECK-NEXT: [[TMP7:%.*]] = bitcast i32* [[SCEVGEP]] to i16*
1515 ; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult i16* [[SCEVGEP3]], [[TMP7]]
1516 ; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
1517 ; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
1519 ; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[TMP1]], -4
1520 ; CHECK-NEXT: [[IND_END:%.*]] = shl i64 [[N_VEC]], 1
1521 ; CHECK-NEXT: [[VECTOR_RECUR_INIT:%.*]] = insertelement <4 x i16> poison, i16 [[DOTPRE]], i32 3
1522 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
1523 ; CHECK: vector.body:
1524 ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
1525 ; CHECK-NEXT: [[VECTOR_RECUR:%.*]] = phi <4 x i16> [ [[VECTOR_RECUR_INIT]], [[VECTOR_PH]] ], [ [[STRIDED_VEC8:%.*]], [[VECTOR_BODY]] ]
1526 ; CHECK-NEXT: [[OFFSET_IDX:%.*]] = shl i64 [[INDEX]], 1
1527 ; CHECK-NEXT: [[TMP8:%.*]] = or i64 [[OFFSET_IDX]], 1
1528 ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i16, i16* [[A]], i64 [[TMP8]]
1529 ; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16* [[TMP9]] to <8 x i16>*
1530 ; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <8 x i16>, <8 x i16>* [[TMP10]], align 4
1531 ; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <8 x i16> [[WIDE_VEC]], <8 x i16> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
1532 ; CHECK-NEXT: [[STRIDED_VEC8]] = shufflevector <8 x i16> [[WIDE_VEC]], <8 x i16> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
1533 ; CHECK-NEXT: [[TMP11:%.*]] = sext <4 x i16> [[STRIDED_VEC]] to <4 x i32>
1534 ; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <4 x i16> [[VECTOR_RECUR]], <4 x i16> [[STRIDED_VEC8]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
1535 ; CHECK-NEXT: [[TMP13:%.*]] = sext <4 x i16> [[TMP12]] to <4 x i32>
1536 ; CHECK-NEXT: [[TMP14:%.*]] = sext <4 x i16> [[STRIDED_VEC8]] to <4 x i32>
1537 ; CHECK-NEXT: [[TMP15:%.*]] = mul nsw <4 x i32> [[TMP13]], [[TMP11]]
1538 ; CHECK-NEXT: [[TMP16:%.*]] = mul nsw <4 x i32> [[TMP15]], [[TMP14]]
1539 ; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDEX]]
1540 ; CHECK-NEXT: [[TMP18:%.*]] = bitcast i32* [[TMP17]] to <4 x i32>*
1541 ; CHECK-NEXT: store <4 x i32> [[TMP16]], <4 x i32>* [[TMP18]], align 4, !alias.scope !36, !noalias !39
1542 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
1543 ; CHECK-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
1544 ; CHECK-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP41:![0-9]+]]
1545 ; CHECK: middle.block:
1546 ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP1]], [[N_VEC]]
1547 ; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <8 x i16> [[WIDE_VEC]], i32 7
1548 ; CHECK-NEXT: br i1 [[CMP_N]], label [[END:%.*]], label [[SCALAR_PH]]
1550 ; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[DOTPRE]], [[VECTOR_MEMCHECK]] ], [ [[DOTPRE]], [[ENTRY:%.*]] ], [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ]
1551 ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ENTRY]] ], [ [[IND_END]], [[MIDDLE_BLOCK]] ]
1552 ; CHECK-NEXT: [[BC_RESUME_VAL7:%.*]] = phi i64 [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ENTRY]] ], [ [[N_VEC]], [[MIDDLE_BLOCK]] ]
1553 ; CHECK-NEXT: br label [[LOOP:%.*]]
1555 ; CHECK-NEXT: [[SCALAR_RECUR:%.*]] = phi i16 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[LOAD2:%.*]], [[LOOP]] ]
1556 ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV2:%.*]], [[LOOP]] ]
1557 ; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL7]], [[SCALAR_PH]] ], [ [[I1:%.*]], [[LOOP]] ]
1558 ; CHECK-NEXT: [[CONV:%.*]] = sext i16 [[SCALAR_RECUR]] to i32
1559 ; CHECK-NEXT: [[I1]] = add nuw nsw i64 [[I]], 1
1560 ; CHECK-NEXT: [[IV1:%.*]] = or i64 [[IV]], 1
1561 ; CHECK-NEXT: [[IV2]] = add nuw nsw i64 [[IV]], 2
1562 ; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds i16, i16* [[A]], i64 [[IV1]]
1563 ; CHECK-NEXT: [[LOAD1:%.*]] = load i16, i16* [[GEP1]], align 4
1564 ; CHECK-NEXT: [[CONV1:%.*]] = sext i16 [[LOAD1]] to i32
1565 ; CHECK-NEXT: [[GEP2:%.*]] = getelementptr inbounds i16, i16* [[A]], i64 [[IV2]]
1566 ; CHECK-NEXT: [[LOAD2]] = load i16, i16* [[GEP2]], align 4
1567 ; CHECK-NEXT: [[CONV2:%.*]] = sext i16 [[LOAD2]] to i32
1568 ; CHECK-NEXT: [[MUL01:%.*]] = mul nsw i32 [[CONV]], [[CONV1]]
1569 ; CHECK-NEXT: [[MUL012:%.*]] = mul nsw i32 [[MUL01]], [[CONV2]]
1570 ; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[I]]
1571 ; CHECK-NEXT: store i32 [[MUL012]], i32* [[ARRAYIDX5]], align 4
1572 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV]], [[N]]
1573 ; CHECK-NEXT: br i1 [[EXITCOND]], label [[END]], label [[LOOP]], !llvm.loop [[LOOP42:![0-9]+]]
1575 ; CHECK-NEXT: ret void
1578 %.pre = load i16, i16* %a
1582 %0 = phi i16 [ %.pre, %entry ], [ %load2, %loop ]
1583 %iv = phi i64 [ 0, %entry ], [ %iv2, %loop ]
1584 %i = phi i64 [ 0, %entry ], [ %i1, %loop ]
1585 %conv = sext i16 %0 to i32
1586 %i1 = add nuw nsw i64 %i, 1
1587 %iv1 = add nuw nsw i64 %iv, 1
1588 %iv2 = add nuw nsw i64 %iv, 2
1589 %gep1 = getelementptr inbounds i16, i16* %a, i64 %iv1
1590 %load1 = load i16, i16* %gep1, align 4
1591 %conv1 = sext i16 %load1 to i32
1592 %gep2 = getelementptr inbounds i16, i16* %a, i64 %iv2
1593 %load2 = load i16, i16* %gep2, align 4
1594 %conv2 = sext i16 %load2 to i32
1595 %mul01 = mul nsw i32 %conv, %conv1
1596 %mul012 = mul nsw i32 %mul01, %conv2
1597 %arrayidx5 = getelementptr inbounds i32, i32* %b, i64 %i
1598 store i32 %mul012, i32* %arrayidx5
1599 %exitcond = icmp eq i64 %iv, %n
1600 br i1 %exitcond, label %end, label %loop
1606 attributes #0 = { "unsafe-fp-math"="true" }