; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -passes=loop-vectorize -mcpu=corei7-avx -S | FileCheck %s -check-prefix=AVX -check-prefix=AVX1
; RUN: opt < %s -passes=loop-vectorize -mcpu=core-avx2 -S | FileCheck %s -check-prefix=AVX -check-prefix=AVX2
; RUN: opt < %s -passes=loop-vectorize -mcpu=knl -S | FileCheck %s -check-prefix=AVX512
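
; The three RUN lines check the same IR at three ISA levels: corei7-avx (AVX1),
; core-avx2 (AVX2) and knl (AVX-512). Masked vector loads and stores are legal
; on all of them, so the if-guarded accesses below should vectorize into
; @llvm.masked.load / @llvm.masked.store calls rather than being scalarized.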

target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-pc_linux"
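
; The plans differ only in vectorization and interleave factors: the AVX1
; checks use VF=8 with no interleaving, AVX2 uses VF=8 interleaved by 4, and
; AVX512 uses VF=16 interleaved by 4. The same numbers explain the runtime
; alias-check distances in vector.memcheck below: VF * IC * 4 bytes per element
; yields the 32, 128 and 256 byte bounds tested against A-trigger and A-B.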

;void foo1(int *A, int *B, int *trigger) {
;
;  for (int i=0; i<10000; i++) {
;    if (trigger[i] < 100) {
;      A[i] = B[i] + trigger[i];
;    }
;  }
;}
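;
; The `if (trigger[i] < 100)` guard turns into the mask: each vector part
; compares a wide load of trigger[] against splat (i32 100), and the resulting
; <8 x i1> (or <16 x i1>) predicate drives both the masked load of B[] and the
; masked store to A[].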

define void @foo1(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture readonly %trigger) local_unnamed_addr #0 {
; AVX1-LABEL: @foo1(
; AVX1-NEXT: entry:
; AVX1-NEXT: [[B3:%.*]] = ptrtoint ptr [[B:%.*]] to i64
; AVX1-NEXT: [[TRIGGER2:%.*]] = ptrtoint ptr [[TRIGGER:%.*]] to i64
; AVX1-NEXT: [[A1:%.*]] = ptrtoint ptr [[A:%.*]] to i64
; AVX1-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
; AVX1: vector.memcheck:
; AVX1-NEXT: [[TMP0:%.*]] = sub i64 [[A1]], [[TRIGGER2]]
; AVX1-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP0]], 32
; AVX1-NEXT: [[TMP1:%.*]] = sub i64 [[A1]], [[B3]]
; AVX1-NEXT: [[DIFF_CHECK4:%.*]] = icmp ult i64 [[TMP1]], 32
; AVX1-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[DIFF_CHECK]], [[DIFF_CHECK4]]
; AVX1-NEXT: br i1 [[CONFLICT_RDX]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
; AVX1: vector.ph:
; AVX1-NEXT: br label [[VECTOR_BODY:%.*]]
; AVX1: vector.body:
; AVX1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; AVX1-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 0
; AVX1-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[TMP2]]
; AVX1-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 0
; AVX1-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, ptr [[TMP4]], align 4
; AVX1-NEXT: [[TMP5:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD]], splat (i32 100)
; AVX1-NEXT: [[TMP6:%.*]] = getelementptr i32, ptr [[B]], i64 [[TMP2]]
; AVX1-NEXT: [[TMP7:%.*]] = getelementptr i32, ptr [[TMP6]], i32 0
; AVX1-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr [[TMP7]], i32 4, <8 x i1> [[TMP5]], <8 x i32> poison)
; AVX1-NEXT: [[TMP8:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD]], [[WIDE_LOAD]]
; AVX1-NEXT: [[TMP9:%.*]] = getelementptr i32, ptr [[A]], i64 [[TMP2]]
; AVX1-NEXT: [[TMP10:%.*]] = getelementptr i32, ptr [[TMP9]], i32 0
; AVX1-NEXT: call void @llvm.masked.store.v8i32.p0(<8 x i32> [[TMP8]], ptr [[TMP10]], i32 4, <8 x i1> [[TMP5]])
; AVX1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; AVX1-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], 10000
; AVX1-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; AVX1: middle.block:
; AVX1-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
; AVX1: scalar.ph:
; AVX1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 10000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ]
; AVX1-NEXT: br label [[FOR_BODY:%.*]]
; AVX1: for.body:
; AVX1-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
; AVX1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[INDVARS_IV]]
; AVX1-NEXT: [[TMP12:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
; AVX1-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP12]], 100
; AVX1-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]]
; AVX1: if.then:
; AVX1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDVARS_IV]]
; AVX1-NEXT: [[TMP13:%.*]] = load i32, ptr [[ARRAYIDX3]], align 4
; AVX1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], [[TMP12]]
; AVX1-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDVARS_IV]]
; AVX1-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX7]], align 4
; AVX1-NEXT: br label [[FOR_INC]]
; AVX1: for.inc:
; AVX1-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; AVX1-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 10000
; AVX1-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
; AVX1: for.end:
; AVX1-NEXT: ret void
;
; AVX2-LABEL: @foo1(
; AVX2-NEXT: iter.check:
; AVX2-NEXT: [[B3:%.*]] = ptrtoint ptr [[B:%.*]] to i64
; AVX2-NEXT: [[TRIGGER2:%.*]] = ptrtoint ptr [[TRIGGER:%.*]] to i64
; AVX2-NEXT: [[A1:%.*]] = ptrtoint ptr [[A:%.*]] to i64
; AVX2-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
; AVX2: vector.memcheck:
; AVX2-NEXT: [[TMP0:%.*]] = sub i64 [[A1]], [[TRIGGER2]]
; AVX2-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP0]], 128
; AVX2-NEXT: [[TMP1:%.*]] = sub i64 [[A1]], [[B3]]
; AVX2-NEXT: [[DIFF_CHECK4:%.*]] = icmp ult i64 [[TMP1]], 128
; AVX2-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[DIFF_CHECK]], [[DIFF_CHECK4]]
; AVX2-NEXT: br i1 [[CONFLICT_RDX]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
; AVX2: vector.main.loop.iter.check:
; AVX2-NEXT: br i1 false, label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH1:%.*]]
; AVX2: vector.ph:
; AVX2-NEXT: br label [[VECTOR_BODY:%.*]]
; AVX2: vector.body:
; AVX2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH1]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; AVX2-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 0
; AVX2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[TMP2]]
; AVX2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 0
; AVX2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 8
; AVX2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 16
; AVX2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 24
; AVX2-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, ptr [[TMP4]], align 4
; AVX2-NEXT: [[WIDE_LOAD5:%.*]] = load <8 x i32>, ptr [[TMP5]], align 4
; AVX2-NEXT: [[WIDE_LOAD6:%.*]] = load <8 x i32>, ptr [[TMP6]], align 4
; AVX2-NEXT: [[WIDE_LOAD7:%.*]] = load <8 x i32>, ptr [[TMP7]], align 4
; AVX2-NEXT: [[TMP8:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD]], splat (i32 100)
; AVX2-NEXT: [[TMP9:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD5]], splat (i32 100)
; AVX2-NEXT: [[TMP10:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD6]], splat (i32 100)
; AVX2-NEXT: [[TMP11:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD7]], splat (i32 100)
; AVX2-NEXT: [[TMP12:%.*]] = getelementptr i32, ptr [[B]], i64 [[TMP2]]
; AVX2-NEXT: [[TMP13:%.*]] = getelementptr i32, ptr [[TMP12]], i32 0
; AVX2-NEXT: [[TMP14:%.*]] = getelementptr i32, ptr [[TMP12]], i32 8
; AVX2-NEXT: [[TMP15:%.*]] = getelementptr i32, ptr [[TMP12]], i32 16
; AVX2-NEXT: [[TMP16:%.*]] = getelementptr i32, ptr [[TMP12]], i32 24
; AVX2-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr [[TMP13]], i32 4, <8 x i1> [[TMP8]], <8 x i32> poison)
; AVX2-NEXT: [[WIDE_MASKED_LOAD8:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr [[TMP14]], i32 4, <8 x i1> [[TMP9]], <8 x i32> poison)
; AVX2-NEXT: [[WIDE_MASKED_LOAD9:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr [[TMP15]], i32 4, <8 x i1> [[TMP10]], <8 x i32> poison)
; AVX2-NEXT: [[WIDE_MASKED_LOAD10:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr [[TMP16]], i32 4, <8 x i1> [[TMP11]], <8 x i32> poison)
; AVX2-NEXT: [[TMP17:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD]], [[WIDE_LOAD]]
; AVX2-NEXT: [[TMP18:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD8]], [[WIDE_LOAD5]]
; AVX2-NEXT: [[TMP19:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD9]], [[WIDE_LOAD6]]
; AVX2-NEXT: [[TMP20:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD10]], [[WIDE_LOAD7]]
; AVX2-NEXT: [[TMP21:%.*]] = getelementptr i32, ptr [[A]], i64 [[TMP2]]
; AVX2-NEXT: [[TMP22:%.*]] = getelementptr i32, ptr [[TMP21]], i32 0
; AVX2-NEXT: [[TMP23:%.*]] = getelementptr i32, ptr [[TMP21]], i32 8
; AVX2-NEXT: [[TMP24:%.*]] = getelementptr i32, ptr [[TMP21]], i32 16
; AVX2-NEXT: [[TMP25:%.*]] = getelementptr i32, ptr [[TMP21]], i32 24
; AVX2-NEXT: call void @llvm.masked.store.v8i32.p0(<8 x i32> [[TMP17]], ptr [[TMP22]], i32 4, <8 x i1> [[TMP8]])
; AVX2-NEXT: call void @llvm.masked.store.v8i32.p0(<8 x i32> [[TMP18]], ptr [[TMP23]], i32 4, <8 x i1> [[TMP9]])
; AVX2-NEXT: call void @llvm.masked.store.v8i32.p0(<8 x i32> [[TMP19]], ptr [[TMP24]], i32 4, <8 x i1> [[TMP10]])
; AVX2-NEXT: call void @llvm.masked.store.v8i32.p0(<8 x i32> [[TMP20]], ptr [[TMP25]], i32 4, <8 x i1> [[TMP11]])
; AVX2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
; AVX2-NEXT: [[TMP26:%.*]] = icmp eq i64 [[INDEX_NEXT]], 9984
; AVX2-NEXT: br i1 [[TMP26]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; AVX2: middle.block:
; AVX2-NEXT: br i1 false, label [[FOR_END:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
; AVX2: vec.epilog.iter.check:
; AVX2-NEXT: br i1 false, label [[SCALAR_PH]], label [[VEC_EPILOG_PH]]
; AVX2: vec.epilog.ph:
; AVX2-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 9984, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_PH]] ]
; AVX2-NEXT: br label [[FOR_BODY:%.*]]
; AVX2: vec.epilog.vector.body:
; AVX2-NEXT: [[INDEX11:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT14:%.*]], [[FOR_BODY]] ]
; AVX2-NEXT: [[TMP37:%.*]] = add i64 [[INDEX11]], 0
; AVX2-NEXT: [[TMP38:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[TMP37]]
; AVX2-NEXT: [[TMP29:%.*]] = getelementptr inbounds i32, ptr [[TMP38]], i32 0
; AVX2-NEXT: [[WIDE_LOAD12:%.*]] = load <8 x i32>, ptr [[TMP29]], align 4
; AVX2-NEXT: [[TMP30:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD12]], splat (i32 100)
; AVX2-NEXT: [[TMP31:%.*]] = getelementptr i32, ptr [[B]], i64 [[TMP37]]
; AVX2-NEXT: [[TMP32:%.*]] = getelementptr i32, ptr [[TMP31]], i32 0
; AVX2-NEXT: [[WIDE_MASKED_LOAD13:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr [[TMP32]], i32 4, <8 x i1> [[TMP30]], <8 x i32> poison)
; AVX2-NEXT: [[TMP33:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD13]], [[WIDE_LOAD12]]
; AVX2-NEXT: [[TMP34:%.*]] = getelementptr i32, ptr [[A]], i64 [[TMP37]]
; AVX2-NEXT: [[TMP35:%.*]] = getelementptr i32, ptr [[TMP34]], i32 0
; AVX2-NEXT: call void @llvm.masked.store.v8i32.p0(<8 x i32> [[TMP33]], ptr [[TMP35]], i32 4, <8 x i1> [[TMP30]])
; AVX2-NEXT: [[INDEX_NEXT14]] = add nuw i64 [[INDEX11]], 8
; AVX2-NEXT: [[TMP36:%.*]] = icmp eq i64 [[INDEX_NEXT14]], 10000
; AVX2-NEXT: br i1 [[TMP36]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
; AVX2: vec.epilog.middle.block:
; AVX2-NEXT: br i1 true, label [[FOR_END]], label [[SCALAR_PH]]
; AVX2: vec.epilog.scalar.ph:
; AVX2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 10000, [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 9984, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ITER_CHECK:%.*]] ]
; AVX2-NEXT: br label [[FOR_BODY1:%.*]]
; AVX2: for.body:
; AVX2-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
; AVX2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[INDVARS_IV]]
; AVX2-NEXT: [[TMP27:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
; AVX2-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP27]], 100
; AVX2-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]]
; AVX2: if.then:
; AVX2-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDVARS_IV]]
; AVX2-NEXT: [[TMP28:%.*]] = load i32, ptr [[ARRAYIDX3]], align 4
; AVX2-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP28]], [[TMP27]]
; AVX2-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDVARS_IV]]
; AVX2-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX7]], align 4
; AVX2-NEXT: br label [[FOR_INC]]
; AVX2: for.inc:
; AVX2-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; AVX2-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 10000
; AVX2-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY1]], !llvm.loop [[LOOP4:![0-9]+]]
; AVX2: for.end:
; AVX2-NEXT: ret void
;
; AVX512-LABEL: @foo1(
; AVX512-NEXT: iter.check:
; AVX512-NEXT: [[B3:%.*]] = ptrtoint ptr [[B:%.*]] to i64
; AVX512-NEXT: [[TRIGGER2:%.*]] = ptrtoint ptr [[TRIGGER:%.*]] to i64
; AVX512-NEXT: [[A1:%.*]] = ptrtoint ptr [[A:%.*]] to i64
; AVX512-NEXT: br i1 false, label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
; AVX512: vector.memcheck:
; AVX512-NEXT: [[TMP0:%.*]] = sub i64 [[A1]], [[TRIGGER2]]
; AVX512-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP0]], 256
; AVX512-NEXT: [[TMP1:%.*]] = sub i64 [[A1]], [[B3]]
; AVX512-NEXT: [[DIFF_CHECK4:%.*]] = icmp ult i64 [[TMP1]], 256
; AVX512-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[DIFF_CHECK]], [[DIFF_CHECK4]]
; AVX512-NEXT: br i1 [[CONFLICT_RDX]], label [[VEC_EPILOG_SCALAR_PH]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
; AVX512: vector.main.loop.iter.check:
; AVX512-NEXT: br i1 false, label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
; AVX512: vector.ph:
; AVX512-NEXT: br label [[VECTOR_BODY:%.*]]
; AVX512: vector.body:
; AVX512-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; AVX512-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 0
; AVX512-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[TMP2]]
; AVX512-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 0
; AVX512-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 16
; AVX512-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 32
; AVX512-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 48
; AVX512-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i32>, ptr [[TMP4]], align 4
; AVX512-NEXT: [[WIDE_LOAD5:%.*]] = load <16 x i32>, ptr [[TMP5]], align 4
; AVX512-NEXT: [[WIDE_LOAD6:%.*]] = load <16 x i32>, ptr [[TMP6]], align 4
; AVX512-NEXT: [[WIDE_LOAD7:%.*]] = load <16 x i32>, ptr [[TMP7]], align 4
; AVX512-NEXT: [[TMP8:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD]], splat (i32 100)
; AVX512-NEXT: [[TMP9:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD5]], splat (i32 100)
; AVX512-NEXT: [[TMP10:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD6]], splat (i32 100)
; AVX512-NEXT: [[TMP11:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD7]], splat (i32 100)
; AVX512-NEXT: [[TMP12:%.*]] = getelementptr i32, ptr [[B]], i64 [[TMP2]]
; AVX512-NEXT: [[TMP13:%.*]] = getelementptr i32, ptr [[TMP12]], i32 0
; AVX512-NEXT: [[TMP14:%.*]] = getelementptr i32, ptr [[TMP12]], i32 16
; AVX512-NEXT: [[TMP15:%.*]] = getelementptr i32, ptr [[TMP12]], i32 32
; AVX512-NEXT: [[TMP16:%.*]] = getelementptr i32, ptr [[TMP12]], i32 48
; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr [[TMP13]], i32 4, <16 x i1> [[TMP8]], <16 x i32> poison)
; AVX512-NEXT: [[WIDE_MASKED_LOAD8:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr [[TMP14]], i32 4, <16 x i1> [[TMP9]], <16 x i32> poison)
; AVX512-NEXT: [[WIDE_MASKED_LOAD9:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr [[TMP15]], i32 4, <16 x i1> [[TMP10]], <16 x i32> poison)
; AVX512-NEXT: [[WIDE_MASKED_LOAD10:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr [[TMP16]], i32 4, <16 x i1> [[TMP11]], <16 x i32> poison)
; AVX512-NEXT: [[TMP17:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD]], [[WIDE_LOAD]]
; AVX512-NEXT: [[TMP18:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD8]], [[WIDE_LOAD5]]
; AVX512-NEXT: [[TMP19:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD9]], [[WIDE_LOAD6]]
; AVX512-NEXT: [[TMP20:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD10]], [[WIDE_LOAD7]]
; AVX512-NEXT: [[TMP21:%.*]] = getelementptr i32, ptr [[A]], i64 [[TMP2]]
; AVX512-NEXT: [[TMP22:%.*]] = getelementptr i32, ptr [[TMP21]], i32 0
; AVX512-NEXT: [[TMP23:%.*]] = getelementptr i32, ptr [[TMP21]], i32 16
; AVX512-NEXT: [[TMP24:%.*]] = getelementptr i32, ptr [[TMP21]], i32 32
; AVX512-NEXT: [[TMP25:%.*]] = getelementptr i32, ptr [[TMP21]], i32 48
; AVX512-NEXT: call void @llvm.masked.store.v16i32.p0(<16 x i32> [[TMP17]], ptr [[TMP22]], i32 4, <16 x i1> [[TMP8]])
; AVX512-NEXT: call void @llvm.masked.store.v16i32.p0(<16 x i32> [[TMP18]], ptr [[TMP23]], i32 4, <16 x i1> [[TMP9]])
; AVX512-NEXT: call void @llvm.masked.store.v16i32.p0(<16 x i32> [[TMP19]], ptr [[TMP24]], i32 4, <16 x i1> [[TMP10]])
; AVX512-NEXT: call void @llvm.masked.store.v16i32.p0(<16 x i32> [[TMP20]], ptr [[TMP25]], i32 4, <16 x i1> [[TMP11]])
; AVX512-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 64
; AVX512-NEXT: [[TMP26:%.*]] = icmp eq i64 [[INDEX_NEXT]], 9984
; AVX512-NEXT: br i1 [[TMP26]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; AVX512: middle.block:
; AVX512-NEXT: br i1 false, label [[FOR_END:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
; AVX512: vec.epilog.iter.check:
; AVX512-NEXT: br i1 false, label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
; AVX512: vec.epilog.ph:
; AVX512-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 9984, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
; AVX512-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
; AVX512: vec.epilog.vector.body:
; AVX512-NEXT: [[INDEX11:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT14:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
; AVX512-NEXT: [[TMP27:%.*]] = add i64 [[INDEX11]], 0
; AVX512-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[TMP27]]
; AVX512-NEXT: [[TMP29:%.*]] = getelementptr inbounds i32, ptr [[TMP28]], i32 0
; AVX512-NEXT: [[WIDE_LOAD12:%.*]] = load <16 x i32>, ptr [[TMP29]], align 4
; AVX512-NEXT: [[TMP30:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD12]], splat (i32 100)
; AVX512-NEXT: [[TMP31:%.*]] = getelementptr i32, ptr [[B]], i64 [[TMP27]]
; AVX512-NEXT: [[TMP32:%.*]] = getelementptr i32, ptr [[TMP31]], i32 0
; AVX512-NEXT: [[WIDE_MASKED_LOAD13:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr [[TMP32]], i32 4, <16 x i1> [[TMP30]], <16 x i32> poison)
; AVX512-NEXT: [[TMP33:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD13]], [[WIDE_LOAD12]]
; AVX512-NEXT: [[TMP34:%.*]] = getelementptr i32, ptr [[A]], i64 [[TMP27]]
; AVX512-NEXT: [[TMP35:%.*]] = getelementptr i32, ptr [[TMP34]], i32 0
; AVX512-NEXT: call void @llvm.masked.store.v16i32.p0(<16 x i32> [[TMP33]], ptr [[TMP35]], i32 4, <16 x i1> [[TMP30]])
; AVX512-NEXT: [[INDEX_NEXT14]] = add nuw i64 [[INDEX11]], 16
; AVX512-NEXT: [[TMP36:%.*]] = icmp eq i64 [[INDEX_NEXT14]], 10000
; AVX512-NEXT: br i1 [[TMP36]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
; AVX512: vec.epilog.middle.block:
; AVX512-NEXT: br i1 true, label [[FOR_END]], label [[VEC_EPILOG_SCALAR_PH]]
; AVX512: vec.epilog.scalar.ph:
; AVX512-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 10000, [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 9984, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ITER_CHECK:%.*]] ]
; AVX512-NEXT: br label [[FOR_BODY:%.*]]
; AVX512: for.body:
; AVX512-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
; AVX512-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[INDVARS_IV]]
; AVX512-NEXT: [[TMP37:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
; AVX512-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP37]], 100
; AVX512-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]]
; AVX512: if.then:
; AVX512-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDVARS_IV]]
; AVX512-NEXT: [[TMP38:%.*]] = load i32, ptr [[ARRAYIDX3]], align 4
; AVX512-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP38]], [[TMP37]]
; AVX512-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDVARS_IV]]
; AVX512-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX7]], align 4
; AVX512-NEXT: br label [[FOR_INC]]
; AVX512: for.inc:
; AVX512-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; AVX512-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 10000
; AVX512-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; AVX512: for.end:
; AVX512-NEXT: ret void
;
entry:
  br label %for.body

for.body: ; preds = %for.inc, %entry
  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.inc ]
  %arrayidx = getelementptr inbounds i32, ptr %trigger, i64 %indvars.iv
  %0 = load i32, ptr %arrayidx, align 4
  %cmp1 = icmp slt i32 %0, 100
  br i1 %cmp1, label %if.then, label %for.inc

if.then: ; preds = %for.body
  %arrayidx3 = getelementptr inbounds i32, ptr %B, i64 %indvars.iv
  %1 = load i32, ptr %arrayidx3, align 4
  %add = add nsw i32 %1, %0
  %arrayidx7 = getelementptr inbounds i32, ptr %A, i64 %indvars.iv
  store i32 %add, ptr %arrayidx7, align 4
  br label %for.inc

for.inc: ; preds = %for.body, %if.then
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %exitcond = icmp eq i64 %indvars.iv.next, 10000
  br i1 %exitcond, label %for.end, label %for.body

for.end: ; preds = %for.inc
  ret void
}

; The same as @foo1 but all the pointers are address space 1 pointers.
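; The vector plan is identical; only the intrinsic mangling changes, using the
; .p1 suffix and ptr addrspace(1) operands (e.g. @llvm.masked.load.v8i32.p1)
; instead of .p0.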

define void @foo1_addrspace1(ptr addrspace(1) nocapture %A, ptr addrspace(1) nocapture readonly %B, ptr addrspace(1) nocapture readonly %trigger) local_unnamed_addr #0 {
; AVX1-LABEL: @foo1_addrspace1(
; AVX1-NEXT: entry:
; AVX1-NEXT: [[B3:%.*]] = ptrtoint ptr addrspace(1) [[B:%.*]] to i64
; AVX1-NEXT: [[TRIGGER2:%.*]] = ptrtoint ptr addrspace(1) [[TRIGGER:%.*]] to i64
; AVX1-NEXT: [[A1:%.*]] = ptrtoint ptr addrspace(1) [[A:%.*]] to i64
; AVX1-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
; AVX1: vector.memcheck:
; AVX1-NEXT: [[TMP0:%.*]] = sub i64 [[A1]], [[TRIGGER2]]
; AVX1-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP0]], 32
; AVX1-NEXT: [[TMP1:%.*]] = sub i64 [[A1]], [[B3]]
; AVX1-NEXT: [[DIFF_CHECK4:%.*]] = icmp ult i64 [[TMP1]], 32
; AVX1-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[DIFF_CHECK]], [[DIFF_CHECK4]]
; AVX1-NEXT: br i1 [[CONFLICT_RDX]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
; AVX1: vector.ph:
; AVX1-NEXT: br label [[VECTOR_BODY:%.*]]
; AVX1: vector.body:
; AVX1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; AVX1-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 0
; AVX1-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[TRIGGER]], i64 [[TMP2]]
; AVX1-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[TMP3]], i32 0
; AVX1-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, ptr addrspace(1) [[TMP4]], align 4
; AVX1-NEXT: [[TMP5:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD]], splat (i32 100)
; AVX1-NEXT: [[TMP6:%.*]] = getelementptr i32, ptr addrspace(1) [[B]], i64 [[TMP2]]
; AVX1-NEXT: [[TMP7:%.*]] = getelementptr i32, ptr addrspace(1) [[TMP6]], i32 0
; AVX1-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1(ptr addrspace(1) [[TMP7]], i32 4, <8 x i1> [[TMP5]], <8 x i32> poison)
; AVX1-NEXT: [[TMP8:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD]], [[WIDE_LOAD]]
; AVX1-NEXT: [[TMP9:%.*]] = getelementptr i32, ptr addrspace(1) [[A]], i64 [[TMP2]]
; AVX1-NEXT: [[TMP10:%.*]] = getelementptr i32, ptr addrspace(1) [[TMP9]], i32 0
; AVX1-NEXT: call void @llvm.masked.store.v8i32.p1(<8 x i32> [[TMP8]], ptr addrspace(1) [[TMP10]], i32 4, <8 x i1> [[TMP5]])
; AVX1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; AVX1-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], 10000
; AVX1-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; AVX1: middle.block:
; AVX1-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
; AVX1: scalar.ph:
; AVX1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 10000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ]
; AVX1-NEXT: br label [[FOR_BODY:%.*]]
; AVX1: for.body:
; AVX1-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
; AVX1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[TRIGGER]], i64 [[INDVARS_IV]]
; AVX1-NEXT: [[TMP12:%.*]] = load i32, ptr addrspace(1) [[ARRAYIDX]], align 4
; AVX1-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP12]], 100
; AVX1-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]]
; AVX1: if.then:
; AVX1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[B]], i64 [[INDVARS_IV]]
; AVX1-NEXT: [[TMP13:%.*]] = load i32, ptr addrspace(1) [[ARRAYIDX3]], align 4
; AVX1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], [[TMP12]]
; AVX1-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[A]], i64 [[INDVARS_IV]]
; AVX1-NEXT: store i32 [[ADD]], ptr addrspace(1) [[ARRAYIDX7]], align 4
; AVX1-NEXT: br label [[FOR_INC]]
; AVX1: for.inc:
; AVX1-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; AVX1-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 10000
; AVX1-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
; AVX1: for.end:
; AVX1-NEXT: ret void
;
; AVX2-LABEL: @foo1_addrspace1(
; AVX2-NEXT: iter.check:
; AVX2-NEXT: [[B3:%.*]] = ptrtoint ptr addrspace(1) [[B:%.*]] to i64
; AVX2-NEXT: [[TRIGGER2:%.*]] = ptrtoint ptr addrspace(1) [[TRIGGER:%.*]] to i64
; AVX2-NEXT: [[A1:%.*]] = ptrtoint ptr addrspace(1) [[A:%.*]] to i64
; AVX2-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
; AVX2: vector.memcheck:
; AVX2-NEXT: [[TMP0:%.*]] = sub i64 [[A1]], [[TRIGGER2]]
; AVX2-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP0]], 128
; AVX2-NEXT: [[TMP1:%.*]] = sub i64 [[A1]], [[B3]]
; AVX2-NEXT: [[DIFF_CHECK4:%.*]] = icmp ult i64 [[TMP1]], 128
; AVX2-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[DIFF_CHECK]], [[DIFF_CHECK4]]
; AVX2-NEXT: br i1 [[CONFLICT_RDX]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
; AVX2: vector.main.loop.iter.check:
; AVX2-NEXT: br i1 false, label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH1:%.*]]
; AVX2: vector.ph:
; AVX2-NEXT: br label [[VECTOR_BODY:%.*]]
; AVX2: vector.body:
; AVX2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH1]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; AVX2-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 0
; AVX2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[TRIGGER]], i64 [[TMP2]]
; AVX2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[TMP3]], i32 0
; AVX2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[TMP3]], i32 8
; AVX2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[TMP3]], i32 16
; AVX2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[TMP3]], i32 24
; AVX2-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, ptr addrspace(1) [[TMP4]], align 4
; AVX2-NEXT: [[WIDE_LOAD5:%.*]] = load <8 x i32>, ptr addrspace(1) [[TMP5]], align 4
; AVX2-NEXT: [[WIDE_LOAD6:%.*]] = load <8 x i32>, ptr addrspace(1) [[TMP6]], align 4
; AVX2-NEXT: [[WIDE_LOAD7:%.*]] = load <8 x i32>, ptr addrspace(1) [[TMP7]], align 4
; AVX2-NEXT: [[TMP8:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD]], splat (i32 100)
; AVX2-NEXT: [[TMP9:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD5]], splat (i32 100)
; AVX2-NEXT: [[TMP10:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD6]], splat (i32 100)
; AVX2-NEXT: [[TMP11:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD7]], splat (i32 100)
; AVX2-NEXT: [[TMP12:%.*]] = getelementptr i32, ptr addrspace(1) [[B]], i64 [[TMP2]]
; AVX2-NEXT: [[TMP13:%.*]] = getelementptr i32, ptr addrspace(1) [[TMP12]], i32 0
; AVX2-NEXT: [[TMP14:%.*]] = getelementptr i32, ptr addrspace(1) [[TMP12]], i32 8
; AVX2-NEXT: [[TMP15:%.*]] = getelementptr i32, ptr addrspace(1) [[TMP12]], i32 16
; AVX2-NEXT: [[TMP16:%.*]] = getelementptr i32, ptr addrspace(1) [[TMP12]], i32 24
; AVX2-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1(ptr addrspace(1) [[TMP13]], i32 4, <8 x i1> [[TMP8]], <8 x i32> poison)
; AVX2-NEXT: [[WIDE_MASKED_LOAD8:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1(ptr addrspace(1) [[TMP14]], i32 4, <8 x i1> [[TMP9]], <8 x i32> poison)
; AVX2-NEXT: [[WIDE_MASKED_LOAD9:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1(ptr addrspace(1) [[TMP15]], i32 4, <8 x i1> [[TMP10]], <8 x i32> poison)
; AVX2-NEXT: [[WIDE_MASKED_LOAD10:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1(ptr addrspace(1) [[TMP16]], i32 4, <8 x i1> [[TMP11]], <8 x i32> poison)
; AVX2-NEXT: [[TMP17:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD]], [[WIDE_LOAD]]
; AVX2-NEXT: [[TMP18:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD8]], [[WIDE_LOAD5]]
; AVX2-NEXT: [[TMP19:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD9]], [[WIDE_LOAD6]]
; AVX2-NEXT: [[TMP20:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD10]], [[WIDE_LOAD7]]
; AVX2-NEXT: [[TMP21:%.*]] = getelementptr i32, ptr addrspace(1) [[A]], i64 [[TMP2]]
; AVX2-NEXT: [[TMP22:%.*]] = getelementptr i32, ptr addrspace(1) [[TMP21]], i32 0
; AVX2-NEXT: [[TMP23:%.*]] = getelementptr i32, ptr addrspace(1) [[TMP21]], i32 8
; AVX2-NEXT: [[TMP24:%.*]] = getelementptr i32, ptr addrspace(1) [[TMP21]], i32 16
; AVX2-NEXT: [[TMP25:%.*]] = getelementptr i32, ptr addrspace(1) [[TMP21]], i32 24
; AVX2-NEXT: call void @llvm.masked.store.v8i32.p1(<8 x i32> [[TMP17]], ptr addrspace(1) [[TMP22]], i32 4, <8 x i1> [[TMP8]])
; AVX2-NEXT: call void @llvm.masked.store.v8i32.p1(<8 x i32> [[TMP18]], ptr addrspace(1) [[TMP23]], i32 4, <8 x i1> [[TMP9]])
; AVX2-NEXT: call void @llvm.masked.store.v8i32.p1(<8 x i32> [[TMP19]], ptr addrspace(1) [[TMP24]], i32 4, <8 x i1> [[TMP10]])
; AVX2-NEXT: call void @llvm.masked.store.v8i32.p1(<8 x i32> [[TMP20]], ptr addrspace(1) [[TMP25]], i32 4, <8 x i1> [[TMP11]])
; AVX2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
; AVX2-NEXT: [[TMP26:%.*]] = icmp eq i64 [[INDEX_NEXT]], 9984
; AVX2-NEXT: br i1 [[TMP26]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
; AVX2: middle.block:
; AVX2-NEXT: br i1 false, label [[FOR_END:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
; AVX2: vec.epilog.iter.check:
; AVX2-NEXT: br i1 false, label [[SCALAR_PH]], label [[VEC_EPILOG_PH]]
; AVX2: vec.epilog.ph:
; AVX2-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 9984, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_PH]] ]
; AVX2-NEXT: br label [[FOR_BODY:%.*]]
; AVX2: vec.epilog.vector.body:
; AVX2-NEXT: [[INDEX11:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT14:%.*]], [[FOR_BODY]] ]
; AVX2-NEXT: [[TMP37:%.*]] = add i64 [[INDEX11]], 0
; AVX2-NEXT: [[TMP38:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[TRIGGER]], i64 [[TMP37]]
; AVX2-NEXT: [[TMP29:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[TMP38]], i32 0
; AVX2-NEXT: [[WIDE_LOAD12:%.*]] = load <8 x i32>, ptr addrspace(1) [[TMP29]], align 4
; AVX2-NEXT: [[TMP30:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD12]], splat (i32 100)
; AVX2-NEXT: [[TMP31:%.*]] = getelementptr i32, ptr addrspace(1) [[B]], i64 [[TMP37]]
; AVX2-NEXT: [[TMP32:%.*]] = getelementptr i32, ptr addrspace(1) [[TMP31]], i32 0
; AVX2-NEXT: [[WIDE_MASKED_LOAD13:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1(ptr addrspace(1) [[TMP32]], i32 4, <8 x i1> [[TMP30]], <8 x i32> poison)
; AVX2-NEXT: [[TMP33:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD13]], [[WIDE_LOAD12]]
; AVX2-NEXT: [[TMP34:%.*]] = getelementptr i32, ptr addrspace(1) [[A]], i64 [[TMP37]]
; AVX2-NEXT: [[TMP35:%.*]] = getelementptr i32, ptr addrspace(1) [[TMP34]], i32 0
; AVX2-NEXT: call void @llvm.masked.store.v8i32.p1(<8 x i32> [[TMP33]], ptr addrspace(1) [[TMP35]], i32 4, <8 x i1> [[TMP30]])
; AVX2-NEXT: [[INDEX_NEXT14]] = add nuw i64 [[INDEX11]], 8
; AVX2-NEXT: [[TMP36:%.*]] = icmp eq i64 [[INDEX_NEXT14]], 10000
; AVX2-NEXT: br i1 [[TMP36]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; AVX2: vec.epilog.middle.block:
; AVX2-NEXT: br i1 true, label [[FOR_END]], label [[SCALAR_PH]]
; AVX2: vec.epilog.scalar.ph:
; AVX2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 10000, [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 9984, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ITER_CHECK:%.*]] ]
; AVX2-NEXT: br label [[FOR_BODY1:%.*]]
; AVX2: for.body:
; AVX2-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
; AVX2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[TRIGGER]], i64 [[INDVARS_IV]]
; AVX2-NEXT: [[TMP27:%.*]] = load i32, ptr addrspace(1) [[ARRAYIDX]], align 4
; AVX2-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP27]], 100
; AVX2-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]]
; AVX2: if.then:
; AVX2-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[B]], i64 [[INDVARS_IV]]
; AVX2-NEXT: [[TMP28:%.*]] = load i32, ptr addrspace(1) [[ARRAYIDX3]], align 4
; AVX2-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP28]], [[TMP27]]
; AVX2-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[A]], i64 [[INDVARS_IV]]
; AVX2-NEXT: store i32 [[ADD]], ptr addrspace(1) [[ARRAYIDX7]], align 4
; AVX2-NEXT: br label [[FOR_INC]]
; AVX2: for.inc:
; AVX2-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; AVX2-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 10000
; AVX2-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY1]], !llvm.loop [[LOOP7:![0-9]+]]
; AVX2: for.end:
; AVX2-NEXT: ret void
;
; AVX512-LABEL: @foo1_addrspace1(
; AVX512-NEXT: iter.check:
; AVX512-NEXT: [[B3:%.*]] = ptrtoint ptr addrspace(1) [[B:%.*]] to i64
; AVX512-NEXT: [[TRIGGER2:%.*]] = ptrtoint ptr addrspace(1) [[TRIGGER:%.*]] to i64
; AVX512-NEXT: [[A1:%.*]] = ptrtoint ptr addrspace(1) [[A:%.*]] to i64
; AVX512-NEXT: br i1 false, label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
; AVX512: vector.memcheck:
; AVX512-NEXT: [[TMP0:%.*]] = sub i64 [[A1]], [[TRIGGER2]]
; AVX512-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP0]], 256
; AVX512-NEXT: [[TMP1:%.*]] = sub i64 [[A1]], [[B3]]
; AVX512-NEXT: [[DIFF_CHECK4:%.*]] = icmp ult i64 [[TMP1]], 256
; AVX512-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[DIFF_CHECK]], [[DIFF_CHECK4]]
; AVX512-NEXT: br i1 [[CONFLICT_RDX]], label [[VEC_EPILOG_SCALAR_PH]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
; AVX512: vector.main.loop.iter.check:
; AVX512-NEXT: br i1 false, label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
; AVX512: vector.ph:
; AVX512-NEXT: br label [[VECTOR_BODY:%.*]]
; AVX512: vector.body:
; AVX512-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; AVX512-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 0
; AVX512-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[TRIGGER]], i64 [[TMP2]]
; AVX512-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[TMP3]], i32 0
; AVX512-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[TMP3]], i32 16
; AVX512-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[TMP3]], i32 32
; AVX512-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[TMP3]], i32 48
; AVX512-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i32>, ptr addrspace(1) [[TMP4]], align 4
; AVX512-NEXT: [[WIDE_LOAD5:%.*]] = load <16 x i32>, ptr addrspace(1) [[TMP5]], align 4
; AVX512-NEXT: [[WIDE_LOAD6:%.*]] = load <16 x i32>, ptr addrspace(1) [[TMP6]], align 4
; AVX512-NEXT: [[WIDE_LOAD7:%.*]] = load <16 x i32>, ptr addrspace(1) [[TMP7]], align 4
; AVX512-NEXT: [[TMP8:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD]], splat (i32 100)
; AVX512-NEXT: [[TMP9:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD5]], splat (i32 100)
; AVX512-NEXT: [[TMP10:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD6]], splat (i32 100)
; AVX512-NEXT: [[TMP11:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD7]], splat (i32 100)
; AVX512-NEXT: [[TMP12:%.*]] = getelementptr i32, ptr addrspace(1) [[B]], i64 [[TMP2]]
; AVX512-NEXT: [[TMP13:%.*]] = getelementptr i32, ptr addrspace(1) [[TMP12]], i32 0
; AVX512-NEXT: [[TMP14:%.*]] = getelementptr i32, ptr addrspace(1) [[TMP12]], i32 16
; AVX512-NEXT: [[TMP15:%.*]] = getelementptr i32, ptr addrspace(1) [[TMP12]], i32 32
; AVX512-NEXT: [[TMP16:%.*]] = getelementptr i32, ptr addrspace(1) [[TMP12]], i32 48
; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p1(ptr addrspace(1) [[TMP13]], i32 4, <16 x i1> [[TMP8]], <16 x i32> poison)
; AVX512-NEXT: [[WIDE_MASKED_LOAD8:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p1(ptr addrspace(1) [[TMP14]], i32 4, <16 x i1> [[TMP9]], <16 x i32> poison)
; AVX512-NEXT: [[WIDE_MASKED_LOAD9:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p1(ptr addrspace(1) [[TMP15]], i32 4, <16 x i1> [[TMP10]], <16 x i32> poison)
; AVX512-NEXT: [[WIDE_MASKED_LOAD10:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p1(ptr addrspace(1) [[TMP16]], i32 4, <16 x i1> [[TMP11]], <16 x i32> poison)
; AVX512-NEXT: [[TMP17:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD]], [[WIDE_LOAD]]
; AVX512-NEXT: [[TMP18:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD8]], [[WIDE_LOAD5]]
; AVX512-NEXT: [[TMP19:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD9]], [[WIDE_LOAD6]]
; AVX512-NEXT: [[TMP20:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD10]], [[WIDE_LOAD7]]
; AVX512-NEXT: [[TMP21:%.*]] = getelementptr i32, ptr addrspace(1) [[A]], i64 [[TMP2]]
; AVX512-NEXT: [[TMP22:%.*]] = getelementptr i32, ptr addrspace(1) [[TMP21]], i32 0
; AVX512-NEXT: [[TMP23:%.*]] = getelementptr i32, ptr addrspace(1) [[TMP21]], i32 16
; AVX512-NEXT: [[TMP24:%.*]] = getelementptr i32, ptr addrspace(1) [[TMP21]], i32 32
; AVX512-NEXT: [[TMP25:%.*]] = getelementptr i32, ptr addrspace(1) [[TMP21]], i32 48
; AVX512-NEXT: call void @llvm.masked.store.v16i32.p1(<16 x i32> [[TMP17]], ptr addrspace(1) [[TMP22]], i32 4, <16 x i1> [[TMP8]])
; AVX512-NEXT: call void @llvm.masked.store.v16i32.p1(<16 x i32> [[TMP18]], ptr addrspace(1) [[TMP23]], i32 4, <16 x i1> [[TMP9]])
; AVX512-NEXT: call void @llvm.masked.store.v16i32.p1(<16 x i32> [[TMP19]], ptr addrspace(1) [[TMP24]], i32 4, <16 x i1> [[TMP10]])
; AVX512-NEXT: call void @llvm.masked.store.v16i32.p1(<16 x i32> [[TMP20]], ptr addrspace(1) [[TMP25]], i32 4, <16 x i1> [[TMP11]])
; AVX512-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 64
; AVX512-NEXT: [[TMP26:%.*]] = icmp eq i64 [[INDEX_NEXT]], 9984
; AVX512-NEXT: br i1 [[TMP26]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
; AVX512: middle.block:
; AVX512-NEXT: br i1 false, label [[FOR_END:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
; AVX512: vec.epilog.iter.check:
; AVX512-NEXT: br i1 false, label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
; AVX512: vec.epilog.ph:
; AVX512-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 9984, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
; AVX512-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
; AVX512: vec.epilog.vector.body:
; AVX512-NEXT: [[INDEX11:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT14:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
; AVX512-NEXT: [[TMP27:%.*]] = add i64 [[INDEX11]], 0
; AVX512-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[TRIGGER]], i64 [[TMP27]]
; AVX512-NEXT: [[TMP29:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[TMP28]], i32 0
; AVX512-NEXT: [[WIDE_LOAD12:%.*]] = load <16 x i32>, ptr addrspace(1) [[TMP29]], align 4
; AVX512-NEXT: [[TMP30:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD12]], splat (i32 100)
; AVX512-NEXT: [[TMP31:%.*]] = getelementptr i32, ptr addrspace(1) [[B]], i64 [[TMP27]]
; AVX512-NEXT: [[TMP32:%.*]] = getelementptr i32, ptr addrspace(1) [[TMP31]], i32 0
; AVX512-NEXT: [[WIDE_MASKED_LOAD13:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p1(ptr addrspace(1) [[TMP32]], i32 4, <16 x i1> [[TMP30]], <16 x i32> poison)
; AVX512-NEXT: [[TMP33:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD13]], [[WIDE_LOAD12]]
; AVX512-NEXT: [[TMP34:%.*]] = getelementptr i32, ptr addrspace(1) [[A]], i64 [[TMP27]]
; AVX512-NEXT: [[TMP35:%.*]] = getelementptr i32, ptr addrspace(1) [[TMP34]], i32 0
; AVX512-NEXT: call void @llvm.masked.store.v16i32.p1(<16 x i32> [[TMP33]], ptr addrspace(1) [[TMP35]], i32 4, <16 x i1> [[TMP30]])
; AVX512-NEXT: [[INDEX_NEXT14]] = add nuw i64 [[INDEX11]], 16
; AVX512-NEXT: [[TMP36:%.*]] = icmp eq i64 [[INDEX_NEXT14]], 10000
; AVX512-NEXT: br i1 [[TMP36]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; AVX512: vec.epilog.middle.block:
; AVX512-NEXT: br i1 true, label [[FOR_END]], label [[VEC_EPILOG_SCALAR_PH]]
; AVX512: vec.epilog.scalar.ph:
; AVX512-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 10000, [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 9984, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ITER_CHECK:%.*]] ]
; AVX512-NEXT: br label [[FOR_BODY:%.*]]
; AVX512: for.body:
; AVX512-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
; AVX512-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[TRIGGER]], i64 [[INDVARS_IV]]
; AVX512-NEXT: [[TMP37:%.*]] = load i32, ptr addrspace(1) [[ARRAYIDX]], align 4
; AVX512-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP37]], 100
; AVX512-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]]
; AVX512: if.then:
; AVX512-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[B]], i64 [[INDVARS_IV]]
; AVX512-NEXT: [[TMP38:%.*]] = load i32, ptr addrspace(1) [[ARRAYIDX3]], align 4
; AVX512-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP38]], [[TMP37]]
; AVX512-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[A]], i64 [[INDVARS_IV]]
; AVX512-NEXT: store i32 [[ADD]], ptr addrspace(1) [[ARRAYIDX7]], align 4
; AVX512-NEXT: br label [[FOR_INC]]
; AVX512: for.inc:
; AVX512-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; AVX512-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 10000
; AVX512-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
; AVX512: for.end:
; AVX512-NEXT: ret void
;
entry:
  br label %for.body

for.body: ; preds = %for.inc, %entry
  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.inc ]
  %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %trigger, i64 %indvars.iv
  %0 = load i32, ptr addrspace(1) %arrayidx, align 4
  %cmp1 = icmp slt i32 %0, 100
  br i1 %cmp1, label %if.then, label %for.inc

if.then: ; preds = %for.body
  %arrayidx3 = getelementptr inbounds i32, ptr addrspace(1) %B, i64 %indvars.iv
  %1 = load i32, ptr addrspace(1) %arrayidx3, align 4
  %add = add nsw i32 %1, %0
  %arrayidx7 = getelementptr inbounds i32, ptr addrspace(1) %A, i64 %indvars.iv
  store i32 %add, ptr addrspace(1) %arrayidx7, align 4
  br label %for.inc

for.inc: ; preds = %for.body, %if.then
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %exitcond = icmp eq i64 %indvars.iv.next, 10000
  br i1 %exitcond, label %for.end, label %for.body

for.end: ; preds = %for.inc
  ret void
}

;void foo2(float *A, float *B, int *trigger) {
;
;  for (int i=0; i<10000; i++) {
;    if (trigger[i] < 100) {
;      A[i] = B[i] + trigger[i];
;    }
;  }
;}
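;
; Same control flow as @foo1, but A[] and B[] hold float, so each vector part
; also converts the trigger values with sitofp before the fadd, and the masked
; intrinsics switch to the v8f32/v16f32 variants.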

define void @foo2(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture readonly %trigger) local_unnamed_addr #0 {
; AVX1-LABEL: @foo2(
; AVX1-NEXT: entry:
; AVX1-NEXT: [[B3:%.*]] = ptrtoint ptr [[B:%.*]] to i64
; AVX1-NEXT: [[TRIGGER2:%.*]] = ptrtoint ptr [[TRIGGER:%.*]] to i64
; AVX1-NEXT: [[A1:%.*]] = ptrtoint ptr [[A:%.*]] to i64
; AVX1-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
; AVX1: vector.memcheck:
; AVX1-NEXT: [[TMP0:%.*]] = sub i64 [[A1]], [[TRIGGER2]]
; AVX1-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP0]], 32
; AVX1-NEXT: [[TMP1:%.*]] = sub i64 [[A1]], [[B3]]
; AVX1-NEXT: [[DIFF_CHECK4:%.*]] = icmp ult i64 [[TMP1]], 32
; AVX1-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[DIFF_CHECK]], [[DIFF_CHECK4]]
; AVX1-NEXT: br i1 [[CONFLICT_RDX]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
; AVX1: vector.ph:
; AVX1-NEXT: br label [[VECTOR_BODY:%.*]]
; AVX1: vector.body:
; AVX1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; AVX1-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 0
; AVX1-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[TMP2]]
; AVX1-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 0
; AVX1-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, ptr [[TMP4]], align 4
; AVX1-NEXT: [[TMP5:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD]], splat (i32 100)
; AVX1-NEXT: [[TMP6:%.*]] = getelementptr float, ptr [[B]], i64 [[TMP2]]
; AVX1-NEXT: [[TMP7:%.*]] = getelementptr float, ptr [[TMP6]], i32 0
; AVX1-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0(ptr [[TMP7]], i32 4, <8 x i1> [[TMP5]], <8 x float> poison)
; AVX1-NEXT: [[TMP8:%.*]] = sitofp <8 x i32> [[WIDE_LOAD]] to <8 x float>
; AVX1-NEXT: [[TMP9:%.*]] = fadd <8 x float> [[WIDE_MASKED_LOAD]], [[TMP8]]
; AVX1-NEXT: [[TMP10:%.*]] = getelementptr float, ptr [[A]], i64 [[TMP2]]
; AVX1-NEXT: [[TMP11:%.*]] = getelementptr float, ptr [[TMP10]], i32 0
; AVX1-NEXT: call void @llvm.masked.store.v8f32.p0(<8 x float> [[TMP9]], ptr [[TMP11]], i32 4, <8 x i1> [[TMP5]])
; AVX1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; AVX1-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 10000
; AVX1-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; AVX1: middle.block:
; AVX1-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
; AVX1: scalar.ph:
; AVX1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 10000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ]
; AVX1-NEXT: br label [[FOR_BODY:%.*]]
; AVX1: for.body:
; AVX1-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
; AVX1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[INDVARS_IV]]
; AVX1-NEXT: [[TMP13:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
; AVX1-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP13]], 100
; AVX1-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]]
; AVX1: if.then:
; AVX1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[INDVARS_IV]]
; AVX1-NEXT: [[TMP14:%.*]] = load float, ptr [[ARRAYIDX3]], align 4
; AVX1-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP13]] to float
; AVX1-NEXT: [[ADD:%.*]] = fadd float [[TMP14]], [[CONV]]
; AVX1-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDVARS_IV]]
; AVX1-NEXT: store float [[ADD]], ptr [[ARRAYIDX7]], align 4
; AVX1-NEXT: br label [[FOR_INC]]
; AVX1: for.inc:
; AVX1-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; AVX1-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 10000
; AVX1-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
; AVX1: for.end:
; AVX1-NEXT: ret void
;
; AVX2-LABEL: @foo2(
; AVX2-NEXT: iter.check:
; AVX2-NEXT: [[B3:%.*]] = ptrtoint ptr [[B:%.*]] to i64
; AVX2-NEXT: [[TRIGGER2:%.*]] = ptrtoint ptr [[TRIGGER:%.*]] to i64
; AVX2-NEXT: [[A1:%.*]] = ptrtoint ptr [[A:%.*]] to i64
; AVX2-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
; AVX2: vector.memcheck:
; AVX2-NEXT: [[TMP0:%.*]] = sub i64 [[A1]], [[TRIGGER2]]
; AVX2-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP0]], 128
; AVX2-NEXT: [[TMP1:%.*]] = sub i64 [[A1]], [[B3]]
; AVX2-NEXT: [[DIFF_CHECK4:%.*]] = icmp ult i64 [[TMP1]], 128
; AVX2-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[DIFF_CHECK]], [[DIFF_CHECK4]]
; AVX2-NEXT: br i1 [[CONFLICT_RDX]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
; AVX2: vector.main.loop.iter.check:
; AVX2-NEXT: br i1 false, label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH1:%.*]]
; AVX2: vector.ph:
; AVX2-NEXT: br label [[VECTOR_BODY:%.*]]
; AVX2: vector.body:
; AVX2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH1]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; AVX2-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 0
; AVX2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[TMP2]]
; AVX2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 0
; AVX2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 8
; AVX2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 16
; AVX2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 24
; AVX2-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, ptr [[TMP4]], align 4
; AVX2-NEXT: [[WIDE_LOAD5:%.*]] = load <8 x i32>, ptr [[TMP5]], align 4
; AVX2-NEXT: [[WIDE_LOAD6:%.*]] = load <8 x i32>, ptr [[TMP6]], align 4
; AVX2-NEXT: [[WIDE_LOAD7:%.*]] = load <8 x i32>, ptr [[TMP7]], align 4
; AVX2-NEXT: [[TMP8:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD]], splat (i32 100)
; AVX2-NEXT: [[TMP9:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD5]], splat (i32 100)
; AVX2-NEXT: [[TMP10:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD6]], splat (i32 100)
; AVX2-NEXT: [[TMP11:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD7]], splat (i32 100)
; AVX2-NEXT: [[TMP12:%.*]] = getelementptr float, ptr [[B]], i64 [[TMP2]]
; AVX2-NEXT: [[TMP13:%.*]] = getelementptr float, ptr [[TMP12]], i32 0
; AVX2-NEXT: [[TMP14:%.*]] = getelementptr float, ptr [[TMP12]], i32 8
; AVX2-NEXT: [[TMP15:%.*]] = getelementptr float, ptr [[TMP12]], i32 16
; AVX2-NEXT: [[TMP16:%.*]] = getelementptr float, ptr [[TMP12]], i32 24
; AVX2-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0(ptr [[TMP13]], i32 4, <8 x i1> [[TMP8]], <8 x float> poison)
; AVX2-NEXT: [[WIDE_MASKED_LOAD8:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0(ptr [[TMP14]], i32 4, <8 x i1> [[TMP9]], <8 x float> poison)
; AVX2-NEXT: [[WIDE_MASKED_LOAD9:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0(ptr [[TMP15]], i32 4, <8 x i1> [[TMP10]], <8 x float> poison)
; AVX2-NEXT: [[WIDE_MASKED_LOAD10:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0(ptr [[TMP16]], i32 4, <8 x i1> [[TMP11]], <8 x float> poison)
; AVX2-NEXT: [[TMP17:%.*]] = sitofp <8 x i32> [[WIDE_LOAD]] to <8 x float>
; AVX2-NEXT: [[TMP18:%.*]] = sitofp <8 x i32> [[WIDE_LOAD5]] to <8 x float>
; AVX2-NEXT: [[TMP19:%.*]] = sitofp <8 x i32> [[WIDE_LOAD6]] to <8 x float>
; AVX2-NEXT: [[TMP20:%.*]] = sitofp <8 x i32> [[WIDE_LOAD7]] to <8 x float>
; AVX2-NEXT: [[TMP21:%.*]] = fadd <8 x float> [[WIDE_MASKED_LOAD]], [[TMP17]]
; AVX2-NEXT: [[TMP22:%.*]] = fadd <8 x float> [[WIDE_MASKED_LOAD8]], [[TMP18]]
; AVX2-NEXT: [[TMP23:%.*]] = fadd <8 x float> [[WIDE_MASKED_LOAD9]], [[TMP19]]
; AVX2-NEXT: [[TMP24:%.*]] = fadd <8 x float> [[WIDE_MASKED_LOAD10]], [[TMP20]]
; AVX2-NEXT: [[TMP25:%.*]] = getelementptr float, ptr [[A]], i64 [[TMP2]]
; AVX2-NEXT: [[TMP26:%.*]] = getelementptr float, ptr [[TMP25]], i32 0
; AVX2-NEXT: [[TMP27:%.*]] = getelementptr float, ptr [[TMP25]], i32 8
; AVX2-NEXT: [[TMP28:%.*]] = getelementptr float, ptr [[TMP25]], i32 16
; AVX2-NEXT: [[TMP29:%.*]] = getelementptr float, ptr [[TMP25]], i32 24
; AVX2-NEXT: call void @llvm.masked.store.v8f32.p0(<8 x float> [[TMP21]], ptr [[TMP26]], i32 4, <8 x i1> [[TMP8]])
; AVX2-NEXT: call void @llvm.masked.store.v8f32.p0(<8 x float> [[TMP22]], ptr [[TMP27]], i32 4, <8 x i1> [[TMP9]])
; AVX2-NEXT: call void @llvm.masked.store.v8f32.p0(<8 x float> [[TMP23]], ptr [[TMP28]], i32 4, <8 x i1> [[TMP10]])
; AVX2-NEXT: call void @llvm.masked.store.v8f32.p0(<8 x float> [[TMP24]], ptr [[TMP29]], i32 4, <8 x i1> [[TMP11]])
; AVX2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
; AVX2-NEXT: [[TMP30:%.*]] = icmp eq i64 [[INDEX_NEXT]], 9984
; AVX2-NEXT: br i1 [[TMP30]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
; AVX2: middle.block:
; AVX2-NEXT: br i1 false, label [[FOR_END:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
; AVX2: vec.epilog.iter.check:
; AVX2-NEXT: br i1 false, label [[SCALAR_PH]], label [[VEC_EPILOG_PH]]
; AVX2: vec.epilog.ph:
; AVX2-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 9984, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_PH]] ]
; AVX2-NEXT: br label [[FOR_BODY:%.*]]
; AVX2: vec.epilog.vector.body:
; AVX2-NEXT: [[INDEX11:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT14:%.*]], [[FOR_BODY]] ]
; AVX2-NEXT: [[TMP42:%.*]] = add i64 [[INDEX11]], 0
; AVX2-NEXT: [[TMP43:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[TMP42]]
; AVX2-NEXT: [[TMP33:%.*]] = getelementptr inbounds i32, ptr [[TMP43]], i32 0
; AVX2-NEXT: [[WIDE_LOAD12:%.*]] = load <8 x i32>, ptr [[TMP33]], align 4
; AVX2-NEXT: [[TMP34:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD12]], splat (i32 100)
; AVX2-NEXT: [[TMP35:%.*]] = getelementptr float, ptr [[B]], i64 [[TMP42]]
; AVX2-NEXT: [[TMP36:%.*]] = getelementptr float, ptr [[TMP35]], i32 0
; AVX2-NEXT: [[WIDE_MASKED_LOAD13:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0(ptr [[TMP36]], i32 4, <8 x i1> [[TMP34]], <8 x float> poison)
; AVX2-NEXT: [[TMP37:%.*]] = sitofp <8 x i32> [[WIDE_LOAD12]] to <8 x float>
; AVX2-NEXT: [[TMP38:%.*]] = fadd <8 x float> [[WIDE_MASKED_LOAD13]], [[TMP37]]
; AVX2-NEXT: [[TMP39:%.*]] = getelementptr float, ptr [[A]], i64 [[TMP42]]
; AVX2-NEXT: [[TMP40:%.*]] = getelementptr float, ptr [[TMP39]], i32 0
; AVX2-NEXT: call void @llvm.masked.store.v8f32.p0(<8 x float> [[TMP38]], ptr [[TMP40]], i32 4, <8 x i1> [[TMP34]])
; AVX2-NEXT: [[INDEX_NEXT14]] = add nuw i64 [[INDEX11]], 8
; AVX2-NEXT: [[TMP41:%.*]] = icmp eq i64 [[INDEX_NEXT14]], 10000
; AVX2-NEXT: br i1 [[TMP41]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
; AVX2: vec.epilog.middle.block:
; AVX2-NEXT: br i1 true, label [[FOR_END]], label [[SCALAR_PH]]
; AVX2: vec.epilog.scalar.ph:
; AVX2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 10000, [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 9984, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ITER_CHECK:%.*]] ]
; AVX2-NEXT: br label [[FOR_BODY1:%.*]]
; AVX2: for.body:
; AVX2-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
; AVX2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[INDVARS_IV]]
; AVX2-NEXT: [[TMP31:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
; AVX2-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP31]], 100
; AVX2-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]]
; AVX2: if.then:
; AVX2-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[INDVARS_IV]]
; AVX2-NEXT: [[TMP32:%.*]] = load float, ptr [[ARRAYIDX3]], align 4
; AVX2-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP31]] to float
; AVX2-NEXT: [[ADD:%.*]] = fadd float [[TMP32]], [[CONV]]
; AVX2-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDVARS_IV]]
; AVX2-NEXT: store float [[ADD]], ptr [[ARRAYIDX7]], align 4
; AVX2-NEXT: br label [[FOR_INC]]
; AVX2: for.inc:
; AVX2-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; AVX2-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 10000
; AVX2-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY1]], !llvm.loop [[LOOP10:![0-9]+]]
; AVX2: for.end:
; AVX2-NEXT: ret void
;
; AVX512-LABEL: @foo2(
; AVX512-NEXT: iter.check:
; AVX512-NEXT: [[B3:%.*]] = ptrtoint ptr [[B:%.*]] to i64
; AVX512-NEXT: [[TRIGGER2:%.*]] = ptrtoint ptr [[TRIGGER:%.*]] to i64
; AVX512-NEXT: [[A1:%.*]] = ptrtoint ptr [[A:%.*]] to i64
; AVX512-NEXT: br i1 false, label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
; AVX512: vector.memcheck:
; AVX512-NEXT: [[TMP0:%.*]] = sub i64 [[A1]], [[TRIGGER2]]
; AVX512-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP0]], 256
; AVX512-NEXT: [[TMP1:%.*]] = sub i64 [[A1]], [[B3]]
; AVX512-NEXT: [[DIFF_CHECK4:%.*]] = icmp ult i64 [[TMP1]], 256
; AVX512-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[DIFF_CHECK]], [[DIFF_CHECK4]]
; AVX512-NEXT: br i1 [[CONFLICT_RDX]], label [[VEC_EPILOG_SCALAR_PH]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
; AVX512: vector.main.loop.iter.check:
; AVX512-NEXT: br i1 false, label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
; AVX512: vector.ph:
; AVX512-NEXT: br label [[VECTOR_BODY:%.*]]
; AVX512: vector.body:
; AVX512-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; AVX512-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 0
; AVX512-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[TMP2]]
; AVX512-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 0
; AVX512-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 16
; AVX512-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 32
; AVX512-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 48
; AVX512-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i32>, ptr [[TMP4]], align 4
; AVX512-NEXT: [[WIDE_LOAD5:%.*]] = load <16 x i32>, ptr [[TMP5]], align 4
; AVX512-NEXT: [[WIDE_LOAD6:%.*]] = load <16 x i32>, ptr [[TMP6]], align 4
; AVX512-NEXT: [[WIDE_LOAD7:%.*]] = load <16 x i32>, ptr [[TMP7]], align 4
; AVX512-NEXT: [[TMP8:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD]], splat (i32 100)
; AVX512-NEXT: [[TMP9:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD5]], splat (i32 100)
; AVX512-NEXT: [[TMP10:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD6]], splat (i32 100)
; AVX512-NEXT: [[TMP11:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD7]], splat (i32 100)
; AVX512-NEXT: [[TMP12:%.*]] = getelementptr float, ptr [[B]], i64 [[TMP2]]
; AVX512-NEXT: [[TMP13:%.*]] = getelementptr float, ptr [[TMP12]], i32 0
; AVX512-NEXT: [[TMP14:%.*]] = getelementptr float, ptr [[TMP12]], i32 16
; AVX512-NEXT: [[TMP15:%.*]] = getelementptr float, ptr [[TMP12]], i32 32
; AVX512-NEXT: [[TMP16:%.*]] = getelementptr float, ptr [[TMP12]], i32 48
; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x float> @llvm.masked.load.v16f32.p0(ptr [[TMP13]], i32 4, <16 x i1> [[TMP8]], <16 x float> poison)
; AVX512-NEXT: [[WIDE_MASKED_LOAD8:%.*]] = call <16 x float> @llvm.masked.load.v16f32.p0(ptr [[TMP14]], i32 4, <16 x i1> [[TMP9]], <16 x float> poison)
; AVX512-NEXT: [[WIDE_MASKED_LOAD9:%.*]] = call <16 x float> @llvm.masked.load.v16f32.p0(ptr [[TMP15]], i32 4, <16 x i1> [[TMP10]], <16 x float> poison)
; AVX512-NEXT: [[WIDE_MASKED_LOAD10:%.*]] = call <16 x float> @llvm.masked.load.v16f32.p0(ptr [[TMP16]], i32 4, <16 x i1> [[TMP11]], <16 x float> poison)
; AVX512-NEXT: [[TMP17:%.*]] = sitofp <16 x i32> [[WIDE_LOAD]] to <16 x float>
; AVX512-NEXT: [[TMP18:%.*]] = sitofp <16 x i32> [[WIDE_LOAD5]] to <16 x float>
; AVX512-NEXT: [[TMP19:%.*]] = sitofp <16 x i32> [[WIDE_LOAD6]] to <16 x float>
; AVX512-NEXT: [[TMP20:%.*]] = sitofp <16 x i32> [[WIDE_LOAD7]] to <16 x float>
; AVX512-NEXT: [[TMP21:%.*]] = fadd <16 x float> [[WIDE_MASKED_LOAD]], [[TMP17]]
; AVX512-NEXT: [[TMP22:%.*]] = fadd <16 x float> [[WIDE_MASKED_LOAD8]], [[TMP18]]
; AVX512-NEXT: [[TMP23:%.*]] = fadd <16 x float> [[WIDE_MASKED_LOAD9]], [[TMP19]]
; AVX512-NEXT: [[TMP24:%.*]] = fadd <16 x float> [[WIDE_MASKED_LOAD10]], [[TMP20]]
; AVX512-NEXT: [[TMP25:%.*]] = getelementptr float, ptr [[A]], i64 [[TMP2]]
; AVX512-NEXT: [[TMP26:%.*]] = getelementptr float, ptr [[TMP25]], i32 0
; AVX512-NEXT: [[TMP27:%.*]] = getelementptr float, ptr [[TMP25]], i32 16
; AVX512-NEXT: [[TMP28:%.*]] = getelementptr float, ptr [[TMP25]], i32 32
; AVX512-NEXT: [[TMP29:%.*]] = getelementptr float, ptr [[TMP25]], i32 48
; AVX512-NEXT: call void @llvm.masked.store.v16f32.p0(<16 x float> [[TMP21]], ptr [[TMP26]], i32 4, <16 x i1> [[TMP8]])
; AVX512-NEXT: call void @llvm.masked.store.v16f32.p0(<16 x float> [[TMP22]], ptr [[TMP27]], i32 4, <16 x i1> [[TMP9]])
; AVX512-NEXT: call void @llvm.masked.store.v16f32.p0(<16 x float> [[TMP23]], ptr [[TMP28]], i32 4, <16 x i1> [[TMP10]])
; AVX512-NEXT: call void @llvm.masked.store.v16f32.p0(<16 x float> [[TMP24]], ptr [[TMP29]], i32 4, <16 x i1> [[TMP11]])
; AVX512-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 64
; AVX512-NEXT: [[TMP30:%.*]] = icmp eq i64 [[INDEX_NEXT]], 9984
; AVX512-NEXT: br i1 [[TMP30]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
; AVX512: middle.block:
; AVX512-NEXT: br i1 false, label [[FOR_END:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
; AVX512: vec.epilog.iter.check:
; AVX512-NEXT: br i1 false, label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
; AVX512: vec.epilog.ph:
; AVX512-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 9984, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
; AVX512-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
; AVX512: vec.epilog.vector.body:
; AVX512-NEXT: [[INDEX11:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT14:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
; AVX512-NEXT: [[TMP31:%.*]] = add i64 [[INDEX11]], 0
; AVX512-NEXT: [[TMP32:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[TMP31]]
; AVX512-NEXT: [[TMP33:%.*]] = getelementptr inbounds i32, ptr [[TMP32]], i32 0
; AVX512-NEXT: [[WIDE_LOAD12:%.*]] = load <16 x i32>, ptr [[TMP33]], align 4
; AVX512-NEXT: [[TMP34:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD12]], splat (i32 100)
; AVX512-NEXT: [[TMP35:%.*]] = getelementptr float, ptr [[B]], i64 [[TMP31]]
; AVX512-NEXT: [[TMP36:%.*]] = getelementptr float, ptr [[TMP35]], i32 0
; AVX512-NEXT: [[WIDE_MASKED_LOAD13:%.*]] = call <16 x float> @llvm.masked.load.v16f32.p0(ptr [[TMP36]], i32 4, <16 x i1> [[TMP34]], <16 x float> poison)
; AVX512-NEXT: [[TMP37:%.*]] = sitofp <16 x i32> [[WIDE_LOAD12]] to <16 x float>
; AVX512-NEXT: [[TMP38:%.*]] = fadd <16 x float> [[WIDE_MASKED_LOAD13]], [[TMP37]]
; AVX512-NEXT: [[TMP39:%.*]] = getelementptr float, ptr [[A]], i64 [[TMP31]]
; AVX512-NEXT: [[TMP40:%.*]] = getelementptr float, ptr [[TMP39]], i32 0
; AVX512-NEXT: call void @llvm.masked.store.v16f32.p0(<16 x float> [[TMP38]], ptr [[TMP40]], i32 4, <16 x i1> [[TMP34]])
; AVX512-NEXT: [[INDEX_NEXT14]] = add nuw i64 [[INDEX11]], 16
; AVX512-NEXT: [[TMP41:%.*]] = icmp eq i64 [[INDEX_NEXT14]], 10000
; AVX512-NEXT: br i1 [[TMP41]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
; AVX512: vec.epilog.middle.block:
; AVX512-NEXT: br i1 true, label [[FOR_END]], label [[VEC_EPILOG_SCALAR_PH]]
; AVX512: vec.epilog.scalar.ph:
; AVX512-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 10000, [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 9984, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ITER_CHECK:%.*]] ]
; AVX512-NEXT: br label [[FOR_BODY:%.*]]
; AVX512: for.body:
; AVX512-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
; AVX512-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[INDVARS_IV]]
; AVX512-NEXT: [[TMP42:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
; AVX512-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP42]], 100
; AVX512-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]]
; AVX512: if.then:
; AVX512-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[INDVARS_IV]]
; AVX512-NEXT: [[TMP43:%.*]] = load float, ptr [[ARRAYIDX3]], align 4
; AVX512-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP42]] to float
; AVX512-NEXT: [[ADD:%.*]] = fadd float [[TMP43]], [[CONV]]
; AVX512-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDVARS_IV]]
; AVX512-NEXT: store float [[ADD]], ptr [[ARRAYIDX7]], align 4
; AVX512-NEXT: br label [[FOR_INC]]
; AVX512: for.inc:
; AVX512-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; AVX512-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 10000
; AVX512-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
; AVX512: for.end:
; AVX512-NEXT: ret void
;
entry:
  br label %for.body

for.body: ; preds = %for.inc, %entry
  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.inc ]
  %arrayidx = getelementptr inbounds i32, ptr %trigger, i64 %indvars.iv
  %0 = load i32, ptr %arrayidx, align 4
  %cmp1 = icmp slt i32 %0, 100
  br i1 %cmp1, label %if.then, label %for.inc

if.then: ; preds = %for.body
  %arrayidx3 = getelementptr inbounds float, ptr %B, i64 %indvars.iv
  %1 = load float, ptr %arrayidx3, align 4
  %conv = sitofp i32 %0 to float
  %add = fadd float %1, %conv
  %arrayidx7 = getelementptr inbounds float, ptr %A, i64 %indvars.iv
  store float %add, ptr %arrayidx7, align 4
  br label %for.inc

for.inc: ; preds = %for.body, %if.then
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %exitcond = icmp eq i64 %indvars.iv.next, 10000
  br i1 %exitcond, label %for.end, label %for.body

for.end: ; preds = %for.inc
  ret void
}
;void foo3(ptr A, ptr B, int *trigger) {
;
;  for (int i=0; i<10000; i++) {
;    if (trigger[i] < 100) {
;      A[i] = B[i] + trigger[i];
;    }
;  }
;}
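
; A minimal sketch (kept in comments, outside the checked functions) of the
; masked load/store pattern the vector runs below are expected to form for
; this conditional double-precision loop; the value names %mask, %bptr, %conv
; and %aptr are illustrative only:
;   %v   = call <4 x double> @llvm.masked.load.v4f64.p0(ptr %bptr, i32 8, <4 x i1> %mask, <4 x double> poison)
;   %sum = fadd <4 x double> %v, %conv
;   call void @llvm.masked.store.v4f64.p0(<4 x double> %sum, ptr %aptr, i32 8, <4 x i1> %mask)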
define void @foo3(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture readonly %trigger) local_unnamed_addr #0 {
; AVX1-LABEL: @foo3(
; AVX1-NEXT: entry:
; AVX1-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
; AVX1: vector.memcheck:
; AVX1-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 80000
; AVX1-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[TRIGGER:%.*]], i64 40000
; AVX1-NEXT: [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[B:%.*]], i64 80000
; AVX1-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[A]], [[SCEVGEP1]]
; AVX1-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[TRIGGER]], [[SCEVGEP]]
; AVX1-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
; AVX1-NEXT: [[BOUND03:%.*]] = icmp ult ptr [[A]], [[SCEVGEP2]]
; AVX1-NEXT: [[BOUND14:%.*]] = icmp ult ptr [[B]], [[SCEVGEP]]
; AVX1-NEXT: [[FOUND_CONFLICT5:%.*]] = and i1 [[BOUND03]], [[BOUND14]]
; AVX1-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT5]]
; AVX1-NEXT: br i1 [[CONFLICT_RDX]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
; AVX1: vector.ph:
; AVX1-NEXT: br label [[VECTOR_BODY:%.*]]
; AVX1: vector.body:
; AVX1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; AVX1-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
; AVX1-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[TMP0]]
; AVX1-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
; AVX1-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 4
; AVX1-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 8
; AVX1-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 12
; AVX1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4, !alias.scope [[META8:![0-9]+]]
; AVX1-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x i32>, ptr [[TMP3]], align 4, !alias.scope [[META8]]
; AVX1-NEXT: [[WIDE_LOAD7:%.*]] = load <4 x i32>, ptr [[TMP4]], align 4, !alias.scope [[META8]]
; AVX1-NEXT: [[WIDE_LOAD8:%.*]] = load <4 x i32>, ptr [[TMP5]], align 4, !alias.scope [[META8]]
; AVX1-NEXT: [[TMP6:%.*]] = icmp slt <4 x i32> [[WIDE_LOAD]], splat (i32 100)
; AVX1-NEXT: [[TMP7:%.*]] = icmp slt <4 x i32> [[WIDE_LOAD6]], splat (i32 100)
; AVX1-NEXT: [[TMP8:%.*]] = icmp slt <4 x i32> [[WIDE_LOAD7]], splat (i32 100)
; AVX1-NEXT: [[TMP9:%.*]] = icmp slt <4 x i32> [[WIDE_LOAD8]], splat (i32 100)
; AVX1-NEXT: [[TMP10:%.*]] = getelementptr double, ptr [[B]], i64 [[TMP0]]
; AVX1-NEXT: [[TMP11:%.*]] = getelementptr double, ptr [[TMP10]], i32 0
; AVX1-NEXT: [[TMP12:%.*]] = getelementptr double, ptr [[TMP10]], i32 4
; AVX1-NEXT: [[TMP13:%.*]] = getelementptr double, ptr [[TMP10]], i32 8
; AVX1-NEXT: [[TMP14:%.*]] = getelementptr double, ptr [[TMP10]], i32 12
; AVX1-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0(ptr [[TMP11]], i32 8, <4 x i1> [[TMP6]], <4 x double> poison), !alias.scope [[META11:![0-9]+]]
; AVX1-NEXT: [[WIDE_MASKED_LOAD9:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0(ptr [[TMP12]], i32 8, <4 x i1> [[TMP7]], <4 x double> poison), !alias.scope [[META11]]
; AVX1-NEXT: [[WIDE_MASKED_LOAD10:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0(ptr [[TMP13]], i32 8, <4 x i1> [[TMP8]], <4 x double> poison), !alias.scope [[META11]]
; AVX1-NEXT: [[WIDE_MASKED_LOAD11:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0(ptr [[TMP14]], i32 8, <4 x i1> [[TMP9]], <4 x double> poison), !alias.scope [[META11]]
; AVX1-NEXT: [[TMP15:%.*]] = sitofp <4 x i32> [[WIDE_LOAD]] to <4 x double>
; AVX1-NEXT: [[TMP16:%.*]] = sitofp <4 x i32> [[WIDE_LOAD6]] to <4 x double>
; AVX1-NEXT: [[TMP17:%.*]] = sitofp <4 x i32> [[WIDE_LOAD7]] to <4 x double>
; AVX1-NEXT: [[TMP18:%.*]] = sitofp <4 x i32> [[WIDE_LOAD8]] to <4 x double>
; AVX1-NEXT: [[TMP19:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD]], [[TMP15]]
; AVX1-NEXT: [[TMP20:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD9]], [[TMP16]]
; AVX1-NEXT: [[TMP21:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD10]], [[TMP17]]
; AVX1-NEXT: [[TMP22:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD11]], [[TMP18]]
; AVX1-NEXT: [[TMP23:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP0]]
; AVX1-NEXT: [[TMP24:%.*]] = getelementptr double, ptr [[TMP23]], i32 0
; AVX1-NEXT: [[TMP25:%.*]] = getelementptr double, ptr [[TMP23]], i32 4
; AVX1-NEXT: [[TMP26:%.*]] = getelementptr double, ptr [[TMP23]], i32 8
; AVX1-NEXT: [[TMP27:%.*]] = getelementptr double, ptr [[TMP23]], i32 12
; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> [[TMP19]], ptr [[TMP24]], i32 8, <4 x i1> [[TMP6]]), !alias.scope [[META13:![0-9]+]], !noalias [[META15:![0-9]+]]
; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> [[TMP20]], ptr [[TMP25]], i32 8, <4 x i1> [[TMP7]]), !alias.scope [[META13]], !noalias [[META15]]
; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> [[TMP21]], ptr [[TMP26]], i32 8, <4 x i1> [[TMP8]]), !alias.scope [[META13]], !noalias [[META15]]
; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> [[TMP22]], ptr [[TMP27]], i32 8, <4 x i1> [[TMP9]]), !alias.scope [[META13]], !noalias [[META15]]
; AVX1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; AVX1-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], 10000
; AVX1-NEXT: br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
; AVX1: middle.block:
; AVX1-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
; AVX1: scalar.ph:
; AVX1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 10000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ]
; AVX1-NEXT: br label [[FOR_BODY:%.*]]
; AVX1: for.body:
; AVX1-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
; AVX1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[INDVARS_IV]]
; AVX1-NEXT: [[TMP29:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
; AVX1-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP29]], 100
; AVX1-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]]
; AVX1: if.then:
; AVX1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds double, ptr [[B]], i64 [[INDVARS_IV]]
; AVX1-NEXT: [[TMP30:%.*]] = load double, ptr [[ARRAYIDX3]], align 8
; AVX1-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP29]] to double
; AVX1-NEXT: [[ADD:%.*]] = fadd double [[TMP30]], [[CONV]]
; AVX1-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds double, ptr [[A]], i64 [[INDVARS_IV]]
; AVX1-NEXT: store double [[ADD]], ptr [[ARRAYIDX7]], align 8
; AVX1-NEXT: br label [[FOR_INC]]
; AVX1: for.inc:
; AVX1-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; AVX1-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 10000
; AVX1-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
; AVX1: for.end:
; AVX1-NEXT: ret void
;
; AVX2-LABEL: @foo3(
; AVX2-NEXT: entry:
; AVX2-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
; AVX2: vector.memcheck:
; AVX2-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 80000
; AVX2-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[TRIGGER:%.*]], i64 40000
; AVX2-NEXT: [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[B:%.*]], i64 80000
; AVX2-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[A]], [[SCEVGEP1]]
; AVX2-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[TRIGGER]], [[SCEVGEP]]
; AVX2-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
; AVX2-NEXT: [[BOUND03:%.*]] = icmp ult ptr [[A]], [[SCEVGEP2]]
; AVX2-NEXT: [[BOUND14:%.*]] = icmp ult ptr [[B]], [[SCEVGEP]]
; AVX2-NEXT: [[FOUND_CONFLICT5:%.*]] = and i1 [[BOUND03]], [[BOUND14]]
; AVX2-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT5]]
; AVX2-NEXT: br i1 [[CONFLICT_RDX]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
; AVX2: vector.ph:
; AVX2-NEXT: br label [[VECTOR_BODY:%.*]]
; AVX2: vector.body:
; AVX2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; AVX2-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
; AVX2-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[TMP0]]
; AVX2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
; AVX2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 4
; AVX2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 8
; AVX2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 12
; AVX2-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4, !alias.scope [[META11:![0-9]+]]
; AVX2-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x i32>, ptr [[TMP3]], align 4, !alias.scope [[META11]]
; AVX2-NEXT: [[WIDE_LOAD7:%.*]] = load <4 x i32>, ptr [[TMP4]], align 4, !alias.scope [[META11]]
; AVX2-NEXT: [[WIDE_LOAD8:%.*]] = load <4 x i32>, ptr [[TMP5]], align 4, !alias.scope [[META11]]
; AVX2-NEXT: [[TMP6:%.*]] = icmp slt <4 x i32> [[WIDE_LOAD]], splat (i32 100)
; AVX2-NEXT: [[TMP7:%.*]] = icmp slt <4 x i32> [[WIDE_LOAD6]], splat (i32 100)
; AVX2-NEXT: [[TMP8:%.*]] = icmp slt <4 x i32> [[WIDE_LOAD7]], splat (i32 100)
; AVX2-NEXT: [[TMP9:%.*]] = icmp slt <4 x i32> [[WIDE_LOAD8]], splat (i32 100)
; AVX2-NEXT: [[TMP10:%.*]] = getelementptr double, ptr [[B]], i64 [[TMP0]]
; AVX2-NEXT: [[TMP11:%.*]] = getelementptr double, ptr [[TMP10]], i32 0
; AVX2-NEXT: [[TMP12:%.*]] = getelementptr double, ptr [[TMP10]], i32 4
; AVX2-NEXT: [[TMP13:%.*]] = getelementptr double, ptr [[TMP10]], i32 8
; AVX2-NEXT: [[TMP14:%.*]] = getelementptr double, ptr [[TMP10]], i32 12
; AVX2-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0(ptr [[TMP11]], i32 8, <4 x i1> [[TMP6]], <4 x double> poison), !alias.scope [[META14:![0-9]+]]
; AVX2-NEXT: [[WIDE_MASKED_LOAD9:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0(ptr [[TMP12]], i32 8, <4 x i1> [[TMP7]], <4 x double> poison), !alias.scope [[META14]]
; AVX2-NEXT: [[WIDE_MASKED_LOAD10:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0(ptr [[TMP13]], i32 8, <4 x i1> [[TMP8]], <4 x double> poison), !alias.scope [[META14]]
; AVX2-NEXT: [[WIDE_MASKED_LOAD11:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0(ptr [[TMP14]], i32 8, <4 x i1> [[TMP9]], <4 x double> poison), !alias.scope [[META14]]
; AVX2-NEXT: [[TMP15:%.*]] = sitofp <4 x i32> [[WIDE_LOAD]] to <4 x double>
; AVX2-NEXT: [[TMP16:%.*]] = sitofp <4 x i32> [[WIDE_LOAD6]] to <4 x double>
; AVX2-NEXT: [[TMP17:%.*]] = sitofp <4 x i32> [[WIDE_LOAD7]] to <4 x double>
; AVX2-NEXT: [[TMP18:%.*]] = sitofp <4 x i32> [[WIDE_LOAD8]] to <4 x double>
; AVX2-NEXT: [[TMP19:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD]], [[TMP15]]
; AVX2-NEXT: [[TMP20:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD9]], [[TMP16]]
; AVX2-NEXT: [[TMP21:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD10]], [[TMP17]]
; AVX2-NEXT: [[TMP22:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD11]], [[TMP18]]
; AVX2-NEXT: [[TMP23:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP0]]
; AVX2-NEXT: [[TMP24:%.*]] = getelementptr double, ptr [[TMP23]], i32 0
; AVX2-NEXT: [[TMP25:%.*]] = getelementptr double, ptr [[TMP23]], i32 4
; AVX2-NEXT: [[TMP26:%.*]] = getelementptr double, ptr [[TMP23]], i32 8
; AVX2-NEXT: [[TMP27:%.*]] = getelementptr double, ptr [[TMP23]], i32 12
; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> [[TMP19]], ptr [[TMP24]], i32 8, <4 x i1> [[TMP6]]), !alias.scope [[META16:![0-9]+]], !noalias [[META18:![0-9]+]]
; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> [[TMP20]], ptr [[TMP25]], i32 8, <4 x i1> [[TMP7]]), !alias.scope [[META16]], !noalias [[META18]]
; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> [[TMP21]], ptr [[TMP26]], i32 8, <4 x i1> [[TMP8]]), !alias.scope [[META16]], !noalias [[META18]]
; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> [[TMP22]], ptr [[TMP27]], i32 8, <4 x i1> [[TMP9]]), !alias.scope [[META16]], !noalias [[META18]]
; AVX2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; AVX2-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], 10000
; AVX2-NEXT: br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]]
; AVX2: middle.block:
; AVX2-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
; AVX2: scalar.ph:
; AVX2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 10000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ]
; AVX2-NEXT: br label [[FOR_BODY:%.*]]
; AVX2: for.body:
; AVX2-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
; AVX2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[INDVARS_IV]]
; AVX2-NEXT: [[TMP29:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
; AVX2-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP29]], 100
; AVX2-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]]
; AVX2: if.then:
; AVX2-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds double, ptr [[B]], i64 [[INDVARS_IV]]
; AVX2-NEXT: [[TMP30:%.*]] = load double, ptr [[ARRAYIDX3]], align 8
; AVX2-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP29]] to double
; AVX2-NEXT: [[ADD:%.*]] = fadd double [[TMP30]], [[CONV]]
; AVX2-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds double, ptr [[A]], i64 [[INDVARS_IV]]
; AVX2-NEXT: store double [[ADD]], ptr [[ARRAYIDX7]], align 8
; AVX2-NEXT: br label [[FOR_INC]]
; AVX2: for.inc:
; AVX2-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; AVX2-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 10000
; AVX2-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
; AVX2: for.end:
; AVX2-NEXT: ret void
;
; AVX512-LABEL: @foo3(
; AVX512-NEXT: iter.check:
; AVX512-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
; AVX512: vector.memcheck:
; AVX512-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 80000
; AVX512-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[TRIGGER:%.*]], i64 40000
; AVX512-NEXT: [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[B:%.*]], i64 80000
; AVX512-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[A]], [[SCEVGEP1]]
; AVX512-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[TRIGGER]], [[SCEVGEP]]
; AVX512-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
; AVX512-NEXT: [[BOUND03:%.*]] = icmp ult ptr [[A]], [[SCEVGEP2]]
; AVX512-NEXT: [[BOUND14:%.*]] = icmp ult ptr [[B]], [[SCEVGEP]]
; AVX512-NEXT: [[FOUND_CONFLICT5:%.*]] = and i1 [[BOUND03]], [[BOUND14]]
; AVX512-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT5]]
; AVX512-NEXT: br i1 [[CONFLICT_RDX]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
; AVX512: vector.main.loop.iter.check:
; AVX512-NEXT: br i1 false, label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH1:%.*]]
; AVX512: vector.ph:
; AVX512-NEXT: br label [[VECTOR_BODY:%.*]]
; AVX512: vector.body:
; AVX512-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH1]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; AVX512-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
; AVX512-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[TMP0]]
; AVX512-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
; AVX512-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 8
; AVX512-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 16
; AVX512-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 24
; AVX512-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, ptr [[TMP2]], align 4, !alias.scope [[META11:![0-9]+]]
; AVX512-NEXT: [[WIDE_LOAD6:%.*]] = load <8 x i32>, ptr [[TMP3]], align 4, !alias.scope [[META11]]
; AVX512-NEXT: [[WIDE_LOAD7:%.*]] = load <8 x i32>, ptr [[TMP4]], align 4, !alias.scope [[META11]]
; AVX512-NEXT: [[WIDE_LOAD8:%.*]] = load <8 x i32>, ptr [[TMP5]], align 4, !alias.scope [[META11]]
; AVX512-NEXT: [[TMP6:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD]], splat (i32 100)
; AVX512-NEXT: [[TMP7:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD6]], splat (i32 100)
; AVX512-NEXT: [[TMP8:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD7]], splat (i32 100)
; AVX512-NEXT: [[TMP9:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD8]], splat (i32 100)
; AVX512-NEXT: [[TMP10:%.*]] = getelementptr double, ptr [[B]], i64 [[TMP0]]
; AVX512-NEXT: [[TMP11:%.*]] = getelementptr double, ptr [[TMP10]], i32 0
; AVX512-NEXT: [[TMP12:%.*]] = getelementptr double, ptr [[TMP10]], i32 8
; AVX512-NEXT: [[TMP13:%.*]] = getelementptr double, ptr [[TMP10]], i32 16
; AVX512-NEXT: [[TMP14:%.*]] = getelementptr double, ptr [[TMP10]], i32 24
; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0(ptr [[TMP11]], i32 8, <8 x i1> [[TMP6]], <8 x double> poison), !alias.scope [[META14:![0-9]+]]
; AVX512-NEXT: [[WIDE_MASKED_LOAD9:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0(ptr [[TMP12]], i32 8, <8 x i1> [[TMP7]], <8 x double> poison), !alias.scope [[META14]]
; AVX512-NEXT: [[WIDE_MASKED_LOAD10:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0(ptr [[TMP13]], i32 8, <8 x i1> [[TMP8]], <8 x double> poison), !alias.scope [[META14]]
; AVX512-NEXT: [[WIDE_MASKED_LOAD11:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0(ptr [[TMP14]], i32 8, <8 x i1> [[TMP9]], <8 x double> poison), !alias.scope [[META14]]
; AVX512-NEXT: [[TMP15:%.*]] = sitofp <8 x i32> [[WIDE_LOAD]] to <8 x double>
; AVX512-NEXT: [[TMP16:%.*]] = sitofp <8 x i32> [[WIDE_LOAD6]] to <8 x double>
; AVX512-NEXT: [[TMP17:%.*]] = sitofp <8 x i32> [[WIDE_LOAD7]] to <8 x double>
; AVX512-NEXT: [[TMP18:%.*]] = sitofp <8 x i32> [[WIDE_LOAD8]] to <8 x double>
; AVX512-NEXT: [[TMP19:%.*]] = fadd <8 x double> [[WIDE_MASKED_LOAD]], [[TMP15]]
; AVX512-NEXT: [[TMP20:%.*]] = fadd <8 x double> [[WIDE_MASKED_LOAD9]], [[TMP16]]
; AVX512-NEXT: [[TMP21:%.*]] = fadd <8 x double> [[WIDE_MASKED_LOAD10]], [[TMP17]]
; AVX512-NEXT: [[TMP22:%.*]] = fadd <8 x double> [[WIDE_MASKED_LOAD11]], [[TMP18]]
; AVX512-NEXT: [[TMP23:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP0]]
; AVX512-NEXT: [[TMP24:%.*]] = getelementptr double, ptr [[TMP23]], i32 0
; AVX512-NEXT: [[TMP25:%.*]] = getelementptr double, ptr [[TMP23]], i32 8
; AVX512-NEXT: [[TMP26:%.*]] = getelementptr double, ptr [[TMP23]], i32 16
; AVX512-NEXT: [[TMP27:%.*]] = getelementptr double, ptr [[TMP23]], i32 24
; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> [[TMP19]], ptr [[TMP24]], i32 8, <8 x i1> [[TMP6]]), !alias.scope [[META16:![0-9]+]], !noalias [[META18:![0-9]+]]
; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> [[TMP20]], ptr [[TMP25]], i32 8, <8 x i1> [[TMP7]]), !alias.scope [[META16]], !noalias [[META18]]
; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> [[TMP21]], ptr [[TMP26]], i32 8, <8 x i1> [[TMP8]]), !alias.scope [[META16]], !noalias [[META18]]
; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> [[TMP22]], ptr [[TMP27]], i32 8, <8 x i1> [[TMP9]]), !alias.scope [[META16]], !noalias [[META18]]
; AVX512-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
; AVX512-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], 9984
; AVX512-NEXT: br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]]
; AVX512: middle.block:
; AVX512-NEXT: br i1 false, label [[FOR_END:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
; AVX512: vec.epilog.iter.check:
; AVX512-NEXT: br i1 false, label [[SCALAR_PH]], label [[VEC_EPILOG_PH]]
; AVX512: vec.epilog.ph:
; AVX512-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 9984, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_PH]] ]
; AVX512-NEXT: br label [[FOR_BODY:%.*]]
; AVX512: vec.epilog.vector.body:
; AVX512-NEXT: [[INDEX12:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT15:%.*]], [[FOR_BODY]] ]
; AVX512-NEXT: [[TMP40:%.*]] = add i64 [[INDEX12]], 0
; AVX512-NEXT: [[TMP41:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[TMP40]]
; AVX512-NEXT: [[TMP31:%.*]] = getelementptr inbounds i32, ptr [[TMP41]], i32 0
; AVX512-NEXT: [[WIDE_LOAD13:%.*]] = load <8 x i32>, ptr [[TMP31]], align 4, !alias.scope [[META20:![0-9]+]]
; AVX512-NEXT: [[TMP32:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD13]], splat (i32 100)
; AVX512-NEXT: [[TMP33:%.*]] = getelementptr double, ptr [[B]], i64 [[TMP40]]
; AVX512-NEXT: [[TMP34:%.*]] = getelementptr double, ptr [[TMP33]], i32 0
; AVX512-NEXT: [[WIDE_MASKED_LOAD14:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0(ptr [[TMP34]], i32 8, <8 x i1> [[TMP32]], <8 x double> poison), !alias.scope [[META23:![0-9]+]]
; AVX512-NEXT: [[TMP35:%.*]] = sitofp <8 x i32> [[WIDE_LOAD13]] to <8 x double>
; AVX512-NEXT: [[TMP36:%.*]] = fadd <8 x double> [[WIDE_MASKED_LOAD14]], [[TMP35]]
; AVX512-NEXT: [[TMP37:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP40]]
; AVX512-NEXT: [[TMP38:%.*]] = getelementptr double, ptr [[TMP37]], i32 0
; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> [[TMP36]], ptr [[TMP38]], i32 8, <8 x i1> [[TMP32]]), !alias.scope [[META25:![0-9]+]], !noalias [[META27:![0-9]+]]
; AVX512-NEXT: [[INDEX_NEXT15]] = add nuw i64 [[INDEX12]], 8
; AVX512-NEXT: [[TMP39:%.*]] = icmp eq i64 [[INDEX_NEXT15]], 10000
; AVX512-NEXT: br i1 [[TMP39]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]]
; AVX512: vec.epilog.middle.block:
; AVX512-NEXT: br i1 true, label [[FOR_END]], label [[SCALAR_PH]]
; AVX512: vec.epilog.scalar.ph:
; AVX512-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 10000, [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 9984, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ITER_CHECK:%.*]] ]
; AVX512-NEXT: br label [[FOR_BODY1:%.*]]
; AVX512: for.body:
; AVX512-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
; AVX512-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[INDVARS_IV]]
; AVX512-NEXT: [[TMP29:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
; AVX512-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP29]], 100
; AVX512-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]]
; AVX512: if.then:
; AVX512-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds double, ptr [[B]], i64 [[INDVARS_IV]]
; AVX512-NEXT: [[TMP30:%.*]] = load double, ptr [[ARRAYIDX3]], align 8
; AVX512-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP29]] to double
; AVX512-NEXT: [[ADD:%.*]] = fadd double [[TMP30]], [[CONV]]
; AVX512-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds double, ptr [[A]], i64 [[INDVARS_IV]]
; AVX512-NEXT: store double [[ADD]], ptr [[ARRAYIDX7]], align 8
; AVX512-NEXT: br label [[FOR_INC]]
; AVX512: for.inc:
; AVX512-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; AVX512-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 10000
; AVX512-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY1]], !llvm.loop [[LOOP29:![0-9]+]]
; AVX512: for.end:
; AVX512-NEXT: ret void
;
entry:
  br label %for.body

for.body: ; preds = %for.inc, %entry
  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.inc ]
  %arrayidx = getelementptr inbounds i32, ptr %trigger, i64 %indvars.iv
  %0 = load i32, ptr %arrayidx, align 4
  %cmp1 = icmp slt i32 %0, 100
  br i1 %cmp1, label %if.then, label %for.inc

if.then: ; preds = %for.body
  %arrayidx3 = getelementptr inbounds double, ptr %B, i64 %indvars.iv
  %1 = load double, ptr %arrayidx3, align 8
  %conv = sitofp i32 %0 to double
  %add = fadd double %1, %conv
  %arrayidx7 = getelementptr inbounds double, ptr %A, i64 %indvars.iv
  store double %add, ptr %arrayidx7, align 8
  br label %for.inc

for.inc: ; preds = %for.body, %if.then
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %exitcond = icmp eq i64 %indvars.iv.next, 10000
  br i1 %exitcond, label %for.end, label %for.body

for.end: ; preds = %for.inc
  ret void
}
;void foo4(ptr A, ptr B, int *trigger) {
;
;  for (int i=0; i<10000; i += 16) {
;    if (trigger[i] < 100) {
;      A[i] = B[i*2] + trigger[i]; << non-consecutive access
;    }
;  }
;}
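
; A minimal sketch (kept in comments, outside the test body) of the gather the
; AVX512 run below is expected to form for the strided B[i*2] access; the
; value names %offs and %mask are illustrative only:
;   %ptrs = getelementptr inbounds double, ptr %B, <8 x i64> %offs
;   %v    = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> %ptrs, i32 8, <8 x i1> %mask, <8 x double> poison)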
define void @foo4(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture readonly %trigger) local_unnamed_addr #0 {
; AVX-LABEL: @foo4(
; AVX-NEXT: entry:
; AVX-NEXT: br label [[FOR_BODY:%.*]]
; AVX: for.body:
; AVX-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
; AVX-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER:%.*]], i64 [[INDVARS_IV]]
; AVX-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
; AVX-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP0]], 100
; AVX-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]]
; AVX: if.then:
; AVX-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[INDVARS_IV]], 1
; AVX-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds double, ptr [[B:%.*]], i64 [[TMP1]]
; AVX-NEXT: [[TMP2:%.*]] = load double, ptr [[ARRAYIDX3]], align 8
; AVX-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP0]] to double
; AVX-NEXT: [[ADD:%.*]] = fadd double [[TMP2]], [[CONV]]
; AVX-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds double, ptr [[A:%.*]], i64 [[INDVARS_IV]]
; AVX-NEXT: store double [[ADD]], ptr [[ARRAYIDX7]], align 8
; AVX-NEXT: br label [[FOR_INC]]
; AVX: for.inc:
; AVX-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 16
; AVX-NEXT: [[CMP:%.*]] = icmp ult i64 [[INDVARS_IV_NEXT]], 10000
; AVX-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END:%.*]]
; AVX: for.end:
; AVX-NEXT: ret void
;
; AVX512-LABEL: @foo4(
; AVX512-NEXT: entry:
; AVX512-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
; AVX512: vector.memcheck:
; AVX512-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 79880
; AVX512-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[TRIGGER:%.*]], i64 39940
; AVX512-NEXT: [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[B:%.*]], i64 159752
; AVX512-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[A]], [[SCEVGEP1]]
; AVX512-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[TRIGGER]], [[SCEVGEP]]
; AVX512-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
; AVX512-NEXT: [[BOUND03:%.*]] = icmp ult ptr [[A]], [[SCEVGEP2]]
; AVX512-NEXT: [[BOUND14:%.*]] = icmp ult ptr [[B]], [[SCEVGEP]]
; AVX512-NEXT: [[FOUND_CONFLICT5:%.*]] = and i1 [[BOUND03]], [[BOUND14]]
; AVX512-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT5]]
; AVX512-NEXT: br i1 [[CONFLICT_RDX]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
; AVX512: vector.ph:
; AVX512-NEXT: br label [[VECTOR_BODY:%.*]]
; AVX512: vector.body:
; AVX512-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; AVX512-NEXT: [[VEC_IND:%.*]] = phi <8 x i64> [ <i64 0, i64 16, i64 32, i64 48, i64 64, i64 80, i64 96, i64 112>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; AVX512-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], <8 x i64> [[VEC_IND]]
; AVX512-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> [[TMP0]], i32 4, <8 x i1> splat (i1 true), <8 x i32> poison), !alias.scope [[META30:![0-9]+]]
; AVX512-NEXT: [[TMP1:%.*]] = icmp slt <8 x i32> [[WIDE_MASKED_GATHER]], splat (i32 100)
; AVX512-NEXT: [[TMP2:%.*]] = shl nuw nsw <8 x i64> [[VEC_IND]], splat (i64 1)
; AVX512-NEXT: [[TMP3:%.*]] = getelementptr inbounds double, ptr [[B]], <8 x i64> [[TMP2]]
; AVX512-NEXT: [[WIDE_MASKED_GATHER6:%.*]] = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> [[TMP3]], i32 8, <8 x i1> [[TMP1]], <8 x double> poison), !alias.scope [[META33:![0-9]+]]
; AVX512-NEXT: [[TMP4:%.*]] = sitofp <8 x i32> [[WIDE_MASKED_GATHER]] to <8 x double>
; AVX512-NEXT: [[TMP5:%.*]] = fadd <8 x double> [[WIDE_MASKED_GATHER6]], [[TMP4]]
; AVX512-NEXT: [[TMP6:%.*]] = getelementptr inbounds double, ptr [[A]], <8 x i64> [[VEC_IND]]
; AVX512-NEXT: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> [[TMP5]], <8 x ptr> [[TMP6]], i32 8, <8 x i1> [[TMP1]]), !alias.scope [[META35:![0-9]+]], !noalias [[META37:![0-9]+]]
; AVX512-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; AVX512-NEXT: [[VEC_IND_NEXT]] = add <8 x i64> [[VEC_IND]], splat (i64 128)
; AVX512-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 624
; AVX512-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP38:![0-9]+]]
; AVX512: middle.block:
; AVX512-NEXT: br i1 false, label [[FOR_END:%.*]], label [[SCALAR_PH]]
; AVX512: scalar.ph:
; AVX512-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 9984, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ]
; AVX512-NEXT: br label [[FOR_BODY:%.*]]
; AVX512: for.body:
; AVX512-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
; AVX512-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[INDVARS_IV]]
; AVX512-NEXT: [[TMP8:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
; AVX512-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP8]], 100
; AVX512-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]]
; AVX512: if.then:
; AVX512-NEXT: [[TMP9:%.*]] = shl nuw nsw i64 [[INDVARS_IV]], 1
; AVX512-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds double, ptr [[B]], i64 [[TMP9]]
; AVX512-NEXT: [[TMP10:%.*]] = load double, ptr [[ARRAYIDX3]], align 8
; AVX512-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP8]] to double
; AVX512-NEXT: [[ADD:%.*]] = fadd double [[TMP10]], [[CONV]]
; AVX512-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds double, ptr [[A]], i64 [[INDVARS_IV]]
; AVX512-NEXT: store double [[ADD]], ptr [[ARRAYIDX7]], align 8
; AVX512-NEXT: br label [[FOR_INC]]
; AVX512: for.inc:
; AVX512-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 16
; AVX512-NEXT: [[CMP:%.*]] = icmp ult i64 [[INDVARS_IV_NEXT]], 10000
; AVX512-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP39:![0-9]+]]
; AVX512: for.end:
; AVX512-NEXT: ret void
;
entry:
  br label %for.body

for.body: ; preds = %entry, %for.inc
  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.inc ]
  %arrayidx = getelementptr inbounds i32, ptr %trigger, i64 %indvars.iv
  %0 = load i32, ptr %arrayidx, align 4
  %cmp1 = icmp slt i32 %0, 100
  br i1 %cmp1, label %if.then, label %for.inc

if.then: ; preds = %for.body
  %1 = shl nuw nsw i64 %indvars.iv, 1
  %arrayidx3 = getelementptr inbounds double, ptr %B, i64 %1
  %2 = load double, ptr %arrayidx3, align 8
  %conv = sitofp i32 %0 to double
  %add = fadd double %2, %conv
  %arrayidx7 = getelementptr inbounds double, ptr %A, i64 %indvars.iv
  store double %add, ptr %arrayidx7, align 8
  br label %for.inc

for.inc: ; preds = %for.body, %if.then
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 16
  %cmp = icmp ult i64 %indvars.iv.next, 10000
  br i1 %cmp, label %for.body, label %for.end

for.end: ; preds = %for.inc
  ret void
}
@a = common global [1 x ptr] zeroinitializer, align 8
@c = common global ptr null, align 8
;void foo6(ptr in, ptr out, unsigned size, int *trigger) {
;
;  for (int i=SIZE-1; i>=0; i--) {
;    if (trigger[i] > 0) {
;      out[i] = in[i] + (double) 0.5;
;    }
;  }
;}
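
; A minimal sketch (kept in comments, outside the test body) of the reversed
; masked load the vector runs below are expected to form for this
; downward-counting loop; the value names %mask and %p are illustrative only:
;   %rmask = shufflevector <4 x i1> %mask, <4 x i1> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
;   %v     = call <4 x double> @llvm.masked.load.v4f64.p0(ptr %p, i32 8, <4 x i1> %rmask, <4 x double> poison)
;   %rv    = shufflevector <4 x double> %v, <4 x double> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>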
define void @foo6(ptr nocapture readonly %in, ptr nocapture %out, i32 %size, ptr nocapture readonly %trigger) local_unnamed_addr #0 {
; AVX1-LABEL: @foo6(
; AVX1-NEXT: entry:
; AVX1-NEXT: br label [[FOR_BODY:%.*]]
; AVX1: for.body:
; AVX1-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 4095, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
; AVX1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER:%.*]], i64 [[INDVARS_IV]]
; AVX1-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
; AVX1-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[TMP0]], 0
; AVX1-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]]
; AVX1: if.then:
; AVX1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds double, ptr [[IN:%.*]], i64 [[INDVARS_IV]]
; AVX1-NEXT: [[TMP1:%.*]] = load double, ptr [[ARRAYIDX3]], align 8
; AVX1-NEXT: [[ADD:%.*]] = fadd double [[TMP1]], 5.000000e-01
; AVX1-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds double, ptr [[OUT:%.*]], i64 [[INDVARS_IV]]
; AVX1-NEXT: store double [[ADD]], ptr [[ARRAYIDX5]], align 8
; AVX1-NEXT: br label [[FOR_INC]]
; AVX1: for.inc:
; AVX1-NEXT: [[INDVARS_IV_NEXT]] = add nsw i64 [[INDVARS_IV]], -1
; AVX1-NEXT: [[CMP:%.*]] = icmp eq i64 [[INDVARS_IV]], 0
; AVX1-NEXT: br i1 [[CMP]], label [[FOR_END:%.*]], label [[FOR_BODY]]
; AVX1: for.end:
; AVX1-NEXT: ret void
;
; AVX2-LABEL: @foo6(
; AVX2-NEXT: entry:
; AVX2-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
; AVX2: vector.memcheck:
; AVX2-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[OUT:%.*]], i64 32768
; AVX2-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[TRIGGER:%.*]], i64 16384
; AVX2-NEXT: [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[IN:%.*]], i64 32768
; AVX2-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[OUT]], [[SCEVGEP1]]
; AVX2-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[TRIGGER]], [[SCEVGEP]]
; AVX2-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
; AVX2-NEXT: [[BOUND03:%.*]] = icmp ult ptr [[OUT]], [[SCEVGEP2]]
; AVX2-NEXT: [[BOUND14:%.*]] = icmp ult ptr [[IN]], [[SCEVGEP]]
; AVX2-NEXT: [[FOUND_CONFLICT5:%.*]] = and i1 [[BOUND03]], [[BOUND14]]
; AVX2-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT5]]
; AVX2-NEXT: br i1 [[CONFLICT_RDX]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
; AVX2: vector.ph:
; AVX2-NEXT: br label [[VECTOR_BODY:%.*]]
; AVX2: vector.body:
; AVX2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; AVX2-NEXT: [[OFFSET_IDX:%.*]] = sub i64 4095, [[INDEX]]
; AVX2-NEXT: [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], 0
; AVX2-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[TMP0]]
; AVX2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
; AVX2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 -3
; AVX2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 -4
; AVX2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 -3
; AVX2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 -8
; AVX2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 -3
; AVX2-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 -12
; AVX2-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 -3
; AVX2-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP3]], align 4, !alias.scope [[META21:![0-9]+]]
; AVX2-NEXT: [[REVERSE:%.*]] = shufflevector <4 x i32> [[WIDE_LOAD]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; AVX2-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x i32>, ptr [[TMP5]], align 4, !alias.scope [[META21]]
; AVX2-NEXT: [[REVERSE7:%.*]] = shufflevector <4 x i32> [[WIDE_LOAD6]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; AVX2-NEXT: [[WIDE_LOAD8:%.*]] = load <4 x i32>, ptr [[TMP7]], align 4, !alias.scope [[META21]]
; AVX2-NEXT: [[REVERSE9:%.*]] = shufflevector <4 x i32> [[WIDE_LOAD8]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; AVX2-NEXT: [[WIDE_LOAD10:%.*]] = load <4 x i32>, ptr [[TMP9]], align 4, !alias.scope [[META21]]
; AVX2-NEXT: [[REVERSE11:%.*]] = shufflevector <4 x i32> [[WIDE_LOAD10]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; AVX2-NEXT: [[TMP10:%.*]] = icmp sgt <4 x i32> [[REVERSE]], zeroinitializer
; AVX2-NEXT: [[TMP11:%.*]] = icmp sgt <4 x i32> [[REVERSE7]], zeroinitializer
; AVX2-NEXT: [[TMP12:%.*]] = icmp sgt <4 x i32> [[REVERSE9]], zeroinitializer
; AVX2-NEXT: [[TMP13:%.*]] = icmp sgt <4 x i32> [[REVERSE11]], zeroinitializer
; AVX2-NEXT: [[TMP14:%.*]] = getelementptr double, ptr [[IN]], i64 [[TMP0]]
; AVX2-NEXT: [[TMP15:%.*]] = getelementptr double, ptr [[TMP14]], i32 0
; AVX2-NEXT: [[TMP16:%.*]] = getelementptr double, ptr [[TMP15]], i32 -3
; AVX2-NEXT: [[TMP17:%.*]] = getelementptr double, ptr [[TMP14]], i32 -4
; AVX2-NEXT: [[TMP18:%.*]] = getelementptr double, ptr [[TMP17]], i32 -3
; AVX2-NEXT: [[TMP19:%.*]] = getelementptr double, ptr [[TMP14]], i32 -8
; AVX2-NEXT: [[TMP20:%.*]] = getelementptr double, ptr [[TMP19]], i32 -3
; AVX2-NEXT: [[TMP21:%.*]] = getelementptr double, ptr [[TMP14]], i32 -12
; AVX2-NEXT: [[TMP22:%.*]] = getelementptr double, ptr [[TMP21]], i32 -3
; AVX2-NEXT: [[REVERSE12:%.*]] = shufflevector <4 x i1> [[TMP10]], <4 x i1> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; AVX2-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0(ptr [[TMP16]], i32 8, <4 x i1> [[REVERSE12]], <4 x double> poison), !alias.scope [[META24:![0-9]+]]
; AVX2-NEXT: [[REVERSE13:%.*]] = shufflevector <4 x double> [[WIDE_MASKED_LOAD]], <4 x double> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; AVX2-NEXT: [[REVERSE14:%.*]] = shufflevector <4 x i1> [[TMP11]], <4 x i1> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; AVX2-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0(ptr [[TMP18]], i32 8, <4 x i1> [[REVERSE14]], <4 x double> poison), !alias.scope [[META24]]
; AVX2-NEXT: [[REVERSE16:%.*]] = shufflevector <4 x double> [[WIDE_MASKED_LOAD15]], <4 x double> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; AVX2-NEXT: [[REVERSE17:%.*]] = shufflevector <4 x i1> [[TMP12]], <4 x i1> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; AVX2-NEXT: [[WIDE_MASKED_LOAD18:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0(ptr [[TMP20]], i32 8, <4 x i1> [[REVERSE17]], <4 x double> poison), !alias.scope [[META24]]
; AVX2-NEXT: [[REVERSE19:%.*]] = shufflevector <4 x double> [[WIDE_MASKED_LOAD18]], <4 x double> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; AVX2-NEXT: [[REVERSE20:%.*]] = shufflevector <4 x i1> [[TMP13]], <4 x i1> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; AVX2-NEXT: [[WIDE_MASKED_LOAD21:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0(ptr [[TMP22]], i32 8, <4 x i1> [[REVERSE20]], <4 x double> poison), !alias.scope [[META24]]
; AVX2-NEXT: [[REVERSE22:%.*]] = shufflevector <4 x double> [[WIDE_MASKED_LOAD21]], <4 x double> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; AVX2-NEXT: [[TMP23:%.*]] = fadd <4 x double> [[REVERSE13]], splat (double 5.000000e-01)
; AVX2-NEXT: [[TMP24:%.*]] = fadd <4 x double> [[REVERSE16]], splat (double 5.000000e-01)
; AVX2-NEXT: [[TMP25:%.*]] = fadd <4 x double> [[REVERSE19]], splat (double 5.000000e-01)
; AVX2-NEXT: [[TMP26:%.*]] = fadd <4 x double> [[REVERSE22]], splat (double 5.000000e-01)
; AVX2-NEXT: [[TMP27:%.*]] = getelementptr double, ptr [[OUT]], i64 [[TMP0]]
; AVX2-NEXT: [[TMP28:%.*]] = getelementptr double, ptr [[TMP27]], i32 0
; AVX2-NEXT: [[TMP29:%.*]] = getelementptr double, ptr [[TMP28]], i32 -3
; AVX2-NEXT: [[TMP30:%.*]] = getelementptr double, ptr [[TMP27]], i32 -4
; AVX2-NEXT: [[TMP31:%.*]] = getelementptr double, ptr [[TMP30]], i32 -3
; AVX2-NEXT: [[TMP32:%.*]] = getelementptr double, ptr [[TMP27]], i32 -8
; AVX2-NEXT: [[TMP33:%.*]] = getelementptr double, ptr [[TMP32]], i32 -3
; AVX2-NEXT: [[TMP34:%.*]] = getelementptr double, ptr [[TMP27]], i32 -12
; AVX2-NEXT: [[TMP35:%.*]] = getelementptr double, ptr [[TMP34]], i32 -3
; AVX2-NEXT: [[REVERSE24:%.*]] = shufflevector <4 x double> [[TMP23]], <4 x double> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> [[REVERSE24]], ptr [[TMP29]], i32 8, <4 x i1> [[REVERSE12]]), !alias.scope [[META26:![0-9]+]], !noalias [[META28:![0-9]+]]
; AVX2-NEXT: [[REVERSE26:%.*]] = shufflevector <4 x double> [[TMP24]], <4 x double> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> [[REVERSE26]], ptr [[TMP31]], i32 8, <4 x i1> [[REVERSE14]]), !alias.scope [[META26]], !noalias [[META28]]
; AVX2-NEXT: [[REVERSE28:%.*]] = shufflevector <4 x double> [[TMP25]], <4 x double> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> [[REVERSE28]], ptr [[TMP33]], i32 8, <4 x i1> [[REVERSE17]]), !alias.scope [[META26]], !noalias [[META28]]
; AVX2-NEXT: [[REVERSE30:%.*]] = shufflevector <4 x double> [[TMP26]], <4 x double> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> [[REVERSE30]], ptr [[TMP35]], i32 8, <4 x i1> [[REVERSE20]]), !alias.scope [[META26]], !noalias [[META28]]
; AVX2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; AVX2-NEXT: [[TMP36:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
; AVX2-NEXT: br i1 [[TMP36]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP29:![0-9]+]]
; AVX2: middle.block:
; AVX2-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
; AVX2: scalar.ph:
; AVX2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ -1, [[MIDDLE_BLOCK]] ], [ 4095, [[ENTRY:%.*]] ], [ 4095, [[VECTOR_MEMCHECK]] ]
; AVX2-NEXT: br label [[FOR_BODY:%.*]]
; AVX2: for.body:
; AVX2-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
; AVX2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[INDVARS_IV]]
; AVX2-NEXT: [[TMP37:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
; AVX2-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[TMP37]], 0
; AVX2-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]]
; AVX2: if.then:
; AVX2-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds double, ptr [[IN]], i64 [[INDVARS_IV]]
; AVX2-NEXT: [[TMP38:%.*]] = load double, ptr [[ARRAYIDX3]], align 8
; AVX2-NEXT: [[ADD:%.*]] = fadd double [[TMP38]], 5.000000e-01
; AVX2-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds double, ptr [[OUT]], i64 [[INDVARS_IV]]
; AVX2-NEXT: store double [[ADD]], ptr [[ARRAYIDX5]], align 8
; AVX2-NEXT: br label [[FOR_INC]]
; AVX2: for.inc:
; AVX2-NEXT: [[INDVARS_IV_NEXT]] = add nsw i64 [[INDVARS_IV]], -1
; AVX2-NEXT: [[CMP:%.*]] = icmp eq i64 [[INDVARS_IV]], 0
; AVX2-NEXT: br i1 [[CMP]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP30:![0-9]+]]
; AVX2: for.end:
; AVX2-NEXT: ret void
;
; AVX512-LABEL: @foo6(
; AVX512-NEXT: entry:
; AVX512-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
; AVX512: vector.memcheck:
; AVX512-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[OUT:%.*]], i64 32768
; AVX512-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[TRIGGER:%.*]], i64 16384
; AVX512-NEXT: [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[IN:%.*]], i64 32768
; AVX512-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[OUT]], [[SCEVGEP1]]
; AVX512-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[TRIGGER]], [[SCEVGEP]]
; AVX512-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
; AVX512-NEXT: [[BOUND03:%.*]] = icmp ult ptr [[OUT]], [[SCEVGEP2]]
; AVX512-NEXT: [[BOUND14:%.*]] = icmp ult ptr [[IN]], [[SCEVGEP]]
; AVX512-NEXT: [[FOUND_CONFLICT5:%.*]] = and i1 [[BOUND03]], [[BOUND14]]
; AVX512-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT5]]
; AVX512-NEXT: br i1 [[CONFLICT_RDX]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
; AVX512: vector.ph:
; AVX512-NEXT: br label [[VECTOR_BODY:%.*]]
; AVX512: vector.body:
; AVX512-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; AVX512-NEXT: [[OFFSET_IDX:%.*]] = sub i64 4095, [[INDEX]]
; AVX512-NEXT: [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], 0
; AVX512-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[TMP0]]
; AVX512-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
; AVX512-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 -7
; AVX512-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 -8
; AVX512-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 -7
; AVX512-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 -16
; AVX512-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 -7
; AVX512-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 -24
; AVX512-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 -7
; AVX512-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, ptr [[TMP3]], align 4, !alias.scope [[META40:![0-9]+]]
; AVX512-NEXT: [[REVERSE:%.*]] = shufflevector <8 x i32> [[WIDE_LOAD]], <8 x i32> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
; AVX512-NEXT: [[WIDE_LOAD6:%.*]] = load <8 x i32>, ptr [[TMP5]], align 4, !alias.scope [[META40]]
; AVX512-NEXT: [[REVERSE7:%.*]] = shufflevector <8 x i32> [[WIDE_LOAD6]], <8 x i32> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
; AVX512-NEXT: [[WIDE_LOAD8:%.*]] = load <8 x i32>, ptr [[TMP7]], align 4, !alias.scope [[META40]]
; AVX512-NEXT: [[REVERSE9:%.*]] = shufflevector <8 x i32> [[WIDE_LOAD8]], <8 x i32> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
; AVX512-NEXT: [[WIDE_LOAD10:%.*]] = load <8 x i32>, ptr [[TMP9]], align 4, !alias.scope [[META40]]
; AVX512-NEXT: [[REVERSE11:%.*]] = shufflevector <8 x i32> [[WIDE_LOAD10]], <8 x i32> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
; AVX512-NEXT: [[TMP10:%.*]] = icmp sgt <8 x i32> [[REVERSE]], zeroinitializer
; AVX512-NEXT: [[TMP11:%.*]] = icmp sgt <8 x i32> [[REVERSE7]], zeroinitializer
; AVX512-NEXT: [[TMP12:%.*]] = icmp sgt <8 x i32> [[REVERSE9]], zeroinitializer
; AVX512-NEXT: [[TMP13:%.*]] = icmp sgt <8 x i32> [[REVERSE11]], zeroinitializer
; AVX512-NEXT: [[TMP14:%.*]] = getelementptr double, ptr [[IN]], i64 [[TMP0]]
; AVX512-NEXT: [[TMP15:%.*]] = getelementptr double, ptr [[TMP14]], i32 0
; AVX512-NEXT: [[TMP16:%.*]] = getelementptr double, ptr [[TMP15]], i32 -7
; AVX512-NEXT: [[TMP17:%.*]] = getelementptr double, ptr [[TMP14]], i32 -8
; AVX512-NEXT: [[TMP18:%.*]] = getelementptr double, ptr [[TMP17]], i32 -7
; AVX512-NEXT: [[TMP19:%.*]] = getelementptr double, ptr [[TMP14]], i32 -16
; AVX512-NEXT: [[TMP20:%.*]] = getelementptr double, ptr [[TMP19]], i32 -7
; AVX512-NEXT: [[TMP21:%.*]] = getelementptr double, ptr [[TMP14]], i32 -24
; AVX512-NEXT: [[TMP22:%.*]] = getelementptr double, ptr [[TMP21]], i32 -7
; AVX512-NEXT: [[REVERSE12:%.*]] = shufflevector <8 x i1> [[TMP10]], <8 x i1> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0(ptr [[TMP16]], i32 8, <8 x i1> [[REVERSE12]], <8 x double> poison), !alias.scope [[META43:![0-9]+]]
; AVX512-NEXT: [[REVERSE13:%.*]] = shufflevector <8 x double> [[WIDE_MASKED_LOAD]], <8 x double> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
; AVX512-NEXT: [[REVERSE14:%.*]] = shufflevector <8 x i1> [[TMP11]], <8 x i1> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
; AVX512-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0(ptr [[TMP18]], i32 8, <8 x i1> [[REVERSE14]], <8 x double> poison), !alias.scope [[META43]]
; AVX512-NEXT: [[REVERSE16:%.*]] = shufflevector <8 x double> [[WIDE_MASKED_LOAD15]], <8 x double> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
; AVX512-NEXT: [[REVERSE17:%.*]] = shufflevector <8 x i1> [[TMP12]], <8 x i1> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
; AVX512-NEXT: [[WIDE_MASKED_LOAD18:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0(ptr [[TMP20]], i32 8, <8 x i1> [[REVERSE17]], <8 x double> poison), !alias.scope [[META43]]
; AVX512-NEXT: [[REVERSE19:%.*]] = shufflevector <8 x double> [[WIDE_MASKED_LOAD18]], <8 x double> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
; AVX512-NEXT: [[REVERSE20:%.*]] = shufflevector <8 x i1> [[TMP13]], <8 x i1> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
; AVX512-NEXT: [[WIDE_MASKED_LOAD21:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0(ptr [[TMP22]], i32 8, <8 x i1> [[REVERSE20]], <8 x double> poison), !alias.scope [[META43]]
; AVX512-NEXT: [[REVERSE22:%.*]] = shufflevector <8 x double> [[WIDE_MASKED_LOAD21]], <8 x double> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
; AVX512-NEXT: [[TMP23:%.*]] = fadd <8 x double> [[REVERSE13]], splat (double 5.000000e-01)
; AVX512-NEXT: [[TMP24:%.*]] = fadd <8 x double> [[REVERSE16]], splat (double 5.000000e-01)
; AVX512-NEXT: [[TMP25:%.*]] = fadd <8 x double> [[REVERSE19]], splat (double 5.000000e-01)
; AVX512-NEXT: [[TMP26:%.*]] = fadd <8 x double> [[REVERSE22]], splat (double 5.000000e-01)
; AVX512-NEXT: [[TMP27:%.*]] = getelementptr double, ptr [[OUT]], i64 [[TMP0]]
; AVX512-NEXT: [[TMP28:%.*]] = getelementptr double, ptr [[TMP27]], i32 0
; AVX512-NEXT: [[TMP29:%.*]] = getelementptr double, ptr [[TMP28]], i32 -7
; AVX512-NEXT: [[TMP30:%.*]] = getelementptr double, ptr [[TMP27]], i32 -8
; AVX512-NEXT: [[TMP31:%.*]] = getelementptr double, ptr [[TMP30]], i32 -7
; AVX512-NEXT: [[TMP32:%.*]] = getelementptr double, ptr [[TMP27]], i32 -16
; AVX512-NEXT: [[TMP33:%.*]] = getelementptr double, ptr [[TMP32]], i32 -7
; AVX512-NEXT: [[TMP34:%.*]] = getelementptr double, ptr [[TMP27]], i32 -24
; AVX512-NEXT: [[TMP35:%.*]] = getelementptr double, ptr [[TMP34]], i32 -7
; AVX512-NEXT: [[REVERSE24:%.*]] = shufflevector <8 x double> [[TMP23]], <8 x double> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> [[REVERSE24]], ptr [[TMP29]], i32 8, <8 x i1> [[REVERSE12]]), !alias.scope [[META45:![0-9]+]], !noalias [[META47:![0-9]+]]
; AVX512-NEXT: [[REVERSE26:%.*]] = shufflevector <8 x double> [[TMP24]], <8 x double> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> [[REVERSE26]], ptr [[TMP31]], i32 8, <8 x i1> [[REVERSE14]]), !alias.scope [[META45]], !noalias [[META47]]
; AVX512-NEXT: [[REVERSE28:%.*]] = shufflevector <8 x double> [[TMP25]], <8 x double> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> [[REVERSE28]], ptr [[TMP33]], i32 8, <8 x i1> [[REVERSE17]]), !alias.scope [[META45]], !noalias [[META47]]
1634 ; AVX512-NEXT: [[REVERSE30:%.*]] = shufflevector <8 x double> [[TMP26]], <8 x double> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
1635 ; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> [[REVERSE30]], ptr [[TMP35]], i32 8, <8 x i1> [[REVERSE20]]), !alias.scope [[META45]], !noalias [[META47]]
1636 ; AVX512-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
1637 ; AVX512-NEXT: [[TMP36:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
1638 ; AVX512-NEXT: br i1 [[TMP36]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP48:![0-9]+]]
1639 ; AVX512: middle.block:
1640 ; AVX512-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
1641 ; AVX512: scalar.ph:
1642 ; AVX512-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ -1, [[MIDDLE_BLOCK]] ], [ 4095, [[ENTRY:%.*]] ], [ 4095, [[VECTOR_MEMCHECK]] ]
1643 ; AVX512-NEXT: br label [[FOR_BODY:%.*]]
1644 ; AVX512: for.body:
1645 ; AVX512-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
1646 ; AVX512-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[INDVARS_IV]]
1647 ; AVX512-NEXT: [[TMP37:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
1648 ; AVX512-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[TMP37]], 0
1649 ; AVX512-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]]
1650 ; AVX512: if.then:
1651 ; AVX512-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds double, ptr [[IN]], i64 [[INDVARS_IV]]
1652 ; AVX512-NEXT: [[TMP38:%.*]] = load double, ptr [[ARRAYIDX3]], align 8
1653 ; AVX512-NEXT: [[ADD:%.*]] = fadd double [[TMP38]], 5.000000e-01
1654 ; AVX512-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds double, ptr [[OUT]], i64 [[INDVARS_IV]]
1655 ; AVX512-NEXT: store double [[ADD]], ptr [[ARRAYIDX5]], align 8
1656 ; AVX512-NEXT: br label [[FOR_INC]]
1657 ; AVX512: for.inc:
1658 ; AVX512-NEXT: [[INDVARS_IV_NEXT]] = add nsw i64 [[INDVARS_IV]], -1
1659 ; AVX512-NEXT: [[CMP:%.*]] = icmp eq i64 [[INDVARS_IV]], 0
1660 ; AVX512-NEXT: br i1 [[CMP]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP49:![0-9]+]]
1661 ; AVX512: for.end:
1662 ; AVX512-NEXT: ret void
1663 ;
1664 entry:
1665 br label %for.body
1667 for.body: ; preds = %for.inc, %entry
1668 %indvars.iv = phi i64 [ 4095, %entry ], [ %indvars.iv.next, %for.inc ]
1669 %arrayidx = getelementptr inbounds i32, ptr %trigger, i64 %indvars.iv
1670 %0 = load i32, ptr %arrayidx, align 4
1671 %cmp1 = icmp sgt i32 %0, 0
1672 br i1 %cmp1, label %if.then, label %for.inc
1674 if.then: ; preds = %for.body
1675 %arrayidx3 = getelementptr inbounds double, ptr %in, i64 %indvars.iv
1676 %1 = load double, ptr %arrayidx3, align 8
1677 %add = fadd double %1, 5.000000e-01
1678 %arrayidx5 = getelementptr inbounds double, ptr %out, i64 %indvars.iv
1679 store double %add, ptr %arrayidx5, align 8
1680 br label %for.inc
1682 for.inc: ; preds = %for.body, %if.then
1683 %indvars.iv.next = add nsw i64 %indvars.iv, -1
1684 %cmp = icmp eq i64 %indvars.iv, 0
1685 br i1 %cmp, label %for.end, label %for.body
1687 for.end: ; preds = %for.inc
1688 ret void
1689 }
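;
; The AVX512 checks above show how a counted-down loop is vectorized:
; %indvars.iv runs from 4095 to 0, so consecutive lanes touch decreasing
; addresses. Each part is addressed with negative offsets (0, -8, -16, -24
; elements, then -7 more to point at the lowest lane), and both the loaded
; data and the i1 masks are lane-reversed with shufflevector before the
; masked load/store. One <8 x double> part is roughly equivalent to the
; sketch below (lane order is reversed in the actual IR):
;
;  for (int j = 0; j < 8; j++)
;    if (trigger[i - j] > 0)
;      out[i - j] = in[i - j] + 0.5;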
1691 ; void foo7 (ptr __restrict__ out, ptr __restrict__ in,
1692 ; bool * __restrict__ trigger, unsigned size) {
1694 ; for (unsigned i=0; i<size; i++)
1695 ; if (trigger[i] && (in[i] != 0))
1696 ; out[i] = (double) 0.5;
1697 ;}
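;
; Since && short-circuits, in[i] may only be dereferenced where trigger[i]
; is true, so the plans below read in[] with a masked load under the
; trigger mask and fold the second condition in with a per-lane select,
; roughly: store_mask[j] = trigger_mask[j] ? (in[i+j] != 0) : false.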
1699 define void @foo7(ptr noalias nocapture %out, ptr noalias nocapture readonly %in, ptr noalias nocapture readonly %trigger, i32 %size) local_unnamed_addr #0 {
1700 ; AVX1-LABEL: @foo7(
1701 ; AVX1-NEXT: entry:
1702 ; AVX1-NEXT: [[CMP5:%.*]] = icmp eq i32 [[SIZE:%.*]], 0
1703 ; AVX1-NEXT: br i1 [[CMP5]], label [[FOR_END:%.*]], label [[FOR_BODY_PREHEADER:%.*]]
1704 ; AVX1: iter.check:
1705 ; AVX1-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[SIZE]] to i64
1706 ; AVX1-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 4
1707 ; AVX1-NEXT: br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
1708 ; AVX1: vector.main.loop.iter.check:
1709 ; AVX1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 16
1710 ; AVX1-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
1711 ; AVX1: vector.ph:
1712 ; AVX1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 16
1713 ; AVX1-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
1714 ; AVX1-NEXT: br label [[VECTOR_BODY:%.*]]
1715 ; AVX1: vector.body:
1716 ; AVX1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
1717 ; AVX1-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
1718 ; AVX1-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[TRIGGER:%.*]], i64 [[TMP0]]
1719 ; AVX1-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0
1720 ; AVX1-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 4
1721 ; AVX1-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 8
1722 ; AVX1-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 12
1723 ; AVX1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP2]], align 1
1724 ; AVX1-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i8>, ptr [[TMP3]], align 1
1725 ; AVX1-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP4]], align 1
1726 ; AVX1-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x i8>, ptr [[TMP5]], align 1
1727 ; AVX1-NEXT: [[TMP6:%.*]] = and <4 x i8> [[WIDE_LOAD]], splat (i8 1)
1728 ; AVX1-NEXT: [[TMP7:%.*]] = and <4 x i8> [[WIDE_LOAD1]], splat (i8 1)
1729 ; AVX1-NEXT: [[TMP8:%.*]] = and <4 x i8> [[WIDE_LOAD2]], splat (i8 1)
1730 ; AVX1-NEXT: [[TMP9:%.*]] = and <4 x i8> [[WIDE_LOAD3]], splat (i8 1)
1731 ; AVX1-NEXT: [[TMP10:%.*]] = icmp eq <4 x i8> [[TMP6]], zeroinitializer
1732 ; AVX1-NEXT: [[TMP11:%.*]] = icmp eq <4 x i8> [[TMP7]], zeroinitializer
1733 ; AVX1-NEXT: [[TMP12:%.*]] = icmp eq <4 x i8> [[TMP8]], zeroinitializer
1734 ; AVX1-NEXT: [[TMP13:%.*]] = icmp eq <4 x i8> [[TMP9]], zeroinitializer
1735 ; AVX1-NEXT: [[TMP14:%.*]] = xor <4 x i1> [[TMP10]], splat (i1 true)
1736 ; AVX1-NEXT: [[TMP15:%.*]] = xor <4 x i1> [[TMP11]], splat (i1 true)
1737 ; AVX1-NEXT: [[TMP16:%.*]] = xor <4 x i1> [[TMP12]], splat (i1 true)
1738 ; AVX1-NEXT: [[TMP17:%.*]] = xor <4 x i1> [[TMP13]], splat (i1 true)
1739 ; AVX1-NEXT: [[TMP18:%.*]] = getelementptr ptr, ptr [[IN:%.*]], i64 [[TMP0]]
1740 ; AVX1-NEXT: [[TMP19:%.*]] = getelementptr ptr, ptr [[TMP18]], i32 0
1741 ; AVX1-NEXT: [[TMP20:%.*]] = getelementptr ptr, ptr [[TMP18]], i32 4
1742 ; AVX1-NEXT: [[TMP21:%.*]] = getelementptr ptr, ptr [[TMP18]], i32 8
1743 ; AVX1-NEXT: [[TMP22:%.*]] = getelementptr ptr, ptr [[TMP18]], i32 12
1744 ; AVX1-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP19]], i32 8, <4 x i1> [[TMP14]], <4 x ptr> poison)
1745 ; AVX1-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP20]], i32 8, <4 x i1> [[TMP15]], <4 x ptr> poison)
1746 ; AVX1-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP21]], i32 8, <4 x i1> [[TMP16]], <4 x ptr> poison)
1747 ; AVX1-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP22]], i32 8, <4 x i1> [[TMP17]], <4 x ptr> poison)
1748 ; AVX1-NEXT: [[TMP23:%.*]] = icmp eq <4 x ptr> [[WIDE_MASKED_LOAD]], zeroinitializer
1749 ; AVX1-NEXT: [[TMP24:%.*]] = icmp eq <4 x ptr> [[WIDE_MASKED_LOAD4]], zeroinitializer
1750 ; AVX1-NEXT: [[TMP25:%.*]] = icmp eq <4 x ptr> [[WIDE_MASKED_LOAD5]], zeroinitializer
1751 ; AVX1-NEXT: [[TMP26:%.*]] = icmp eq <4 x ptr> [[WIDE_MASKED_LOAD6]], zeroinitializer
1752 ; AVX1-NEXT: [[TMP27:%.*]] = xor <4 x i1> [[TMP23]], splat (i1 true)
1753 ; AVX1-NEXT: [[TMP28:%.*]] = xor <4 x i1> [[TMP24]], splat (i1 true)
1754 ; AVX1-NEXT: [[TMP29:%.*]] = xor <4 x i1> [[TMP25]], splat (i1 true)
1755 ; AVX1-NEXT: [[TMP30:%.*]] = xor <4 x i1> [[TMP26]], splat (i1 true)
1756 ; AVX1-NEXT: [[TMP31:%.*]] = select <4 x i1> [[TMP14]], <4 x i1> [[TMP27]], <4 x i1> zeroinitializer
1757 ; AVX1-NEXT: [[TMP32:%.*]] = select <4 x i1> [[TMP15]], <4 x i1> [[TMP28]], <4 x i1> zeroinitializer
1758 ; AVX1-NEXT: [[TMP33:%.*]] = select <4 x i1> [[TMP16]], <4 x i1> [[TMP29]], <4 x i1> zeroinitializer
1759 ; AVX1-NEXT: [[TMP34:%.*]] = select <4 x i1> [[TMP17]], <4 x i1> [[TMP30]], <4 x i1> zeroinitializer
1760 ; AVX1-NEXT: [[TMP35:%.*]] = getelementptr double, ptr [[OUT:%.*]], i64 [[TMP0]]
1761 ; AVX1-NEXT: [[TMP36:%.*]] = getelementptr double, ptr [[TMP35]], i32 0
1762 ; AVX1-NEXT: [[TMP37:%.*]] = getelementptr double, ptr [[TMP35]], i32 4
1763 ; AVX1-NEXT: [[TMP38:%.*]] = getelementptr double, ptr [[TMP35]], i32 8
1764 ; AVX1-NEXT: [[TMP39:%.*]] = getelementptr double, ptr [[TMP35]], i32 12
1765 ; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> splat (double 5.000000e-01), ptr [[TMP36]], i32 8, <4 x i1> [[TMP31]])
1766 ; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> splat (double 5.000000e-01), ptr [[TMP37]], i32 8, <4 x i1> [[TMP32]])
1767 ; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> splat (double 5.000000e-01), ptr [[TMP38]], i32 8, <4 x i1> [[TMP33]])
1768 ; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> splat (double 5.000000e-01), ptr [[TMP39]], i32 8, <4 x i1> [[TMP34]])
1769 ; AVX1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
1770 ; AVX1-NEXT: [[TMP40:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
1771 ; AVX1-NEXT: br i1 [[TMP40]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
1772 ; AVX1: middle.block:
1773 ; AVX1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
1774 ; AVX1-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
1775 ; AVX1: vec.epilog.iter.check:
1776 ; AVX1-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
1777 ; AVX1-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 4
1778 ; AVX1-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[SCALAR_PH]]
1779 ; AVX1: vec.epilog.ph:
1780 ; AVX1-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
1781 ; AVX1-NEXT: [[N_MOD_VF8:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 4
1782 ; AVX1-NEXT: [[N_VEC9:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF8]]
1783 ; AVX1-NEXT: br label [[FOR_BODY:%.*]]
1784 ; AVX1: vec.epilog.vector.body:
1785 ; AVX1-NEXT: [[INDEX10:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDEX_NEXT13:%.*]], [[FOR_BODY]] ]
1786 ; AVX1-NEXT: [[TMP55:%.*]] = add i64 [[INDEX10]], 0
1787 ; AVX1-NEXT: [[TMP56:%.*]] = getelementptr inbounds i8, ptr [[TRIGGER]], i64 [[TMP55]]
1788 ; AVX1-NEXT: [[TMP57:%.*]] = getelementptr inbounds i8, ptr [[TMP56]], i32 0
1789 ; AVX1-NEXT: [[WIDE_LOAD11:%.*]] = load <4 x i8>, ptr [[TMP57]], align 1
1790 ; AVX1-NEXT: [[TMP44:%.*]] = and <4 x i8> [[WIDE_LOAD11]], splat (i8 1)
1791 ; AVX1-NEXT: [[TMP45:%.*]] = icmp eq <4 x i8> [[TMP44]], zeroinitializer
1792 ; AVX1-NEXT: [[TMP46:%.*]] = xor <4 x i1> [[TMP45]], splat (i1 true)
1793 ; AVX1-NEXT: [[TMP47:%.*]] = getelementptr ptr, ptr [[IN]], i64 [[TMP55]]
1794 ; AVX1-NEXT: [[TMP48:%.*]] = getelementptr ptr, ptr [[TMP47]], i32 0
1795 ; AVX1-NEXT: [[WIDE_MASKED_LOAD12:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP48]], i32 8, <4 x i1> [[TMP46]], <4 x ptr> poison)
1796 ; AVX1-NEXT: [[TMP49:%.*]] = icmp eq <4 x ptr> [[WIDE_MASKED_LOAD12]], zeroinitializer
1797 ; AVX1-NEXT: [[TMP50:%.*]] = xor <4 x i1> [[TMP49]], splat (i1 true)
1798 ; AVX1-NEXT: [[TMP51:%.*]] = select <4 x i1> [[TMP46]], <4 x i1> [[TMP50]], <4 x i1> zeroinitializer
1799 ; AVX1-NEXT: [[TMP52:%.*]] = getelementptr double, ptr [[OUT]], i64 [[TMP55]]
1800 ; AVX1-NEXT: [[TMP53:%.*]] = getelementptr double, ptr [[TMP52]], i32 0
1801 ; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> splat (double 5.000000e-01), ptr [[TMP53]], i32 8, <4 x i1> [[TMP51]])
1802 ; AVX1-NEXT: [[INDEX_NEXT13]] = add nuw i64 [[INDEX10]], 4
1803 ; AVX1-NEXT: [[TMP54:%.*]] = icmp eq i64 [[INDEX_NEXT13]], [[N_VEC9]]
1804 ; AVX1-NEXT: br i1 [[TMP54]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]]
1805 ; AVX1: vec.epilog.middle.block:
1806 ; AVX1-NEXT: [[CMP_N14:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC9]]
1807 ; AVX1-NEXT: br i1 [[CMP_N14]], label [[FOR_END_LOOPEXIT]], label [[VEC_EPILOG_SCALAR_PH]]
1808 ; AVX1: vec.epilog.scalar.ph:
1809 ; AVX1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC9]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
1810 ; AVX1-NEXT: br label [[FOR_BODY1:%.*]]
1811 ; AVX1: for.body:
1812 ; AVX1-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
1813 ; AVX1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[TRIGGER]], i64 [[INDVARS_IV]]
1814 ; AVX1-NEXT: [[TMP41:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
1815 ; AVX1-NEXT: [[TMP42:%.*]] = and i8 [[TMP41]], 1
1816 ; AVX1-NEXT: [[TOBOOL:%.*]] = icmp eq i8 [[TMP42]], 0
1817 ; AVX1-NEXT: br i1 [[TOBOOL]], label [[FOR_INC]], label [[LAND_LHS_TRUE:%.*]]
1818 ; AVX1: land.lhs.true:
1819 ; AVX1-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds ptr, ptr [[IN]], i64 [[INDVARS_IV]]
1820 ; AVX1-NEXT: [[TMP43:%.*]] = load ptr, ptr [[ARRAYIDX2]], align 8
1821 ; AVX1-NEXT: [[CMP3:%.*]] = icmp eq ptr [[TMP43]], null
1822 ; AVX1-NEXT: br i1 [[CMP3]], label [[FOR_INC]], label [[IF_THEN:%.*]]
1823 ; AVX1: if.then:
1824 ; AVX1-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds double, ptr [[OUT]], i64 [[INDVARS_IV]]
1825 ; AVX1-NEXT: store double 5.000000e-01, ptr [[ARRAYIDX5]], align 8
1826 ; AVX1-NEXT: br label [[FOR_INC]]
1827 ; AVX1: for.inc:
1828 ; AVX1-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
1829 ; AVX1-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
1830 ; AVX1-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY1]], !llvm.loop [[LOOP20:![0-9]+]]
1831 ; AVX1: for.end.loopexit:
1832 ; AVX1-NEXT: br label [[FOR_END]]
1833 ; AVX1: for.end:
1834 ; AVX1-NEXT: ret void
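;
; Per <4 x i1> part, the AVX1 main loop above computes its store mask in
; the same shape as the scalar sketch below; the select is what preserves
; the short-circuit (a sketch of the logic, not the emitted code):
;
;  t = trigger[i] & 1;            // and <4 x i8> ..., splat (i8 1)
;  m1 = !(t == 0);                // icmp eq, then xor with splat (i1 true)
;  p = m1 ? in[i] : poison;       // @llvm.masked.load.v4p0
;  m2 = m1 ? (p != 0) : false;    // icmp eq + xor, then select
;  if (m2) out[i] = 0.5;          // @llvm.masked.store.v4f64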
1835 ;
1836 ; AVX2-LABEL: @foo7(
1837 ; AVX2-NEXT: entry:
1838 ; AVX2-NEXT: [[CMP5:%.*]] = icmp eq i32 [[SIZE:%.*]], 0
1839 ; AVX2-NEXT: br i1 [[CMP5]], label [[FOR_END:%.*]], label [[FOR_BODY_PREHEADER:%.*]]
1840 ; AVX2: iter.check:
1841 ; AVX2-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[SIZE]] to i64
1842 ; AVX2-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 4
1843 ; AVX2-NEXT: br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
1844 ; AVX2: vector.main.loop.iter.check:
1845 ; AVX2-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 16
1846 ; AVX2-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
1847 ; AVX2: vector.ph:
1848 ; AVX2-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 16
1849 ; AVX2-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
1850 ; AVX2-NEXT: br label [[VECTOR_BODY:%.*]]
1851 ; AVX2: vector.body:
1852 ; AVX2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
1853 ; AVX2-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
1854 ; AVX2-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[TRIGGER:%.*]], i64 [[TMP0]]
1855 ; AVX2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0
1856 ; AVX2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 4
1857 ; AVX2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 8
1858 ; AVX2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 12
1859 ; AVX2-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP2]], align 1
1860 ; AVX2-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i8>, ptr [[TMP3]], align 1
1861 ; AVX2-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP4]], align 1
1862 ; AVX2-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x i8>, ptr [[TMP5]], align 1
1863 ; AVX2-NEXT: [[TMP6:%.*]] = and <4 x i8> [[WIDE_LOAD]], splat (i8 1)
1864 ; AVX2-NEXT: [[TMP7:%.*]] = and <4 x i8> [[WIDE_LOAD1]], splat (i8 1)
1865 ; AVX2-NEXT: [[TMP8:%.*]] = and <4 x i8> [[WIDE_LOAD2]], splat (i8 1)
1866 ; AVX2-NEXT: [[TMP9:%.*]] = and <4 x i8> [[WIDE_LOAD3]], splat (i8 1)
1867 ; AVX2-NEXT: [[TMP10:%.*]] = icmp eq <4 x i8> [[TMP6]], zeroinitializer
1868 ; AVX2-NEXT: [[TMP11:%.*]] = icmp eq <4 x i8> [[TMP7]], zeroinitializer
1869 ; AVX2-NEXT: [[TMP12:%.*]] = icmp eq <4 x i8> [[TMP8]], zeroinitializer
1870 ; AVX2-NEXT: [[TMP13:%.*]] = icmp eq <4 x i8> [[TMP9]], zeroinitializer
1871 ; AVX2-NEXT: [[TMP14:%.*]] = xor <4 x i1> [[TMP10]], splat (i1 true)
1872 ; AVX2-NEXT: [[TMP15:%.*]] = xor <4 x i1> [[TMP11]], splat (i1 true)
1873 ; AVX2-NEXT: [[TMP16:%.*]] = xor <4 x i1> [[TMP12]], splat (i1 true)
1874 ; AVX2-NEXT: [[TMP17:%.*]] = xor <4 x i1> [[TMP13]], splat (i1 true)
1875 ; AVX2-NEXT: [[TMP18:%.*]] = getelementptr ptr, ptr [[IN:%.*]], i64 [[TMP0]]
1876 ; AVX2-NEXT: [[TMP19:%.*]] = getelementptr ptr, ptr [[TMP18]], i32 0
1877 ; AVX2-NEXT: [[TMP20:%.*]] = getelementptr ptr, ptr [[TMP18]], i32 4
1878 ; AVX2-NEXT: [[TMP21:%.*]] = getelementptr ptr, ptr [[TMP18]], i32 8
1879 ; AVX2-NEXT: [[TMP22:%.*]] = getelementptr ptr, ptr [[TMP18]], i32 12
1880 ; AVX2-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP19]], i32 8, <4 x i1> [[TMP14]], <4 x ptr> poison)
1881 ; AVX2-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP20]], i32 8, <4 x i1> [[TMP15]], <4 x ptr> poison)
1882 ; AVX2-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP21]], i32 8, <4 x i1> [[TMP16]], <4 x ptr> poison)
1883 ; AVX2-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP22]], i32 8, <4 x i1> [[TMP17]], <4 x ptr> poison)
1884 ; AVX2-NEXT: [[TMP23:%.*]] = icmp eq <4 x ptr> [[WIDE_MASKED_LOAD]], zeroinitializer
1885 ; AVX2-NEXT: [[TMP24:%.*]] = icmp eq <4 x ptr> [[WIDE_MASKED_LOAD4]], zeroinitializer
1886 ; AVX2-NEXT: [[TMP25:%.*]] = icmp eq <4 x ptr> [[WIDE_MASKED_LOAD5]], zeroinitializer
1887 ; AVX2-NEXT: [[TMP26:%.*]] = icmp eq <4 x ptr> [[WIDE_MASKED_LOAD6]], zeroinitializer
1888 ; AVX2-NEXT: [[TMP27:%.*]] = xor <4 x i1> [[TMP23]], splat (i1 true)
1889 ; AVX2-NEXT: [[TMP28:%.*]] = xor <4 x i1> [[TMP24]], splat (i1 true)
1890 ; AVX2-NEXT: [[TMP29:%.*]] = xor <4 x i1> [[TMP25]], splat (i1 true)
1891 ; AVX2-NEXT: [[TMP30:%.*]] = xor <4 x i1> [[TMP26]], splat (i1 true)
1892 ; AVX2-NEXT: [[TMP31:%.*]] = select <4 x i1> [[TMP14]], <4 x i1> [[TMP27]], <4 x i1> zeroinitializer
1893 ; AVX2-NEXT: [[TMP32:%.*]] = select <4 x i1> [[TMP15]], <4 x i1> [[TMP28]], <4 x i1> zeroinitializer
1894 ; AVX2-NEXT: [[TMP33:%.*]] = select <4 x i1> [[TMP16]], <4 x i1> [[TMP29]], <4 x i1> zeroinitializer
1895 ; AVX2-NEXT: [[TMP34:%.*]] = select <4 x i1> [[TMP17]], <4 x i1> [[TMP30]], <4 x i1> zeroinitializer
1896 ; AVX2-NEXT: [[TMP35:%.*]] = getelementptr double, ptr [[OUT:%.*]], i64 [[TMP0]]
1897 ; AVX2-NEXT: [[TMP36:%.*]] = getelementptr double, ptr [[TMP35]], i32 0
1898 ; AVX2-NEXT: [[TMP37:%.*]] = getelementptr double, ptr [[TMP35]], i32 4
1899 ; AVX2-NEXT: [[TMP38:%.*]] = getelementptr double, ptr [[TMP35]], i32 8
1900 ; AVX2-NEXT: [[TMP39:%.*]] = getelementptr double, ptr [[TMP35]], i32 12
1901 ; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> splat (double 5.000000e-01), ptr [[TMP36]], i32 8, <4 x i1> [[TMP31]])
1902 ; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> splat (double 5.000000e-01), ptr [[TMP37]], i32 8, <4 x i1> [[TMP32]])
1903 ; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> splat (double 5.000000e-01), ptr [[TMP38]], i32 8, <4 x i1> [[TMP33]])
1904 ; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> splat (double 5.000000e-01), ptr [[TMP39]], i32 8, <4 x i1> [[TMP34]])
1905 ; AVX2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
1906 ; AVX2-NEXT: [[TMP40:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
1907 ; AVX2-NEXT: br i1 [[TMP40]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP31:![0-9]+]]
1908 ; AVX2: middle.block:
1909 ; AVX2-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
1910 ; AVX2-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
1911 ; AVX2: vec.epilog.iter.check:
1912 ; AVX2-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
1913 ; AVX2-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 4
1914 ; AVX2-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[SCALAR_PH]]
1915 ; AVX2: vec.epilog.ph:
1916 ; AVX2-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
1917 ; AVX2-NEXT: [[N_MOD_VF8:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 4
1918 ; AVX2-NEXT: [[N_VEC9:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF8]]
1919 ; AVX2-NEXT: br label [[FOR_BODY:%.*]]
1920 ; AVX2: vec.epilog.vector.body:
1921 ; AVX2-NEXT: [[INDEX10:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDEX_NEXT13:%.*]], [[FOR_BODY]] ]
1922 ; AVX2-NEXT: [[TMP55:%.*]] = add i64 [[INDEX10]], 0
1923 ; AVX2-NEXT: [[TMP56:%.*]] = getelementptr inbounds i8, ptr [[TRIGGER]], i64 [[TMP55]]
1924 ; AVX2-NEXT: [[TMP57:%.*]] = getelementptr inbounds i8, ptr [[TMP56]], i32 0
1925 ; AVX2-NEXT: [[WIDE_LOAD11:%.*]] = load <4 x i8>, ptr [[TMP57]], align 1
1926 ; AVX2-NEXT: [[TMP44:%.*]] = and <4 x i8> [[WIDE_LOAD11]], splat (i8 1)
1927 ; AVX2-NEXT: [[TMP45:%.*]] = icmp eq <4 x i8> [[TMP44]], zeroinitializer
1928 ; AVX2-NEXT: [[TMP46:%.*]] = xor <4 x i1> [[TMP45]], splat (i1 true)
1929 ; AVX2-NEXT: [[TMP47:%.*]] = getelementptr ptr, ptr [[IN]], i64 [[TMP55]]
1930 ; AVX2-NEXT: [[TMP48:%.*]] = getelementptr ptr, ptr [[TMP47]], i32 0
1931 ; AVX2-NEXT: [[WIDE_MASKED_LOAD12:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP48]], i32 8, <4 x i1> [[TMP46]], <4 x ptr> poison)
1932 ; AVX2-NEXT: [[TMP49:%.*]] = icmp eq <4 x ptr> [[WIDE_MASKED_LOAD12]], zeroinitializer
1933 ; AVX2-NEXT: [[TMP50:%.*]] = xor <4 x i1> [[TMP49]], splat (i1 true)
1934 ; AVX2-NEXT: [[TMP51:%.*]] = select <4 x i1> [[TMP46]], <4 x i1> [[TMP50]], <4 x i1> zeroinitializer
1935 ; AVX2-NEXT: [[TMP52:%.*]] = getelementptr double, ptr [[OUT]], i64 [[TMP55]]
1936 ; AVX2-NEXT: [[TMP53:%.*]] = getelementptr double, ptr [[TMP52]], i32 0
1937 ; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> splat (double 5.000000e-01), ptr [[TMP53]], i32 8, <4 x i1> [[TMP51]])
1938 ; AVX2-NEXT: [[INDEX_NEXT13]] = add nuw i64 [[INDEX10]], 4
1939 ; AVX2-NEXT: [[TMP54:%.*]] = icmp eq i64 [[INDEX_NEXT13]], [[N_VEC9]]
1940 ; AVX2-NEXT: br i1 [[TMP54]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP32:![0-9]+]]
1941 ; AVX2: vec.epilog.middle.block:
1942 ; AVX2-NEXT: [[CMP_N14:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC9]]
1943 ; AVX2-NEXT: br i1 [[CMP_N14]], label [[FOR_END_LOOPEXIT]], label [[VEC_EPILOG_SCALAR_PH]]
1944 ; AVX2: vec.epilog.scalar.ph:
1945 ; AVX2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC9]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
1946 ; AVX2-NEXT: br label [[FOR_BODY1:%.*]]
1947 ; AVX2: for.body:
1948 ; AVX2-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
1949 ; AVX2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[TRIGGER]], i64 [[INDVARS_IV]]
1950 ; AVX2-NEXT: [[TMP41:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
1951 ; AVX2-NEXT: [[TMP42:%.*]] = and i8 [[TMP41]], 1
1952 ; AVX2-NEXT: [[TOBOOL:%.*]] = icmp eq i8 [[TMP42]], 0
1953 ; AVX2-NEXT: br i1 [[TOBOOL]], label [[FOR_INC]], label [[LAND_LHS_TRUE:%.*]]
1954 ; AVX2: land.lhs.true:
1955 ; AVX2-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds ptr, ptr [[IN]], i64 [[INDVARS_IV]]
1956 ; AVX2-NEXT: [[TMP43:%.*]] = load ptr, ptr [[ARRAYIDX2]], align 8
1957 ; AVX2-NEXT: [[CMP3:%.*]] = icmp eq ptr [[TMP43]], null
1958 ; AVX2-NEXT: br i1 [[CMP3]], label [[FOR_INC]], label [[IF_THEN:%.*]]
1959 ; AVX2: if.then:
1960 ; AVX2-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds double, ptr [[OUT]], i64 [[INDVARS_IV]]
1961 ; AVX2-NEXT: store double 5.000000e-01, ptr [[ARRAYIDX5]], align 8
1962 ; AVX2-NEXT: br label [[FOR_INC]]
1963 ; AVX2: for.inc:
1964 ; AVX2-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
1965 ; AVX2-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
1966 ; AVX2-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY1]], !llvm.loop [[LOOP33:![0-9]+]]
1967 ; AVX2: for.end.loopexit:
1968 ; AVX2-NEXT: br label [[FOR_END]]
1969 ; AVX2: for.end:
1970 ; AVX2-NEXT: ret void
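;
; The AVX2 plan above has the same shape as the AVX1 one: VF=4 interleaved
; by 4 (16 elements per main-loop iteration) plus a 4-wide vector epilogue,
; presumably because the per-lane work is pointer-sized loads and compares
; on both targets.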
1971 ;
1972 ; AVX512-LABEL: @foo7(
1973 ; AVX512-NEXT: entry:
1974 ; AVX512-NEXT: [[CMP5:%.*]] = icmp eq i32 [[SIZE:%.*]], 0
1975 ; AVX512-NEXT: br i1 [[CMP5]], label [[FOR_END:%.*]], label [[FOR_BODY_PREHEADER:%.*]]
1976 ; AVX512: iter.check:
1977 ; AVX512-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[SIZE]] to i64
1978 ; AVX512-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 8
1979 ; AVX512-NEXT: br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
1980 ; AVX512: vector.main.loop.iter.check:
1981 ; AVX512-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 32
1982 ; AVX512-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
1983 ; AVX512: vector.ph:
1984 ; AVX512-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 32
1985 ; AVX512-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
1986 ; AVX512-NEXT: br label [[VECTOR_BODY:%.*]]
1987 ; AVX512: vector.body:
1988 ; AVX512-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
1989 ; AVX512-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
1990 ; AVX512-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[TRIGGER:%.*]], i64 [[TMP0]]
1991 ; AVX512-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0
1992 ; AVX512-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 8
1993 ; AVX512-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 16
1994 ; AVX512-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 24
1995 ; AVX512-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i8>, ptr [[TMP2]], align 1
1996 ; AVX512-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x i8>, ptr [[TMP3]], align 1
1997 ; AVX512-NEXT: [[WIDE_LOAD2:%.*]] = load <8 x i8>, ptr [[TMP4]], align 1
1998 ; AVX512-NEXT: [[WIDE_LOAD3:%.*]] = load <8 x i8>, ptr [[TMP5]], align 1
1999 ; AVX512-NEXT: [[TMP6:%.*]] = and <8 x i8> [[WIDE_LOAD]], splat (i8 1)
2000 ; AVX512-NEXT: [[TMP7:%.*]] = and <8 x i8> [[WIDE_LOAD1]], splat (i8 1)
2001 ; AVX512-NEXT: [[TMP8:%.*]] = and <8 x i8> [[WIDE_LOAD2]], splat (i8 1)
2002 ; AVX512-NEXT: [[TMP9:%.*]] = and <8 x i8> [[WIDE_LOAD3]], splat (i8 1)
2003 ; AVX512-NEXT: [[TMP10:%.*]] = icmp eq <8 x i8> [[TMP6]], zeroinitializer
2004 ; AVX512-NEXT: [[TMP11:%.*]] = icmp eq <8 x i8> [[TMP7]], zeroinitializer
2005 ; AVX512-NEXT: [[TMP12:%.*]] = icmp eq <8 x i8> [[TMP8]], zeroinitializer
2006 ; AVX512-NEXT: [[TMP13:%.*]] = icmp eq <8 x i8> [[TMP9]], zeroinitializer
2007 ; AVX512-NEXT: [[TMP14:%.*]] = xor <8 x i1> [[TMP10]], splat (i1 true)
2008 ; AVX512-NEXT: [[TMP15:%.*]] = xor <8 x i1> [[TMP11]], splat (i1 true)
2009 ; AVX512-NEXT: [[TMP16:%.*]] = xor <8 x i1> [[TMP12]], splat (i1 true)
2010 ; AVX512-NEXT: [[TMP17:%.*]] = xor <8 x i1> [[TMP13]], splat (i1 true)
2011 ; AVX512-NEXT: [[TMP18:%.*]] = getelementptr ptr, ptr [[IN:%.*]], i64 [[TMP0]]
2012 ; AVX512-NEXT: [[TMP19:%.*]] = getelementptr ptr, ptr [[TMP18]], i32 0
2013 ; AVX512-NEXT: [[TMP20:%.*]] = getelementptr ptr, ptr [[TMP18]], i32 8
2014 ; AVX512-NEXT: [[TMP21:%.*]] = getelementptr ptr, ptr [[TMP18]], i32 16
2015 ; AVX512-NEXT: [[TMP22:%.*]] = getelementptr ptr, ptr [[TMP18]], i32 24
2016 ; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x ptr> @llvm.masked.load.v8p0.p0(ptr [[TMP19]], i32 8, <8 x i1> [[TMP14]], <8 x ptr> poison)
2017 ; AVX512-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <8 x ptr> @llvm.masked.load.v8p0.p0(ptr [[TMP20]], i32 8, <8 x i1> [[TMP15]], <8 x ptr> poison)
2018 ; AVX512-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <8 x ptr> @llvm.masked.load.v8p0.p0(ptr [[TMP21]], i32 8, <8 x i1> [[TMP16]], <8 x ptr> poison)
2019 ; AVX512-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <8 x ptr> @llvm.masked.load.v8p0.p0(ptr [[TMP22]], i32 8, <8 x i1> [[TMP17]], <8 x ptr> poison)
2020 ; AVX512-NEXT: [[TMP23:%.*]] = icmp eq <8 x ptr> [[WIDE_MASKED_LOAD]], zeroinitializer
2021 ; AVX512-NEXT: [[TMP24:%.*]] = icmp eq <8 x ptr> [[WIDE_MASKED_LOAD4]], zeroinitializer
2022 ; AVX512-NEXT: [[TMP25:%.*]] = icmp eq <8 x ptr> [[WIDE_MASKED_LOAD5]], zeroinitializer
2023 ; AVX512-NEXT: [[TMP26:%.*]] = icmp eq <8 x ptr> [[WIDE_MASKED_LOAD6]], zeroinitializer
2024 ; AVX512-NEXT: [[TMP27:%.*]] = xor <8 x i1> [[TMP23]], splat (i1 true)
2025 ; AVX512-NEXT: [[TMP28:%.*]] = xor <8 x i1> [[TMP24]], splat (i1 true)
2026 ; AVX512-NEXT: [[TMP29:%.*]] = xor <8 x i1> [[TMP25]], splat (i1 true)
2027 ; AVX512-NEXT: [[TMP30:%.*]] = xor <8 x i1> [[TMP26]], splat (i1 true)
2028 ; AVX512-NEXT: [[TMP31:%.*]] = select <8 x i1> [[TMP14]], <8 x i1> [[TMP27]], <8 x i1> zeroinitializer
2029 ; AVX512-NEXT: [[TMP32:%.*]] = select <8 x i1> [[TMP15]], <8 x i1> [[TMP28]], <8 x i1> zeroinitializer
2030 ; AVX512-NEXT: [[TMP33:%.*]] = select <8 x i1> [[TMP16]], <8 x i1> [[TMP29]], <8 x i1> zeroinitializer
2031 ; AVX512-NEXT: [[TMP34:%.*]] = select <8 x i1> [[TMP17]], <8 x i1> [[TMP30]], <8 x i1> zeroinitializer
2032 ; AVX512-NEXT: [[TMP35:%.*]] = getelementptr double, ptr [[OUT:%.*]], i64 [[TMP0]]
2033 ; AVX512-NEXT: [[TMP36:%.*]] = getelementptr double, ptr [[TMP35]], i32 0
2034 ; AVX512-NEXT: [[TMP37:%.*]] = getelementptr double, ptr [[TMP35]], i32 8
2035 ; AVX512-NEXT: [[TMP38:%.*]] = getelementptr double, ptr [[TMP35]], i32 16
2036 ; AVX512-NEXT: [[TMP39:%.*]] = getelementptr double, ptr [[TMP35]], i32 24
2037 ; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> splat (double 5.000000e-01), ptr [[TMP36]], i32 8, <8 x i1> [[TMP31]])
2038 ; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> splat (double 5.000000e-01), ptr [[TMP37]], i32 8, <8 x i1> [[TMP32]])
2039 ; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> splat (double 5.000000e-01), ptr [[TMP38]], i32 8, <8 x i1> [[TMP33]])
2040 ; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> splat (double 5.000000e-01), ptr [[TMP39]], i32 8, <8 x i1> [[TMP34]])
2041 ; AVX512-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
2042 ; AVX512-NEXT: [[TMP40:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
2043 ; AVX512-NEXT: br i1 [[TMP40]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP50:![0-9]+]]
2044 ; AVX512: middle.block:
2045 ; AVX512-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
2046 ; AVX512-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
2047 ; AVX512: vec.epilog.iter.check:
2048 ; AVX512-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
2049 ; AVX512-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 8
2050 ; AVX512-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[SCALAR_PH]]
2051 ; AVX512: vec.epilog.ph:
2052 ; AVX512-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
2053 ; AVX512-NEXT: [[N_MOD_VF8:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 8
2054 ; AVX512-NEXT: [[N_VEC9:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF8]]
2055 ; AVX512-NEXT: br label [[FOR_BODY:%.*]]
2056 ; AVX512: vec.epilog.vector.body:
2057 ; AVX512-NEXT: [[INDEX10:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDEX_NEXT13:%.*]], [[FOR_BODY]] ]
2058 ; AVX512-NEXT: [[TMP55:%.*]] = add i64 [[INDEX10]], 0
2059 ; AVX512-NEXT: [[TMP56:%.*]] = getelementptr inbounds i8, ptr [[TRIGGER]], i64 [[TMP55]]
2060 ; AVX512-NEXT: [[TMP57:%.*]] = getelementptr inbounds i8, ptr [[TMP56]], i32 0
2061 ; AVX512-NEXT: [[WIDE_LOAD11:%.*]] = load <8 x i8>, ptr [[TMP57]], align 1
2062 ; AVX512-NEXT: [[TMP44:%.*]] = and <8 x i8> [[WIDE_LOAD11]], splat (i8 1)
2063 ; AVX512-NEXT: [[TMP45:%.*]] = icmp eq <8 x i8> [[TMP44]], zeroinitializer
2064 ; AVX512-NEXT: [[TMP46:%.*]] = xor <8 x i1> [[TMP45]], splat (i1 true)
2065 ; AVX512-NEXT: [[TMP47:%.*]] = getelementptr ptr, ptr [[IN]], i64 [[TMP55]]
2066 ; AVX512-NEXT: [[TMP48:%.*]] = getelementptr ptr, ptr [[TMP47]], i32 0
2067 ; AVX512-NEXT: [[WIDE_MASKED_LOAD12:%.*]] = call <8 x ptr> @llvm.masked.load.v8p0.p0(ptr [[TMP48]], i32 8, <8 x i1> [[TMP46]], <8 x ptr> poison)
2068 ; AVX512-NEXT: [[TMP49:%.*]] = icmp eq <8 x ptr> [[WIDE_MASKED_LOAD12]], zeroinitializer
2069 ; AVX512-NEXT: [[TMP50:%.*]] = xor <8 x i1> [[TMP49]], splat (i1 true)
2070 ; AVX512-NEXT: [[TMP51:%.*]] = select <8 x i1> [[TMP46]], <8 x i1> [[TMP50]], <8 x i1> zeroinitializer
2071 ; AVX512-NEXT: [[TMP52:%.*]] = getelementptr double, ptr [[OUT]], i64 [[TMP55]]
2072 ; AVX512-NEXT: [[TMP53:%.*]] = getelementptr double, ptr [[TMP52]], i32 0
2073 ; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> splat (double 5.000000e-01), ptr [[TMP53]], i32 8, <8 x i1> [[TMP51]])
2074 ; AVX512-NEXT: [[INDEX_NEXT13]] = add nuw i64 [[INDEX10]], 8
2075 ; AVX512-NEXT: [[TMP54:%.*]] = icmp eq i64 [[INDEX_NEXT13]], [[N_VEC9]]
2076 ; AVX512-NEXT: br i1 [[TMP54]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP51:![0-9]+]]
2077 ; AVX512: vec.epilog.middle.block:
2078 ; AVX512-NEXT: [[CMP_N14:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC9]]
2079 ; AVX512-NEXT: br i1 [[CMP_N14]], label [[FOR_END_LOOPEXIT]], label [[VEC_EPILOG_SCALAR_PH]]
2080 ; AVX512: vec.epilog.scalar.ph:
2081 ; AVX512-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC9]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
2082 ; AVX512-NEXT: br label [[FOR_BODY1:%.*]]
2083 ; AVX512: for.body:
2084 ; AVX512-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
2085 ; AVX512-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[TRIGGER]], i64 [[INDVARS_IV]]
2086 ; AVX512-NEXT: [[TMP41:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
2087 ; AVX512-NEXT: [[TMP42:%.*]] = and i8 [[TMP41]], 1
2088 ; AVX512-NEXT: [[TOBOOL:%.*]] = icmp eq i8 [[TMP42]], 0
2089 ; AVX512-NEXT: br i1 [[TOBOOL]], label [[FOR_INC]], label [[LAND_LHS_TRUE:%.*]]
2090 ; AVX512: land.lhs.true:
2091 ; AVX512-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds ptr, ptr [[IN]], i64 [[INDVARS_IV]]
2092 ; AVX512-NEXT: [[TMP43:%.*]] = load ptr, ptr [[ARRAYIDX2]], align 8
2093 ; AVX512-NEXT: [[CMP3:%.*]] = icmp eq ptr [[TMP43]], null
2094 ; AVX512-NEXT: br i1 [[CMP3]], label [[FOR_INC]], label [[IF_THEN:%.*]]
2095 ; AVX512: if.then:
2096 ; AVX512-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds double, ptr [[OUT]], i64 [[INDVARS_IV]]
2097 ; AVX512-NEXT: store double 5.000000e-01, ptr [[ARRAYIDX5]], align 8
2098 ; AVX512-NEXT: br label [[FOR_INC]]
2099 ; AVX512: for.inc:
2100 ; AVX512-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
2101 ; AVX512-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
2102 ; AVX512-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY1]], !llvm.loop [[LOOP52:![0-9]+]]
2103 ; AVX512: for.end.loopexit:
2104 ; AVX512-NEXT: br label [[FOR_END]]
2105 ; AVX512: for.end:
2106 ; AVX512-NEXT: ret void
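;
; On AVX512 the same structure is simply widened: <8 x i8> trigger parts,
; <8 x ptr> masked loads and <8 x double> masked stores, 32 elements per
; main-loop iteration, and an 8-wide vector epilogue selected by the
; iter.check/vec.epilog.iter.check guards.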
2107 ;
2108 entry:
2109 %cmp5 = icmp eq i32 %size, 0
2110 br i1 %cmp5, label %for.end, label %for.body.preheader
2112 for.body.preheader: ; preds = %entry
2113 %wide.trip.count = zext i32 %size to i64
2114 br label %for.body
2116 for.body: ; preds = %for.inc, %for.body.preheader
2117 %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.inc ]
2118 %arrayidx = getelementptr inbounds i8, ptr %trigger, i64 %indvars.iv
2119 %0 = load i8, ptr %arrayidx, align 1
2120 %1 = and i8 %0, 1
2121 %tobool = icmp eq i8 %1, 0
2122 br i1 %tobool, label %for.inc, label %land.lhs.true
2124 land.lhs.true: ; preds = %for.body
2125 %arrayidx2 = getelementptr inbounds ptr, ptr %in, i64 %indvars.iv
2126 %2 = load ptr, ptr %arrayidx2, align 8
2127 %cmp3 = icmp eq ptr %2, null
2128 br i1 %cmp3, label %for.inc, label %if.then
2130 if.then: ; preds = %land.lhs.true
2131 %arrayidx5 = getelementptr inbounds double, ptr %out, i64 %indvars.iv
2132 store double 5.000000e-01, ptr %arrayidx5, align 8
2133 br label %for.inc
2135 for.inc: ; preds = %land.lhs.true, %for.body, %if.then
2136 %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
2137 %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count
2138 br i1 %exitcond, label %for.end, label %for.body
2140 for.end: ; preds = %for.inc, %entry
2141 ret void
2142 }
2144 ;typedef int (*fp)();
2145 ;void foo8 (ptr __restrict__ out, fp* __restrict__ in, bool * __restrict__ trigger, unsigned size) {
2147 ; for (unsigned i=0; i<size; i++)
2148 ; if (trigger[i] && (in[i] != 0))
2149 ; out[i] = (double) 0.5;
2150 ;}
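;
; foo8 is foo7 with in[] holding function pointers; the pointee type is
; irrelevant at the IR level, so the checks below are expected to match
; foo7's exactly.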
2152 define void @foo8(ptr noalias nocapture %out, ptr noalias nocapture readonly %in, ptr noalias nocapture readonly %trigger, i32 %size) local_unnamed_addr #0 {
2153 ; AVX1-LABEL: @foo8(
2154 ; AVX1-NEXT: entry:
2155 ; AVX1-NEXT: [[CMP5:%.*]] = icmp eq i32 [[SIZE:%.*]], 0
2156 ; AVX1-NEXT: br i1 [[CMP5]], label [[FOR_END:%.*]], label [[FOR_BODY_PREHEADER:%.*]]
2157 ; AVX1: iter.check:
2158 ; AVX1-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[SIZE]] to i64
2159 ; AVX1-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 4
2160 ; AVX1-NEXT: br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
2161 ; AVX1: vector.main.loop.iter.check:
2162 ; AVX1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 16
2163 ; AVX1-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
2164 ; AVX1: vector.ph:
2165 ; AVX1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 16
2166 ; AVX1-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
2167 ; AVX1-NEXT: br label [[VECTOR_BODY:%.*]]
2168 ; AVX1: vector.body:
2169 ; AVX1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
2170 ; AVX1-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
2171 ; AVX1-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[TRIGGER:%.*]], i64 [[TMP0]]
2172 ; AVX1-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0
2173 ; AVX1-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 4
2174 ; AVX1-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 8
2175 ; AVX1-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 12
2176 ; AVX1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP2]], align 1
2177 ; AVX1-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i8>, ptr [[TMP3]], align 1
2178 ; AVX1-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP4]], align 1
2179 ; AVX1-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x i8>, ptr [[TMP5]], align 1
2180 ; AVX1-NEXT: [[TMP6:%.*]] = and <4 x i8> [[WIDE_LOAD]], splat (i8 1)
2181 ; AVX1-NEXT: [[TMP7:%.*]] = and <4 x i8> [[WIDE_LOAD1]], splat (i8 1)
2182 ; AVX1-NEXT: [[TMP8:%.*]] = and <4 x i8> [[WIDE_LOAD2]], splat (i8 1)
2183 ; AVX1-NEXT: [[TMP9:%.*]] = and <4 x i8> [[WIDE_LOAD3]], splat (i8 1)
2184 ; AVX1-NEXT: [[TMP10:%.*]] = icmp eq <4 x i8> [[TMP6]], zeroinitializer
2185 ; AVX1-NEXT: [[TMP11:%.*]] = icmp eq <4 x i8> [[TMP7]], zeroinitializer
2186 ; AVX1-NEXT: [[TMP12:%.*]] = icmp eq <4 x i8> [[TMP8]], zeroinitializer
2187 ; AVX1-NEXT: [[TMP13:%.*]] = icmp eq <4 x i8> [[TMP9]], zeroinitializer
2188 ; AVX1-NEXT: [[TMP14:%.*]] = xor <4 x i1> [[TMP10]], splat (i1 true)
2189 ; AVX1-NEXT: [[TMP15:%.*]] = xor <4 x i1> [[TMP11]], splat (i1 true)
2190 ; AVX1-NEXT: [[TMP16:%.*]] = xor <4 x i1> [[TMP12]], splat (i1 true)
2191 ; AVX1-NEXT: [[TMP17:%.*]] = xor <4 x i1> [[TMP13]], splat (i1 true)
2192 ; AVX1-NEXT: [[TMP18:%.*]] = getelementptr ptr, ptr [[IN:%.*]], i64 [[TMP0]]
2193 ; AVX1-NEXT: [[TMP19:%.*]] = getelementptr ptr, ptr [[TMP18]], i32 0
2194 ; AVX1-NEXT: [[TMP20:%.*]] = getelementptr ptr, ptr [[TMP18]], i32 4
2195 ; AVX1-NEXT: [[TMP21:%.*]] = getelementptr ptr, ptr [[TMP18]], i32 8
2196 ; AVX1-NEXT: [[TMP22:%.*]] = getelementptr ptr, ptr [[TMP18]], i32 12
2197 ; AVX1-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP19]], i32 8, <4 x i1> [[TMP14]], <4 x ptr> poison)
2198 ; AVX1-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP20]], i32 8, <4 x i1> [[TMP15]], <4 x ptr> poison)
2199 ; AVX1-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP21]], i32 8, <4 x i1> [[TMP16]], <4 x ptr> poison)
2200 ; AVX1-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP22]], i32 8, <4 x i1> [[TMP17]], <4 x ptr> poison)
2201 ; AVX1-NEXT: [[TMP23:%.*]] = icmp eq <4 x ptr> [[WIDE_MASKED_LOAD]], zeroinitializer
2202 ; AVX1-NEXT: [[TMP24:%.*]] = icmp eq <4 x ptr> [[WIDE_MASKED_LOAD4]], zeroinitializer
2203 ; AVX1-NEXT: [[TMP25:%.*]] = icmp eq <4 x ptr> [[WIDE_MASKED_LOAD5]], zeroinitializer
2204 ; AVX1-NEXT: [[TMP26:%.*]] = icmp eq <4 x ptr> [[WIDE_MASKED_LOAD6]], zeroinitializer
2205 ; AVX1-NEXT: [[TMP27:%.*]] = xor <4 x i1> [[TMP23]], splat (i1 true)
2206 ; AVX1-NEXT: [[TMP28:%.*]] = xor <4 x i1> [[TMP24]], splat (i1 true)
2207 ; AVX1-NEXT: [[TMP29:%.*]] = xor <4 x i1> [[TMP25]], splat (i1 true)
2208 ; AVX1-NEXT: [[TMP30:%.*]] = xor <4 x i1> [[TMP26]], splat (i1 true)
2209 ; AVX1-NEXT: [[TMP31:%.*]] = select <4 x i1> [[TMP14]], <4 x i1> [[TMP27]], <4 x i1> zeroinitializer
2210 ; AVX1-NEXT: [[TMP32:%.*]] = select <4 x i1> [[TMP15]], <4 x i1> [[TMP28]], <4 x i1> zeroinitializer
2211 ; AVX1-NEXT: [[TMP33:%.*]] = select <4 x i1> [[TMP16]], <4 x i1> [[TMP29]], <4 x i1> zeroinitializer
2212 ; AVX1-NEXT: [[TMP34:%.*]] = select <4 x i1> [[TMP17]], <4 x i1> [[TMP30]], <4 x i1> zeroinitializer
2213 ; AVX1-NEXT: [[TMP35:%.*]] = getelementptr double, ptr [[OUT:%.*]], i64 [[TMP0]]
2214 ; AVX1-NEXT: [[TMP36:%.*]] = getelementptr double, ptr [[TMP35]], i32 0
2215 ; AVX1-NEXT: [[TMP37:%.*]] = getelementptr double, ptr [[TMP35]], i32 4
2216 ; AVX1-NEXT: [[TMP38:%.*]] = getelementptr double, ptr [[TMP35]], i32 8
2217 ; AVX1-NEXT: [[TMP39:%.*]] = getelementptr double, ptr [[TMP35]], i32 12
2218 ; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> splat (double 5.000000e-01), ptr [[TMP36]], i32 8, <4 x i1> [[TMP31]])
2219 ; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> splat (double 5.000000e-01), ptr [[TMP37]], i32 8, <4 x i1> [[TMP32]])
2220 ; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> splat (double 5.000000e-01), ptr [[TMP38]], i32 8, <4 x i1> [[TMP33]])
2221 ; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> splat (double 5.000000e-01), ptr [[TMP39]], i32 8, <4 x i1> [[TMP34]])
2222 ; AVX1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
2223 ; AVX1-NEXT: [[TMP40:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
2224 ; AVX1-NEXT: br i1 [[TMP40]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]]
2225 ; AVX1: middle.block:
2226 ; AVX1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
2227 ; AVX1-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
2228 ; AVX1: vec.epilog.iter.check:
2229 ; AVX1-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
2230 ; AVX1-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 4
2231 ; AVX1-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[SCALAR_PH]]
2232 ; AVX1: vec.epilog.ph:
2233 ; AVX1-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
2234 ; AVX1-NEXT: [[N_MOD_VF8:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 4
2235 ; AVX1-NEXT: [[N_VEC9:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF8]]
2236 ; AVX1-NEXT: br label [[FOR_BODY:%.*]]
2237 ; AVX1: vec.epilog.vector.body:
2238 ; AVX1-NEXT: [[INDEX10:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDEX_NEXT13:%.*]], [[FOR_BODY]] ]
2239 ; AVX1-NEXT: [[TMP55:%.*]] = add i64 [[INDEX10]], 0
2240 ; AVX1-NEXT: [[TMP56:%.*]] = getelementptr inbounds i8, ptr [[TRIGGER]], i64 [[TMP55]]
2241 ; AVX1-NEXT: [[TMP57:%.*]] = getelementptr inbounds i8, ptr [[TMP56]], i32 0
2242 ; AVX1-NEXT: [[WIDE_LOAD11:%.*]] = load <4 x i8>, ptr [[TMP57]], align 1
2243 ; AVX1-NEXT: [[TMP44:%.*]] = and <4 x i8> [[WIDE_LOAD11]], splat (i8 1)
2244 ; AVX1-NEXT: [[TMP45:%.*]] = icmp eq <4 x i8> [[TMP44]], zeroinitializer
2245 ; AVX1-NEXT: [[TMP46:%.*]] = xor <4 x i1> [[TMP45]], splat (i1 true)
2246 ; AVX1-NEXT: [[TMP47:%.*]] = getelementptr ptr, ptr [[IN]], i64 [[TMP55]]
2247 ; AVX1-NEXT: [[TMP48:%.*]] = getelementptr ptr, ptr [[TMP47]], i32 0
2248 ; AVX1-NEXT: [[WIDE_MASKED_LOAD12:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP48]], i32 8, <4 x i1> [[TMP46]], <4 x ptr> poison)
2249 ; AVX1-NEXT: [[TMP49:%.*]] = icmp eq <4 x ptr> [[WIDE_MASKED_LOAD12]], zeroinitializer
2250 ; AVX1-NEXT: [[TMP50:%.*]] = xor <4 x i1> [[TMP49]], splat (i1 true)
2251 ; AVX1-NEXT: [[TMP51:%.*]] = select <4 x i1> [[TMP46]], <4 x i1> [[TMP50]], <4 x i1> zeroinitializer
2252 ; AVX1-NEXT: [[TMP52:%.*]] = getelementptr double, ptr [[OUT]], i64 [[TMP55]]
2253 ; AVX1-NEXT: [[TMP53:%.*]] = getelementptr double, ptr [[TMP52]], i32 0
2254 ; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> splat (double 5.000000e-01), ptr [[TMP53]], i32 8, <4 x i1> [[TMP51]])
2255 ; AVX1-NEXT: [[INDEX_NEXT13]] = add nuw i64 [[INDEX10]], 4
2256 ; AVX1-NEXT: [[TMP54:%.*]] = icmp eq i64 [[INDEX_NEXT13]], [[N_VEC9]]
2257 ; AVX1-NEXT: br i1 [[TMP54]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
2258 ; AVX1: vec.epilog.middle.block:
2259 ; AVX1-NEXT: [[CMP_N14:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC9]]
2260 ; AVX1-NEXT: br i1 [[CMP_N14]], label [[FOR_END_LOOPEXIT]], label [[VEC_EPILOG_SCALAR_PH]]
2261 ; AVX1: vec.epilog.scalar.ph:
2262 ; AVX1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC9]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
2263 ; AVX1-NEXT: br label [[FOR_BODY1:%.*]]
2264 ; AVX1: for.body:
2265 ; AVX1-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
2266 ; AVX1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[TRIGGER]], i64 [[INDVARS_IV]]
2267 ; AVX1-NEXT: [[TMP41:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
2268 ; AVX1-NEXT: [[TMP42:%.*]] = and i8 [[TMP41]], 1
2269 ; AVX1-NEXT: [[TOBOOL:%.*]] = icmp eq i8 [[TMP42]], 0
2270 ; AVX1-NEXT: br i1 [[TOBOOL]], label [[FOR_INC]], label [[LAND_LHS_TRUE:%.*]]
2271 ; AVX1: land.lhs.true:
2272 ; AVX1-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds ptr, ptr [[IN]], i64 [[INDVARS_IV]]
2273 ; AVX1-NEXT: [[TMP43:%.*]] = load ptr, ptr [[ARRAYIDX2]], align 8
2274 ; AVX1-NEXT: [[CMP3:%.*]] = icmp eq ptr [[TMP43]], null
2275 ; AVX1-NEXT: br i1 [[CMP3]], label [[FOR_INC]], label [[IF_THEN:%.*]]
2276 ; AVX1: if.then:
2277 ; AVX1-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds double, ptr [[OUT]], i64 [[INDVARS_IV]]
2278 ; AVX1-NEXT: store double 5.000000e-01, ptr [[ARRAYIDX5]], align 8
2279 ; AVX1-NEXT: br label [[FOR_INC]]
2280 ; AVX1: for.inc:
2281 ; AVX1-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
2282 ; AVX1-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
2283 ; AVX1-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY1]], !llvm.loop [[LOOP23:![0-9]+]]
2284 ; AVX1: for.end.loopexit:
2285 ; AVX1-NEXT: br label [[FOR_END]]
2286 ; AVX1: for.end:
2287 ; AVX1-NEXT: ret void
2288 ;
2289 ; AVX2-LABEL: @foo8(
2290 ; AVX2-NEXT: entry:
2291 ; AVX2-NEXT: [[CMP5:%.*]] = icmp eq i32 [[SIZE:%.*]], 0
2292 ; AVX2-NEXT: br i1 [[CMP5]], label [[FOR_END:%.*]], label [[FOR_BODY_PREHEADER:%.*]]
2293 ; AVX2: iter.check:
2294 ; AVX2-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[SIZE]] to i64
2295 ; AVX2-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 4
2296 ; AVX2-NEXT: br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
2297 ; AVX2: vector.main.loop.iter.check:
2298 ; AVX2-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 16
2299 ; AVX2-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
2300 ; AVX2: vector.ph:
2301 ; AVX2-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 16
2302 ; AVX2-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
2303 ; AVX2-NEXT: br label [[VECTOR_BODY:%.*]]
2304 ; AVX2: vector.body:
2305 ; AVX2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
2306 ; AVX2-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
2307 ; AVX2-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[TRIGGER:%.*]], i64 [[TMP0]]
2308 ; AVX2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0
2309 ; AVX2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 4
2310 ; AVX2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 8
2311 ; AVX2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 12
2312 ; AVX2-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP2]], align 1
2313 ; AVX2-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i8>, ptr [[TMP3]], align 1
2314 ; AVX2-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP4]], align 1
2315 ; AVX2-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x i8>, ptr [[TMP5]], align 1
2316 ; AVX2-NEXT: [[TMP6:%.*]] = and <4 x i8> [[WIDE_LOAD]], splat (i8 1)
2317 ; AVX2-NEXT: [[TMP7:%.*]] = and <4 x i8> [[WIDE_LOAD1]], splat (i8 1)
2318 ; AVX2-NEXT: [[TMP8:%.*]] = and <4 x i8> [[WIDE_LOAD2]], splat (i8 1)
2319 ; AVX2-NEXT: [[TMP9:%.*]] = and <4 x i8> [[WIDE_LOAD3]], splat (i8 1)
2320 ; AVX2-NEXT: [[TMP10:%.*]] = icmp eq <4 x i8> [[TMP6]], zeroinitializer
2321 ; AVX2-NEXT: [[TMP11:%.*]] = icmp eq <4 x i8> [[TMP7]], zeroinitializer
2322 ; AVX2-NEXT: [[TMP12:%.*]] = icmp eq <4 x i8> [[TMP8]], zeroinitializer
2323 ; AVX2-NEXT: [[TMP13:%.*]] = icmp eq <4 x i8> [[TMP9]], zeroinitializer
2324 ; AVX2-NEXT: [[TMP14:%.*]] = xor <4 x i1> [[TMP10]], splat (i1 true)
; AVX2-NEXT: [[TMP15:%.*]] = xor <4 x i1> [[TMP11]], splat (i1 true)
; AVX2-NEXT: [[TMP16:%.*]] = xor <4 x i1> [[TMP12]], splat (i1 true)
; AVX2-NEXT: [[TMP17:%.*]] = xor <4 x i1> [[TMP13]], splat (i1 true)
; AVX2-NEXT: [[TMP18:%.*]] = getelementptr ptr, ptr [[IN:%.*]], i64 [[TMP0]]
; AVX2-NEXT: [[TMP19:%.*]] = getelementptr ptr, ptr [[TMP18]], i32 0
; AVX2-NEXT: [[TMP20:%.*]] = getelementptr ptr, ptr [[TMP18]], i32 4
; AVX2-NEXT: [[TMP21:%.*]] = getelementptr ptr, ptr [[TMP18]], i32 8
; AVX2-NEXT: [[TMP22:%.*]] = getelementptr ptr, ptr [[TMP18]], i32 12
; AVX2-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP19]], i32 8, <4 x i1> [[TMP14]], <4 x ptr> poison)
; AVX2-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP20]], i32 8, <4 x i1> [[TMP15]], <4 x ptr> poison)
; AVX2-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP21]], i32 8, <4 x i1> [[TMP16]], <4 x ptr> poison)
; AVX2-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP22]], i32 8, <4 x i1> [[TMP17]], <4 x ptr> poison)
; AVX2-NEXT: [[TMP23:%.*]] = icmp eq <4 x ptr> [[WIDE_MASKED_LOAD]], zeroinitializer
; AVX2-NEXT: [[TMP24:%.*]] = icmp eq <4 x ptr> [[WIDE_MASKED_LOAD4]], zeroinitializer
; AVX2-NEXT: [[TMP25:%.*]] = icmp eq <4 x ptr> [[WIDE_MASKED_LOAD5]], zeroinitializer
; AVX2-NEXT: [[TMP26:%.*]] = icmp eq <4 x ptr> [[WIDE_MASKED_LOAD6]], zeroinitializer
; AVX2-NEXT: [[TMP27:%.*]] = xor <4 x i1> [[TMP23]], splat (i1 true)
; AVX2-NEXT: [[TMP28:%.*]] = xor <4 x i1> [[TMP24]], splat (i1 true)
; AVX2-NEXT: [[TMP29:%.*]] = xor <4 x i1> [[TMP25]], splat (i1 true)
; AVX2-NEXT: [[TMP30:%.*]] = xor <4 x i1> [[TMP26]], splat (i1 true)
; AVX2-NEXT: [[TMP31:%.*]] = select <4 x i1> [[TMP14]], <4 x i1> [[TMP27]], <4 x i1> zeroinitializer
; AVX2-NEXT: [[TMP32:%.*]] = select <4 x i1> [[TMP15]], <4 x i1> [[TMP28]], <4 x i1> zeroinitializer
; AVX2-NEXT: [[TMP33:%.*]] = select <4 x i1> [[TMP16]], <4 x i1> [[TMP29]], <4 x i1> zeroinitializer
; AVX2-NEXT: [[TMP34:%.*]] = select <4 x i1> [[TMP17]], <4 x i1> [[TMP30]], <4 x i1> zeroinitializer
; AVX2-NEXT: [[TMP35:%.*]] = getelementptr double, ptr [[OUT:%.*]], i64 [[TMP0]]
; AVX2-NEXT: [[TMP36:%.*]] = getelementptr double, ptr [[TMP35]], i32 0
; AVX2-NEXT: [[TMP37:%.*]] = getelementptr double, ptr [[TMP35]], i32 4
; AVX2-NEXT: [[TMP38:%.*]] = getelementptr double, ptr [[TMP35]], i32 8
; AVX2-NEXT: [[TMP39:%.*]] = getelementptr double, ptr [[TMP35]], i32 12
; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> splat (double 5.000000e-01), ptr [[TMP36]], i32 8, <4 x i1> [[TMP31]])
; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> splat (double 5.000000e-01), ptr [[TMP37]], i32 8, <4 x i1> [[TMP32]])
; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> splat (double 5.000000e-01), ptr [[TMP38]], i32 8, <4 x i1> [[TMP33]])
; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> splat (double 5.000000e-01), ptr [[TMP39]], i32 8, <4 x i1> [[TMP34]])
; AVX2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; AVX2-NEXT: [[TMP40:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; AVX2-NEXT: br i1 [[TMP40]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP34:![0-9]+]]
; AVX2: middle.block:
; AVX2-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
; AVX2-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
; AVX2: vec.epilog.iter.check:
; AVX2-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
; AVX2-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 4
; AVX2-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[SCALAR_PH]]
; AVX2: vec.epilog.ph:
; AVX2-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
; AVX2-NEXT: [[N_MOD_VF8:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 4
; AVX2-NEXT: [[N_VEC9:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF8]]
; AVX2-NEXT: br label [[FOR_BODY:%.*]]
; AVX2: vec.epilog.vector.body:
; AVX2-NEXT: [[INDEX10:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDEX_NEXT13:%.*]], [[FOR_BODY]] ]
; AVX2-NEXT: [[TMP55:%.*]] = add i64 [[INDEX10]], 0
; AVX2-NEXT: [[TMP56:%.*]] = getelementptr inbounds i8, ptr [[TRIGGER]], i64 [[TMP55]]
; AVX2-NEXT: [[TMP57:%.*]] = getelementptr inbounds i8, ptr [[TMP56]], i32 0
; AVX2-NEXT: [[WIDE_LOAD11:%.*]] = load <4 x i8>, ptr [[TMP57]], align 1
; AVX2-NEXT: [[TMP44:%.*]] = and <4 x i8> [[WIDE_LOAD11]], splat (i8 1)
; AVX2-NEXT: [[TMP45:%.*]] = icmp eq <4 x i8> [[TMP44]], zeroinitializer
; AVX2-NEXT: [[TMP46:%.*]] = xor <4 x i1> [[TMP45]], splat (i1 true)
; AVX2-NEXT: [[TMP47:%.*]] = getelementptr ptr, ptr [[IN]], i64 [[TMP55]]
; AVX2-NEXT: [[TMP48:%.*]] = getelementptr ptr, ptr [[TMP47]], i32 0
; AVX2-NEXT: [[WIDE_MASKED_LOAD12:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP48]], i32 8, <4 x i1> [[TMP46]], <4 x ptr> poison)
; AVX2-NEXT: [[TMP49:%.*]] = icmp eq <4 x ptr> [[WIDE_MASKED_LOAD12]], zeroinitializer
; AVX2-NEXT: [[TMP50:%.*]] = xor <4 x i1> [[TMP49]], splat (i1 true)
; AVX2-NEXT: [[TMP51:%.*]] = select <4 x i1> [[TMP46]], <4 x i1> [[TMP50]], <4 x i1> zeroinitializer
; AVX2-NEXT: [[TMP52:%.*]] = getelementptr double, ptr [[OUT]], i64 [[TMP55]]
; AVX2-NEXT: [[TMP53:%.*]] = getelementptr double, ptr [[TMP52]], i32 0
; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> splat (double 5.000000e-01), ptr [[TMP53]], i32 8, <4 x i1> [[TMP51]])
; AVX2-NEXT: [[INDEX_NEXT13]] = add nuw i64 [[INDEX10]], 4
; AVX2-NEXT: [[TMP54:%.*]] = icmp eq i64 [[INDEX_NEXT13]], [[N_VEC9]]
; AVX2-NEXT: br i1 [[TMP54]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP35:![0-9]+]]
; AVX2: vec.epilog.middle.block:
; AVX2-NEXT: [[CMP_N14:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC9]]
; AVX2-NEXT: br i1 [[CMP_N14]], label [[FOR_END_LOOPEXIT]], label [[VEC_EPILOG_SCALAR_PH]]
; AVX2: vec.epilog.scalar.ph:
; AVX2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC9]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
; AVX2-NEXT: br label [[FOR_BODY1:%.*]]
; AVX2: for.body:
; AVX2-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
; AVX2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[TRIGGER]], i64 [[INDVARS_IV]]
; AVX2-NEXT: [[TMP41:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
; AVX2-NEXT: [[TMP42:%.*]] = and i8 [[TMP41]], 1
; AVX2-NEXT: [[TOBOOL:%.*]] = icmp eq i8 [[TMP42]], 0
; AVX2-NEXT: br i1 [[TOBOOL]], label [[FOR_INC]], label [[LAND_LHS_TRUE:%.*]]
; AVX2: land.lhs.true:
; AVX2-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds ptr, ptr [[IN]], i64 [[INDVARS_IV]]
; AVX2-NEXT: [[TMP43:%.*]] = load ptr, ptr [[ARRAYIDX2]], align 8
; AVX2-NEXT: [[CMP3:%.*]] = icmp eq ptr [[TMP43]], null
; AVX2-NEXT: br i1 [[CMP3]], label [[FOR_INC]], label [[IF_THEN:%.*]]
; AVX2: if.then:
; AVX2-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds double, ptr [[OUT]], i64 [[INDVARS_IV]]
; AVX2-NEXT: store double 5.000000e-01, ptr [[ARRAYIDX5]], align 8
; AVX2-NEXT: br label [[FOR_INC]]
; AVX2: for.inc:
; AVX2-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; AVX2-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
; AVX2-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY1]], !llvm.loop [[LOOP36:![0-9]+]]
; AVX2: for.end.loopexit:
; AVX2-NEXT: br label [[FOR_END]]
; AVX2: for.end:
; AVX2-NEXT: ret void
;
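; For AVX512 the vectorizer picks a main vectorization factor of 32,
; interleaved as four <8 x ptr> masked loads per iteration, plus an 8-wide
; vector epilogue; the checks below otherwise mirror the AVX2 block above.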
; AVX512-LABEL: @foo8(
; AVX512-NEXT: entry:
; AVX512-NEXT: [[CMP5:%.*]] = icmp eq i32 [[SIZE:%.*]], 0
; AVX512-NEXT: br i1 [[CMP5]], label [[FOR_END:%.*]], label [[FOR_BODY_PREHEADER:%.*]]
; AVX512: iter.check:
; AVX512-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[SIZE]] to i64
; AVX512-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 8
; AVX512-NEXT: br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
; AVX512: vector.main.loop.iter.check:
; AVX512-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 32
; AVX512-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; AVX512: vector.ph:
; AVX512-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 32
; AVX512-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
; AVX512-NEXT: br label [[VECTOR_BODY:%.*]]
; AVX512: vector.body:
; AVX512-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; AVX512-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
; AVX512-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[TRIGGER:%.*]], i64 [[TMP0]]
; AVX512-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0
; AVX512-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 8
; AVX512-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 16
; AVX512-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 24
; AVX512-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i8>, ptr [[TMP2]], align 1
; AVX512-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x i8>, ptr [[TMP3]], align 1
; AVX512-NEXT: [[WIDE_LOAD2:%.*]] = load <8 x i8>, ptr [[TMP4]], align 1
; AVX512-NEXT: [[WIDE_LOAD3:%.*]] = load <8 x i8>, ptr [[TMP5]], align 1
; AVX512-NEXT: [[TMP6:%.*]] = and <8 x i8> [[WIDE_LOAD]], splat (i8 1)
; AVX512-NEXT: [[TMP7:%.*]] = and <8 x i8> [[WIDE_LOAD1]], splat (i8 1)
; AVX512-NEXT: [[TMP8:%.*]] = and <8 x i8> [[WIDE_LOAD2]], splat (i8 1)
; AVX512-NEXT: [[TMP9:%.*]] = and <8 x i8> [[WIDE_LOAD3]], splat (i8 1)
; AVX512-NEXT: [[TMP10:%.*]] = icmp eq <8 x i8> [[TMP6]], zeroinitializer
; AVX512-NEXT: [[TMP11:%.*]] = icmp eq <8 x i8> [[TMP7]], zeroinitializer
; AVX512-NEXT: [[TMP12:%.*]] = icmp eq <8 x i8> [[TMP8]], zeroinitializer
; AVX512-NEXT: [[TMP13:%.*]] = icmp eq <8 x i8> [[TMP9]], zeroinitializer
; AVX512-NEXT: [[TMP14:%.*]] = xor <8 x i1> [[TMP10]], splat (i1 true)
; AVX512-NEXT: [[TMP15:%.*]] = xor <8 x i1> [[TMP11]], splat (i1 true)
; AVX512-NEXT: [[TMP16:%.*]] = xor <8 x i1> [[TMP12]], splat (i1 true)
; AVX512-NEXT: [[TMP17:%.*]] = xor <8 x i1> [[TMP13]], splat (i1 true)
; AVX512-NEXT: [[TMP18:%.*]] = getelementptr ptr, ptr [[IN:%.*]], i64 [[TMP0]]
; AVX512-NEXT: [[TMP19:%.*]] = getelementptr ptr, ptr [[TMP18]], i32 0
; AVX512-NEXT: [[TMP20:%.*]] = getelementptr ptr, ptr [[TMP18]], i32 8
; AVX512-NEXT: [[TMP21:%.*]] = getelementptr ptr, ptr [[TMP18]], i32 16
; AVX512-NEXT: [[TMP22:%.*]] = getelementptr ptr, ptr [[TMP18]], i32 24
; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x ptr> @llvm.masked.load.v8p0.p0(ptr [[TMP19]], i32 8, <8 x i1> [[TMP14]], <8 x ptr> poison)
; AVX512-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <8 x ptr> @llvm.masked.load.v8p0.p0(ptr [[TMP20]], i32 8, <8 x i1> [[TMP15]], <8 x ptr> poison)
; AVX512-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <8 x ptr> @llvm.masked.load.v8p0.p0(ptr [[TMP21]], i32 8, <8 x i1> [[TMP16]], <8 x ptr> poison)
; AVX512-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <8 x ptr> @llvm.masked.load.v8p0.p0(ptr [[TMP22]], i32 8, <8 x i1> [[TMP17]], <8 x ptr> poison)
; AVX512-NEXT: [[TMP23:%.*]] = icmp eq <8 x ptr> [[WIDE_MASKED_LOAD]], zeroinitializer
; AVX512-NEXT: [[TMP24:%.*]] = icmp eq <8 x ptr> [[WIDE_MASKED_LOAD4]], zeroinitializer
; AVX512-NEXT: [[TMP25:%.*]] = icmp eq <8 x ptr> [[WIDE_MASKED_LOAD5]], zeroinitializer
; AVX512-NEXT: [[TMP26:%.*]] = icmp eq <8 x ptr> [[WIDE_MASKED_LOAD6]], zeroinitializer
; AVX512-NEXT: [[TMP27:%.*]] = xor <8 x i1> [[TMP23]], splat (i1 true)
; AVX512-NEXT: [[TMP28:%.*]] = xor <8 x i1> [[TMP24]], splat (i1 true)
; AVX512-NEXT: [[TMP29:%.*]] = xor <8 x i1> [[TMP25]], splat (i1 true)
; AVX512-NEXT: [[TMP30:%.*]] = xor <8 x i1> [[TMP26]], splat (i1 true)
; AVX512-NEXT: [[TMP31:%.*]] = select <8 x i1> [[TMP14]], <8 x i1> [[TMP27]], <8 x i1> zeroinitializer
; AVX512-NEXT: [[TMP32:%.*]] = select <8 x i1> [[TMP15]], <8 x i1> [[TMP28]], <8 x i1> zeroinitializer
; AVX512-NEXT: [[TMP33:%.*]] = select <8 x i1> [[TMP16]], <8 x i1> [[TMP29]], <8 x i1> zeroinitializer
; AVX512-NEXT: [[TMP34:%.*]] = select <8 x i1> [[TMP17]], <8 x i1> [[TMP30]], <8 x i1> zeroinitializer
; AVX512-NEXT: [[TMP35:%.*]] = getelementptr double, ptr [[OUT:%.*]], i64 [[TMP0]]
; AVX512-NEXT: [[TMP36:%.*]] = getelementptr double, ptr [[TMP35]], i32 0
; AVX512-NEXT: [[TMP37:%.*]] = getelementptr double, ptr [[TMP35]], i32 8
; AVX512-NEXT: [[TMP38:%.*]] = getelementptr double, ptr [[TMP35]], i32 16
; AVX512-NEXT: [[TMP39:%.*]] = getelementptr double, ptr [[TMP35]], i32 24
; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> splat (double 5.000000e-01), ptr [[TMP36]], i32 8, <8 x i1> [[TMP31]])
; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> splat (double 5.000000e-01), ptr [[TMP37]], i32 8, <8 x i1> [[TMP32]])
; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> splat (double 5.000000e-01), ptr [[TMP38]], i32 8, <8 x i1> [[TMP33]])
; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> splat (double 5.000000e-01), ptr [[TMP39]], i32 8, <8 x i1> [[TMP34]])
; AVX512-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
; AVX512-NEXT: [[TMP40:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; AVX512-NEXT: br i1 [[TMP40]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP53:![0-9]+]]
; AVX512: middle.block:
; AVX512-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
; AVX512-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
; AVX512: vec.epilog.iter.check:
; AVX512-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
; AVX512-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 8
; AVX512-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[SCALAR_PH]]
; AVX512: vec.epilog.ph:
; AVX512-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
; AVX512-NEXT: [[N_MOD_VF8:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 8
; AVX512-NEXT: [[N_VEC9:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF8]]
; AVX512-NEXT: br label [[FOR_BODY:%.*]]
; AVX512: vec.epilog.vector.body:
; AVX512-NEXT: [[INDEX10:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDEX_NEXT13:%.*]], [[FOR_BODY]] ]
; AVX512-NEXT: [[TMP55:%.*]] = add i64 [[INDEX10]], 0
; AVX512-NEXT: [[TMP56:%.*]] = getelementptr inbounds i8, ptr [[TRIGGER]], i64 [[TMP55]]
; AVX512-NEXT: [[TMP57:%.*]] = getelementptr inbounds i8, ptr [[TMP56]], i32 0
; AVX512-NEXT: [[WIDE_LOAD11:%.*]] = load <8 x i8>, ptr [[TMP57]], align 1
; AVX512-NEXT: [[TMP44:%.*]] = and <8 x i8> [[WIDE_LOAD11]], splat (i8 1)
; AVX512-NEXT: [[TMP45:%.*]] = icmp eq <8 x i8> [[TMP44]], zeroinitializer
; AVX512-NEXT: [[TMP46:%.*]] = xor <8 x i1> [[TMP45]], splat (i1 true)
; AVX512-NEXT: [[TMP47:%.*]] = getelementptr ptr, ptr [[IN]], i64 [[TMP55]]
; AVX512-NEXT: [[TMP48:%.*]] = getelementptr ptr, ptr [[TMP47]], i32 0
; AVX512-NEXT: [[WIDE_MASKED_LOAD12:%.*]] = call <8 x ptr> @llvm.masked.load.v8p0.p0(ptr [[TMP48]], i32 8, <8 x i1> [[TMP46]], <8 x ptr> poison)
; AVX512-NEXT: [[TMP49:%.*]] = icmp eq <8 x ptr> [[WIDE_MASKED_LOAD12]], zeroinitializer
; AVX512-NEXT: [[TMP50:%.*]] = xor <8 x i1> [[TMP49]], splat (i1 true)
; AVX512-NEXT: [[TMP51:%.*]] = select <8 x i1> [[TMP46]], <8 x i1> [[TMP50]], <8 x i1> zeroinitializer
; AVX512-NEXT: [[TMP52:%.*]] = getelementptr double, ptr [[OUT]], i64 [[TMP55]]
; AVX512-NEXT: [[TMP53:%.*]] = getelementptr double, ptr [[TMP52]], i32 0
; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> splat (double 5.000000e-01), ptr [[TMP53]], i32 8, <8 x i1> [[TMP51]])
; AVX512-NEXT: [[INDEX_NEXT13]] = add nuw i64 [[INDEX10]], 8
; AVX512-NEXT: [[TMP54:%.*]] = icmp eq i64 [[INDEX_NEXT13]], [[N_VEC9]]
; AVX512-NEXT: br i1 [[TMP54]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP54:![0-9]+]]
; AVX512: vec.epilog.middle.block:
; AVX512-NEXT: [[CMP_N14:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC9]]
; AVX512-NEXT: br i1 [[CMP_N14]], label [[FOR_END_LOOPEXIT]], label [[VEC_EPILOG_SCALAR_PH]]
; AVX512: vec.epilog.scalar.ph:
; AVX512-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC9]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
; AVX512-NEXT: br label [[FOR_BODY1:%.*]]
; AVX512: for.body:
; AVX512-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
; AVX512-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[TRIGGER]], i64 [[INDVARS_IV]]
; AVX512-NEXT: [[TMP41:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
; AVX512-NEXT: [[TMP42:%.*]] = and i8 [[TMP41]], 1
; AVX512-NEXT: [[TOBOOL:%.*]] = icmp eq i8 [[TMP42]], 0
; AVX512-NEXT: br i1 [[TOBOOL]], label [[FOR_INC]], label [[LAND_LHS_TRUE:%.*]]
; AVX512: land.lhs.true:
; AVX512-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds ptr, ptr [[IN]], i64 [[INDVARS_IV]]
; AVX512-NEXT: [[TMP43:%.*]] = load ptr, ptr [[ARRAYIDX2]], align 8
; AVX512-NEXT: [[CMP3:%.*]] = icmp eq ptr [[TMP43]], null
; AVX512-NEXT: br i1 [[CMP3]], label [[FOR_INC]], label [[IF_THEN:%.*]]
; AVX512: if.then:
; AVX512-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds double, ptr [[OUT]], i64 [[INDVARS_IV]]
; AVX512-NEXT: store double 5.000000e-01, ptr [[ARRAYIDX5]], align 8
; AVX512-NEXT: br label [[FOR_INC]]
; AVX512: for.inc:
; AVX512-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; AVX512-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
; AVX512-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY1]], !llvm.loop [[LOOP55:![0-9]+]]
; AVX512: for.end.loopexit:
; AVX512-NEXT: br label [[FOR_END]]
; AVX512: for.end:
; AVX512-NEXT: ret void
;
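; A C-level sketch of the scalar loop below, reconstructed from the IR; the
; exact source-level types are an assumption (the trigger value is tested
; through its low bit, matching the 'and i8 ..., 1' in the body):
;
;   void foo8(double *out, void **in, char *trigger, unsigned size) {
;     for (unsigned i = 0; i < size; i++)
;       if ((trigger[i] & 1) && in[i] != 0)  /* pointer load is masked, then null-checked */
;         out[i] = 0.5;
;   }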
entry:
  %cmp5 = icmp eq i32 %size, 0
  br i1 %cmp5, label %for.end, label %for.body.preheader

for.body.preheader: ; preds = %entry
  %wide.trip.count = zext i32 %size to i64
  br label %for.body

for.body: ; preds = %for.inc, %for.body.preheader
  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.inc ]
  %arrayidx = getelementptr inbounds i8, ptr %trigger, i64 %indvars.iv
  %0 = load i8, ptr %arrayidx, align 1
  %1 = and i8 %0, 1
  %tobool = icmp eq i8 %1, 0
  br i1 %tobool, label %for.inc, label %land.lhs.true

land.lhs.true: ; preds = %for.body
  %arrayidx2 = getelementptr inbounds ptr, ptr %in, i64 %indvars.iv
  %2 = load ptr, ptr %arrayidx2, align 8
  %cmp3 = icmp eq ptr %2, null
  br i1 %cmp3, label %for.inc, label %if.then

if.then: ; preds = %land.lhs.true
  %arrayidx5 = getelementptr inbounds double, ptr %out, i64 %indvars.iv
  store double 5.000000e-01, ptr %arrayidx5, align 8
  br label %for.inc

for.inc: ; preds = %land.lhs.true, %for.body, %if.then
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count
  br i1 %exitcond, label %for.end, label %for.body

for.end: ; preds = %for.inc, %entry
  ret void
}

attributes #0 = { norecurse nounwind }