; RUN: opt -mcpu=skx -S -loop-vectorize -instcombine -simplifycfg -force-vector-width=8 -force-vector-interleave=1 -enable-interleaved-mem-accesses < %s | FileCheck %s -check-prefix=DISABLED_MASKED_STRIDED
; RUN: opt -mcpu=skx -S -loop-vectorize -instcombine -simplifycfg -force-vector-width=8 -force-vector-interleave=1 -enable-interleaved-mem-accesses -enable-masked-interleaved-mem-accesses < %s | FileCheck %s -check-prefix=ENABLED_MASKED_STRIDED

target datalayout = "e-m:e-p:32:32-f64:32:64-f80:32-n8:16:32-S128"
target triple = "i386-unknown-linux-gnu"

; When masked-interleaved-groups are disabled:
; Check that the predicated load is not vectorized as an
; interleaved-group but rather as scalarized accesses.
; (For SKX, Gather is not supported by the compiler for chars, therefore
; the only remaining alternative is to scalarize).
; In this case a scalar epilogue is not needed.
;
; When masked-interleave-group is enabled we expect to find the proper mask
; shuffling code, feeding the wide masked load for an interleave-group (with
; a single member).
; Since the last (second) member of the load-group is a gap, peeling is used,
; so we also expect to find a scalar epilogue loop.
;
; void masked_strided1(const unsigned char* restrict p,
;                      unsigned char* restrict q,
;                      unsigned char guard) {
;   for(ix=0; ix < 1024; ++ix) {
;     if (ix > guard) {
;       char t = p[2*ix];
;       q[ix] = t;
;     }
;   }
; }

;DISABLED_MASKED_STRIDED-LABEL: @masked_strided1(
;DISABLED_MASKED_STRIDED: vector.body:
;DISABLED_MASKED_STRIDED-NEXT: %index = phi i32
;DISABLED_MASKED_STRIDED-NEXT: %[[VECIND:.+]] = phi <8 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
;DISABLED_MASKED_STRIDED-NOT: %interleaved.mask =
;DISABLED_MASKED_STRIDED-NOT: call <16 x i8> @llvm.masked.load.
;DISABLED_MASKED_STRIDED-NOT: %{{.*}} = shufflevector <16 x i8> %{{.*}}, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
;DISABLED_MASKED_STRIDED: %[[VMASK:.+]] = icmp ugt <8 x i32> %[[VECIND]], %{{broadcast.splat*}}
;DISABLED_MASKED_STRIDED-NEXT: %{{.*}} = shl nuw nsw <8 x i32> %[[VECIND]], <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
;DISABLED_MASKED_STRIDED-NEXT: %[[M:.+]] = extractelement <8 x i1> %[[VMASK]], i32 0
;DISABLED_MASKED_STRIDED-NEXT: br i1 %[[M]], label %pred.load.if, label %pred.load.continue
;DISABLED_MASKED_STRIDED-NOT: %interleaved.mask =
;DISABLED_MASKED_STRIDED-NOT: call <16 x i8> @llvm.masked.load.
;DISABLED_MASKED_STRIDED-NOT: %{{.*}} = shufflevector <16 x i8> %{{.*}}, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
;DISABLED_MASKED_STRIDED-NOT: for.body:
;DISABLED_MASKED_STRIDED: for.end:

;ENABLED_MASKED_STRIDED-LABEL: @masked_strided1(
;ENABLED_MASKED_STRIDED: vector.body:
;ENABLED_MASKED_STRIDED-NEXT: %index = phi i32
;ENABLED_MASKED_STRIDED-NEXT: %[[VECIND:.+]] = phi <8 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
;ENABLED_MASKED_STRIDED: %[[VMASK:.+]] = icmp ugt <8 x i32> %[[VECIND]], %{{broadcast.splat*}}
;ENABLED_MASKED_STRIDED: %interleaved.mask = shufflevector <8 x i1> %[[VMASK]], <8 x i1> undef, <16 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7>
;ENABLED_MASKED_STRIDED-NEXT: %[[WIDEMASKEDLOAD:.+]] = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %{{.*}}, i32 1, <16 x i1> %interleaved.mask, <16 x i8> undef)
;ENABLED_MASKED_STRIDED-NEXT: %[[STRIDEDVEC:.+]] = shufflevector <16 x i8> %[[WIDEMASKEDLOAD]], <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
;ENABLED_MASKED_STRIDED: for.body:

define dso_local void @masked_strided1(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i8 zeroext %guard) local_unnamed_addr {
entry:
  %conv = zext i8 %guard to i32
  br label %for.body

for.body:
  %ix.09 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
  %cmp1 = icmp ugt i32 %ix.09, %conv
  br i1 %cmp1, label %if.then, label %for.inc

if.then:
  %mul = shl nuw nsw i32 %ix.09, 1
  %arrayidx = getelementptr inbounds i8, i8* %p, i32 %mul
  %0 = load i8, i8* %arrayidx, align 1
  %arrayidx3 = getelementptr inbounds i8, i8* %q, i32 %ix.09
  store i8 %0, i8* %arrayidx3, align 1
  br label %for.inc

for.inc:
  %inc = add nuw nsw i32 %ix.09, 1
  %exitcond = icmp eq i32 %inc, 1024
  br i1 %exitcond, label %for.end, label %for.body

for.end:
  ret void
}

; Exactly the same scenario, except now we are optimizing for size; therefore
; we check that no scalar epilogue is created. Since we can't create an
; epilogue, we need the ability to mask out the gaps.
; When enable-masked-interleaved-access is enabled, the interleave-groups will
; be vectorized with masked wide-loads, with the mask properly shuffled and
; And-ed with the gaps mask.
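;
; A sketch of the corresponding source (an assumption, not part of the original
; test, but it matches the IR of @masked_strided1_optsize below): the same loop
; as masked_strided1, with the function compiled under optsize:
;
; void masked_strided1_optsize(const unsigned char* restrict p,
;                              unsigned char* restrict q,
;                              unsigned char guard) {
;   for(ix=0; ix < 1024; ++ix) {
;     if (ix > guard) {
;       char t = p[2*ix];
;       q[ix] = t;
;     }
;   }
; }
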
;ENABLED_MASKED_STRIDED-LABEL: @masked_strided1_optsize(
;ENABLED_MASKED_STRIDED-NEXT: entry:
;ENABLED_MASKED_STRIDED-NEXT: [[CONV:%.*]] = zext i8 [[GUARD:%.*]] to i32
;ENABLED_MASKED_STRIDED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i32> undef, i32 [[CONV]], i32 0
;ENABLED_MASKED_STRIDED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT]], <8 x i32> undef, <8 x i32> zeroinitializer
;ENABLED_MASKED_STRIDED-NEXT: br label [[VECTOR_BODY:%.*]]
;ENABLED_MASKED_STRIDED: vector.body:
;ENABLED_MASKED_STRIDED-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
;ENABLED_MASKED_STRIDED-NEXT: [[VEC_IND:%.*]] = phi <8 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>, [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
;ENABLED_MASKED_STRIDED-NEXT: [[TMP0:%.*]] = icmp ugt <8 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
;ENABLED_MASKED_STRIDED-NEXT: [[TMP1:%.*]] = shl nuw nsw i32 [[INDEX]], 1
;ENABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, i8* [[P:%.*]], i32 [[TMP1]]
;ENABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = bitcast i8* [[TMP2]] to <16 x i8>*
;ENABLED_MASKED_STRIDED-NEXT: [[INTERLEAVED_MASK:%.*]] = shufflevector <8 x i1> [[TMP0]], <8 x i1> undef, <16 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7>
;ENABLED_MASKED_STRIDED-NEXT: [[TMP4:%.*]] = and <16 x i1> [[INTERLEAVED_MASK]], <i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false>
;ENABLED_MASKED_STRIDED-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* [[TMP3]], i32 1, <16 x i1> [[TMP4]], <16 x i8> undef)
;ENABLED_MASKED_STRIDED-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <16 x i8> [[WIDE_MASKED_VEC]], <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
;ENABLED_MASKED_STRIDED-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, i8* [[Q:%.*]], i32 [[INDEX]]
;ENABLED_MASKED_STRIDED-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to <8 x i8>*
;ENABLED_MASKED_STRIDED-NEXT: call void @llvm.masked.store.v8i8.p0v8i8(<8 x i8> [[STRIDED_VEC]], <8 x i8>* [[TMP6]], i32 1, <8 x i1> [[TMP0]])
;ENABLED_MASKED_STRIDED-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 8
;ENABLED_MASKED_STRIDED-NEXT: [[VEC_IND_NEXT]] = add <8 x i32> [[VEC_IND]], <i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
;ENABLED_MASKED_STRIDED-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1024
;ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP7]]
;ENABLED_MASKED_STRIDED-NOT: for.body:
;ENABLED_MASKED_STRIDED: for.end:
;ENABLED_MASKED_STRIDED-NEXT: ret void

define dso_local void @masked_strided1_optsize(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i8 zeroext %guard) local_unnamed_addr optsize {
entry:
  %conv = zext i8 %guard to i32
  br label %for.body

for.body:
  %ix.09 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
  %cmp1 = icmp ugt i32 %ix.09, %conv
  br i1 %cmp1, label %if.then, label %for.inc

if.then:
  %mul = shl nuw nsw i32 %ix.09, 1
  %arrayidx = getelementptr inbounds i8, i8* %p, i32 %mul
  %0 = load i8, i8* %arrayidx, align 1
  %arrayidx3 = getelementptr inbounds i8, i8* %q, i32 %ix.09
  store i8 %0, i8* %arrayidx3, align 1
  br label %for.inc

for.inc:
  %inc = add nuw nsw i32 %ix.09, 1
  %exitcond = icmp eq i32 %inc, 1024
  br i1 %exitcond, label %for.end, label %for.body

for.end:
  ret void
}

; Accesses with gaps under Optsize scenario again, with unknown trip-count
; this time, in order to check the behavior of folding-the-tail (folding the
; remainder loop into the main loop using masking) together with
; interleaved-groups.
; When masked-interleave-group is disabled the interleave-groups will be
; invalidated during Legality checks, so there we check for no epilogue
; and for scalarized conditional accesses.
; When masked-interleave-group is enabled we check that there is no epilogue,
; and that the interleave-groups are vectorized using proper masking (with
; shuffling of the mask feeding the wide masked load/store).
; The mask itself is an And of two masks: one that masks away the remainder
; iterations, and one that masks away the 'else' of the 'if' statement.
; The shuffled mask is also And-ed with the gaps mask.
;
; void masked_strided1_optsize_unknown_tc(const unsigned char* restrict p,
;                                         unsigned char* restrict q,
;                                         unsigned char guard,
;                                         int n) {
;   for(ix=0; ix < n; ++ix) {
;     if (ix > guard) {
;       char t = p[2*ix];
;       q[ix] = t;
;     }
;   }
; }
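;
; As a worked illustration of the scheme checked in the ENABLED_MASKED_STRIDED
; run below (not from the original test, just spelling out the CHECK lines):
; with VF=8 and stride 2, the 8-lane predicate is the And of the 'if' mask and
; the remainder mask,
;   m = (vec.ind > guard) & (vec.ind <= trip.count - 1)
; it is then replicated per group member,
;   interleaved.mask = <m0,m0,m1,m1,...,m7,m7>
; and finally And-ed with the gaps mask <1,0,1,0,...,1,0>, so only the first
; (real) member of each pair can be read by the wide masked load.
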
; DISABLED_MASKED_STRIDED-LABEL: @masked_strided1_optsize_unknown_tc(
; DISABLED_MASKED_STRIDED: vector.body:
; DISABLED_MASKED_STRIDED-NEXT: [[INDEX:%.*]] = phi i32
; DISABLED_MASKED_STRIDED-NEXT: [[VEC_IND:%.*]] = phi <8 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; DISABLED_MASKED_STRIDED-NEXT: [[TMP0:%.*]] = icmp ugt <8 x i32> [[VEC_IND]], {{.*}}
; DISABLED_MASKED_STRIDED-NEXT: [[TMP1:%.*]] = shl nuw nsw <8 x i32> [[VEC_IND]], <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
; DISABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = icmp ule <8 x i32> [[VEC_IND]], {{.*}}
; DISABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = and <8 x i1> [[TMP0]], [[TMP2]]
; DISABLED_MASKED_STRIDED-NEXT: [[TMP4:%.*]] = extractelement <8 x i1> [[TMP3]], i32 0
; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP4]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
; DISABLED_MASKED_STRIDED: pred.load.if:
; DISABLED_MASKED_STRIDED-NEXT: [[TMP5:%.*]] = extractelement <8 x i32> [[TMP1]], i32 0
; DISABLED_MASKED_STRIDED-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, i8* [[P:%.*]], i32 [[TMP5]]
; DISABLED_MASKED_STRIDED-NEXT: [[TMP7:%.*]] = load i8, i8* [[TMP6]], align 1
; DISABLED_MASKED_STRIDED-NEXT: [[TMP8:%.*]] = insertelement <8 x i8> undef, i8 [[TMP7]], i32 0
; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_LOAD_CONTINUE]]
; DISABLED_MASKED_STRIDED-NOT: for.body:
; DISABLED_MASKED_STRIDED: for.end:
; DISABLED_MASKED_STRIDED-NEXT: ret void

; ENABLED_MASKED_STRIDED-LABEL: @masked_strided1_optsize_unknown_tc(
; ENABLED_MASKED_STRIDED-NEXT: entry:
; ENABLED_MASKED_STRIDED-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[N:%.*]], 0
; ENABLED_MASKED_STRIDED-NEXT: br i1 [[CMP9]], label [[VECTOR_PH:%.*]], label [[FOR_END:%.*]]
; ENABLED_MASKED_STRIDED: vector.ph:
; ENABLED_MASKED_STRIDED-NEXT: [[CONV:%.*]] = zext i8 [[GUARD:%.*]] to i32
; ENABLED_MASKED_STRIDED-NEXT: [[N_RND_UP:%.*]] = add i32 [[N]], 7
; ENABLED_MASKED_STRIDED-NEXT: [[N_VEC:%.*]] = and i32 [[N_RND_UP]], -8
; ENABLED_MASKED_STRIDED-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = add i32 [[N]], -1
; ENABLED_MASKED_STRIDED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i32> undef, i32 [[CONV]], i32 0
; ENABLED_MASKED_STRIDED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT]], <8 x i32> undef, <8 x i32> zeroinitializer
; ENABLED_MASKED_STRIDED-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <8 x i32> undef, i32 [[TRIP_COUNT_MINUS_1]], i32 0
; ENABLED_MASKED_STRIDED-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT1]], <8 x i32> undef, <8 x i32> zeroinitializer
; ENABLED_MASKED_STRIDED-NEXT: br label [[VECTOR_BODY:%.*]]
; ENABLED_MASKED_STRIDED: vector.body:
; ENABLED_MASKED_STRIDED-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; ENABLED_MASKED_STRIDED-NEXT: [[VEC_IND:%.*]] = phi <8 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; ENABLED_MASKED_STRIDED-NEXT: [[TMP0:%.*]] = icmp ugt <8 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
; ENABLED_MASKED_STRIDED-NEXT: [[TMP1:%.*]] = shl nuw nsw i32 [[INDEX]], 1
; ENABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, i8* [[P:%.*]], i32 [[TMP1]]
; ENABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = icmp ule <8 x i32> [[VEC_IND]], [[BROADCAST_SPLAT2]]
; ENABLED_MASKED_STRIDED-NEXT: [[TMP4:%.*]] = and <8 x i1> [[TMP0]], [[TMP3]]
; ENABLED_MASKED_STRIDED-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to <16 x i8>*
; ENABLED_MASKED_STRIDED-NEXT: [[INTERLEAVED_MASK:%.*]] = shufflevector <8 x i1> [[TMP4]], <8 x i1> undef, <16 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7>
; ENABLED_MASKED_STRIDED-NEXT: [[TMP6:%.*]] = and <16 x i1> [[INTERLEAVED_MASK]], <i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false>
; ENABLED_MASKED_STRIDED-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* [[TMP5]], i32 1, <16 x i1> [[TMP6]], <16 x i8> undef)
; ENABLED_MASKED_STRIDED-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <16 x i8> [[WIDE_MASKED_VEC]], <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
; ENABLED_MASKED_STRIDED-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, i8* [[Q:%.*]], i32 [[INDEX]]
; ENABLED_MASKED_STRIDED-NEXT: [[TMP8:%.*]] = bitcast i8* [[TMP7]] to <8 x i8>*
; ENABLED_MASKED_STRIDED-NEXT: call void @llvm.masked.store.v8i8.p0v8i8(<8 x i8> [[STRIDED_VEC]], <8 x i8>* [[TMP8]], i32 1, <8 x i1> [[TMP4]])
; ENABLED_MASKED_STRIDED-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 8
; ENABLED_MASKED_STRIDED-NEXT: [[VEC_IND_NEXT]] = add <8 x i32> [[VEC_IND]], <i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
; ENABLED_MASKED_STRIDED-NEXT: [[TMP9:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP9]], label [[FOR_END]], label [[VECTOR_BODY]]
; ENABLED_MASKED_STRIDED-NOT: for.body:
; ENABLED_MASKED_STRIDED: for.end:
; ENABLED_MASKED_STRIDED-NEXT: ret void

define dso_local void @masked_strided1_optsize_unknown_tc(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i8 zeroext %guard, i32 %n) local_unnamed_addr optsize {
entry:
  %cmp9 = icmp sgt i32 %n, 0
  br i1 %cmp9, label %for.body.lr.ph, label %for.end

for.body.lr.ph:
  %conv = zext i8 %guard to i32
  br label %for.body

for.body:
  %ix.010 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.inc ]
  %cmp1 = icmp ugt i32 %ix.010, %conv
  br i1 %cmp1, label %if.then, label %for.inc

if.then:
  %mul = shl nuw nsw i32 %ix.010, 1
  %arrayidx = getelementptr inbounds i8, i8* %p, i32 %mul
  %0 = load i8, i8* %arrayidx, align 1
  %arrayidx3 = getelementptr inbounds i8, i8* %q, i32 %ix.010
  store i8 %0, i8* %arrayidx3, align 1
  br label %for.inc

for.inc:
  %inc = add nuw nsw i32 %ix.010, 1
  %exitcond = icmp eq i32 %inc, %n
  br i1 %exitcond, label %for.end.loopexit, label %for.body

for.end.loopexit:
  br label %for.end

for.end:
  ret void
}

; Same, with stride 3. This is to check the gaps-mask and the shuffled mask
; with a different stride.
; So accesses are with gaps under Optsize scenario again, with unknown trip-
; count, in order to check the behavior of folding-the-tail (folding the
; remainder loop into the main loop using masking) together with
; interleaved-groups.
; When masked-interleave-group is enabled we check that there is no epilogue,
; and that the interleave-groups are vectorized using proper masking (with
; shuffling of the mask feeding the wide masked load/store).
; The mask itself is an And of two masks: one that masks away the remainder
; iterations, and one that masks away the 'else' of the 'if' statement.
; The shuffled mask is also And-ed with the gaps mask.
;
; void masked_strided3_optsize_unknown_tc(const unsigned char* restrict p,
;                                         unsigned char* restrict q,
;                                         unsigned char guard,
;                                         int n) {
;   for(ix=0; ix < n; ++ix) {
;     if (ix > guard) {
;       char t = p[3*ix];
;       q[ix] = t;
;     }
;   }
; }

; ENABLED_MASKED_STRIDED-LABEL: @masked_strided3_optsize_unknown_tc(
; ENABLED_MASKED_STRIDED-NEXT: entry:
; ENABLED_MASKED_STRIDED-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[N:%.*]], 0
; ENABLED_MASKED_STRIDED-NEXT: br i1 [[CMP9]], label [[VECTOR_PH:%.*]], label [[FOR_END:%.*]]
; ENABLED_MASKED_STRIDED: vector.ph:
; ENABLED_MASKED_STRIDED-NEXT: [[CONV:%.*]] = zext i8 [[GUARD:%.*]] to i32
; ENABLED_MASKED_STRIDED-NEXT: [[N_RND_UP:%.*]] = add i32 [[N]], 7
; ENABLED_MASKED_STRIDED-NEXT: [[N_VEC:%.*]] = and i32 [[N_RND_UP]], -8
; ENABLED_MASKED_STRIDED-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = add i32 [[N]], -1
; ENABLED_MASKED_STRIDED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i32> undef, i32 [[CONV]], i32 0
; ENABLED_MASKED_STRIDED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT]], <8 x i32> undef, <8 x i32> zeroinitializer
; ENABLED_MASKED_STRIDED-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <8 x i32> undef, i32 [[TRIP_COUNT_MINUS_1]], i32 0
; ENABLED_MASKED_STRIDED-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT1]], <8 x i32> undef, <8 x i32> zeroinitializer
; ENABLED_MASKED_STRIDED-NEXT: br label [[VECTOR_BODY:%.*]]
; ENABLED_MASKED_STRIDED: vector.body:
; ENABLED_MASKED_STRIDED-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; ENABLED_MASKED_STRIDED-NEXT: [[VEC_IND:%.*]] = phi <8 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; ENABLED_MASKED_STRIDED-NEXT: [[TMP0:%.*]] = icmp ugt <8 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
; ENABLED_MASKED_STRIDED-NEXT: [[TMP1:%.*]] = mul nsw i32 [[INDEX]], 3
; ENABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, i8* [[P:%.*]], i32 [[TMP1]]
; ENABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = icmp ule <8 x i32> [[VEC_IND]], [[BROADCAST_SPLAT2]]
; ENABLED_MASKED_STRIDED-NEXT: [[TMP4:%.*]] = and <8 x i1> [[TMP0]], [[TMP3]]
; ENABLED_MASKED_STRIDED-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to <24 x i8>*
; ENABLED_MASKED_STRIDED-NEXT: [[INTERLEAVED_MASK:%.*]] = shufflevector <8 x i1> [[TMP4]], <8 x i1> undef, <24 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7>
; ENABLED_MASKED_STRIDED-NEXT: [[TMP6:%.*]] = and <24 x i1> [[INTERLEAVED_MASK]], <i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false>
; ENABLED_MASKED_STRIDED-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <24 x i8> @llvm.masked.load.v24i8.p0v24i8(<24 x i8>* [[TMP5]], i32 1, <24 x i1> [[TMP6]], <24 x i8> undef)
; ENABLED_MASKED_STRIDED-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <24 x i8> [[WIDE_MASKED_VEC]], <24 x i8> undef, <8 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21>
; ENABLED_MASKED_STRIDED-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, i8* [[Q:%.*]], i32 [[INDEX]]
; ENABLED_MASKED_STRIDED-NEXT: [[TMP8:%.*]] = bitcast i8* [[TMP7]] to <8 x i8>*
; ENABLED_MASKED_STRIDED-NEXT: call void @llvm.masked.store.v8i8.p0v8i8(<8 x i8> [[STRIDED_VEC]], <8 x i8>* [[TMP8]], i32 1, <8 x i1> [[TMP4]])
; ENABLED_MASKED_STRIDED-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 8
; ENABLED_MASKED_STRIDED-NEXT: [[VEC_IND_NEXT]] = add <8 x i32> [[VEC_IND]], <i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
; ENABLED_MASKED_STRIDED-NEXT: [[TMP9:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP9]], label [[FOR_END]], label [[VECTOR_BODY]]
; ENABLED_MASKED_STRIDED: for.end:
; ENABLED_MASKED_STRIDED-NEXT: ret void

define dso_local void @masked_strided3_optsize_unknown_tc(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i8 zeroext %guard, i32 %n) local_unnamed_addr optsize {
entry:
  %cmp9 = icmp sgt i32 %n, 0
  br i1 %cmp9, label %for.body.lr.ph, label %for.end

for.body.lr.ph:
  %conv = zext i8 %guard to i32
  br label %for.body

for.body:
  %ix.010 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.inc ]
  %cmp1 = icmp ugt i32 %ix.010, %conv
  br i1 %cmp1, label %if.then, label %for.inc

if.then:
  %mul = mul nsw i32 %ix.010, 3
  %arrayidx = getelementptr inbounds i8, i8* %p, i32 %mul
  %0 = load i8, i8* %arrayidx, align 1
  %arrayidx3 = getelementptr inbounds i8, i8* %q, i32 %ix.010
  store i8 %0, i8* %arrayidx3, align 1
  br label %for.inc

for.inc:
  %inc = add nuw nsw i32 %ix.010, 1
  %exitcond = icmp eq i32 %inc, %n
  br i1 %exitcond, label %for.end.loopexit, label %for.body

for.end.loopexit:
  br label %for.end

for.end:
  ret void
}

; Back to stride 2 with gaps, with a known trip count, under opt for size,
; but this time the load/store are not predicated.
; When enable-masked-interleaved-access is disabled, the interleave-groups will
; be invalidated during cost-model checks because we have gaps and we can't
; create an epilogue. The access is thus scalarized.
; (Before the fix that this test checks, we used to create an epilogue despite
; optsize, and vectorized the access as an interleaved-group. This is now fixed,
; and we make sure that a scalar epilogue does not exist.)
; When enable-masked-interleaved-access is enabled, the interleave-groups will
; be vectorized with masked wide-loads (masking away the gaps).
;
; void unconditional_strided1_optsize(const unsigned char* restrict p,
;                                     unsigned char* restrict q,
;                                     unsigned char guard) {
;   for(ix=0; ix < 1024; ++ix) {
;     char t = p[2*ix];
;     q[ix] = t;
;   }
; }

;DISABLED_MASKED_STRIDED-LABEL: @unconditional_strided1_optsize(
;DISABLED_MASKED_STRIDED: vector.body:
;DISABLED_MASKED_STRIDED-NOT: call <16 x i8> @llvm.masked.load.v16i8.p0v16i8
;DISABLED_MASKED_STRIDED: %{{.*}} = extractelement <8 x i32> %{{.*}}, i32 0
;DISABLED_MASKED_STRIDED-NOT: for.body:
;DISABLED_MASKED_STRIDED: for.end:

;ENABLED_MASKED_STRIDED-LABEL: @unconditional_strided1_optsize(
;ENABLED_MASKED_STRIDED-NEXT: entry:
;ENABLED_MASKED_STRIDED-NEXT: br label [[VECTOR_BODY:%.*]]
;ENABLED_MASKED_STRIDED: vector.body:
;ENABLED_MASKED_STRIDED-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
;ENABLED_MASKED_STRIDED-NEXT: [[TMP0:%.*]] = shl nuw nsw i32 [[INDEX]], 1
;ENABLED_MASKED_STRIDED-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8* [[P:%.*]], i32 [[TMP0]]
;ENABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <16 x i8>*
;ENABLED_MASKED_STRIDED-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* [[TMP2]], i32 1, <16 x i1> <i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false>, <16 x i8> undef)
;ENABLED_MASKED_STRIDED-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <16 x i8> [[WIDE_MASKED_VEC]], <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
;ENABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, i8* [[Q:%.*]], i32 [[INDEX]]
;ENABLED_MASKED_STRIDED-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to <8 x i8>*
;ENABLED_MASKED_STRIDED-NEXT: store <8 x i8> [[STRIDED_VEC]], <8 x i8>* [[TMP4]], align 1
;ENABLED_MASKED_STRIDED-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 8
;ENABLED_MASKED_STRIDED-NEXT: [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1024
;ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP5]], label [[FOR_END:%.*]], label [[VECTOR_BODY]]
;ENABLED_MASKED_STRIDED-NOT: for.body:
;ENABLED_MASKED_STRIDED: for.end:
;ENABLED_MASKED_STRIDED-NEXT: ret void

define dso_local void @unconditional_strided1_optsize(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i8 zeroext %guard) local_unnamed_addr optsize {
entry:
  br label %for.body

for.body:
  %ix.06 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
  %mul = shl nuw nsw i32 %ix.06, 1
  %arrayidx = getelementptr inbounds i8, i8* %p, i32 %mul
  %0 = load i8, i8* %arrayidx, align 1
  %arrayidx1 = getelementptr inbounds i8, i8* %q, i32 %ix.06
  store i8 %0, i8* %arrayidx1, align 1
  %inc = add nuw nsw i32 %ix.06, 1
  %exitcond = icmp eq i32 %inc, 1024
  br i1 %exitcond, label %for.end, label %for.body

for.end:
  ret void
}

; Unconditional accesses with gaps under Optsize scenario again, with unknown
; trip-count this time, in order to check the behavior of folding-the-tail
; (folding the remainder loop into the main loop using masking) together with
; interleaved-groups. Folding-the-tail turns the accesses into conditional
; accesses, which requires proper masking. In addition we need to mask out
; the gaps (all because we are not allowed to use an epilogue due to optsize).
; When enable-masked-interleaved-access is disabled, the interleave-groups will
; be invalidated during cost-model checks. So there we check for no epilogue
; and for scalarized conditional accesses.
; When masked-interleave-group is enabled we check that there is no epilogue,
; and that the interleave-groups are vectorized using proper masking (with
; shuffling of the mask feeding the wide masked load/store).
; The shuffled mask is also And-ed with the gaps mask.
;
; void unconditional_strided1_optsize_unknown_tc(const unsigned char* restrict p,
;                                                unsigned char* restrict q,
;                                                int n) {
;   for(ix=0; ix < n; ++ix) {
;     char t = p[2*ix];
;     q[ix] = t;
;   }
; }

; DISABLED_MASKED_STRIDED-LABEL: @unconditional_strided1_optsize_unknown_tc(
; DISABLED_MASKED_STRIDED: vector.body:
; DISABLED_MASKED_STRIDED-NEXT: [[INDEX:%.*]] = phi i32
; DISABLED_MASKED_STRIDED-NEXT: [[VEC_IND:%.*]] = phi <8 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; DISABLED_MASKED_STRIDED-NEXT: [[TMP0:%.*]] = shl nuw nsw <8 x i32> [[VEC_IND]], <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
; DISABLED_MASKED_STRIDED-NEXT: [[TMP1:%.*]] = icmp ule <8 x i32> [[VEC_IND]], {{.*}}
; DISABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = extractelement <8 x i1> [[TMP1]], i32 0
; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP2]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
; DISABLED_MASKED_STRIDED: pred.load.if:
; DISABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = extractelement <8 x i32> [[TMP0]], i32 0
; DISABLED_MASKED_STRIDED-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, i8* [[P:%.*]], i32 [[TMP3]]
; DISABLED_MASKED_STRIDED-NEXT: [[TMP5:%.*]] = load i8, i8* [[TMP4]], align 1
; DISABLED_MASKED_STRIDED-NEXT: [[TMP6:%.*]] = insertelement <8 x i8> undef, i8 [[TMP5]], i32 0
; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_LOAD_CONTINUE]]
; DISABLED_MASKED_STRIDED-NOT: for.body:
; DISABLED_MASKED_STRIDED: for.end:
; DISABLED_MASKED_STRIDED-NEXT: ret void

; ENABLED_MASKED_STRIDED-LABEL: @unconditional_strided1_optsize_unknown_tc(
; ENABLED_MASKED_STRIDED-NEXT: entry:
; ENABLED_MASKED_STRIDED-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[N:%.*]], 0
; ENABLED_MASKED_STRIDED-NEXT: br i1 [[CMP6]], label [[VECTOR_PH:%.*]], label [[FOR_END:%.*]]
; ENABLED_MASKED_STRIDED: vector.ph:
; ENABLED_MASKED_STRIDED-NEXT: [[N_RND_UP:%.*]] = add i32 [[N]], 7
; ENABLED_MASKED_STRIDED-NEXT: [[N_VEC:%.*]] = and i32 [[N_RND_UP]], -8
; ENABLED_MASKED_STRIDED-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = add i32 [[N]], -1
; ENABLED_MASKED_STRIDED-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <8 x i32> undef, i32 [[TRIP_COUNT_MINUS_1]], i32 0
; ENABLED_MASKED_STRIDED-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT1]], <8 x i32> undef, <8 x i32> zeroinitializer
; ENABLED_MASKED_STRIDED-NEXT: br label [[VECTOR_BODY:%.*]]
; ENABLED_MASKED_STRIDED: vector.body:
; ENABLED_MASKED_STRIDED-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; ENABLED_MASKED_STRIDED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i32> undef, i32 [[INDEX]], i32 0
; ENABLED_MASKED_STRIDED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT]], <8 x i32> undef, <8 x i32> zeroinitializer
; ENABLED_MASKED_STRIDED-NEXT: [[INDUCTION:%.*]] = add <8 x i32> [[BROADCAST_SPLAT]], <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; ENABLED_MASKED_STRIDED-NEXT: [[TMP0:%.*]] = shl nuw nsw i32 [[INDEX]], 1
; ENABLED_MASKED_STRIDED-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8* [[P:%.*]], i32 [[TMP0]]
; ENABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = icmp ule <8 x i32> [[INDUCTION]], [[BROADCAST_SPLAT2]]
; ENABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = bitcast i8* [[TMP1]] to <16 x i8>*
; ENABLED_MASKED_STRIDED-NEXT: [[INTERLEAVED_MASK:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> undef, <16 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7>
; ENABLED_MASKED_STRIDED-NEXT: [[TMP4:%.*]] = and <16 x i1> [[INTERLEAVED_MASK]], <i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false>
; ENABLED_MASKED_STRIDED-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* [[TMP3]], i32 1, <16 x i1> [[TMP4]], <16 x i8> undef)
; ENABLED_MASKED_STRIDED-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <16 x i8> [[WIDE_MASKED_VEC]], <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
; ENABLED_MASKED_STRIDED-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, i8* [[Q:%.*]], i32 [[INDEX]]
; ENABLED_MASKED_STRIDED-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to <8 x i8>*
; ENABLED_MASKED_STRIDED-NEXT: call void @llvm.masked.store.v8i8.p0v8i8(<8 x i8> [[STRIDED_VEC]], <8 x i8>* [[TMP6]], i32 1, <8 x i1> [[TMP2]])
; ENABLED_MASKED_STRIDED-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 8
; ENABLED_MASKED_STRIDED-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP7]], label [[FOR_END]], label [[VECTOR_BODY]]
; ENABLED_MASKED_STRIDED-NOT: for.body:
; ENABLED_MASKED_STRIDED: for.end:
; ENABLED_MASKED_STRIDED-NEXT: ret void

define dso_local void @unconditional_strided1_optsize_unknown_tc(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i32 %n) local_unnamed_addr optsize {
entry:
  %cmp6 = icmp sgt i32 %n, 0
  br i1 %cmp6, label %for.body.preheader, label %for.end

for.body.preheader:
  br label %for.body

for.body:
  %ix.07 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
  %mul = shl nuw nsw i32 %ix.07, 1
  %arrayidx = getelementptr inbounds i8, i8* %p, i32 %mul
  %0 = load i8, i8* %arrayidx, align 1
  %arrayidx1 = getelementptr inbounds i8, i8* %q, i32 %ix.07
  store i8 %0, i8* %arrayidx1, align 1
  %inc = add nuw nsw i32 %ix.07, 1
  %exitcond = icmp eq i32 %inc, %n
  br i1 %exitcond, label %for.end.loopexit, label %for.body

for.end.loopexit:
  br label %for.end

for.end:
  ret void
}

; Check also a scenario with full interleave-groups (no gaps) as well as both
; load and store groups. We check that when masked-interleave-group is disabled
; the predicated loads (and stores) are not vectorized as an
; interleaved-group but rather as four separate scalarized accesses.
; (For SKX, gather/scatter is not supported by the compiler for chars, therefore
; the only remaining alternative is to scalarize).
; When masked-interleave-group is enabled we expect to find the proper mask
; shuffling code, feeding the wide masked load/store for the two
; interleave-groups.
;
; void masked_strided2(const unsigned char* restrict p,
;                      unsigned char* restrict q,
;                      unsigned char guard) {
;   for(ix=0; ix < 1024; ++ix) {
;     if (ix > guard) {
;       char left = p[2*ix];
;       char right = p[2*ix + 1];
;       char max = max(left, right);
;       q[2*ix] = max;
;       q[2*ix+1] = 0 - max;
;     }
;   }
; }

;DISABLED_MASKED_STRIDED-LABEL: @masked_strided2(
;DISABLED_MASKED_STRIDED: vector.body:
;DISABLED_MASKED_STRIDED-NEXT: %index = phi i32
;DISABLED_MASKED_STRIDED-NEXT: %[[VECIND:.+]] = phi <8 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
;DISABLED_MASKED_STRIDED-NOT: %interleaved.mask =
;DISABLED_MASKED_STRIDED-NOT: call <16 x i8> @llvm.masked.load.
;DISABLED_MASKED_STRIDED-NOT: call void @llvm.masked.store.
;DISABLED_MASKED_STRIDED-NOT: %{{.*}} = shufflevector <16 x i8> %{{.*}}, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
;DISABLED_MASKED_STRIDED: %[[VMASK:.+]] = icmp ugt <8 x i32> %[[VECIND]], %{{broadcast.splat*}}
;DISABLED_MASKED_STRIDED-NEXT: %{{.*}} = shl nuw nsw <8 x i32> %[[VECIND]], <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
;DISABLED_MASKED_STRIDED-NEXT: %[[M:.+]] = extractelement <8 x i1> %[[VMASK]], i32 0
;DISABLED_MASKED_STRIDED-NEXT: br i1 %[[M]], label %pred.load.if, label %pred.load.continue
;DISABLED_MASKED_STRIDED-NOT: %interleaved.mask =
;DISABLED_MASKED_STRIDED-NOT: call <16 x i8> @llvm.masked.load.
;DISABLED_MASKED_STRIDED-NOT: call void @llvm.masked.store.
;DISABLED_MASKED_STRIDED-NOT: %{{.*}} = shufflevector <16 x i8> %{{.*}}, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>

;ENABLED_MASKED_STRIDED-LABEL: @masked_strided2(
;ENABLED_MASKED_STRIDED: vector.body:
;ENABLED_MASKED_STRIDED-NEXT: %index = phi i32
;ENABLED_MASKED_STRIDED-NEXT: %[[VECIND:.+]] = phi <8 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
;ENABLED_MASKED_STRIDED: %[[VMASK:.+]] = icmp ugt <8 x i32> %[[VECIND]], %{{broadcast.splat*}}
;ENABLED_MASKED_STRIDED: %interleaved.mask = shufflevector <8 x i1> %[[VMASK]], <8 x i1> undef, <16 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7>
;ENABLED_MASKED_STRIDED-NEXT: %[[WIDEMASKEDLOAD:.+]] = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %{{.*}}, i32 1, <16 x i1> %interleaved.mask, <16 x i8> undef)
;ENABLED_MASKED_STRIDED-NEXT: %{{.*}} = shufflevector <16 x i8> %[[WIDEMASKEDLOAD]], <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
;ENABLED_MASKED_STRIDED-NEXT: %{{.*}} = shufflevector <16 x i8> %[[WIDEMASKEDLOAD]], <16 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
;ENABLED_MASKED_STRIDED: call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> %{{.*}}, <16 x i8>* %{{.*}}, i32 1, <16 x i1> %interleaved.mask)

; Function Attrs: norecurse nounwind
define dso_local void @masked_strided2(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i8 zeroext %guard) local_unnamed_addr {
entry:
  %conv = zext i8 %guard to i32
  br label %for.body

for.body:
  %ix.024 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
  %cmp1 = icmp ugt i32 %ix.024, %conv
  br i1 %cmp1, label %if.then, label %for.inc

if.then:
  %mul = shl nuw nsw i32 %ix.024, 1
  %arrayidx = getelementptr inbounds i8, i8* %p, i32 %mul
  %0 = load i8, i8* %arrayidx, align 1
  %add = or i32 %mul, 1
  %arrayidx4 = getelementptr inbounds i8, i8* %p, i32 %add
  %1 = load i8, i8* %arrayidx4, align 1
  %cmp.i = icmp slt i8 %0, %1
  %spec.select.i = select i1 %cmp.i, i8 %1, i8 %0
  %arrayidx6 = getelementptr inbounds i8, i8* %q, i32 %mul
  store i8 %spec.select.i, i8* %arrayidx6, align 1
  %sub = sub i8 0, %spec.select.i
  %arrayidx11 = getelementptr inbounds i8, i8* %q, i32 %add
  store i8 %sub, i8* %arrayidx11, align 1
  br label %for.inc

for.inc:
  %inc = add nuw nsw i32 %ix.024, 1
  %exitcond = icmp eq i32 %inc, 1024
  br i1 %exitcond, label %for.end, label %for.body

for.end:
  ret void
}

; Full groups again, this time checking an Optsize scenario, with unknown trip-
; count, to check the behavior of folding-the-tail (folding the remainder loop
; into the main loop using masking) together with interleaved-groups.
; When masked-interleave-group is disabled the interleave-groups will be
; invalidated during the Legality check, so there is nothing to check here.
; When masked-interleave-group is enabled we check that there is no epilogue,
; and that the interleave-groups are vectorized using proper masking (with
; shuffling of the mask feeding the wide masked load/store).
; The mask itself is an And of two masks: one that masks away the remainder
; iterations, and one that masks away the 'else' of the 'if' statement.
;
; void masked_strided2_unknown_tc(const unsigned char* restrict p,
;                                 unsigned char* restrict q,
;                                 unsigned char guard,
;                                 int n) {
;   for(ix=0; ix < n; ++ix) {
;     if (ix > guard) {
;       char left = p[2*ix];
;       char right = p[2*ix + 1];
;       char max = max(left, right);
;       q[2*ix] = max;
;       q[2*ix+1] = 0 - max;
;     }
;   }
; }

; ENABLED_MASKED_STRIDED-LABEL: @masked_strided2_unknown_tc(
; ENABLED_MASKED_STRIDED: vector.body:
; ENABLED_MASKED_STRIDED-NEXT: [[INDEX:%.*]] = phi i32
; ENABLED_MASKED_STRIDED-NEXT: [[VEC_IND:%.*]] = phi <8 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; ENABLED_MASKED_STRIDED-NEXT: [[TMP0:%.*]] = icmp sgt <8 x i32> [[VEC_IND]], {{.*}}
; ENABLED_MASKED_STRIDED-NEXT: [[TMP1:%.*]] = shl nuw nsw i32 [[INDEX]], 1
; ENABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, i8* [[P:%.*]], i32 [[TMP1]]
; ENABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = icmp ule <8 x i32> [[VEC_IND]], {{.*}}
; ENABLED_MASKED_STRIDED-NEXT: [[TMP4:%.*]] = and <8 x i1> [[TMP0]], [[TMP3]]
; ENABLED_MASKED_STRIDED-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to <16 x i8>*
; ENABLED_MASKED_STRIDED-NEXT: [[INTERLEAVED_MASK:%.*]] = shufflevector <8 x i1> [[TMP4]], <8 x i1> undef, <16 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7>
; ENABLED_MASKED_STRIDED-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* [[TMP5]], i32 1, <16 x i1> [[INTERLEAVED_MASK]], <16 x i8> undef)
; ENABLED_MASKED_STRIDED-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <16 x i8> [[WIDE_MASKED_VEC]], <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
; ENABLED_MASKED_STRIDED-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <16 x i8> [[WIDE_MASKED_VEC]], <16 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
; ENABLED_MASKED_STRIDED-NEXT: [[TMP6:%.*]] = or i32 [[TMP1]], 1
; ENABLED_MASKED_STRIDED-NEXT: [[TMP7:%.*]] = icmp slt <8 x i8> [[STRIDED_VEC]], [[STRIDED_VEC3]]
; ENABLED_MASKED_STRIDED-NEXT: [[TMP8:%.*]] = select <8 x i1> [[TMP7]], <8 x i8> [[STRIDED_VEC3]], <8 x i8> [[STRIDED_VEC]]
; ENABLED_MASKED_STRIDED-NEXT: [[TMP9:%.*]] = sub <8 x i8> zeroinitializer, [[TMP8]]
; ENABLED_MASKED_STRIDED-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, i8* [[Q:%.*]], i32 -1
; ENABLED_MASKED_STRIDED-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, i8* [[TMP10]], i32 [[TMP6]]
; ENABLED_MASKED_STRIDED-NEXT: [[TMP12:%.*]] = bitcast i8* [[TMP11]] to <16 x i8>*
; ENABLED_MASKED_STRIDED-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i8> [[TMP8]], <8 x i8> [[TMP9]], <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
; ENABLED_MASKED_STRIDED-NEXT: call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> [[INTERLEAVED_VEC]], <16 x i8>* [[TMP12]], i32 1, <16 x i1> [[INTERLEAVED_MASK]])
; ENABLED_MASKED_STRIDED-NEXT: {{.*}} = add i32 [[INDEX]], 8
; ENABLED_MASKED_STRIDED-NEXT: {{.*}} = add <8 x i32> {{.*}}, <i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
; ENABLED_MASKED_STRIDED-NEXT: [[TMP13:%.*]] = icmp eq i32 {{.*}}, {{.*}}
; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP13]],
; ENABLED_MASKED_STRIDED: for.end:
; ENABLED_MASKED_STRIDED-NEXT: ret void

define dso_local void @masked_strided2_unknown_tc(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i32 %guard, i32 %n) local_unnamed_addr optsize {
entry:
  %cmp22 = icmp sgt i32 %n, 0
  br i1 %cmp22, label %for.body.preheader, label %for.end

for.body.preheader:
  br label %for.body

for.body:
  %ix.023 = phi i32 [ %inc, %for.inc ], [ 0, %for.body.preheader ]
  %cmp1 = icmp sgt i32 %ix.023, %guard
  br i1 %cmp1, label %if.then, label %for.inc

if.then:
  %mul = shl nuw nsw i32 %ix.023, 1
  %arrayidx = getelementptr inbounds i8, i8* %p, i32 %mul
  %0 = load i8, i8* %arrayidx, align 1
  %add = or i32 %mul, 1
  %arrayidx3 = getelementptr inbounds i8, i8* %p, i32 %add
  %1 = load i8, i8* %arrayidx3, align 1
  %cmp.i = icmp slt i8 %0, %1
  %spec.select.i = select i1 %cmp.i, i8 %1, i8 %0
  %arrayidx5 = getelementptr inbounds i8, i8* %q, i32 %mul
  store i8 %spec.select.i, i8* %arrayidx5, align 1
  %sub = sub i8 0, %spec.select.i
  %arrayidx9 = getelementptr inbounds i8, i8* %q, i32 %add
  store i8 %sub, i8* %arrayidx9, align 1
  br label %for.inc

for.inc:
  %inc = add nuw nsw i32 %ix.023, 1
  %exitcond = icmp eq i32 %inc, %n
  br i1 %exitcond, label %for.end.loopexit, label %for.body

for.end.loopexit:
  br label %for.end

for.end:
  ret void
}

; Full groups under Optsize scenario again, with unknown trip-count, again in
; order to check the behavior of folding-the-tail (folding the remainder loop
; into the main loop using masking) together with interleaved-groups.
; This time the accesses are not conditional; they become conditional only
; due to tail folding.
; When masked-interleave-group is disabled the interleave-groups will be
; invalidated during cost-model checks, so we check for no epilogue and
; scalarized conditional accesses.
; When masked-interleave-group is enabled we check for no epilogue,
; and interleave-groups vectorized using proper masking (with
; shuffling of the mask feeding the wide masked load/store).
; (Same vectorization scheme as for the previous loop with conditional accesses,
; except here the mask only masks away the remainder iterations.)
;
; void unconditional_masked_strided2_unknown_tc(const unsigned char* restrict p,
;                                               unsigned char* restrict q,
;                                               int n) {
;   for(ix=0; ix < n; ++ix) {
;     char left = p[2*ix];
;     char right = p[2*ix + 1];
;     char max = max(left, right);
;     q[2*ix] = max;
;     q[2*ix+1] = 0 - max;
;   }
; }

; DISABLED_MASKED_STRIDED-LABEL: @unconditional_masked_strided2_unknown_tc(
; DISABLED_MASKED_STRIDED: vector.body:
; DISABLED_MASKED_STRIDED-NEXT: [[INDEX:%.*]] = phi i32
; DISABLED_MASKED_STRIDED-NEXT: [[VEC_IND:%.*]] = phi <8 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; DISABLED_MASKED_STRIDED-NEXT: [[TMP0:%.*]] = shl nuw nsw <8 x i32> [[VEC_IND]], <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
; DISABLED_MASKED_STRIDED-NEXT: [[TMP1:%.*]] = icmp ule <8 x i32> [[VEC_IND]], {{.*}}
; DISABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = extractelement <8 x i1> [[TMP1]], i32 0
; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP2]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
; DISABLED_MASKED_STRIDED: pred.load.if:
; DISABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = extractelement <8 x i32> [[TMP0]], i32 0
; DISABLED_MASKED_STRIDED-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, i8* [[P:%.*]], i32 [[TMP3]]
; DISABLED_MASKED_STRIDED-NEXT: [[TMP5:%.*]] = load i8, i8* [[TMP4]], align 1
; DISABLED_MASKED_STRIDED-NEXT: [[TMP6:%.*]] = insertelement <8 x i8> undef, i8 [[TMP5]], i32 0
; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_LOAD_CONTINUE]]
; DISABLED_MASKED_STRIDED-NOT: for.body:
; DISABLED_MASKED_STRIDED: for.end:
; DISABLED_MASKED_STRIDED-NEXT: ret void

; ENABLED_MASKED_STRIDED-LABEL: @unconditional_masked_strided2_unknown_tc(
; ENABLED_MASKED_STRIDED: vector.body:
; ENABLED_MASKED_STRIDED-NEXT: [[INDEX:%.*]] = phi i32
; ENABLED_MASKED_STRIDED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i32> undef, i32 [[INDEX]], i32 0
; ENABLED_MASKED_STRIDED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT]], <8 x i32> undef, <8 x i32> zeroinitializer
; ENABLED_MASKED_STRIDED-NEXT: [[INDUCTION:%.*]] = add <8 x i32> [[BROADCAST_SPLAT]], <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; ENABLED_MASKED_STRIDED-NEXT: [[TMP0:%.*]] = shl nuw nsw i32 [[INDEX]], 1
; ENABLED_MASKED_STRIDED-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8* [[P:%.*]], i32 [[TMP0]]
; ENABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = icmp ule <8 x i32> {{.*}}, {{.*}}
; ENABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = bitcast i8* [[TMP1]] to <16 x i8>*
; ENABLED_MASKED_STRIDED-NEXT: [[INTERLEAVED_MASK:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> undef, <16 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7>
; ENABLED_MASKED_STRIDED-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* [[TMP3]], i32 1, <16 x i1> [[INTERLEAVED_MASK]], <16 x i8> undef)
; ENABLED_MASKED_STRIDED-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <16 x i8> [[WIDE_MASKED_VEC]], <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
; ENABLED_MASKED_STRIDED-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <16 x i8> [[WIDE_MASKED_VEC]], <16 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
; ENABLED_MASKED_STRIDED-NEXT: [[TMP4:%.*]] = or i32 [[TMP0]], 1
; ENABLED_MASKED_STRIDED-NEXT: [[TMP5:%.*]] = icmp slt <8 x i8> [[STRIDED_VEC]], [[STRIDED_VEC3]]
; ENABLED_MASKED_STRIDED-NEXT: [[TMP6:%.*]] = select <8 x i1> [[TMP5]], <8 x i8> [[STRIDED_VEC3]], <8 x i8> [[STRIDED_VEC]]
; ENABLED_MASKED_STRIDED-NEXT: [[TMP7:%.*]] = sub <8 x i8> zeroinitializer, [[TMP6]]
; ENABLED_MASKED_STRIDED-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, i8* [[Q:%.*]], i32 -1
; ENABLED_MASKED_STRIDED-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, i8* [[TMP8]], i32 [[TMP4]]
; ENABLED_MASKED_STRIDED-NEXT: [[TMP10:%.*]] = bitcast i8* [[TMP9]] to <16 x i8>*
; ENABLED_MASKED_STRIDED-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i8> [[TMP6]], <8 x i8> [[TMP7]], <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
; ENABLED_MASKED_STRIDED-NEXT: call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> [[INTERLEAVED_VEC]], <16 x i8>* [[TMP10]], i32 1, <16 x i1> [[INTERLEAVED_MASK]])
; ENABLED_MASKED_STRIDED-NEXT: {{.*}} = add i32 [[INDEX]], 8
; ENABLED_MASKED_STRIDED-NEXT: [[TMP11:%.*]] = icmp eq i32 {{.*}}, {{.*}}
; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP11]]
; ENABLED_MASKED_STRIDED: for.end:
; ENABLED_MASKED_STRIDED-NEXT: ret void

define dso_local void @unconditional_masked_strided2_unknown_tc(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i32 %n) local_unnamed_addr optsize {
entry:
  %cmp20 = icmp sgt i32 %n, 0
  br i1 %cmp20, label %for.body.preheader, label %for.end

for.body.preheader:
  br label %for.body

for.body:
  %ix.021 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
  %mul = shl nuw nsw i32 %ix.021, 1
  %arrayidx = getelementptr inbounds i8, i8* %p, i32 %mul
  %0 = load i8, i8* %arrayidx, align 1
  %add = or i32 %mul, 1
  %arrayidx2 = getelementptr inbounds i8, i8* %p, i32 %add
  %1 = load i8, i8* %arrayidx2, align 1
  %cmp.i = icmp slt i8 %0, %1
  %spec.select.i = select i1 %cmp.i, i8 %1, i8 %0
  %arrayidx4 = getelementptr inbounds i8, i8* %q, i32 %mul
  store i8 %spec.select.i, i8* %arrayidx4, align 1
  %sub = sub i8 0, %spec.select.i
  %arrayidx8 = getelementptr inbounds i8, i8* %q, i32 %add
  store i8 %sub, i8* %arrayidx8, align 1
  %inc = add nuw nsw i32 %ix.021, 1
  %exitcond = icmp eq i32 %inc, %n
  br i1 %exitcond, label %for.end.loopexit, label %for.body

for.end.loopexit:
  br label %for.end

for.end:
  ret void
}