; RUN: opt -mcpu=skx -S -loop-vectorize -instcombine -simplifycfg -force-vector-width=8 -force-vector-interleave=1 -enable-interleaved-mem-accesses < %s | FileCheck %s -check-prefix=DISABLED_MASKED_STRIDED
; RUN: opt -mcpu=skx -S -loop-vectorize -instcombine -simplifycfg -force-vector-width=8 -force-vector-interleave=1 -enable-interleaved-mem-accesses -enable-masked-interleaved-mem-accesses < %s | FileCheck %s -check-prefix=ENABLED_MASKED_STRIDED

target datalayout = "e-m:e-p:32:32-f64:32:64-f80:32-n8:16:32-S128"
target triple = "i386-unknown-linux-gnu"

; When masked-interleaved-groups are disabled:
; Check that the predicated load is not vectorized as an
; interleaved-group but rather as scalarized accesses.
; (For SKX, Gather is not supported by the compiler for chars, therefore
; the only remaining alternative is to scalarize).
; In this case a scalar epilogue is not needed.
;
; When masked-interleave-group is enabled we expect to find the proper mask
; shuffling code, feeding the wide masked load for an interleave-group (with
; a single member).
; Since the last (second) member of the load-group is a gap, peeling is used,
; so we also expect to find a scalar epilogue loop.
;
; void masked_strided1(const unsigned char* restrict p,
;                      unsigned char* restrict q,
;                      unsigned char guard) {
;   for(ix=0; ix < 1024; ++ix) {
;     if (ix > guard) {
;       char t = p[2*ix];
;       q[ix] = t;
;     }
;   }
; }

;DISABLED_MASKED_STRIDED-LABEL: @masked_strided1(
;DISABLED_MASKED_STRIDED: vector.body:
;DISABLED_MASKED_STRIDED-NEXT: %index = phi i32
;DISABLED_MASKED_STRIDED-NEXT: %[[VECIND:.+]] = phi <8 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
;DISABLED_MASKED_STRIDED-NOT: %interleaved.mask =
;DISABLED_MASKED_STRIDED-NOT: call <16 x i8> @llvm.masked.load.
;DISABLED_MASKED_STRIDED-NOT: %{{.*}} = shufflevector <16 x i8> %{{.*}}, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
;DISABLED_MASKED_STRIDED: %[[VMASK:.+]] = icmp ugt <8 x i32> %[[VECIND]], %{{broadcast.splat*}}
;DISABLED_MASKED_STRIDED-NEXT: %{{.*}} = shl nuw nsw <8 x i32> %[[VECIND]], <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
;DISABLED_MASKED_STRIDED-NEXT: %[[M:.+]] = extractelement <8 x i1> %[[VMASK]], i32 0
;DISABLED_MASKED_STRIDED-NEXT: br i1 %[[M]], label %pred.load.if, label %pred.load.continue
;DISABLED_MASKED_STRIDED-NOT: %interleaved.mask =
;DISABLED_MASKED_STRIDED-NOT: call <16 x i8> @llvm.masked.load.
;DISABLED_MASKED_STRIDED-NOT: %{{.*}} = shufflevector <16 x i8> %{{.*}}, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
;DISABLED_MASKED_STRIDED-NOT: for.body:
;DISABLED_MASKED_STRIDED: for.end:

;ENABLED_MASKED_STRIDED-LABEL: @masked_strided1(
;ENABLED_MASKED_STRIDED: vector.body:
;ENABLED_MASKED_STRIDED-NEXT: %index = phi i32
;ENABLED_MASKED_STRIDED-NEXT: %[[VECIND:.+]] = phi <8 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
;ENABLED_MASKED_STRIDED: %[[VMASK:.+]] = icmp ugt <8 x i32> %[[VECIND]], %{{broadcast.splat*}}
;ENABLED_MASKED_STRIDED: %interleaved.mask = shufflevector <8 x i1> %[[VMASK]], <8 x i1> undef, <16 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7>
;ENABLED_MASKED_STRIDED-NEXT: %[[WIDEMASKEDLOAD:.+]] = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %{{.*}}, i32 1, <16 x i1> %interleaved.mask, <16 x i8> undef)
;ENABLED_MASKED_STRIDED-NEXT: %[[STRIDEDVEC:.+]] = shufflevector <16 x i8> %[[WIDEMASKEDLOAD]], <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
;ENABLED_MASKED_STRIDED: for.body:

define dso_local void @masked_strided1(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i8 zeroext %guard) local_unnamed_addr {
entry:
  %conv = zext i8 %guard to i32
  br label %for.body

for.body:
  %ix.09 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
  %cmp1 = icmp ugt i32 %ix.09, %conv
  br i1 %cmp1, label %if.then, label %for.inc

if.then:
  %mul = shl nuw nsw i32 %ix.09, 1
  %arrayidx = getelementptr inbounds i8, i8* %p, i32 %mul
  %0 = load i8, i8* %arrayidx, align 1
  %arrayidx3 = getelementptr inbounds i8, i8* %q, i32 %ix.09
  store i8 %0, i8* %arrayidx3, align 1
  br label %for.inc

for.inc:
  %inc = add nuw nsw i32 %ix.09, 1
  %exitcond = icmp eq i32 %inc, 1024
  br i1 %exitcond, label %for.end, label %for.body

for.end:
  ret void
}

; Exactly the same scenario, except now we are optimizing for size; therefore
; we check that no scalar epilogue is created. Since we can't create an
; epilogue, we need the ability to mask out the gaps.
; When enable-masked-interleaved-access is enabled, the interleave-groups will
; be vectorized with masked wide-loads, with the mask properly shuffled and
; And-ed with the gaps mask.
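;
; A sketch of the corresponding source (an assumption, not part of the original
; test, but it matches the IR of @masked_strided1_optsize below): the same loop
; as masked_strided1, with the function compiled under optsize:
;
; void masked_strided1_optsize(const unsigned char* restrict p,
;                              unsigned char* restrict q,
;                              unsigned char guard) {
;   for(ix=0; ix < 1024; ++ix) {
;     if (ix > guard) {
;       char t = p[2*ix];
;       q[ix] = t;
;     }
;   }
; }
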
;ENABLED_MASKED_STRIDED-LABEL: @masked_strided1_optsize(
;ENABLED_MASKED_STRIDED-NEXT: entry:
;ENABLED_MASKED_STRIDED-NEXT: [[CONV:%.*]] = zext i8 [[GUARD:%.*]] to i32
;ENABLED_MASKED_STRIDED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i32> undef, i32 [[CONV]], i32 0
;ENABLED_MASKED_STRIDED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT]], <8 x i32> undef, <8 x i32> zeroinitializer
;ENABLED_MASKED_STRIDED-NEXT: br label [[VECTOR_BODY:%.*]]
;ENABLED_MASKED_STRIDED: vector.body:
;ENABLED_MASKED_STRIDED-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
;ENABLED_MASKED_STRIDED-NEXT: [[VEC_IND:%.*]] = phi <8 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>, [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
;ENABLED_MASKED_STRIDED-NEXT: [[TMP0:%.*]] = icmp ugt <8 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
;ENABLED_MASKED_STRIDED-NEXT: [[TMP1:%.*]] = shl nuw nsw i32 [[INDEX]], 1
;ENABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, i8* [[P:%.*]], i32 [[TMP1]]
;ENABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = bitcast i8* [[TMP2]] to <16 x i8>*
;ENABLED_MASKED_STRIDED-NEXT: [[INTERLEAVED_MASK:%.*]] = shufflevector <8 x i1> [[TMP0]], <8 x i1> undef, <16 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7>
;ENABLED_MASKED_STRIDED-NEXT: [[TMP4:%.*]] = and <16 x i1> [[INTERLEAVED_MASK]], <i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false>
;ENABLED_MASKED_STRIDED-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* [[TMP3]], i32 1, <16 x i1> [[TMP4]], <16 x i8> undef)
;ENABLED_MASKED_STRIDED-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <16 x i8> [[WIDE_MASKED_VEC]], <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
;ENABLED_MASKED_STRIDED-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, i8* [[Q:%.*]], i32 [[INDEX]]
;ENABLED_MASKED_STRIDED-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to <8 x i8>*
;ENABLED_MASKED_STRIDED-NEXT: call void @llvm.masked.store.v8i8.p0v8i8(<8 x i8> [[STRIDED_VEC]], <8 x i8>* [[TMP6]], i32 1, <8 x i1> [[TMP0]])
;ENABLED_MASKED_STRIDED-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 8
;ENABLED_MASKED_STRIDED-NEXT: [[VEC_IND_NEXT]] = add <8 x i32> [[VEC_IND]], <i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
;ENABLED_MASKED_STRIDED-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1024
;ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP7]]
;ENABLED_MASKED_STRIDED-NOT: for.body:
;ENABLED_MASKED_STRIDED: for.end:
;ENABLED_MASKED_STRIDED-NEXT: ret void

define dso_local void @masked_strided1_optsize(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i8 zeroext %guard) local_unnamed_addr optsize {
entry:
  %conv = zext i8 %guard to i32
  br label %for.body

for.body:
  %ix.09 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
  %cmp1 = icmp ugt i32 %ix.09, %conv
  br i1 %cmp1, label %if.then, label %for.inc

if.then:
  %mul = shl nuw nsw i32 %ix.09, 1
  %arrayidx = getelementptr inbounds i8, i8* %p, i32 %mul
  %0 = load i8, i8* %arrayidx, align 1
  %arrayidx3 = getelementptr inbounds i8, i8* %q, i32 %ix.09
  store i8 %0, i8* %arrayidx3, align 1
  br label %for.inc

for.inc:
  %inc = add nuw nsw i32 %ix.09, 1
  %exitcond = icmp eq i32 %inc, 1024
  br i1 %exitcond, label %for.end, label %for.body

for.end:
  ret void
}

; Accesses with gaps under Optsize scenario again, with unknown trip-count
; this time, in order to check the behavior of folding-the-tail (folding the
; remainder loop into the main loop using masking) together with
; interleaved-groups.
; When masked-interleave-group is disabled the interleave-groups will be
; invalidated during Legality checks, so there we check for no epilogue
; and for scalarized conditional accesses.
; When masked-interleave-group is enabled we check that there is no epilogue,
; and that the interleave-groups are vectorized using proper masking (with
; shuffling of the mask feeding the wide masked load/store).
; The mask itself is an And of two masks: one that masks away the remainder
; iterations, and one that masks away the 'else' of the 'if' statement.
; The shuffled mask is also And-ed with the gaps mask.
;
; void masked_strided1_optsize_unknown_tc(const unsigned char* restrict p,
;                                         unsigned char* restrict q,
;                                         unsigned char guard,
;                                         int n) {
;   for(ix=0; ix < n; ++ix) {
;     if (ix > guard) {
;       char t = p[2*ix];
;       q[ix] = t;
;     }
;   }
; }
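;
; As a worked illustration of the scheme checked in the ENABLED_MASKED_STRIDED
; run below (not from the original test, just spelling out the CHECK lines):
; with VF=8 and stride 2, the 8-lane predicate is the And of the 'if' mask and
; the remainder mask,
;   m = (vec.ind > guard) & (vec.ind <= trip.count - 1)
; it is then replicated per group member,
;   interleaved.mask = <m0,m0,m1,m1,...,m7,m7>
; and finally And-ed with the gaps mask <1,0,1,0,...,1,0>, so only the first
; (real) member of each pair can be read by the wide masked load.
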
; DISABLED_MASKED_STRIDED-LABEL: @masked_strided1_optsize_unknown_tc(
; DISABLED_MASKED_STRIDED: vector.body:
; DISABLED_MASKED_STRIDED-NEXT: [[INDEX:%.*]] = phi i32
; DISABLED_MASKED_STRIDED-NEXT: [[VEC_IND:%.*]] = phi <8 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; DISABLED_MASKED_STRIDED-NEXT: [[TMP0:%.*]] = icmp ugt <8 x i32> [[VEC_IND]], {{.*}}
; DISABLED_MASKED_STRIDED-NEXT: [[TMP1:%.*]] = shl nuw nsw <8 x i32> [[VEC_IND]], <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
; DISABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = icmp ule <8 x i32> [[VEC_IND]], {{.*}}
; DISABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = and <8 x i1> [[TMP0]], [[TMP2]]
; DISABLED_MASKED_STRIDED-NEXT: [[TMP4:%.*]] = extractelement <8 x i1> [[TMP3]], i32 0
; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP4]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
; DISABLED_MASKED_STRIDED: pred.load.if:
; DISABLED_MASKED_STRIDED-NEXT: [[TMP5:%.*]] = extractelement <8 x i32> [[TMP1]], i32 0
; DISABLED_MASKED_STRIDED-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, i8* [[P:%.*]], i32 [[TMP5]]
; DISABLED_MASKED_STRIDED-NEXT: [[TMP7:%.*]] = load i8, i8* [[TMP6]], align 1
; DISABLED_MASKED_STRIDED-NEXT: [[TMP8:%.*]] = insertelement <8 x i8> undef, i8 [[TMP7]], i32 0
; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_LOAD_CONTINUE]]
; DISABLED_MASKED_STRIDED-NOT: for.body:
; DISABLED_MASKED_STRIDED: for.end:
; DISABLED_MASKED_STRIDED-NEXT: ret void

; ENABLED_MASKED_STRIDED-LABEL: @masked_strided1_optsize_unknown_tc(
; ENABLED_MASKED_STRIDED-NEXT: entry:
; ENABLED_MASKED_STRIDED-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[N:%.*]], 0
; ENABLED_MASKED_STRIDED-NEXT: br i1 [[CMP9]], label [[VECTOR_PH:%.*]], label [[FOR_END:%.*]]
; ENABLED_MASKED_STRIDED: vector.ph:
; ENABLED_MASKED_STRIDED-NEXT: [[CONV:%.*]] = zext i8 [[GUARD:%.*]] to i32
; ENABLED_MASKED_STRIDED-NEXT: [[N_RND_UP:%.*]] = add i32 [[N]], 7
; ENABLED_MASKED_STRIDED-NEXT: [[N_VEC:%.*]] = and i32 [[N_RND_UP]], -8
; ENABLED_MASKED_STRIDED-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = add i32 [[N]], -1
; ENABLED_MASKED_STRIDED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i32> undef, i32 [[CONV]], i32 0
; ENABLED_MASKED_STRIDED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT]], <8 x i32> undef, <8 x i32> zeroinitializer
; ENABLED_MASKED_STRIDED-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <8 x i32> undef, i32 [[TRIP_COUNT_MINUS_1]], i32 0
; ENABLED_MASKED_STRIDED-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT1]], <8 x i32> undef, <8 x i32> zeroinitializer
; ENABLED_MASKED_STRIDED-NEXT: br label [[VECTOR_BODY:%.*]]
; ENABLED_MASKED_STRIDED: vector.body:
; ENABLED_MASKED_STRIDED-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; ENABLED_MASKED_STRIDED-NEXT: [[VEC_IND:%.*]] = phi <8 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; ENABLED_MASKED_STRIDED-NEXT: [[TMP0:%.*]] = icmp ugt <8 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
; ENABLED_MASKED_STRIDED-NEXT: [[TMP1:%.*]] = shl nuw nsw i32 [[INDEX]], 1
; ENABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, i8* [[P:%.*]], i32 [[TMP1]]
; ENABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = icmp ule <8 x i32> [[VEC_IND]], [[BROADCAST_SPLAT2]]
; ENABLED_MASKED_STRIDED-NEXT: [[TMP4:%.*]] = and <8 x i1> [[TMP0]], [[TMP3]]
; ENABLED_MASKED_STRIDED-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to <16 x i8>*
; ENABLED_MASKED_STRIDED-NEXT: [[INTERLEAVED_MASK:%.*]] = shufflevector <8 x i1> [[TMP4]], <8 x i1> undef, <16 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7>
; ENABLED_MASKED_STRIDED-NEXT: [[TMP6:%.*]] = and <16 x i1> [[INTERLEAVED_MASK]], <i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false>
; ENABLED_MASKED_STRIDED-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* [[TMP5]], i32 1, <16 x i1> [[TMP6]], <16 x i8> undef)
; ENABLED_MASKED_STRIDED-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <16 x i8> [[WIDE_MASKED_VEC]], <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
; ENABLED_MASKED_STRIDED-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, i8* [[Q:%.*]], i32 [[INDEX]]
; ENABLED_MASKED_STRIDED-NEXT: [[TMP8:%.*]] = bitcast i8* [[TMP7]] to <8 x i8>*
; ENABLED_MASKED_STRIDED-NEXT: call void @llvm.masked.store.v8i8.p0v8i8(<8 x i8> [[STRIDED_VEC]], <8 x i8>* [[TMP8]], i32 1, <8 x i1> [[TMP4]])
; ENABLED_MASKED_STRIDED-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 8
; ENABLED_MASKED_STRIDED-NEXT: [[VEC_IND_NEXT]] = add <8 x i32> [[VEC_IND]], <i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
; ENABLED_MASKED_STRIDED-NEXT: [[TMP9:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP9]], label [[FOR_END]], label [[VECTOR_BODY]]
; ENABLED_MASKED_STRIDED-NOT: for.body:
; ENABLED_MASKED_STRIDED: for.end:
; ENABLED_MASKED_STRIDED-NEXT: ret void

define dso_local void @masked_strided1_optsize_unknown_tc(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i8 zeroext %guard, i32 %n) local_unnamed_addr optsize {
entry:
  %cmp9 = icmp sgt i32 %n, 0
  br i1 %cmp9, label %for.body.lr.ph, label %for.end

for.body.lr.ph:
  %conv = zext i8 %guard to i32
  br label %for.body

for.body:
  %ix.010 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.inc ]
  %cmp1 = icmp ugt i32 %ix.010, %conv
  br i1 %cmp1, label %if.then, label %for.inc

if.then:
  %mul = shl nuw nsw i32 %ix.010, 1
  %arrayidx = getelementptr inbounds i8, i8* %p, i32 %mul
  %0 = load i8, i8* %arrayidx, align 1
  %arrayidx3 = getelementptr inbounds i8, i8* %q, i32 %ix.010
  store i8 %0, i8* %arrayidx3, align 1
  br label %for.inc

for.inc:
  %inc = add nuw nsw i32 %ix.010, 1
  %exitcond = icmp eq i32 %inc, %n
  br i1 %exitcond, label %for.end.loopexit, label %for.body

for.end.loopexit:
  br label %for.end

for.end:
  ret void
}

; Same, with stride 3. This is to check the gaps-mask and the shuffled mask
; with a different stride.
; So accesses are with gaps under Optsize scenario again, with unknown trip-
; count, in order to check the behavior of folding-the-tail (folding the
; remainder loop into the main loop using masking) together with
; interleaved-groups.
; When masked-interleave-group is enabled we check that there is no epilogue,
; and that the interleave-groups are vectorized using proper masking (with
; shuffling of the mask feeding the wide masked load/store).
; The mask itself is an And of two masks: one that masks away the remainder
; iterations, and one that masks away the 'else' of the 'if' statement.
; The shuffled mask is also And-ed with the gaps mask.
;
; void masked_strided3_optsize_unknown_tc(const unsigned char* restrict p,
;                                         unsigned char* restrict q,
;                                         unsigned char guard,
;                                         int n) {
;   for(ix=0; ix < n; ++ix) {
;     if (ix > guard) {
;       char t = p[3*ix];
;       q[ix] = t;
;     }
;   }
; }

; ENABLED_MASKED_STRIDED-LABEL: @masked_strided3_optsize_unknown_tc(
; ENABLED_MASKED_STRIDED-NEXT: entry:
; ENABLED_MASKED_STRIDED-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[N:%.*]], 0
; ENABLED_MASKED_STRIDED-NEXT: br i1 [[CMP9]], label [[VECTOR_PH:%.*]], label [[FOR_END:%.*]]
; ENABLED_MASKED_STRIDED: vector.ph:
; ENABLED_MASKED_STRIDED-NEXT: [[CONV:%.*]] = zext i8 [[GUARD:%.*]] to i32
; ENABLED_MASKED_STRIDED-NEXT: [[N_RND_UP:%.*]] = add i32 [[N]], 7
; ENABLED_MASKED_STRIDED-NEXT: [[N_VEC:%.*]] = and i32 [[N_RND_UP]], -8
; ENABLED_MASKED_STRIDED-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = add i32 [[N]], -1
; ENABLED_MASKED_STRIDED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i32> undef, i32 [[CONV]], i32 0
; ENABLED_MASKED_STRIDED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT]], <8 x i32> undef, <8 x i32> zeroinitializer
; ENABLED_MASKED_STRIDED-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <8 x i32> undef, i32 [[TRIP_COUNT_MINUS_1]], i32 0
; ENABLED_MASKED_STRIDED-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT1]], <8 x i32> undef, <8 x i32> zeroinitializer
; ENABLED_MASKED_STRIDED-NEXT: br label [[VECTOR_BODY:%.*]]
; ENABLED_MASKED_STRIDED: vector.body:
; ENABLED_MASKED_STRIDED-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; ENABLED_MASKED_STRIDED-NEXT: [[VEC_IND:%.*]] = phi <8 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; ENABLED_MASKED_STRIDED-NEXT: [[TMP0:%.*]] = icmp ugt <8 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
; ENABLED_MASKED_STRIDED-NEXT: [[TMP1:%.*]] = mul nsw i32 [[INDEX]], 3
; ENABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, i8* [[P:%.*]], i32 [[TMP1]]
; ENABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = icmp ule <8 x i32> [[VEC_IND]], [[BROADCAST_SPLAT2]]
; ENABLED_MASKED_STRIDED-NEXT: [[TMP4:%.*]] = and <8 x i1> [[TMP0]], [[TMP3]]
; ENABLED_MASKED_STRIDED-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to <24 x i8>*
; ENABLED_MASKED_STRIDED-NEXT: [[INTERLEAVED_MASK:%.*]] = shufflevector <8 x i1> [[TMP4]], <8 x i1> undef, <24 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7>
; ENABLED_MASKED_STRIDED-NEXT: [[TMP6:%.*]] = and <24 x i1> [[INTERLEAVED_MASK]], <i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false>
; ENABLED_MASKED_STRIDED-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <24 x i8> @llvm.masked.load.v24i8.p0v24i8(<24 x i8>* [[TMP5]], i32 1, <24 x i1> [[TMP6]], <24 x i8> undef)
; ENABLED_MASKED_STRIDED-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <24 x i8> [[WIDE_MASKED_VEC]], <24 x i8> undef, <8 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21>
; ENABLED_MASKED_STRIDED-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, i8* [[Q:%.*]], i32 [[INDEX]]
; ENABLED_MASKED_STRIDED-NEXT: [[TMP8:%.*]] = bitcast i8* [[TMP7]] to <8 x i8>*
; ENABLED_MASKED_STRIDED-NEXT: call void @llvm.masked.store.v8i8.p0v8i8(<8 x i8> [[STRIDED_VEC]], <8 x i8>* [[TMP8]], i32 1, <8 x i1> [[TMP4]])
; ENABLED_MASKED_STRIDED-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 8
; ENABLED_MASKED_STRIDED-NEXT: [[VEC_IND_NEXT]] = add <8 x i32> [[VEC_IND]], <i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
; ENABLED_MASKED_STRIDED-NEXT: [[TMP9:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP9]], label [[FOR_END]], label [[VECTOR_BODY]]
; ENABLED_MASKED_STRIDED: for.end:
; ENABLED_MASKED_STRIDED-NEXT: ret void

define dso_local void @masked_strided3_optsize_unknown_tc(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i8 zeroext %guard, i32 %n) local_unnamed_addr optsize {
entry:
  %cmp9 = icmp sgt i32 %n, 0
  br i1 %cmp9, label %for.body.lr.ph, label %for.end

for.body.lr.ph:
  %conv = zext i8 %guard to i32
  br label %for.body

for.body:
  %ix.010 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.inc ]
  %cmp1 = icmp ugt i32 %ix.010, %conv
  br i1 %cmp1, label %if.then, label %for.inc

if.then:
  %mul = mul nsw i32 %ix.010, 3
  %arrayidx = getelementptr inbounds i8, i8* %p, i32 %mul
  %0 = load i8, i8* %arrayidx, align 1
  %arrayidx3 = getelementptr inbounds i8, i8* %q, i32 %ix.010
  store i8 %0, i8* %arrayidx3, align 1
  br label %for.inc

for.inc:
  %inc = add nuw nsw i32 %ix.010, 1
  %exitcond = icmp eq i32 %inc, %n
  br i1 %exitcond, label %for.end.loopexit, label %for.body

for.end.loopexit:
  br label %for.end

for.end:
  ret void
}

; Back to stride 2 with gaps, with a known trip count, under opt for size,
; but this time the load/store are not predicated.
; When enable-masked-interleaved-access is disabled, the interleave-groups will
; be invalidated during cost-model checks because we have gaps and we can't
; create an epilogue. The access is thus scalarized.
; (Before the fix that this test checks, we used to create an epilogue despite
; optsize, and vectorized the access as an interleaved-group. This is now fixed,
; and we make sure that a scalar epilogue does not exist.)
; When enable-masked-interleaved-access is enabled, the interleave-groups will
; be vectorized with masked wide-loads (masking away the gaps).
;
; void unconditional_strided1_optsize(const unsigned char* restrict p,
;                                     unsigned char* restrict q,
;                                     unsigned char guard) {
;   for(ix=0; ix < 1024; ++ix) {
;     char t = p[2*ix];
;     q[ix] = t;
;   }
; }

;DISABLED_MASKED_STRIDED-LABEL: @unconditional_strided1_optsize(
;DISABLED_MASKED_STRIDED: vector.body:
;DISABLED_MASKED_STRIDED-NOT: call <16 x i8> @llvm.masked.load.v16i8.p0v16i8
;DISABLED_MASKED_STRIDED: %{{.*}} = extractelement <8 x i32> %{{.*}}, i32 0
;DISABLED_MASKED_STRIDED-NOT: for.body:
;DISABLED_MASKED_STRIDED: for.end:

;ENABLED_MASKED_STRIDED-LABEL: @unconditional_strided1_optsize(
;ENABLED_MASKED_STRIDED-NEXT: entry:
;ENABLED_MASKED_STRIDED-NEXT: br label [[VECTOR_BODY:%.*]]
;ENABLED_MASKED_STRIDED: vector.body:
;ENABLED_MASKED_STRIDED-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
;ENABLED_MASKED_STRIDED-NEXT: [[TMP0:%.*]] = shl nuw nsw i32 [[INDEX]], 1
;ENABLED_MASKED_STRIDED-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8* [[P:%.*]], i32 [[TMP0]]
;ENABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <16 x i8>*
;ENABLED_MASKED_STRIDED-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* [[TMP2]], i32 1, <16 x i1> <i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false>, <16 x i8> undef)
;ENABLED_MASKED_STRIDED-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <16 x i8> [[WIDE_MASKED_VEC]], <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
;ENABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, i8* [[Q:%.*]], i32 [[INDEX]]
;ENABLED_MASKED_STRIDED-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to <8 x i8>*
;ENABLED_MASKED_STRIDED-NEXT: store <8 x i8> [[STRIDED_VEC]], <8 x i8>* [[TMP4]], align 1
;ENABLED_MASKED_STRIDED-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 8
;ENABLED_MASKED_STRIDED-NEXT: [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1024
;ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP5]], label [[FOR_END:%.*]], label [[VECTOR_BODY]]
;ENABLED_MASKED_STRIDED-NOT: for.body:
;ENABLED_MASKED_STRIDED: for.end:
;ENABLED_MASKED_STRIDED-NEXT: ret void

define dso_local void @unconditional_strided1_optsize(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i8 zeroext %guard) local_unnamed_addr optsize {
entry:
  br label %for.body

for.body:
  %ix.06 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
  %mul = shl nuw nsw i32 %ix.06, 1
  %arrayidx = getelementptr inbounds i8, i8* %p, i32 %mul
  %0 = load i8, i8* %arrayidx, align 1
  %arrayidx1 = getelementptr inbounds i8, i8* %q, i32 %ix.06
  store i8 %0, i8* %arrayidx1, align 1
  %inc = add nuw nsw i32 %ix.06, 1
  %exitcond = icmp eq i32 %inc, 1024
  br i1 %exitcond, label %for.end, label %for.body

for.end:
  ret void
}

; Unconditional accesses with gaps under Optsize scenario again, with unknown
; trip-count this time, in order to check the behavior of folding-the-tail
; (folding the remainder loop into the main loop using masking) together with
; interleaved-groups. Folding-the-tail turns the accesses into conditional
; accesses, which requires proper masking. In addition we need to mask out
; the gaps (all because we are not allowed to use an epilogue due to optsize).
; When enable-masked-interleaved-access is disabled, the interleave-groups will
; be invalidated during cost-model checks. So there we check for no epilogue
; and for scalarized conditional accesses.
; When masked-interleave-group is enabled we check that there is no epilogue,
; and that the interleave-groups are vectorized using proper masking (with
; shuffling of the mask feeding the wide masked load/store).
; The shuffled mask is also And-ed with the gaps mask.
;
; void unconditional_strided1_optsize_unknown_tc(const unsigned char* restrict p,
;                                                unsigned char* restrict q,
;                                                int n) {
;   for(ix=0; ix < n; ++ix) {
;     char t = p[2*ix];
;     q[ix] = t;
;   }
; }

; DISABLED_MASKED_STRIDED-LABEL: @unconditional_strided1_optsize_unknown_tc(
; DISABLED_MASKED_STRIDED: vector.body:
; DISABLED_MASKED_STRIDED-NEXT: [[INDEX:%.*]] = phi i32
; DISABLED_MASKED_STRIDED-NEXT: [[VEC_IND:%.*]] = phi <8 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; DISABLED_MASKED_STRIDED-NEXT: [[TMP0:%.*]] = shl nuw nsw <8 x i32> [[VEC_IND]], <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
; DISABLED_MASKED_STRIDED-NEXT: [[TMP1:%.*]] = icmp ule <8 x i32> [[VEC_IND]], {{.*}}
; DISABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = extractelement <8 x i1> [[TMP1]], i32 0
; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP2]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
; DISABLED_MASKED_STRIDED: pred.load.if:
; DISABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = extractelement <8 x i32> [[TMP0]], i32 0
; DISABLED_MASKED_STRIDED-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, i8* [[P:%.*]], i32 [[TMP3]]
; DISABLED_MASKED_STRIDED-NEXT: [[TMP5:%.*]] = load i8, i8* [[TMP4]], align 1
; DISABLED_MASKED_STRIDED-NEXT: [[TMP6:%.*]] = insertelement <8 x i8> undef, i8 [[TMP5]], i32 0
; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_LOAD_CONTINUE]]
; DISABLED_MASKED_STRIDED-NOT: for.body:
; DISABLED_MASKED_STRIDED: for.end:
; DISABLED_MASKED_STRIDED-NEXT: ret void

; ENABLED_MASKED_STRIDED-LABEL: @unconditional_strided1_optsize_unknown_tc(
; ENABLED_MASKED_STRIDED-NEXT: entry:
; ENABLED_MASKED_STRIDED-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[N:%.*]], 0
; ENABLED_MASKED_STRIDED-NEXT: br i1 [[CMP6]], label [[VECTOR_PH:%.*]], label [[FOR_END:%.*]]
; ENABLED_MASKED_STRIDED: vector.ph:
; ENABLED_MASKED_STRIDED-NEXT: [[N_RND_UP:%.*]] = add i32 [[N]], 7
; ENABLED_MASKED_STRIDED-NEXT: [[N_VEC:%.*]] = and i32 [[N_RND_UP]], -8
; ENABLED_MASKED_STRIDED-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = add i32 [[N]], -1
; ENABLED_MASKED_STRIDED-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <8 x i32> undef, i32 [[TRIP_COUNT_MINUS_1]], i32 0
; ENABLED_MASKED_STRIDED-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT1]], <8 x i32> undef, <8 x i32> zeroinitializer
; ENABLED_MASKED_STRIDED-NEXT: br label [[VECTOR_BODY:%.*]]
; ENABLED_MASKED_STRIDED: vector.body:
; ENABLED_MASKED_STRIDED-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; ENABLED_MASKED_STRIDED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i32> undef, i32 [[INDEX]], i32 0
; ENABLED_MASKED_STRIDED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT]], <8 x i32> undef, <8 x i32> zeroinitializer
; ENABLED_MASKED_STRIDED-NEXT: [[INDUCTION:%.*]] = add <8 x i32> [[BROADCAST_SPLAT]], <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; ENABLED_MASKED_STRIDED-NEXT: [[TMP0:%.*]] = shl nuw nsw i32 [[INDEX]], 1
; ENABLED_MASKED_STRIDED-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8* [[P:%.*]], i32 [[TMP0]]
; ENABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = icmp ule <8 x i32> [[INDUCTION]], [[BROADCAST_SPLAT2]]
; ENABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = bitcast i8* [[TMP1]] to <16 x i8>*
; ENABLED_MASKED_STRIDED-NEXT: [[INTERLEAVED_MASK:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> undef, <16 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7>
; ENABLED_MASKED_STRIDED-NEXT: [[TMP4:%.*]] = and <16 x i1> [[INTERLEAVED_MASK]], <i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false>
; ENABLED_MASKED_STRIDED-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* [[TMP3]], i32 1, <16 x i1> [[TMP4]], <16 x i8> undef)
; ENABLED_MASKED_STRIDED-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <16 x i8> [[WIDE_MASKED_VEC]], <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
; ENABLED_MASKED_STRIDED-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, i8* [[Q:%.*]], i32 [[INDEX]]
; ENABLED_MASKED_STRIDED-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to <8 x i8>*
; ENABLED_MASKED_STRIDED-NEXT: call void @llvm.masked.store.v8i8.p0v8i8(<8 x i8> [[STRIDED_VEC]], <8 x i8>* [[TMP6]], i32 1, <8 x i1> [[TMP2]])
; ENABLED_MASKED_STRIDED-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 8
; ENABLED_MASKED_STRIDED-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP7]], label [[FOR_END]], label [[VECTOR_BODY]]
; ENABLED_MASKED_STRIDED-NOT: for.body:
; ENABLED_MASKED_STRIDED: for.end:
; ENABLED_MASKED_STRIDED-NEXT: ret void

define dso_local void @unconditional_strided1_optsize_unknown_tc(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i32 %n) local_unnamed_addr optsize {
entry:
  %cmp6 = icmp sgt i32 %n, 0
  br i1 %cmp6, label %for.body.preheader, label %for.end

for.body.preheader:
  br label %for.body

for.body:
  %ix.07 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
  %mul = shl nuw nsw i32 %ix.07, 1
  %arrayidx = getelementptr inbounds i8, i8* %p, i32 %mul
  %0 = load i8, i8* %arrayidx, align 1
  %arrayidx1 = getelementptr inbounds i8, i8* %q, i32 %ix.07
  store i8 %0, i8* %arrayidx1, align 1
  %inc = add nuw nsw i32 %ix.07, 1
  %exitcond = icmp eq i32 %inc, %n
  br i1 %exitcond, label %for.end.loopexit, label %for.body

for.end.loopexit:
  br label %for.end

for.end:
  ret void
}

; Check also a scenario with full interleave-groups (no gaps) as well as both
; load and store groups. We check that when masked-interleave-group is disabled
; the predicated loads (and stores) are not vectorized as an
; interleaved-group but rather as four separate scalarized accesses.
; (For SKX, gather/scatter is not supported by the compiler for chars, therefore
; the only remaining alternative is to scalarize).
; When masked-interleave-group is enabled we expect to find the proper mask
; shuffling code, feeding the wide masked load/store for the two
; interleave-groups.
;
; void masked_strided2(const unsigned char* restrict p,
;                      unsigned char* restrict q,
;                      unsigned char guard) {
;   for(ix=0; ix < 1024; ++ix) {
;     if (ix > guard) {
;       char left = p[2*ix];
;       char right = p[2*ix + 1];
;       char max = max(left, right);
;       q[2*ix] = max;
;       q[2*ix+1] = 0 - max;
;     }
;   }
; }

;DISABLED_MASKED_STRIDED-LABEL: @masked_strided2(
;DISABLED_MASKED_STRIDED: vector.body:
;DISABLED_MASKED_STRIDED-NEXT: %index = phi i32
;DISABLED_MASKED_STRIDED-NEXT: %[[VECIND:.+]] = phi <8 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
;DISABLED_MASKED_STRIDED-NOT: %interleaved.mask =
;DISABLED_MASKED_STRIDED-NOT: call <16 x i8> @llvm.masked.load.
;DISABLED_MASKED_STRIDED-NOT: call void @llvm.masked.store.
;DISABLED_MASKED_STRIDED-NOT: %{{.*}} = shufflevector <16 x i8> %{{.*}}, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
;DISABLED_MASKED_STRIDED: %[[VMASK:.+]] = icmp ugt <8 x i32> %[[VECIND]], %{{broadcast.splat*}}
;DISABLED_MASKED_STRIDED-NEXT: %{{.*}} = shl nuw nsw <8 x i32> %[[VECIND]], <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
;DISABLED_MASKED_STRIDED-NEXT: %[[M:.+]] = extractelement <8 x i1> %[[VMASK]], i32 0
;DISABLED_MASKED_STRIDED-NEXT: br i1 %[[M]], label %pred.load.if, label %pred.load.continue
;DISABLED_MASKED_STRIDED-NOT: %interleaved.mask =
;DISABLED_MASKED_STRIDED-NOT: call <16 x i8> @llvm.masked.load.
;DISABLED_MASKED_STRIDED-NOT: call void @llvm.masked.store.
;DISABLED_MASKED_STRIDED-NOT: %{{.*}} = shufflevector <16 x i8> %{{.*}}, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>

;ENABLED_MASKED_STRIDED-LABEL: @masked_strided2(
;ENABLED_MASKED_STRIDED: vector.body:
;ENABLED_MASKED_STRIDED-NEXT: %index = phi i32
;ENABLED_MASKED_STRIDED-NEXT: %[[VECIND:.+]] = phi <8 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
;ENABLED_MASKED_STRIDED: %[[VMASK:.+]] = icmp ugt <8 x i32> %[[VECIND]], %{{broadcast.splat*}}
;ENABLED_MASKED_STRIDED: %interleaved.mask = shufflevector <8 x i1> %[[VMASK]], <8 x i1> undef, <16 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7>
;ENABLED_MASKED_STRIDED-NEXT: %[[WIDEMASKEDLOAD:.+]] = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %{{.*}}, i32 1, <16 x i1> %interleaved.mask, <16 x i8> undef)
;ENABLED_MASKED_STRIDED-NEXT: %{{.*}} = shufflevector <16 x i8> %[[WIDEMASKEDLOAD]], <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
;ENABLED_MASKED_STRIDED-NEXT: %{{.*}} = shufflevector <16 x i8> %[[WIDEMASKEDLOAD]], <16 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
;ENABLED_MASKED_STRIDED: call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> %{{.*}}, <16 x i8>* %{{.*}}, i32 1, <16 x i1> %interleaved.mask)

; Function Attrs: norecurse nounwind
define dso_local void @masked_strided2(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i8 zeroext %guard) local_unnamed_addr {
entry:
  %conv = zext i8 %guard to i32
  br label %for.body

for.body:
  %ix.024 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
  %cmp1 = icmp ugt i32 %ix.024, %conv
  br i1 %cmp1, label %if.then, label %for.inc

if.then:
  %mul = shl nuw nsw i32 %ix.024, 1
  %arrayidx = getelementptr inbounds i8, i8* %p, i32 %mul
  %0 = load i8, i8* %arrayidx, align 1
  %add = or i32 %mul, 1
  %arrayidx4 = getelementptr inbounds i8, i8* %p, i32 %add
  %1 = load i8, i8* %arrayidx4, align 1
  %cmp.i = icmp slt i8 %0, %1
  %spec.select.i = select i1 %cmp.i, i8 %1, i8 %0
  %arrayidx6 = getelementptr inbounds i8, i8* %q, i32 %mul
  store i8 %spec.select.i, i8* %arrayidx6, align 1
  %sub = sub i8 0, %spec.select.i
  %arrayidx11 = getelementptr inbounds i8, i8* %q, i32 %add
  store i8 %sub, i8* %arrayidx11, align 1
  br label %for.inc

for.inc:
  %inc = add nuw nsw i32 %ix.024, 1
  %exitcond = icmp eq i32 %inc, 1024
  br i1 %exitcond, label %for.end, label %for.body

for.end:
  ret void
}

; Full groups again, this time checking an Optsize scenario, with unknown trip-
; count, to check the behavior of folding-the-tail (folding the remainder loop
; into the main loop using masking) together with interleaved-groups.
; When masked-interleave-group is disabled the interleave-groups will be
; invalidated during the Legality check, so there is nothing to check here.
; When masked-interleave-group is enabled we check that there is no epilogue,
; and that the interleave-groups are vectorized using proper masking (with
; shuffling of the mask feeding the wide masked load/store).
; The mask itself is an And of two masks: one that masks away the remainder
; iterations, and one that masks away the 'else' of the 'if' statement.
;
; void masked_strided2_unknown_tc(const unsigned char* restrict p,
;                                 unsigned char* restrict q,
;                                 unsigned char guard,
;                                 int n) {
;   for(ix=0; ix < n; ++ix) {
;     if (ix > guard) {
;       char left = p[2*ix];
;       char right = p[2*ix + 1];
;       char max = max(left, right);
;       q[2*ix] = max;
;       q[2*ix+1] = 0 - max;
;     }
;   }
; }

; ENABLED_MASKED_STRIDED-LABEL: @masked_strided2_unknown_tc(
; ENABLED_MASKED_STRIDED: vector.body:
; ENABLED_MASKED_STRIDED-NEXT: [[INDEX:%.*]] = phi i32
; ENABLED_MASKED_STRIDED-NEXT: [[VEC_IND:%.*]] = phi <8 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; ENABLED_MASKED_STRIDED-NEXT: [[TMP0:%.*]] = icmp sgt <8 x i32> [[VEC_IND]], {{.*}}
; ENABLED_MASKED_STRIDED-NEXT: [[TMP1:%.*]] = shl nuw nsw i32 [[INDEX]], 1
; ENABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, i8* [[P:%.*]], i32 [[TMP1]]
; ENABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = icmp ule <8 x i32> [[VEC_IND]], {{.*}}
; ENABLED_MASKED_STRIDED-NEXT: [[TMP4:%.*]] = and <8 x i1> [[TMP0]], [[TMP3]]
; ENABLED_MASKED_STRIDED-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to <16 x i8>*
; ENABLED_MASKED_STRIDED-NEXT: [[INTERLEAVED_MASK:%.*]] = shufflevector <8 x i1> [[TMP4]], <8 x i1> undef, <16 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7>
; ENABLED_MASKED_STRIDED-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* [[TMP5]], i32 1, <16 x i1> [[INTERLEAVED_MASK]], <16 x i8> undef)
; ENABLED_MASKED_STRIDED-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <16 x i8> [[WIDE_MASKED_VEC]], <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
; ENABLED_MASKED_STRIDED-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <16 x i8> [[WIDE_MASKED_VEC]], <16 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
; ENABLED_MASKED_STRIDED-NEXT: [[TMP6:%.*]] = or i32 [[TMP1]], 1
; ENABLED_MASKED_STRIDED-NEXT: [[TMP7:%.*]] = icmp slt <8 x i8> [[STRIDED_VEC]], [[STRIDED_VEC3]]
; ENABLED_MASKED_STRIDED-NEXT: [[TMP8:%.*]] = select <8 x i1> [[TMP7]], <8 x i8> [[STRIDED_VEC3]], <8 x i8> [[STRIDED_VEC]]
; ENABLED_MASKED_STRIDED-NEXT: [[TMP9:%.*]] = sub <8 x i8> zeroinitializer, [[TMP8]]
; ENABLED_MASKED_STRIDED-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, i8* [[Q:%.*]], i32 -1
; ENABLED_MASKED_STRIDED-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, i8* [[TMP10]], i32 [[TMP6]]
; ENABLED_MASKED_STRIDED-NEXT: [[TMP12:%.*]] = bitcast i8* [[TMP11]] to <16 x i8>*
; ENABLED_MASKED_STRIDED-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i8> [[TMP8]], <8 x i8> [[TMP9]], <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
; ENABLED_MASKED_STRIDED-NEXT: call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> [[INTERLEAVED_VEC]], <16 x i8>* [[TMP12]], i32 1, <16 x i1> [[INTERLEAVED_MASK]])
; ENABLED_MASKED_STRIDED-NEXT: {{.*}} = add i32 [[INDEX]], 8
; ENABLED_MASKED_STRIDED-NEXT: {{.*}} = add <8 x i32> {{.*}}, <i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
; ENABLED_MASKED_STRIDED-NEXT: [[TMP13:%.*]] = icmp eq i32 {{.*}}, {{.*}}
; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP13]],
; ENABLED_MASKED_STRIDED: for.end:
; ENABLED_MASKED_STRIDED-NEXT: ret void

define dso_local void @masked_strided2_unknown_tc(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i32 %guard, i32 %n) local_unnamed_addr optsize {
entry:
  %cmp22 = icmp sgt i32 %n, 0
  br i1 %cmp22, label %for.body.preheader, label %for.end

for.body.preheader:
  br label %for.body

for.body:
  %ix.023 = phi i32 [ %inc, %for.inc ], [ 0, %for.body.preheader ]
  %cmp1 = icmp sgt i32 %ix.023, %guard
  br i1 %cmp1, label %if.then, label %for.inc

if.then:
  %mul = shl nuw nsw i32 %ix.023, 1
  %arrayidx = getelementptr inbounds i8, i8* %p, i32 %mul
  %0 = load i8, i8* %arrayidx, align 1
  %add = or i32 %mul, 1
  %arrayidx3 = getelementptr inbounds i8, i8* %p, i32 %add
  %1 = load i8, i8* %arrayidx3, align 1
  %cmp.i = icmp slt i8 %0, %1
  %spec.select.i = select i1 %cmp.i, i8 %1, i8 %0
  %arrayidx5 = getelementptr inbounds i8, i8* %q, i32 %mul
  store i8 %spec.select.i, i8* %arrayidx5, align 1
  %sub = sub i8 0, %spec.select.i
  %arrayidx9 = getelementptr inbounds i8, i8* %q, i32 %add
  store i8 %sub, i8* %arrayidx9, align 1
  br label %for.inc

for.inc:
  %inc = add nuw nsw i32 %ix.023, 1
  %exitcond = icmp eq i32 %inc, %n
  br i1 %exitcond, label %for.end.loopexit, label %for.body

for.end.loopexit:
  br label %for.end

for.end:
  ret void
}

; Full groups under Optsize scenario again, with unknown trip-count, again in
; order to check the behavior of folding-the-tail (folding the remainder loop
; into the main loop using masking) together with interleaved-groups.
; This time the accesses are not conditional; they become conditional only
; due to tail folding.
; When masked-interleave-group is disabled the interleave-groups will be
; invalidated during cost-model checks, so we check for no epilogue and
; scalarized conditional accesses.
; When masked-interleave-group is enabled we check for no epilogue,
; and interleave-groups vectorized using proper masking (with
; shuffling of the mask feeding the wide masked load/store).
; (Same vectorization scheme as for the previous loop with conditional accesses,
; except here the mask only masks away the remainder iterations.)
;
; void unconditional_masked_strided2_unknown_tc(const unsigned char* restrict p,
;                                               unsigned char* restrict q,
;                                               int n) {
;   for(ix=0; ix < n; ++ix) {
;     char left = p[2*ix];
;     char right = p[2*ix + 1];
;     char max = max(left, right);
;     q[2*ix] = max;
;     q[2*ix+1] = 0 - max;
;   }
; }

; DISABLED_MASKED_STRIDED-LABEL: @unconditional_masked_strided2_unknown_tc(
; DISABLED_MASKED_STRIDED: vector.body:
; DISABLED_MASKED_STRIDED-NEXT: [[INDEX:%.*]] = phi i32
; DISABLED_MASKED_STRIDED-NEXT: [[VEC_IND:%.*]] = phi <8 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; DISABLED_MASKED_STRIDED-NEXT: [[TMP0:%.*]] = shl nuw nsw <8 x i32> [[VEC_IND]], <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
; DISABLED_MASKED_STRIDED-NEXT: [[TMP1:%.*]] = icmp ule <8 x i32> [[VEC_IND]], {{.*}}
; DISABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = extractelement <8 x i1> [[TMP1]], i32 0
; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP2]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
; DISABLED_MASKED_STRIDED: pred.load.if:
; DISABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = extractelement <8 x i32> [[TMP0]], i32 0
; DISABLED_MASKED_STRIDED-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, i8* [[P:%.*]], i32 [[TMP3]]
; DISABLED_MASKED_STRIDED-NEXT: [[TMP5:%.*]] = load i8, i8* [[TMP4]], align 1
; DISABLED_MASKED_STRIDED-NEXT: [[TMP6:%.*]] = insertelement <8 x i8> undef, i8 [[TMP5]], i32 0
; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_LOAD_CONTINUE]]
; DISABLED_MASKED_STRIDED-NOT: for.body:
; DISABLED_MASKED_STRIDED: for.end:
; DISABLED_MASKED_STRIDED-NEXT: ret void

; ENABLED_MASKED_STRIDED-LABEL: @unconditional_masked_strided2_unknown_tc(
; ENABLED_MASKED_STRIDED: vector.body:
; ENABLED_MASKED_STRIDED-NEXT: [[INDEX:%.*]] = phi i32
; ENABLED_MASKED_STRIDED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i32> undef, i32 [[INDEX]], i32 0
; ENABLED_MASKED_STRIDED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT]], <8 x i32> undef, <8 x i32> zeroinitializer
; ENABLED_MASKED_STRIDED-NEXT: [[INDUCTION:%.*]] = add <8 x i32> [[BROADCAST_SPLAT]], <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; ENABLED_MASKED_STRIDED-NEXT: [[TMP0:%.*]] = shl nuw nsw i32 [[INDEX]], 1
; ENABLED_MASKED_STRIDED-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8* [[P:%.*]], i32 [[TMP0]]
; ENABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = icmp ule <8 x i32> {{.*}}, {{.*}}
; ENABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = bitcast i8* [[TMP1]] to <16 x i8>*
; ENABLED_MASKED_STRIDED-NEXT: [[INTERLEAVED_MASK:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> undef, <16 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7>
; ENABLED_MASKED_STRIDED-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* [[TMP3]], i32 1, <16 x i1> [[INTERLEAVED_MASK]], <16 x i8> undef)
; ENABLED_MASKED_STRIDED-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <16 x i8> [[WIDE_MASKED_VEC]], <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
; ENABLED_MASKED_STRIDED-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <16 x i8> [[WIDE_MASKED_VEC]], <16 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
; ENABLED_MASKED_STRIDED-NEXT: [[TMP4:%.*]] = or i32 [[TMP0]], 1
; ENABLED_MASKED_STRIDED-NEXT: [[TMP5:%.*]] = icmp slt <8 x i8> [[STRIDED_VEC]], [[STRIDED_VEC3]]
; ENABLED_MASKED_STRIDED-NEXT: [[TMP6:%.*]] = select <8 x i1> [[TMP5]], <8 x i8> [[STRIDED_VEC3]], <8 x i8> [[STRIDED_VEC]]
; ENABLED_MASKED_STRIDED-NEXT: [[TMP7:%.*]] = sub <8 x i8> zeroinitializer, [[TMP6]]
; ENABLED_MASKED_STRIDED-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, i8* [[Q:%.*]], i32 -1
; ENABLED_MASKED_STRIDED-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, i8* [[TMP8]], i32 [[TMP4]]
; ENABLED_MASKED_STRIDED-NEXT: [[TMP10:%.*]] = bitcast i8* [[TMP9]] to <16 x i8>*
; ENABLED_MASKED_STRIDED-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i8> [[TMP6]], <8 x i8> [[TMP7]], <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
; ENABLED_MASKED_STRIDED-NEXT: call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> [[INTERLEAVED_VEC]], <16 x i8>* [[TMP10]], i32 1, <16 x i1> [[INTERLEAVED_MASK]])
; ENABLED_MASKED_STRIDED-NEXT: {{.*}} = add i32 [[INDEX]], 8
; ENABLED_MASKED_STRIDED-NEXT: [[TMP11:%.*]] = icmp eq i32 {{.*}}, {{.*}}
; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP11]]
; ENABLED_MASKED_STRIDED: for.end:
; ENABLED_MASKED_STRIDED-NEXT: ret void

define dso_local void @unconditional_masked_strided2_unknown_tc(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i32 %n) local_unnamed_addr optsize {
entry:
  %cmp20 = icmp sgt i32 %n, 0
  br i1 %cmp20, label %for.body.preheader, label %for.end

for.body.preheader:
  br label %for.body

for.body:
  %ix.021 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
  %mul = shl nuw nsw i32 %ix.021, 1
  %arrayidx = getelementptr inbounds i8, i8* %p, i32 %mul
  %0 = load i8, i8* %arrayidx, align 1
  %add = or i32 %mul, 1
  %arrayidx2 = getelementptr inbounds i8, i8* %p, i32 %add
  %1 = load i8, i8* %arrayidx2, align 1
  %cmp.i = icmp slt i8 %0, %1
  %spec.select.i = select i1 %cmp.i, i8 %1, i8 %0
  %arrayidx4 = getelementptr inbounds i8, i8* %q, i32 %mul
  store i8 %spec.select.i, i8* %arrayidx4, align 1
  %sub = sub i8 0, %spec.select.i
  %arrayidx8 = getelementptr inbounds i8, i8* %q, i32 %add
  store i8 %sub, i8* %arrayidx8, align 1
  %inc = add nuw nsw i32 %ix.021, 1
  %exitcond = icmp eq i32 %inc, %n
  br i1 %exitcond, label %for.end.loopexit, label %for.body

for.end.loopexit:
  br label %for.end

for.end:
  ret void
}