llvm/test/Transforms/LoopVectorize/AArch64/sve-gather-scatter.ll

   1 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
   2 ; RUN: opt -passes=loop-vectorize,dce,instcombine -mtriple aarch64-linux-gnu -mattr=+sve \
   3 ; RUN:   -prefer-predicate-over-epilogue=scalar-epilogue -S %s -force-target-instruction-cost=1 -o - | FileCheck %s
   4
   5 define void @gather_nxv4i32_ind64(ptr noalias nocapture readonly %a, ptr noalias nocapture readonly %b, ptr noalias nocapture %c, i64 %n) #0 {
   6 ; CHECK-LABEL: @gather_nxv4i32_ind64(
   7 ; CHECK-NEXT:  entry:
   8 ; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
   9 ; CHECK-NEXT:    [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2
  10 ; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ugt i64 [[TMP1]], [[N:%.*]]
  11 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
  12 ; CHECK:       vector.ph:
  13 ; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
  14 ; CHECK-NEXT:    [[DOTNEG:%.*]] = mul nsw i64 [[TMP2]], -4
  15 ; CHECK-NEXT:    [[N_VEC:%.*]] = and i64 [[DOTNEG]], [[N]]
  16 ; CHECK-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
  17 ; CHECK-NEXT:    [[TMP4:%.*]] = shl nuw nsw i64 [[TMP3]], 2
  18 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
  19 ; CHECK:       vector.body:
  20 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
  21 ; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[INDEX]]
  22 ; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x i64>, ptr [[TMP5]], align 8
  23 ; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], <vscale x 4 x i64> [[WIDE_LOAD]]
  24 ; CHECK-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x float> @llvm.masked.gather.nxv4f32.nxv4p0(<vscale x 4 x ptr> [[TMP6]], i32 4, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x float> poison)
  25 ; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds float, ptr [[C:%.*]], i64 [[INDEX]]
  26 ; CHECK-NEXT:    store <vscale x 4 x float> [[WIDE_MASKED_GATHER]], ptr [[TMP7]], align 4
  27 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP4]]
  28 ; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
  29 ; CHECK-NEXT:    br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
  30 ; CHECK:       middle.block:
  31 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[N]]
  32 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
  33 ; CHECK:       scalar.ph:
  34 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
  35 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
  36 ; CHECK:       for.body:
  37 ; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
  38 ; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[INDVARS_IV]]
  39 ; CHECK-NEXT:    [[TMP9:%.*]] = load i64, ptr [[ARRAYIDX]], align 8
  40 ; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP9]]
  41 ; CHECK-NEXT:    [[TMP10:%.*]] = load float, ptr [[ARRAYIDX3]], align 4
  42 ; CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds float, ptr [[C]], i64 [[INDVARS_IV]]
  43 ; CHECK-NEXT:    store float [[TMP10]], ptr [[ARRAYIDX5]], align 4
  44 ; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
  45 ; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N]]
  46 ; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
  47 ; CHECK:       for.cond.cleanup:
  48 ; CHECK-NEXT:    ret void
  49 ;
  50 entry:
  51   br label %for.body
  52
  53 for.body:                                         ; preds = %entry, %for.body
  54   %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
  55   %arrayidx = getelementptr inbounds i64, ptr %b, i64 %indvars.iv
  56   %0 = load i64, ptr %arrayidx, align 8
  57   %arrayidx3 = getelementptr inbounds float, ptr %a, i64 %0
  58   %1 = load float, ptr %arrayidx3, align 4
  59   %arrayidx5 = getelementptr inbounds float, ptr %c, i64 %indvars.iv
  60   store float %1, ptr %arrayidx5, align 4
  61   %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  62   %exitcond.not = icmp eq i64 %indvars.iv.next, %n
  63   br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !llvm.loop !0
  64
  65 for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
  66   ret void
  67 }
  68
  69 ; NOTE: I deliberately chose '%b' as an array of i32 indices, since the
  70 ; additional 'sext' in the for.body loop exposes additional code paths
  71 ; during vectorisation.
  72 define void @scatter_nxv4i32_ind32(ptr noalias nocapture %a, ptr noalias nocapture readonly %b, ptr noalias nocapture readonly %c, i64 %n) #0 {
  73 ; CHECK-LABEL: @scatter_nxv4i32_ind32(
  74 ; CHECK-NEXT:  entry:
  75 ; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
  76 ; CHECK-NEXT:    [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2
  77 ; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ugt i64 [[TMP1]], [[N:%.*]]
  78 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
  79 ; CHECK:       vector.ph:
  80 ; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
  81 ; CHECK-NEXT:    [[DOTNEG:%.*]] = mul nsw i64 [[TMP2]], -4
  82 ; CHECK-NEXT:    [[N_VEC:%.*]] = and i64 [[DOTNEG]], [[N]]
  83 ; CHECK-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
  84 ; CHECK-NEXT:    [[TMP4:%.*]] = shl nuw nsw i64 [[TMP3]], 2
  85 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
  86 ; CHECK:       vector.body:
  87 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
  88 ; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds float, ptr [[C:%.*]], i64 [[INDEX]]
  89 ; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP5]], align 4
  90 ; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[INDEX]]
  91 ; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <vscale x 4 x i32>, ptr [[TMP6]], align 4
  92 ; CHECK-NEXT:    [[TMP7:%.*]] = sext <vscale x 4 x i32> [[WIDE_LOAD1]] to <vscale x 4 x i64>
  93 ; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], <vscale x 4 x i64> [[TMP7]]
  94 ; CHECK-NEXT:    call void @llvm.masked.scatter.nxv4f32.nxv4p0(<vscale x 4 x float> [[WIDE_LOAD]], <vscale x 4 x ptr> [[TMP8]], i32 4, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer))
  95 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP4]]
  96 ; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
  97 ; CHECK-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
  98 ; CHECK:       middle.block:
  99 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[N]]
 100 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
 101 ; CHECK:       scalar.ph:
 102 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
 103 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 104 ; CHECK:       for.body:
 105 ; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
 106 ; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[C]], i64 [[INDVARS_IV]]
 107 ; CHECK-NEXT:    [[TMP10:%.*]] = load float, ptr [[ARRAYIDX]], align 4
 108 ; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDVARS_IV]]
 109 ; CHECK-NEXT:    [[TMP11:%.*]] = load i32, ptr [[ARRAYIDX3]], align 4
 110 ; CHECK-NEXT:    [[IDXPROM4:%.*]] = sext i32 [[TMP11]] to i64
 111 ; CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IDXPROM4]]
 112 ; CHECK-NEXT:    store float [[TMP10]], ptr [[ARRAYIDX5]], align 4
 113 ; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
 114 ; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N]]
 115 ; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
 116 ; CHECK:       for.cond.cleanup:
 117 ; CHECK-NEXT:    ret void
 118 ;
 119 entry:
 120   br label %for.body
 121
 122 for.body:                                         ; preds = %entry, %for.body
 123   %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
 124   %arrayidx = getelementptr inbounds float, ptr %c, i64 %indvars.iv
 125   %0 = load float, ptr %arrayidx, align 4
 126   %arrayidx3 = getelementptr inbounds i32, ptr %b, i64 %indvars.iv
 127   %1 = load i32, ptr %arrayidx3, align 4
 128   %idxprom4 = sext i32 %1 to i64
 129   %arrayidx5 = getelementptr inbounds float, ptr %a, i64 %idxprom4
 130   store float %0, ptr %arrayidx5, align 4
 131   %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
 132   %exitcond.not = icmp eq i64 %indvars.iv.next, %n
 133   br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !llvm.loop !0
 134
 135 for.cond.cleanup:                                 ; preds = %for.body, %entry
 136   ret void
 137 }
 138
 139 define void @scatter_inv_nxv4i32(ptr noalias nocapture %inv, ptr noalias nocapture readonly %b, i64 %n) #0 {
 140 ; CHECK-LABEL: @scatter_inv_nxv4i32(
 141 ; CHECK-NEXT:  entry:
 142 ; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
 143 ; CHECK-NEXT:    [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2
 144 ; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ugt i64 [[TMP1]], [[N:%.*]]
 145 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 146 ; CHECK:       vector.ph:
 147 ; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
 148 ; CHECK-NEXT:    [[DOTNEG:%.*]] = mul nsw i64 [[TMP2]], -4
 149 ; CHECK-NEXT:    [[N_VEC:%.*]] = and i64 [[DOTNEG]], [[N]]
 150 ; CHECK-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
 151 ; CHECK-NEXT:    [[TMP4:%.*]] = shl nuw nsw i64 [[TMP3]], 2
 152 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x ptr> poison, ptr [[INV:%.*]], i64 0
 153 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x ptr> [[BROADCAST_SPLATINSERT]], <vscale x 4 x ptr> poison, <vscale x 4 x i32> zeroinitializer
 154 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 155 ; CHECK:       vector.body:
 156 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 157 ; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[INDEX]]
 158 ; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP5]], align 4
 159 ; CHECK-NEXT:    [[TMP6:%.*]] = icmp ne <vscale x 4 x i32> [[WIDE_LOAD]], zeroinitializer
 160 ; CHECK-NEXT:    call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 3, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x ptr> [[BROADCAST_SPLAT]], i32 4, <vscale x 4 x i1> [[TMP6]])
 161 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP4]]
 162 ; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 163 ; CHECK-NEXT:    br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
 164 ; CHECK:       middle.block:
 165 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[N]]
 166 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
 167 ; CHECK:       scalar.ph:
 168 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
 169 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 170 ; CHECK:       for.body:
 171 ; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
 172 ; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDVARS_IV]]
 173 ; CHECK-NEXT:    [[TMP8:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
 174 ; CHECK-NEXT:    [[TOBOOL_NOT:%.*]] = icmp eq i32 [[TMP8]], 0
 175 ; CHECK-NEXT:    br i1 [[TOBOOL_NOT]], label [[FOR_INC]], label [[IF_THEN:%.*]]
 176 ; CHECK:       if.then:
 177 ; CHECK-NEXT:    store i32 3, ptr [[INV]], align 4
 178 ; CHECK-NEXT:    br label [[FOR_INC]]
 179 ; CHECK:       for.inc:
 180 ; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
 181 ; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N]]
 182 ; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
 183 ; CHECK:       for.cond.cleanup:
 184 ; CHECK-NEXT:    ret void
 185 ;
 186 entry:
 187   br label %for.body
 188
 189 for.body:                                         ; preds = %entry, %for.inc
 190   %indvars.iv = phi i64 [ %indvars.iv.next, %for.inc ], [ 0, %entry ]
 191   %arrayidx = getelementptr inbounds i32, ptr %b, i64 %indvars.iv
 192   %0 = load i32, ptr %arrayidx, align 4
 193   %tobool.not = icmp eq i32 %0, 0
 194   br i1 %tobool.not, label %for.inc, label %if.then
 195
 196 if.then:                                          ; preds = %for.body
 197   store i32 3, ptr %inv, align 4
 198   br label %for.inc
 199
 200 for.inc:                                          ; preds = %for.body, %if.then
 201   %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
 202   %exitcond.not = icmp eq i64 %indvars.iv.next, %n
 203   br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !llvm.loop !0
 204
 205 for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
 206   ret void
 207 }
 208
 209 define void @gather_inv_nxv4i32(ptr noalias nocapture %a, ptr noalias nocapture readonly %inv, i64 %n) #0 {
 210 ; CHECK-LABEL: @gather_inv_nxv4i32(
 211 ; CHECK-NEXT:  entry:
 212 ; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
 213 ; CHECK-NEXT:    [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2
 214 ; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ugt i64 [[TMP1]], [[N:%.*]]
 215 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 216 ; CHECK:       vector.ph:
 217 ; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
 218 ; CHECK-NEXT:    [[DOTNEG:%.*]] = mul nsw i64 [[TMP2]], -4
 219 ; CHECK-NEXT:    [[N_VEC:%.*]] = and i64 [[DOTNEG]], [[N]]
 220 ; CHECK-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
 221 ; CHECK-NEXT:    [[TMP4:%.*]] = shl nuw nsw i64 [[TMP3]], 2
 222 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x ptr> poison, ptr [[INV:%.*]], i64 0
 223 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x ptr> [[BROADCAST_SPLATINSERT]], <vscale x 4 x ptr> poison, <vscale x 4 x i32> zeroinitializer
 224 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 225 ; CHECK:       vector.body:
 226 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 227 ; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i32, ptr [[A:%.*]], i64 [[INDEX]]
 228 ; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP5]], align 4
 229 ; CHECK-NEXT:    [[TMP6:%.*]] = icmp sgt <vscale x 4 x i32> [[WIDE_LOAD]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 3, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
 230 ; CHECK-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> [[BROADCAST_SPLAT]], i32 4, <vscale x 4 x i1> [[TMP6]], <vscale x 4 x i32> poison)
 231 ; CHECK-NEXT:    call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[WIDE_MASKED_GATHER]], ptr [[TMP5]], i32 4, <vscale x 4 x i1> [[TMP6]])
 232 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP4]]
 233 ; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 234 ; CHECK-NEXT:    br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
 235 ; CHECK:       middle.block:
 236 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[N]]
 237 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
 238 ; CHECK:       scalar.ph:
 239 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
 240 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 241 ; CHECK:       for.body:
 242 ; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
 243 ; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDVARS_IV]]
 244 ; CHECK-NEXT:    [[TMP8:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
 245 ; CHECK-NEXT:    [[CMP2:%.*]] = icmp sgt i32 [[TMP8]], 3
 246 ; CHECK-NEXT:    br i1 [[CMP2]], label [[IF_THEN:%.*]], label [[FOR_INC]]
 247 ; CHECK:       if.then:
 248 ; CHECK-NEXT:    [[TMP9:%.*]] = load i32, ptr [[INV]], align 4
 249 ; CHECK-NEXT:    store i32 [[TMP9]], ptr [[ARRAYIDX]], align 4
 250 ; CHECK-NEXT:    br label [[FOR_INC]]
 251 ; CHECK:       for.inc:
 252 ; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
 253 ; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N]]
 254 ; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
 255 ; CHECK:       for.cond.cleanup:
 256 ; CHECK-NEXT:    ret void
 257 ;
 258 entry:
 259   br label %for.body
 260
 261 for.body:                                         ; preds = %entry, %for.inc
 262   %indvars.iv = phi i64 [ %indvars.iv.next, %for.inc ], [ 0, %entry ]
 263   %arrayidx = getelementptr inbounds i32, ptr %a, i64 %indvars.iv
 264   %0 = load i32, ptr %arrayidx, align 4
 265   %cmp2 = icmp sgt i32 %0, 3
 266   br i1 %cmp2, label %if.then, label %for.inc
 267
 268 if.then:                                          ; preds = %for.body
 269   %1 = load i32, ptr %inv, align 4
 270   store i32 %1, ptr %arrayidx, align 4
 271   br label %for.inc
 272
 273 for.inc:                                          ; preds = %for.body, %if.then
 274   %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
 275   %exitcond.not = icmp eq i64 %indvars.iv.next, %n
 276   br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !llvm.loop !0
 277
 278 for.cond.cleanup:                                 ; preds = %for.inc, %entry
 279   ret void
 280 }
 281
 282
 283
 284 define void @gather_nxv4i32_ind64_stride2(ptr noalias nocapture %a, ptr noalias nocapture readonly %b, i64 %n) #0 {
 285 ; CHECK-LABEL: @gather_nxv4i32_ind64_stride2(
 286 ; CHECK-NEXT:  entry:
 287 ; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
 288 ; CHECK-NEXT:    [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 3
 289 ; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ugt i64 [[TMP1]], [[N:%.*]]
 290 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 291 ; CHECK:       vector.ph:
 292 ; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
 293 ; CHECK-NEXT:    [[DOTNEG:%.*]] = mul nsw i64 [[TMP2]], -8
 294 ; CHECK-NEXT:    [[N_VEC:%.*]] = and i64 [[DOTNEG]], [[N]]
 295 ; CHECK-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
 296 ; CHECK-NEXT:    [[TMP4:%.*]] = shl nuw nsw i64 [[TMP3]], 3
 297 ; CHECK-NEXT:    [[TMP5:%.*]] = call <vscale x 4 x i64> @llvm.experimental.stepvector.nxv4i64()
 298 ; CHECK-NEXT:    [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
 299 ; CHECK-NEXT:    [[TMP7:%.*]] = shl nuw nsw i64 [[TMP6]], 2
 300 ; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP7]], i64 0
 301 ; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[DOTSPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
 302 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 303 ; CHECK:       vector.body:
 304 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 305 ; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 4 x i64> [ [[TMP5]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
 306 ; CHECK-NEXT:    [[STEP_ADD:%.*]] = add <vscale x 4 x i64> [[VEC_IND]], [[DOTSPLAT]]
 307 ; CHECK-NEXT:    [[TMP8:%.*]] = shl <vscale x 4 x i64> [[VEC_IND]], shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 1, i64 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer)
 308 ; CHECK-NEXT:    [[TMP9:%.*]] = shl <vscale x 4 x i64> [[STEP_ADD]], shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 1, i64 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer)
 309 ; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], <vscale x 4 x i64> [[TMP8]]
 310 ; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds float, ptr [[B]], <vscale x 4 x i64> [[TMP9]]
 311 ; CHECK-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x float> @llvm.masked.gather.nxv4f32.nxv4p0(<vscale x 4 x ptr> [[TMP10]], i32 4, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x float> poison)
 312 ; CHECK-NEXT:    [[WIDE_MASKED_GATHER2:%.*]] = call <vscale x 4 x float> @llvm.masked.gather.nxv4f32.nxv4p0(<vscale x 4 x ptr> [[TMP11]], i32 4, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x float> poison)
 313 ; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[INDEX]]
 314 ; CHECK-NEXT:    [[TMP13:%.*]] = call i64 @llvm.vscale.i64()
 315 ; CHECK-NEXT:    [[TMP14:%.*]] = shl nuw nsw i64 [[TMP13]], 2
 316 ; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr inbounds float, ptr [[TMP12]], i64 [[TMP14]]
 317 ; CHECK-NEXT:    store <vscale x 4 x float> [[WIDE_MASKED_GATHER]], ptr [[TMP12]], align 4
 318 ; CHECK-NEXT:    store <vscale x 4 x float> [[WIDE_MASKED_GATHER2]], ptr [[TMP15]], align 4
 319 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP4]]
 320 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[STEP_ADD]], [[DOTSPLAT]]
 321 ; CHECK-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 322 ; CHECK-NEXT:    br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
 323 ; CHECK:       middle.block:
 324 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[N]]
 325 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
 326 ; CHECK:       scalar.ph:
 327 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
 328 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 329 ; CHECK:       for.body:
 330 ; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
 331 ; CHECK-NEXT:    [[INDVARS_IV_STRIDE2:%.*]] = shl i64 [[INDVARS_IV]], 1
 332 ; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[INDVARS_IV_STRIDE2]]
 333 ; CHECK-NEXT:    [[TMP17:%.*]] = load float, ptr [[ARRAYIDX]], align 4
 334 ; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDVARS_IV]]
 335 ; CHECK-NEXT:    store float [[TMP17]], ptr [[ARRAYIDX2]], align 4
 336 ; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
 337 ; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N]]
 338 ; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
 339 ; CHECK:       for.cond.cleanup:
 340 ; CHECK-NEXT:    ret void
 341 ;
 342 entry:
 343   br label %for.body
 344
 345 for.body:                                         ; preds = %entry, %for.body
 346   %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
 347   %indvars.iv.stride2 = mul i64 %indvars.iv, 2
 348   %arrayidx = getelementptr inbounds float, ptr %b, i64 %indvars.iv.stride2
 349   %0 = load float, ptr %arrayidx, align 4
 350   %arrayidx2 = getelementptr inbounds float, ptr %a, i64 %indvars.iv
 351   store float %0, ptr %arrayidx2, align 4
 352   %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
 353   %exitcond.not = icmp eq i64 %indvars.iv.next, %n
 354   br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
 355
 356 for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
 357   ret void
 358 }
 359
 360 attributes #0 = { vscale_range(1, 16) }
 361
 362 !0 = distinct !{!0, !1, !2, !3, !4, !5}
 363 !1 = !{!"llvm.loop.mustprogress"}
 364 !2 = !{!"llvm.loop.vectorize.width", i32 4}
 365 !3 = !{!"llvm.loop.vectorize.scalable.enable", i1 true}
 366 !4 = !{!"llvm.loop.interleave.count", i32 1}
 367 !5 = !{!"llvm.loop.vectorize.enable", i1 true}