llvm/test/CodeGen/AArch64/sve2-vscale-sinking.ll

   1 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2
   2 ; RUN: opt -passes='require<profile-summary>,function(codegenprepare)' -S -o - %s | FileCheck %s
   3
   4 target triple = "aarch64-unknown-linux-gnu"
   5
   6 define void @inc_add(i32 %first, i32 %N, ptr %in1, ptr %in2, ptr %out) #0 {
   7 ; CHECK-LABEL: define void @inc_add
   8 ; CHECK-SAME: (i32 [[FIRST:%.*]], i32 [[N:%.*]], ptr [[IN1:%.*]], ptr [[IN2:%.*]], ptr [[OUT:%.*]]) #[[ATTR0:[0-9]+]] {
   9 ; CHECK-NEXT:  entry:
  10 ; CHECK-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
  11 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
  12 ; CHECK:       vector.body:
  13 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
  14 ; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds float, ptr [[IN1]], i64 [[INDEX]]
  15 ; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP0]], align 4
  16 ; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds float, ptr [[IN2]], i64 [[INDEX]]
  17 ; CHECK-NEXT:    [[WIDE_LOAD16:%.*]] = load <vscale x 4 x float>, ptr [[TMP1]], align 4
  18 ; CHECK-NEXT:    [[TMP2:%.*]] = fmul <vscale x 4 x float> [[WIDE_LOAD]], [[WIDE_LOAD16]]
  19 ; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds float, ptr [[OUT]], i64 [[INDEX]]
  20 ; CHECK-NEXT:    store <vscale x 4 x float> [[TMP2]], ptr [[TMP3]], align 4
  21 ; CHECK-NEXT:    [[TMP4:%.*]] = tail call i64 @llvm.vscale.i64()
  22 ; CHECK-NEXT:    [[TMP5:%.*]] = shl nuw nsw i64 [[TMP4]], 2
  23 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
  24 ; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[WIDE_TRIP_COUNT]]
  25 ; CHECK-NEXT:    br i1 [[TMP6]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]]
  26 ; CHECK:       for.cond.cleanup:
  27 ; CHECK-NEXT:    ret void
  28 ;
     ; Input IR: the induction step vscale * 4 (%0, %1) is computed once in the
     ; entry block and consumed only by %index.next inside the loop. The
     ; assertions above expect the vscale call and the shl to be re-created in
     ; vector.body directly before the add, leaving only the zext and the branch
     ; in the entry block.
  29 entry:
  30   %wide.trip.count = zext i32 %N to i64
  31   %0 = tail call i64 @llvm.vscale.i64()
  32   %1 = shl nuw nsw i64 %0, 2                        ; %1 = vscale * 4, the per-iteration step
  33   br label %vector.body
  34
  35 vector.body:
  36   %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
  37   %2 = getelementptr inbounds float, ptr %in1, i64 %index
  38   %wide.load = load <vscale x 4 x float>, ptr %2, align 4
  39   %3 = getelementptr inbounds float, ptr %in2, i64 %index
  40   %wide.load16 = load <vscale x 4 x float>, ptr %3, align 4
  41   %4 = fmul <vscale x 4 x float> %wide.load, %wide.load16
  42   %5 = getelementptr inbounds float, ptr %out, i64 %index
  43   store <vscale x 4 x float> %4, ptr %5, align 4
  44   %index.next = add nuw i64 %index, %1              ; the only user of %1
  45   %6 = icmp eq i64 %index.next, %wide.trip.count
  46   br i1 %6, label %for.cond.cleanup, label %vector.body
  47
  48 for.cond.cleanup:
  49   ret void
  50 }
  51
  52 define void @dec_sub(i32 %first, i32 %N, ptr %in1, ptr %in2, ptr %out) #0 {
  53 ; CHECK-LABEL: define void @dec_sub
  54 ; CHECK-SAME: (i32 [[FIRST:%.*]], i32 [[N:%.*]], ptr [[IN1:%.*]], ptr [[IN2:%.*]], ptr [[OUT:%.*]]) #[[ATTR0]] {
  55 ; CHECK-NEXT:  entry:
  56 ; CHECK-NEXT:    [[TMP0:%.*]] = zext i32 [[N]] to i64
  57 ; CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
  58 ; CHECK-NEXT:    [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 2
  59 ; CHECK-NEXT:    [[TMP3:%.*]] = sub nsw i64 1, [[TMP2]]
  60 ; CHECK-NEXT:    [[INVARIANT_GEP:%.*]] = getelementptr float, ptr [[IN1]], i64 [[TMP3]]
  61 ; CHECK-NEXT:    [[INVARIANT_GEP20:%.*]] = getelementptr float, ptr [[IN2]], i64 [[TMP3]]
  62 ; CHECK-NEXT:    [[INVARIANT_GEP22:%.*]] = getelementptr float, ptr [[OUT]], i64 [[TMP3]]
  63 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
  64 ; CHECK:       vector.body:
  65 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
  66 ; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = sub i64 [[TMP0]], [[INDEX]]
  67 ; CHECK-NEXT:    [[GEP:%.*]] = getelementptr float, ptr [[INVARIANT_GEP]], i64 [[OFFSET_IDX]]
  68 ; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[GEP]], align 4
  69 ; CHECK-NEXT:    [[GEP21:%.*]] = getelementptr float, ptr [[INVARIANT_GEP20]], i64 [[OFFSET_IDX]]
  70 ; CHECK-NEXT:    [[WIDE_LOAD16:%.*]] = load <vscale x 4 x float>, ptr [[GEP21]], align 4
  71 ; CHECK-NEXT:    [[TMP4:%.*]] = fmul <vscale x 4 x float> [[WIDE_LOAD]], [[WIDE_LOAD16]]
  72 ; CHECK-NEXT:    [[GEP23:%.*]] = getelementptr float, ptr [[INVARIANT_GEP22]], i64 [[OFFSET_IDX]]
  73 ; CHECK-NEXT:    store <vscale x 4 x float> [[TMP4]], ptr [[GEP23]], align 4
  74 ; CHECK-NEXT:    [[TMP5:%.*]] = tail call i64 @llvm.vscale.i64()
  75 ; CHECK-NEXT:    [[TMP6:%.*]] = shl nuw nsw i64 [[TMP5]], 2
  76 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]]
  77 ; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[TMP0]]
  78 ; CHECK-NEXT:    br i1 [[TMP7]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]]
  79 ; CHECK:       for.cond.cleanup:
  80 ; CHECK-NEXT:    ret void
  81 ;
     ; Input IR: %2 (vscale * 4) has two users, the sub in the entry block (%3)
     ; and %index.next inside the loop. The assertions above expect the
     ; entry-block vscale/shl/sub chain and the three invariant GEPs to stay
     ; where they are, with an additional vscale/shl pair created in vector.body
     ; directly before the add.
  82 entry:
  83   %0 = zext i32 %N to i64
  84   %1 = tail call i64 @llvm.vscale.i64()
  85   %2 = shl nuw nsw i64 %1, 2                        ; %2 = vscale * 4
  86   %3 = sub nsw i64 1, %2                            ; entry-block user of %2
  87   %invariant.gep = getelementptr float, ptr %in1, i64 %3
  88   %invariant.gep20 = getelementptr float, ptr %in2, i64 %3
  89   %invariant.gep22 = getelementptr float, ptr %out, i64 %3
  90   br label %vector.body
  91
  92 vector.body:
  93   %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
  94   %offset.idx = sub i64 %0, %index
  95   %gep = getelementptr float, ptr %invariant.gep, i64 %offset.idx
  96   %wide.load = load <vscale x 4 x float>, ptr %gep, align 4
  97   %gep21 = getelementptr float, ptr %invariant.gep20, i64 %offset.idx
  98   %wide.load16 = load <vscale x 4 x float>, ptr %gep21, align 4
  99   %4 = fmul <vscale x 4 x float> %wide.load, %wide.load16
 100   %gep23 = getelementptr float, ptr %invariant.gep22, i64 %offset.idx
 101   store <vscale x 4 x float> %4, ptr %gep23, align 4
 102   %index.next = add nuw i64 %index, %2              ; in-loop user of %2
 103   %5 = icmp eq i64 %index.next, %0
 104   br i1 %5, label %for.cond.cleanup, label %vector.body
 105
 106 for.cond.cleanup:
 107   ret void
 108 }
 109
 110 define void @gep(i32 noundef %first, i32 noundef %N, ptr nocapture noundef writeonly %ptr, <vscale x 16 x i1> %pg, <vscale x 16 x i8> %val) #0 {
 111 ; CHECK-LABEL: define void @gep
 112 ; CHECK-SAME: (i32 noundef [[FIRST:%.*]], i32 noundef [[N:%.*]], ptr noundef writeonly captures(none) [[PTR:%.*]], <vscale x 16 x i1> [[PG:%.*]], <vscale x 16 x i8> [[VAL:%.*]]) #[[ATTR0]] {
 113 ; CHECK-NEXT:  entry:
 114 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 115 ; CHECK:       for.body:
 116 ; CHECK-NEXT:    [[LSR_IV:%.*]] = phi i32 [ [[N]], [[ENTRY:%.*]] ], [ [[LSR_IV_NEXT:%.*]], [[FOR_BODY]] ]
 117 ; CHECK-NEXT:    [[PTR_ADDR:%.*]] = phi ptr [ [[PTR]], [[ENTRY]] ], [ [[ADD_PTR_3:%.*]], [[FOR_BODY]] ]
 118 ; CHECK-NEXT:    tail call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[VAL]], ptr [[PTR_ADDR]], i32 1, <vscale x 16 x i1> [[PG]])
 119 ; CHECK-NEXT:    [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
 120 ; CHECK-NEXT:    [[TMP1:%.*]] = shl i64 [[TMP0]], 4
 121 ; CHECK-NEXT:    [[ADD_PTR:%.*]] = getelementptr inbounds i8, ptr [[PTR_ADDR]], i64 [[TMP1]]
 122 ; CHECK-NEXT:    tail call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[VAL]], ptr [[ADD_PTR]], i32 1, <vscale x 16 x i1> [[PG]])
 123 ; CHECK-NEXT:    [[TMP2:%.*]] = tail call i64 @llvm.vscale.i64()
 124 ; CHECK-NEXT:    [[TMP3:%.*]] = shl i64 [[TMP2]], 4
 125 ; CHECK-NEXT:    [[ADD_PTR_1:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR]], i64 [[TMP3]]
 126 ; CHECK-NEXT:    tail call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[VAL]], ptr [[ADD_PTR_1]], i32 1, <vscale x 16 x i1> [[PG]])
 127 ; CHECK-NEXT:    [[TMP4:%.*]] = tail call i64 @llvm.vscale.i64()
 128 ; CHECK-NEXT:    [[TMP5:%.*]] = shl i64 [[TMP4]], 4
 129 ; CHECK-NEXT:    [[ADD_PTR_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_1]], i64 [[TMP5]]
 130 ; CHECK-NEXT:    tail call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[VAL]], ptr [[ADD_PTR_2]], i32 1, <vscale x 16 x i1> [[PG]])
 131 ; CHECK-NEXT:    [[TMP6:%.*]] = tail call i64 @llvm.vscale.i64()
 132 ; CHECK-NEXT:    [[TMP7:%.*]] = shl i64 [[TMP6]], 4
 133 ; CHECK-NEXT:    [[ADD_PTR_3]] = getelementptr inbounds i8, ptr [[ADD_PTR_2]], i64 [[TMP7]]
 134 ; CHECK-NEXT:    [[LSR_IV_NEXT]] = add i32 [[LSR_IV]], -4
 135 ; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[LSR_IV_NEXT]], 0
 136 ; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_EXIT:%.*]], label [[FOR_BODY]]
 137 ; CHECK:       for.exit:
 138 ; CHECK-NEXT:    ret void
 139 ;
     ; Input IR: %1 (vscale * 16) is computed once in the entry block and used as
     ; the byte offset of four chained i8 getelementptrs in for.body. The
     ; assertions above expect the entry block to be reduced to the branch and a
     ; separate vscale/shl pair to be emitted directly before each of the four
     ; getelementptrs.
 140 entry:
 141   %0 = tail call i64 @llvm.vscale.i64()
 142   %1 = shl i64 %0, 4                                ; %1 = vscale * 16, byte offset between consecutive stores
 143   br label %for.body
 144
 145 for.body:                                         ; preds = %for.body, %entry
 146   %lsr.iv = phi i32 [ %N, %entry ], [ %lsr.iv.next, %for.body ]
 147   %ptr.addr = phi ptr [ %ptr, %entry ], [ %add.ptr.3, %for.body ]
 148   tail call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> %val, ptr %ptr.addr, i32 1, <vscale x 16 x i1> %pg)
 149   %add.ptr = getelementptr inbounds i8, ptr %ptr.addr, i64 %1
 150   tail call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> %val, ptr %add.ptr, i32 1, <vscale x 16 x i1> %pg)
 151   %add.ptr.1 = getelementptr inbounds i8, ptr %add.ptr, i64 %1
 152   tail call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> %val, ptr %add.ptr.1, i32 1, <vscale x 16 x i1> %pg)
 153   %add.ptr.2 = getelementptr inbounds i8, ptr %add.ptr.1, i64 %1
 154   tail call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> %val, ptr %add.ptr.2, i32 1, <vscale x 16 x i1> %pg)
 155   %add.ptr.3 = getelementptr inbounds i8, ptr %add.ptr.2, i64 %1
 156   %lsr.iv.next = add i32 %lsr.iv, -4
 157   %cmp = icmp eq i32 %lsr.iv.next, 0
 158   br i1 %cmp, label %for.exit, label %for.body
 159
 160 for.exit:
 161   ret void
 162 }
 163
 164 define void @inc_add_i32(i32 %first, i32 %N, ptr %in1, ptr %in2, ptr %out) #0 {
 165 ; CHECK-LABEL: define void @inc_add_i32
 166 ; CHECK-SAME: (i32 [[FIRST:%.*]], i32 [[N:%.*]], ptr [[IN1:%.*]], ptr [[IN2:%.*]], ptr [[OUT:%.*]]) #[[ATTR0]] {
 167 ; CHECK-NEXT:  entry:
 168 ; CHECK-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
 169 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 170 ; CHECK:       vector.body:
 171 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 172 ; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds float, ptr [[IN1]], i64 [[INDEX]]
 173 ; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP0]], align 4
 174 ; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds float, ptr [[IN2]], i64 [[INDEX]]
 175 ; CHECK-NEXT:    [[WIDE_LOAD16:%.*]] = load <vscale x 4 x float>, ptr [[TMP1]], align 4
 176 ; CHECK-NEXT:    [[TMP2:%.*]] = fmul <vscale x 4 x float> [[WIDE_LOAD]], [[WIDE_LOAD16]]
 177 ; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds float, ptr [[OUT]], i64 [[INDEX]]
 178 ; CHECK-NEXT:    store <vscale x 4 x float> [[TMP2]], ptr [[TMP3]], align 4
 179 ; CHECK-NEXT:    [[TMP4:%.*]] = tail call i32 @llvm.vscale.i32()
 180 ; CHECK-NEXT:    [[TMP5:%.*]] = zext i32 [[TMP4]] to i64
 181 ; CHECK-NEXT:    [[TMP6:%.*]] = shl nuw nsw i64 [[TMP5]], 2
 182 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]]
 183 ; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[WIDE_TRIP_COUNT]]
 184 ; CHECK-NEXT:    br i1 [[TMP7]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]]
 185 ; CHECK:       for.cond.cleanup:
 186 ; CHECK-NEXT:    ret void
 187 ;
     ; Input IR: same loop shape as @inc_add, except that vscale is obtained as an
     ; i32 and zero-extended to i64 before the shift. The assertions above expect
     ; the whole call/zext/shl chain to be re-created in vector.body directly
     ; before the add, leaving only the trip-count zext and the branch in the
     ; entry block.
 188 entry:
 189   %wide.trip.count = zext i32 %N to i64
 190   %0 = tail call i32 @llvm.vscale.i32()
 191   %1 = zext i32 %0 to i64                           ; widen the i32 vscale to the i64 index type
 192   %2 = shl nuw nsw i64 %1, 2                        ; %2 = vscale * 4, the per-iteration step
 193   br label %vector.body
 194
 195 vector.body:
 196   %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
 197   %3 = getelementptr inbounds float, ptr %in1, i64 %index
 198   %wide.load = load <vscale x 4 x float>, ptr %3, align 4
 199   %4 = getelementptr inbounds float, ptr %in2, i64 %index
 200   %wide.load16 = load <vscale x 4 x float>, ptr %4, align 4
 201   %5 = fmul <vscale x 4 x float> %wide.load, %wide.load16
 202   %6 = getelementptr inbounds float, ptr %out, i64 %index
 203   store <vscale x 4 x float> %5, ptr %6, align 4
 204   %index.next = add nuw i64 %index, %2              ; the only user of %2
 205   %7 = icmp eq i64 %index.next, %wide.trip.count
 206   br i1 %7, label %for.cond.cleanup, label %vector.body
 207
 208 for.cond.cleanup:
 209   ret void
 210 }
 211
 212 define void @dec_sub_i32(i32 %first, i32 %N, ptr %in1, ptr %in2, ptr %out) #0 {
 213 ; CHECK-LABEL: define void @dec_sub_i32
 214 ; CHECK-SAME: (i32 [[FIRST:%.*]], i32 [[N:%.*]], ptr [[IN1:%.*]], ptr [[IN2:%.*]], ptr [[OUT:%.*]]) #[[ATTR0]] {
 215 ; CHECK-NEXT:  entry:
 216 ; CHECK-NEXT:    [[TMP0:%.*]] = zext i32 [[N]] to i64
 217 ; CHECK-NEXT:    [[TMP1:%.*]] = tail call i32 @llvm.vscale.i32()
 218 ; CHECK-NEXT:    [[TMP2:%.*]] = zext i32 [[TMP1]] to i64
 219 ; CHECK-NEXT:    [[TMP3:%.*]] = shl nuw nsw i64 [[TMP2]], 2
 220 ; CHECK-NEXT:    [[TMP4:%.*]] = sub nsw i64 1, [[TMP3]]
 221 ; CHECK-NEXT:    [[INVARIANT_GEP:%.*]] = getelementptr float, ptr [[IN1]], i64 [[TMP4]]
 222 ; CHECK-NEXT:    [[INVARIANT_GEP20:%.*]] = getelementptr float, ptr [[IN2]], i64 [[TMP4]]
 223 ; CHECK-NEXT:    [[INVARIANT_GEP22:%.*]] = getelementptr float, ptr [[OUT]], i64 [[TMP4]]
 224 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 225 ; CHECK:       vector.body:
 226 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 227 ; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = sub i64 [[TMP0]], [[INDEX]]
 228 ; CHECK-NEXT:    [[GEP:%.*]] = getelementptr float, ptr [[INVARIANT_GEP]], i64 [[OFFSET_IDX]]
 229 ; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[GEP]], align 4
 230 ; CHECK-NEXT:    [[GEP21:%.*]] = getelementptr float, ptr [[INVARIANT_GEP20]], i64 [[OFFSET_IDX]]
 231 ; CHECK-NEXT:    [[WIDE_LOAD16:%.*]] = load <vscale x 4 x float>, ptr [[GEP21]], align 4
 232 ; CHECK-NEXT:    [[TMP5:%.*]] = fmul <vscale x 4 x float> [[WIDE_LOAD]], [[WIDE_LOAD16]]
 233 ; CHECK-NEXT:    [[GEP23:%.*]] = getelementptr float, ptr [[INVARIANT_GEP22]], i64 [[OFFSET_IDX]]
 234 ; CHECK-NEXT:    store <vscale x 4 x float> [[TMP5]], ptr [[GEP23]], align 4
 235 ; CHECK-NEXT:    [[TMP6:%.*]] = tail call i32 @llvm.vscale.i32()
 236 ; CHECK-NEXT:    [[TMP7:%.*]] = zext i32 [[TMP6]] to i64
 237 ; CHECK-NEXT:    [[TMP8:%.*]] = shl nuw nsw i64 [[TMP7]], 2
 238 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP8]]
 239 ; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[TMP0]]
 240 ; CHECK-NEXT:    br i1 [[TMP9]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]]
 241 ; CHECK:       for.cond.cleanup:
 242 ; CHECK-NEXT:    ret void
 243 ;
     ; Input IR: same loop shape as @dec_sub, except that vscale is obtained as an
     ; i32 and zero-extended to i64 before the shift. %3 (vscale * 4) is used both
     ; by the sub in the entry block and by %index.next in the loop. The
     ; assertions above expect the entry-block chain to stay in place, with an
     ; additional call/zext/shl chain created in vector.body directly before the
     ; add.
 244 entry:
 245   %0 = zext i32 %N to i64
 246   %1 = tail call i32 @llvm.vscale.i32()
 247   %2 = zext i32 %1 to i64                           ; widen the i32 vscale to the i64 index type
 248   %3 = shl nuw nsw i64 %2, 2                        ; %3 = vscale * 4
 249   %4 = sub nsw i64 1, %3                            ; entry-block user of %3
 250   %invariant.gep = getelementptr float, ptr %in1, i64 %4
 251   %invariant.gep20 = getelementptr float, ptr %in2, i64 %4
 252   %invariant.gep22 = getelementptr float, ptr %out, i64 %4
 253   br label %vector.body
 254
 255 vector.body:
 256   %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
 257   %offset.idx = sub i64 %0, %index
 258   %gep = getelementptr float, ptr %invariant.gep, i64 %offset.idx
 259   %wide.load = load <vscale x 4 x float>, ptr %gep, align 4
 260   %gep21 = getelementptr float, ptr %invariant.gep20, i64 %offset.idx
 261   %wide.load16 = load <vscale x 4 x float>, ptr %gep21, align 4
 262   %5 = fmul <vscale x 4 x float> %wide.load, %wide.load16
 263   %gep23 = getelementptr float, ptr %invariant.gep22, i64 %offset.idx
 264   store <vscale x 4 x float> %5, ptr %gep23, align 4
 265   %index.next = add nuw i64 %index, %3              ; in-loop user of %3
 266   %6 = icmp eq i64 %index.next, %0
 267   br i1 %6, label %for.cond.cleanup, label %vector.body
 268
 269 for.cond.cleanup:
 270   ret void
 271 }
 272
 273 define void @gep_i32(i32 noundef %first, i32 noundef %N, ptr nocapture noundef writeonly %ptr, <vscale x 16 x i1> %pg, <vscale x 16 x i8> %val) #0 {
 274 ; CHECK-LABEL: define void @gep_i32
 275 ; CHECK-SAME: (i32 noundef [[FIRST:%.*]], i32 noundef [[N:%.*]], ptr noundef writeonly captures(none) [[PTR:%.*]], <vscale x 16 x i1> [[PG:%.*]], <vscale x 16 x i8> [[VAL:%.*]]) #[[ATTR0]] {
 276 ; CHECK-NEXT:  entry:
 277 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 278 ; CHECK:       for.body:
 279 ; CHECK-NEXT:    [[LSR_IV:%.*]] = phi i32 [ [[N]], [[ENTRY:%.*]] ], [ [[LSR_IV_NEXT:%.*]], [[FOR_BODY]] ]
 280 ; CHECK-NEXT:    [[PTR_ADDR:%.*]] = phi ptr [ [[PTR]], [[ENTRY]] ], [ [[ADD_PTR_3:%.*]], [[FOR_BODY]] ]
 281 ; CHECK-NEXT:    tail call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[VAL]], ptr [[PTR_ADDR]], i32 1, <vscale x 16 x i1> [[PG]])
 282 ; CHECK-NEXT:    [[TMP0:%.*]] = tail call i32 @llvm.vscale.i32()
 283 ; CHECK-NEXT:    [[TMP1:%.*]] = zext i32 [[TMP0]] to i64
 284 ; CHECK-NEXT:    [[TMP2:%.*]] = shl i64 [[TMP1]], 4
 285 ; CHECK-NEXT:    [[ADD_PTR:%.*]] = getelementptr inbounds i8, ptr [[PTR_ADDR]], i64 [[TMP2]]
 286 ; CHECK-NEXT:    tail call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[VAL]], ptr [[ADD_PTR]], i32 1, <vscale x 16 x i1> [[PG]])
 287 ; CHECK-NEXT:    [[TMP3:%.*]] = tail call i32 @llvm.vscale.i32()
 288 ; CHECK-NEXT:    [[TMP4:%.*]] = zext i32 [[TMP3]] to i64
 289 ; CHECK-NEXT:    [[TMP5:%.*]] = shl i64 [[TMP4]], 4
 290 ; CHECK-NEXT:    [[ADD_PTR_1:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR]], i64 [[TMP5]]
 291 ; CHECK-NEXT:    tail call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[VAL]], ptr [[ADD_PTR_1]], i32 1, <vscale x 16 x i1> [[PG]])
 292 ; CHECK-NEXT:    [[TMP6:%.*]] = tail call i32 @llvm.vscale.i32()
 293 ; CHECK-NEXT:    [[TMP7:%.*]] = zext i32 [[TMP6]] to i64
 294 ; CHECK-NEXT:    [[TMP8:%.*]] = shl i64 [[TMP7]], 4
 295 ; CHECK-NEXT:    [[ADD_PTR_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_1]], i64 [[TMP8]]
 296 ; CHECK-NEXT:    tail call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[VAL]], ptr [[ADD_PTR_2]], i32 1, <vscale x 16 x i1> [[PG]])
 297 ; CHECK-NEXT:    [[TMP9:%.*]] = tail call i32 @llvm.vscale.i32()
 298 ; CHECK-NEXT:    [[TMP10:%.*]] = zext i32 [[TMP9]] to i64
 299 ; CHECK-NEXT:    [[TMP11:%.*]] = shl i64 [[TMP10]], 4
 300 ; CHECK-NEXT:    [[ADD_PTR_3]] = getelementptr inbounds i8, ptr [[ADD_PTR_2]], i64 [[TMP11]]
 301 ; CHECK-NEXT:    [[LSR_IV_NEXT]] = add i32 [[LSR_IV]], -4
 302 ; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[LSR_IV_NEXT]], 0
 303 ; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_EXIT:%.*]], label [[FOR_BODY]]
 304 ; CHECK:       for.exit:
 305 ; CHECK-NEXT:    ret void
 306 ;
     ; Input IR: same loop shape as @gep, except that vscale is obtained as an i32
     ; and zero-extended to i64 before the shift. %2 (vscale * 16) is the byte
     ; offset of four chained i8 getelementptrs in for.body. The assertions above
     ; expect the entry block to be reduced to the branch and a separate
     ; call/zext/shl chain to be emitted directly before each of the four
     ; getelementptrs.
 307 entry:
 308   %0 = tail call i32 @llvm.vscale.i32()
 309   %1 = zext i32 %0 to i64                           ; widen the i32 vscale to the i64 offset type
 310   %2 = shl i64 %1, 4                                ; %2 = vscale * 16, byte offset between consecutive stores
 311   br label %for.body
 312
 313 for.body:                                         ; preds = %for.body, %entry
 314   %lsr.iv = phi i32 [ %N, %entry ], [ %lsr.iv.next, %for.body ]
 315   %ptr.addr = phi ptr [ %ptr, %entry ], [ %add.ptr.3, %for.body ]
 316   tail call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> %val, ptr %ptr.addr, i32 1, <vscale x 16 x i1> %pg)
 317   %add.ptr = getelementptr inbounds i8, ptr %ptr.addr, i64 %2
 318   tail call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> %val, ptr %add.ptr, i32 1, <vscale x 16 x i1> %pg)
 319   %add.ptr.1 = getelementptr inbounds i8, ptr %add.ptr, i64 %2
 320   tail call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> %val, ptr %add.ptr.1, i32 1, <vscale x 16 x i1> %pg)
 321   %add.ptr.2 = getelementptr inbounds i8, ptr %add.ptr.1, i64 %2
 322   tail call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> %val, ptr %add.ptr.2, i32 1, <vscale x 16 x i1> %pg)
 323   %add.ptr.3 = getelementptr inbounds i8, ptr %add.ptr.2, i64 %2
 324   %lsr.iv.next = add i32 %lsr.iv, -4
 325   %cmp = icmp eq i32 %lsr.iv.next, 0
 326   br i1 %cmp, label %for.exit, label %for.body
 327
 328 for.exit:
 329   ret void
 330 }
 331
 332 declare void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8>, ptr nocapture, i32 immarg, <vscale x 16 x i1>)
 333
 334 declare i64 @llvm.vscale.i64()
     ; NOTE(review): @llvm.vscale.i32 is called by the *_i32 functions above but
     ; has no explicit declaration here; presumably the IR parser declares the
     ; intrinsic implicitly - confirm, or add a matching declare for consistency.
 335
     ; Attribute group #0 is attached to every function defined above and enables
     ; the +sve2 target feature.
 336 attributes #0 = { "target-features"="+sve2" }