1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve -disable-mve-tail-predication=false -enable-arm-maskedldst=true %s -o - | FileCheck %s
; Reduction kernel: i32 acc += zext(b[i]) * zext(a) over N i8 elements.
; The FileCheck assertions below expect tail-folding into a 4-wide masked
; load (vpt + vldrbt.u32) inside an ARM low-overhead loop (dls / le),
; finished by a predicated select (vpsel) and horizontal add (vaddv.u32).
4 define arm_aapcs_vfpcc i32 @test_acc_scalar_char(i8 zeroext %a, i8* nocapture readonly %b, i32 %N) {
5 ; CHECK-LABEL: test_acc_scalar_char:
6 ; CHECK: @ %bb.0: @ %entry
7 ; CHECK-NEXT: cmp r2, #0
9 ; CHECK-NEXT: moveq r0, #0
11 ; CHECK-NEXT: push {r7, lr}
12 ; CHECK-NEXT: vpush {d8, d9}
13 ; CHECK-NEXT: adds r3, r2, #3
14 ; CHECK-NEXT: subs r2, #1
15 ; CHECK-NEXT: bic r3, r3, #3
16 ; CHECK-NEXT: vdup.32 q1, r2
17 ; CHECK-NEXT: sub.w r12, r3, #4
18 ; CHECK-NEXT: movs r3, #1
19 ; CHECK-NEXT: vmov.i32 q0, #0x0
20 ; CHECK-NEXT: movs r2, #0
21 ; CHECK-NEXT: add.w lr, r3, r12, lsr #2
22 ; CHECK-NEXT: adr r3, .LCPI0_0
23 ; CHECK-NEXT: vldrw.u32 q2, [r3]
24 ; CHECK-NEXT: dls lr, lr
25 ; CHECK-NEXT: .LBB0_1: @ %vector.body
26 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
27 ; CHECK-NEXT: vadd.i32 q4, q2, r2
28 ; CHECK-NEXT: adds r3, r1, r2
29 ; CHECK-NEXT: adds r2, #4
30 ; CHECK-NEXT: vpt.u32 cs, q1, q4
31 ; CHECK-NEXT: vldrbt.u32 q4, [r3]
32 ; CHECK-NEXT: vmov q3, q0
33 ; CHECK-NEXT: vmla.u32 q0, q4, r0
34 ; CHECK-NEXT: le lr, .LBB0_1
35 ; CHECK-NEXT: @ %bb.2: @ %middle.block
36 ; CHECK-NEXT: vpsel q0, q0, q3
37 ; CHECK-NEXT: vaddv.u32 r0, q0
38 ; CHECK-NEXT: vpop {d8, d9}
39 ; CHECK-NEXT: pop {r7, pc}
40 ; CHECK-NEXT: .p2align 4
41 ; CHECK-NEXT: @ %bb.3:
42 ; CHECK-NEXT: .LCPI0_0:
43 ; CHECK-NEXT: .long 0 @ 0x0
44 ; CHECK-NEXT: .long 1 @ 0x1
45 ; CHECK-NEXT: .long 2 @ 0x2
46 ; CHECK-NEXT: .long 3 @ 0x3
; NOTE(review): several interstitial lines (the 'entry:' label, a
; 'br label %vector.body' terminator for %vector.ph, the final 'ret' and
; closing brace) appear to have been dropped by text extraction -- the gaps
; in the embedded numbering confirm this. Restore from the upstream test.
; Entry block: early-out returning 0 when N == 0.
48 %cmp7 = icmp eq i32 %N, 0
49 br i1 %cmp7, label %for.cond.cleanup, label %vector.ph
; Vector preheader: round N up to a multiple of 4, splat the trip count
; minus one (for the lane mask) and the zero-extended scalar %a.
51 vector.ph: ; preds = %entry
52 %conv = zext i8 %a to i32
53 %n.rnd.up = add i32 %N, 3
54 %n.vec = and i32 %n.rnd.up, -4
55 %trip.count.minus.1 = add i32 %N, -1
56 %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
57 %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
58 %broadcast.splatinsert12 = insertelement <4 x i32> undef, i32 %conv, i32 0
59 %broadcast.splat13 = shufflevector <4 x i32> %broadcast.splatinsert12, <4 x i32> undef, <4 x i32> zeroinitializer
; Tail-folded loop: per-lane mask %1 = (index+lane <= N-1) guards the
; masked load; inactive lanes load undef but are discarded in middle.block.
62 vector.body: ; preds = %vector.body, %vector.ph
63 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
64 %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %5, %vector.body ]
65 %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
66 %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
67 %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
68 %0 = getelementptr inbounds i8, i8* %b, i32 %index
69 %1 = icmp ule <4 x i32> %induction, %broadcast.splat11
70 %2 = bitcast i8* %0 to <4 x i8>*
71 %wide.masked.load = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %2, i32 1, <4 x i1> %1, <4 x i8> undef)
72 %3 = zext <4 x i8> %wide.masked.load to <4 x i32>
73 %4 = mul nuw nsw <4 x i32> %broadcast.splat13, %3
74 %5 = add nuw nsw <4 x i32> %4, %vec.phi
75 %index.next = add i32 %index, 4
76 %6 = icmp eq i32 %index.next, %n.vec
77 br i1 %6, label %middle.block, label %vector.body
; Keep only the lanes that were active on the final iteration, then reduce.
79 middle.block: ; preds = %vector.body
80 %7 = select <4 x i1> %1, <4 x i32> %5, <4 x i32> %vec.phi
81 %8 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %7)
82 br label %for.cond.cleanup
84 for.cond.cleanup: ; preds = %middle.block, %entry
85 %res.0.lcssa = phi i32 [ 0, %entry ], [ %8, %middle.block ]
; Reduction kernel: i32 acc += sext(b[i]) * sext(a) over N i16 elements.
; Same tail-folded shape as the char variant, but the masked load widens
; signed halfwords (vldrht.s32) and the pointer is advanced by 8 bytes.
89 define arm_aapcs_vfpcc i32 @test_acc_scalar_short(i16 signext %a, i16* nocapture readonly %b, i32 %N) {
90 ; CHECK-LABEL: test_acc_scalar_short:
91 ; CHECK: @ %bb.0: @ %entry
92 ; CHECK-NEXT: cmp r2, #0
94 ; CHECK-NEXT: moveq r0, #0
96 ; CHECK-NEXT: push {r7, lr}
97 ; CHECK-NEXT: vpush {d8, d9}
98 ; CHECK-NEXT: adds r3, r2, #3
99 ; CHECK-NEXT: subs r2, #1
100 ; CHECK-NEXT: bic r3, r3, #3
101 ; CHECK-NEXT: vdup.32 q1, r2
102 ; CHECK-NEXT: sub.w r12, r3, #4
103 ; CHECK-NEXT: movs r3, #1
104 ; CHECK-NEXT: vmov.i32 q0, #0x0
105 ; CHECK-NEXT: movs r2, #0
106 ; CHECK-NEXT: add.w lr, r3, r12, lsr #2
107 ; CHECK-NEXT: adr r3, .LCPI1_0
108 ; CHECK-NEXT: vldrw.u32 q2, [r3]
109 ; CHECK-NEXT: dls lr, lr
110 ; CHECK-NEXT: .LBB1_1: @ %vector.body
111 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
112 ; CHECK-NEXT: vadd.i32 q4, q2, r2
113 ; CHECK-NEXT: adds r2, #4
114 ; CHECK-NEXT: vpt.u32 cs, q1, q4
115 ; CHECK-NEXT: vldrht.s32 q4, [r1]
116 ; CHECK-NEXT: adds r1, #8
117 ; CHECK-NEXT: vmov q3, q0
118 ; CHECK-NEXT: vmla.u32 q0, q4, r0
119 ; CHECK-NEXT: le lr, .LBB1_1
120 ; CHECK-NEXT: @ %bb.2: @ %middle.block
121 ; CHECK-NEXT: vpsel q0, q0, q3
122 ; CHECK-NEXT: vaddv.u32 r0, q0
123 ; CHECK-NEXT: vpop {d8, d9}
124 ; CHECK-NEXT: pop {r7, pc}
125 ; CHECK-NEXT: .p2align 4
126 ; CHECK-NEXT: @ %bb.3:
127 ; CHECK-NEXT: .LCPI1_0:
128 ; CHECK-NEXT: .long 0 @ 0x0
129 ; CHECK-NEXT: .long 1 @ 0x1
130 ; CHECK-NEXT: .long 2 @ 0x2
131 ; CHECK-NEXT: .long 3 @ 0x3
; NOTE(review): the 'entry:' label, some branch terminators and the final
; 'ret'/closing brace were dropped by text extraction (numbering gaps);
; restore from the upstream test before regenerating assertions.
; Entry block: early-out returning 0 when N == 0.
133 %cmp7 = icmp eq i32 %N, 0
134 br i1 %cmp7, label %for.cond.cleanup, label %vector.ph
; Preheader: round the trip count, splat N-1 (mask bound) and sext(%a).
136 vector.ph: ; preds = %entry
137 %conv = sext i16 %a to i32
138 %n.rnd.up = add i32 %N, 3
139 %n.vec = and i32 %n.rnd.up, -4
140 %trip.count.minus.1 = add i32 %N, -1
141 %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
142 %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
143 %broadcast.splatinsert12 = insertelement <4 x i32> undef, i32 %conv, i32 0
144 %broadcast.splat13 = shufflevector <4 x i32> %broadcast.splatinsert12, <4 x i32> undef, <4 x i32> zeroinitializer
145 br label %vector.body
; Tail-folded loop: lane mask %1 guards the masked i16 load; the products
; are sign-extended multiplies accumulated into %vec.phi.
147 vector.body: ; preds = %vector.body, %vector.ph
148 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
149 %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %5, %vector.body ]
150 %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
151 %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
152 %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
153 %0 = getelementptr inbounds i16, i16* %b, i32 %index
154 %1 = icmp ule <4 x i32> %induction, %broadcast.splat11
155 %2 = bitcast i16* %0 to <4 x i16>*
156 %wide.masked.load = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %2, i32 2, <4 x i1> %1, <4 x i16> undef)
157 %3 = sext <4 x i16> %wide.masked.load to <4 x i32>
158 %4 = mul nsw <4 x i32> %broadcast.splat13, %3
159 %5 = add nsw <4 x i32> %4, %vec.phi
160 %index.next = add i32 %index, 4
161 %6 = icmp eq i32 %index.next, %n.vec
162 br i1 %6, label %middle.block, label %vector.body
; Discard lanes masked off on the last iteration, then horizontal add.
164 middle.block: ; preds = %vector.body
165 %7 = select <4 x i1> %1, <4 x i32> %5, <4 x i32> %vec.phi
166 %8 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %7)
167 br label %for.cond.cleanup
169 for.cond.cleanup: ; preds = %middle.block, %entry
170 %res.0.lcssa = phi i32 [ 0, %entry ], [ %8, %middle.block ]
; Reduction kernel: i32 acc += zext(b[i]) * zext(a) over N i8 elements.
; IR is identical in structure to test_acc_scalar_char; kept as a separate
; function so the unsigned-char path gets its own FileCheck coverage.
174 define arm_aapcs_vfpcc i32 @test_acc_scalar_uchar(i8 zeroext %a, i8* nocapture readonly %b, i32 %N) {
175 ; CHECK-LABEL: test_acc_scalar_uchar:
176 ; CHECK: @ %bb.0: @ %entry
177 ; CHECK-NEXT: cmp r2, #0
179 ; CHECK-NEXT: moveq r0, #0
180 ; CHECK-NEXT: bxeq lr
181 ; CHECK-NEXT: push {r7, lr}
182 ; CHECK-NEXT: vpush {d8, d9}
183 ; CHECK-NEXT: adds r3, r2, #3
184 ; CHECK-NEXT: subs r2, #1
185 ; CHECK-NEXT: bic r3, r3, #3
186 ; CHECK-NEXT: vdup.32 q1, r2
187 ; CHECK-NEXT: sub.w r12, r3, #4
188 ; CHECK-NEXT: movs r3, #1
189 ; CHECK-NEXT: vmov.i32 q0, #0x0
190 ; CHECK-NEXT: movs r2, #0
191 ; CHECK-NEXT: add.w lr, r3, r12, lsr #2
192 ; CHECK-NEXT: adr r3, .LCPI2_0
193 ; CHECK-NEXT: vldrw.u32 q2, [r3]
194 ; CHECK-NEXT: dls lr, lr
195 ; CHECK-NEXT: .LBB2_1: @ %vector.body
196 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
197 ; CHECK-NEXT: vadd.i32 q4, q2, r2
198 ; CHECK-NEXT: adds r3, r1, r2
199 ; CHECK-NEXT: adds r2, #4
200 ; CHECK-NEXT: vpt.u32 cs, q1, q4
201 ; CHECK-NEXT: vldrbt.u32 q4, [r3]
202 ; CHECK-NEXT: vmov q3, q0
203 ; CHECK-NEXT: vmla.u32 q0, q4, r0
204 ; CHECK-NEXT: le lr, .LBB2_1
205 ; CHECK-NEXT: @ %bb.2: @ %middle.block
206 ; CHECK-NEXT: vpsel q0, q0, q3
207 ; CHECK-NEXT: vaddv.u32 r0, q0
208 ; CHECK-NEXT: vpop {d8, d9}
209 ; CHECK-NEXT: pop {r7, pc}
210 ; CHECK-NEXT: .p2align 4
211 ; CHECK-NEXT: @ %bb.3:
212 ; CHECK-NEXT: .LCPI2_0:
213 ; CHECK-NEXT: .long 0 @ 0x0
214 ; CHECK-NEXT: .long 1 @ 0x1
215 ; CHECK-NEXT: .long 2 @ 0x2
216 ; CHECK-NEXT: .long 3 @ 0x3
; NOTE(review): 'entry:' label, some terminators and the trailing
; 'ret'/closing brace were lost in extraction (numbering gaps); restore
; them from the upstream test.
; Entry block: early-out returning 0 when N == 0.
218 %cmp7 = icmp eq i32 %N, 0
219 br i1 %cmp7, label %for.cond.cleanup, label %vector.ph
; Preheader: round trip count to a multiple of 4; splat N-1 and zext(%a).
221 vector.ph: ; preds = %entry
222 %conv = zext i8 %a to i32
223 %n.rnd.up = add i32 %N, 3
224 %n.vec = and i32 %n.rnd.up, -4
225 %trip.count.minus.1 = add i32 %N, -1
226 %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
227 %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
228 %broadcast.splatinsert12 = insertelement <4 x i32> undef, i32 %conv, i32 0
229 %broadcast.splat13 = shufflevector <4 x i32> %broadcast.splatinsert12, <4 x i32> undef, <4 x i32> zeroinitializer
230 br label %vector.body
; Tail-folded 4-wide masked multiply-accumulate loop.
232 vector.body: ; preds = %vector.body, %vector.ph
233 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
234 %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %5, %vector.body ]
235 %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
236 %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
237 %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
238 %0 = getelementptr inbounds i8, i8* %b, i32 %index
239 %1 = icmp ule <4 x i32> %induction, %broadcast.splat11
240 %2 = bitcast i8* %0 to <4 x i8>*
241 %wide.masked.load = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %2, i32 1, <4 x i1> %1, <4 x i8> undef)
242 %3 = zext <4 x i8> %wide.masked.load to <4 x i32>
243 %4 = mul nuw nsw <4 x i32> %broadcast.splat13, %3
244 %5 = add nuw nsw <4 x i32> %4, %vec.phi
245 %index.next = add i32 %index, 4
246 %6 = icmp eq i32 %index.next, %n.vec
247 br i1 %6, label %middle.block, label %vector.body
; Mask off the tail lanes from the final iteration, then reduce to scalar.
249 middle.block: ; preds = %vector.body
250 %7 = select <4 x i1> %1, <4 x i32> %5, <4 x i32> %vec.phi
251 %8 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %7)
252 br label %for.cond.cleanup
254 for.cond.cleanup: ; preds = %middle.block, %entry
255 %res.0.lcssa = phi i32 [ 0, %entry ], [ %8, %middle.block ]
; Reduction kernel: i32 acc += zext(b[i]) * sext(a) over N i16 elements.
; Differs from test_acc_scalar_short only in the load extension: the
; masked load is zero-extended (vldrht.u32) while %a is still sext'd.
259 define arm_aapcs_vfpcc i32 @test_acc_scalar_ushort(i16 signext %a, i16* nocapture readonly %b, i32 %N) {
260 ; CHECK-LABEL: test_acc_scalar_ushort:
261 ; CHECK: @ %bb.0: @ %entry
262 ; CHECK-NEXT: cmp r2, #0
264 ; CHECK-NEXT: moveq r0, #0
265 ; CHECK-NEXT: bxeq lr
266 ; CHECK-NEXT: push {r7, lr}
267 ; CHECK-NEXT: vpush {d8, d9}
268 ; CHECK-NEXT: adds r3, r2, #3
269 ; CHECK-NEXT: subs r2, #1
270 ; CHECK-NEXT: bic r3, r3, #3
271 ; CHECK-NEXT: vdup.32 q1, r2
272 ; CHECK-NEXT: sub.w r12, r3, #4
273 ; CHECK-NEXT: movs r3, #1
274 ; CHECK-NEXT: vmov.i32 q0, #0x0
275 ; CHECK-NEXT: movs r2, #0
276 ; CHECK-NEXT: add.w lr, r3, r12, lsr #2
277 ; CHECK-NEXT: adr r3, .LCPI3_0
278 ; CHECK-NEXT: vldrw.u32 q2, [r3]
279 ; CHECK-NEXT: dls lr, lr
280 ; CHECK-NEXT: .LBB3_1: @ %vector.body
281 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
282 ; CHECK-NEXT: vadd.i32 q4, q2, r2
283 ; CHECK-NEXT: adds r2, #4
284 ; CHECK-NEXT: vpt.u32 cs, q1, q4
285 ; CHECK-NEXT: vldrht.u32 q4, [r1]
286 ; CHECK-NEXT: adds r1, #8
287 ; CHECK-NEXT: vmov q3, q0
288 ; CHECK-NEXT: vmla.u32 q0, q4, r0
289 ; CHECK-NEXT: le lr, .LBB3_1
290 ; CHECK-NEXT: @ %bb.2: @ %middle.block
291 ; CHECK-NEXT: vpsel q0, q0, q3
292 ; CHECK-NEXT: vaddv.u32 r0, q0
293 ; CHECK-NEXT: vpop {d8, d9}
294 ; CHECK-NEXT: pop {r7, pc}
295 ; CHECK-NEXT: .p2align 4
296 ; CHECK-NEXT: @ %bb.3:
297 ; CHECK-NEXT: .LCPI3_0:
298 ; CHECK-NEXT: .long 0 @ 0x0
299 ; CHECK-NEXT: .long 1 @ 0x1
300 ; CHECK-NEXT: .long 2 @ 0x2
301 ; CHECK-NEXT: .long 3 @ 0x3
; NOTE(review): 'entry:' label, some terminators and the trailing
; 'ret'/closing brace were lost in extraction (numbering gaps); restore
; them from the upstream test.
; Entry block: early-out returning 0 when N == 0.
303 %cmp7 = icmp eq i32 %N, 0
304 br i1 %cmp7, label %for.cond.cleanup, label %vector.ph
; Preheader: round trip count; splat N-1 (mask bound) and sext(%a).
306 vector.ph: ; preds = %entry
307 %conv = sext i16 %a to i32
308 %n.rnd.up = add i32 %N, 3
309 %n.vec = and i32 %n.rnd.up, -4
310 %trip.count.minus.1 = add i32 %N, -1
311 %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
312 %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
313 %broadcast.splatinsert12 = insertelement <4 x i32> undef, i32 %conv, i32 0
314 %broadcast.splat13 = shufflevector <4 x i32> %broadcast.splatinsert12, <4 x i32> undef, <4 x i32> zeroinitializer
315 br label %vector.body
; Tail-folded loop: masked i16 load is zero-extended before the multiply.
317 vector.body: ; preds = %vector.body, %vector.ph
318 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
319 %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %5, %vector.body ]
320 %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
321 %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
322 %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
323 %0 = getelementptr inbounds i16, i16* %b, i32 %index
324 %1 = icmp ule <4 x i32> %induction, %broadcast.splat11
325 %2 = bitcast i16* %0 to <4 x i16>*
326 %wide.masked.load = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %2, i32 2, <4 x i1> %1, <4 x i16> undef)
327 %3 = zext <4 x i16> %wide.masked.load to <4 x i32>
328 %4 = mul nsw <4 x i32> %broadcast.splat13, %3
329 %5 = add nsw <4 x i32> %4, %vec.phi
330 %index.next = add i32 %index, 4
331 %6 = icmp eq i32 %index.next, %n.vec
332 br i1 %6, label %middle.block, label %vector.body
; Mask off the tail lanes from the final iteration, then reduce to scalar.
334 middle.block: ; preds = %vector.body
335 %7 = select <4 x i1> %1, <4 x i32> %5, <4 x i32> %vec.phi
336 %8 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %7)
337 br label %for.cond.cleanup
339 for.cond.cleanup: ; preds = %middle.block, %entry
340 %res.0.lcssa = phi i32 [ 0, %entry ], [ %8, %middle.block ]
; Reduction kernel: i32 acc += b[i] * a over N i32 elements (no widening).
; Unlike the narrow-type variants above, the assertions here expect the
; tail-predication pass to kick in fully: the lane mask is produced by
; vctp.32 on the remaining element count rather than an induction compare
; against a constant-pool vector.
344 define arm_aapcs_vfpcc i32 @test_acc_scalar_int(i32 %a, i32* nocapture readonly %b, i32 %N) {
345 ; CHECK-LABEL: test_acc_scalar_int:
346 ; CHECK: @ %bb.0: @ %entry
347 ; CHECK-NEXT: cmp r2, #0
349 ; CHECK-NEXT: moveq r0, #0
350 ; CHECK-NEXT: bxeq lr
351 ; CHECK-NEXT: push {r7, lr}
352 ; CHECK-NEXT: adds r3, r2, #3
353 ; CHECK-NEXT: vmov.i32 q0, #0x0
354 ; CHECK-NEXT: bic r3, r3, #3
355 ; CHECK-NEXT: sub.w r12, r3, #4
356 ; CHECK-NEXT: movs r3, #1
357 ; CHECK-NEXT: add.w lr, r3, r12, lsr #2
358 ; CHECK-NEXT: dls lr, lr
359 ; CHECK-NEXT: .LBB4_1: @ %vector.body
360 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
361 ; CHECK-NEXT: vctp.32 r2
363 ; CHECK-NEXT: vldrwt.u32 q2, [r1]
364 ; CHECK-NEXT: mov r3, r2
365 ; CHECK-NEXT: adds r1, #16
366 ; CHECK-NEXT: subs r2, #4
367 ; CHECK-NEXT: vmov q1, q0
368 ; CHECK-NEXT: vmla.u32 q0, q2, r0
369 ; CHECK-NEXT: le lr, .LBB4_1
370 ; CHECK-NEXT: @ %bb.2: @ %middle.block
371 ; CHECK-NEXT: vctp.32 r3
372 ; CHECK-NEXT: vpsel q0, q0, q1
373 ; CHECK-NEXT: vaddv.u32 r0, q0
374 ; CHECK-NEXT: pop {r7, pc}
; NOTE(review): text extraction dropped a few lines here -- among them one
; assertion line between the 'vctp.32 r2' and 'vldrwt.u32' checks (the
; embedded numbering skips 362), the 'entry:' label, and the trailing
; 'ret'/closing brace. Restore from the upstream test.
; Entry block: early-out returning 0 when N == 0.
376 %cmp6 = icmp eq i32 %N, 0
377 br i1 %cmp6, label %for.cond.cleanup, label %vector.ph
; Preheader: round trip count to a multiple of 4; splat N-1 and %a.
379 vector.ph: ; preds = %entry
380 %n.rnd.up = add i32 %N, 3
381 %n.vec = and i32 %n.rnd.up, -4
382 %trip.count.minus.1 = add i32 %N, -1
383 %broadcast.splatinsert9 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
384 %broadcast.splat10 = shufflevector <4 x i32> %broadcast.splatinsert9, <4 x i32> undef, <4 x i32> zeroinitializer
385 %broadcast.splatinsert11 = insertelement <4 x i32> undef, i32 %a, i32 0
386 %broadcast.splat12 = shufflevector <4 x i32> %broadcast.splatinsert11, <4 x i32> undef, <4 x i32> zeroinitializer
387 br label %vector.body
; Tail-folded loop: masked i32 load, multiply by splat(%a), accumulate.
389 vector.body: ; preds = %vector.body, %vector.ph
390 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
391 %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %4, %vector.body ]
392 %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
393 %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
394 %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
395 %0 = getelementptr inbounds i32, i32* %b, i32 %index
396 %1 = icmp ule <4 x i32> %induction, %broadcast.splat10
397 %2 = bitcast i32* %0 to <4 x i32>*
398 %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %2, i32 4, <4 x i1> %1, <4 x i32> undef)
399 %3 = mul nsw <4 x i32> %wide.masked.load, %broadcast.splat12
400 %4 = add nsw <4 x i32> %3, %vec.phi
401 %index.next = add i32 %index, 4
402 %5 = icmp eq i32 %index.next, %n.vec
403 br i1 %5, label %middle.block, label %vector.body
; Mask off the tail lanes from the final iteration, then reduce to scalar.
405 middle.block: ; preds = %vector.body
406 %6 = select <4 x i1> %1, <4 x i32> %4, <4 x i32> %vec.phi
407 %7 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %6)
408 br label %for.cond.cleanup
410 for.cond.cleanup: ; preds = %middle.block, %entry
411 %res.0.lcssa = phi i32 [ 0, %entry ], [ %7, %middle.block ]
; Elementwise kernel: res[i] = zext(a[i]) * zext(b[i]) + zext(c) for N i8
; elements with i32 stores. Because %res may alias %a/%b, the IR contains
; runtime overlap checks: the no-conflict path is the tail-folded masked
; vector loop, while the conflict path is a 4x-unrolled scalar loop plus a
; remainder epilogue (seen in the assertions as smlabb sequences and a
; wls/le epilogue loop).
415 define arm_aapcs_vfpcc void @test_vec_mul_scalar_add_char(i8* nocapture readonly %a, i8* nocapture readonly %b, i8 zeroext %c, i32* nocapture %res, i32 %N) {
416 ; CHECK-LABEL: test_vec_mul_scalar_add_char:
417 ; CHECK: @ %bb.0: @ %entry
418 ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, lr}
419 ; CHECK-NEXT: ldr r7, [sp, #28]
420 ; CHECK-NEXT: cmp r7, #0
421 ; CHECK-NEXT: beq.w .LBB5_12
422 ; CHECK-NEXT: @ %bb.1: @ %for.body.lr.ph
423 ; CHECK-NEXT: add.w r4, r3, r7, lsl #2
424 ; CHECK-NEXT: adds r5, r1, r7
425 ; CHECK-NEXT: cmp r4, r1
426 ; CHECK-NEXT: add.w r6, r0, r7
427 ; CHECK-NEXT: cset r12, hi
428 ; CHECK-NEXT: cmp r5, r3
429 ; CHECK-NEXT: cset r5, hi
430 ; CHECK-NEXT: cmp r4, r0
431 ; CHECK-NEXT: cset r4, hi
432 ; CHECK-NEXT: cmp r6, r3
433 ; CHECK-NEXT: cset r6, hi
434 ; CHECK-NEXT: ands r6, r4
435 ; CHECK-NEXT: lsls r6, r6, #31
437 ; CHECK-NEXT: andeq.w r6, r5, r12
438 ; CHECK-NEXT: lslseq.w r6, r6, #31
439 ; CHECK-NEXT: beq .LBB5_4
440 ; CHECK-NEXT: @ %bb.2: @ %for.body.preheader
441 ; CHECK-NEXT: subs r6, r7, #1
442 ; CHECK-NEXT: and lr, r7, #3
443 ; CHECK-NEXT: cmp r6, #3
444 ; CHECK-NEXT: bhs .LBB5_6
445 ; CHECK-NEXT: @ %bb.3:
446 ; CHECK-NEXT: movs r7, #0
447 ; CHECK-NEXT: b .LBB5_9
448 ; CHECK-NEXT: .LBB5_4: @ %vector.ph
449 ; CHECK-NEXT: adds r6, r7, #3
450 ; CHECK-NEXT: movs r5, #1
451 ; CHECK-NEXT: bic r6, r6, #3
452 ; CHECK-NEXT: subs r7, #1
453 ; CHECK-NEXT: subs r6, #4
454 ; CHECK-NEXT: vdup.32 q0, r7
455 ; CHECK-NEXT: movs r7, #0
456 ; CHECK-NEXT: add.w lr, r5, r6, lsr #2
457 ; CHECK-NEXT: adr r6, .LCPI5_0
458 ; CHECK-NEXT: vldrw.u32 q1, [r6]
459 ; CHECK-NEXT: dls lr, lr
460 ; CHECK-NEXT: .LBB5_5: @ %vector.body
461 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
462 ; CHECK-NEXT: vadd.i32 q2, q1, r7
463 ; CHECK-NEXT: adds r4, r0, r7
464 ; CHECK-NEXT: vpt.u32 cs, q0, q2
465 ; CHECK-NEXT: vldrbt.u32 q2, [r4]
466 ; CHECK-NEXT: adds r4, r1, r7
468 ; CHECK-NEXT: vldrbt.u32 q3, [r4]
469 ; CHECK-NEXT: vmul.i32 q2, q3, q2
470 ; CHECK-NEXT: vadd.i32 q2, q2, r2
472 ; CHECK-NEXT: vstrwt.32 q2, [r3]
473 ; CHECK-NEXT: adds r3, #16
474 ; CHECK-NEXT: adds r7, #4
475 ; CHECK-NEXT: le lr, .LBB5_5
476 ; CHECK-NEXT: b .LBB5_12
477 ; CHECK-NEXT: .LBB5_6: @ %for.body.preheader.new
478 ; CHECK-NEXT: sub.w r12, lr, r7
479 ; CHECK-NEXT: subs r4, r1, #3
480 ; CHECK-NEXT: subs r5, r0, #3
481 ; CHECK-NEXT: sub.w r7, r3, #16
482 ; CHECK-NEXT: mov.w r9, #0
483 ; CHECK-NEXT: .LBB5_7: @ %for.body
484 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
485 ; CHECK-NEXT: ldrb.w r8, [r5, #3]
486 ; CHECK-NEXT: sub.w r9, r9, #4
487 ; CHECK-NEXT: ldrb r6, [r4, #3]
488 ; CHECK-NEXT: cmp r12, r9
489 ; CHECK-NEXT: smlabb r6, r6, r8, r2
490 ; CHECK-NEXT: str r6, [r7, #16]!
491 ; CHECK-NEXT: ldrb r8, [r5, #4]!
492 ; CHECK-NEXT: ldrb r6, [r4, #4]!
493 ; CHECK-NEXT: smlabb r6, r6, r8, r2
494 ; CHECK-NEXT: str r6, [r7, #4]
495 ; CHECK-NEXT: ldrb.w r8, [r5, #1]
496 ; CHECK-NEXT: ldrb r6, [r4, #1]
497 ; CHECK-NEXT: smlabb r6, r6, r8, r2
498 ; CHECK-NEXT: str r6, [r7, #8]
499 ; CHECK-NEXT: ldrb.w r8, [r5, #2]
500 ; CHECK-NEXT: ldrb r6, [r4, #2]
501 ; CHECK-NEXT: smlabb r6, r6, r8, r2
502 ; CHECK-NEXT: str r6, [r7, #12]
503 ; CHECK-NEXT: bne .LBB5_7
504 ; CHECK-NEXT: @ %bb.8: @ %for.cond.cleanup.loopexit.unr-lcssa.loopexit
505 ; CHECK-NEXT: rsb.w r7, r9, #0
506 ; CHECK-NEXT: .LBB5_9: @ %for.cond.cleanup.loopexit.unr-lcssa
507 ; CHECK-NEXT: wls lr, lr, .LBB5_12
508 ; CHECK-NEXT: @ %bb.10: @ %for.body.epil.preheader
509 ; CHECK-NEXT: subs r7, #1
510 ; CHECK-NEXT: add r0, r7
511 ; CHECK-NEXT: add r1, r7
512 ; CHECK-NEXT: add.w r3, r3, r7, lsl #2
513 ; CHECK-NEXT: .LBB5_11: @ %for.body.epil
514 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
515 ; CHECK-NEXT: ldrb r7, [r0, #1]!
516 ; CHECK-NEXT: ldrb r6, [r1, #1]!
517 ; CHECK-NEXT: smlabb r7, r6, r7, r2
518 ; CHECK-NEXT: str r7, [r3, #4]!
519 ; CHECK-NEXT: le lr, .LBB5_11
520 ; CHECK-NEXT: .LBB5_12: @ %for.cond.cleanup
521 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, pc}
522 ; CHECK-NEXT: .p2align 4
523 ; CHECK-NEXT: @ %bb.13:
524 ; CHECK-NEXT: .LCPI5_0:
525 ; CHECK-NEXT: .long 0 @ 0x0
526 ; CHECK-NEXT: .long 1 @ 0x1
527 ; CHECK-NEXT: .long 2 @ 0x2
528 ; CHECK-NEXT: .long 3 @ 0x3
; NOTE(review): extraction dropped several lines in this function (gaps at
; embedded numbers 436, 467, 471, 550, 557, 616, etc.), including the
; 'entry:' label, a definition of %0 in for.body.preheader (used by the
; icmp on the next line), block terminators, 'ret void' lines and the
; closing brace. Restore from the upstream test before editing further.
; Entry block: guard N == 0, otherwise run the runtime alias checks.
530 %res12 = bitcast i32* %res to i8*
531 %cmp10 = icmp eq i32 %N, 0
532 br i1 %cmp10, label %for.cond.cleanup, label %for.body.lr.ph
; Runtime overlap checks between res[0..N) and a[0..N) / b[0..N):
; any conflict forces the scalar fallback, otherwise the vector loop runs.
534 for.body.lr.ph: ; preds = %entry
535 %conv3 = zext i8 %c to i32
536 %scevgep = getelementptr i32, i32* %res, i32 %N
537 %scevgep13 = bitcast i32* %scevgep to i8*
538 %scevgep14 = getelementptr i8, i8* %a, i32 %N
539 %scevgep15 = getelementptr i8, i8* %b, i32 %N
540 %bound0 = icmp ugt i8* %scevgep14, %res12
541 %bound1 = icmp ugt i8* %scevgep13, %a
542 %found.conflict = and i1 %bound0, %bound1
543 %bound016 = icmp ugt i8* %scevgep15, %res12
544 %bound117 = icmp ugt i8* %scevgep13, %b
545 %found.conflict18 = and i1 %bound016, %bound117
546 %conflict.rdx = or i1 %found.conflict, %found.conflict18
547 br i1 %conflict.rdx, label %for.body.preheader, label %vector.ph
; Scalar fallback setup: split N into 4x-unrolled body + %xtraiter tail.
; NOTE(review): %0 (compared below) is defined on a dropped line.
549 for.body.preheader: ; preds = %for.body.lr.ph
551 %xtraiter = and i32 %N, 3
552 %1 = icmp ult i32 %0, 3
553 br i1 %1, label %for.cond.cleanup.loopexit.unr-lcssa, label %for.body.preheader.new
555 for.body.preheader.new: ; preds = %for.body.preheader
556 %unroll_iter = sub i32 %N, %xtraiter
; Vector preheader (no-alias path): round trip count, splat N-1 and c.
559 vector.ph: ; preds = %for.body.lr.ph
560 %n.rnd.up = add i32 %N, 3
561 %n.vec = and i32 %n.rnd.up, -4
562 %trip.count.minus.1 = add i32 %N, -1
563 %broadcast.splatinsert19 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
564 %broadcast.splat20 = shufflevector <4 x i32> %broadcast.splatinsert19, <4 x i32> undef, <4 x i32> zeroinitializer
565 %broadcast.splatinsert22 = insertelement <4 x i32> undef, i32 %conv3, i32 0
566 %broadcast.splat23 = shufflevector <4 x i32> %broadcast.splatinsert22, <4 x i32> undef, <4 x i32> zeroinitializer
567 br label %vector.body
; Tail-folded vector loop: two masked i8 loads, widen, multiply, add the
; splat of c, masked store of the i32 results.
569 vector.body: ; preds = %vector.body, %vector.ph
570 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
571 %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
572 %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
573 %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
574 %2 = getelementptr inbounds i8, i8* %a, i32 %index
575 %3 = icmp ule <4 x i32> %induction, %broadcast.splat20
576 %4 = bitcast i8* %2 to <4 x i8>*
577 %wide.masked.load = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %4, i32 1, <4 x i1> %3, <4 x i8> undef)
578 %5 = zext <4 x i8> %wide.masked.load to <4 x i32>
579 %6 = getelementptr inbounds i8, i8* %b, i32 %index
580 %7 = bitcast i8* %6 to <4 x i8>*
581 %wide.masked.load21 = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %7, i32 1, <4 x i1> %3, <4 x i8> undef)
582 %8 = zext <4 x i8> %wide.masked.load21 to <4 x i32>
583 %9 = mul nuw nsw <4 x i32> %8, %5
584 %10 = add nuw nsw <4 x i32> %9, %broadcast.splat23
585 %11 = getelementptr inbounds i32, i32* %res, i32 %index
586 %12 = bitcast i32* %11 to <4 x i32>*
587 call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %10, <4 x i32>* %12, i32 4, <4 x i1> %3)
588 %index.next = add i32 %index, 4
589 %13 = icmp eq i32 %index.next, %n.vec
590 br i1 %13, label %for.cond.cleanup, label %vector.body
; Remainder handling for the scalar fallback: run %xtraiter epilogue
; iterations (0..3) left over from the 4x-unrolled body.
592 for.cond.cleanup.loopexit.unr-lcssa: ; preds = %for.body, %for.body.preheader
593 %i.011.unr = phi i32 [ 0, %for.body.preheader ], [ %inc.3, %for.body ]
594 %lcmp.mod = icmp eq i32 %xtraiter, 0
595 br i1 %lcmp.mod, label %for.cond.cleanup, label %for.body.epil
; Scalar epilogue: one element per iteration.
597 for.body.epil: ; preds = %for.cond.cleanup.loopexit.unr-lcssa, %for.body.epil
598 %i.011.epil = phi i32 [ %inc.epil, %for.body.epil ], [ %i.011.unr, %for.cond.cleanup.loopexit.unr-lcssa ]
599 %epil.iter = phi i32 [ %epil.iter.sub, %for.body.epil ], [ %xtraiter, %for.cond.cleanup.loopexit.unr-lcssa ]
600 %arrayidx.epil = getelementptr inbounds i8, i8* %a, i32 %i.011.epil
601 %14 = load i8, i8* %arrayidx.epil, align 1
602 %conv.epil = zext i8 %14 to i32
603 %arrayidx1.epil = getelementptr inbounds i8, i8* %b, i32 %i.011.epil
604 %15 = load i8, i8* %arrayidx1.epil, align 1
605 %conv2.epil = zext i8 %15 to i32
606 %mul.epil = mul nuw nsw i32 %conv2.epil, %conv.epil
607 %add.epil = add nuw nsw i32 %mul.epil, %conv3
608 %arrayidx4.epil = getelementptr inbounds i32, i32* %res, i32 %i.011.epil
609 store i32 %add.epil, i32* %arrayidx4.epil, align 4
610 %inc.epil = add nuw i32 %i.011.epil, 1
611 %epil.iter.sub = add i32 %epil.iter, -1
612 %epil.iter.cmp = icmp eq i32 %epil.iter.sub, 0
613 br i1 %epil.iter.cmp, label %for.cond.cleanup, label %for.body.epil
615 for.cond.cleanup: ; preds = %vector.body, %for.cond.cleanup.loopexit.unr-lcssa, %for.body.epil, %entry
; Scalar main loop, unrolled 4x (aliasing path); 'or' computes i+1/i+2/i+3
; since i.011 is always a multiple of 4 here.
618 for.body: ; preds = %for.body, %for.body.preheader.new
619 %i.011 = phi i32 [ 0, %for.body.preheader.new ], [ %inc.3, %for.body ]
620 %niter = phi i32 [ %unroll_iter, %for.body.preheader.new ], [ %niter.nsub.3, %for.body ]
621 %arrayidx = getelementptr inbounds i8, i8* %a, i32 %i.011
622 %16 = load i8, i8* %arrayidx, align 1
623 %conv = zext i8 %16 to i32
624 %arrayidx1 = getelementptr inbounds i8, i8* %b, i32 %i.011
625 %17 = load i8, i8* %arrayidx1, align 1
626 %conv2 = zext i8 %17 to i32
627 %mul = mul nuw nsw i32 %conv2, %conv
628 %add = add nuw nsw i32 %mul, %conv3
629 %arrayidx4 = getelementptr inbounds i32, i32* %res, i32 %i.011
630 store i32 %add, i32* %arrayidx4, align 4
631 %inc = or i32 %i.011, 1
632 %arrayidx.1 = getelementptr inbounds i8, i8* %a, i32 %inc
633 %18 = load i8, i8* %arrayidx.1, align 1
634 %conv.1 = zext i8 %18 to i32
635 %arrayidx1.1 = getelementptr inbounds i8, i8* %b, i32 %inc
636 %19 = load i8, i8* %arrayidx1.1, align 1
637 %conv2.1 = zext i8 %19 to i32
638 %mul.1 = mul nuw nsw i32 %conv2.1, %conv.1
639 %add.1 = add nuw nsw i32 %mul.1, %conv3
640 %arrayidx4.1 = getelementptr inbounds i32, i32* %res, i32 %inc
641 store i32 %add.1, i32* %arrayidx4.1, align 4
642 %inc.1 = or i32 %i.011, 2
643 %arrayidx.2 = getelementptr inbounds i8, i8* %a, i32 %inc.1
644 %20 = load i8, i8* %arrayidx.2, align 1
645 %conv.2 = zext i8 %20 to i32
646 %arrayidx1.2 = getelementptr inbounds i8, i8* %b, i32 %inc.1
647 %21 = load i8, i8* %arrayidx1.2, align 1
648 %conv2.2 = zext i8 %21 to i32
649 %mul.2 = mul nuw nsw i32 %conv2.2, %conv.2
650 %add.2 = add nuw nsw i32 %mul.2, %conv3
651 %arrayidx4.2 = getelementptr inbounds i32, i32* %res, i32 %inc.1
652 store i32 %add.2, i32* %arrayidx4.2, align 4
653 %inc.2 = or i32 %i.011, 3
654 %arrayidx.3 = getelementptr inbounds i8, i8* %a, i32 %inc.2
655 %22 = load i8, i8* %arrayidx.3, align 1
656 %conv.3 = zext i8 %22 to i32
657 %arrayidx1.3 = getelementptr inbounds i8, i8* %b, i32 %inc.2
658 %23 = load i8, i8* %arrayidx1.3, align 1
659 %conv2.3 = zext i8 %23 to i32
660 %mul.3 = mul nuw nsw i32 %conv2.3, %conv.3
661 %add.3 = add nuw nsw i32 %mul.3, %conv3
662 %arrayidx4.3 = getelementptr inbounds i32, i32* %res, i32 %inc.2
663 store i32 %add.3, i32* %arrayidx4.3, align 4
664 %inc.3 = add nuw i32 %i.011, 4
665 %niter.nsub.3 = add i32 %niter, -4
666 %niter.ncmp.3 = icmp eq i32 %niter.nsub.3, 0
667 br i1 %niter.ncmp.3, label %for.cond.cleanup.loopexit.unr-lcssa, label %for.body
; Elementwise kernel: res[i] = sext(a[i]) * sext(b[i]) + sext(c) for N i16
; elements, i32 stores. No nocapture aliasing hazard is checked at runtime
; here (unlike the char variant), so the IR is just the tail-folded masked
; vector loop; the assertions expect a vptt block predicating both
; halfword loads and a predicated vstrwt.32 store.
670 define arm_aapcs_vfpcc void @test_vec_mul_scalar_add_short(i16* nocapture readonly %a, i16* nocapture readonly %b, i16 signext %c, i32* nocapture %res, i32 %N) {
671 ; CHECK-LABEL: test_vec_mul_scalar_add_short:
672 ; CHECK: @ %bb.0: @ %entry
673 ; CHECK-NEXT: push {r4, lr}
674 ; CHECK-NEXT: ldr.w r12, [sp, #8]
675 ; CHECK-NEXT: cmp.w r12, #0
677 ; CHECK-NEXT: popeq {r4, pc}
678 ; CHECK-NEXT: add.w lr, r12, #3
679 ; CHECK-NEXT: movs r4, #1
680 ; CHECK-NEXT: bic lr, lr, #3
681 ; CHECK-NEXT: sub.w lr, lr, #4
682 ; CHECK-NEXT: add.w lr, r4, lr, lsr #2
683 ; CHECK-NEXT: sub.w r4, r12, #1
684 ; CHECK-NEXT: vdup.32 q0, r4
685 ; CHECK-NEXT: adr r4, .LCPI6_0
686 ; CHECK-NEXT: vldrw.u32 q1, [r4]
687 ; CHECK-NEXT: mov.w r12, #0
688 ; CHECK-NEXT: dls lr, lr
689 ; CHECK-NEXT: .LBB6_1: @ %vector.body
690 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
691 ; CHECK-NEXT: vadd.i32 q2, q1, r12
692 ; CHECK-NEXT: add.w r12, r12, #4
693 ; CHECK-NEXT: vptt.u32 cs, q0, q2
694 ; CHECK-NEXT: vldrht.s32 q2, [r0]
695 ; CHECK-NEXT: vldrht.s32 q3, [r1]
696 ; CHECK-NEXT: adds r0, #8
697 ; CHECK-NEXT: vmul.i32 q2, q3, q2
698 ; CHECK-NEXT: adds r1, #8
699 ; CHECK-NEXT: vadd.i32 q2, q2, r2
701 ; CHECK-NEXT: vstrwt.32 q2, [r3]
702 ; CHECK-NEXT: adds r3, #16
703 ; CHECK-NEXT: le lr, .LBB6_1
704 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup
705 ; CHECK-NEXT: pop {r4, pc}
706 ; CHECK-NEXT: .p2align 4
707 ; CHECK-NEXT: @ %bb.3:
708 ; CHECK-NEXT: .LCPI6_0:
709 ; CHECK-NEXT: .long 0 @ 0x0
710 ; CHECK-NEXT: .long 1 @ 0x1
711 ; CHECK-NEXT: .long 2 @ 0x2
712 ; CHECK-NEXT: .long 3 @ 0x3
; NOTE(review): extraction dropped some lines here (gaps at embedded
; numbers 676, 700, 713, and the 'entry:' label / 'ret void' / closing
; brace); restore from the upstream test.
; Entry block: nothing to do when N == 0.
714 %cmp10 = icmp eq i32 %N, 0
715 br i1 %cmp10, label %for.cond.cleanup, label %vector.ph
; Preheader: round trip count to a multiple of 4; splat N-1 and sext(c).
717 vector.ph: ; preds = %entry
718 %conv3 = sext i16 %c to i32
719 %n.rnd.up = add i32 %N, 3
720 %n.vec = and i32 %n.rnd.up, -4
721 %trip.count.minus.1 = add i32 %N, -1
722 %broadcast.splatinsert12 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
723 %broadcast.splat13 = shufflevector <4 x i32> %broadcast.splatinsert12, <4 x i32> undef, <4 x i32> zeroinitializer
724 %broadcast.splatinsert15 = insertelement <4 x i32> undef, i32 %conv3, i32 0
725 %broadcast.splat16 = shufflevector <4 x i32> %broadcast.splatinsert15, <4 x i32> undef, <4 x i32> zeroinitializer
726 br label %vector.body
; Tail-folded loop: two masked sign-extending i16 loads, multiply, add
; splat(c), masked i32 store; mask %1 derived from the induction vector.
728 vector.body: ; preds = %vector.body, %vector.ph
729 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
730 %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
731 %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
732 %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
733 %0 = getelementptr inbounds i16, i16* %a, i32 %index
734 %1 = icmp ule <4 x i32> %induction, %broadcast.splat13
735 %2 = bitcast i16* %0 to <4 x i16>*
736 %wide.masked.load = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %2, i32 2, <4 x i1> %1, <4 x i16> undef)
737 %3 = sext <4 x i16> %wide.masked.load to <4 x i32>
738 %4 = getelementptr inbounds i16, i16* %b, i32 %index
739 %5 = bitcast i16* %4 to <4 x i16>*
740 %wide.masked.load14 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %5, i32 2, <4 x i1> %1, <4 x i16> undef)
741 %6 = sext <4 x i16> %wide.masked.load14 to <4 x i32>
742 %7 = mul nsw <4 x i32> %6, %3
743 %8 = add nsw <4 x i32> %7, %broadcast.splat16
744 %9 = getelementptr inbounds i32, i32* %res, i32 %index
745 %10 = bitcast i32* %9 to <4 x i32>*
746 call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %8, <4 x i32>* %10, i32 4, <4 x i1> %1)
747 %index.next = add i32 %index, 4
748 %11 = icmp eq i32 %index.next, %n.vec
749 br i1 %11, label %for.cond.cleanup, label %vector.body
751 for.cond.cleanup: ; preds = %vector.body, %entry
; res[i] = (u32)a[i] * (u32)b[i] + c over zero-extended u8 elements.
; With a runtime alias conflict between res and a/b, the scalar 4x-unrolled
; loop (plus epilogue) runs; otherwise a masked MVE vector loop using
; vldrbt.u32/vstrwt.32 under a VPT block, driven by dls/le, is expected.
; CHECK lines are autogenerated by update_llc_test_checks.py — do not hand-edit.
755 define arm_aapcs_vfpcc void @test_vec_mul_scalar_add_uchar(i8* nocapture readonly %a, i8* nocapture readonly %b, i8 zeroext %c, i32* nocapture %res, i32 %N) {
756 ; CHECK-LABEL: test_vec_mul_scalar_add_uchar:
757 ; CHECK: @ %bb.0: @ %entry
758 ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, lr}
759 ; CHECK-NEXT: ldr r7, [sp, #28]
760 ; CHECK-NEXT: cmp r7, #0
761 ; CHECK-NEXT: beq.w .LBB7_12
762 ; CHECK-NEXT: @ %bb.1: @ %for.body.lr.ph
763 ; CHECK-NEXT: add.w r4, r3, r7, lsl #2
764 ; CHECK-NEXT: adds r5, r1, r7
765 ; CHECK-NEXT: cmp r4, r1
766 ; CHECK-NEXT: add.w r6, r0, r7
767 ; CHECK-NEXT: cset r12, hi
768 ; CHECK-NEXT: cmp r5, r3
769 ; CHECK-NEXT: cset r5, hi
770 ; CHECK-NEXT: cmp r4, r0
771 ; CHECK-NEXT: cset r4, hi
772 ; CHECK-NEXT: cmp r6, r3
773 ; CHECK-NEXT: cset r6, hi
774 ; CHECK-NEXT: ands r6, r4
775 ; CHECK-NEXT: lsls r6, r6, #31
777 ; CHECK-NEXT: andeq.w r6, r5, r12
778 ; CHECK-NEXT: lslseq.w r6, r6, #31
779 ; CHECK-NEXT: beq .LBB7_4
780 ; CHECK-NEXT: @ %bb.2: @ %for.body.preheader
781 ; CHECK-NEXT: subs r6, r7, #1
782 ; CHECK-NEXT: and lr, r7, #3
783 ; CHECK-NEXT: cmp r6, #3
784 ; CHECK-NEXT: bhs .LBB7_6
785 ; CHECK-NEXT: @ %bb.3:
786 ; CHECK-NEXT: movs r7, #0
787 ; CHECK-NEXT: b .LBB7_9
788 ; CHECK-NEXT: .LBB7_4: @ %vector.ph
789 ; CHECK-NEXT: adds r6, r7, #3
790 ; CHECK-NEXT: movs r5, #1
791 ; CHECK-NEXT: bic r6, r6, #3
792 ; CHECK-NEXT: subs r7, #1
793 ; CHECK-NEXT: subs r6, #4
794 ; CHECK-NEXT: vdup.32 q0, r7
795 ; CHECK-NEXT: movs r7, #0
796 ; CHECK-NEXT: add.w lr, r5, r6, lsr #2
797 ; CHECK-NEXT: adr r6, .LCPI7_0
798 ; CHECK-NEXT: vldrw.u32 q1, [r6]
799 ; CHECK-NEXT: dls lr, lr
800 ; CHECK-NEXT: .LBB7_5: @ %vector.body
801 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
802 ; CHECK-NEXT: vadd.i32 q2, q1, r7
803 ; CHECK-NEXT: adds r4, r0, r7
804 ; CHECK-NEXT: vpt.u32 cs, q0, q2
805 ; CHECK-NEXT: vldrbt.u32 q2, [r4]
806 ; CHECK-NEXT: adds r4, r1, r7
808 ; CHECK-NEXT: vldrbt.u32 q3, [r4]
809 ; CHECK-NEXT: vmul.i32 q2, q3, q2
810 ; CHECK-NEXT: vadd.i32 q2, q2, r2
812 ; CHECK-NEXT: vstrwt.32 q2, [r3]
813 ; CHECK-NEXT: adds r3, #16
814 ; CHECK-NEXT: adds r7, #4
815 ; CHECK-NEXT: le lr, .LBB7_5
816 ; CHECK-NEXT: b .LBB7_12
817 ; CHECK-NEXT: .LBB7_6: @ %for.body.preheader.new
818 ; CHECK-NEXT: sub.w r12, lr, r7
819 ; CHECK-NEXT: subs r4, r1, #3
820 ; CHECK-NEXT: subs r5, r0, #3
821 ; CHECK-NEXT: sub.w r7, r3, #16
822 ; CHECK-NEXT: mov.w r9, #0
823 ; CHECK-NEXT: .LBB7_7: @ %for.body
824 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
825 ; CHECK-NEXT: ldrb.w r8, [r5, #3]
826 ; CHECK-NEXT: sub.w r9, r9, #4
827 ; CHECK-NEXT: ldrb r6, [r4, #3]
828 ; CHECK-NEXT: cmp r12, r9
829 ; CHECK-NEXT: smlabb r6, r6, r8, r2
830 ; CHECK-NEXT: str r6, [r7, #16]!
831 ; CHECK-NEXT: ldrb r8, [r5, #4]!
832 ; CHECK-NEXT: ldrb r6, [r4, #4]!
833 ; CHECK-NEXT: smlabb r6, r6, r8, r2
834 ; CHECK-NEXT: str r6, [r7, #4]
835 ; CHECK-NEXT: ldrb.w r8, [r5, #1]
836 ; CHECK-NEXT: ldrb r6, [r4, #1]
837 ; CHECK-NEXT: smlabb r6, r6, r8, r2
838 ; CHECK-NEXT: str r6, [r7, #8]
839 ; CHECK-NEXT: ldrb.w r8, [r5, #2]
840 ; CHECK-NEXT: ldrb r6, [r4, #2]
841 ; CHECK-NEXT: smlabb r6, r6, r8, r2
842 ; CHECK-NEXT: str r6, [r7, #12]
843 ; CHECK-NEXT: bne .LBB7_7
844 ; CHECK-NEXT: @ %bb.8: @ %for.cond.cleanup.loopexit.unr-lcssa.loopexit
845 ; CHECK-NEXT: rsb.w r7, r9, #0
846 ; CHECK-NEXT: .LBB7_9: @ %for.cond.cleanup.loopexit.unr-lcssa
847 ; CHECK-NEXT: wls lr, lr, .LBB7_12
848 ; CHECK-NEXT: @ %bb.10: @ %for.body.epil.preheader
849 ; CHECK-NEXT: subs r7, #1
850 ; CHECK-NEXT: add r0, r7
851 ; CHECK-NEXT: add r1, r7
852 ; CHECK-NEXT: add.w r3, r3, r7, lsl #2
853 ; CHECK-NEXT: .LBB7_11: @ %for.body.epil
854 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
855 ; CHECK-NEXT: ldrb r7, [r0, #1]!
856 ; CHECK-NEXT: ldrb r6, [r1, #1]!
857 ; CHECK-NEXT: smlabb r7, r6, r7, r2
858 ; CHECK-NEXT: str r7, [r3, #4]!
859 ; CHECK-NEXT: le lr, .LBB7_11
860 ; CHECK-NEXT: .LBB7_12: @ %for.cond.cleanup
861 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, pc}
862 ; CHECK-NEXT: .p2align 4
863 ; CHECK-NEXT: @ %bb.13:
864 ; CHECK-NEXT: .LCPI7_0:
865 ; CHECK-NEXT: .long 0 @ 0x0
866 ; CHECK-NEXT: .long 1 @ 0x1
867 ; CHECK-NEXT: .long 2 @ 0x2
868 ; CHECK-NEXT: .long 3 @ 0x3
; IR below: entry does the vectorizer's runtime alias checks; vector.body is
; the masked vector loop; for.body/.epil are the scalar fallback + remainder.
870 %res12 = bitcast i32* %res to i8*
871 %cmp10 = icmp eq i32 %N, 0
872 br i1 %cmp10, label %for.cond.cleanup, label %for.body.lr.ph
874 for.body.lr.ph: ; preds = %entry
875 %conv3 = zext i8 %c to i32
876 %scevgep = getelementptr i32, i32* %res, i32 %N
877 %scevgep13 = bitcast i32* %scevgep to i8*
878 %scevgep14 = getelementptr i8, i8* %a, i32 %N
879 %scevgep15 = getelementptr i8, i8* %b, i32 %N
880 %bound0 = icmp ugt i8* %scevgep14, %res12
881 %bound1 = icmp ugt i8* %scevgep13, %a
882 %found.conflict = and i1 %bound0, %bound1
883 %bound016 = icmp ugt i8* %scevgep15, %res12
884 %bound117 = icmp ugt i8* %scevgep13, %b
885 %found.conflict18 = and i1 %bound016, %bound117
886 %conflict.rdx = or i1 %found.conflict, %found.conflict18
887 br i1 %conflict.rdx, label %for.body.preheader, label %vector.ph
889 for.body.preheader: ; preds = %for.body.lr.ph
891 %xtraiter = and i32 %N, 3
892 %1 = icmp ult i32 %0, 3
893 br i1 %1, label %for.cond.cleanup.loopexit.unr-lcssa, label %for.body.preheader.new
895 for.body.preheader.new: ; preds = %for.body.preheader
896 %unroll_iter = sub i32 %N, %xtraiter
899 vector.ph: ; preds = %for.body.lr.ph
900 %n.rnd.up = add i32 %N, 3
901 %n.vec = and i32 %n.rnd.up, -4
902 %trip.count.minus.1 = add i32 %N, -1
903 %broadcast.splatinsert19 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
904 %broadcast.splat20 = shufflevector <4 x i32> %broadcast.splatinsert19, <4 x i32> undef, <4 x i32> zeroinitializer
905 %broadcast.splatinsert22 = insertelement <4 x i32> undef, i32 %conv3, i32 0
906 %broadcast.splat23 = shufflevector <4 x i32> %broadcast.splatinsert22, <4 x i32> undef, <4 x i32> zeroinitializer
907 br label %vector.body
909 vector.body: ; preds = %vector.body, %vector.ph
910 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
911 %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
912 %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
913 %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
914 %2 = getelementptr inbounds i8, i8* %a, i32 %index
915 %3 = icmp ule <4 x i32> %induction, %broadcast.splat20
916 %4 = bitcast i8* %2 to <4 x i8>*
917 %wide.masked.load = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %4, i32 1, <4 x i1> %3, <4 x i8> undef)
918 %5 = zext <4 x i8> %wide.masked.load to <4 x i32>
919 %6 = getelementptr inbounds i8, i8* %b, i32 %index
920 %7 = bitcast i8* %6 to <4 x i8>*
921 %wide.masked.load21 = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %7, i32 1, <4 x i1> %3, <4 x i8> undef)
922 %8 = zext <4 x i8> %wide.masked.load21 to <4 x i32>
923 %9 = mul nuw nsw <4 x i32> %8, %5
924 %10 = add nuw nsw <4 x i32> %9, %broadcast.splat23
925 %11 = getelementptr inbounds i32, i32* %res, i32 %index
926 %12 = bitcast i32* %11 to <4 x i32>*
927 call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %10, <4 x i32>* %12, i32 4, <4 x i1> %3)
928 %index.next = add i32 %index, 4
929 %13 = icmp eq i32 %index.next, %n.vec
930 br i1 %13, label %for.cond.cleanup, label %vector.body
932 for.cond.cleanup.loopexit.unr-lcssa: ; preds = %for.body, %for.body.preheader
933 %i.011.unr = phi i32 [ 0, %for.body.preheader ], [ %inc.3, %for.body ]
934 %lcmp.mod = icmp eq i32 %xtraiter, 0
935 br i1 %lcmp.mod, label %for.cond.cleanup, label %for.body.epil
937 for.body.epil: ; preds = %for.cond.cleanup.loopexit.unr-lcssa, %for.body.epil
938 %i.011.epil = phi i32 [ %inc.epil, %for.body.epil ], [ %i.011.unr, %for.cond.cleanup.loopexit.unr-lcssa ]
939 %epil.iter = phi i32 [ %epil.iter.sub, %for.body.epil ], [ %xtraiter, %for.cond.cleanup.loopexit.unr-lcssa ]
940 %arrayidx.epil = getelementptr inbounds i8, i8* %a, i32 %i.011.epil
941 %14 = load i8, i8* %arrayidx.epil, align 1
942 %conv.epil = zext i8 %14 to i32
943 %arrayidx1.epil = getelementptr inbounds i8, i8* %b, i32 %i.011.epil
944 %15 = load i8, i8* %arrayidx1.epil, align 1
945 %conv2.epil = zext i8 %15 to i32
946 %mul.epil = mul nuw nsw i32 %conv2.epil, %conv.epil
947 %add.epil = add nuw nsw i32 %mul.epil, %conv3
948 %arrayidx4.epil = getelementptr inbounds i32, i32* %res, i32 %i.011.epil
949 store i32 %add.epil, i32* %arrayidx4.epil, align 4
950 %inc.epil = add nuw i32 %i.011.epil, 1
951 %epil.iter.sub = add i32 %epil.iter, -1
952 %epil.iter.cmp = icmp eq i32 %epil.iter.sub, 0
953 br i1 %epil.iter.cmp, label %for.cond.cleanup, label %for.body.epil
955 for.cond.cleanup: ; preds = %vector.body, %for.cond.cleanup.loopexit.unr-lcssa, %for.body.epil, %entry
958 for.body: ; preds = %for.body, %for.body.preheader.new
959 %i.011 = phi i32 [ 0, %for.body.preheader.new ], [ %inc.3, %for.body ]
960 %niter = phi i32 [ %unroll_iter, %for.body.preheader.new ], [ %niter.nsub.3, %for.body ]
961 %arrayidx = getelementptr inbounds i8, i8* %a, i32 %i.011
962 %16 = load i8, i8* %arrayidx, align 1
963 %conv = zext i8 %16 to i32
964 %arrayidx1 = getelementptr inbounds i8, i8* %b, i32 %i.011
965 %17 = load i8, i8* %arrayidx1, align 1
966 %conv2 = zext i8 %17 to i32
967 %mul = mul nuw nsw i32 %conv2, %conv
968 %add = add nuw nsw i32 %mul, %conv3
969 %arrayidx4 = getelementptr inbounds i32, i32* %res, i32 %i.011
970 store i32 %add, i32* %arrayidx4, align 4
971 %inc = or i32 %i.011, 1
972 %arrayidx.1 = getelementptr inbounds i8, i8* %a, i32 %inc
973 %18 = load i8, i8* %arrayidx.1, align 1
974 %conv.1 = zext i8 %18 to i32
975 %arrayidx1.1 = getelementptr inbounds i8, i8* %b, i32 %inc
976 %19 = load i8, i8* %arrayidx1.1, align 1
977 %conv2.1 = zext i8 %19 to i32
978 %mul.1 = mul nuw nsw i32 %conv2.1, %conv.1
979 %add.1 = add nuw nsw i32 %mul.1, %conv3
980 %arrayidx4.1 = getelementptr inbounds i32, i32* %res, i32 %inc
981 store i32 %add.1, i32* %arrayidx4.1, align 4
982 %inc.1 = or i32 %i.011, 2
983 %arrayidx.2 = getelementptr inbounds i8, i8* %a, i32 %inc.1
984 %20 = load i8, i8* %arrayidx.2, align 1
985 %conv.2 = zext i8 %20 to i32
986 %arrayidx1.2 = getelementptr inbounds i8, i8* %b, i32 %inc.1
987 %21 = load i8, i8* %arrayidx1.2, align 1
988 %conv2.2 = zext i8 %21 to i32
989 %mul.2 = mul nuw nsw i32 %conv2.2, %conv.2
990 %add.2 = add nuw nsw i32 %mul.2, %conv3
991 %arrayidx4.2 = getelementptr inbounds i32, i32* %res, i32 %inc.1
992 store i32 %add.2, i32* %arrayidx4.2, align 4
993 %inc.2 = or i32 %i.011, 3
994 %arrayidx.3 = getelementptr inbounds i8, i8* %a, i32 %inc.2
995 %22 = load i8, i8* %arrayidx.3, align 1
996 %conv.3 = zext i8 %22 to i32
997 %arrayidx1.3 = getelementptr inbounds i8, i8* %b, i32 %inc.2
998 %23 = load i8, i8* %arrayidx1.3, align 1
999 %conv2.3 = zext i8 %23 to i32
1000 %mul.3 = mul nuw nsw i32 %conv2.3, %conv.3
1001 %add.3 = add nuw nsw i32 %mul.3, %conv3
1002 %arrayidx4.3 = getelementptr inbounds i32, i32* %res, i32 %inc.2
1003 store i32 %add.3, i32* %arrayidx4.3, align 4
1004 %inc.3 = add nuw i32 %i.011, 4
1005 %niter.nsub.3 = add i32 %niter, -4
1006 %niter.ncmp.3 = icmp eq i32 %niter.nsub.3, 0
1007 br i1 %niter.ncmp.3, label %for.cond.cleanup.loopexit.unr-lcssa, label %for.body
; res[i] = (u32)a[i] * (u32)b[i] + (i32)c over zero-extended u16 elements
; (c itself is sign-extended).  No alias checks here, so only the masked MVE
; vector loop is generated: vldrht.u32 pair + vstrwt.32 inside a VPTT block,
; with the trip count managed by dls/le.
; CHECK lines are autogenerated by update_llc_test_checks.py — do not hand-edit.
1010 define arm_aapcs_vfpcc void @test_vec_mul_scalar_add_ushort(i16* nocapture readonly %a, i16* nocapture readonly %b, i16 signext %c, i32* nocapture %res, i32 %N) {
1011 ; CHECK-LABEL: test_vec_mul_scalar_add_ushort:
1012 ; CHECK: @ %bb.0: @ %entry
1013 ; CHECK-NEXT: push {r4, lr}
1014 ; CHECK-NEXT: ldr.w r12, [sp, #8]
1015 ; CHECK-NEXT: cmp.w r12, #0
1017 ; CHECK-NEXT: popeq {r4, pc}
1018 ; CHECK-NEXT: add.w lr, r12, #3
1019 ; CHECK-NEXT: movs r4, #1
1020 ; CHECK-NEXT: bic lr, lr, #3
1021 ; CHECK-NEXT: sub.w lr, lr, #4
1022 ; CHECK-NEXT: add.w lr, r4, lr, lsr #2
1023 ; CHECK-NEXT: sub.w r4, r12, #1
1024 ; CHECK-NEXT: vdup.32 q0, r4
1025 ; CHECK-NEXT: adr r4, .LCPI8_0
1026 ; CHECK-NEXT: vldrw.u32 q1, [r4]
1027 ; CHECK-NEXT: mov.w r12, #0
1028 ; CHECK-NEXT: dls lr, lr
1029 ; CHECK-NEXT: .LBB8_1: @ %vector.body
1030 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
1031 ; CHECK-NEXT: vadd.i32 q2, q1, r12
1032 ; CHECK-NEXT: add.w r12, r12, #4
1033 ; CHECK-NEXT: vptt.u32 cs, q0, q2
1034 ; CHECK-NEXT: vldrht.u32 q2, [r0]
1035 ; CHECK-NEXT: vldrht.u32 q3, [r1]
1036 ; CHECK-NEXT: adds r0, #8
1037 ; CHECK-NEXT: vmul.i32 q2, q3, q2
1038 ; CHECK-NEXT: adds r1, #8
1039 ; CHECK-NEXT: vadd.i32 q2, q2, r2
1041 ; CHECK-NEXT: vstrwt.32 q2, [r3]
1042 ; CHECK-NEXT: adds r3, #16
1043 ; CHECK-NEXT: le lr, .LBB8_1
1044 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup
1045 ; CHECK-NEXT: pop {r4, pc}
1046 ; CHECK-NEXT: .p2align 4
1047 ; CHECK-NEXT: @ %bb.3:
1048 ; CHECK-NEXT: .LCPI8_0:
1049 ; CHECK-NEXT: .long 0 @ 0x0
1050 ; CHECK-NEXT: .long 1 @ 0x1
1051 ; CHECK-NEXT: .long 2 @ 0x2
1052 ; CHECK-NEXT: .long 3 @ 0x3
; IR below: single masked vector loop, lanes predicated on
; induction <= trip.count.minus.1 (the splat in %broadcast.splat13).
1054 %cmp10 = icmp eq i32 %N, 0
1055 br i1 %cmp10, label %for.cond.cleanup, label %vector.ph
1057 vector.ph: ; preds = %entry
1058 %conv3 = sext i16 %c to i32
1059 %n.rnd.up = add i32 %N, 3
1060 %n.vec = and i32 %n.rnd.up, -4
1061 %trip.count.minus.1 = add i32 %N, -1
1062 %broadcast.splatinsert12 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
1063 %broadcast.splat13 = shufflevector <4 x i32> %broadcast.splatinsert12, <4 x i32> undef, <4 x i32> zeroinitializer
1064 %broadcast.splatinsert15 = insertelement <4 x i32> undef, i32 %conv3, i32 0
1065 %broadcast.splat16 = shufflevector <4 x i32> %broadcast.splatinsert15, <4 x i32> undef, <4 x i32> zeroinitializer
1066 br label %vector.body
1068 vector.body: ; preds = %vector.body, %vector.ph
1069 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
1070 %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
1071 %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
1072 %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
1073 %0 = getelementptr inbounds i16, i16* %a, i32 %index
1074 %1 = icmp ule <4 x i32> %induction, %broadcast.splat13
1075 %2 = bitcast i16* %0 to <4 x i16>*
1076 %wide.masked.load = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %2, i32 2, <4 x i1> %1, <4 x i16> undef)
1077 %3 = zext <4 x i16> %wide.masked.load to <4 x i32>
1078 %4 = getelementptr inbounds i16, i16* %b, i32 %index
1079 %5 = bitcast i16* %4 to <4 x i16>*
1080 %wide.masked.load14 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %5, i32 2, <4 x i1> %1, <4 x i16> undef)
1081 %6 = zext <4 x i16> %wide.masked.load14 to <4 x i32>
1082 %7 = mul nuw nsw <4 x i32> %6, %3
1083 %8 = add nsw <4 x i32> %7, %broadcast.splat16
1084 %9 = getelementptr inbounds i32, i32* %res, i32 %index
1085 %10 = bitcast i32* %9 to <4 x i32>*
1086 call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %8, <4 x i32>* %10, i32 4, <4 x i1> %1)
1087 %index.next = add i32 %index, 4
1088 %11 = icmp eq i32 %index.next, %n.vec
1089 br i1 %11, label %for.cond.cleanup, label %vector.body
1091 for.cond.cleanup: ; preds = %vector.body, %entry
; res[i] = a[i] * b[i] + c over i32 elements.  Like the uchar variant this
; has runtime alias checks, but here the vector loop is tail-predicated with
; vctp.32 (element count in r12 decremented by 4 each iteration) instead of
; an explicit induction-vs-trip-count vector compare.
; CHECK lines are autogenerated by update_llc_test_checks.py — do not hand-edit.
1095 define arm_aapcs_vfpcc void @test_vec_mul_scalar_add_int(i32* nocapture readonly %a, i32* nocapture readonly %b, i32 %c, i32* nocapture %res, i32 %N) {
1096 ; CHECK-LABEL: test_vec_mul_scalar_add_int:
1097 ; CHECK: @ %bb.0: @ %entry
1098 ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, lr}
1099 ; CHECK-NEXT: ldr.w r12, [sp, #32]
1100 ; CHECK-NEXT: cmp.w r12, #0
1101 ; CHECK-NEXT: beq.w .LBB9_11
1102 ; CHECK-NEXT: @ %bb.1: @ %vector.memcheck
1103 ; CHECK-NEXT: add.w r4, r3, r12, lsl #2
1104 ; CHECK-NEXT: add.w r5, r1, r12, lsl #2
1105 ; CHECK-NEXT: cmp r4, r1
1106 ; CHECK-NEXT: add.w r6, r0, r12, lsl #2
1107 ; CHECK-NEXT: cset r7, hi
1108 ; CHECK-NEXT: cmp r5, r3
1109 ; CHECK-NEXT: cset r5, hi
1110 ; CHECK-NEXT: cmp r4, r0
1111 ; CHECK-NEXT: cset r4, hi
1112 ; CHECK-NEXT: cmp r6, r3
1113 ; CHECK-NEXT: cset r6, hi
1114 ; CHECK-NEXT: mov.w lr, #1
1115 ; CHECK-NEXT: ands r6, r4
1116 ; CHECK-NEXT: lsls r6, r6, #31
1117 ; CHECK-NEXT: itt eq
1118 ; CHECK-NEXT: andeq.w r4, r5, r7
1119 ; CHECK-NEXT: lslseq.w r4, r4, #31
1120 ; CHECK-NEXT: beq .LBB9_4
1121 ; CHECK-NEXT: @ %bb.2: @ %for.body.preheader
1122 ; CHECK-NEXT: sub.w r4, r12, #1
1123 ; CHECK-NEXT: and r5, r12, #3
1124 ; CHECK-NEXT: cmp r4, #3
1125 ; CHECK-NEXT: bhs .LBB9_6
1126 ; CHECK-NEXT: @ %bb.3:
1127 ; CHECK-NEXT: mov r10, r5
1128 ; CHECK-NEXT: mov.w r12, #0
1129 ; CHECK-NEXT: b .LBB9_8
1130 ; CHECK-NEXT: .LBB9_4: @ %vector.ph
1131 ; CHECK-NEXT: add.w r4, r12, #3
1132 ; CHECK-NEXT: bic r4, r4, #3
1133 ; CHECK-NEXT: subs r4, #4
1134 ; CHECK-NEXT: add.w lr, lr, r4, lsr #2
1135 ; CHECK-NEXT: dls lr, lr
1136 ; CHECK-NEXT: .LBB9_5: @ %vector.body
1137 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
1138 ; CHECK-NEXT: vctp.32 r12
1140 ; CHECK-NEXT: vldrwt.u32 q0, [r0]
1141 ; CHECK-NEXT: vldrwt.u32 q1, [r1]
1142 ; CHECK-NEXT: vmul.i32 q0, q1, q0
1143 ; CHECK-NEXT: adds r0, #16
1144 ; CHECK-NEXT: vadd.i32 q0, q0, r2
1146 ; CHECK-NEXT: vstrwt.32 q0, [r3]
1147 ; CHECK-NEXT: adds r1, #16
1148 ; CHECK-NEXT: adds r3, #16
1149 ; CHECK-NEXT: sub.w r12, r12, #4
1150 ; CHECK-NEXT: le lr, .LBB9_5
1151 ; CHECK-NEXT: b .LBB9_11
1152 ; CHECK-NEXT: .LBB9_6: @ %for.body.preheader.new
1153 ; CHECK-NEXT: sub.w r7, r12, r5
1154 ; CHECK-NEXT: mov r10, r5
1155 ; CHECK-NEXT: subs r7, #4
1156 ; CHECK-NEXT: movs r4, #0
1157 ; CHECK-NEXT: mov.w r12, #0
1158 ; CHECK-NEXT: add.w lr, lr, r7, lsr #2
1159 ; CHECK-NEXT: dls lr, lr
1160 ; CHECK-NEXT: .LBB9_7: @ %for.body
1161 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
1162 ; CHECK-NEXT: ldr r5, [r0, r4]
1163 ; CHECK-NEXT: add.w r9, r0, r4
1164 ; CHECK-NEXT: ldr r6, [r1, r4]
1165 ; CHECK-NEXT: adds r7, r1, r4
1166 ; CHECK-NEXT: add.w r12, r12, #4
1167 ; CHECK-NEXT: mla r5, r6, r5, r2
1168 ; CHECK-NEXT: str r5, [r3, r4]
1169 ; CHECK-NEXT: ldr.w r8, [r9, #4]
1170 ; CHECK-NEXT: ldr r6, [r7, #4]
1171 ; CHECK-NEXT: mla r8, r6, r8, r2
1172 ; CHECK-NEXT: adds r6, r3, r4
1173 ; CHECK-NEXT: adds r4, #16
1174 ; CHECK-NEXT: str.w r8, [r6, #4]
1175 ; CHECK-NEXT: ldr.w r8, [r9, #8]
1176 ; CHECK-NEXT: ldr r5, [r7, #8]
1177 ; CHECK-NEXT: mla r5, r5, r8, r2
1178 ; CHECK-NEXT: str r5, [r6, #8]
1179 ; CHECK-NEXT: ldr.w r5, [r9, #12]
1180 ; CHECK-NEXT: ldr r7, [r7, #12]
1181 ; CHECK-NEXT: mla r5, r7, r5, r2
1182 ; CHECK-NEXT: str r5, [r6, #12]
1183 ; CHECK-NEXT: le lr, .LBB9_7
1184 ; CHECK-NEXT: .LBB9_8: @ %for.cond.cleanup.loopexit.unr-lcssa
1185 ; CHECK-NEXT: wls lr, r10, .LBB9_11
1186 ; CHECK-NEXT: @ %bb.9: @ %for.body.epil.preheader
1187 ; CHECK-NEXT: mvn r7, #3
1188 ; CHECK-NEXT: mov lr, r10
1189 ; CHECK-NEXT: add.w r7, r7, r12, lsl #2
1190 ; CHECK-NEXT: add r0, r7
1191 ; CHECK-NEXT: add r1, r7
1192 ; CHECK-NEXT: add r3, r7
1193 ; CHECK-NEXT: .LBB9_10: @ %for.body.epil
1194 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
1195 ; CHECK-NEXT: ldr r7, [r0, #4]!
1196 ; CHECK-NEXT: ldr r6, [r1, #4]!
1197 ; CHECK-NEXT: mla r7, r6, r7, r2
1198 ; CHECK-NEXT: str r7, [r3, #4]!
1199 ; CHECK-NEXT: le lr, .LBB9_10
1200 ; CHECK-NEXT: .LBB9_11: @ %for.cond.cleanup
1201 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, pc}
; IR below: vector.memcheck guards the masked vector loop; the scalar
; 4x-unrolled for.body plus for.body.epil remainder handles the alias case.
1203 %cmp8 = icmp eq i32 %N, 0
1204 br i1 %cmp8, label %for.cond.cleanup, label %vector.memcheck
1206 vector.memcheck: ; preds = %entry
1207 %scevgep = getelementptr i32, i32* %res, i32 %N
1208 %scevgep13 = getelementptr i32, i32* %a, i32 %N
1209 %scevgep16 = getelementptr i32, i32* %b, i32 %N
1210 %bound0 = icmp ugt i32* %scevgep13, %res
1211 %bound1 = icmp ugt i32* %scevgep, %a
1212 %found.conflict = and i1 %bound0, %bound1
1213 %bound018 = icmp ugt i32* %scevgep16, %res
1214 %bound119 = icmp ugt i32* %scevgep, %b
1215 %found.conflict20 = and i1 %bound018, %bound119
1216 %conflict.rdx = or i1 %found.conflict, %found.conflict20
1217 br i1 %conflict.rdx, label %for.body.preheader, label %vector.ph
1219 for.body.preheader: ; preds = %vector.memcheck
1221 %xtraiter = and i32 %N, 3
1222 %1 = icmp ult i32 %0, 3
1223 br i1 %1, label %for.cond.cleanup.loopexit.unr-lcssa, label %for.body.preheader.new
1225 for.body.preheader.new: ; preds = %for.body.preheader
1226 %unroll_iter = sub i32 %N, %xtraiter
1229 vector.ph: ; preds = %vector.memcheck
1230 %n.rnd.up = add i32 %N, 3
1231 %n.vec = and i32 %n.rnd.up, -4
1232 %trip.count.minus.1 = add i32 %N, -1
1233 %broadcast.splatinsert21 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
1234 %broadcast.splat22 = shufflevector <4 x i32> %broadcast.splatinsert21, <4 x i32> undef, <4 x i32> zeroinitializer
1235 %broadcast.splatinsert24 = insertelement <4 x i32> undef, i32 %c, i32 0
1236 %broadcast.splat25 = shufflevector <4 x i32> %broadcast.splatinsert24, <4 x i32> undef, <4 x i32> zeroinitializer
1237 br label %vector.body
1239 vector.body: ; preds = %vector.body, %vector.ph
1240 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
1241 %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
1242 %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
1243 %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
1244 %2 = getelementptr inbounds i32, i32* %a, i32 %index
1245 %3 = icmp ule <4 x i32> %induction, %broadcast.splat22
1246 %4 = bitcast i32* %2 to <4 x i32>*
1247 %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %4, i32 4, <4 x i1> %3, <4 x i32> undef)
1248 %5 = getelementptr inbounds i32, i32* %b, i32 %index
1249 %6 = bitcast i32* %5 to <4 x i32>*
1250 %wide.masked.load23 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %6, i32 4, <4 x i1> %3, <4 x i32> undef)
1251 %7 = mul nsw <4 x i32> %wide.masked.load23, %wide.masked.load
1252 %8 = add nsw <4 x i32> %7, %broadcast.splat25
1253 %9 = getelementptr inbounds i32, i32* %res, i32 %index
1254 %10 = bitcast i32* %9 to <4 x i32>*
1255 call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %8, <4 x i32>* %10, i32 4, <4 x i1> %3)
1256 %index.next = add i32 %index, 4
1257 %11 = icmp eq i32 %index.next, %n.vec
1258 br i1 %11, label %for.cond.cleanup, label %vector.body
1260 for.cond.cleanup.loopexit.unr-lcssa: ; preds = %for.body, %for.body.preheader
1261 %i.09.unr = phi i32 [ 0, %for.body.preheader ], [ %inc.3, %for.body ]
1262 %lcmp.mod = icmp eq i32 %xtraiter, 0
1263 br i1 %lcmp.mod, label %for.cond.cleanup, label %for.body.epil
1265 for.body.epil: ; preds = %for.cond.cleanup.loopexit.unr-lcssa, %for.body.epil
1266 %i.09.epil = phi i32 [ %inc.epil, %for.body.epil ], [ %i.09.unr, %for.cond.cleanup.loopexit.unr-lcssa ]
1267 %epil.iter = phi i32 [ %epil.iter.sub, %for.body.epil ], [ %xtraiter, %for.cond.cleanup.loopexit.unr-lcssa ]
1268 %arrayidx.epil = getelementptr inbounds i32, i32* %a, i32 %i.09.epil
1269 %12 = load i32, i32* %arrayidx.epil, align 4
1270 %arrayidx1.epil = getelementptr inbounds i32, i32* %b, i32 %i.09.epil
1271 %13 = load i32, i32* %arrayidx1.epil, align 4
1272 %mul.epil = mul nsw i32 %13, %12
1273 %add.epil = add nsw i32 %mul.epil, %c
1274 %arrayidx2.epil = getelementptr inbounds i32, i32* %res, i32 %i.09.epil
1275 store i32 %add.epil, i32* %arrayidx2.epil, align 4
1276 %inc.epil = add nuw i32 %i.09.epil, 1
1277 %epil.iter.sub = add i32 %epil.iter, -1
1278 %epil.iter.cmp = icmp eq i32 %epil.iter.sub, 0
1279 br i1 %epil.iter.cmp, label %for.cond.cleanup, label %for.body.epil
1281 for.cond.cleanup: ; preds = %vector.body, %for.cond.cleanup.loopexit.unr-lcssa, %for.body.epil, %entry
1284 for.body: ; preds = %for.body, %for.body.preheader.new
1285 %i.09 = phi i32 [ 0, %for.body.preheader.new ], [ %inc.3, %for.body ]
1286 %niter = phi i32 [ %unroll_iter, %for.body.preheader.new ], [ %niter.nsub.3, %for.body ]
1287 %arrayidx = getelementptr inbounds i32, i32* %a, i32 %i.09
1288 %14 = load i32, i32* %arrayidx, align 4
1289 %arrayidx1 = getelementptr inbounds i32, i32* %b, i32 %i.09
1290 %15 = load i32, i32* %arrayidx1, align 4
1291 %mul = mul nsw i32 %15, %14
1292 %add = add nsw i32 %mul, %c
1293 %arrayidx2 = getelementptr inbounds i32, i32* %res, i32 %i.09
1294 store i32 %add, i32* %arrayidx2, align 4
1295 %inc = or i32 %i.09, 1
1296 %arrayidx.1 = getelementptr inbounds i32, i32* %a, i32 %inc
1297 %16 = load i32, i32* %arrayidx.1, align 4
1298 %arrayidx1.1 = getelementptr inbounds i32, i32* %b, i32 %inc
1299 %17 = load i32, i32* %arrayidx1.1, align 4
1300 %mul.1 = mul nsw i32 %17, %16
1301 %add.1 = add nsw i32 %mul.1, %c
1302 %arrayidx2.1 = getelementptr inbounds i32, i32* %res, i32 %inc
1303 store i32 %add.1, i32* %arrayidx2.1, align 4
1304 %inc.1 = or i32 %i.09, 2
1305 %arrayidx.2 = getelementptr inbounds i32, i32* %a, i32 %inc.1
1306 %18 = load i32, i32* %arrayidx.2, align 4
1307 %arrayidx1.2 = getelementptr inbounds i32, i32* %b, i32 %inc.1
1308 %19 = load i32, i32* %arrayidx1.2, align 4
1309 %mul.2 = mul nsw i32 %19, %18
1310 %add.2 = add nsw i32 %mul.2, %c
1311 %arrayidx2.2 = getelementptr inbounds i32, i32* %res, i32 %inc.1
1312 store i32 %add.2, i32* %arrayidx2.2, align 4
1313 %inc.2 = or i32 %i.09, 3
1314 %arrayidx.3 = getelementptr inbounds i32, i32* %a, i32 %inc.2
1315 %20 = load i32, i32* %arrayidx.3, align 4
1316 %arrayidx1.3 = getelementptr inbounds i32, i32* %b, i32 %inc.2
1317 %21 = load i32, i32* %arrayidx1.3, align 4
1318 %mul.3 = mul nsw i32 %21, %20
1319 %add.3 = add nsw i32 %mul.3, %c
1320 %arrayidx2.3 = getelementptr inbounds i32, i32* %res, i32 %inc.2
1321 store i32 %add.3, i32* %arrayidx2.3, align 4
1322 %inc.3 = add nuw i32 %i.09, 4
1323 %niter.nsub.3 = add i32 %niter, -4
1324 %niter.ncmp.3 = icmp eq i32 %niter.nsub.3, 0
1325 br i1 %niter.ncmp.3, label %for.cond.cleanup.loopexit.unr-lcssa, label %for.body
; Declarations of the target-independent intrinsics used by the test
; functions above: masked loads/stores (lowered to predicated MVE
; vldr*/vstr* instructions) and the vector add reduction.
1328 ; Function Attrs: argmemonly nounwind readonly willreturn
1329 declare <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>*, i32 immarg, <4 x i1>, <4 x i8>) #2
1331 ; Function Attrs: nounwind readnone willreturn
1332 declare i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32>) #3
1334 ; Function Attrs: argmemonly nounwind readonly willreturn
1335 declare <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>*, i32 immarg, <4 x i1>, <4 x i16>) #2
1337 ; Function Attrs: argmemonly nounwind readonly willreturn
1338 declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>) #2
1340 ; Function Attrs: argmemonly nounwind willreturn
1341 declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>) #4