; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp -verify-machineinstrs -tail-predication=enabled %s -o - | FileCheck %s
4 define arm_aapcs_vfpcc void @fmas1(float* nocapture readonly %x, float* nocapture readonly %y, float* noalias nocapture %z, float %a, i32 %n) {
6 ; CHECK: @ %bb.0: @ %entry
7 ; CHECK-NEXT: .save {r4, lr}
8 ; CHECK-NEXT: push {r4, lr}
9 ; CHECK-NEXT: cmp r3, #1
11 ; CHECK-NEXT: poplt {r4, pc}
12 ; CHECK-NEXT: .LBB0_1: @ %vector.ph
13 ; CHECK-NEXT: vmov r12, s0
14 ; CHECK-NEXT: dlstp.32 lr, r3
15 ; CHECK-NEXT: .LBB0_2: @ %vector.body
16 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
17 ; CHECK-NEXT: vldrw.u32 q0, [r1], #16
18 ; CHECK-NEXT: vldrw.u32 q1, [r0], #16
19 ; CHECK-NEXT: vfmas.f32 q1, q0, r12
20 ; CHECK-NEXT: vstrw.32 q1, [r2], #16
21 ; CHECK-NEXT: letp lr, .LBB0_2
22 ; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup
23 ; CHECK-NEXT: pop {r4, pc}
25 %cmp8 = icmp sgt i32 %n, 0
26 br i1 %cmp8, label %vector.ph, label %for.cond.cleanup
28 vector.ph: ; preds = %entry
29 %n.rnd.up = add i32 %n, 3
30 %n.vec = and i32 %n.rnd.up, -4
31 %broadcast.splatinsert13 = insertelement <4 x float> undef, float %a, i32 0
32 %broadcast.splat14 = shufflevector <4 x float> %broadcast.splatinsert13, <4 x float> undef, <4 x i32> zeroinitializer
35 vector.body: ; preds = %vector.body, %vector.ph
36 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
37 %0 = getelementptr inbounds float, float* %x, i32 %index
38 %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
39 %2 = bitcast float* %0 to <4 x float>*
40 %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %2, i32 4, <4 x i1> %1, <4 x float> undef)
41 %3 = getelementptr inbounds float, float* %y, i32 %index
42 %4 = bitcast float* %3 to <4 x float>*
43 %wide.masked.load12 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %4, i32 4, <4 x i1> %1, <4 x float> undef)
44 %5 = call fast <4 x float> @llvm.fma.v4f32(<4 x float> %wide.masked.load, <4 x float> %wide.masked.load12, <4 x float> %broadcast.splat14)
45 %6 = getelementptr inbounds float, float* %z, i32 %index
46 %7 = bitcast float* %6 to <4 x float>*
47 call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %5, <4 x float>* %7, i32 4, <4 x i1> %1)
48 %index.next = add i32 %index, 4
49 %8 = icmp eq i32 %index.next, %n.vec
50 br i1 %8, label %for.cond.cleanup, label %vector.body
52 for.cond.cleanup: ; preds = %vector.body, %entry
; Same as fmas1 but expressed as separate fmul+fadd (fast-math); should still
; be combined into the vfmas.f32 scalar-accumulate form.
define arm_aapcs_vfpcc void @fmas2(float* nocapture readonly %x, float* nocapture readonly %y, float* noalias nocapture %z, float %a, i32 %n) {
; CHECK-LABEL: fmas2:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, lr}
; CHECK-NEXT: push {r4, lr}
; CHECK-NEXT: cmp r3, #1
; CHECK-NEXT: it lt
; CHECK-NEXT: poplt {r4, pc}
; CHECK-NEXT: .LBB1_1: @ %vector.ph
; CHECK-NEXT: vmov r12, s0
; CHECK-NEXT: dlstp.32 lr, r3
; CHECK-NEXT: .LBB1_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q0, [r0], #16
; CHECK-NEXT: vldrw.u32 q1, [r1], #16
; CHECK-NEXT: vfmas.f32 q1, q0, r12
; CHECK-NEXT: vstrw.32 q1, [r2], #16
; CHECK-NEXT: letp lr, .LBB1_2
; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT: pop {r4, pc}
entry:
  %cmp8 = icmp sgt i32 %n, 0
  br i1 %cmp8, label %vector.ph, label %for.cond.cleanup

vector.ph:                                        ; preds = %entry
  %n.rnd.up = add i32 %n, 3
  %n.vec = and i32 %n.rnd.up, -4
  %broadcast.splatinsert13 = insertelement <4 x float> undef, float %a, i32 0
  %broadcast.splat14 = shufflevector <4 x float> %broadcast.splatinsert13, <4 x float> undef, <4 x i32> zeroinitializer
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds float, float* %x, i32 %index
  %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
  %2 = bitcast float* %0 to <4 x float>*
  %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %2, i32 4, <4 x i1> %1, <4 x float> undef)
  %3 = getelementptr inbounds float, float* %y, i32 %index
  %4 = bitcast float* %3 to <4 x float>*
  %wide.masked.load12 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %4, i32 4, <4 x i1> %1, <4 x float> undef)
  %5 = fmul fast <4 x float> %wide.masked.load12, %wide.masked.load
  %6 = fadd fast <4 x float> %5, %broadcast.splat14
  %7 = getelementptr inbounds float, float* %z, i32 %index
  %8 = bitcast float* %7 to <4 x float>*
  call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %6, <4 x float>* %8, i32 4, <4 x i1> %1)
  %index.next = add i32 %index, 4
  %9 = icmp eq i32 %index.next, %n.vec
  br i1 %9, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}
; z[i] = x[i]*a + y[i]: splat is a multiplicand of llvm.fma, so the expected
; selection is the vector-accumulating vfma.f32 q, q, r form.
define arm_aapcs_vfpcc void @fma1(float* nocapture readonly %x, float* nocapture readonly %y, float* noalias nocapture %z, float %a, i32 %n) {
; CHECK-LABEL: fma1:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, lr}
; CHECK-NEXT: push {r4, lr}
; CHECK-NEXT: cmp r3, #1
; CHECK-NEXT: it lt
; CHECK-NEXT: poplt {r4, pc}
; CHECK-NEXT: .LBB2_1: @ %vector.ph
; CHECK-NEXT: vmov r12, s0
; CHECK-NEXT: dlstp.32 lr, r3
; CHECK-NEXT: .LBB2_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q0, [r0], #16
; CHECK-NEXT: vldrw.u32 q1, [r1], #16
; CHECK-NEXT: vfma.f32 q1, q0, r12
; CHECK-NEXT: vstrw.32 q1, [r2], #16
; CHECK-NEXT: letp lr, .LBB2_2
; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT: pop {r4, pc}
entry:
  %cmp8 = icmp sgt i32 %n, 0
  br i1 %cmp8, label %vector.ph, label %for.cond.cleanup

vector.ph:                                        ; preds = %entry
  %n.rnd.up = add i32 %n, 3
  %n.vec = and i32 %n.rnd.up, -4
  %broadcast.splatinsert13 = insertelement <4 x float> undef, float %a, i32 0
  %broadcast.splat14 = shufflevector <4 x float> %broadcast.splatinsert13, <4 x float> undef, <4 x i32> zeroinitializer
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds float, float* %x, i32 %index
  %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
  %2 = bitcast float* %0 to <4 x float>*
  %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %2, i32 4, <4 x i1> %1, <4 x float> undef)
  %3 = getelementptr inbounds float, float* %y, i32 %index
  %4 = bitcast float* %3 to <4 x float>*
  %wide.masked.load12 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %4, i32 4, <4 x i1> %1, <4 x float> undef)
  %5 = call fast <4 x float> @llvm.fma.v4f32(<4 x float> %wide.masked.load, <4 x float> %broadcast.splat14, <4 x float> %wide.masked.load12)
  %6 = getelementptr inbounds float, float* %z, i32 %index
  %7 = bitcast float* %6 to <4 x float>*
  call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %5, <4 x float>* %7, i32 4, <4 x i1> %1)
  %index.next = add i32 %index, 4
  %8 = icmp eq i32 %index.next, %n.vec
  br i1 %8, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}
; Same as fma1 but expressed as fmul-by-splat followed by fadd; should still
; fold to vfma.f32 with the scalar in a GPR.
define arm_aapcs_vfpcc void @fma2(float* nocapture readonly %x, float* nocapture readonly %y, float* noalias nocapture %z, float %a, i32 %n) {
; CHECK-LABEL: fma2:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, lr}
; CHECK-NEXT: push {r4, lr}
; CHECK-NEXT: cmp r3, #1
; CHECK-NEXT: it lt
; CHECK-NEXT: poplt {r4, pc}
; CHECK-NEXT: .LBB3_1: @ %vector.ph
; CHECK-NEXT: vmov r12, s0
; CHECK-NEXT: dlstp.32 lr, r3
; CHECK-NEXT: .LBB3_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q0, [r0], #16
; CHECK-NEXT: vldrw.u32 q1, [r1], #16
; CHECK-NEXT: vfma.f32 q1, q0, r12
; CHECK-NEXT: vstrw.32 q1, [r2], #16
; CHECK-NEXT: letp lr, .LBB3_2
; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT: pop {r4, pc}
entry:
  %cmp8 = icmp sgt i32 %n, 0
  br i1 %cmp8, label %vector.ph, label %for.cond.cleanup

vector.ph:                                        ; preds = %entry
  %n.rnd.up = add i32 %n, 3
  %n.vec = and i32 %n.rnd.up, -4
  %broadcast.splatinsert12 = insertelement <4 x float> undef, float %a, i32 0
  %broadcast.splat13 = shufflevector <4 x float> %broadcast.splatinsert12, <4 x float> undef, <4 x i32> zeroinitializer
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds float, float* %x, i32 %index
  %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
  %2 = bitcast float* %0 to <4 x float>*
  %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %2, i32 4, <4 x i1> %1, <4 x float> undef)
  %3 = fmul fast <4 x float> %wide.masked.load, %broadcast.splat13
  %4 = getelementptr inbounds float, float* %y, i32 %index
  %5 = bitcast float* %4 to <4 x float>*
  %wide.masked.load14 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %5, i32 4, <4 x i1> %1, <4 x float> undef)
  %6 = fadd fast <4 x float> %3, %wide.masked.load14
  %7 = getelementptr inbounds float, float* %z, i32 %index
  %8 = bitcast float* %7 to <4 x float>*
  call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %6, <4 x float>* %8, i32 4, <4 x i1> %1)
  %index.next = add i32 %index, 4
  %9 = icmp eq i32 %index.next, %n.vec
  br i1 %9, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}
; z[i] = x[i]*y[i] - a: the fneg of the scalar is folded by flipping the sign
; bit in the GPR (eor with 0x80000000) and reusing vfmas.f32.
define arm_aapcs_vfpcc void @fmss1(float* nocapture readonly %x, float* nocapture readonly %y, float* noalias nocapture %z, float %a, i32 %n) {
; CHECK-LABEL: fmss1:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, lr}
; CHECK-NEXT: push {r4, lr}
; CHECK-NEXT: cmp r3, #1
; CHECK-NEXT: it lt
; CHECK-NEXT: poplt {r4, pc}
; CHECK-NEXT: .LBB4_1: @ %vector.ph
; CHECK-NEXT: vmov r12, s0
; CHECK-NEXT: eor r12, r12, #-2147483648
; CHECK-NEXT: dlstp.32 lr, r3
; CHECK-NEXT: .LBB4_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q0, [r1], #16
; CHECK-NEXT: vldrw.u32 q1, [r0], #16
; CHECK-NEXT: vfmas.f32 q1, q0, r12
; CHECK-NEXT: vstrw.32 q1, [r2], #16
; CHECK-NEXT: letp lr, .LBB4_2
; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT: pop {r4, pc}
entry:
  %cmp8 = icmp sgt i32 %n, 0
  br i1 %cmp8, label %vector.ph, label %for.cond.cleanup

vector.ph:                                        ; preds = %entry
  %fneg = fneg fast float %a
  %n.rnd.up = add i32 %n, 3
  %n.vec = and i32 %n.rnd.up, -4
  %broadcast.splatinsert13 = insertelement <4 x float> undef, float %fneg, i32 0
  %broadcast.splat14 = shufflevector <4 x float> %broadcast.splatinsert13, <4 x float> undef, <4 x i32> zeroinitializer
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds float, float* %x, i32 %index
  %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
  %2 = bitcast float* %0 to <4 x float>*
  %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %2, i32 4, <4 x i1> %1, <4 x float> undef)
  %3 = getelementptr inbounds float, float* %y, i32 %index
  %4 = bitcast float* %3 to <4 x float>*
  %wide.masked.load12 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %4, i32 4, <4 x i1> %1, <4 x float> undef)
  %5 = call fast <4 x float> @llvm.fma.v4f32(<4 x float> %wide.masked.load, <4 x float> %wide.masked.load12, <4 x float> %broadcast.splat14)
  %6 = getelementptr inbounds float, float* %z, i32 %index
  %7 = bitcast float* %6 to <4 x float>*
  call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %5, <4 x float>* %7, i32 4, <4 x i1> %1)
  %index.next = add i32 %index, 4
  %8 = icmp eq i32 %index.next, %n.vec
  br i1 %8, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}
; z[i] = x[i]*y[i] - a written as fmul then fsub of the splat; currently
; negates the splat vector in the preheader and uses the q-form vfma.f32.
define arm_aapcs_vfpcc void @fmss2(float* nocapture readonly %x, float* nocapture readonly %y, float* noalias nocapture %z, float %a, i32 %n) {
; CHECK-LABEL: fmss2:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, lr}
; CHECK-NEXT: push {r4, lr}
; CHECK-NEXT: cmp r3, #1
; CHECK-NEXT: it lt
; CHECK-NEXT: poplt {r4, pc}
; CHECK-NEXT: .LBB5_1: @ %vector.ph
; CHECK-NEXT: vmov r12, s0
; CHECK-NEXT: vdup.32 q0, r12
; CHECK-NEXT: vneg.f32 q0, q0
; CHECK-NEXT: dlstp.32 lr, r3
; CHECK-NEXT: .LBB5_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vmov q3, q0
; CHECK-NEXT: vldrw.u32 q1, [r0], #16
; CHECK-NEXT: vldrw.u32 q2, [r1], #16
; CHECK-NEXT: vfma.f32 q3, q2, q1
; CHECK-NEXT: vstrw.32 q3, [r2], #16
; CHECK-NEXT: letp lr, .LBB5_2
; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT: pop {r4, pc}
entry:
  %cmp8 = icmp sgt i32 %n, 0
  br i1 %cmp8, label %vector.ph, label %for.cond.cleanup

vector.ph:                                        ; preds = %entry
  %n.rnd.up = add i32 %n, 3
  %n.vec = and i32 %n.rnd.up, -4
  %broadcast.splatinsert13 = insertelement <4 x float> undef, float %a, i32 0
  %broadcast.splat14 = shufflevector <4 x float> %broadcast.splatinsert13, <4 x float> undef, <4 x i32> zeroinitializer
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds float, float* %x, i32 %index
  %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
  %2 = bitcast float* %0 to <4 x float>*
  %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %2, i32 4, <4 x i1> %1, <4 x float> undef)
  %3 = getelementptr inbounds float, float* %y, i32 %index
  %4 = bitcast float* %3 to <4 x float>*
  %wide.masked.load12 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %4, i32 4, <4 x i1> %1, <4 x float> undef)
  %5 = fmul fast <4 x float> %wide.masked.load12, %wide.masked.load
  %6 = fsub fast <4 x float> %5, %broadcast.splat14
  %7 = getelementptr inbounds float, float* %z, i32 %index
  %8 = bitcast float* %7 to <4 x float>*
  call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %6, <4 x float>* %8, i32 4, <4 x i1> %1)
  %index.next = add i32 %index, 4
  %9 = icmp eq i32 %index.next, %n.vec
  br i1 %9, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}
; z[i] = a - x[i]*y[i] via fma(x, -y, splat(a)); the negated multiplicand maps
; to the fused multiply-subtract vfms.f32 (q-form).
define arm_aapcs_vfpcc void @fmss3(float* nocapture readonly %x, float* nocapture readonly %y, float* noalias nocapture %z, float %a, i32 %n) {
; CHECK-LABEL: fmss3:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, lr}
; CHECK-NEXT: push {r4, lr}
; CHECK-NEXT: cmp r3, #1
; CHECK-NEXT: it lt
; CHECK-NEXT: poplt {r4, pc}
; CHECK-NEXT: .LBB6_1: @ %vector.ph
; CHECK-NEXT: vmov r4, s0
; CHECK-NEXT: vdup.32 q0, r4
; CHECK-NEXT: dlstp.32 lr, r3
; CHECK-NEXT: .LBB6_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vmov q3, q0
; CHECK-NEXT: vldrw.u32 q1, [r0], #16
; CHECK-NEXT: vldrw.u32 q2, [r1], #16
; CHECK-NEXT: vfms.f32 q3, q2, q1
; CHECK-NEXT: vstrw.32 q3, [r2], #16
; CHECK-NEXT: letp lr, .LBB6_2
; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT: pop {r4, pc}
entry:
  %cmp8 = icmp sgt i32 %n, 0
  br i1 %cmp8, label %vector.ph, label %for.cond.cleanup

vector.ph:                                        ; preds = %entry
  %n.rnd.up = add i32 %n, 3
  %n.vec = and i32 %n.rnd.up, -4
  %broadcast.splatinsert13 = insertelement <4 x float> undef, float %a, i32 0
  %broadcast.splat14 = shufflevector <4 x float> %broadcast.splatinsert13, <4 x float> undef, <4 x i32> zeroinitializer
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds float, float* %x, i32 %index
  %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
  %2 = bitcast float* %0 to <4 x float>*
  %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %2, i32 4, <4 x i1> %1, <4 x float> undef)
  %3 = getelementptr inbounds float, float* %y, i32 %index
  %4 = bitcast float* %3 to <4 x float>*
  %wide.masked.load12 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %4, i32 4, <4 x i1> %1, <4 x float> undef)
  %5 = fneg fast <4 x float> %wide.masked.load12
  %6 = call fast <4 x float> @llvm.fma.v4f32(<4 x float> %wide.masked.load, <4 x float> %5, <4 x float> %broadcast.splat14)
  %7 = getelementptr inbounds float, float* %z, i32 %index
  %8 = bitcast float* %7 to <4 x float>*
  call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %6, <4 x float>* %8, i32 4, <4 x i1> %1)
  %index.next = add i32 %index, 4
  %9 = icmp eq i32 %index.next, %n.vec
  br i1 %9, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}
; z[i] = a - x[i]*y[i] written as fsub(splat, fmul); expected to match the
; same vfms.f32 pattern as fmss3.
define arm_aapcs_vfpcc void @fmss4(float* nocapture readonly %x, float* nocapture readonly %y, float* noalias nocapture %z, float %a, i32 %n) {
; CHECK-LABEL: fmss4:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, lr}
; CHECK-NEXT: push {r4, lr}
; CHECK-NEXT: cmp r3, #1
; CHECK-NEXT: it lt
; CHECK-NEXT: poplt {r4, pc}
; CHECK-NEXT: .LBB7_1: @ %vector.ph
; CHECK-NEXT: vmov r4, s0
; CHECK-NEXT: vdup.32 q0, r4
; CHECK-NEXT: dlstp.32 lr, r3
; CHECK-NEXT: .LBB7_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vmov q3, q0
; CHECK-NEXT: vldrw.u32 q1, [r0], #16
; CHECK-NEXT: vldrw.u32 q2, [r1], #16
; CHECK-NEXT: vfms.f32 q3, q2, q1
; CHECK-NEXT: vstrw.32 q3, [r2], #16
; CHECK-NEXT: letp lr, .LBB7_2
; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT: pop {r4, pc}
entry:
  %cmp8 = icmp sgt i32 %n, 0
  br i1 %cmp8, label %vector.ph, label %for.cond.cleanup

vector.ph:                                        ; preds = %entry
  %n.rnd.up = add i32 %n, 3
  %n.vec = and i32 %n.rnd.up, -4
  %broadcast.splatinsert13 = insertelement <4 x float> undef, float %a, i32 0
  %broadcast.splat14 = shufflevector <4 x float> %broadcast.splatinsert13, <4 x float> undef, <4 x i32> zeroinitializer
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds float, float* %x, i32 %index
  %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
  %2 = bitcast float* %0 to <4 x float>*
  %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %2, i32 4, <4 x i1> %1, <4 x float> undef)
  %3 = getelementptr inbounds float, float* %y, i32 %index
  %4 = bitcast float* %3 to <4 x float>*
  %wide.masked.load12 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %4, i32 4, <4 x i1> %1, <4 x float> undef)
  %5 = fmul fast <4 x float> %wide.masked.load12, %wide.masked.load
  %6 = fsub fast <4 x float> %broadcast.splat14, %5
  %7 = getelementptr inbounds float, float* %z, i32 %index
  %8 = bitcast float* %7 to <4 x float>*
  call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %6, <4 x float>* %8, i32 4, <4 x i1> %1)
  %index.next = add i32 %index, 4
  %9 = icmp eq i32 %index.next, %n.vec
  br i1 %9, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}
; z[i] = x[i]*(-a) + y[i]: the scalar negation folds to a sign-bit eor on the
; GPR operand of vfma.f32.
define arm_aapcs_vfpcc void @fms1(float* nocapture readonly %x, float* nocapture readonly %y, float* noalias nocapture %z, float %a, i32 %n) {
; CHECK-LABEL: fms1:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, lr}
; CHECK-NEXT: push {r4, lr}
; CHECK-NEXT: cmp r3, #1
; CHECK-NEXT: it lt
; CHECK-NEXT: poplt {r4, pc}
; CHECK-NEXT: .LBB8_1: @ %vector.ph
; CHECK-NEXT: vmov r12, s0
; CHECK-NEXT: eor r12, r12, #-2147483648
; CHECK-NEXT: dlstp.32 lr, r3
; CHECK-NEXT: .LBB8_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q0, [r0], #16
; CHECK-NEXT: vldrw.u32 q1, [r1], #16
; CHECK-NEXT: vfma.f32 q1, q0, r12
; CHECK-NEXT: vstrw.32 q1, [r2], #16
; CHECK-NEXT: letp lr, .LBB8_2
; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT: pop {r4, pc}
entry:
  %cmp8 = icmp sgt i32 %n, 0
  br i1 %cmp8, label %vector.ph, label %for.cond.cleanup

vector.ph:                                        ; preds = %entry
  %fneg = fneg fast float %a
  %n.rnd.up = add i32 %n, 3
  %n.vec = and i32 %n.rnd.up, -4
  %broadcast.splatinsert13 = insertelement <4 x float> undef, float %fneg, i32 0
  %broadcast.splat14 = shufflevector <4 x float> %broadcast.splatinsert13, <4 x float> undef, <4 x i32> zeroinitializer
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds float, float* %x, i32 %index
  %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
  %2 = bitcast float* %0 to <4 x float>*
  %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %2, i32 4, <4 x i1> %1, <4 x float> undef)
  %3 = getelementptr inbounds float, float* %y, i32 %index
  %4 = bitcast float* %3 to <4 x float>*
  %wide.masked.load12 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %4, i32 4, <4 x i1> %1, <4 x float> undef)
  %5 = call fast <4 x float> @llvm.fma.v4f32(<4 x float> %wide.masked.load, <4 x float> %broadcast.splat14, <4 x float> %wide.masked.load12)
  %6 = getelementptr inbounds float, float* %z, i32 %index
  %7 = bitcast float* %6 to <4 x float>*
  call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %5, <4 x float>* %7, i32 4, <4 x i1> %1)
  %index.next = add i32 %index, 4
  %8 = icmp eq i32 %index.next, %n.vec
  br i1 %8, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}
; z[i] = y[i] - x[i]*a: fsub of an fmul-by-splat selects the q-form vfms.f32
; with the splat materialised by vdup.
define arm_aapcs_vfpcc void @fms2(float* nocapture readonly %x, float* nocapture readonly %y, float* noalias nocapture %z, float %a, i32 %n) {
; CHECK-LABEL: fms2:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, lr}
; CHECK-NEXT: push {r4, lr}
; CHECK-NEXT: cmp r3, #1
; CHECK-NEXT: it lt
; CHECK-NEXT: poplt {r4, pc}
; CHECK-NEXT: .LBB9_1: @ %vector.ph
; CHECK-NEXT: vmov r4, s0
; CHECK-NEXT: vdup.32 q0, r4
; CHECK-NEXT: dlstp.32 lr, r3
; CHECK-NEXT: .LBB9_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q1, [r0], #16
; CHECK-NEXT: vldrw.u32 q2, [r1], #16
; CHECK-NEXT: vfms.f32 q2, q1, q0
; CHECK-NEXT: vstrw.32 q2, [r2], #16
; CHECK-NEXT: letp lr, .LBB9_2
; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT: pop {r4, pc}
entry:
  %cmp8 = icmp sgt i32 %n, 0
  br i1 %cmp8, label %vector.ph, label %for.cond.cleanup

vector.ph:                                        ; preds = %entry
  %n.rnd.up = add i32 %n, 3
  %n.vec = and i32 %n.rnd.up, -4
  %broadcast.splatinsert13 = insertelement <4 x float> undef, float %a, i32 0
  %broadcast.splat14 = shufflevector <4 x float> %broadcast.splatinsert13, <4 x float> undef, <4 x i32> zeroinitializer
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds float, float* %x, i32 %index
  %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
  %2 = bitcast float* %0 to <4 x float>*
  %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %2, i32 4, <4 x i1> %1, <4 x float> undef)
  %3 = getelementptr inbounds float, float* %y, i32 %index
  %4 = bitcast float* %3 to <4 x float>*
  %wide.masked.load12 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %4, i32 4, <4 x i1> %1, <4 x float> undef)
  %5 = fmul fast <4 x float> %wide.masked.load, %broadcast.splat14
  %6 = fsub fast <4 x float> %wide.masked.load12, %5
  %7 = getelementptr inbounds float, float* %z, i32 %index
  %8 = bitcast float* %7 to <4 x float>*
  call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %6, <4 x float>* %8, i32 4, <4 x i1> %1)
  %index.next = add i32 %index, 4
  %9 = icmp eq i32 %index.next, %n.vec
  br i1 %9, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}
; z[i] = x[i]*a - y[i] via fma(x, splat(a), -y): the vector fneg stays as a
; vneg.f32 feeding vfma.f32 with the scalar in a GPR.
define arm_aapcs_vfpcc void @fms3(float* nocapture readonly %x, float* nocapture readonly %y, float* noalias nocapture %z, float %a, i32 %n) {
; CHECK-LABEL: fms3:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, lr}
; CHECK-NEXT: push {r4, lr}
; CHECK-NEXT: cmp r3, #1
; CHECK-NEXT: it lt
; CHECK-NEXT: poplt {r4, pc}
; CHECK-NEXT: .LBB10_1: @ %vector.ph
; CHECK-NEXT: vmov r12, s0
; CHECK-NEXT: dlstp.32 lr, r3
; CHECK-NEXT: .LBB10_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q0, [r1], #16
; CHECK-NEXT: vldrw.u32 q1, [r0], #16
; CHECK-NEXT: vneg.f32 q0, q0
; CHECK-NEXT: vfma.f32 q0, q1, r12
; CHECK-NEXT: vstrw.32 q0, [r2], #16
; CHECK-NEXT: letp lr, .LBB10_2
; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT: pop {r4, pc}
entry:
  %cmp8 = icmp sgt i32 %n, 0
  br i1 %cmp8, label %vector.ph, label %for.cond.cleanup

vector.ph:                                        ; preds = %entry
  %n.rnd.up = add i32 %n, 3
  %n.vec = and i32 %n.rnd.up, -4
  %broadcast.splatinsert13 = insertelement <4 x float> undef, float %a, i32 0
  %broadcast.splat14 = shufflevector <4 x float> %broadcast.splatinsert13, <4 x float> undef, <4 x i32> zeroinitializer
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds float, float* %x, i32 %index
  %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
  %2 = bitcast float* %0 to <4 x float>*
  %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %2, i32 4, <4 x i1> %1, <4 x float> undef)
  %3 = getelementptr inbounds float, float* %y, i32 %index
  %4 = bitcast float* %3 to <4 x float>*
  %wide.masked.load12 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %4, i32 4, <4 x i1> %1, <4 x float> undef)
  %5 = fneg fast <4 x float> %wide.masked.load12
  %6 = call fast <4 x float> @llvm.fma.v4f32(<4 x float> %wide.masked.load, <4 x float> %broadcast.splat14, <4 x float> %5)
  %7 = getelementptr inbounds float, float* %z, i32 %index
  %8 = bitcast float* %7 to <4 x float>*
  call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %6, <4 x float>* %8, i32 4, <4 x i1> %1)
  %index.next = add i32 %index, 4
  %9 = icmp eq i32 %index.next, %n.vec
  br i1 %9, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}
; z[i] = x[i]*a - y[i] written as fsub(fmul, load); codegen matches fms3:
; vneg of the subtrahend followed by scalar vfma.f32.
define arm_aapcs_vfpcc void @fms4(float* nocapture readonly %x, float* nocapture readonly %y, float* noalias nocapture %z, float %a, i32 %n) {
; CHECK-LABEL: fms4:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, lr}
; CHECK-NEXT: push {r4, lr}
; CHECK-NEXT: cmp r3, #1
; CHECK-NEXT: it lt
; CHECK-NEXT: poplt {r4, pc}
; CHECK-NEXT: .LBB11_1: @ %vector.ph
; CHECK-NEXT: vmov r12, s0
; CHECK-NEXT: dlstp.32 lr, r3
; CHECK-NEXT: .LBB11_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q0, [r1], #16
; CHECK-NEXT: vldrw.u32 q1, [r0], #16
; CHECK-NEXT: vneg.f32 q0, q0
; CHECK-NEXT: vfma.f32 q0, q1, r12
; CHECK-NEXT: vstrw.32 q0, [r2], #16
; CHECK-NEXT: letp lr, .LBB11_2
; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT: pop {r4, pc}
entry:
  %cmp8 = icmp sgt i32 %n, 0
  br i1 %cmp8, label %vector.ph, label %for.cond.cleanup

vector.ph:                                        ; preds = %entry
  %n.rnd.up = add i32 %n, 3
  %n.vec = and i32 %n.rnd.up, -4
  %broadcast.splatinsert12 = insertelement <4 x float> undef, float %a, i32 0
  %broadcast.splat13 = shufflevector <4 x float> %broadcast.splatinsert12, <4 x float> undef, <4 x i32> zeroinitializer
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds float, float* %x, i32 %index
  %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
  %2 = bitcast float* %0 to <4 x float>*
  %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %2, i32 4, <4 x i1> %1, <4 x float> undef)
  %3 = fmul fast <4 x float> %wide.masked.load, %broadcast.splat13
  %4 = getelementptr inbounds float, float* %y, i32 %index
  %5 = bitcast float* %4 to <4 x float>*
  %wide.masked.load14 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %5, i32 4, <4 x i1> %1, <4 x float> undef)
  %6 = fsub fast <4 x float> %3, %wide.masked.load14
  %7 = getelementptr inbounds float, float* %z, i32 %index
  %8 = bitcast float* %7 to <4 x float>*
  call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %6, <4 x float>* %8, i32 4, <4 x i1> %1)
  %index.next = add i32 %index, 4
  %9 = icmp eq i32 %index.next, %n.vec
  br i1 %9, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}
; Intrinsics used by the tests above: masked load/store for the predicated
; loop bodies, llvm.fma for the fused operations, and get.active.lane.mask
; for tail predication.
declare <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>*, i32 immarg, <4 x i1>, <4 x float>)
declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>)
declare void @llvm.masked.store.v4f32.p0v4f32(<4 x float>, <4 x float>*, i32 immarg, <4 x i1>)
declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32)