llvm/test/CodeGen/Thumb2/LowOverheadLoops/constbound.ll

   1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
   2 ; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve %s -o - | FileCheck %s
   3
   4 define dso_local i32 @test_500_504(i32* nocapture readonly %x) {
     ; Masked add-reduction over %x. The lane mask is built for an element
     ; count of 500, while the loop steps the index by 4 and exits when it
     ; reaches 504, i.e. after 126 iterations.
     ; The expected output below is a loop closed by `le lr` with lr preset
     ; to 126 and the mask computed explicitly (vcmp/vpnot/vpsttt); no
     ; dlstp/letp pair is expected for this function.
     ; NOTE(review): presumably tail-predication is not applied because 500
     ; elements need only 125 four-lane iterations while the loop runs 126 --
     ; confirm against the MVE tail-predication pass.
   5 ; CHECK-LABEL: test_500_504:
   6 ; CHECK:       @ %bb.0: @ %entry
   7 ; CHECK-NEXT:    .save {r7, lr}
   8 ; CHECK-NEXT:    push {r7, lr}
   9 ; CHECK-NEXT:    mov.w lr, #126
  10 ; CHECK-NEXT:    adr r2, .LCPI0_0
  11 ; CHECK-NEXT:    vldrw.u32 q0, [r2]
  12 ; CHECK-NEXT:    mov.w r2, #500
  13 ; CHECK-NEXT:    vdup.32 q1, r2
  14 ; CHECK-NEXT:    movs r1, #0
  15 ; CHECK-NEXT:    movs r2, #0
  16 ; CHECK-NEXT:  .LBB0_1: @ %vector.body
  17 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
  18 ; CHECK-NEXT:    vadd.i32 q2, q0, r1
  19 ; CHECK-NEXT:    vdup.32 q3, r1
  20 ; CHECK-NEXT:    vcmp.u32 hi, q3, q2
  21 ; CHECK-NEXT:    adds r1, #4
  22 ; CHECK-NEXT:    vpnot
  23 ; CHECK-NEXT:    vpsttt
  24 ; CHECK-NEXT:    vcmpt.u32 hi, q1, q2
  25 ; CHECK-NEXT:    vldrwt.u32 q2, [r0], #16
  26 ; CHECK-NEXT:    vaddvat.u32 r2, q2
  27 ; CHECK-NEXT:    le lr, .LBB0_1
  28 ; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
  29 ; CHECK-NEXT:    mov r0, r2
  30 ; CHECK-NEXT:    pop {r7, pc}
  31 ; CHECK-NEXT:    .p2align 4
  32 ; CHECK-NEXT:  @ %bb.3:
  33 ; CHECK-NEXT:  .LCPI0_0:
  34 ; CHECK-NEXT:    .long 0 @ 0x0
  35 ; CHECK-NEXT:    .long 1 @ 0x1
  36 ; CHECK-NEXT:    .long 2 @ 0x2
  37 ; CHECK-NEXT:    .long 3 @ 0x3
  38 entry:
  39   br label %vector.body
  40
  41 vector.body:                                      ; preds = %vector.body, %entry
  42   %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
  43   %vec.phi = phi i32 [ 0, %entry ], [ %4, %vector.body ] ; running sum, starts at 0
  44   %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 500) ; mask from current index and element count 500
  45   %0 = getelementptr inbounds i32, i32* %x, i32 %index
  46   %1 = bitcast i32* %0 to <4 x i32>*
  47   %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %1, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
  48   %2 = select <4 x i1> %active.lane.mask, <4 x i32> %wide.masked.load, <4 x i32> zeroinitializer ; zero the lanes whose mask bit is clear
  49   %3 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %2)
  50   %4 = add i32 %3, %vec.phi
  51   %index.next = add i32 %index, 4
  52   %5 = icmp eq i32 %index.next, 504 ; exit once the index reaches 504
  53   br i1 %5, label %for.cond.cleanup, label %vector.body
  54
  55 for.cond.cleanup:                                 ; preds = %vector.body
  56   ret i32 %4
  57 }
  58
  59 define dso_local i32 @test_501_504(i32* nocapture readonly %x) {
     ; Masked add-reduction over %x. The lane mask is built for an element
     ; count of 501; the loop steps the index by 4 and exits when it reaches
     ; 504 (126 iterations, which equals ceil(501 / 4)).
     ; The expected output below is a dlstp.32/letp loop whose start
     ; instruction receives the element count 501 in r1; the loop body holds
     ; only the load and the accumulating reduction, with no explicit mask.
  60 ; CHECK-LABEL: test_501_504:
  61 ; CHECK:       @ %bb.0: @ %entry
  62 ; CHECK-NEXT:    .save {r7, lr}
  63 ; CHECK-NEXT:    push {r7, lr}
  64 ; CHECK-NEXT:    movw r1, #501
  65 ; CHECK-NEXT:    movs r2, #0
  66 ; CHECK-NEXT:    dlstp.32 lr, r1
  67 ; CHECK-NEXT:  .LBB1_1: @ %vector.body
  68 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
  69 ; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
  70 ; CHECK-NEXT:    vaddva.u32 r2, q0
  71 ; CHECK-NEXT:    letp lr, .LBB1_1
  72 ; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
  73 ; CHECK-NEXT:    mov r0, r2
  74 ; CHECK-NEXT:    pop {r7, pc}
  75 entry:
  76   br label %vector.body
  77
  78 vector.body:                                      ; preds = %vector.body, %entry
  79   %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
  80   %vec.phi = phi i32 [ 0, %entry ], [ %4, %vector.body ] ; running sum, starts at 0
  81   %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 501) ; mask from current index and element count 501
  82   %0 = getelementptr inbounds i32, i32* %x, i32 %index
  83   %1 = bitcast i32* %0 to <4 x i32>*
  84   %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %1, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
  85   %2 = select <4 x i1> %active.lane.mask, <4 x i32> %wide.masked.load, <4 x i32> zeroinitializer ; zero the lanes whose mask bit is clear
  86   %3 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %2)
  87   %4 = add i32 %3, %vec.phi
  88   %index.next = add i32 %index, 4
  89   %5 = icmp eq i32 %index.next, 504 ; exit once the index reaches 504
  90   br i1 %5, label %for.cond.cleanup, label %vector.body
  91
  92 for.cond.cleanup:                                 ; preds = %vector.body
  93   ret i32 %4
  94 }
  95
  96 define dso_local i32 @test_502_504(i32* nocapture readonly %x) {
     ; Masked add-reduction over %x. The lane mask is built for an element
     ; count of 502; the loop steps the index by 4 and exits when it reaches
     ; 504 (126 iterations, which equals ceil(502 / 4)).
     ; The expected output below is a dlstp.32/letp loop whose start
     ; instruction receives the element count 502 in r1; the loop body holds
     ; only the load and the accumulating reduction, with no explicit mask.
  97 ; CHECK-LABEL: test_502_504:
  98 ; CHECK:       @ %bb.0: @ %entry
  99 ; CHECK-NEXT:    .save {r7, lr}
 100 ; CHECK-NEXT:    push {r7, lr}
 101 ; CHECK-NEXT:    mov.w r1, #502
 102 ; CHECK-NEXT:    movs r2, #0
 103 ; CHECK-NEXT:    dlstp.32 lr, r1
 104 ; CHECK-NEXT:  .LBB2_1: @ %vector.body
 105 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 106 ; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
 107 ; CHECK-NEXT:    vaddva.u32 r2, q0
 108 ; CHECK-NEXT:    letp lr, .LBB2_1
 109 ; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
 110 ; CHECK-NEXT:    mov r0, r2
 111 ; CHECK-NEXT:    pop {r7, pc}
 112 entry:
 113   br label %vector.body
 114
 115 vector.body:                                      ; preds = %vector.body, %entry
 116   %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
 117   %vec.phi = phi i32 [ 0, %entry ], [ %4, %vector.body ] ; running sum, starts at 0
 118   %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 502) ; mask from current index and element count 502
 119   %0 = getelementptr inbounds i32, i32* %x, i32 %index
 120   %1 = bitcast i32* %0 to <4 x i32>*
 121   %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %1, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
 122   %2 = select <4 x i1> %active.lane.mask, <4 x i32> %wide.masked.load, <4 x i32> zeroinitializer ; zero the lanes whose mask bit is clear
 123   %3 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %2)
 124   %4 = add i32 %3, %vec.phi
 125   %index.next = add i32 %index, 4
 126   %5 = icmp eq i32 %index.next, 504 ; exit once the index reaches 504
 127   br i1 %5, label %for.cond.cleanup, label %vector.body
 128
 129 for.cond.cleanup:                                 ; preds = %vector.body
 130   ret i32 %4
 131 }
 132
 133 define dso_local i32 @test_503_504(i32* nocapture readonly %x) {
     ; Masked add-reduction over %x. The lane mask is built for an element
     ; count of 503; the loop steps the index by 4 and exits when it reaches
     ; 504 (126 iterations, which equals ceil(503 / 4)).
     ; The expected output below is a dlstp.32/letp loop whose start
     ; instruction receives the element count 503 in r1; the loop body holds
     ; only the load and the accumulating reduction, with no explicit mask.
 134 ; CHECK-LABEL: test_503_504:
 135 ; CHECK:       @ %bb.0: @ %entry
 136 ; CHECK-NEXT:    .save {r7, lr}
 137 ; CHECK-NEXT:    push {r7, lr}
 138 ; CHECK-NEXT:    movw r1, #503
 139 ; CHECK-NEXT:    movs r2, #0
 140 ; CHECK-NEXT:    dlstp.32 lr, r1
 141 ; CHECK-NEXT:  .LBB3_1: @ %vector.body
 142 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 143 ; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
 144 ; CHECK-NEXT:    vaddva.u32 r2, q0
 145 ; CHECK-NEXT:    letp lr, .LBB3_1
 146 ; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
 147 ; CHECK-NEXT:    mov r0, r2
 148 ; CHECK-NEXT:    pop {r7, pc}
 149 entry:
 150   br label %vector.body
 151
 152 vector.body:                                      ; preds = %vector.body, %entry
 153   %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
 154   %vec.phi = phi i32 [ 0, %entry ], [ %4, %vector.body ] ; running sum, starts at 0
 155   %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 503) ; mask from current index and element count 503
 156   %0 = getelementptr inbounds i32, i32* %x, i32 %index
 157   %1 = bitcast i32* %0 to <4 x i32>*
 158   %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %1, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
 159   %2 = select <4 x i1> %active.lane.mask, <4 x i32> %wide.masked.load, <4 x i32> zeroinitializer ; zero the lanes whose mask bit is clear
 160   %3 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %2)
 161   %4 = add i32 %3, %vec.phi
 162   %index.next = add i32 %index, 4
 163   %5 = icmp eq i32 %index.next, 504 ; exit once the index reaches 504
 164   br i1 %5, label %for.cond.cleanup, label %vector.body
 165
 166 for.cond.cleanup:                                 ; preds = %vector.body
 167   ret i32 %4
 168 }
 169
 170 define dso_local i32 @test_504_504(i32* nocapture readonly %x) {
     ; Masked add-reduction over %x. The lane mask is built for an element
     ; count of 504, the same value the loop exit compares against; with an
     ; index step of 4 that is 126 iterations (504 / 4).
     ; The expected output below is a dlstp.32/letp loop whose start
     ; instruction receives the element count 504 in r1; the loop body holds
     ; only the load and the accumulating reduction, with no explicit mask.
 171 ; CHECK-LABEL: test_504_504:
 172 ; CHECK:       @ %bb.0: @ %entry
 173 ; CHECK-NEXT:    .save {r7, lr}
 174 ; CHECK-NEXT:    push {r7, lr}
 175 ; CHECK-NEXT:    mov.w r1, #504
 176 ; CHECK-NEXT:    movs r2, #0
 177 ; CHECK-NEXT:    dlstp.32 lr, r1
 178 ; CHECK-NEXT:  .LBB4_1: @ %vector.body
 179 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 180 ; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
 181 ; CHECK-NEXT:    vaddva.u32 r2, q0
 182 ; CHECK-NEXT:    letp lr, .LBB4_1
 183 ; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
 184 ; CHECK-NEXT:    mov r0, r2
 185 ; CHECK-NEXT:    pop {r7, pc}
 186 entry:
 187   br label %vector.body
 188
 189 vector.body:                                      ; preds = %vector.body, %entry
 190   %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
 191   %vec.phi = phi i32 [ 0, %entry ], [ %4, %vector.body ] ; running sum, starts at 0
 192   %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 504) ; mask from current index and element count 504
 193   %0 = getelementptr inbounds i32, i32* %x, i32 %index
 194   %1 = bitcast i32* %0 to <4 x i32>*
 195   %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %1, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
 196   %2 = select <4 x i1> %active.lane.mask, <4 x i32> %wide.masked.load, <4 x i32> zeroinitializer ; zero the lanes whose mask bit is clear
 197   %3 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %2)
 198   %4 = add i32 %3, %vec.phi
 199   %index.next = add i32 %index, 4
 200   %5 = icmp eq i32 %index.next, 504 ; exit once the index reaches 504
 201   br i1 %5, label %for.cond.cleanup, label %vector.body
 202
 203 for.cond.cleanup:                                 ; preds = %vector.body
 204   ret i32 %4
 205 }
 206
 207 declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32) ; called in this file as (current index, constant element count)
 208 declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>) ; called in this file as (pointer, alignment 4, lane mask, undef passthru)
 209 declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>) ; takes a <4 x i32> vector and returns a single i32