llvm/test/CodeGen/AArch64/machine-licm-sub-loop.ll

   1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
   2 ; RUN: llc -mtriple aarch64-none-linux-gnu < %s | FileCheck %s
   3
   4 define void @foo(i32 noundef %limit, ptr %out, ptr %y) {
     ; foo(limit, out, y): for every i and j in [0, limit) this performs
     ;   out[i * limit + j] += sext(y[i]) * sext(y[j])
     ; where y is read as i16 elements and out is read/written as i32 elements.
     ; Nothing is done when limit <= 0. The IR is already vectorized: each outer
     ; iteration runs a vector loop handling 16 elements per iteration, followed
     ; by a scalar loop for whatever the vector loop did not cover.
     ;
     ; In the autogenerated assertions below, the outer-loop value loaded into
     ; w15 is broadcast by `dup v0.8h, w15` inside the vector.body loop (.LBB0_6).
     ; NOTE(review): judging by the test's file name this presumably pins down
     ; how MachineLICM treats that inner-loop instruction -- confirm against the
     ; change that introduced the test.
   5 ; CHECK-LABEL: foo:
   6 ; CHECK:       // %bb.0: // %entry
   7 ; CHECK-NEXT:    // kill: def $w0 killed $w0 def $x0
   8 ; CHECK-NEXT:    cmp w0, #1
   9 ; CHECK-NEXT:    b.lt .LBB0_10
  10 ; CHECK-NEXT:  // %bb.1: // %for.cond1.preheader.us.preheader
  11 ; CHECK-NEXT:    mov w10, w0
  12 ; CHECK-NEXT:    ubfiz x11, x0, #2, #32
  13 ; CHECK-NEXT:    mov x8, xzr
  14 ; CHECK-NEXT:    mov x9, xzr
  15 ; CHECK-NEXT:    and x12, x10, #0xfffffff0
  16 ; CHECK-NEXT:    add x13, x1, #32
  17 ; CHECK-NEXT:    add x14, x2, #16
  18 ; CHECK-NEXT:    b .LBB0_3
  19 ; CHECK-NEXT:  .LBB0_2: // %for.cond1.for.cond.cleanup3_crit_edge.us
  20 ; CHECK-NEXT:    // in Loop: Header=BB0_3 Depth=1
  21 ; CHECK-NEXT:    add x9, x9, #1
  22 ; CHECK-NEXT:    add x13, x13, x11
  23 ; CHECK-NEXT:    add x8, x8, x10
  24 ; CHECK-NEXT:    cmp x9, x10
  25 ; CHECK-NEXT:    b.eq .LBB0_10
  26 ; CHECK-NEXT:  .LBB0_3: // %for.cond1.preheader.us
  27 ; CHECK-NEXT:    // =>This Loop Header: Depth=1
  28 ; CHECK-NEXT:    // Child Loop BB0_6 Depth 2
  29 ; CHECK-NEXT:    // Child Loop BB0_9 Depth 2
  30 ; CHECK-NEXT:    ldrsh w15, [x2, x9, lsl #1]
  31 ; CHECK-NEXT:    cmp w0, #16
  32 ; CHECK-NEXT:    b.hs .LBB0_5
  33 ; CHECK-NEXT:  // %bb.4: // in Loop: Header=BB0_3 Depth=1
  34 ; CHECK-NEXT:    mov x18, xzr
  35 ; CHECK-NEXT:    b .LBB0_8
  36 ; CHECK-NEXT:  .LBB0_5: // %vector.ph
  37 ; CHECK-NEXT:    // in Loop: Header=BB0_3 Depth=1
  38 ; CHECK-NEXT:    mov x16, x14
  39 ; CHECK-NEXT:    mov x17, x13
  40 ; CHECK-NEXT:    mov x18, x12
  41 ; CHECK-NEXT:  .LBB0_6: // %vector.body
  42 ; CHECK-NEXT:    // Parent Loop BB0_3 Depth=1
  43 ; CHECK-NEXT:    // => This Inner Loop Header: Depth=2
  44 ; CHECK-NEXT:    dup v0.8h, w15
  45 ; CHECK-NEXT:    ldp q1, q4, [x16, #-16]
  46 ; CHECK-NEXT:    ldp q3, q2, [x17, #-32]
  47 ; CHECK-NEXT:    subs x18, x18, #16
  48 ; CHECK-NEXT:    ldp q6, q5, [x17]
  49 ; CHECK-NEXT:    add x16, x16, #32
  50 ; CHECK-NEXT:    smlal2 v2.4s, v0.8h, v1.8h
  51 ; CHECK-NEXT:    smlal v3.4s, v0.4h, v1.4h
  52 ; CHECK-NEXT:    smlal2 v5.4s, v0.8h, v4.8h
  53 ; CHECK-NEXT:    smlal v6.4s, v0.4h, v4.4h
  54 ; CHECK-NEXT:    stp q3, q2, [x17, #-32]
  55 ; CHECK-NEXT:    stp q6, q5, [x17], #64
  56 ; CHECK-NEXT:    b.ne .LBB0_6
  57 ; CHECK-NEXT:  // %bb.7: // %middle.block
  58 ; CHECK-NEXT:    // in Loop: Header=BB0_3 Depth=1
  59 ; CHECK-NEXT:    cmp x12, x10
  60 ; CHECK-NEXT:    mov x18, x12
  61 ; CHECK-NEXT:    b.eq .LBB0_2
  62 ; CHECK-NEXT:  .LBB0_8: // %for.body4.us.preheader
  63 ; CHECK-NEXT:    // in Loop: Header=BB0_3 Depth=1
  64 ; CHECK-NEXT:    add x16, x18, x8
  65 ; CHECK-NEXT:    add x17, x2, x18, lsl #1
  66 ; CHECK-NEXT:    sub x18, x10, x18
  67 ; CHECK-NEXT:    add x16, x1, x16, lsl #2
  68 ; CHECK-NEXT:  .LBB0_9: // %for.body4.us
  69 ; CHECK-NEXT:    // Parent Loop BB0_3 Depth=1
  70 ; CHECK-NEXT:    // => This Inner Loop Header: Depth=2
  71 ; CHECK-NEXT:    ldrsh w3, [x17], #2
  72 ; CHECK-NEXT:    ldr w4, [x16]
  73 ; CHECK-NEXT:    subs x18, x18, #1
  74 ; CHECK-NEXT:    madd w3, w3, w15, w4
  75 ; CHECK-NEXT:    str w3, [x16], #4
  76 ; CHECK-NEXT:    b.ne .LBB0_9
  77 ; CHECK-NEXT:    b .LBB0_2
  78 ; CHECK-NEXT:  .LBB0_10: // %for.cond.cleanup
  79 ; CHECK-NEXT:    ret
     ; Guard: the loops are entered only when limit > 0 (signed compare).
  80 entry:
  81   %cmp26 = icmp sgt i32 %limit, 0
  82   br i1 %cmp26, label %for.cond1.preheader.us.preheader, label %for.cond.cleanup
  83
     ; Values computed once and reused by every outer-loop iteration.
  84 for.cond1.preheader.us.preheader:                 ; preds = %entry
       ; %0 and %wide.trip.count34 are the same zero-extended limit; %0 is used as
       ; the row stride, %wide.trip.count34 as the trip count of both loop levels.
  85   %0 = zext i32 %limit to i64
  86   %wide.trip.count34 = zext i32 %limit to i64
       ; True when limit < 16, in which case the vector loop is bypassed.
  87   %min.iters.check = icmp ult i32 %limit, 16
       ; 4294967280 is 0xfffffff0: the trip count rounded down to a multiple of
       ; 16, i.e. the number of elements the vector loop processes.
  88   %n.vec = and i64 %wide.trip.count34, 4294967280
       ; True when the vector loop covers every element (no scalar remainder).
  89   %cmp.n = icmp eq i64 %n.vec, %wide.trip.count34
  90   br label %for.cond1.preheader.us
  91
     ; Outer loop header, iterating i = %indvars.iv30 from 0 up to limit.
  92 for.cond1.preheader.us:                           ; preds = %for.cond1.preheader.us.preheader, %for.cond1.for.cond.cleanup3_crit_edge.us
  93   %indvars.iv30 = phi i64 [ 0, %for.cond1.preheader.us.preheader ], [ %indvars.iv.next31, %for.cond1.for.cond.cleanup3_crit_edge.us ]
       ; %conv.us = sext(y[i]); it is invariant in both inner loops.
  94   %arrayidx.us = getelementptr inbounds i16, ptr %y, i64 %indvars.iv30
  95   %1 = load i16, ptr %arrayidx.us, align 2
  96   %conv.us = sext i16 %1 to i32
       ; %2 = i * limit, the element offset of row i within out.
  97   %2 = mul nsw i64 %indvars.iv30, %0
  98   br i1 %min.iters.check, label %for.body4.us.preheader, label %vector.ph
  99
     ; Vector preheader: broadcast sext(y[i]) into all eight i32 lanes. The two
     ; splats are identical; each one feeds one of the two multiplies in
     ; vector.body.
 100 vector.ph:                                        ; preds = %for.cond1.preheader.us
 101   %broadcast.splatinsert = insertelement <8 x i32> poison, i32 %conv.us, i64 0
 102   %broadcast.splat = shufflevector <8 x i32> %broadcast.splatinsert, <8 x i32> poison, <8 x i32> zeroinitializer
 103   %broadcast.splatinsert37 = insertelement <8 x i32> poison, i32 %conv.us, i64 0
 104   %broadcast.splat38 = shufflevector <8 x i32> %broadcast.splatinsert37, <8 x i32> poison, <8 x i32> zeroinitializer
 105   br label %vector.body
 106
     ; Inner vector loop: handles 16 elements per iteration as two 8-lane halves,
     ; with %index advancing from 0 to %n.vec in steps of 16.
 107 vector.body:                                      ; preds = %vector.body, %vector.ph
 108   %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
       ; Load y[index .. index+7] and y[index+8 .. index+15].
 109   %3 = getelementptr inbounds i16, ptr %y, i64 %index
 110   %wide.load = load <8 x i16>, ptr %3, align 2
 111   %4 = getelementptr inbounds i16, ptr %3, i64 8
 112   %wide.load36 = load <8 x i16>, ptr %4, align 2
       ; Widen to i32 and multiply each lane by the broadcast sext(y[i]).
 113   %5 = sext <8 x i16> %wide.load to <8 x i32>
 114   %6 = sext <8 x i16> %wide.load36 to <8 x i32>
 115   %7 = mul nsw <8 x i32> %broadcast.splat, %5
 116   %8 = mul nsw <8 x i32> %broadcast.splat38, %6
       ; Accumulate the products into out[i * limit + index ..], both halves.
 117   %9 = add nuw nsw i64 %index, %2
 118   %10 = getelementptr inbounds i32, ptr %out, i64 %9
 119   %wide.load39 = load <8 x i32>, ptr %10, align 4
 120   %11 = getelementptr inbounds i32, ptr %10, i64 8
 121   %wide.load40 = load <8 x i32>, ptr %11, align 4
 122   %12 = add nsw <8 x i32> %7, %wide.load39
 123   %13 = add nsw <8 x i32> %8, %wide.load40
 124   store <8 x i32> %12, ptr %10, align 4
 125   store <8 x i32> %13, ptr %11, align 4
 126   %index.next = add nuw i64 %index, 16
 127   %14 = icmp eq i64 %index.next, %n.vec
 128   br i1 %14, label %middle.block, label %vector.body
 129
     ; After the vector loop: go straight to the outer-loop latch when nothing is
     ; left over, otherwise fall into the scalar loop for the remaining elements.
 130 middle.block:                                     ; preds = %vector.body
 131   br i1 %cmp.n, label %for.cond1.for.cond.cleanup3_crit_edge.us, label %for.body4.us.preheader
 132
     ; Scalar loop preheader: j starts at 0 when the vector loop was skipped, or
     ; at %n.vec when it ran.
 133 for.body4.us.preheader:                           ; preds = %for.cond1.preheader.us, %middle.block
 134   %indvars.iv.ph = phi i64 [ 0, %for.cond1.preheader.us ], [ %n.vec, %middle.block ]
 135   br label %for.body4.us
 136
     ; Inner scalar loop over j = %indvars.iv:
     ;   out[i * limit + j] += sext(y[j]) * sext(y[i])
 137 for.body4.us:                                     ; preds = %for.body4.us.preheader, %for.body4.us
 138   %indvars.iv = phi i64 [ %indvars.iv.next, %for.body4.us ], [ %indvars.iv.ph, %for.body4.us.preheader ]
 139   %arrayidx6.us = getelementptr inbounds i16, ptr %y, i64 %indvars.iv
 140   %15 = load i16, ptr %arrayidx6.us, align 2
 141   %conv7.us = sext i16 %15 to i32
 142   %mul.us = mul nsw i32 %conv7.us, %conv.us
 143   %16 = add nuw nsw i64 %indvars.iv, %2
 144   %arrayidx10.us = getelementptr inbounds i32, ptr %out, i64 %16
 145   %17 = load i32, ptr %arrayidx10.us, align 4
 146   %add11.us = add nsw i32 %mul.us, %17
 147   store i32 %add11.us, ptr %arrayidx10.us, align 4
 148   %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
 149   %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count34
 150   br i1 %exitcond.not, label %for.cond1.for.cond.cleanup3_crit_edge.us, label %for.body4.us
 151
     ; Outer-loop latch: advance i and exit once i reaches the trip count.
 152 for.cond1.for.cond.cleanup3_crit_edge.us:         ; preds = %for.body4.us, %middle.block
 153   %indvars.iv.next31 = add nuw nsw i64 %indvars.iv30, 1
 154   %exitcond35.not = icmp eq i64 %indvars.iv.next31, %wide.trip.count34
 155   br i1 %exitcond35.not, label %for.cond.cleanup, label %for.cond1.preheader.us
 156
     ; Common exit, reached from the entry guard or after the last outer iteration.
 157 for.cond.cleanup:                                 ; preds = %for.cond1.for.cond.cleanup3_crit_edge.us, %entry
 158   ret void
 159 }