llvm/test/CodeGen/AArch64/sve-lsrchain.ll

   1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
   2 ; RUN: llc -mtriple aarch64 -mattr=+sve2 -o - %s | FileCheck %s
   3
   4 define void @test(ptr nocapture noundef readonly %kernel, i32 noundef %kw, float noundef nofpclass(nan inf) %kernel_factor, ptr %call5.i.i.i119) vscale_range(1, 16) {
   5 ; CHECK-LABEL: test:
   6 ; CHECK:       // %bb.0: // %entry
   7 ; CHECK-NEXT:    cmp w1, #1
   8 ; CHECK-NEXT:    b.lt .LBB0_6
   9 ; CHECK-NEXT:  // %bb.1: // %for.body.lr.ph
  10 ; CHECK-NEXT:    rdvl x8, #-2
  11 ; CHECK-NEXT:    mov w9, #608 // =0x260
  12 ; CHECK-NEXT:    ands x11, x8, x9
  13 ; CHECK-NEXT:    b.eq .LBB0_6
  14 ; CHECK-NEXT:  // %bb.2: // %for.body.us.preheader
  15 ; CHECK-NEXT:    ptrue p0.h
  16 ; CHECK-NEXT:    add x11, x2, x11, lsl #1
  17 ; CHECK-NEXT:    mov w8, wzr
  18 ; CHECK-NEXT:    ptrue p1.b
  19 ; CHECK-NEXT:    mov x9, xzr
  20 ; CHECK-NEXT:    mov w10, wzr
  21 ; CHECK-NEXT:    mov x12, #4 // =0x4
  22 ; CHECK-NEXT:    mov x13, #8 // =0x8
  23 ; CHECK-NEXT:  .LBB0_3: // %for.body.us
  24 ; CHECK-NEXT:    // =>This Loop Header: Depth=1
  25 ; CHECK-NEXT:    // Child Loop BB0_4 Depth 2
  26 ; CHECK-NEXT:    add x14, x0, x9, lsl #2
  27 ; CHECK-NEXT:    sbfiz x15, x8, #1, #32
  28 ; CHECK-NEXT:    mov x16, x2
  29 ; CHECK-NEXT:    ldp s0, s1, [x14]
  30 ; CHECK-NEXT:    add x15, x15, #8
  31 ; CHECK-NEXT:    ldp s2, s3, [x14, #8]
  32 ; CHECK-NEXT:    ubfiz x14, x8, #1, #32
  33 ; CHECK-NEXT:    fcvt h0, s0
  34 ; CHECK-NEXT:    fcvt h1, s1
  35 ; CHECK-NEXT:    fcvt h2, s2
  36 ; CHECK-NEXT:    fcvt h3, s3
  37 ; CHECK-NEXT:    mov z0.h, h0
  38 ; CHECK-NEXT:    mov z1.h, h1
  39 ; CHECK-NEXT:    mov z2.h, h2
  40 ; CHECK-NEXT:    mov z3.h, h3
  41 ; CHECK-NEXT:  .LBB0_4: // %for.cond.i.preheader.us
  42 ; CHECK-NEXT:    // Parent Loop BB0_3 Depth=1
  43 ; CHECK-NEXT:    // => This Inner Loop Header: Depth=2
  44 ; CHECK-NEXT:    ld1b { z4.b }, p1/z, [x16, x14]
  45 ; CHECK-NEXT:    ld1h { z5.h }, p0/z, [x16]
  46 ; CHECK-NEXT:    add x17, x16, x15
  47 ; CHECK-NEXT:    add x18, x16, x14
  48 ; CHECK-NEXT:    add x3, x17, #8
  49 ; CHECK-NEXT:    add x4, x17, #16
  50 ; CHECK-NEXT:    fmad z4.h, p0/m, z0.h, z5.h
  51 ; CHECK-NEXT:    ld1b { z5.b }, p1/z, [x16, x15]
  52 ; CHECK-NEXT:    fmla z4.h, p0/m, z5.h, z1.h
  53 ; CHECK-NEXT:    ld1h { z5.h }, p0/z, [x17, x12, lsl #1]
  54 ; CHECK-NEXT:    fmla z4.h, p0/m, z5.h, z2.h
  55 ; CHECK-NEXT:    ld1h { z5.h }, p0/z, [x17, x13, lsl #1]
  56 ; CHECK-NEXT:    fmla z4.h, p0/m, z5.h, z3.h
  57 ; CHECK-NEXT:    ld1h { z5.h }, p0/z, [x16, #1, mul vl]
  58 ; CHECK-NEXT:    st1h { z4.h }, p0, [x16]
  59 ; CHECK-NEXT:    ld1h { z4.h }, p0/z, [x18, #1, mul vl]
  60 ; CHECK-NEXT:    fmad z4.h, p0/m, z0.h, z5.h
  61 ; CHECK-NEXT:    ld1h { z5.h }, p0/z, [x17, #1, mul vl]
  62 ; CHECK-NEXT:    fmla z4.h, p0/m, z5.h, z1.h
  63 ; CHECK-NEXT:    ld1h { z5.h }, p0/z, [x3, #1, mul vl]
  64 ; CHECK-NEXT:    fmla z4.h, p0/m, z5.h, z2.h
  65 ; CHECK-NEXT:    ld1h { z5.h }, p0/z, [x4, #1, mul vl]
  66 ; CHECK-NEXT:    fmla z4.h, p0/m, z5.h, z3.h
  67 ; CHECK-NEXT:    ld1h { z5.h }, p0/z, [x16, #2, mul vl]
  68 ; CHECK-NEXT:    st1h { z4.h }, p0, [x16, #1, mul vl]
  69 ; CHECK-NEXT:    ld1h { z4.h }, p0/z, [x18, #2, mul vl]
  70 ; CHECK-NEXT:    fmad z4.h, p0/m, z0.h, z5.h
  71 ; CHECK-NEXT:    ld1h { z5.h }, p0/z, [x17, #2, mul vl]
  72 ; CHECK-NEXT:    fmla z4.h, p0/m, z5.h, z1.h
  73 ; CHECK-NEXT:    ld1h { z5.h }, p0/z, [x3, #2, mul vl]
  74 ; CHECK-NEXT:    fmla z4.h, p0/m, z5.h, z2.h
  75 ; CHECK-NEXT:    ld1h { z5.h }, p0/z, [x4, #2, mul vl]
  76 ; CHECK-NEXT:    fmla z4.h, p0/m, z5.h, z3.h
  77 ; CHECK-NEXT:    ld1h { z5.h }, p0/z, [x16, #3, mul vl]
  78 ; CHECK-NEXT:    st1h { z4.h }, p0, [x16, #2, mul vl]
  79 ; CHECK-NEXT:    ld1h { z4.h }, p0/z, [x18, #3, mul vl]
  80 ; CHECK-NEXT:    fmad z4.h, p0/m, z0.h, z5.h
  81 ; CHECK-NEXT:    ld1h { z5.h }, p0/z, [x17, #3, mul vl]
  82 ; CHECK-NEXT:    fmla z4.h, p0/m, z5.h, z1.h
  83 ; CHECK-NEXT:    ld1h { z5.h }, p0/z, [x3, #3, mul vl]
  84 ; CHECK-NEXT:    fmla z4.h, p0/m, z5.h, z2.h
  85 ; CHECK-NEXT:    ld1h { z5.h }, p0/z, [x4, #3, mul vl]
  86 ; CHECK-NEXT:    fmla z4.h, p0/m, z5.h, z3.h
  87 ; CHECK-NEXT:    st1h { z4.h }, p0, [x16, #3, mul vl]
  88 ; CHECK-NEXT:    addvl x16, x16, #4
  89 ; CHECK-NEXT:    cmp x16, x11
  90 ; CHECK-NEXT:    b.lo .LBB0_4
  91 ; CHECK-NEXT:  // %bb.5: // %while.cond.i..exit_crit_edge.us
  92 ; CHECK-NEXT:    // in Loop: Header=BB0_3 Depth=1
  93 ; CHECK-NEXT:    add w10, w10, #1
  94 ; CHECK-NEXT:    add x9, x9, #4
  95 ; CHECK-NEXT:    add w8, w8, #16
  96 ; CHECK-NEXT:    cmp w10, w1
  97 ; CHECK-NEXT:    b.ne .LBB0_3
  98 ; CHECK-NEXT:  .LBB0_6: // %exit78
  99 ; CHECK-NEXT:    ret
     ; IR under test: a two-level loop nest over SVE <vscale x 8 x half> vectors.
     ; The outer loop (%for.body.us) runs until its counter equals %kw. On each
     ; iteration it loads four consecutive floats from %kernel, truncates each to
     ; half and splats it across a scalable vector. The inner loop
     ; (%for.cond.i.preheader.us) walks the buffer %call5.i.i.i119 four vectors
     ; at a time: for each vector it loads the destination, chains four fmla.u
     ; intrinsic calls (one per source pointer and splat) and stores the result
     ; back to the destination.
     ; %kernel_factor is not referenced anywhere in the body.
     ; NOTE(review): the file name suggests the expected output above is meant to
     ; pin LoopStrengthReduce chain formation for the inner-loop addresses; confirm.
 100 entry:
       ; NOTE(review): the commented-out calls in this block look like the original
       ; allocation and zero-fill of the buffer that is now passed in as the
       ; %call5.i.i.i119 parameter; confirm before removing them.
 101   ;%call5.i.i.i119 = tail call noalias noundef nonnull dereferenceable(1248) ptr @_Znwm(i64 noundef 1248) #7
       ; Skip all work unless %kw is greater than zero (signed compare).
 102   %cmp139 = icmp sgt i32 %kw, 0
 103   ;tail call void @llvm.memset.p0.i64(ptr noundef nonnull align 2 dereferenceable(1248) %call5.i.i.i119, i8 0, i64 1248, i1 false)
 104   br i1 %cmp139, label %for.body.lr.ph, label %exit78
 105
 106 for.body.lr.ph:                                   ; preds = %entry
       ; Predicate built with pattern operand 31; it is the governing predicate of
       ; every fmla.u call below.
 107   %0 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
 108   %vscale = tail call i64 @llvm.vscale.i64()
       ; %mul5.i = 32 * vscale: the number of half elements by which the inner-loop
       ; pointers advance per iteration (four <vscale x 8 x half> vectors).
 109   %mul5.i = shl nuw nsw i64 %vscale, 5
       ; %sub6.i = (-32 * vscale) & 608 is used as an offset in half elements;
       ; %add.ptr.i = buffer + %sub6.i is the bound the inner loop compares against.
 110   %sub.not.i = sub nsw i64 0, %mul5.i
 111   %sub6.i = and i64 %sub.not.i, 608
 112   %add.ptr.i = getelementptr inbounds half, ptr %call5.i.i.i119, i64 %sub6.i
       ; When that offset is zero the loops are bypassed entirely.
 113   %cmp.i133.not = icmp eq i64 %sub6.i, 0
       ; %vs2 = 16 * vscale: the size in bytes of one <vscale x 8 x half> vector.
 114   %vs2 = shl nuw nsw i64 %vscale, 4
 115   br i1 %cmp.i133.not, label %exit78, label %for.body.us.preheader
 116
 117 for.body.us.preheader:                            ; preds = %for.body.lr.ph
       ; Byte offsets of the third and fourth vector within each group of four:
       ; 32 * vscale and 48 * vscale.
 118   %.idx.i.us.2 = shl nuw nsw i64 %vscale, 5
 119   %.idx.i.us.3 = mul nuw nsw i64 %vscale, 48
 120   br label %for.body.us
 121
 122 for.body.us:                                      ; preds = %for.body.us.preheader, %while.cond.i..exit_crit_edge.us
       ; Outer loop. %indvars.iv advances by 4 per iteration and %i4.0140.us by 1
       ; (see the latch block %while.cond.i..exit_crit_edge.us).
 123   %indvars.iv = phi i64 [ 0, %for.body.us.preheader ], [ %indvars.iv.next, %while.cond.i..exit_crit_edge.us ]
 124   %i4.0140.us = phi i32 [ 0, %for.body.us.preheader ], [ %inc.us, %while.cond.i..exit_crit_edge.us ]
       ; Four start pointers into the buffer at half-element offsets m, m|4, m|8
       ; and m|12, where m = trunc(%indvars.iv) << 2. The first offset is
       ; zero-extended to i64; the other three are sign-extended.
 125   %3 = trunc nuw nsw i64 %indvars.iv to i32
 126   %mul6.us = shl i32 %3, 2
 127   %idx.ext.us = zext nneg i32 %mul6.us to i64
 128   %add.ptr.us = getelementptr inbounds half, ptr %call5.i.i.i119, i64 %idx.ext.us
 129   %mul11.us = or disjoint i32 %mul6.us, 4
 130   %idx.ext12.us = sext i32 %mul11.us to i64
 131   %add.ptr13.us = getelementptr inbounds half, ptr %call5.i.i.i119, i64 %idx.ext12.us
 132   %mul18.us = or disjoint i32 %mul6.us, 8
 133   %idx.ext19.us = sext i32 %mul18.us to i64
 134   %add.ptr20.us = getelementptr inbounds half, ptr %call5.i.i.i119, i64 %idx.ext19.us
 135   %mul25.us = or disjoint i32 %mul6.us, 12
 136   %idx.ext26.us = sext i32 %mul25.us to i64
 137   %add.ptr27.us = getelementptr inbounds half, ptr %call5.i.i.i119, i64 %idx.ext26.us
       ; Load four consecutive floats starting at %kernel[%indvars.iv] (byte
       ; offsets 0, 4, 8, 12), truncate each to half and splat it across a
       ; scalable vector. The splats are %6, %9, %12 and %15.
 138   %add.ptr29.us = getelementptr inbounds float, ptr %kernel, i64 %indvars.iv
 139   %4 = load float, ptr %add.ptr29.us, align 4
 140   %5 = fptrunc float %4 to half
 141   %.splatinsert.i.us = insertelement <vscale x 8 x half> poison, half %5, i64 0
 142   %6 = shufflevector <vscale x 8 x half> %.splatinsert.i.us, <vscale x 8 x half> poison, <vscale x 8 x i32> zeroinitializer
 143   %arrayidx2.i.us = getelementptr inbounds i8, ptr %add.ptr29.us, i64 4
 144   %7 = load float, ptr %arrayidx2.i.us, align 4
 145   %8 = fptrunc float %7 to half
 146   %.splatinsert57.i.us = insertelement <vscale x 8 x half> poison, half %8, i64 0
 147   %9 = shufflevector <vscale x 8 x half> %.splatinsert57.i.us, <vscale x 8 x half> poison, <vscale x 8 x i32> zeroinitializer
 148   %arrayidx3.i.us = getelementptr inbounds i8, ptr %add.ptr29.us, i64 8
 149   %10 = load float, ptr %arrayidx3.i.us, align 4
 150   %11 = fptrunc float %10 to half
 151   %.splatinsert58.i.us = insertelement <vscale x 8 x half> poison, half %11, i64 0
 152   %12 = shufflevector <vscale x 8 x half> %.splatinsert58.i.us, <vscale x 8 x half> poison, <vscale x 8 x i32> zeroinitializer
 153   %arrayidx4.i.us = getelementptr inbounds i8, ptr %add.ptr29.us, i64 12
 154   %13 = load float, ptr %arrayidx4.i.us, align 4
 155   %14 = fptrunc float %13 to half
 156   %.splatinsert59.i.us = insertelement <vscale x 8 x half> poison, half %14, i64 0
 157   %15 = shufflevector <vscale x 8 x half> %.splatinsert59.i.us, <vscale x 8 x half> poison, <vscale x 8 x i32> zeroinitializer
 158   br label %for.cond.i.preheader.us
 159
 160 for.cond.i.preheader.us:                          ; preds = %for.body.us, %for.cond.i.preheader.us
       ; Inner loop. The destination pointer restarts at the buffer base on every
       ; outer iteration; the four source pointers restart at the start pointers
       ; computed in %for.body.us.
 161   %vdst.0.i138.us = phi ptr [ %call5.i.i.i119, %for.body.us ], [ %add.ptr15.i.us, %for.cond.i.preheader.us ]
 162   %s1.0.i137.us = phi ptr [ %add.ptr.us, %for.body.us ], [ %add.ptr16.i.us, %for.cond.i.preheader.us ]
 163   %s2.0.i136.us = phi ptr [ %add.ptr13.us, %for.body.us ], [ %add.ptr17.i.us, %for.cond.i.preheader.us ]
 164   %s3.0.i135.us = phi ptr [ %add.ptr20.us, %for.body.us ], [ %add.ptr18.i.us, %for.cond.i.preheader.us ]
 165   %s4.0.i134.us = phi ptr [ %add.ptr27.us, %for.body.us ], [ %add.ptr19.i.us, %for.cond.i.preheader.us ]
       ; Vector 0 (byte offset 0): start from the loaded destination vector and
       ; chain four fmla.u calls, each taking the previous result, a vector loaded
       ; from the next source pointer and the matching splat; then store the final
       ; value back to the destination.
 166   %16 = load <vscale x 8 x half>, ptr %s1.0.i137.us, align 16
 167   %17 = load <vscale x 8 x half>, ptr %vdst.0.i138.us, align 16
 168   %18 = tail call fast <vscale x 8 x half> @llvm.aarch64.sve.fmla.u.nxv8f16(<vscale x 8 x i1> %0, <vscale x 8 x half> %17, <vscale x 8 x half> %16, <vscale x 8 x half> %6)
 169   %19 = load <vscale x 8 x half>, ptr %s2.0.i136.us, align 16
 170   %20 = tail call fast <vscale x 8 x half> @llvm.aarch64.sve.fmla.u.nxv8f16(<vscale x 8 x i1> %0, <vscale x 8 x half> %18, <vscale x 8 x half> %19, <vscale x 8 x half> %9)
 171   %21 = load <vscale x 8 x half>, ptr %s3.0.i135.us, align 16
 172   %22 = tail call fast <vscale x 8 x half> @llvm.aarch64.sve.fmla.u.nxv8f16(<vscale x 8 x i1> %0, <vscale x 8 x half> %20, <vscale x 8 x half> %21, <vscale x 8 x half> %12)
 173   %23 = load <vscale x 8 x half>, ptr %s4.0.i134.us, align 16
 174   %24 = tail call fast <vscale x 8 x half> @llvm.aarch64.sve.fmla.u.nxv8f16(<vscale x 8 x i1> %0, <vscale x 8 x half> %22, <vscale x 8 x half> %23, <vscale x 8 x half> %15)
 175   store <vscale x 8 x half> %24, ptr %vdst.0.i138.us, align 16
       ; Vector 1: the same computation with every pointer displaced by %vs2 bytes
       ; (one vector further on).
 176   %25 = getelementptr i8, ptr %s1.0.i137.us, i64 %vs2
 177   %26 = load <vscale x 8 x half>, ptr %25, align 16
 178   %27 = getelementptr i8, ptr %vdst.0.i138.us, i64 %vs2
 179   %28 = load <vscale x 8 x half>, ptr %27, align 16
 180   %29 = tail call fast <vscale x 8 x half> @llvm.aarch64.sve.fmla.u.nxv8f16(<vscale x 8 x i1> %0, <vscale x 8 x half> %28, <vscale x 8 x half> %26, <vscale x 8 x half> %6)
 181   %30 = getelementptr i8, ptr %s2.0.i136.us, i64 %vs2
 182   %31 = load <vscale x 8 x half>, ptr %30, align 16
 183   %32 = tail call fast <vscale x 8 x half> @llvm.aarch64.sve.fmla.u.nxv8f16(<vscale x 8 x i1> %0, <vscale x 8 x half> %29, <vscale x 8 x half> %31, <vscale x 8 x half> %9)
 184   %33 = getelementptr i8, ptr %s3.0.i135.us, i64 %vs2
 185   %34 = load <vscale x 8 x half>, ptr %33, align 16
 186   %35 = tail call fast <vscale x 8 x half> @llvm.aarch64.sve.fmla.u.nxv8f16(<vscale x 8 x i1> %0, <vscale x 8 x half> %32, <vscale x 8 x half> %34, <vscale x 8 x half> %12)
 187   %36 = getelementptr i8, ptr %s4.0.i134.us, i64 %vs2
 188   %37 = load <vscale x 8 x half>, ptr %36, align 16
 189   %38 = tail call fast <vscale x 8 x half> @llvm.aarch64.sve.fmla.u.nxv8f16(<vscale x 8 x i1> %0, <vscale x 8 x half> %35, <vscale x 8 x half> %37, <vscale x 8 x half> %15)
 190   store <vscale x 8 x half> %38, ptr %27, align 16
       ; Vector 2: the same computation displaced by %.idx.i.us.2 bytes (two
       ; vectors further on).
 191   %39 = getelementptr i8, ptr %s1.0.i137.us, i64 %.idx.i.us.2
 192   %40 = load <vscale x 8 x half>, ptr %39, align 16
 193   %41 = getelementptr i8, ptr %vdst.0.i138.us, i64 %.idx.i.us.2
 194   %42 = load <vscale x 8 x half>, ptr %41, align 16
 195   %43 = tail call fast <vscale x 8 x half> @llvm.aarch64.sve.fmla.u.nxv8f16(<vscale x 8 x i1> %0, <vscale x 8 x half> %42, <vscale x 8 x half> %40, <vscale x 8 x half> %6)
 196   %44 = getelementptr i8, ptr %s2.0.i136.us, i64 %.idx.i.us.2
 197   %45 = load <vscale x 8 x half>, ptr %44, align 16
 198   %46 = tail call fast <vscale x 8 x half> @llvm.aarch64.sve.fmla.u.nxv8f16(<vscale x 8 x i1> %0, <vscale x 8 x half> %43, <vscale x 8 x half> %45, <vscale x 8 x half> %9)
 199   %47 = getelementptr i8, ptr %s3.0.i135.us, i64 %.idx.i.us.2
 200   %48 = load <vscale x 8 x half>, ptr %47, align 16
 201   %49 = tail call fast <vscale x 8 x half> @llvm.aarch64.sve.fmla.u.nxv8f16(<vscale x 8 x i1> %0, <vscale x 8 x half> %46, <vscale x 8 x half> %48, <vscale x 8 x half> %12)
 202   %50 = getelementptr i8, ptr %s4.0.i134.us, i64 %.idx.i.us.2
 203   %51 = load <vscale x 8 x half>, ptr %50, align 16
 204   %52 = tail call fast <vscale x 8 x half> @llvm.aarch64.sve.fmla.u.nxv8f16(<vscale x 8 x i1> %0, <vscale x 8 x half> %49, <vscale x 8 x half> %51, <vscale x 8 x half> %15)
 205   store <vscale x 8 x half> %52, ptr %41, align 16
       ; Vector 3: the same computation displaced by %.idx.i.us.3 bytes (three
       ; vectors further on).
 206   %53 = getelementptr i8, ptr %s1.0.i137.us, i64 %.idx.i.us.3
 207   %54 = load <vscale x 8 x half>, ptr %53, align 16
 208   %55 = getelementptr i8, ptr %vdst.0.i138.us, i64 %.idx.i.us.3
 209   %56 = load <vscale x 8 x half>, ptr %55, align 16
 210   %57 = tail call fast <vscale x 8 x half> @llvm.aarch64.sve.fmla.u.nxv8f16(<vscale x 8 x i1> %0, <vscale x 8 x half> %56, <vscale x 8 x half> %54, <vscale x 8 x half> %6)
 211   %58 = getelementptr i8, ptr %s2.0.i136.us, i64 %.idx.i.us.3
 212   %59 = load <vscale x 8 x half>, ptr %58, align 16
 213   %60 = tail call fast <vscale x 8 x half> @llvm.aarch64.sve.fmla.u.nxv8f16(<vscale x 8 x i1> %0, <vscale x 8 x half> %57, <vscale x 8 x half> %59, <vscale x 8 x half> %9)
 214   %61 = getelementptr i8, ptr %s3.0.i135.us, i64 %.idx.i.us.3
 215   %62 = load <vscale x 8 x half>, ptr %61, align 16
 216   %63 = tail call fast <vscale x 8 x half> @llvm.aarch64.sve.fmla.u.nxv8f16(<vscale x 8 x i1> %0, <vscale x 8 x half> %60, <vscale x 8 x half> %62, <vscale x 8 x half> %12)
 217   %64 = getelementptr i8, ptr %s4.0.i134.us, i64 %.idx.i.us.3
 218   %65 = load <vscale x 8 x half>, ptr %64, align 16
 219   %66 = tail call fast <vscale x 8 x half> @llvm.aarch64.sve.fmla.u.nxv8f16(<vscale x 8 x i1> %0, <vscale x 8 x half> %63, <vscale x 8 x half> %65, <vscale x 8 x half> %15)
 220   store <vscale x 8 x half> %66, ptr %55, align 16
       ; Advance all five pointers by %mul5.i half elements (four vectors) and
       ; repeat while the new destination pointer is still below %add.ptr.i
       ; (unsigned pointer compare).
 221   %add.ptr15.i.us = getelementptr inbounds half, ptr %vdst.0.i138.us, i64 %mul5.i
 222   %add.ptr16.i.us = getelementptr inbounds half, ptr %s1.0.i137.us, i64 %mul5.i
 223   %add.ptr17.i.us = getelementptr inbounds half, ptr %s2.0.i136.us, i64 %mul5.i
 224   %add.ptr18.i.us = getelementptr inbounds half, ptr %s3.0.i135.us, i64 %mul5.i
 225   %add.ptr19.i.us = getelementptr inbounds half, ptr %s4.0.i134.us, i64 %mul5.i
 226   %cmp.i.us = icmp ult ptr %add.ptr15.i.us, %add.ptr.i
 227   br i1 %cmp.i.us, label %for.cond.i.preheader.us, label %while.cond.i..exit_crit_edge.us
 228
 229 while.cond.i..exit_crit_edge.us: ; preds = %for.cond.i.preheader.us
       ; Outer-loop latch: step %indvars.iv by 4 and the i32 counter by 1, then
       ; leave the loop nest once the counter equals %kw.
 230   %indvars.iv.next = add nuw nsw i64 %indvars.iv, 4
 231   %inc.us = add nuw nsw i32 %i4.0140.us, 1
 232   %exitcond.not = icmp eq i32 %inc.us, %kw
 233   br i1 %exitcond.not, label %exit78, label %for.body.us
 234
 235 exit78:                      ; preds = %while.cond.i..exit_crit_edge.us, %for.body.lr.ph, %entry
 236   ret void
 237 }