llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll

   1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
   2 ; RUN: llc -mtriple=aarch64-none-linux-gnu < %s -o -| FileCheck %s
   3
   4 define void @matrix_mul_unsigned(i32 %N, i32* nocapture %C, i16* nocapture readonly %A, i16 %val) {
     ; Codegen test: a zero-extended i16 scalar is splatted and multiplied with two
     ; zero-extended <4 x i16> loads. The CHECK lines below expect each zext + mul
     ; pair to be emitted as a widening `umull v.4s, v.4h, v.4h`, with the splat
     ; materialised once outside the loop as `dup v0.4h`.
     ; The CHECK lines are autogenerated (see the NOTE at the top of the file);
     ; regenerate them with utils/update_llc_test_checks.py rather than editing by hand.
   5 ; CHECK-LABEL: matrix_mul_unsigned:
   6 ; CHECK:       // %bb.0: // %vector.header
   7 ; CHECK-NEXT:    and w9, w3, #0xffff
   8 ; CHECK-NEXT:    // kill: def $w0 killed $w0 def $x0
   9 ; CHECK-NEXT:    and x8, x0, #0xfffffff8
  10 ; CHECK-NEXT:    dup v0.4h, w9
  11 ; CHECK-NEXT:  .LBB0_1: // %vector.body
  12 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
  13 ; CHECK-NEXT:    add x9, x2, w0, uxtw #1
  14 ; CHECK-NEXT:    ldp d1, d2, [x9]
  15 ; CHECK-NEXT:    add x9, x1, w0, uxtw #2
  16 ; CHECK-NEXT:    subs x8, x8, #8
  17 ; CHECK-NEXT:    add w0, w0, #8
  18 ; CHECK-NEXT:    umull v1.4s, v0.4h, v1.4h
  19 ; CHECK-NEXT:    umull v2.4s, v0.4h, v2.4h
  20 ; CHECK-NEXT:    stp q1, q2, [x9]
  21 ; CHECK-NEXT:    b.ne .LBB0_1
  22 ; CHECK-NEXT:  // %bb.2: // %for.end12
  23 ; CHECK-NEXT:    ret
  24 vector.header:
     ; Scalar multiplier, widened to i32 before it is splatted.
  25   %conv4 = zext i16 %val to i32
  26   %wide.trip.count = zext i32 %N to i64
     ; NOTE: %0, %min.iters.check, %1 and %2 are not referenced anywhere else in
     ; this function.
  27   %0 = add nsw i64 %wide.trip.count, -1
  28   %min.iters.check = icmp ult i32 %N, 8
  29   %1 = trunc i64 %0 to i32
  30   %2 = icmp ugt i64 %0, 4294967295
     ; Loop bound: zero-extended %N masked with 0xfffffff8 (low three bits cleared).
  31   %n.vec = and i64 %wide.trip.count, 4294967288
     ; Two identical splats of %conv4, one for each multiply in the loop body.
  32   %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %conv4, i32 0
  33   %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
  34   %broadcast.splatinsert31 = insertelement <4 x i32> undef, i32 %conv4, i32 0
  35   %broadcast.splat32 = shufflevector <4 x i32> %broadcast.splatinsert31, <4 x i32> undef, <4 x i32> zeroinitializer
     ; NOTE: %cmp.n is also unused; the branch below is unconditional.
  36   %cmp.n = icmp eq i64 %n.vec, %wide.trip.count
  37   br label %vector.body
  38
  39 vector.body:                                      ; preds = %vector.header, %vector.body
     ; %index starts at 0 and advances by 8; the loop exits once it equals %n.vec.
  40   %index = phi i64 [ %index.next, %vector.body ], [ 0, %vector.header ]
     ; Element offset shared by %A and %C: %N + %index, added in i32 and then
     ; zero-extended to i64.
  41   %3 = trunc i64 %index to i32
  42   %4 = add i32 %N, %3
  43   %5 = zext i32 %4 to i64
     ; Load eight consecutive i16 elements as two <4 x i16> halves.
  44   %6 = getelementptr inbounds i16, i16* %A, i64 %5
  45   %7 = bitcast i16* %6 to <4 x i16>*
  46   %wide.load = load <4 x i16>, <4 x i16>* %7, align 2
  47   %8 = getelementptr inbounds i16, i16* %6, i64 4
  48   %9 = bitcast i16* %8 to <4 x i16>*
  49   %wide.load30 = load <4 x i16>, <4 x i16>* %9, align 2
     ; zext of both halves followed by mul with the zext'ed splat: this is the
     ; pattern the CHECK lines expect to become umull.
  50   %10 = zext <4 x i16> %wide.load to <4 x i32>
  51   %11 = zext <4 x i16> %wide.load30 to <4 x i32>
  52   %12 = mul nuw nsw <4 x i32> %broadcast.splat, %10
  53   %13 = mul nuw nsw <4 x i32> %broadcast.splat32, %11
     ; Store both <4 x i32> products to %C at the same element offset.
  54   %14 = getelementptr inbounds i32, i32* %C, i64 %5
  55   %15 = bitcast i32* %14 to <4 x i32>*
  56   store <4 x i32> %12, <4 x i32>* %15, align 4
  57   %16 = getelementptr inbounds i32, i32* %14, i64 4
  58   %17 = bitcast i32* %16 to <4 x i32>*
  59   store <4 x i32> %13, <4 x i32>* %17, align 4
  60   %index.next = add i64 %index, 8
  61   %18 = icmp eq i64 %index.next, %n.vec
  62   br i1 %18, label %for.end12, label %vector.body
  63
  64 for.end12:                                        ; preds = %vector.body
  65   ret void
  66 }
  67
  68 define void @matrix_mul_signed(i32 %N, i32* nocapture %C, i16* nocapture readonly %A, i16 %val) {
     ; Codegen test: a sign-extended i16 scalar is splatted and multiplied with two
     ; sign-extended <4 x i16> loads. The CHECK lines below expect each sext + mul
     ; pair to be emitted as a widening `smull v.4s, v.4h, v.4h`, with the splat
     ; materialised once outside the loop as `dup v0.4h`.
     ; The CHECK lines are autogenerated (see the NOTE at the top of the file);
     ; regenerate them with utils/update_llc_test_checks.py rather than editing by hand.
  69 ; CHECK-LABEL: matrix_mul_signed:
  70 ; CHECK:       // %bb.0: // %vector.header
  71 ; CHECK-NEXT:    sxth w9, w3
  72 ; CHECK-NEXT:    // kill: def $w0 killed $w0 def $x0
  73 ; CHECK-NEXT:    and x8, x0, #0xfffffff8
  74 ; CHECK-NEXT:    dup v0.4h, w9
  75 ; CHECK-NEXT:  .LBB1_1: // %vector.body
  76 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
  77 ; CHECK-NEXT:    add x9, x2, w0, sxtw #1
  78 ; CHECK-NEXT:    ldp d1, d2, [x9]
  79 ; CHECK-NEXT:    add x9, x1, w0, sxtw #2
  80 ; CHECK-NEXT:    subs x8, x8, #8
  81 ; CHECK-NEXT:    add w0, w0, #8
  82 ; CHECK-NEXT:    smull v1.4s, v0.4h, v1.4h
  83 ; CHECK-NEXT:    smull v2.4s, v0.4h, v2.4h
  84 ; CHECK-NEXT:    stp q1, q2, [x9]
  85 ; CHECK-NEXT:    b.ne .LBB1_1
  86 ; CHECK-NEXT:  // %bb.2: // %for.end12
  87 ; CHECK-NEXT:    ret
  88 vector.header:
     ; Scalar multiplier, sign-extended to i32 before it is splatted.
  89   %conv4 = sext i16 %val to i32
  90   %wide.trip.count = sext i32 %N to i64
     ; NOTE: %0, %min.iters.check, %1 and %2 are not referenced anywhere else in
     ; this function.
  91   %0 = add nsw i64 %wide.trip.count, -1
  92   %min.iters.check = icmp ult i32 %N, 8
  93   %1 = trunc i64 %0 to i32
  94   %2 = icmp ugt i64 %0, 4294967295
     ; Loop bound: sign-extended %N masked with 0xfffffff8, which clears the low
     ; three bits and everything above bit 31.
  95   %n.vec = and i64 %wide.trip.count, 4294967288
     ; Two identical splats of %conv4, one for each multiply in the loop body.
  96   %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %conv4, i32 0
  97   %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
  98   %broadcast.splatinsert31 = insertelement <4 x i32> undef, i32 %conv4, i32 0
  99   %broadcast.splat32 = shufflevector <4 x i32> %broadcast.splatinsert31, <4 x i32> undef, <4 x i32> zeroinitializer
     ; NOTE: %cmp.n is also unused; the branch below is unconditional.
 100   %cmp.n = icmp eq i64 %n.vec, %wide.trip.count
 101   br label %vector.body
 102
 103 vector.body:                                      ; preds = %vector.header, %vector.body
     ; %index starts at 0 and advances by 8; the loop exits once it equals %n.vec.
 104   %index = phi i64 [ %index.next, %vector.body ], [ 0, %vector.header ]
     ; Element offset shared by %A and %C: %N + %index, added in i32 and then
     ; sign-extended to i64 (hence `sxtw` in the expected address arithmetic).
 105   %3 = trunc i64 %index to i32
 106   %4 = add i32 %N, %3
 107   %5 = sext i32 %4 to i64
     ; Load eight consecutive i16 elements as two <4 x i16> halves.
 108   %6 = getelementptr inbounds i16, i16* %A, i64 %5
 109   %7 = bitcast i16* %6 to <4 x i16>*
 110   %wide.load = load <4 x i16>, <4 x i16>* %7, align 2
 111   %8 = getelementptr inbounds i16, i16* %6, i64 4
 112   %9 = bitcast i16* %8 to <4 x i16>*
 113   %wide.load30 = load <4 x i16>, <4 x i16>* %9, align 2
     ; sext of both halves followed by mul with the sext'ed splat: this is the
     ; pattern the CHECK lines expect to become smull.
 114   %10 = sext <4 x i16> %wide.load to <4 x i32>
 115   %11 = sext <4 x i16> %wide.load30 to <4 x i32>
 116   %12 = mul nsw <4 x i32> %broadcast.splat, %10
 117   %13 = mul nsw <4 x i32> %broadcast.splat32, %11
     ; Store both <4 x i32> products to %C at the same element offset.
 118   %14 = getelementptr inbounds i32, i32* %C, i64 %5
 119   %15 = bitcast i32* %14 to <4 x i32>*
 120   store <4 x i32> %12, <4 x i32>* %15, align 4
 121   %16 = getelementptr inbounds i32, i32* %14, i64 4
 122   %17 = bitcast i32* %16 to <4 x i32>*
 123   store <4 x i32> %13, <4 x i32>* %17, align 4
 124   %index.next = add i64 %index, 8
 125   %18 = icmp eq i64 %index.next, %n.vec
 126   br i1 %18, label %for.end12, label %vector.body
 127
 128 for.end12:                                        ; preds = %vector.body
 129   ret void
 130 }
 131
 132
 133 define void @matrix_mul_double_shuffle(i32 %N, i32* nocapture %C, i16* nocapture readonly %A, i16 %val) {
     ; Codegen test: both multiplicands are built from zero-extended i16 scalars.
     ; One is a zero-mask splat of %val; the other is built inside the loop by an
     ; insertelement plus a shufflevector with mask <0, 1, 0, 1>. The CHECK lines
     ; below expect the in-loop operand to be emitted as `dup v1.4h` and the
     ; multiply as a widening `umull v1.4s, v0.4h, v1.4h`.
     ; The CHECK lines are autogenerated (see the NOTE at the top of the file);
     ; regenerate them with utils/update_llc_test_checks.py rather than editing by hand.
 134 ; CHECK-LABEL: matrix_mul_double_shuffle:
 135 ; CHECK:       // %bb.0: // %vector.header
 136 ; CHECK-NEXT:    and w9, w3, #0xffff
 137 ; CHECK-NEXT:    // kill: def $w0 killed $w0 def $x0
 138 ; CHECK-NEXT:    and x8, x0, #0xfffffff8
 139 ; CHECK-NEXT:    dup v0.4h, w9
 140 ; CHECK-NEXT:  .LBB2_1: // %vector.body
 141 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
 142 ; CHECK-NEXT:    ldrh w9, [x2], #16
 143 ; CHECK-NEXT:    mov w10, w0
 144 ; CHECK-NEXT:    subs x8, x8, #8
 145 ; CHECK-NEXT:    lsl x10, x10, #2
 146 ; CHECK-NEXT:    dup v1.4h, w9
 147 ; CHECK-NEXT:    umull v1.4s, v0.4h, v1.4h
 148 ; CHECK-NEXT:    add w0, w0, #8
 149 ; CHECK-NEXT:    str q1, [x1, x10]
 150 ; CHECK-NEXT:    b.ne .LBB2_1
 151 ; CHECK-NEXT:  // %bb.2: // %for.end12
 152 ; CHECK-NEXT:    ret
 153 vector.header:
     ; Loop-invariant multiplier, widened to i32 before it is splatted.
 154   %conv4 = zext i16 %val to i32
 155   %wide.trip.count = zext i32 %N to i64
     ; NOTE: %0, %min.iters.check, %1 and %2 are not referenced anywhere else in
     ; this function.
 156   %0 = add nsw i64 %wide.trip.count, -1
 157   %min.iters.check = icmp ult i32 %N, 8
 158   %1 = trunc i64 %0 to i32
 159   %2 = icmp ugt i64 %0, 4294967295
     ; Loop bound: zero-extended %N masked with 0xfffffff8 (low three bits cleared).
 160   %n.vec = and i64 %wide.trip.count, 4294967288
 161   %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %conv4, i32 0
 162   %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
     ; NOTE: %cmp.n is also unused; the branch below is unconditional.
 163   %cmp.n = icmp eq i64 %n.vec, %wide.trip.count
 164   br label %vector.body
 165
 166 vector.body:                                      ; preds = %vector.header, %vector.body
     ; %index starts at 0 and advances by 8; the loop exits once it equals %n.vec.
 167   %index = phi i64 [ %index.next, %vector.body ], [ 0, %vector.header ]
     ; Load a single i16 from %A at element %index and widen it to i32.
 168   %g = getelementptr inbounds i16, i16* %A, i64 %index
 169   %val1 = load i16, i16* %g
 170   %splat.input.ext = zext i16 %val1 to i32
     ; Lane 0 holds the widened value; lanes 1-3 of the insertelement result are undef.
 171   %broadcast.splatinsert31 = insertelement <4 x i32> undef, i32 %splat.input.ext, i32 0
     ; Mask <0, 1, 0, 1> selects only lanes 0 and 1 of the first operand, so the
     ; second operand (%broadcast.splat) contributes no lanes and the result is
     ; <x, undef, x, undef>.
 172   %broadcast.splat32 = shufflevector <4 x i32> %broadcast.splatinsert31, <4 x i32> %broadcast.splat, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
     ; Store offset into %C: %N + %index, added in i32 and then zero-extended to i64.
 173   %3 = trunc i64 %index to i32
 174   %4 = add i32 %N, %3
 175   %5 = zext i32 %4 to i64
 176   %6 = mul nuw nsw <4 x i32> %broadcast.splat, %broadcast.splat32
     ; Only one <4 x i32> store is issued per iteration even though %index steps by 8.
 177   %7 = getelementptr inbounds i32, i32* %C, i64 %5
 178   %8 = bitcast i32* %7 to <4 x i32>*
 179   store <4 x i32> %6, <4 x i32>* %8, align 4
 180   %index.next = add i64 %index, 8
 181   %9 = icmp eq i64 %index.next, %n.vec
 182   br i1 %9, label %for.end12, label %vector.body
 183
 184 for.end12:                                        ; preds = %vector.body
 185   ret void
 186 }