llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll

   1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
   2 ; RUN: llc -mtriple=aarch64-none-linux-gnu < %s -o -| FileCheck %s
   3
   4 define void @matrix_mul_unsigned(i32 %N, i32* nocapture %C, i16* nocapture readonly %A, i16 %val) {
     ; Unsigned case: every iteration loads two <4 x i16> vectors from %A,
     ; zero-extends them to <4 x i32>, multiplies each by a splat of zext(%val)
     ; and stores both products to %C. The expected output below builds the
     ; splat at 16-bit element width (dup .4h) and multiplies with umull.
   5 ; CHECK-LABEL: matrix_mul_unsigned:
   6 ; CHECK:       // %bb.0: // %vector.header
   7 ; CHECK-NEXT:    and w8, w3, #0xffff
   8 ; CHECK-NEXT:    // kill: def $w0 killed $w0 def $x0
   9 ; CHECK-NEXT:    dup v0.4h, w8
  10 ; CHECK-NEXT:    and x8, x0, #0xfffffff8
  11 ; CHECK-NEXT:  .LBB0_1: // %vector.body
  12 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
  13 ; CHECK-NEXT:    add x9, x2, w0, uxtw #1
  14 ; CHECK-NEXT:    subs x8, x8, #8
  15 ; CHECK-NEXT:    ldp d1, d2, [x9]
  16 ; CHECK-NEXT:    add x9, x1, w0, uxtw #2
  17 ; CHECK-NEXT:    add w0, w0, #8
  18 ; CHECK-NEXT:    umull v1.4s, v0.4h, v1.4h
  19 ; CHECK-NEXT:    umull v2.4s, v0.4h, v2.4h
  20 ; CHECK-NEXT:    stp q1, q2, [x9]
  21 ; CHECK-NEXT:    b.ne .LBB0_1
  22 ; CHECK-NEXT:  // %bb.2: // %for.end12
  23 ; CHECK-NEXT:    ret
  24 vector.header:
       ; Loop-invariant setup: the widened scalar, the trip count rounded down
       ; to a multiple of 8 (%n.vec), and two identical splats of %conv4.
       ; NOTE(review): %min.iters.check, %1, %2 and %cmp.n have no users in
       ; this function; they look like leftovers from the vectorizer output.
  25   %conv4 = zext i16 %val to i32
  26   %wide.trip.count = zext i32 %N to i64
  27   %0 = add nsw i64 %wide.trip.count, -1
  28   %min.iters.check = icmp ult i32 %N, 8
  29   %1 = trunc i64 %0 to i32
  30   %2 = icmp ugt i64 %0, 4294967295
  31   %n.vec = and i64 %wide.trip.count, 4294967288
  32   %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %conv4, i32 0
  33   %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
  34   %broadcast.splatinsert31 = insertelement <4 x i32> undef, i32 %conv4, i32 0
  35   %broadcast.splat32 = shufflevector <4 x i32> %broadcast.splatinsert31, <4 x i32> undef, <4 x i32> zeroinitializer
  36   %cmp.n = icmp eq i64 %n.vec, %wide.trip.count
  37   br label %vector.body
  38
  39 vector.body:                                      ; preds = %vector.header, %vector.body
  40   %index = phi i64 [ %index.next, %vector.body ], [ 0, %vector.header ]
       ; Element offset is %N + trunc(%index), zero-extended back to i64; it is
       ; shared by the load from %A and the store to %C.
  41   %3 = trunc i64 %index to i32
  42   %4 = add i32 %N, %3
  43   %5 = zext i32 %4 to i64
  44   %6 = getelementptr inbounds i16, i16* %A, i64 %5
  45   %7 = bitcast i16* %6 to <4 x i16>*
  46   %wide.load = load <4 x i16>, <4 x i16>* %7, align 2
       ; Second <4 x i16> load, 4 elements past the first.
  47   %8 = getelementptr inbounds i16, i16* %6, i64 4
  48   %9 = bitcast i16* %8 to <4 x i16>*
  49   %wide.load30 = load <4 x i16>, <4 x i16>* %9, align 2
       ; Widen both loads with zext and multiply by the splats.
  50   %10 = zext <4 x i16> %wide.load to <4 x i32>
  51   %11 = zext <4 x i16> %wide.load30 to <4 x i32>
  52   %12 = mul nuw nsw <4 x i32> %broadcast.splat, %10
  53   %13 = mul nuw nsw <4 x i32> %broadcast.splat32, %11
       ; Store the two <4 x i32> products to consecutive slots in %C.
  54   %14 = getelementptr inbounds i32, i32* %C, i64 %5
  55   %15 = bitcast i32* %14 to <4 x i32>*
  56   store <4 x i32> %12, <4 x i32>* %15, align 4
  57   %16 = getelementptr inbounds i32, i32* %14, i64 4
  58   %17 = bitcast i32* %16 to <4 x i32>*
  59   store <4 x i32> %13, <4 x i32>* %17, align 4
       ; Advance by 8 elements and exit once %index.next reaches %n.vec.
  60   %index.next = add i64 %index, 8
  61   %18 = icmp eq i64 %index.next, %n.vec
  62   br i1 %18, label %for.end12, label %vector.body
  63
  64 for.end12:                                        ; preds = %vector.body
  65   ret void
  66 }
  67
  68 define void @matrix_mul_signed(i32 %N, i32* nocapture %C, i16* nocapture readonly %A, i16 %val) {
     ; Signed counterpart of the unsigned test: the scalar, the trip count, the
     ; element offset and the loaded vectors are all sign-extended. The expected
     ; output below uses sxth + dup .4h for the splat and smull for the products.
  69 ; CHECK-LABEL: matrix_mul_signed:
  70 ; CHECK:       // %bb.0: // %vector.header
  71 ; CHECK-NEXT:    sxth w8, w3
  72 ; CHECK-NEXT:    // kill: def $w0 killed $w0 def $x0
  73 ; CHECK-NEXT:    dup v0.4h, w8
  74 ; CHECK-NEXT:    and x8, x0, #0xfffffff8
  75 ; CHECK-NEXT:  .LBB1_1: // %vector.body
  76 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
  77 ; CHECK-NEXT:    add x9, x2, w0, sxtw #1
  78 ; CHECK-NEXT:    subs x8, x8, #8
  79 ; CHECK-NEXT:    ldp d1, d2, [x9]
  80 ; CHECK-NEXT:    add x9, x1, w0, sxtw #2
  81 ; CHECK-NEXT:    add w0, w0, #8
  82 ; CHECK-NEXT:    smull v1.4s, v0.4h, v1.4h
  83 ; CHECK-NEXT:    smull v2.4s, v0.4h, v2.4h
  84 ; CHECK-NEXT:    stp q1, q2, [x9]
  85 ; CHECK-NEXT:    b.ne .LBB1_1
  86 ; CHECK-NEXT:  // %bb.2: // %for.end12
  87 ; CHECK-NEXT:    ret
  88 vector.header:
       ; Loop-invariant setup: the sign-extended scalar, the masked trip count
       ; (%n.vec) and two identical splats of %conv4.
       ; NOTE(review): %min.iters.check, %1, %2 and %cmp.n have no users in
       ; this function.
  89   %conv4 = sext i16 %val to i32
  90   %wide.trip.count = sext i32 %N to i64
  91   %0 = add nsw i64 %wide.trip.count, -1
  92   %min.iters.check = icmp ult i32 %N, 8
  93   %1 = trunc i64 %0 to i32
  94   %2 = icmp ugt i64 %0, 4294967295
  95   %n.vec = and i64 %wide.trip.count, 4294967288
  96   %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %conv4, i32 0
  97   %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
  98   %broadcast.splatinsert31 = insertelement <4 x i32> undef, i32 %conv4, i32 0
  99   %broadcast.splat32 = shufflevector <4 x i32> %broadcast.splatinsert31, <4 x i32> undef, <4 x i32> zeroinitializer
 100   %cmp.n = icmp eq i64 %n.vec, %wide.trip.count
 101   br label %vector.body
 102
 103 vector.body:                                      ; preds = %vector.header, %vector.body
 104   %index = phi i64 [ %index.next, %vector.body ], [ 0, %vector.header ]
       ; Element offset is %N + trunc(%index), sign-extended back to i64; it is
       ; shared by the load from %A and the store to %C.
 105   %3 = trunc i64 %index to i32
 106   %4 = add i32 %N, %3
 107   %5 = sext i32 %4 to i64
 108   %6 = getelementptr inbounds i16, i16* %A, i64 %5
 109   %7 = bitcast i16* %6 to <4 x i16>*
 110   %wide.load = load <4 x i16>, <4 x i16>* %7, align 2
       ; Second <4 x i16> load, 4 elements past the first.
 111   %8 = getelementptr inbounds i16, i16* %6, i64 4
 112   %9 = bitcast i16* %8 to <4 x i16>*
 113   %wide.load30 = load <4 x i16>, <4 x i16>* %9, align 2
       ; Widen both loads with sext and multiply by the splats.
 114   %10 = sext <4 x i16> %wide.load to <4 x i32>
 115   %11 = sext <4 x i16> %wide.load30 to <4 x i32>
 116   %12 = mul nsw <4 x i32> %broadcast.splat, %10
 117   %13 = mul nsw <4 x i32> %broadcast.splat32, %11
       ; Store the two <4 x i32> products to consecutive slots in %C.
 118   %14 = getelementptr inbounds i32, i32* %C, i64 %5
 119   %15 = bitcast i32* %14 to <4 x i32>*
 120   store <4 x i32> %12, <4 x i32>* %15, align 4
 121   %16 = getelementptr inbounds i32, i32* %14, i64 4
 122   %17 = bitcast i32* %16 to <4 x i32>*
 123   store <4 x i32> %13, <4 x i32>* %17, align 4
       ; Advance by 8 elements and exit once %index.next reaches %n.vec.
 124   %index.next = add i64 %index, 8
 125   %18 = icmp eq i64 %index.next, %n.vec
 126   br i1 %18, label %for.end12, label %vector.body
 127
 128 for.end12:                                        ; preds = %vector.body
 129   ret void
 130 }
 131
 132
 133 define void @matrix_mul_double_shuffle(i32 %N, i32* nocapture %C, i16* nocapture readonly %A, i16 %val) {
     ; Both multiply operands come from shuffles: one is a loop-invariant splat
     ; of zext(%val), the other is rebuilt every iteration from a scalar i16
     ; load. The expected output below materialises both with dup .4h and
     ; multiplies them with a single umull.
 134 ; CHECK-LABEL: matrix_mul_double_shuffle:
 135 ; CHECK:       // %bb.0: // %vector.header
 136 ; CHECK-NEXT:    and w8, w3, #0xffff
 137 ; CHECK-NEXT:    // kill: def $w0 killed $w0 def $x0
 138 ; CHECK-NEXT:    dup v0.4h, w8
 139 ; CHECK-NEXT:    and x8, x0, #0xfffffff8
 140 ; CHECK-NEXT:    // kill: def $w0 killed $w0 killed $x0 def $x0
 141 ; CHECK-NEXT:  .LBB2_1: // %vector.body
 142 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
 143 ; CHECK-NEXT:    ldrh w9, [x2], #16
 144 ; CHECK-NEXT:    subs x8, x8, #8
 145 ; CHECK-NEXT:    dup v1.4h, w9
 146 ; CHECK-NEXT:    ubfiz x9, x0, #2, #32
 147 ; CHECK-NEXT:    add w0, w0, #8
 148 ; CHECK-NEXT:    umull v1.4s, v0.4h, v1.4h
 149 ; CHECK-NEXT:    str q1, [x1, x9]
 150 ; CHECK-NEXT:    b.ne .LBB2_1
 151 ; CHECK-NEXT:  // %bb.2: // %for.end12
 152 ; CHECK-NEXT:    ret
 153 vector.header:
       ; Loop-invariant setup: the zero-extended scalar, the masked trip count
       ; (%n.vec) and the splat of %conv4.
       ; NOTE(review): %min.iters.check, %1, %2 and %cmp.n have no users in
       ; this function.
 154   %conv4 = zext i16 %val to i32
 155   %wide.trip.count = zext i32 %N to i64
 156   %0 = add nsw i64 %wide.trip.count, -1
 157   %min.iters.check = icmp ult i32 %N, 8
 158   %1 = trunc i64 %0 to i32
 159   %2 = icmp ugt i64 %0, 4294967295
 160   %n.vec = and i64 %wide.trip.count, 4294967288
 161   %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %conv4, i32 0
 162   %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
 163   %cmp.n = icmp eq i64 %n.vec, %wide.trip.count
 164   br label %vector.body
 165
 166 vector.body:                                      ; preds = %vector.header, %vector.body
 167   %index = phi i64 [ %index.next, %vector.body ], [ 0, %vector.header ]
       ; One scalar i16 is loaded from %A[%index] and zero-extended.
 168   %g = getelementptr inbounds i16, i16* %A, i64 %index
 169   %val1 = load i16, i16* %g
 170   %splat.input.ext = zext i16 %val1 to i32
 171   %broadcast.splatinsert31 = insertelement <4 x i32> undef, i32 %splat.input.ext, i32 0
       ; Mask indices 0 and 1 both select from the first operand, so
       ; %broadcast.splat contributes no lanes here: lanes 0 and 2 hold the
       ; loaded value, lanes 1 and 3 come from an undef element.
 172   %broadcast.splat32 = shufflevector <4 x i32> %broadcast.splatinsert31, <4 x i32> %broadcast.splat, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
       ; Store offset is %N + trunc(%index), zero-extended back to i64.
 173   %3 = trunc i64 %index to i32
 174   %4 = add i32 %N, %3
 175   %5 = zext i32 %4 to i64
 176   %6 = mul nuw nsw <4 x i32> %broadcast.splat, %broadcast.splat32
 177   %7 = getelementptr inbounds i32, i32* %C, i64 %5
 178   %8 = bitcast i32* %7 to <4 x i32>*
 179   store <4 x i32> %6, <4 x i32>* %8, align 4
       ; Index advances by 8 even though a single <4 x i32> is stored per
       ; iteration.
 180   %index.next = add i64 %index, 8
 181   %9 = icmp eq i64 %index.next, %n.vec
 182   br i1 %9, label %for.end12, label %vector.body
 183
 184 for.end12:                                        ; preds = %vector.body
 185   ret void
 186 }
 187
 188
 189 define void @larger_smull(i16* nocapture noundef readonly %x, i16 noundef %y, i32* noalias nocapture noundef writeonly %s, i32 noundef %n) {
     ; Signed s[i] = sext(x[i]) * sext(y) with a 16-element vector loop (two
     ; <8 x i16> loads widened to <8 x i32>) followed by a scalar remainder loop.
     ; The expected output below splits each <8 x i32> multiply into an
     ; smull/smull2 pair fed by a dup .8h of the scalar.
 190 ; CHECK-LABEL: larger_smull:
 191 ; CHECK:       // %bb.0: // %entry
 192 ; CHECK-NEXT:    cmp w3, #1
 193 ; CHECK-NEXT:    b.lt .LBB3_8
 194 ; CHECK-NEXT:  // %bb.1: // %for.body.preheader
 195 ; CHECK-NEXT:    sxth w8, w1
 196 ; CHECK-NEXT:    cmp w3, #15
 197 ; CHECK-NEXT:    mov w9, w3
 198 ; CHECK-NEXT:    b.hi .LBB3_3
 199 ; CHECK-NEXT:  // %bb.2:
 200 ; CHECK-NEXT:    mov x10, xzr
 201 ; CHECK-NEXT:    b .LBB3_6
 202 ; CHECK-NEXT:  .LBB3_3: // %vector.ph
 203 ; CHECK-NEXT:    dup v0.8h, w8
 204 ; CHECK-NEXT:    and x10, x9, #0xfffffff0
 205 ; CHECK-NEXT:    add x11, x2, #32
 206 ; CHECK-NEXT:    add x12, x0, #16
 207 ; CHECK-NEXT:    mov x13, x10
 208 ; CHECK-NEXT:  .LBB3_4: // %vector.body
 209 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
 210 ; CHECK-NEXT:    ldp q1, q2, [x12, #-16]
 211 ; CHECK-NEXT:    subs x13, x13, #16
 212 ; CHECK-NEXT:    add x12, x12, #32
 213 ; CHECK-NEXT:    smull2 v3.4s, v0.8h, v1.8h
 214 ; CHECK-NEXT:    smull v1.4s, v0.4h, v1.4h
 215 ; CHECK-NEXT:    smull2 v4.4s, v0.8h, v2.8h
 216 ; CHECK-NEXT:    smull v2.4s, v0.4h, v2.4h
 217 ; CHECK-NEXT:    stp q1, q3, [x11, #-32]
 218 ; CHECK-NEXT:    stp q2, q4, [x11], #64
 219 ; CHECK-NEXT:    b.ne .LBB3_4
 220 ; CHECK-NEXT:  // %bb.5: // %middle.block
 221 ; CHECK-NEXT:    cmp x10, x9
 222 ; CHECK-NEXT:    b.eq .LBB3_8
 223 ; CHECK-NEXT:  .LBB3_6: // %for.body.preheader1
 224 ; CHECK-NEXT:    add x11, x2, x10, lsl #2
 225 ; CHECK-NEXT:    add x12, x0, x10, lsl #1
 226 ; CHECK-NEXT:    sub x9, x9, x10
 227 ; CHECK-NEXT:  .LBB3_7: // %for.body
 228 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
 229 ; CHECK-NEXT:    ldrsh w10, [x12], #2
 230 ; CHECK-NEXT:    subs x9, x9, #1
 231 ; CHECK-NEXT:    mul w10, w10, w8
 232 ; CHECK-NEXT:    str w10, [x11], #4
 233 ; CHECK-NEXT:    b.ne .LBB3_7
 234 ; CHECK-NEXT:  .LBB3_8: // %for.cond.cleanup
 235 ; CHECK-NEXT:    ret
 236 entry:
       ; Sign-extend the scalar once; skip everything when %n <= 0.
 237   %conv1 = sext i16 %y to i32
 238   %cmp8 = icmp sgt i32 %n, 0
 239   br i1 %cmp8, label %for.body.preheader, label %for.cond.cleanup
 240
 241 for.body.preheader:                               ; preds = %entry
       ; Fewer than 16 elements go straight to the scalar loop.
 242   %wide.trip.count = zext i32 %n to i64
 243   %min.iters.check = icmp ult i32 %n, 16
 244   br i1 %min.iters.check, label %for.body.preheader14, label %vector.ph
 245
 246 vector.ph:                                        ; preds = %for.body.preheader
       ; %n.vec is the trip count rounded down to a multiple of 16; the two
       ; splats are identical <8 x i32> broadcasts of %conv1.
 247   %n.vec = and i64 %wide.trip.count, 4294967280
 248   %broadcast.splatinsert = insertelement <8 x i32> poison, i32 %conv1, i64 0
 249   %broadcast.splat = shufflevector <8 x i32> %broadcast.splatinsert, <8 x i32> poison, <8 x i32> zeroinitializer
 250   %broadcast.splatinsert12 = insertelement <8 x i32> poison, i32 %conv1, i64 0
 251   %broadcast.splat13 = shufflevector <8 x i32> %broadcast.splatinsert12, <8 x i32> poison, <8 x i32> zeroinitializer
 252   br label %vector.body
 253
 254 vector.body:                                      ; preds = %vector.body, %vector.ph
 255   %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
       ; Two adjacent <8 x i16> loads from %x.
 256   %0 = getelementptr inbounds i16, i16* %x, i64 %index
 257   %1 = bitcast i16* %0 to <8 x i16>*
 258   %wide.load = load <8 x i16>, <8 x i16>* %1, align 2
 259   %2 = getelementptr inbounds i16, i16* %0, i64 8
 260   %3 = bitcast i16* %2 to <8 x i16>*
 261   %wide.load11 = load <8 x i16>, <8 x i16>* %3, align 2
       ; Sign-extend to <8 x i32> and multiply by the splats.
 262   %4 = sext <8 x i16> %wide.load to <8 x i32>
 263   %5 = sext <8 x i16> %wide.load11 to <8 x i32>
 264   %6 = mul nsw <8 x i32> %broadcast.splat, %4
 265   %7 = mul nsw <8 x i32> %broadcast.splat13, %5
       ; Store both <8 x i32> products to adjacent slots in %s.
 266   %8 = getelementptr inbounds i32, i32* %s, i64 %index
 267   %9 = bitcast i32* %8 to <8 x i32>*
 268   store <8 x i32> %6, <8 x i32>* %9, align 4
 269   %10 = getelementptr inbounds i32, i32* %8, i64 8
 270   %11 = bitcast i32* %10 to <8 x i32>*
 271   store <8 x i32> %7, <8 x i32>* %11, align 4
 272   %index.next = add nuw i64 %index, 16
 273   %12 = icmp eq i64 %index.next, %n.vec
 274   br i1 %12, label %middle.block, label %vector.body
 275
 276 middle.block:                                     ; preds = %vector.body
       ; Done if the vector loop covered every element; otherwise run the
       ; scalar loop on the tail.
 277   %cmp.n = icmp eq i64 %n.vec, %wide.trip.count
 278   br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader14
 279
 280 for.body.preheader14:                             ; preds = %for.body.preheader, %middle.block
       ; The scalar loop starts at 0 when the vector loop was skipped, or at
       ; %n.vec after it.
 281   %indvars.iv.ph = phi i64 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
 282   br label %for.body
 283
 284 for.cond.cleanup:                                 ; preds = %for.body, %middle.block, %entry
 285   ret void
 286
 287 for.body:                                         ; preds = %for.body.preheader14, %for.body
       ; Scalar loop: one sign-extended element times %conv1 per iteration.
 288   %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ %indvars.iv.ph, %for.body.preheader14 ]
 289   %arrayidx = getelementptr inbounds i16, i16* %x, i64 %indvars.iv
 290   %13 = load i16, i16* %arrayidx, align 2
 291   %conv = sext i16 %13 to i32
 292   %mul = mul nsw i32 %conv, %conv1
 293   %arrayidx3 = getelementptr inbounds i32, i32* %s, i64 %indvars.iv
 294   store i32 %mul, i32* %arrayidx3, align 4
 295   %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
 296   %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
 297   br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
 298 }
 299
 300
 301 define void @larger_umull(i16* nocapture noundef readonly %x, i16 noundef %y, i32* noalias nocapture noundef writeonly %s, i32 noundef %n) {
     ; Unsigned s[i] = zext(x[i]) * zext(y) with a 16-element vector loop (two
     ; <8 x i16> loads widened to <8 x i32>) followed by a scalar remainder loop.
     ; The expected output below splits each <8 x i32> multiply into a
     ; umull/umull2 pair fed by a dup .8h of the scalar.
 302 ; CHECK-LABEL: larger_umull:
 303 ; CHECK:       // %bb.0: // %entry
 304 ; CHECK-NEXT:    cmp w3, #1
 305 ; CHECK-NEXT:    b.lt .LBB4_8
 306 ; CHECK-NEXT:  // %bb.1: // %for.body.preheader
 307 ; CHECK-NEXT:    cmp w3, #15
 308 ; CHECK-NEXT:    and w8, w1, #0xffff
 309 ; CHECK-NEXT:    mov w9, w3
 310 ; CHECK-NEXT:    b.hi .LBB4_3
 311 ; CHECK-NEXT:  // %bb.2:
 312 ; CHECK-NEXT:    mov x10, xzr
 313 ; CHECK-NEXT:    b .LBB4_6
 314 ; CHECK-NEXT:  .LBB4_3: // %vector.ph
 315 ; CHECK-NEXT:    dup v0.8h, w8
 316 ; CHECK-NEXT:    and x10, x9, #0xfffffff0
 317 ; CHECK-NEXT:    add x11, x2, #32
 318 ; CHECK-NEXT:    add x12, x0, #16
 319 ; CHECK-NEXT:    mov x13, x10
 320 ; CHECK-NEXT:  .LBB4_4: // %vector.body
 321 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
 322 ; CHECK-NEXT:    ldp q1, q2, [x12, #-16]
 323 ; CHECK-NEXT:    subs x13, x13, #16
 324 ; CHECK-NEXT:    add x12, x12, #32
 325 ; CHECK-NEXT:    umull2 v3.4s, v0.8h, v1.8h
 326 ; CHECK-NEXT:    umull v1.4s, v0.4h, v1.4h
 327 ; CHECK-NEXT:    umull2 v4.4s, v0.8h, v2.8h
 328 ; CHECK-NEXT:    umull v2.4s, v0.4h, v2.4h
 329 ; CHECK-NEXT:    stp q1, q3, [x11, #-32]
 330 ; CHECK-NEXT:    stp q2, q4, [x11], #64
 331 ; CHECK-NEXT:    b.ne .LBB4_4
 332 ; CHECK-NEXT:  // %bb.5: // %middle.block
 333 ; CHECK-NEXT:    cmp x10, x9
 334 ; CHECK-NEXT:    b.eq .LBB4_8
 335 ; CHECK-NEXT:  .LBB4_6: // %for.body.preheader1
 336 ; CHECK-NEXT:    add x11, x2, x10, lsl #2
 337 ; CHECK-NEXT:    add x12, x0, x10, lsl #1
 338 ; CHECK-NEXT:    sub x9, x9, x10
 339 ; CHECK-NEXT:  .LBB4_7: // %for.body
 340 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
 341 ; CHECK-NEXT:    ldrh w10, [x12], #2
 342 ; CHECK-NEXT:    subs x9, x9, #1
 343 ; CHECK-NEXT:    mul w10, w10, w8
 344 ; CHECK-NEXT:    str w10, [x11], #4
 345 ; CHECK-NEXT:    b.ne .LBB4_7
 346 ; CHECK-NEXT:  .LBB4_8: // %for.cond.cleanup
 347 ; CHECK-NEXT:    ret
 348 entry:
       ; Zero-extend the scalar once; skip everything when %n <= 0 (the guard
       ; is a signed compare even in this unsigned variant).
 349   %conv1 = zext i16 %y to i32
 350   %cmp8 = icmp sgt i32 %n, 0
 351   br i1 %cmp8, label %for.body.preheader, label %for.cond.cleanup
 352
 353 for.body.preheader:                               ; preds = %entry
       ; Fewer than 16 elements go straight to the scalar loop.
 354   %wide.trip.count = zext i32 %n to i64
 355   %min.iters.check = icmp ult i32 %n, 16
 356   br i1 %min.iters.check, label %for.body.preheader14, label %vector.ph
 357
 358 vector.ph:                                        ; preds = %for.body.preheader
       ; %n.vec is the trip count rounded down to a multiple of 16; the two
       ; splats are identical <8 x i32> broadcasts of %conv1.
 359   %n.vec = and i64 %wide.trip.count, 4294967280
 360   %broadcast.splatinsert = insertelement <8 x i32> poison, i32 %conv1, i64 0
 361   %broadcast.splat = shufflevector <8 x i32> %broadcast.splatinsert, <8 x i32> poison, <8 x i32> zeroinitializer
 362   %broadcast.splatinsert12 = insertelement <8 x i32> poison, i32 %conv1, i64 0
 363   %broadcast.splat13 = shufflevector <8 x i32> %broadcast.splatinsert12, <8 x i32> poison, <8 x i32> zeroinitializer
 364   br label %vector.body
 365
 366 vector.body:                                      ; preds = %vector.body, %vector.ph
 367   %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
       ; Two adjacent <8 x i16> loads from %x.
 368   %0 = getelementptr inbounds i16, i16* %x, i64 %index
 369   %1 = bitcast i16* %0 to <8 x i16>*
 370   %wide.load = load <8 x i16>, <8 x i16>* %1, align 2
 371   %2 = getelementptr inbounds i16, i16* %0, i64 8
 372   %3 = bitcast i16* %2 to <8 x i16>*
 373   %wide.load11 = load <8 x i16>, <8 x i16>* %3, align 2
       ; Zero-extend to <8 x i32> and multiply by the splats.
 374   %4 = zext <8 x i16> %wide.load to <8 x i32>
 375   %5 = zext <8 x i16> %wide.load11 to <8 x i32>
 376   %6 = mul nuw <8 x i32> %broadcast.splat, %4
 377   %7 = mul nuw <8 x i32> %broadcast.splat13, %5
       ; Store both <8 x i32> products to adjacent slots in %s.
 378   %8 = getelementptr inbounds i32, i32* %s, i64 %index
 379   %9 = bitcast i32* %8 to <8 x i32>*
 380   store <8 x i32> %6, <8 x i32>* %9, align 4
 381   %10 = getelementptr inbounds i32, i32* %8, i64 8
 382   %11 = bitcast i32* %10 to <8 x i32>*
 383   store <8 x i32> %7, <8 x i32>* %11, align 4
 384   %index.next = add nuw i64 %index, 16
 385   %12 = icmp eq i64 %index.next, %n.vec
 386   br i1 %12, label %middle.block, label %vector.body
 387
 388 middle.block:                                     ; preds = %vector.body
       ; Done if the vector loop covered every element; otherwise run the
       ; scalar loop on the tail.
 389   %cmp.n = icmp eq i64 %n.vec, %wide.trip.count
 390   br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader14
 391
 392 for.body.preheader14:                             ; preds = %for.body.preheader, %middle.block
       ; The scalar loop starts at 0 when the vector loop was skipped, or at
       ; %n.vec after it.
 393   %indvars.iv.ph = phi i64 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
 394   br label %for.body
 395
 396 for.cond.cleanup:                                 ; preds = %for.body, %middle.block, %entry
 397   ret void
 398
 399 for.body:                                         ; preds = %for.body.preheader14, %for.body
       ; Scalar loop: one zero-extended element times %conv1 per iteration.
 400   %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ %indvars.iv.ph, %for.body.preheader14 ]
 401   %arrayidx = getelementptr inbounds i16, i16* %x, i64 %indvars.iv
 402   %13 = load i16, i16* %arrayidx, align 2
 403   %conv = zext i16 %13 to i32
 404   %mul = mul nuw i32 %conv, %conv1
 405   %arrayidx3 = getelementptr inbounds i32, i32* %s, i64 %indvars.iv
 406   store i32 %mul, i32* %arrayidx3, align 4
 407   %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
 408   %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
 409   br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
 410 }
 411
 412
 413 define i16 @red_mla_dup_ext_u8_s8_s16(i8* noalias nocapture noundef readonly %A, i8 noundef %B, i32 noundef %n) {
     ; Sum-of-products reduction with mixed signedness: the loaded i8 elements
     ; of %A are zero-extended to i16 while the scalar %B is sign-extended to
     ; i16 before being splatted. The expected output below keeps an explicit
     ; ushll of the loads plus mla on .8h vectors rather than a widening multiply.
 414 ; CHECK-LABEL: red_mla_dup_ext_u8_s8_s16:
 415 ; CHECK:       // %bb.0: // %entry
 416 ; CHECK-NEXT:    cbz w2, .LBB5_3
 417 ; CHECK-NEXT:  // %bb.1: // %for.body.preheader
 418 ; CHECK-NEXT:    sxtb w9, w1
 419 ; CHECK-NEXT:    cmp w2, #15
 420 ; CHECK-NEXT:    mov w10, w2
 421 ; CHECK-NEXT:    b.hi .LBB5_4
 422 ; CHECK-NEXT:  // %bb.2:
 423 ; CHECK-NEXT:    mov x11, xzr
 424 ; CHECK-NEXT:    mov w8, wzr
 425 ; CHECK-NEXT:    b .LBB5_7
 426 ; CHECK-NEXT:  .LBB5_3:
 427 ; CHECK-NEXT:    mov w8, wzr
 428 ; CHECK-NEXT:    mov w0, w8
 429 ; CHECK-NEXT:    ret
 430 ; CHECK-NEXT:  .LBB5_4: // %vector.ph
 431 ; CHECK-NEXT:    movi v0.2d, #0000000000000000
 432 ; CHECK-NEXT:    movi v1.2d, #0000000000000000
 433 ; CHECK-NEXT:    and x11, x10, #0xfffffff0
 434 ; CHECK-NEXT:    dup v2.8h, w9
 435 ; CHECK-NEXT:    add x8, x0, #8
 436 ; CHECK-NEXT:    mov x12, x11
 437 ; CHECK-NEXT:  .LBB5_5: // %vector.body
 438 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
 439 ; CHECK-NEXT:    ldp d3, d4, [x8, #-8]
 440 ; CHECK-NEXT:    subs x12, x12, #16
 441 ; CHECK-NEXT:    add x8, x8, #16
 442 ; CHECK-NEXT:    ushll v3.8h, v3.8b, #0
 443 ; CHECK-NEXT:    ushll v4.8h, v4.8b, #0
 444 ; CHECK-NEXT:    mla v0.8h, v2.8h, v3.8h
 445 ; CHECK-NEXT:    mla v1.8h, v2.8h, v4.8h
 446 ; CHECK-NEXT:    b.ne .LBB5_5
 447 ; CHECK-NEXT:  // %bb.6: // %middle.block
 448 ; CHECK-NEXT:    add v0.8h, v1.8h, v0.8h
 449 ; CHECK-NEXT:    cmp x11, x10
 450 ; CHECK-NEXT:    addv h0, v0.8h
 451 ; CHECK-NEXT:    fmov w8, s0
 452 ; CHECK-NEXT:    b.eq .LBB5_9
 453 ; CHECK-NEXT:  .LBB5_7: // %for.body.preheader1
 454 ; CHECK-NEXT:    sub x10, x10, x11
 455 ; CHECK-NEXT:    add x11, x0, x11
 456 ; CHECK-NEXT:  .LBB5_8: // %for.body
 457 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
 458 ; CHECK-NEXT:    ldrb w12, [x11], #1
 459 ; CHECK-NEXT:    subs x10, x10, #1
 460 ; CHECK-NEXT:    madd w8, w12, w9, w8
 461 ; CHECK-NEXT:    b.ne .LBB5_8
 462 ; CHECK-NEXT:  .LBB5_9: // %for.cond.cleanup
 463 ; CHECK-NEXT:    mov w0, w8
 464 ; CHECK-NEXT:    ret
 465 entry:
       ; Sign-extend the scalar multiplier; return 0 immediately when %n == 0.
 466   %conv2 = sext i8 %B to i16
 467   %cmp10.not = icmp eq i32 %n, 0
 468   br i1 %cmp10.not, label %for.cond.cleanup, label %for.body.preheader
 469
 470 for.body.preheader:                               ; preds = %entry
       ; Fewer than 16 elements go straight to the scalar loop.
 471   %wide.trip.count = zext i32 %n to i64
 472   %min.iters.check = icmp ult i32 %n, 16
 473   br i1 %min.iters.check, label %for.body.preheader17, label %vector.ph
 474
 475 vector.ph:                                        ; preds = %for.body.preheader
       ; %n.vec is the trip count rounded down to a multiple of 16; the two
       ; splats are identical <8 x i16> broadcasts of %conv2.
 476   %n.vec = and i64 %wide.trip.count, 4294967280
 477   %broadcast.splatinsert = insertelement <8 x i16> poison, i16 %conv2, i64 0
 478   %broadcast.splat = shufflevector <8 x i16> %broadcast.splatinsert, <8 x i16> poison, <8 x i32> zeroinitializer
 479   %broadcast.splatinsert15 = insertelement <8 x i16> poison, i16 %conv2, i64 0
 480   %broadcast.splat16 = shufflevector <8 x i16> %broadcast.splatinsert15, <8 x i16> poison, <8 x i32> zeroinitializer
 481   br label %vector.body
 482
 483 vector.body:                                      ; preds = %vector.body, %vector.ph
       ; Two independent <8 x i16> accumulators, both starting at zero.
 484   %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
 485   %vec.phi = phi <8 x i16> [ zeroinitializer, %vector.ph ], [ %8, %vector.body ]
 486   %vec.phi13 = phi <8 x i16> [ zeroinitializer, %vector.ph ], [ %9, %vector.body ]
       ; Two adjacent <8 x i8> loads from %A.
 487   %0 = getelementptr inbounds i8, i8* %A, i64 %index
 488   %1 = bitcast i8* %0 to <8 x i8>*
 489   %wide.load = load <8 x i8>, <8 x i8>* %1, align 1
 490   %2 = getelementptr inbounds i8, i8* %0, i64 8
 491   %3 = bitcast i8* %2 to <8 x i8>*
 492   %wide.load14 = load <8 x i8>, <8 x i8>* %3, align 1
       ; Zero-extend the loads, multiply by the sign-extended splat, accumulate.
 493   %4 = zext <8 x i8> %wide.load to <8 x i16>
 494   %5 = zext <8 x i8> %wide.load14 to <8 x i16>
 495   %6 = mul nsw <8 x i16> %broadcast.splat, %4
 496   %7 = mul nsw <8 x i16> %broadcast.splat16, %5
 497   %8 = add <8 x i16> %6, %vec.phi
 498   %9 = add <8 x i16> %7, %vec.phi13
 499   %index.next = add nuw i64 %index, 16
 500   %10 = icmp eq i64 %index.next, %n.vec
 501   br i1 %10, label %middle.block, label %vector.body
 502
 503 middle.block:                                     ; preds = %vector.body
       ; Combine the two accumulators and reduce the result to a scalar i16.
 504   %bin.rdx = add <8 x i16> %9, %8
 505   %11 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %bin.rdx)
 506   %cmp.n = icmp eq i64 %n.vec, %wide.trip.count
 507   br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader17
 508
 509 for.body.preheader17:                             ; preds = %for.body.preheader, %middle.block
       ; The scalar loop resumes at %n.vec with the vector sum, or starts at 0
       ; with a zero sum when the vector loop was skipped.
 510   %indvars.iv.ph = phi i64 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
 511   %s.011.ph = phi i16 [ 0, %for.body.preheader ], [ %11, %middle.block ]
 512   br label %for.body
 513
 514 for.cond.cleanup:                                 ; preds = %for.body, %middle.block, %entry
       ; Result is 0 for an empty input, the vector sum when no tail remains,
       ; or the scalar-loop sum otherwise.
 515   %s.0.lcssa = phi i16 [ 0, %entry ], [ %11, %middle.block ], [ %add, %for.body ]
 516   ret i16 %s.0.lcssa
 517
 518 for.body:                                         ; preds = %for.body.preheader17, %for.body
       ; Scalar loop: zext(A[i]) * %conv2 added to the running i16 sum.
 519   %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ %indvars.iv.ph, %for.body.preheader17 ]
 520   %s.011 = phi i16 [ %add, %for.body ], [ %s.011.ph, %for.body.preheader17 ]
 521   %arrayidx = getelementptr inbounds i8, i8* %A, i64 %indvars.iv
 522   %12 = load i8, i8* %arrayidx, align 1
 523   %13 = zext i8 %12 to i16
 524   %mul = mul nsw i16 %13, %conv2
 525   %add = add i16 %mul, %s.011
 526   %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
 527   %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
 528   br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
 529 }
 530
 531 define void @sink_v2z64_1(i32 *%p, i32 *%d, i64 %n, <2 x i32> %a) {
     ; The zext of %a and the lane-1 splat are defined in the entry block, while
     ; the multiply that uses the splat is inside the loop. The expected output
     ; below multiplies by the lane directly with a by-element umull (v0.s[1])
     ; and folds the shift-by-15 plus truncate into shrn.
 532 ; CHECK-LABEL: sink_v2z64_1:
 533 ; CHECK:       // %bb.0: // %entry
 534 ; CHECK-NEXT:    mov x8, xzr
 535 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
 536 ; CHECK-NEXT:  .LBB6_1: // %loop
 537 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
 538 ; CHECK-NEXT:    ldr d1, [x0]
 539 ; CHECK-NEXT:    subs x2, x2, #8
 540 ; CHECK-NEXT:    add x8, x8, #8
 541 ; CHECK-NEXT:    umull v1.2d, v1.2s, v0.s[1]
 542 ; CHECK-NEXT:    shrn v1.2s, v1.2d, #15
 543 ; CHECK-NEXT:    str d1, [x0], #32
 544 ; CHECK-NEXT:    b.ne .LBB6_1
 545 ; CHECK-NEXT:  // %bb.2: // %exit
 546 ; CHECK-NEXT:    ret
 547 entry:
       ; Widen %a to <2 x i64> and broadcast lane 1 to both lanes.
 548   %ext = zext <2 x i32> %a to <2 x i64>
 549   %broadcast.splat = shufflevector <2 x i64> %ext, <2 x i64> poison, <2 x i32> <i32 1, i32 1>
 550   br label %loop
 551
 552 loop:
 553   %index = phi i64 [ 0, %entry ], [ %index.next, %loop ]
       ; Load <2 x i32> from %p[%index], zero-extend, multiply by the splat,
       ; then shift right by 15 and truncate back to <2 x i32>.
 554   %g = getelementptr inbounds i32, i32 *%p, i64 %index
 555   %gb = bitcast i32* %g to <2 x i32>*
 556   %l = load <2 x i32>, <2 x i32> *%gb, align 4
 557   %e = zext <2 x i32> %l to <2 x i64>
 558   %m = mul <2 x i64> %e, %broadcast.splat
 559   %s = ashr <2 x i64> %m, <i64 15, i64 15>
 560   %t = trunc <2 x i64> %s to <2 x i32>
       ; NOTE(review): %h is computed but never used; %hb is cast from %g, so
       ; the store writes back through %p rather than %d (the expected output
       ; stores to [x0]). Confirm whether that is intended before "fixing" it,
       ; since the assertions above would have to be regenerated.
 561   %h = getelementptr inbounds i32, i32 *%d, i64 %index
 562   %hb = bitcast i32* %g to <2 x i32>*
 563   store <2 x i32> %t, <2 x i32> *%hb, align 4
       ; Index advances by 8 i32 elements per iteration; exit when it equals %n.
 564   %index.next = add nuw i64 %index, 8
 565   %c = icmp eq i64 %index.next, %n
 566   br i1 %c, label %exit, label %loop
 567
 568 exit:
 569   ret void
 570 }
 571
 572 define void @sink_v4i64_1(i32 *%p, i32 *%d, i64 %n, <2 x i32> %a) {
     ; Signed, wider variant: %a is sign-extended in the entry block and lane 1
     ; is broadcast to four i64 lanes, with the multiply inside the loop. The
     ; expected output below uses by-element smull/smull2 (v0.s[1]) for the two
     ; halves and shrn/shrn2 for the shift-by-15 plus truncate.
 573 ; CHECK-LABEL: sink_v4i64_1:
 574 ; CHECK:       // %bb.0: // %entry
 575 ; CHECK-NEXT:    mov x8, xzr
 576 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
 577 ; CHECK-NEXT:  .LBB7_1: // %loop
 578 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
 579 ; CHECK-NEXT:    ldr q1, [x0]
 580 ; CHECK-NEXT:    subs x2, x2, #8
 581 ; CHECK-NEXT:    add x8, x8, #8
 582 ; CHECK-NEXT:    smull v2.2d, v1.2s, v0.s[1]
 583 ; CHECK-NEXT:    smull2 v1.2d, v1.4s, v0.s[1]
 584 ; CHECK-NEXT:    shrn v2.2s, v2.2d, #15
 585 ; CHECK-NEXT:    shrn2 v2.4s, v1.2d, #15
 586 ; CHECK-NEXT:    str q2, [x0], #32
 587 ; CHECK-NEXT:    b.ne .LBB7_1
 588 ; CHECK-NEXT:  // %bb.2: // %exit
 589 ; CHECK-NEXT:    ret
 590 entry:
       ; Widen %a to <2 x i64>; the shuffle both selects lane 1 and grows the
       ; vector from 2 to 4 lanes.
 591   %ext = sext <2 x i32> %a to <2 x i64>
 592   %broadcast.splat = shufflevector <2 x i64> %ext, <2 x i64> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
 593   br label %loop
 594
 595 loop:
 596   %index = phi i64 [ 0, %entry ], [ %index.next, %loop ]
       ; Load <4 x i32> from %p[%index], sign-extend, multiply by the splat,
       ; then shift right by 15 and truncate back to <4 x i32>.
 597   %g = getelementptr inbounds i32, i32 *%p, i64 %index
 598   %gb = bitcast i32* %g to <4 x i32>*
 599   %l = load <4 x i32>, <4 x i32> *%gb, align 4
 600   %e = sext <4 x i32> %l to <4 x i64>
 601   %m = mul <4 x i64> %e, %broadcast.splat
 602   %s = ashr <4 x i64> %m, <i64 15, i64 15, i64 15, i64 15>
 603   %t = trunc <4 x i64> %s to <4 x i32>
       ; NOTE(review): %h is computed but never used; %hb is cast from %g, so
       ; the store writes back through %p rather than %d (the expected output
       ; stores to [x0]). Confirm whether that is intended.
 604   %h = getelementptr inbounds i32, i32 *%d, i64 %index
 605   %hb = bitcast i32* %g to <4 x i32>*
 606   store <4 x i32> %t, <4 x i32> *%hb, align 4
       ; Index advances by 8 i32 elements per iteration; exit when it equals %n.
 607   %index.next = add nuw i64 %index, 8
 608   %c = icmp eq i64 %index.next, %n
 609   br i1 %c, label %exit, label %loop
 610
 611 exit:
 612   ret void
 613 }
 614
 615 define void @sink_v8z16_0(i32 *%p, i32 *%d, i64 %n, <16 x i8> %a) {
     ; Byte-element unsigned variant: %a is zero-extended to <16 x i16> in the
     ; entry block and lane 0 is broadcast to eight lanes, with the multiply
     ; inside the loop. The expected output below performs a dup of byte lane 0
     ; before the loop, a umull on .8b inputs, and lowers the shift-by-15 plus
     ; truncate to cmlt #0 followed by xtn.
 616 ; CHECK-LABEL: sink_v8z16_0:
 617 ; CHECK:       // %bb.0: // %entry
 618 ; CHECK-NEXT:    dup v0.8b, v0.b[0]
 619 ; CHECK-NEXT:    mov x8, xzr
 620 ; CHECK-NEXT:  .LBB8_1: // %loop
 621 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
 622 ; CHECK-NEXT:    ldr d1, [x0]
 623 ; CHECK-NEXT:    subs x2, x2, #8
 624 ; CHECK-NEXT:    add x8, x8, #8
 625 ; CHECK-NEXT:    umull v1.8h, v1.8b, v0.8b
 626 ; CHECK-NEXT:    cmlt v1.8h, v1.8h, #0
 627 ; CHECK-NEXT:    xtn v1.8b, v1.8h
 628 ; CHECK-NEXT:    str d1, [x0], #32
 629 ; CHECK-NEXT:    b.ne .LBB8_1
 630 ; CHECK-NEXT:  // %bb.2: // %exit
 631 ; CHECK-NEXT:    ret
 632 entry:
       ; Widen all 16 bytes, then take lane 0 into an 8-lane splat (the shuffle
       ; also narrows the vector from 16 to 8 lanes).
 633   %ext = zext <16 x i8> %a to <16 x i16>
 634   %broadcast.splat = shufflevector <16 x i16> %ext, <16 x i16> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
 635   br label %loop
 636
 637 loop:
 638   %index = phi i64 [ 0, %entry ], [ %index.next, %loop ]
       ; The address is computed in i32 units from %p, but the access itself is
       ; an <8 x i8> load.
 639   %g = getelementptr inbounds i32, i32 *%p, i64 %index
 640   %gb = bitcast i32* %g to <8 x i8>*
 641   %l = load <8 x i8>, <8 x i8> *%gb, align 4
 642   %e = zext <8 x i8> %l to <8 x i16>
 643   %m = mul <8 x i16> %e, %broadcast.splat
       ; Arithmetic shift by 15 leaves only the replicated sign bit of each
       ; i16 product before the truncate to i8.
 644   %s = ashr <8 x i16> %m, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
 645   %t = trunc <8 x i16> %s to <8 x i8>
       ; NOTE(review): %h is computed but never used; %hb is cast from %g, so
       ; the store writes back through %p rather than %d (the expected output
       ; stores to [x0]). Confirm whether that is intended.
 646   %h = getelementptr inbounds i32, i32 *%d, i64 %index
 647   %hb = bitcast i32* %g to <8 x i8>*
 648   store <8 x i8> %t, <8 x i8> *%hb, align 4
       ; Index advances by 8 i32 elements per iteration; exit when it equals %n.
 649   %index.next = add nuw i64 %index, 8
 650   %c = icmp eq i64 %index.next, %n
 651   br i1 %c, label %exit, label %loop
 652
 653 exit:
 654   ret void
 655 }
 656
 657 define void @sink_v16s16_8(i32 *%p, i32 *%d, i64 %n, <16 x i8> %a) {
     ; Byte-element signed variant on full 16-lane vectors: %a is sign-extended
     ; in the entry block and one lane is broadcast to all sixteen, with the
     ; multiply inside the loop. The expected output below performs a .16b dup
     ; before the loop, an smull/smull2 pair, cmlt #0 for the shift-by-15, and
     ; uzp1 for the truncate.
     ; NOTE(review): the function name ends in _8 but the shuffle mask and the
     ; expected dup both use lane 10; presumably just a naming mismatch.
 658 ; CHECK-LABEL: sink_v16s16_8:
 659 ; CHECK:       // %bb.0: // %entry
 660 ; CHECK-NEXT:    dup v0.16b, v0.b[10]
 661 ; CHECK-NEXT:    mov x8, xzr
 662 ; CHECK-NEXT:  .LBB9_1: // %loop
 663 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
 664 ; CHECK-NEXT:    ldr q1, [x0]
 665 ; CHECK-NEXT:    subs x2, x2, #8
 666 ; CHECK-NEXT:    add x8, x8, #8
 667 ; CHECK-NEXT:    smull v2.8h, v1.8b, v0.8b
 668 ; CHECK-NEXT:    smull2 v1.8h, v1.16b, v0.16b
 669 ; CHECK-NEXT:    cmlt v1.8h, v1.8h, #0
 670 ; CHECK-NEXT:    cmlt v2.8h, v2.8h, #0
 671 ; CHECK-NEXT:    uzp1 v1.16b, v2.16b, v1.16b
 672 ; CHECK-NEXT:    str q1, [x0], #32
 673 ; CHECK-NEXT:    b.ne .LBB9_1
 674 ; CHECK-NEXT:  // %bb.2: // %exit
 675 ; CHECK-NEXT:    ret
 676 entry:
       ; Widen all 16 bytes with sext and broadcast lane 10 to every lane.
 677   %ext = sext <16 x i8> %a to <16 x i16>
 678   %broadcast.splat = shufflevector <16 x i16> %ext, <16 x i16> poison, <16 x i32> <i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10>
 679   br label %loop
 680
 681 loop:
 682   %index = phi i64 [ 0, %entry ], [ %index.next, %loop ]
       ; The address is computed in i32 units from %p, but the access itself is
       ; a <16 x i8> load.
 683   %g = getelementptr inbounds i32, i32 *%p, i64 %index
 684   %gb = bitcast i32* %g to <16 x i8>*
 685   %l = load <16 x i8>, <16 x i8> *%gb, align 4
 686   %e = sext <16 x i8> %l to <16 x i16>
 687   %m = mul <16 x i16> %e, %broadcast.splat
       ; Arithmetic shift by 15 leaves only the replicated sign bit of each
       ; i16 product before the truncate to i8.
 688   %s = ashr <16 x i16> %m, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
 689   %t = trunc <16 x i16> %s to <16 x i8>
       ; NOTE(review): %h is computed but never used; %hb is cast from %g, so
       ; the store writes back through %p rather than %d (the expected output
       ; stores to [x0]). Confirm whether that is intended.
 690   %h = getelementptr inbounds i32, i32 *%d, i64 %index
 691   %hb = bitcast i32* %g to <16 x i8>*
 692   store <16 x i8> %t, <16 x i8> *%hb, align 4
       ; Index advances by 8 i32 elements per iteration; exit when it equals %n.
 693   %index.next = add nuw i64 %index, 8
 694   %c = icmp eq i64 %index.next, %n
 695   br i1 %c, label %exit, label %loop
 696
 697 exit:
 698   ret void
 699 }
 700
 ; matrix_mul_unsigned_and: vectorized loop that multiplies zero-extended
 ; <4 x i16> loads from %A by a splat of (%val & 65535) and stores the
 ; <4 x i32> products to %C, two vectors (8 elements) per iteration.
 ; @matrix_mul_unsigned at the top of this file builds the splat from
 ; `zext i16 %val`; here the 16-bit range comes from the `and` mask instead.
 ; The CHECK lines expect a 16-bit splat (dup v0.4h) and umull in the loop.
 701 define void @matrix_mul_unsigned_and(i32 %N, i32* nocapture %C, i16* nocapture readonly %A, i32 %val) {
 702 ; CHECK-LABEL: matrix_mul_unsigned_and:
 703 ; CHECK:       // %bb.0: // %vector.header
 704 ; CHECK-NEXT:    and w8, w3, #0xffff
 705 ; CHECK-NEXT:    // kill: def $w0 killed $w0 def $x0
 706 ; CHECK-NEXT:    dup v0.4h, w8
 707 ; CHECK-NEXT:    and x8, x0, #0xfffffff8
 708 ; CHECK-NEXT:  .LBB10_1: // %vector.body
 709 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
 710 ; CHECK-NEXT:    add x9, x2, w0, uxtw #1
 711 ; CHECK-NEXT:    subs x8, x8, #8
 712 ; CHECK-NEXT:    ldp d1, d2, [x9]
 713 ; CHECK-NEXT:    add x9, x1, w0, uxtw #2
 714 ; CHECK-NEXT:    add w0, w0, #8
 715 ; CHECK-NEXT:    umull v1.4s, v0.4h, v1.4h
 716 ; CHECK-NEXT:    umull v2.4s, v0.4h, v2.4h
 717 ; CHECK-NEXT:    stp q1, q2, [x9]
 718 ; CHECK-NEXT:    b.ne .LBB10_1
 719 ; CHECK-NEXT:  // %bb.2: // %for.end12
 720 ; CHECK-NEXT:    ret
 721 vector.header:
       ; Keep only the low 16 bits of %val; this value is splatted below.
 722   %conv4 = and i32 %val, 65535
 723   %wide.trip.count = zext i32 %N to i64
       ; %min.iters.check, %1, %2 and %cmp.n have no users in this function
       ; (%0 only feeds %1 and %2).
 724   %0 = add nsw i64 %wide.trip.count, -1
 725   %min.iters.check = icmp ult i32 %N, 8
 726   %1 = trunc i64 %0 to i32
 727   %2 = icmp ugt i64 %0, 4294967295
       ; Loop bound: zero-extended %N with its low three bits cleared.
 728   %n.vec = and i64 %wide.trip.count, 4294967288
 729   %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %conv4, i32 0
 730   %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
 731   %broadcast.splatinsert31 = insertelement <4 x i32> undef, i32 %conv4, i32 0
 732   %broadcast.splat32 = shufflevector <4 x i32> %broadcast.splatinsert31, <4 x i32> undef, <4 x i32> zeroinitializer
 733   %cmp.n = icmp eq i64 %n.vec, %wide.trip.count
 734   br label %vector.body
 735
 736 vector.body:                                      ; preds = %vector.header, %vector.body
 737   %index = phi i64 [ %index.next, %vector.body ], [ 0, %vector.header ]
       ; Element offset is the 32-bit sum %N + trunc(%index), zero-extended;
       ; it is used for both %A and %C.
 738   %3 = trunc i64 %index to i32
 739   %4 = add i32 %N, %3
 740   %5 = zext i32 %4 to i64
 741   %6 = getelementptr inbounds i16, i16* %A, i64 %5
 742   %7 = bitcast i16* %6 to <4 x i16>*
 743   %wide.load = load <4 x i16>, <4 x i16>* %7, align 2
       ; Second load starts 4 i16 elements after the first (adjacent vectors).
 744   %8 = getelementptr inbounds i16, i16* %6, i64 4
 745   %9 = bitcast i16* %8 to <4 x i16>*
 746   %wide.load30 = load <4 x i16>, <4 x i16>* %9, align 2
 747   %10 = zext <4 x i16> %wide.load to <4 x i32>
 748   %11 = zext <4 x i16> %wide.load30 to <4 x i32>
 749   %12 = mul nuw nsw <4 x i32> %broadcast.splat, %10
 750   %13 = mul nuw nsw <4 x i32> %broadcast.splat32, %11
 751   %14 = getelementptr inbounds i32, i32* %C, i64 %5
 752   %15 = bitcast i32* %14 to <4 x i32>*
 753   store <4 x i32> %12, <4 x i32>* %15, align 4
 754   %16 = getelementptr inbounds i32, i32* %14, i64 4
 755   %17 = bitcast i32* %16 to <4 x i32>*
 756   store <4 x i32> %13, <4 x i32>* %17, align 4
       ; Step by 8 elements; exit when the index reaches %n.vec.
 757   %index.next = add i64 %index, 8
 758   %18 = icmp eq i64 %index.next, %n.vec
 759   br i1 %18, label %for.end12, label %vector.body
 760
 761 for.end12:                                        ; preds = %vector.body
 762   ret void
 763 }
 764
 ; matrix_mul_unsigned_and_double: <8 x i16> / <8 x i32> form of the masked
 ; unsigned multiply loop. The splat is (%val & 65535), the loads are
 ; zero-extended, and the index advances by 16 per iteration.
 ; The CHECK lines expect a 16-bit splat (dup v0.8h) and each <8 x i32>
 ; multiply to be emitted as a umull/umull2 pair.
 765 define void @matrix_mul_unsigned_and_double(i32 %N, i32* nocapture %C, i16* nocapture readonly %A, i32 %val) {
 766 ; CHECK-LABEL: matrix_mul_unsigned_and_double:
 767 ; CHECK:       // %bb.0: // %vector.header
 768 ; CHECK-NEXT:    and w8, w3, #0xffff
 769 ; CHECK-NEXT:    // kill: def $w0 killed $w0 def $x0
 770 ; CHECK-NEXT:    dup v0.8h, w8
 771 ; CHECK-NEXT:    and x8, x0, #0xfffffff0
 772 ; CHECK-NEXT:  .LBB11_1: // %vector.body
 773 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
 774 ; CHECK-NEXT:    add x9, x2, w0, uxtw #1
 775 ; CHECK-NEXT:    subs x8, x8, #16
 776 ; CHECK-NEXT:    ldr q1, [x9]
 777 ; CHECK-NEXT:    ldur q2, [x9, #8]
 778 ; CHECK-NEXT:    add x9, x1, w0, uxtw #2
 779 ; CHECK-NEXT:    add w0, w0, #16
 780 ; CHECK-NEXT:    umull2 v3.4s, v0.8h, v1.8h
 781 ; CHECK-NEXT:    umull v1.4s, v0.4h, v1.4h
 782 ; CHECK-NEXT:    umull2 v4.4s, v0.8h, v2.8h
 783 ; CHECK-NEXT:    umull v2.4s, v0.4h, v2.4h
 784 ; CHECK-NEXT:    stp q1, q3, [x9]
 785 ; CHECK-NEXT:    stp q2, q4, [x9, #32]
 786 ; CHECK-NEXT:    b.ne .LBB11_1
 787 ; CHECK-NEXT:  // %bb.2: // %for.end12
 788 ; CHECK-NEXT:    ret
 789 vector.header:
       ; Keep only the low 16 bits of %val; this value is splatted below.
 790   %conv4 = and i32 %val, 65535
 791   %wide.trip.count = zext i32 %N to i64
       ; %min.iters.check, %1, %2 and %cmp.n have no users in this function
       ; (%0 only feeds %1 and %2).
 792   %0 = add nsw i64 %wide.trip.count, -1
 793   %min.iters.check = icmp ult i32 %N, 16
 794   %1 = trunc i64 %0 to i32
 795   %2 = icmp ugt i64 %0, 4294967295
       ; Loop bound: zero-extended %N with its low four bits cleared.
 796   %n.vec = and i64 %wide.trip.count, 4294967280
 797   %broadcast.splatinsert = insertelement <8 x i32> undef, i32 %conv4, i32 0
 798   %broadcast.splat = shufflevector <8 x i32> %broadcast.splatinsert, <8 x i32> undef, <8 x i32> zeroinitializer
 799   %broadcast.splatinsert31 = insertelement <8 x i32> undef, i32 %conv4, i32 0
 800   %broadcast.splat32 = shufflevector <8 x i32> %broadcast.splatinsert31, <8 x i32> undef, <8 x i32> zeroinitializer
 801   %cmp.n = icmp eq i64 %n.vec, %wide.trip.count
 802   br label %vector.body
 803
 804 vector.body:                                      ; preds = %vector.header, %vector.body
 805   %index = phi i64 [ %index.next, %vector.body ], [ 0, %vector.header ]
       ; Element offset is the 32-bit sum %N + trunc(%index), zero-extended;
       ; it is used for both %A and %C.
 806   %3 = trunc i64 %index to i32
 807   %4 = add i32 %N, %3
 808   %5 = zext i32 %4 to i64
 809   %6 = getelementptr inbounds i16, i16* %A, i64 %5
 810   %7 = bitcast i16* %6 to <8 x i16>*
 811   %wide.load = load <8 x i16>, <8 x i16>* %7, align 2
       ; NOTE(review): the second load is offset by 4 i16 elements although
       ; each load covers 8, so the two loads overlap by four elements (the
       ; expected `ldur q2, [x9, #8]` reflects this). Possibly carried over
       ; from the <4 x i16> variant - confirm before changing, and regenerate
       ; the CHECK lines if the offset is altered.
 812   %8 = getelementptr inbounds i16, i16* %6, i64 4
 813   %9 = bitcast i16* %8 to <8 x i16>*
 814   %wide.load30 = load <8 x i16>, <8 x i16>* %9, align 2
 815   %10 = zext <8 x i16> %wide.load to <8 x i32>
 816   %11 = zext <8 x i16> %wide.load30 to <8 x i32>
 817   %12 = mul nuw nsw <8 x i32> %broadcast.splat, %10
 818   %13 = mul nuw nsw <8 x i32> %broadcast.splat32, %11
 819   %14 = getelementptr inbounds i32, i32* %C, i64 %5
 820   %15 = bitcast i32* %14 to <8 x i32>*
 821   store <8 x i32> %12, <8 x i32>* %15, align 4
       ; The second store is offset by a full 8 i32 elements.
 822   %16 = getelementptr inbounds i32, i32* %14, i64 8
 823   %17 = bitcast i32* %16 to <8 x i32>*
 824   store <8 x i32> %13, <8 x i32>* %17, align 4
       ; Step by 16 elements; exit when the index reaches %n.vec.
 825   %index.next = add i64 %index, 16
 826   %18 = icmp eq i64 %index.next, %n.vec
 827   br i1 %18, label %for.end12, label %vector.body
 828
 829 for.end12:                                        ; preds = %vector.body
 830   ret void
 831 }
 832
 ; matrix_mul_signed_and: same loop shape as @matrix_mul_unsigned_and, but
 ; the <4 x i16> loads are sign-extended while the splat is still
 ; (%val & 65535).
 ; The CHECK lines expect a 32-bit splat (dup v0.4s), explicit sshll
 ; extensions and a plain 32-bit mul - no smull/umull is expected here.
 ; Presumably this guards against forming a widening multiply when the
 ; operands' extensions disagree - confirm against the combine being tested.
 833 define void @matrix_mul_signed_and(i32 %N, i32* nocapture %C, i16* nocapture readonly %A, i32 %val) {
 834 ; CHECK-LABEL: matrix_mul_signed_and:
 835 ; CHECK:       // %bb.0: // %vector.header
 836 ; CHECK-NEXT:    and w8, w3, #0xffff
 837 ; CHECK-NEXT:    // kill: def $w0 killed $w0 def $x0
 838 ; CHECK-NEXT:    dup v0.4s, w8
 839 ; CHECK-NEXT:    and x8, x0, #0xfffffff8
 840 ; CHECK-NEXT:  .LBB12_1: // %vector.body
 841 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
 842 ; CHECK-NEXT:    add x9, x2, w0, uxtw #1
 843 ; CHECK-NEXT:    subs x8, x8, #8
 844 ; CHECK-NEXT:    ldp d1, d2, [x9]
 845 ; CHECK-NEXT:    add x9, x1, w0, uxtw #2
 846 ; CHECK-NEXT:    add w0, w0, #8
 847 ; CHECK-NEXT:    sshll v1.4s, v1.4h, #0
 848 ; CHECK-NEXT:    sshll v2.4s, v2.4h, #0
 849 ; CHECK-NEXT:    mul v1.4s, v0.4s, v1.4s
 850 ; CHECK-NEXT:    mul v2.4s, v0.4s, v2.4s
 851 ; CHECK-NEXT:    stp q1, q2, [x9]
 852 ; CHECK-NEXT:    b.ne .LBB12_1
 853 ; CHECK-NEXT:  // %bb.2: // %for.end12
 854 ; CHECK-NEXT:    ret
 855 vector.header:
       ; Keep only the low 16 bits of %val; this value is splatted below.
 856   %conv4 = and i32 %val, 65535
 857   %wide.trip.count = zext i32 %N to i64
       ; %min.iters.check, %1, %2 and %cmp.n have no users in this function
       ; (%0 only feeds %1 and %2).
 858   %0 = add nsw i64 %wide.trip.count, -1
 859   %min.iters.check = icmp ult i32 %N, 8
 860   %1 = trunc i64 %0 to i32
 861   %2 = icmp ugt i64 %0, 4294967295
       ; Loop bound: zero-extended %N with its low three bits cleared.
 862   %n.vec = and i64 %wide.trip.count, 4294967288
 863   %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %conv4, i32 0
 864   %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
 865   %broadcast.splatinsert31 = insertelement <4 x i32> undef, i32 %conv4, i32 0
 866   %broadcast.splat32 = shufflevector <4 x i32> %broadcast.splatinsert31, <4 x i32> undef, <4 x i32> zeroinitializer
 867   %cmp.n = icmp eq i64 %n.vec, %wide.trip.count
 868   br label %vector.body
 869
 870 vector.body:                                      ; preds = %vector.header, %vector.body
 871   %index = phi i64 [ %index.next, %vector.body ], [ 0, %vector.header ]
       ; Element offset is the 32-bit sum %N + trunc(%index), zero-extended;
       ; it is used for both %A and %C.
 872   %3 = trunc i64 %index to i32
 873   %4 = add i32 %N, %3
 874   %5 = zext i32 %4 to i64
 875   %6 = getelementptr inbounds i16, i16* %A, i64 %5
 876   %7 = bitcast i16* %6 to <4 x i16>*
 877   %wide.load = load <4 x i16>, <4 x i16>* %7, align 2
 878   %8 = getelementptr inbounds i16, i16* %6, i64 4
 879   %9 = bitcast i16* %8 to <4 x i16>*
 880   %wide.load30 = load <4 x i16>, <4 x i16>* %9, align 2
       ; Sign extension of the loaded lanes is the difference from the
       ; unsigned variant.
 881   %10 = sext <4 x i16> %wide.load to <4 x i32>
 882   %11 = sext <4 x i16> %wide.load30 to <4 x i32>
 883   %12 = mul nuw nsw <4 x i32> %broadcast.splat, %10
 884   %13 = mul nuw nsw <4 x i32> %broadcast.splat32, %11
 885   %14 = getelementptr inbounds i32, i32* %C, i64 %5
 886   %15 = bitcast i32* %14 to <4 x i32>*
 887   store <4 x i32> %12, <4 x i32>* %15, align 4
 888   %16 = getelementptr inbounds i32, i32* %14, i64 4
 889   %17 = bitcast i32* %16 to <4 x i32>*
 890   store <4 x i32> %13, <4 x i32>* %17, align 4
       ; Step by 8 elements; exit when the index reaches %n.vec.
 891   %index.next = add i64 %index, 8
 892   %18 = icmp eq i64 %index.next, %n.vec
 893   br i1 %18, label %for.end12, label %vector.body
 894
 895 for.end12:                                        ; preds = %vector.body
 896   ret void
 897 }
 898
 ; matrix_mul_signed_and_double: <8 x i16> / <8 x i32> form of the masked
 ; multiply loop with sign-extended loads. The splat is (%val & 65535) and
 ; the index advances by 16 per iteration.
 ; The CHECK lines expect a 32-bit splat (dup v0.4s), sshll/sshll2
 ; extensions and four plain 32-bit mul instructions - no smull/umull is
 ; expected here. Presumably this guards against forming a widening multiply
 ; when the operands' extensions disagree - confirm against the combine
 ; being tested.
 899 define void @matrix_mul_signed_and_double(i32 %N, i32* nocapture %C, i16* nocapture readonly %A, i32 %val) {
 900 ; CHECK-LABEL: matrix_mul_signed_and_double:
 901 ; CHECK:       // %bb.0: // %vector.header
 902 ; CHECK-NEXT:    and w8, w3, #0xffff
 903 ; CHECK-NEXT:    // kill: def $w0 killed $w0 def $x0
 904 ; CHECK-NEXT:    dup v0.4s, w8
 905 ; CHECK-NEXT:    and x8, x0, #0xfffffff0
 906 ; CHECK-NEXT:  .LBB13_1: // %vector.body
 907 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
 908 ; CHECK-NEXT:    add x9, x2, w0, uxtw #1
 909 ; CHECK-NEXT:    subs x8, x8, #16
 910 ; CHECK-NEXT:    ldr q1, [x9]
 911 ; CHECK-NEXT:    ldur q2, [x9, #8]
 912 ; CHECK-NEXT:    add x9, x1, w0, uxtw #2
 913 ; CHECK-NEXT:    add w0, w0, #16
 914 ; CHECK-NEXT:    sshll2 v3.4s, v1.8h, #0
 915 ; CHECK-NEXT:    sshll v1.4s, v1.4h, #0
 916 ; CHECK-NEXT:    sshll2 v4.4s, v2.8h, #0
 917 ; CHECK-NEXT:    sshll v2.4s, v2.4h, #0
 918 ; CHECK-NEXT:    mul v3.4s, v0.4s, v3.4s
 919 ; CHECK-NEXT:    mul v1.4s, v0.4s, v1.4s
 920 ; CHECK-NEXT:    mul v4.4s, v0.4s, v4.4s
 921 ; CHECK-NEXT:    mul v2.4s, v0.4s, v2.4s
 922 ; CHECK-NEXT:    stp q1, q3, [x9]
 923 ; CHECK-NEXT:    stp q2, q4, [x9, #32]
 924 ; CHECK-NEXT:    b.ne .LBB13_1
 925 ; CHECK-NEXT:  // %bb.2: // %for.end12
 926 ; CHECK-NEXT:    ret
 927 vector.header:
       ; Keep only the low 16 bits of %val; this value is splatted below.
 928   %conv4 = and i32 %val, 65535
 929   %wide.trip.count = zext i32 %N to i64
       ; %min.iters.check, %1, %2 and %cmp.n have no users in this function
       ; (%0 only feeds %1 and %2).
 930   %0 = add nsw i64 %wide.trip.count, -1
 931   %min.iters.check = icmp ult i32 %N, 16
 932   %1 = trunc i64 %0 to i32
 933   %2 = icmp ugt i64 %0, 4294967295
       ; Loop bound: zero-extended %N with its low four bits cleared.
 934   %n.vec = and i64 %wide.trip.count, 4294967280
 935   %broadcast.splatinsert = insertelement <8 x i32> undef, i32 %conv4, i32 0
 936   %broadcast.splat = shufflevector <8 x i32> %broadcast.splatinsert, <8 x i32> undef, <8 x i32> zeroinitializer
 937   %broadcast.splatinsert31 = insertelement <8 x i32> undef, i32 %conv4, i32 0
 938   %broadcast.splat32 = shufflevector <8 x i32> %broadcast.splatinsert31, <8 x i32> undef, <8 x i32> zeroinitializer
 939   %cmp.n = icmp eq i64 %n.vec, %wide.trip.count
 940   br label %vector.body
 941
 942 vector.body:                                      ; preds = %vector.header, %vector.body
 943   %index = phi i64 [ %index.next, %vector.body ], [ 0, %vector.header ]
       ; Element offset is the 32-bit sum %N + trunc(%index), zero-extended;
       ; it is used for both %A and %C.
 944   %3 = trunc i64 %index to i32
 945   %4 = add i32 %N, %3
 946   %5 = zext i32 %4 to i64
 947   %6 = getelementptr inbounds i16, i16* %A, i64 %5
 948   %7 = bitcast i16* %6 to <8 x i16>*
 949   %wide.load = load <8 x i16>, <8 x i16>* %7, align 2
       ; NOTE(review): the second load is offset by 4 i16 elements although
       ; each load covers 8, so the two loads overlap by four elements (the
       ; expected `ldur q2, [x9, #8]` reflects this). Possibly carried over
       ; from the <4 x i16> variant - confirm before changing, and regenerate
       ; the CHECK lines if the offset is altered.
 950   %8 = getelementptr inbounds i16, i16* %6, i64 4
 951   %9 = bitcast i16* %8 to <8 x i16>*
 952   %wide.load30 = load <8 x i16>, <8 x i16>* %9, align 2
       ; Sign extension of the loaded lanes is the difference from the
       ; unsigned variant.
 953   %10 = sext <8 x i16> %wide.load to <8 x i32>
 954   %11 = sext <8 x i16> %wide.load30 to <8 x i32>
 955   %12 = mul nuw nsw <8 x i32> %broadcast.splat, %10
 956   %13 = mul nuw nsw <8 x i32> %broadcast.splat32, %11
 957   %14 = getelementptr inbounds i32, i32* %C, i64 %5
 958   %15 = bitcast i32* %14 to <8 x i32>*
 959   store <8 x i32> %12, <8 x i32>* %15, align 4
       ; The second store is offset by a full 8 i32 elements.
 960   %16 = getelementptr inbounds i32, i32* %14, i64 8
 961   %17 = bitcast i32* %16 to <8 x i32>*
 962   store <8 x i32> %13, <8 x i32>* %17, align 4
       ; Step by 16 elements; exit when the index reaches %n.vec.
 963   %index.next = add i64 %index, 16
 964   %18 = icmp eq i64 %index.next, %n.vec
 965   br i1 %18, label %for.end12, label %vector.body
 966
 967 for.end12:                                        ; preds = %vector.body
 968   ret void
 969 }
 970
 ; Declaration of the add-reduction intrinsic over <8 x i16>, returning i16.
 ; NOTE(review): no call to it appears in this part of the file; presumably
 ; it is used by a test elsewhere in the file - confirm before removing.
 971 declare i16 @llvm.vector.reduce.add.v8i16(<8 x i16>)
 972