llvm/test/Analysis/CostModel/ARM/mve-vecreduce-add.ll

   1 ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
   2 ; RUN: opt < %s -S -mtriple=thumbv8.1m.main-none-eabi -mattr=+mve.fp -passes="print<cost-model>" 2>&1 -disable-output | FileCheck %s
   3
   4 target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
   5
   6 define void @add_i8() {
     ; Cost-model test for plain integer add reductions on i8 vectors.
     ; Calls llvm.vector.reduce.add on undef inputs of 1, 2, 4, 8 and 16 lanes;
     ; the results are unused and the function simply returns. The assertions
     ; below record the cost printed for each instruction under the target
     ; options given on the run line at the top of the file. They are
     ; autogenerated (see the note at the top of the file), so regenerate them
     ; with that script instead of editing the numbers by hand.
   7 ; CHECK-LABEL: 'add_i8'
   8 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a0 = call i8 @llvm.vector.reduce.add.v1i8(<1 x i8> undef)
   9 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %a1 = call i8 @llvm.vector.reduce.add.v2i8(<2 x i8> undef)
  10 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a2 = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> undef)
  11 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a3 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> undef)
  12 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a4 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> undef)
  13 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
  14 ;
       ; One reduction per lane count (%a0 = 1 lane ... %a4 = 16 lanes).
  15   %a0 = call i8 @llvm.vector.reduce.add.v1i8(<1 x i8> undef)
  16
  17   %a1 = call i8 @llvm.vector.reduce.add.v2i8(<2 x i8> undef)
  18
  19   %a2 = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> undef)
  20
  21   %a3 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> undef)
  22
  23   %a4 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> undef)
  24
  25   ret void
  26 }
  27
  28 define void @add_i16() {
     ; Cost-model test for add reductions that produce an i16 result.
     ; Two families of inputs are covered, each for 1, 2, 4, 8 and 16 lanes:
     ;   * an i8 vector zero- or sign-extended to i16 and then reduced
     ;     (%aNza/%aNz for zext, %aNsa/%aNs for sext), and
     ;   * an undef i16 vector reduced directly (%a5 .. %a9).
     ; Both the extension and the reduction get their own cost assertion.
     ; The assertions are autogenerated (see the note at the top of the file);
     ; regenerate them with that script instead of editing costs by hand.
  29 ; CHECK-LABEL: 'add_i16'
  30 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a0za = zext <1 x i8> undef to <1 x i16>
  31 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a0z = call i16 @llvm.vector.reduce.add.v1i16(<1 x i16> %a0za)
  32 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a0sa = sext <1 x i8> undef to <1 x i16>
  33 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a0s = call i16 @llvm.vector.reduce.add.v1i16(<1 x i16> %a0sa)
  34 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a1za = zext <2 x i8> undef to <2 x i16>
  35 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %a1z = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> %a1za)
  36 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a1sa = sext <2 x i8> undef to <2 x i16>
  37 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %a1s = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> %a1sa)
  38 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a2za = zext <4 x i8> undef to <4 x i16>
  39 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a2z = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %a2za)
  40 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a2sa = sext <4 x i8> undef to <4 x i16>
  41 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a2s = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %a2sa)
  42 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a3za = zext <8 x i8> undef to <8 x i16>
  43 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a3z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %a3za)
  44 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a3sa = sext <8 x i8> undef to <8 x i16>
  45 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a3s = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %a3sa)
  46 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %a4za = zext <16 x i8> undef to <16 x i16>
  47 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a4z = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %a4za)
  48 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %a4sa = sext <16 x i8> undef to <16 x i16>
  49 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a4s = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %a4sa)
  50 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a5 = call i16 @llvm.vector.reduce.add.v1i16(<1 x i16> undef)
  51 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %a6 = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> undef)
  52 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a7 = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> undef)
  53 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a8 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> undef)
  54 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a9 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> undef)
  55 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
  56 ;
       ; i8 -> i16 extension followed by the reduction, zext then sext, for
       ; 1, 2, 4, 8 and 16 lanes (%a0* .. %a4*).
  57   %a0za = zext <1 x i8> undef to <1 x i16>
  58   %a0z = call i16 @llvm.vector.reduce.add.v1i16(<1 x i16> %a0za)
  59
  60   %a0sa = sext <1 x i8> undef to <1 x i16>
  61   %a0s = call i16 @llvm.vector.reduce.add.v1i16(<1 x i16> %a0sa)
  62
  63   %a1za = zext <2 x i8> undef to <2 x i16>
  64   %a1z = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> %a1za)
  65
  66   %a1sa = sext <2 x i8> undef to <2 x i16>
  67   %a1s = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> %a1sa)
  68
  69   %a2za = zext <4 x i8> undef to <4 x i16>
  70   %a2z = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %a2za)
  71
  72   %a2sa = sext <4 x i8> undef to <4 x i16>
  73   %a2s = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %a2sa)
  74
  75   %a3za = zext <8 x i8> undef to <8 x i16>
  76   %a3z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %a3za)
  77
  78   %a3sa = sext <8 x i8> undef to <8 x i16>
  79   %a3s = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %a3sa)
  80
  81   %a4za = zext <16 x i8> undef to <16 x i16>
  82   %a4z = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %a4za)
  83
  84   %a4sa = sext <16 x i8> undef to <16 x i16>
  85   %a4s = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %a4sa)
  86
       ; Reductions of undef i16 vectors with no preceding extension.
  87   %a5 = call i16 @llvm.vector.reduce.add.v1i16(<1 x i16> undef)
  88
  89   %a6 = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> undef)
  90
  91   %a7 = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> undef)
  92
  93   %a8 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> undef)
  94
  95   %a9 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> undef)
  96
  97   ret void
  98 }
  99
 100 define void @add_i32() {
     ; Cost-model test for add reductions that produce an i32 result.
     ; Three families of inputs are covered, each for 1, 2, 4, 8 and 16 lanes:
     ;   * i8 vectors zero-/sign-extended to i32, then reduced (%a0* .. %a4*),
     ;   * i16 vectors zero-/sign-extended to i32, then reduced (%a5* .. %a9*),
     ;   * undef i32 vectors reduced directly (%a10 .. %a14).
     ; Both the extension and the reduction get their own cost assertion.
     ; The assertions are autogenerated (see the note at the top of the file);
     ; regenerate them with that script instead of editing costs by hand.
 101 ; CHECK-LABEL: 'add_i32'
 102 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a0za = zext <1 x i8> undef to <1 x i32>
 103 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a0z = call i32 @llvm.vector.reduce.add.v1i32(<1 x i32> %a0za)
 104 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a0sa = sext <1 x i8> undef to <1 x i32>
 105 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a0s = call i32 @llvm.vector.reduce.add.v1i32(<1 x i32> %a0sa)
 106 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a1za = zext <2 x i8> undef to <2 x i32>
 107 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %a1z = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %a1za)
 108 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a1sa = sext <2 x i8> undef to <2 x i32>
 109 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %a1s = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %a1sa)
 110 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a2za = zext <4 x i8> undef to <4 x i32>
 111 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a2z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %a2za)
 112 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a2sa = sext <4 x i8> undef to <4 x i32>
 113 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a2s = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %a2sa)
 114 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %a3za = zext <8 x i8> undef to <8 x i32>
 115 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a3z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %a3za)
 116 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %a3sa = sext <8 x i8> undef to <8 x i32>
 117 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a3s = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %a3sa)
 118 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 74 for instruction: %a4za = zext <16 x i8> undef to <16 x i32>
 119 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %a4z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %a4za)
 120 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 74 for instruction: %a4sa = sext <16 x i8> undef to <16 x i32>
 121 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %a4s = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %a4sa)
 122 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a5za = zext <1 x i16> undef to <1 x i32>
 123 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a5z = call i32 @llvm.vector.reduce.add.v1i32(<1 x i32> %a5za)
 124 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a5sa = sext <1 x i16> undef to <1 x i32>
 125 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a5s = call i32 @llvm.vector.reduce.add.v1i32(<1 x i32> %a5sa)
 126 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a6za = zext <2 x i16> undef to <2 x i32>
 127 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %a6z = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %a6za)
 128 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a6sa = sext <2 x i16> undef to <2 x i32>
 129 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %a6s = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %a6sa)
 130 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a7za = zext <4 x i16> undef to <4 x i32>
 131 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a7z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %a7za)
 132 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a7sa = sext <4 x i16> undef to <4 x i32>
 133 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a7s = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %a7sa)
 134 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %a8za = zext <8 x i16> undef to <8 x i32>
 135 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a8z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %a8za)
 136 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %a8sa = sext <8 x i16> undef to <8 x i32>
 137 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a8s = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %a8sa)
 138 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %a9za = zext <16 x i16> undef to <16 x i32>
 139 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %a9z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %a9za)
 140 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %a9sa = sext <16 x i16> undef to <16 x i32>
 141 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %a9s = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %a9sa)
 142 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a10 = call i32 @llvm.vector.reduce.add.v1i32(<1 x i32> undef)
 143 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %a11 = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> undef)
 144 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a12 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> undef)
 145 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a13 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> undef)
 146 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %a14 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> undef)
 147 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 148 ;
       ; i8 -> i32 extension followed by the reduction, zext then sext, for
       ; 1, 2, 4, 8 and 16 lanes.
 149   %a0za = zext <1 x i8> undef to <1 x i32>
 150   %a0z = call i32 @llvm.vector.reduce.add.v1i32(<1 x i32> %a0za)
 151
 152   %a0sa = sext <1 x i8> undef to <1 x i32>
 153   %a0s = call i32 @llvm.vector.reduce.add.v1i32(<1 x i32> %a0sa)
 154
 155   %a1za = zext <2 x i8> undef to <2 x i32>
 156   %a1z = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %a1za)
 157
 158   %a1sa = sext <2 x i8> undef to <2 x i32>
 159   %a1s = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %a1sa)
 160
 161   %a2za = zext <4 x i8> undef to <4 x i32>
 162   %a2z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %a2za)
 163
 164   %a2sa = sext <4 x i8> undef to <4 x i32>
 165   %a2s = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %a2sa)
 166
 167   %a3za = zext <8 x i8> undef to <8 x i32>
 168   %a3z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %a3za)
 169
 170   %a3sa = sext <8 x i8> undef to <8 x i32>
 171   %a3s = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %a3sa)
 172
 173   %a4za = zext <16 x i8> undef to <16 x i32>
 174   %a4z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %a4za)
 175
 176   %a4sa = sext <16 x i8> undef to <16 x i32>
 177   %a4s = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %a4sa)
 178
       ; i16 -> i32 extension followed by the reduction, same lane counts.
 179   %a5za = zext <1 x i16> undef to <1 x i32>
 180   %a5z = call i32 @llvm.vector.reduce.add.v1i32(<1 x i32> %a5za)
 181
 182   %a5sa = sext <1 x i16> undef to <1 x i32>
 183   %a5s = call i32 @llvm.vector.reduce.add.v1i32(<1 x i32> %a5sa)
 184
 185   %a6za = zext <2 x i16> undef to <2 x i32>
 186   %a6z = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %a6za)
 187
 188   %a6sa = sext <2 x i16> undef to <2 x i32>
 189   %a6s = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %a6sa)
 190
 191   %a7za = zext <4 x i16> undef to <4 x i32>
 192   %a7z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %a7za)
 193
 194   %a7sa = sext <4 x i16> undef to <4 x i32>
 195   %a7s = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %a7sa)
 196
 197   %a8za = zext <8 x i16> undef to <8 x i32>
 198   %a8z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %a8za)
 199
 200   %a8sa = sext <8 x i16> undef to <8 x i32>
 201   %a8s = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %a8sa)
 202
 203   %a9za = zext <16 x i16> undef to <16 x i32>
 204   %a9z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %a9za)
 205
 206   %a9sa = sext <16 x i16> undef to <16 x i32>
 207   %a9s = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %a9sa)
 208
       ; Reductions of undef i32 vectors with no preceding extension.
 209   %a10 = call i32 @llvm.vector.reduce.add.v1i32(<1 x i32> undef)
 210
 211   %a11 = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> undef)
 212
 213   %a12 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> undef)
 214
 215   %a13 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> undef)
 216
 217   %a14 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> undef)
 218
 219   ret void
 220 }
 221
 222 define void @add_i64() {
     ; Cost-model test for add reductions that produce an i64 result.
     ; Four families of inputs are covered, each for 1, 2, 4, 8 and 16 lanes:
     ;   * i8 vectors zero-/sign-extended to i64, then reduced (%a0* .. %a4*),
     ;   * i16 vectors zero-/sign-extended to i64, then reduced (%a5* .. %a9*),
     ;   * i32 vectors zero-/sign-extended to i64, then reduced
     ;     (%a10* .. %a14*),
     ;   * undef i64 vectors reduced directly (%a15 .. %a19).
     ; Both the extension and the reduction get their own cost assertion.
     ; The assertions are autogenerated (see the note at the top of the file);
     ; regenerate them with that script instead of editing costs by hand.
 223 ; CHECK-LABEL: 'add_i64'
 224 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %a0za = zext <1 x i8> undef to <1 x i64>
 225 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %a0z = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> %a0za)
 226 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %a0sa = sext <1 x i8> undef to <1 x i64>
 227 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %a0s = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> %a0sa)
 228 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a1za = zext <2 x i8> undef to <2 x i64>
 229 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 108 for instruction: %a1z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %a1za)
 230 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %a1sa = sext <2 x i8> undef to <2 x i64>
 231 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 108 for instruction: %a1s = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %a1sa)
 232 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %a2za = zext <4 x i8> undef to <4 x i64>
 233 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 208 for instruction: %a2z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %a2za)
 234 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 82 for instruction: %a2sa = sext <4 x i8> undef to <4 x i64>
 235 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 208 for instruction: %a2s = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %a2sa)
 236 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 74 for instruction: %a3za = zext <8 x i8> undef to <8 x i64>
 237 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 408 for instruction: %a3z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %a3za)
 238 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 330 for instruction: %a3sa = sext <8 x i8> undef to <8 x i64>
 239 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 408 for instruction: %a3s = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %a3sa)
 240 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 298 for instruction: %a4za = zext <16 x i8> undef to <16 x i64>
 241 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 808 for instruction: %a4z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %a4za)
 242 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1322 for instruction: %a4sa = sext <16 x i8> undef to <16 x i64>
 243 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 808 for instruction: %a4s = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %a4sa)
 244 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %a5za = zext <1 x i16> undef to <1 x i64>
 245 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %a5z = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> %a5za)
 246 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %a5sa = sext <1 x i16> undef to <1 x i64>
 247 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %a5s = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> %a5sa)
 248 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a6za = zext <2 x i16> undef to <2 x i64>
 249 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 108 for instruction: %a6z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %a6za)
 250 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %a6sa = sext <2 x i16> undef to <2 x i64>
 251 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 108 for instruction: %a6s = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %a6sa)
 252 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %a7za = zext <4 x i16> undef to <4 x i64>
 253 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 208 for instruction: %a7z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %a7za)
 254 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 82 for instruction: %a7sa = sext <4 x i16> undef to <4 x i64>
 255 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 208 for instruction: %a7s = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %a7sa)
 256 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 74 for instruction: %a8za = zext <8 x i16> undef to <8 x i64>
 257 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 408 for instruction: %a8z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %a8za)
 258 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 330 for instruction: %a8sa = sext <8 x i16> undef to <8 x i64>
 259 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 408 for instruction: %a8s = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %a8sa)
 260 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 296 for instruction: %a9za = zext <16 x i16> undef to <16 x i64>
 261 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 808 for instruction: %a9z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %a9za)
 262 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1320 for instruction: %a9sa = sext <16 x i16> undef to <16 x i64>
 263 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 808 for instruction: %a9s = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %a9sa)
 264 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %a10za = zext <1 x i32> undef to <1 x i64>
 265 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %a10z = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> %a10za)
 266 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %a10sa = sext <1 x i32> undef to <1 x i64>
 267 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %a10s = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> %a10sa)
 268 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a11za = zext <2 x i32> undef to <2 x i64>
 269 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 108 for instruction: %a11z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %a11za)
 270 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %a11sa = sext <2 x i32> undef to <2 x i64>
 271 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 108 for instruction: %a11s = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %a11sa)
 272 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %a12za = zext <4 x i32> undef to <4 x i64>
 273 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 208 for instruction: %a12z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %a12za)
 274 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %a12sa = sext <4 x i32> undef to <4 x i64>
 275 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 208 for instruction: %a12s = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %a12sa)
 276 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 72 for instruction: %a13za = zext <8 x i32> undef to <8 x i64>
 277 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 408 for instruction: %a13z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %a13za)
 278 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 264 for instruction: %a13sa = sext <8 x i32> undef to <8 x i64>
 279 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 408 for instruction: %a13s = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %a13sa)
 280 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 288 for instruction: %a14za = zext <16 x i32> undef to <16 x i64>
 281 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 808 for instruction: %a14z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %a14za)
 282 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1056 for instruction: %a14sa = sext <16 x i32> undef to <16 x i64>
 283 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 808 for instruction: %a14s = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %a14sa)
 284 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %a15 = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> undef)
 285 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 108 for instruction: %a16 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> undef)
 286 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 208 for instruction: %a17 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> undef)
 287 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 408 for instruction: %a18 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> undef)
 288 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 808 for instruction: %a19 = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> undef)
 289 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 290 ;
       ; i8 -> i64 extension followed by the reduction, zext then sext, for
       ; 1, 2, 4, 8 and 16 lanes.
 291   %a0za = zext <1 x i8> undef to <1 x i64>
 292   %a0z = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> %a0za)
 293
 294   %a0sa = sext <1 x i8> undef to <1 x i64>
 295   %a0s = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> %a0sa)
 296
 297   %a1za = zext <2 x i8> undef to <2 x i64>
 298   %a1z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %a1za)
 299
 300   %a1sa = sext <2 x i8> undef to <2 x i64>
 301   %a1s = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %a1sa)
 302
 303   %a2za = zext <4 x i8> undef to <4 x i64>
 304   %a2z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %a2za)
 305
 306   %a2sa = sext <4 x i8> undef to <4 x i64>
 307   %a2s = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %a2sa)
 308
 309   %a3za = zext <8 x i8> undef to <8 x i64>
 310   %a3z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %a3za)
 311
 312   %a3sa = sext <8 x i8> undef to <8 x i64>
 313   %a3s = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %a3sa)
 314
 315   %a4za = zext <16 x i8> undef to <16 x i64>
 316   %a4z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %a4za)
 317
 318   %a4sa = sext <16 x i8> undef to <16 x i64>
 319   %a4s = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %a4sa)
 320
       ; i16 -> i64 extension followed by the reduction, same lane counts.
 321   %a5za = zext <1 x i16> undef to <1 x i64>
 322   %a5z = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> %a5za)
 323
 324   %a5sa = sext <1 x i16> undef to <1 x i64>
 325   %a5s = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> %a5sa)
 326
 327   %a6za = zext <2 x i16> undef to <2 x i64>
 328   %a6z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %a6za)
 329
 330   %a6sa = sext <2 x i16> undef to <2 x i64>
 331   %a6s = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %a6sa)
 332
 333   %a7za = zext <4 x i16> undef to <4 x i64>
 334   %a7z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %a7za)
 335
 336   %a7sa = sext <4 x i16> undef to <4 x i64>
 337   %a7s = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %a7sa)
 338
 339   %a8za = zext <8 x i16> undef to <8 x i64>
 340   %a8z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %a8za)
 341
 342   %a8sa = sext <8 x i16> undef to <8 x i64>
 343   %a8s = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %a8sa)
 344
 345   %a9za = zext <16 x i16> undef to <16 x i64>
 346   %a9z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %a9za)
 347
 348   %a9sa = sext <16 x i16> undef to <16 x i64>
 349   %a9s = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %a9sa)
 350
       ; i32 -> i64 extension followed by the reduction, same lane counts.
 351   %a10za = zext <1 x i32> undef to <1 x i64>
 352   %a10z = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> %a10za)
 353
 354   %a10sa = sext <1 x i32> undef to <1 x i64>
 355   %a10s = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> %a10sa)
 356
 357   %a11za = zext <2 x i32> undef to <2 x i64>
 358   %a11z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %a11za)
 359
 360   %a11sa = sext <2 x i32> undef to <2 x i64>
 361   %a11s = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %a11sa)
 362
 363   %a12za = zext <4 x i32> undef to <4 x i64>
 364   %a12z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %a12za)
 365
 366   %a12sa = sext <4 x i32> undef to <4 x i64>
 367   %a12s = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %a12sa)
 368
 369   %a13za = zext <8 x i32> undef to <8 x i64>
 370   %a13z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %a13za)
 371
 372   %a13sa = sext <8 x i32> undef to <8 x i64>
 373   %a13s = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %a13sa)
 374
 375   %a14za = zext <16 x i32> undef to <16 x i64>
 376   %a14z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %a14za)
 377
 378   %a14sa = sext <16 x i32> undef to <16 x i64>
 379   %a14s = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %a14sa)
 380
       ; Reductions of undef i64 vectors with no preceding extension.
 381   %a15 = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> undef)
 382
 383   %a16 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> undef)
 384
 385   %a17 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> undef)
 386
 387   %a18 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> undef)
 388
 389   %a19 = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> undef)
 390
 391   ret void
 392 }
 393
 394 define void @mla_i8() {
     ; Cost-model test for an i8 vector multiply whose result feeds an add
     ; reduction. For 1, 2, 4, 8 and 16 lanes it multiplies two undef vectors
     ; (%aNm) and passes the product to llvm.vector.reduce.add (%aN); the mul
     ; and the reduction each get their own cost assertion.
     ; The assertions are autogenerated (see the note at the top of the file);
     ; regenerate them with that script instead of editing costs by hand.
 395 ; CHECK-LABEL: 'mla_i8'
 396 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a0m = mul <1 x i8> undef, undef
 397 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a0 = call i8 @llvm.vector.reduce.add.v1i8(<1 x i8> %a0m)
 398 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %a1m = mul <2 x i8> undef, undef
 399 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %a1 = call i8 @llvm.vector.reduce.add.v2i8(<2 x i8> %a1m)
 400 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a2m = mul <4 x i8> undef, undef
 401 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a2 = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> %a2m)
 402 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a3m = mul <8 x i8> undef, undef
 403 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a3 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> %a3m)
 404 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a4m = mul <16 x i8> undef, undef
 405 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a4 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %a4m)
 406 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 407 ;
       ; One mul + reduce pair per lane count (%a0 = 1 lane ... %a4 = 16 lanes).
 408   %a0m = mul <1 x i8> undef, undef
 409   %a0 = call i8 @llvm.vector.reduce.add.v1i8(<1 x i8> %a0m)
 410
 411   %a1m = mul <2 x i8> undef, undef
 412   %a1 = call i8 @llvm.vector.reduce.add.v2i8(<2 x i8> %a1m)
 413
 414   %a2m = mul <4 x i8> undef, undef
 415   %a2 = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> %a2m)
 416
 417   %a3m = mul <8 x i8> undef, undef
 418   %a3 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> %a3m)
 419
 420   %a4m = mul <16 x i8> undef, undef
 421   %a4 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %a4m)
 422
 423   ret void
 424 }
 425
 426 define void @mla_i16() {
     ; Cost-model coverage for an i16 llvm.vector.reduce.add whose input is a
     ; vector multiply. The multiply operands are either extended from i8
     ; (zext and sext variants) or plain i16 undef values, at 1, 2, 4, 8 and
     ; 16 lanes.
     ; The expected-cost assertions that follow are autogenerated (see the NOTE
     ; on the first line of the file); regenerate them with that script rather
     ; than editing them by hand.
 427 ; CHECK-LABEL: 'mla_i16'
 428 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a0za = zext <1 x i8> undef to <1 x i16>
 429 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a0zb = zext <1 x i8> undef to <1 x i16>
 430 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a0zm = mul <1 x i16> %a0za, %a0zb
 431 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a0z = call i16 @llvm.vector.reduce.add.v1i16(<1 x i16> %a0zm)
 432 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a0sa = sext <1 x i8> undef to <1 x i16>
 433 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a0sb = sext <1 x i8> undef to <1 x i16>
 434 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a0sm = mul <1 x i16> %a0sa, %a0sb
 435 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a0s = call i16 @llvm.vector.reduce.add.v1i16(<1 x i16> %a0sm)
 436 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a1za = zext <2 x i8> undef to <2 x i16>
 437 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a1zb = zext <2 x i8> undef to <2 x i16>
 438 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %a1zm = mul <2 x i16> %a1za, %a1zb
 439 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %a1z = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> %a1zm)
 440 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a1sa = sext <2 x i8> undef to <2 x i16>
 441 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a1sb = sext <2 x i8> undef to <2 x i16>
 442 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %a1sm = mul <2 x i16> %a1sa, %a1sb
 443 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %a1s = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> %a1sm)
 444 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a2za = zext <4 x i8> undef to <4 x i16>
 445 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a2zb = zext <4 x i8> undef to <4 x i16>
 446 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a2zm = mul <4 x i16> %a2za, %a2zb
 447 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a2z = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %a2zm)
 448 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a2sa = sext <4 x i8> undef to <4 x i16>
 449 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a2sb = sext <4 x i8> undef to <4 x i16>
 450 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a2sm = mul <4 x i16> %a2sa, %a2sb
 451 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a2s = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %a2sm)
 452 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a3za = zext <8 x i8> undef to <8 x i16>
 453 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a3zb = zext <8 x i8> undef to <8 x i16>
 454 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a3zm = mul <8 x i16> %a3za, %a3zb
 455 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a3z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %a3zm)
 456 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a3sa = sext <8 x i8> undef to <8 x i16>
 457 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a3sb = sext <8 x i8> undef to <8 x i16>
 458 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a3sm = mul <8 x i16> %a3sa, %a3sb
 459 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a3s = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %a3sm)
 460 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %a4za = zext <16 x i8> undef to <16 x i16>
 461 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %a4zb = zext <16 x i8> undef to <16 x i16>
 462 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a4zm = mul <16 x i16> %a4za, %a4zb
 463 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a4z = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %a4zm)
 464 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %a4sa = sext <16 x i8> undef to <16 x i16>
 465 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %a4sb = sext <16 x i8> undef to <16 x i16>
 466 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a4sm = mul <16 x i16> %a4sa, %a4sb
 467 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a4s = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %a4sm)
 468 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a5m = mul <1 x i16> undef, undef
 469 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a5 = call i16 @llvm.vector.reduce.add.v1i16(<1 x i16> %a5m)
 470 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %a6m = mul <2 x i16> undef, undef
 471 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %a6 = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> %a6m)
 472 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a7m = mul <4 x i16> undef, undef
 473 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a7 = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %a7m)
 474 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a8m = mul <8 x i16> undef, undef
 475 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a8 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %a8m)
 476 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a9m = mul <16 x i16> undef, undef
 477 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a9 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %a9m)
 478 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 479 ;
       ; 1 lane, i8 operands widened to i16: zext pair first, then sext pair.
 480   %a0za = zext <1 x i8> undef to <1 x i16>
 481   %a0zb = zext <1 x i8> undef to <1 x i16>
 482   %a0zm = mul <1 x i16> %a0za, %a0zb
 483   %a0z = call i16 @llvm.vector.reduce.add.v1i16(<1 x i16> %a0zm)
 484
 485   %a0sa = sext <1 x i8> undef to <1 x i16>
 486   %a0sb = sext <1 x i8> undef to <1 x i16>
 487   %a0sm = mul <1 x i16> %a0sa, %a0sb
 488   %a0s = call i16 @llvm.vector.reduce.add.v1i16(<1 x i16> %a0sm)
 489
       ; 2 lanes, i8 operands widened to i16: zext pair first, then sext pair.
 490   %a1za = zext <2 x i8> undef to <2 x i16>
 491   %a1zb = zext <2 x i8> undef to <2 x i16>
 492   %a1zm = mul <2 x i16> %a1za, %a1zb
 493   %a1z = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> %a1zm)
 494
 495   %a1sa = sext <2 x i8> undef to <2 x i16>
 496   %a1sb = sext <2 x i8> undef to <2 x i16>
 497   %a1sm = mul <2 x i16> %a1sa, %a1sb
 498   %a1s = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> %a1sm)
 499
       ; 4 lanes, i8 operands widened to i16: zext pair first, then sext pair.
 500   %a2za = zext <4 x i8> undef to <4 x i16>
 501   %a2zb = zext <4 x i8> undef to <4 x i16>
 502   %a2zm = mul <4 x i16> %a2za, %a2zb
 503   %a2z = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %a2zm)
 504
 505   %a2sa = sext <4 x i8> undef to <4 x i16>
 506   %a2sb = sext <4 x i8> undef to <4 x i16>
 507   %a2sm = mul <4 x i16> %a2sa, %a2sb
 508   %a2s = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %a2sm)
 509
       ; 8 lanes, i8 operands widened to i16: zext pair first, then sext pair.
 510   %a3za = zext <8 x i8> undef to <8 x i16>
 511   %a3zb = zext <8 x i8> undef to <8 x i16>
 512   %a3zm = mul <8 x i16> %a3za, %a3zb
 513   %a3z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %a3zm)
 514
 515   %a3sa = sext <8 x i8> undef to <8 x i16>
 516   %a3sb = sext <8 x i8> undef to <8 x i16>
 517   %a3sm = mul <8 x i16> %a3sa, %a3sb
 518   %a3s = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %a3sm)
 519
       ; 16 lanes, i8 operands widened to i16: zext pair first, then sext pair.
 520   %a4za = zext <16 x i8> undef to <16 x i16>
 521   %a4zb = zext <16 x i8> undef to <16 x i16>
 522   %a4zm = mul <16 x i16> %a4za, %a4zb
 523   %a4z = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %a4zm)
 524
 525   %a4sa = sext <16 x i8> undef to <16 x i16>
 526   %a4sb = sext <16 x i8> undef to <16 x i16>
 527   %a4sm = mul <16 x i16> %a4sa, %a4sb
 528   %a4s = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %a4sm)
 529
       ; No extension: multiply of i16 undef vectors feeding the reduction,
       ; at 1, 2, 4, 8 and 16 lanes.
 530   %a5m = mul <1 x i16> undef, undef
 531   %a5 = call i16 @llvm.vector.reduce.add.v1i16(<1 x i16> %a5m)
 532
 533   %a6m = mul <2 x i16> undef, undef
 534   %a6 = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> %a6m)
 535
 536   %a7m = mul <4 x i16> undef, undef
 537   %a7 = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %a7m)
 538
 539   %a8m = mul <8 x i16> undef, undef
 540   %a8 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %a8m)
 541
 542   %a9m = mul <16 x i16> undef, undef
 543   %a9 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %a9m)
 544
 545   ret void
 546 }
 547
 548 define void @mla_i32() {
     ; Cost-model coverage for an i32 llvm.vector.reduce.add whose input is a
     ; vector multiply. The multiply operands are extended from i8 or from i16
     ; (zext and sext variants of each) or are plain i32 undef values, at
     ; 1, 2, 4, 8 and 16 lanes.
     ; The expected-cost assertions that follow are autogenerated (see the NOTE
     ; on the first line of the file); regenerate them with that script rather
     ; than editing them by hand.
 549 ; CHECK-LABEL: 'mla_i32'
 550 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a0za = zext <1 x i8> undef to <1 x i32>
 551 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a0zb = zext <1 x i8> undef to <1 x i32>
 552 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a0zm = mul <1 x i32> %a0za, %a0zb
 553 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a0z = call i32 @llvm.vector.reduce.add.v1i32(<1 x i32> %a0zm)
 554 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a0sa = sext <1 x i8> undef to <1 x i32>
 555 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a0sb = sext <1 x i8> undef to <1 x i32>
 556 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a0sm = mul <1 x i32> %a0sa, %a0sb
 557 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a0s = call i32 @llvm.vector.reduce.add.v1i32(<1 x i32> %a0sm)
 558 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a1za = zext <2 x i8> undef to <2 x i32>
 559 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a1zb = zext <2 x i8> undef to <2 x i32>
 560 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %a1zm = mul <2 x i32> %a1za, %a1zb
 561 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %a1z = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %a1zm)
 562 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a1sa = sext <2 x i8> undef to <2 x i32>
 563 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a1sb = sext <2 x i8> undef to <2 x i32>
 564 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %a1sm = mul <2 x i32> %a1sa, %a1sb
 565 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %a1s = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %a1sm)
 566 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a2za = zext <4 x i8> undef to <4 x i32>
 567 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a2zb = zext <4 x i8> undef to <4 x i32>
 568 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a2zm = mul <4 x i32> %a2za, %a2zb
 569 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a2z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %a2zm)
 570 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a2sa = sext <4 x i8> undef to <4 x i32>
 571 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a2sb = sext <4 x i8> undef to <4 x i32>
 572 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a2sm = mul <4 x i32> %a2sa, %a2sb
 573 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a2s = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %a2sm)
 574 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %a3za = zext <8 x i8> undef to <8 x i32>
 575 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %a3zb = zext <8 x i8> undef to <8 x i32>
 576 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a3zm = mul <8 x i32> %a3za, %a3zb
 577 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a3z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %a3zm)
 578 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %a3sa = sext <8 x i8> undef to <8 x i32>
 579 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %a3sb = sext <8 x i8> undef to <8 x i32>
 580 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a3sm = mul <8 x i32> %a3sa, %a3sb
 581 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a3s = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %a3sm)
 582 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 74 for instruction: %a4za = zext <16 x i8> undef to <16 x i32>
 583 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 74 for instruction: %a4zb = zext <16 x i8> undef to <16 x i32>
 584 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %a4zm = mul <16 x i32> %a4za, %a4zb
 585 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %a4z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %a4zm)
 586 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 74 for instruction: %a4sa = sext <16 x i8> undef to <16 x i32>
 587 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 74 for instruction: %a4sb = sext <16 x i8> undef to <16 x i32>
 588 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %a4sm = mul <16 x i32> %a4sa, %a4sb
 589 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %a4s = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %a4sm)
 590 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a5za = zext <1 x i16> undef to <1 x i32>
 591 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a5zb = zext <1 x i16> undef to <1 x i32>
 592 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a5zm = mul <1 x i32> %a5za, %a5zb
 593 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a5z = call i32 @llvm.vector.reduce.add.v1i32(<1 x i32> %a5zm)
 594 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a5sa = sext <1 x i16> undef to <1 x i32>
 595 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a5sb = sext <1 x i16> undef to <1 x i32>
 596 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a5sm = mul <1 x i32> %a5sa, %a5sb
 597 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a5s = call i32 @llvm.vector.reduce.add.v1i32(<1 x i32> %a5sm)
 598 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a6za = zext <2 x i16> undef to <2 x i32>
 599 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a6zb = zext <2 x i16> undef to <2 x i32>
 600 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %a6zm = mul <2 x i32> %a6za, %a6zb
 601 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %a6z = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %a6zm)
 602 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a6sa = sext <2 x i16> undef to <2 x i32>
 603 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a6sb = sext <2 x i16> undef to <2 x i32>
 604 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %a6sm = mul <2 x i32> %a6sa, %a6sb
 605 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %a6s = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %a6sm)
 606 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a7za = zext <4 x i16> undef to <4 x i32>
 607 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a7zb = zext <4 x i16> undef to <4 x i32>
 608 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a7zm = mul <4 x i32> %a7za, %a7zb
 609 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a7z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %a7zm)
 610 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a7sa = sext <4 x i16> undef to <4 x i32>
 611 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a7sb = sext <4 x i16> undef to <4 x i32>
 612 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a7sm = mul <4 x i32> %a7sa, %a7sb
 613 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a7s = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %a7sm)
 614 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %a8za = zext <8 x i16> undef to <8 x i32>
 615 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %a8zb = zext <8 x i16> undef to <8 x i32>
 616 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a8zm = mul <8 x i32> %a8za, %a8zb
 617 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a8z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %a8zm)
 618 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %a8sa = sext <8 x i16> undef to <8 x i32>
 619 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %a8sb = sext <8 x i16> undef to <8 x i32>
 620 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a8sm = mul <8 x i32> %a8sa, %a8sb
 621 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a8s = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %a8sm)
 622 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %a9za = zext <16 x i16> undef to <16 x i32>
 623 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %a9zb = zext <16 x i16> undef to <16 x i32>
 624 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %a9zm = mul <16 x i32> %a9za, %a9zb
 625 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %a9z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %a9zm)
 626 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %a9sa = sext <16 x i16> undef to <16 x i32>
 627 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %a9sb = sext <16 x i16> undef to <16 x i32>
 628 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %a9sm = mul <16 x i32> %a9sa, %a9sb
 629 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %a9s = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %a9sm)
 630 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a10m = mul <1 x i32> undef, undef
 631 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a10 = call i32 @llvm.vector.reduce.add.v1i32(<1 x i32> %a10m)
 632 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %a11m = mul <2 x i32> undef, undef
 633 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %a11 = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %a11m)
 634 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a12m = mul <4 x i32> undef, undef
 635 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a12 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %a12m)
 636 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a13m = mul <8 x i32> undef, undef
 637 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a13 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %a13m)
 638 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %a14m = mul <16 x i32> undef, undef
 639 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %a14 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %a14m)
 640 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 641 ;
       ; Section 1: i8 operands widened to i32. For each lane count
       ; (1, 2, 4, 8, 16) the zext pair comes first, then the sext pair.
 642   %a0za = zext <1 x i8> undef to <1 x i32>
 643   %a0zb = zext <1 x i8> undef to <1 x i32>
 644   %a0zm = mul <1 x i32> %a0za, %a0zb
 645   %a0z = call i32 @llvm.vector.reduce.add.v1i32(<1 x i32> %a0zm)
 646
 647   %a0sa = sext <1 x i8> undef to <1 x i32>
 648   %a0sb = sext <1 x i8> undef to <1 x i32>
 649   %a0sm = mul <1 x i32> %a0sa, %a0sb
 650   %a0s = call i32 @llvm.vector.reduce.add.v1i32(<1 x i32> %a0sm)
 651
 652   %a1za = zext <2 x i8> undef to <2 x i32>
 653   %a1zb = zext <2 x i8> undef to <2 x i32>
 654   %a1zm = mul <2 x i32> %a1za, %a1zb
 655   %a1z = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %a1zm)
 656
 657   %a1sa = sext <2 x i8> undef to <2 x i32>
 658   %a1sb = sext <2 x i8> undef to <2 x i32>
 659   %a1sm = mul <2 x i32> %a1sa, %a1sb
 660   %a1s = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %a1sm)
 661
 662   %a2za = zext <4 x i8> undef to <4 x i32>
 663   %a2zb = zext <4 x i8> undef to <4 x i32>
 664   %a2zm = mul <4 x i32> %a2za, %a2zb
 665   %a2z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %a2zm)
 666
 667   %a2sa = sext <4 x i8> undef to <4 x i32>
 668   %a2sb = sext <4 x i8> undef to <4 x i32>
 669   %a2sm = mul <4 x i32> %a2sa, %a2sb
 670   %a2s = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %a2sm)
 671
 672   %a3za = zext <8 x i8> undef to <8 x i32>
 673   %a3zb = zext <8 x i8> undef to <8 x i32>
 674   %a3zm = mul <8 x i32> %a3za, %a3zb
 675   %a3z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %a3zm)
 676
 677   %a3sa = sext <8 x i8> undef to <8 x i32>
 678   %a3sb = sext <8 x i8> undef to <8 x i32>
 679   %a3sm = mul <8 x i32> %a3sa, %a3sb
 680   %a3s = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %a3sm)
 681
 682   %a4za = zext <16 x i8> undef to <16 x i32>
 683   %a4zb = zext <16 x i8> undef to <16 x i32>
 684   %a4zm = mul <16 x i32> %a4za, %a4zb
 685   %a4z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %a4zm)
 686
 687   %a4sa = sext <16 x i8> undef to <16 x i32>
 688   %a4sb = sext <16 x i8> undef to <16 x i32>
 689   %a4sm = mul <16 x i32> %a4sa, %a4sb
 690   %a4s = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %a4sm)
 691
       ; Section 2: i16 operands widened to i32, same lane counts and the same
       ; zext-then-sext ordering as above.
 692   %a5za = zext <1 x i16> undef to <1 x i32>
 693   %a5zb = zext <1 x i16> undef to <1 x i32>
 694   %a5zm = mul <1 x i32> %a5za, %a5zb
 695   %a5z = call i32 @llvm.vector.reduce.add.v1i32(<1 x i32> %a5zm)
 696
 697   %a5sa = sext <1 x i16> undef to <1 x i32>
 698   %a5sb = sext <1 x i16> undef to <1 x i32>
 699   %a5sm = mul <1 x i32> %a5sa, %a5sb
 700   %a5s = call i32 @llvm.vector.reduce.add.v1i32(<1 x i32> %a5sm)
 701
 702   %a6za = zext <2 x i16> undef to <2 x i32>
 703   %a6zb = zext <2 x i16> undef to <2 x i32>
 704   %a6zm = mul <2 x i32> %a6za, %a6zb
 705   %a6z = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %a6zm)
 706
 707   %a6sa = sext <2 x i16> undef to <2 x i32>
 708   %a6sb = sext <2 x i16> undef to <2 x i32>
 709   %a6sm = mul <2 x i32> %a6sa, %a6sb
 710   %a6s = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %a6sm)
 711
 712   %a7za = zext <4 x i16> undef to <4 x i32>
 713   %a7zb = zext <4 x i16> undef to <4 x i32>
 714   %a7zm = mul <4 x i32> %a7za, %a7zb
 715   %a7z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %a7zm)
 716
 717   %a7sa = sext <4 x i16> undef to <4 x i32>
 718   %a7sb = sext <4 x i16> undef to <4 x i32>
 719   %a7sm = mul <4 x i32> %a7sa, %a7sb
 720   %a7s = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %a7sm)
 721
 722   %a8za = zext <8 x i16> undef to <8 x i32>
 723   %a8zb = zext <8 x i16> undef to <8 x i32>
 724   %a8zm = mul <8 x i32> %a8za, %a8zb
 725   %a8z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %a8zm)
 726
 727   %a8sa = sext <8 x i16> undef to <8 x i32>
 728   %a8sb = sext <8 x i16> undef to <8 x i32>
 729   %a8sm = mul <8 x i32> %a8sa, %a8sb
 730   %a8s = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %a8sm)
 731
 732   %a9za = zext <16 x i16> undef to <16 x i32>
 733   %a9zb = zext <16 x i16> undef to <16 x i32>
 734   %a9zm = mul <16 x i32> %a9za, %a9zb
 735   %a9z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %a9zm)
 736
 737   %a9sa = sext <16 x i16> undef to <16 x i32>
 738   %a9sb = sext <16 x i16> undef to <16 x i32>
 739   %a9sm = mul <16 x i32> %a9sa, %a9sb
 740   %a9s = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %a9sm)
 741
       ; Section 3: no extension. Multiply of i32 undef vectors feeding the
       ; reduction, at 1, 2, 4, 8 and 16 lanes.
 742   %a10m = mul <1 x i32> undef, undef
 743   %a10 = call i32 @llvm.vector.reduce.add.v1i32(<1 x i32> %a10m)
 744
 745   %a11m = mul <2 x i32> undef, undef
 746   %a11 = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %a11m)
 747
 748   %a12m = mul <4 x i32> undef, undef
 749   %a12 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %a12m)
 750
 751   %a13m = mul <8 x i32> undef, undef
 752   %a13 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %a13m)
 753
 754   %a14m = mul <16 x i32> undef, undef
 755   %a14 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %a14m)
 756
 757   ret void
 758 }
 759
 760 define void @mla_i64() {
 761 ; CHECK-LABEL: 'mla_i64'
 762 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %a0za = zext <1 x i8> undef to <1 x i64>
 763 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %a0zb = zext <1 x i8> undef to <1 x i64>
 764 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a0zm = mul <1 x i64> %a0za, %a0zb
 765 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %a0z = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> %a0zm)
 766 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %a0sa = sext <1 x i8> undef to <1 x i64>
 767 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %a0sb = sext <1 x i8> undef to <1 x i64>
 768 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a0sm = mul <1 x i64> %a0sa, %a0sb
 769 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %a0s = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> %a0sm)
 770 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a1za = zext <2 x i8> undef to <2 x i64>
 771 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a1zb = zext <2 x i8> undef to <2 x i64>
 772 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %a1zm = mul <2 x i64> %a1za, %a1zb
 773 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 108 for instruction: %a1z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %a1zm)
 774 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %a1sa = sext <2 x i8> undef to <2 x i64>
 775 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %a1sb = sext <2 x i8> undef to <2 x i64>
 776 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %a1sm = mul <2 x i64> %a1sa, %a1sb
 777 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 108 for instruction: %a1s = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %a1sm)
 778 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %a2za = zext <4 x i8> undef to <4 x i64>
 779 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %a2zb = zext <4 x i8> undef to <4 x i64>
 780 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 104 for instruction: %a2zm = mul <4 x i64> %a2za, %a2zb
 781 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 208 for instruction: %a2z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %a2zm)
 782 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 82 for instruction: %a2sa = sext <4 x i8> undef to <4 x i64>
 783 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 82 for instruction: %a2sb = sext <4 x i8> undef to <4 x i64>
 784 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 104 for instruction: %a2sm = mul <4 x i64> %a2sa, %a2sb
 785 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 208 for instruction: %a2s = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %a2sm)
 786 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 74 for instruction: %a3za = zext <8 x i8> undef to <8 x i64>
 787 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 74 for instruction: %a3zb = zext <8 x i8> undef to <8 x i64>
 788 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 208 for instruction: %a3zm = mul <8 x i64> %a3za, %a3zb
 789 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 408 for instruction: %a3z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %a3zm)
 790 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 330 for instruction: %a3sa = sext <8 x i8> undef to <8 x i64>
 791 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 330 for instruction: %a3sb = sext <8 x i8> undef to <8 x i64>
 792 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 208 for instruction: %a3sm = mul <8 x i64> %a3sa, %a3sb
 793 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 408 for instruction: %a3s = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %a3sm)
 794 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 298 for instruction: %a4za = zext <16 x i8> undef to <16 x i64>
 795 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 298 for instruction: %a4zb = zext <16 x i8> undef to <16 x i64>
 796 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 416 for instruction: %a4zm = mul <16 x i64> %a4za, %a4zb
 797 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 808 for instruction: %a4z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %a4zm)
 798 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1322 for instruction: %a4sa = sext <16 x i8> undef to <16 x i64>
 799 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1322 for instruction: %a4sb = sext <16 x i8> undef to <16 x i64>
 800 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 416 for instruction: %a4sm = mul <16 x i64> %a4sa, %a4sb
 801 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 808 for instruction: %a4s = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %a4sm)
 802 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %a5za = zext <1 x i16> undef to <1 x i64>
 803 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %a5zb = zext <1 x i16> undef to <1 x i64>
 804 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a5zm = mul <1 x i64> %a5za, %a5zb
 805 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %a5z = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> %a5zm)
 806 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %a5sa = sext <1 x i16> undef to <1 x i64>
 807 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %a5sb = sext <1 x i16> undef to <1 x i64>
 808 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a5sm = mul <1 x i64> %a5sa, %a5sb
 809 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %a5s = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> %a5sm)
 810 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a6za = zext <2 x i16> undef to <2 x i64>
 811 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a6zb = zext <2 x i16> undef to <2 x i64>
 812 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %a6zm = mul <2 x i64> %a6za, %a6zb
 813 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 108 for instruction: %a6z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %a6zm)
 814 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %a6sa = sext <2 x i16> undef to <2 x i64>
 815 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %a6sb = sext <2 x i16> undef to <2 x i64>
 816 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %a6sm = mul <2 x i64> %a6sa, %a6sb
 817 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 108 for instruction: %a6s = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %a6sm)
 818 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %a7za = zext <4 x i16> undef to <4 x i64>
 819 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %a7zb = zext <4 x i16> undef to <4 x i64>
 820 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 104 for instruction: %a7zm = mul <4 x i64> %a7za, %a7zb
 821 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 208 for instruction: %a7z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %a7zm)
 822 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 82 for instruction: %a7sa = sext <4 x i16> undef to <4 x i64>
 823 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 82 for instruction: %a7sb = sext <4 x i16> undef to <4 x i64>
 824 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 104 for instruction: %a7sm = mul <4 x i64> %a7sa, %a7sb
 825 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 208 for instruction: %a7s = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %a7sm)
 826 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 74 for instruction: %a8za = zext <8 x i16> undef to <8 x i64>
 827 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 74 for instruction: %a8zb = zext <8 x i16> undef to <8 x i64>
 828 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 208 for instruction: %a8zm = mul <8 x i64> %a8za, %a8zb
 829 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 408 for instruction: %a8z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %a8zm)
 830 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 330 for instruction: %a8sa = sext <8 x i16> undef to <8 x i64>
 831 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 330 for instruction: %a8sb = sext <8 x i16> undef to <8 x i64>
 832 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 208 for instruction: %a8sm = mul <8 x i64> %a8sa, %a8sb
 833 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 408 for instruction: %a8s = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %a8sm)
 834 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 296 for instruction: %a9za = zext <16 x i16> undef to <16 x i64>
 835 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 296 for instruction: %a9zb = zext <16 x i16> undef to <16 x i64>
 836 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 416 for instruction: %a9zm = mul <16 x i64> %a9za, %a9zb
 837 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 808 for instruction: %a9z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %a9zm)
 838 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1320 for instruction: %a9sa = sext <16 x i16> undef to <16 x i64>
 839 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1320 for instruction: %a9sb = sext <16 x i16> undef to <16 x i64>
 840 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 416 for instruction: %a9sm = mul <16 x i64> %a9sa, %a9sb
 841 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 808 for instruction: %a9s = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %a9sm)
 842 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %a10za = zext <1 x i32> undef to <1 x i64>
 843 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %a10zb = zext <1 x i32> undef to <1 x i64>
 844 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a10zm = mul <1 x i64> %a10za, %a10zb
 845 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %a10z = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> %a10zm)
 846 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %a10sa = sext <1 x i32> undef to <1 x i64>
 847 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %a10sb = sext <1 x i32> undef to <1 x i64>
 848 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a10sm = mul <1 x i64> %a10sa, %a10sb
 849 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %a10s = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> %a10sm)
 850 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a11za = zext <2 x i32> undef to <2 x i64>
 851 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a11zb = zext <2 x i32> undef to <2 x i64>
 852 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %a11zm = mul <2 x i64> %a11za, %a11zb
 853 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 108 for instruction: %a11z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %a11zm)
 854 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %a11sa = sext <2 x i32> undef to <2 x i64>
 855 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %a11sb = sext <2 x i32> undef to <2 x i64>
 856 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %a11sm = mul <2 x i64> %a11sa, %a11sb
 857 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 108 for instruction: %a11s = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %a11sm)
 858 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %a12za = zext <4 x i32> undef to <4 x i64>
 859 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %a12zb = zext <4 x i32> undef to <4 x i64>
 860 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 104 for instruction: %a12zm = mul <4 x i64> %a12za, %a12zb
 861 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 208 for instruction: %a12z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %a12zm)
 862 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %a12sa = sext <4 x i32> undef to <4 x i64>
 863 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %a12sb = sext <4 x i32> undef to <4 x i64>
 864 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 104 for instruction: %a12sm = mul <4 x i64> %a12sa, %a12sb
 865 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 208 for instruction: %a12s = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %a12sm)
 866 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 72 for instruction: %a13za = zext <8 x i32> undef to <8 x i64>
 867 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 72 for instruction: %a13zb = zext <8 x i32> undef to <8 x i64>
 868 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 208 for instruction: %a13zm = mul <8 x i64> %a13za, %a13zb
 869 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 408 for instruction: %a13z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %a13zm)
 870 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 264 for instruction: %a13sa = sext <8 x i32> undef to <8 x i64>
 871 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 264 for instruction: %a13sb = sext <8 x i32> undef to <8 x i64>
 872 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 208 for instruction: %a13sm = mul <8 x i64> %a13sa, %a13sb
 873 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 408 for instruction: %a13s = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %a13sm)
 874 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 288 for instruction: %a14za = zext <16 x i32> undef to <16 x i64>
 875 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 288 for instruction: %a14zb = zext <16 x i32> undef to <16 x i64>
 876 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 416 for instruction: %a14zm = mul <16 x i64> %a14za, %a14zb
 877 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 808 for instruction: %a14z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %a14zm)
 878 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1056 for instruction: %a14sa = sext <16 x i32> undef to <16 x i64>
 879 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1056 for instruction: %a14sb = sext <16 x i32> undef to <16 x i64>
 880 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 416 for instruction: %a14sm = mul <16 x i64> %a14sa, %a14sb
 881 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 808 for instruction: %a14s = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %a14sm)
 882 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a15m = mul <1 x i64> undef, undef
 883 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %a15 = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> %a15m)
 884 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %a16m = mul <2 x i64> undef, undef
 885 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 108 for instruction: %a16 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %a16m)
 886 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %a17m = mul <4 x i64> undef, undef
 887 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 208 for instruction: %a17 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %a17m)
 888 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %a18m = mul <8 x i64> undef, undef
 889 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 408 for instruction: %a18 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %a18m)
 890 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %a19m = mul <16 x i64> undef, undef
 891 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 808 for instruction: %a19 = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %a19m)
 892 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 893 ;
 894   %a0za = zext <1 x i8> undef to <1 x i64>
 895   %a0zb = zext <1 x i8> undef to <1 x i64>
 896   %a0zm = mul <1 x i64> %a0za, %a0zb
 897   %a0z = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> %a0zm)
 898
 899   %a0sa = sext <1 x i8> undef to <1 x i64>
 900   %a0sb = sext <1 x i8> undef to <1 x i64>
 901   %a0sm = mul <1 x i64> %a0sa, %a0sb
 902   %a0s = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> %a0sm)
 903
 904   %a1za = zext <2 x i8> undef to <2 x i64>
 905   %a1zb = zext <2 x i8> undef to <2 x i64>
 906   %a1zm = mul <2 x i64> %a1za, %a1zb
 907   %a1z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %a1zm)
 908
 909   %a1sa = sext <2 x i8> undef to <2 x i64>
 910   %a1sb = sext <2 x i8> undef to <2 x i64>
 911   %a1sm = mul <2 x i64> %a1sa, %a1sb
 912   %a1s = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %a1sm)
 913
 914   %a2za = zext <4 x i8> undef to <4 x i64>
 915   %a2zb = zext <4 x i8> undef to <4 x i64>
 916   %a2zm = mul <4 x i64> %a2za, %a2zb
 917   %a2z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %a2zm)
 918
 919   %a2sa = sext <4 x i8> undef to <4 x i64>
 920   %a2sb = sext <4 x i8> undef to <4 x i64>
 921   %a2sm = mul <4 x i64> %a2sa, %a2sb
 922   %a2s = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %a2sm)
 923
 924   %a3za = zext <8 x i8> undef to <8 x i64>
 925   %a3zb = zext <8 x i8> undef to <8 x i64>
 926   %a3zm = mul <8 x i64> %a3za, %a3zb
 927   %a3z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %a3zm)
 928
 929   %a3sa = sext <8 x i8> undef to <8 x i64>
 930   %a3sb = sext <8 x i8> undef to <8 x i64>
 931   %a3sm = mul <8 x i64> %a3sa, %a3sb
 932   %a3s = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %a3sm)
 933
 934   %a4za = zext <16 x i8> undef to <16 x i64>
 935   %a4zb = zext <16 x i8> undef to <16 x i64>
 936   %a4zm = mul <16 x i64> %a4za, %a4zb
 937   %a4z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %a4zm)
 938
 939   %a4sa = sext <16 x i8> undef to <16 x i64>
 940   %a4sb = sext <16 x i8> undef to <16 x i64>
 941   %a4sm = mul <16 x i64> %a4sa, %a4sb
 942   %a4s = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %a4sm)
 943
 944   %a5za = zext <1 x i16> undef to <1 x i64>
 945   %a5zb = zext <1 x i16> undef to <1 x i64>
 946   %a5zm = mul <1 x i64> %a5za, %a5zb
 947   %a5z = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> %a5zm)
 948
 949   %a5sa = sext <1 x i16> undef to <1 x i64>
 950   %a5sb = sext <1 x i16> undef to <1 x i64>
 951   %a5sm = mul <1 x i64> %a5sa, %a5sb
 952   %a5s = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> %a5sm)
 953
 954   %a6za = zext <2 x i16> undef to <2 x i64>
 955   %a6zb = zext <2 x i16> undef to <2 x i64>
 956   %a6zm = mul <2 x i64> %a6za, %a6zb
 957   %a6z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %a6zm)
 958
 959   %a6sa = sext <2 x i16> undef to <2 x i64>
 960   %a6sb = sext <2 x i16> undef to <2 x i64>
 961   %a6sm = mul <2 x i64> %a6sa, %a6sb
 962   %a6s = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %a6sm)
 963
 964   %a7za = zext <4 x i16> undef to <4 x i64>
 965   %a7zb = zext <4 x i16> undef to <4 x i64>
 966   %a7zm = mul <4 x i64> %a7za, %a7zb
 967   %a7z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %a7zm)
 968
 969   %a7sa = sext <4 x i16> undef to <4 x i64>
 970   %a7sb = sext <4 x i16> undef to <4 x i64>
 971   %a7sm = mul <4 x i64> %a7sa, %a7sb
 972   %a7s = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %a7sm)
 973
 974   %a8za = zext <8 x i16> undef to <8 x i64>
 975   %a8zb = zext <8 x i16> undef to <8 x i64>
 976   %a8zm = mul <8 x i64> %a8za, %a8zb
 977   %a8z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %a8zm)
 978
 979   %a8sa = sext <8 x i16> undef to <8 x i64>
 980   %a8sb = sext <8 x i16> undef to <8 x i64>
 981   %a8sm = mul <8 x i64> %a8sa, %a8sb
 982   %a8s = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %a8sm)
 983
 984   %a9za = zext <16 x i16> undef to <16 x i64>
 985   %a9zb = zext <16 x i16> undef to <16 x i64>
 986   %a9zm = mul <16 x i64> %a9za, %a9zb
 987   %a9z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %a9zm)
 988
 989   %a9sa = sext <16 x i16> undef to <16 x i64>
 990   %a9sb = sext <16 x i16> undef to <16 x i64>
 991   %a9sm = mul <16 x i64> %a9sa, %a9sb
 992   %a9s = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %a9sm)
 993
 994   %a10za = zext <1 x i32> undef to <1 x i64>
 995   %a10zb = zext <1 x i32> undef to <1 x i64>
 996   %a10zm = mul <1 x i64> %a10za, %a10zb
 997   %a10z = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> %a10zm)
 998
 999   %a10sa = sext <1 x i32> undef to <1 x i64>
1000   %a10sb = sext <1 x i32> undef to <1 x i64>
1001   %a10sm = mul <1 x i64> %a10sa, %a10sb
1002   %a10s = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> %a10sm)
1003
1004   %a11za = zext <2 x i32> undef to <2 x i64>
1005   %a11zb = zext <2 x i32> undef to <2 x i64>
1006   %a11zm = mul <2 x i64> %a11za, %a11zb
1007   %a11z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %a11zm)
1008
1009   %a11sa = sext <2 x i32> undef to <2 x i64>
1010   %a11sb = sext <2 x i32> undef to <2 x i64>
1011   %a11sm = mul <2 x i64> %a11sa, %a11sb
1012   %a11s = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %a11sm)
1013
1014   %a12za = zext <4 x i32> undef to <4 x i64>
1015   %a12zb = zext <4 x i32> undef to <4 x i64>
1016   %a12zm = mul <4 x i64> %a12za, %a12zb
1017   %a12z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %a12zm)
1018
1019   %a12sa = sext <4 x i32> undef to <4 x i64>
1020   %a12sb = sext <4 x i32> undef to <4 x i64>
1021   %a12sm = mul <4 x i64> %a12sa, %a12sb
1022   %a12s = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %a12sm)
1023
1024   %a13za = zext <8 x i32> undef to <8 x i64>
1025   %a13zb = zext <8 x i32> undef to <8 x i64>
1026   %a13zm = mul <8 x i64> %a13za, %a13zb
1027   %a13z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %a13zm)
1028
1029   %a13sa = sext <8 x i32> undef to <8 x i64>
1030   %a13sb = sext <8 x i32> undef to <8 x i64>
1031   %a13sm = mul <8 x i64> %a13sa, %a13sb
1032   %a13s = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %a13sm)
1033
1034   %a14za = zext <16 x i32> undef to <16 x i64>
1035   %a14zb = zext <16 x i32> undef to <16 x i64>
1036   %a14zm = mul <16 x i64> %a14za, %a14zb
1037   %a14z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %a14zm)
1038
1039   %a14sa = sext <16 x i32> undef to <16 x i64>
1040   %a14sb = sext <16 x i32> undef to <16 x i64>
1041   %a14sm = mul <16 x i64> %a14sa, %a14sb
1042   %a14s = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %a14sm)
1043
1044   %a15m = mul <1 x i64> undef, undef
1045   %a15 = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> %a15m)
1046
1047   %a16m = mul <2 x i64> undef, undef
1048   %a16 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %a16m)
1049
1050   %a17m = mul <4 x i64> undef, undef
1051   %a17 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %a17m)
1052
1053   %a18m = mul <8 x i64> undef, undef
1054   %a18 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %a18m)
1055
1056   %a19m = mul <16 x i64> undef, undef
1057   %a19 = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %a19m)
1058
1059   ret void
1060 }
1061
; Declarations of the llvm.vector.reduce.add intrinsics, grouped by element
; type and listed in ascending lane count (1, 2, 4, 8, 16) within each group.

; i8 elements
declare i8 @llvm.vector.reduce.add.v1i8(<1 x i8>)
declare i8 @llvm.vector.reduce.add.v2i8(<2 x i8>)
declare i8 @llvm.vector.reduce.add.v4i8(<4 x i8>)
declare i8 @llvm.vector.reduce.add.v8i8(<8 x i8>)
declare i8 @llvm.vector.reduce.add.v16i8(<16 x i8>)

; i16 elements
declare i16 @llvm.vector.reduce.add.v1i16(<1 x i16>)
declare i16 @llvm.vector.reduce.add.v2i16(<2 x i16>)
declare i16 @llvm.vector.reduce.add.v4i16(<4 x i16>)
declare i16 @llvm.vector.reduce.add.v8i16(<8 x i16>)
declare i16 @llvm.vector.reduce.add.v16i16(<16 x i16>)

; i32 elements
declare i32 @llvm.vector.reduce.add.v1i32(<1 x i32>)
declare i32 @llvm.vector.reduce.add.v2i32(<2 x i32>)
declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)
declare i32 @llvm.vector.reduce.add.v8i32(<8 x i32>)
declare i32 @llvm.vector.reduce.add.v16i32(<16 x i32>)

; i64 elements
declare i64 @llvm.vector.reduce.add.v1i64(<1 x i64>)
declare i64 @llvm.vector.reduce.add.v2i64(<2 x i64>)
declare i64 @llvm.vector.reduce.add.v4i64(<4 x i64>)
declare i64 @llvm.vector.reduce.add.v8i64(<8 x i64>)
declare i64 @llvm.vector.reduce.add.v16i64(<16 x i64>)