llvm/test/Analysis/CostModel/X86/reduce-umax.ll

   1 ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
   2 ; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+sse2 | FileCheck %s --check-prefixes=SSE2
   3 ; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+ssse3 | FileCheck %s --check-prefixes=SSSE3
   4 ; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE4,SSE41
   5 ; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+sse4.2 | FileCheck %s --check-prefixes=SSE4,SSE42
   6 ; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+avx | FileCheck %s --check-prefixes=AVX1
   7 ; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+avx2 | FileCheck %s --check-prefixes=AVX2
   8 ; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512,AVX512F
   9 ; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512BW
  10 ; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefixes=AVX512,AVX512DQ
  11
  12 define i32 @reduce_i64(i32 %arg) {
  13 ; SSE2-LABEL: 'reduce_i64'
  14 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.umax.v1i64(<1 x i64> undef)
  15 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2 = call i64 @llvm.vector.reduce.umax.v2i64(<2 x i64> undef)
  16 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4 = call i64 @llvm.vector.reduce.umax.v4i64(<4 x i64> undef)
  17 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V8 = call i64 @llvm.vector.reduce.umax.v8i64(<8 x i64> undef)
  18 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V16 = call i64 @llvm.vector.reduce.umax.v16i64(<16 x i64> undef)
  19 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
  20 ;
  21 ; SSSE3-LABEL: 'reduce_i64'
  22 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.umax.v1i64(<1 x i64> undef)
  23 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2 = call i64 @llvm.vector.reduce.umax.v2i64(<2 x i64> undef)
  24 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4 = call i64 @llvm.vector.reduce.umax.v4i64(<4 x i64> undef)
  25 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V8 = call i64 @llvm.vector.reduce.umax.v8i64(<8 x i64> undef)
  26 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V16 = call i64 @llvm.vector.reduce.umax.v16i64(<16 x i64> undef)
  27 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
  28 ;
  29 ; SSE4-LABEL: 'reduce_i64'
  30 ; SSE4-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.umax.v1i64(<1 x i64> undef)
  31 ; SSE4-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i64 @llvm.vector.reduce.umax.v2i64(<2 x i64> undef)
  32 ; SSE4-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4 = call i64 @llvm.vector.reduce.umax.v4i64(<4 x i64> undef)
  33 ; SSE4-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8 = call i64 @llvm.vector.reduce.umax.v8i64(<8 x i64> undef)
  34 ; SSE4-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V16 = call i64 @llvm.vector.reduce.umax.v16i64(<16 x i64> undef)
  35 ; SSE4-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
  36 ;
  37 ; AVX1-LABEL: 'reduce_i64'
  38 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.umax.v1i64(<1 x i64> undef)
  39 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2 = call i64 @llvm.vector.reduce.umax.v2i64(<2 x i64> undef)
  40 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V4 = call i64 @llvm.vector.reduce.umax.v4i64(<4 x i64> undef)
  41 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V8 = call i64 @llvm.vector.reduce.umax.v8i64(<8 x i64> undef)
  42 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V16 = call i64 @llvm.vector.reduce.umax.v16i64(<16 x i64> undef)
  43 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
  44 ;
  45 ; AVX2-LABEL: 'reduce_i64'
  46 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.umax.v1i64(<1 x i64> undef)
  47 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i64 @llvm.vector.reduce.umax.v2i64(<2 x i64> undef)
  48 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i64 @llvm.vector.reduce.umax.v4i64(<4 x i64> undef)
  49 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8 = call i64 @llvm.vector.reduce.umax.v8i64(<8 x i64> undef)
  50 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i64 @llvm.vector.reduce.umax.v16i64(<16 x i64> undef)
  51 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
  52 ;
  53 ; AVX512-LABEL: 'reduce_i64'
  54 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.umax.v1i64(<1 x i64> undef)
  55 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.vector.reduce.umax.v2i64(<2 x i64> undef)
  56 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i64 @llvm.vector.reduce.umax.v4i64(<4 x i64> undef)
  57 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i64 @llvm.vector.reduce.umax.v8i64(<8 x i64> undef)
  58 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i64 @llvm.vector.reduce.umax.v16i64(<16 x i64> undef)
  59 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
  60 ;
  61   %V1  = call i64 @llvm.vector.reduce.umax.v1i64(<1 x i64> undef)
  62   %V2  = call i64 @llvm.vector.reduce.umax.v2i64(<2 x i64> undef)
  63   %V4  = call i64 @llvm.vector.reduce.umax.v4i64(<4 x i64> undef)
  64   %V8  = call i64 @llvm.vector.reduce.umax.v8i64(<8 x i64> undef)
  65   %V16 = call i64 @llvm.vector.reduce.umax.v16i64(<16 x i64> undef)
  66   ret i32 undef
  67 }
  68
  69 define i32 @reduce_i32(i32 %arg) {
  70 ; SSE2-LABEL: 'reduce_i32'
  71 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i32 @llvm.vector.reduce.umax.v2i32(<2 x i32> undef)
  72 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> undef)
  73 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8 = call i32 @llvm.vector.reduce.umax.v8i32(<8 x i32> undef)
  74 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i32 @llvm.vector.reduce.umax.v16i32(<16 x i32> undef)
  75 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V32 = call i32 @llvm.vector.reduce.umax.v32i32(<32 x i32> undef)
  76 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
  77 ;
  78 ; SSSE3-LABEL: 'reduce_i32'
  79 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i32 @llvm.vector.reduce.umax.v2i32(<2 x i32> undef)
  80 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> undef)
  81 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8 = call i32 @llvm.vector.reduce.umax.v8i32(<8 x i32> undef)
  82 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i32 @llvm.vector.reduce.umax.v16i32(<16 x i32> undef)
  83 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V32 = call i32 @llvm.vector.reduce.umax.v32i32(<32 x i32> undef)
  84 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
  85 ;
  86 ; SSE4-LABEL: 'reduce_i32'
  87 ; SSE4-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.vector.reduce.umax.v2i32(<2 x i32> undef)
  88 ; SSE4-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> undef)
  89 ; SSE4-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i32 @llvm.vector.reduce.umax.v8i32(<8 x i32> undef)
  90 ; SSE4-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i32 @llvm.vector.reduce.umax.v16i32(<16 x i32> undef)
  91 ; SSE4-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V32 = call i32 @llvm.vector.reduce.umax.v32i32(<32 x i32> undef)
  92 ; SSE4-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
  93 ;
  94 ; AVX1-LABEL: 'reduce_i32'
  95 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.vector.reduce.umax.v2i32(<2 x i32> undef)
  96 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> undef)
  97 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i32 @llvm.vector.reduce.umax.v8i32(<8 x i32> undef)
  98 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V16 = call i32 @llvm.vector.reduce.umax.v16i32(<16 x i32> undef)
  99 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V32 = call i32 @llvm.vector.reduce.umax.v32i32(<32 x i32> undef)
 100 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 101 ;
 102 ; AVX2-LABEL: 'reduce_i32'
 103 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.vector.reduce.umax.v2i32(<2 x i32> undef)
 104 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> undef)
 105 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i32 @llvm.vector.reduce.umax.v8i32(<8 x i32> undef)
 106 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i32 @llvm.vector.reduce.umax.v16i32(<16 x i32> undef)
 107 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i32 @llvm.vector.reduce.umax.v32i32(<32 x i32> undef)
 108 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 109 ;
 110 ; AVX512-LABEL: 'reduce_i32'
 111 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.vector.reduce.umax.v2i32(<2 x i32> undef)
 112 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> undef)
 113 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i32 @llvm.vector.reduce.umax.v8i32(<8 x i32> undef)
 114 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i32 @llvm.vector.reduce.umax.v16i32(<16 x i32> undef)
 115 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i32 @llvm.vector.reduce.umax.v32i32(<32 x i32> undef)
 116 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 117 ;
 118   %V2  = call i32 @llvm.vector.reduce.umax.v2i32(<2 x i32> undef)
 119   %V4  = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> undef)
 120   %V8  = call i32 @llvm.vector.reduce.umax.v8i32(<8 x i32> undef)
 121   %V16 = call i32 @llvm.vector.reduce.umax.v16i32(<16 x i32> undef)
 122   %V32 = call i32 @llvm.vector.reduce.umax.v32i32(<32 x i32> undef)
 123   ret i32 undef
 124 }
 125
 126 define i32 @reduce_i16(i32 %arg) {
 127 ; SSE2-LABEL: 'reduce_i16'
 128 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2 = call i16 @llvm.vector.reduce.umax.v2i16(<2 x i16> undef)
 129 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i16 @llvm.vector.reduce.umax.v4i16(<4 x i16> undef)
 130 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8 = call i16 @llvm.vector.reduce.umax.v8i16(<8 x i16> undef)
 131 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V16 = call i16 @llvm.vector.reduce.umax.v16i16(<16 x i16> undef)
 132 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V32 = call i16 @llvm.vector.reduce.umax.v32i16(<32 x i16> undef)
 133 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V64 = call i16 @llvm.vector.reduce.umax.v64i16(<64 x i16> undef)
 134 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 135 ;
 136 ; SSSE3-LABEL: 'reduce_i16'
 137 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2 = call i16 @llvm.vector.reduce.umax.v2i16(<2 x i16> undef)
 138 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i16 @llvm.vector.reduce.umax.v4i16(<4 x i16> undef)
 139 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8 = call i16 @llvm.vector.reduce.umax.v8i16(<8 x i16> undef)
 140 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V16 = call i16 @llvm.vector.reduce.umax.v16i16(<16 x i16> undef)
 141 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V32 = call i16 @llvm.vector.reduce.umax.v32i16(<32 x i16> undef)
 142 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V64 = call i16 @llvm.vector.reduce.umax.v64i16(<64 x i16> undef)
 143 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 144 ;
 145 ; SSE4-LABEL: 'reduce_i16'
 146 ; SSE4-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2 = call i16 @llvm.vector.reduce.umax.v2i16(<2 x i16> undef)
 147 ; SSE4-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i16 @llvm.vector.reduce.umax.v4i16(<4 x i16> undef)
 148 ; SSE4-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.vector.reduce.umax.v8i16(<8 x i16> undef)
 149 ; SSE4-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16 = call i16 @llvm.vector.reduce.umax.v16i16(<16 x i16> undef)
 150 ; SSE4-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V32 = call i16 @llvm.vector.reduce.umax.v32i16(<32 x i16> undef)
 151 ; SSE4-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V64 = call i16 @llvm.vector.reduce.umax.v64i16(<64 x i16> undef)
 152 ; SSE4-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 153 ;
 154 ; AVX1-LABEL: 'reduce_i16'
 155 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2 = call i16 @llvm.vector.reduce.umax.v2i16(<2 x i16> undef)
 156 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i16 @llvm.vector.reduce.umax.v4i16(<4 x i16> undef)
 157 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.vector.reduce.umax.v8i16(<8 x i16> undef)
 158 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i16 @llvm.vector.reduce.umax.v16i16(<16 x i16> undef)
 159 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i16 @llvm.vector.reduce.umax.v32i16(<32 x i16> undef)
 160 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V64 = call i16 @llvm.vector.reduce.umax.v64i16(<64 x i16> undef)
 161 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 162 ;
 163 ; AVX2-LABEL: 'reduce_i16'
 164 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2 = call i16 @llvm.vector.reduce.umax.v2i16(<2 x i16> undef)
 165 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i16 @llvm.vector.reduce.umax.v4i16(<4 x i16> undef)
 166 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.vector.reduce.umax.v8i16(<8 x i16> undef)
 167 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i16 @llvm.vector.reduce.umax.v16i16(<16 x i16> undef)
 168 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V32 = call i16 @llvm.vector.reduce.umax.v32i16(<32 x i16> undef)
 169 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V64 = call i16 @llvm.vector.reduce.umax.v64i16(<64 x i16> undef)
 170 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 171 ;
 172 ; AVX512F-LABEL: 'reduce_i16'
 173 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2 = call i16 @llvm.vector.reduce.umax.v2i16(<2 x i16> undef)
 174 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i16 @llvm.vector.reduce.umax.v4i16(<4 x i16> undef)
 175 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.vector.reduce.umax.v8i16(<8 x i16> undef)
 176 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i16 @llvm.vector.reduce.umax.v16i16(<16 x i16> undef)
 177 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i16 @llvm.vector.reduce.umax.v32i16(<32 x i16> undef)
 178 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V64 = call i16 @llvm.vector.reduce.umax.v64i16(<64 x i16> undef)
 179 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 180 ;
 181 ; AVX512BW-LABEL: 'reduce_i16'
 182 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2 = call i16 @llvm.vector.reduce.umax.v2i16(<2 x i16> undef)
 183 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i16 @llvm.vector.reduce.umax.v4i16(<4 x i16> undef)
 184 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.vector.reduce.umax.v8i16(<8 x i16> undef)
 185 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i16 @llvm.vector.reduce.umax.v16i16(<16 x i16> undef)
 186 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i16 @llvm.vector.reduce.umax.v32i16(<32 x i16> undef)
 187 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V64 = call i16 @llvm.vector.reduce.umax.v64i16(<64 x i16> undef)
 188 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 189 ;
 190 ; AVX512DQ-LABEL: 'reduce_i16'
 191 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2 = call i16 @llvm.vector.reduce.umax.v2i16(<2 x i16> undef)
 192 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i16 @llvm.vector.reduce.umax.v4i16(<4 x i16> undef)
 193 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.vector.reduce.umax.v8i16(<8 x i16> undef)
 194 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i16 @llvm.vector.reduce.umax.v16i16(<16 x i16> undef)
 195 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i16 @llvm.vector.reduce.umax.v32i16(<32 x i16> undef)
 196 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V64 = call i16 @llvm.vector.reduce.umax.v64i16(<64 x i16> undef)
 197 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 198 ;
 199   %V2  = call i16 @llvm.vector.reduce.umax.v2i16(<2 x i16> undef)
 200   %V4  = call i16 @llvm.vector.reduce.umax.v4i16(<4 x i16> undef)
 201   %V8  = call i16 @llvm.vector.reduce.umax.v8i16(<8 x i16> undef)
 202   %V16 = call i16 @llvm.vector.reduce.umax.v16i16(<16 x i16> undef)
 203   %V32 = call i16 @llvm.vector.reduce.umax.v32i16(<32 x i16> undef)
 204   %V64 = call i16 @llvm.vector.reduce.umax.v64i16(<64 x i16> undef)
 205   ret i32 undef
 206 }
 207
 208 define i32 @reduce_i8(i32 %arg) {
 209 ; SSE2-LABEL: 'reduce_i8'
 210 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.vector.reduce.umax.v2i8(<2 x i8> undef)
 211 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.vector.reduce.umax.v4i8(<4 x i8> undef)
 212 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.vector.reduce.umax.v8i8(<8 x i8> undef)
 213 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.vector.reduce.umax.v16i8(<16 x i8> undef)
 214 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i8 @llvm.vector.reduce.umax.v32i8(<32 x i8> undef)
 215 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i8 @llvm.vector.reduce.umax.v64i8(<64 x i8> undef)
 216 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i8 @llvm.vector.reduce.umax.v128i8(<128 x i8> undef)
 217 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 218 ;
 219 ; SSSE3-LABEL: 'reduce_i8'
 220 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.vector.reduce.umax.v2i8(<2 x i8> undef)
 221 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.vector.reduce.umax.v4i8(<4 x i8> undef)
 222 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.vector.reduce.umax.v8i8(<8 x i8> undef)
 223 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.vector.reduce.umax.v16i8(<16 x i8> undef)
 224 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i8 @llvm.vector.reduce.umax.v32i8(<32 x i8> undef)
 225 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i8 @llvm.vector.reduce.umax.v64i8(<64 x i8> undef)
 226 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i8 @llvm.vector.reduce.umax.v128i8(<128 x i8> undef)
 227 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 228 ;
 229 ; SSE4-LABEL: 'reduce_i8'
 230 ; SSE4-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.vector.reduce.umax.v2i8(<2 x i8> undef)
 231 ; SSE4-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.vector.reduce.umax.v4i8(<4 x i8> undef)
 232 ; SSE4-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.vector.reduce.umax.v8i8(<8 x i8> undef)
 233 ; SSE4-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.vector.reduce.umax.v16i8(<16 x i8> undef)
 234 ; SSE4-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V32 = call i8 @llvm.vector.reduce.umax.v32i8(<32 x i8> undef)
 235 ; SSE4-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V64 = call i8 @llvm.vector.reduce.umax.v64i8(<64 x i8> undef)
 236 ; SSE4-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V128 = call i8 @llvm.vector.reduce.umax.v128i8(<128 x i8> undef)
 237 ; SSE4-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 238 ;
 239 ; AVX1-LABEL: 'reduce_i8'
 240 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.vector.reduce.umax.v2i8(<2 x i8> undef)
 241 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.vector.reduce.umax.v4i8(<4 x i8> undef)
 242 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.vector.reduce.umax.v8i8(<8 x i8> undef)
 243 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.vector.reduce.umax.v16i8(<16 x i8> undef)
 244 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i8 @llvm.vector.reduce.umax.v32i8(<32 x i8> undef)
 245 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i8 @llvm.vector.reduce.umax.v64i8(<64 x i8> undef)
 246 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V128 = call i8 @llvm.vector.reduce.umax.v128i8(<128 x i8> undef)
 247 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 248 ;
 249 ; AVX2-LABEL: 'reduce_i8'
 250 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.vector.reduce.umax.v2i8(<2 x i8> undef)
 251 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.vector.reduce.umax.v4i8(<4 x i8> undef)
 252 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.vector.reduce.umax.v8i8(<8 x i8> undef)
 253 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.vector.reduce.umax.v16i8(<16 x i8> undef)
 254 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i8 @llvm.vector.reduce.umax.v32i8(<32 x i8> undef)
 255 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V64 = call i8 @llvm.vector.reduce.umax.v64i8(<64 x i8> undef)
 256 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V128 = call i8 @llvm.vector.reduce.umax.v128i8(<128 x i8> undef)
 257 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 258 ;
 259 ; AVX512F-LABEL: 'reduce_i8'
 260 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.vector.reduce.umax.v2i8(<2 x i8> undef)
 261 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.vector.reduce.umax.v4i8(<4 x i8> undef)
 262 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.vector.reduce.umax.v8i8(<8 x i8> undef)
 263 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.vector.reduce.umax.v16i8(<16 x i8> undef)
 264 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i8 @llvm.vector.reduce.umax.v32i8(<32 x i8> undef)
 265 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V64 = call i8 @llvm.vector.reduce.umax.v64i8(<64 x i8> undef)
 266 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i8 @llvm.vector.reduce.umax.v128i8(<128 x i8> undef)
 267 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 268 ;
 269 ; AVX512BW-LABEL: 'reduce_i8'
 270 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.vector.reduce.umax.v2i8(<2 x i8> undef)
 271 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.vector.reduce.umax.v4i8(<4 x i8> undef)
 272 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.vector.reduce.umax.v8i8(<8 x i8> undef)
 273 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.vector.reduce.umax.v16i8(<16 x i8> undef)
 274 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i8 @llvm.vector.reduce.umax.v32i8(<32 x i8> undef)
 275 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V64 = call i8 @llvm.vector.reduce.umax.v64i8(<64 x i8> undef)
 276 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V128 = call i8 @llvm.vector.reduce.umax.v128i8(<128 x i8> undef)
 277 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 278 ;
 279 ; AVX512DQ-LABEL: 'reduce_i8'
 280 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.vector.reduce.umax.v2i8(<2 x i8> undef)
 281 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.vector.reduce.umax.v4i8(<4 x i8> undef)
 282 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.vector.reduce.umax.v8i8(<8 x i8> undef)
 283 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.vector.reduce.umax.v16i8(<16 x i8> undef)
 284 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i8 @llvm.vector.reduce.umax.v32i8(<32 x i8> undef)
 285 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V64 = call i8 @llvm.vector.reduce.umax.v64i8(<64 x i8> undef)
 286 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i8 @llvm.vector.reduce.umax.v128i8(<128 x i8> undef)
 287 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 288 ;
 289   %V2   = call i8 @llvm.vector.reduce.umax.v2i8(<2 x i8> undef)
 290   %V4   = call i8 @llvm.vector.reduce.umax.v4i8(<4 x i8> undef)
 291   %V8   = call i8 @llvm.vector.reduce.umax.v8i8(<8 x i8> undef)
 292   %V16  = call i8 @llvm.vector.reduce.umax.v16i8(<16 x i8> undef)
 293   %V32  = call i8 @llvm.vector.reduce.umax.v32i8(<32 x i8> undef)
 294   %V64  = call i8 @llvm.vector.reduce.umax.v64i8(<64 x i8> undef)
 295   %V128 = call i8 @llvm.vector.reduce.umax.v128i8(<128 x i8> undef)
 296   ret i32 undef
 297 }
 298
 299 declare i64 @llvm.vector.reduce.umax.v1i64(<1 x i64>)
 300 declare i64 @llvm.vector.reduce.umax.v2i64(<2 x i64>)
 301 declare i64 @llvm.vector.reduce.umax.v4i64(<4 x i64>)
 302 declare i64 @llvm.vector.reduce.umax.v8i64(<8 x i64>)
 303 declare i64 @llvm.vector.reduce.umax.v16i64(<16 x i64>)
 304
 305 declare i32 @llvm.vector.reduce.umax.v2i32(<2 x i32>)
 306 declare i32 @llvm.vector.reduce.umax.v4i32(<4 x i32>)
 307 declare i32 @llvm.vector.reduce.umax.v8i32(<8 x i32>)
 308 declare i32 @llvm.vector.reduce.umax.v16i32(<16 x i32>)
 309 declare i32 @llvm.vector.reduce.umax.v32i32(<32 x i32>)
 310
 311 declare i16 @llvm.vector.reduce.umax.v2i16(<2 x i16>)
 312 declare i16 @llvm.vector.reduce.umax.v4i16(<4 x i16>)
 313 declare i16 @llvm.vector.reduce.umax.v8i16(<8 x i16>)
 314 declare i16 @llvm.vector.reduce.umax.v16i16(<16 x i16>)
 315 declare i16 @llvm.vector.reduce.umax.v32i16(<32 x i16>)
 316 declare i16 @llvm.vector.reduce.umax.v64i16(<64 x i16>)
 317
 318 declare i8 @llvm.vector.reduce.umax.v2i8(<2 x i8>)
 319 declare i8 @llvm.vector.reduce.umax.v4i8(<4 x i8>)
 320 declare i8 @llvm.vector.reduce.umax.v8i8(<8 x i8>)
 321 declare i8 @llvm.vector.reduce.umax.v16i8(<16 x i8>)
 322 declare i8 @llvm.vector.reduce.umax.v32i8(<32 x i8>)
 323 declare i8 @llvm.vector.reduce.umax.v64i8(<64 x i8>)
 324 declare i8 @llvm.vector.reduce.umax.v128i8(<128 x i8>)
 325 ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
 326 ; SSE41: {{.*}}
 327 ; SSE42: {{.*}}