llvm/test/Analysis/CostModel/X86/reduce-fmax.ll

   1 ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
   2 ; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+sse2 | FileCheck %s --check-prefixes=SSE
   3 ; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+ssse3 | FileCheck %s --check-prefixes=SSE
   4 ; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE
   5 ; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+sse4.2 | FileCheck %s --check-prefixes=SSE
   6 ; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
   7 ; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
   8 ; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512
   9 ; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512
  10 ; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefixes=AVX512
  11
  12 define i32 @reduce_f64(i32 %arg) {
  13 ; SSE-LABEL: 'reduce_f64'
  14 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call double @llvm.vector.reduce.fmax.v1f64(<1 x double> undef)
  15 ; SSE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2 = call double @llvm.vector.reduce.fmax.v2f64(<2 x double> undef)
  16 ; SSE-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4 = call double @llvm.vector.reduce.fmax.v4f64(<4 x double> undef)
  17 ; SSE-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8 = call double @llvm.vector.reduce.fmax.v8f64(<8 x double> undef)
  18 ; SSE-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V16 = call double @llvm.vector.reduce.fmax.v16f64(<16 x double> undef)
  19 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
  20 ;
  21 ; AVX1-LABEL: 'reduce_f64'
  22 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call double @llvm.vector.reduce.fmax.v1f64(<1 x double> undef)
  23 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call double @llvm.vector.reduce.fmax.v2f64(<2 x double> undef)
  24 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call double @llvm.vector.reduce.fmax.v4f64(<4 x double> undef)
  25 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V8 = call double @llvm.vector.reduce.fmax.v8f64(<8 x double> undef)
  26 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V16 = call double @llvm.vector.reduce.fmax.v16f64(<16 x double> undef)
  27 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
  28 ;
  29 ; AVX2-LABEL: 'reduce_f64'
  30 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call double @llvm.vector.reduce.fmax.v1f64(<1 x double> undef)
  31 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call double @llvm.vector.reduce.fmax.v2f64(<2 x double> undef)
  32 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4 = call double @llvm.vector.reduce.fmax.v4f64(<4 x double> undef)
  33 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8 = call double @llvm.vector.reduce.fmax.v8f64(<8 x double> undef)
  34 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V16 = call double @llvm.vector.reduce.fmax.v16f64(<16 x double> undef)
  35 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
  36 ;
  37 ; AVX512-LABEL: 'reduce_f64'
  38 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call double @llvm.vector.reduce.fmax.v1f64(<1 x double> undef)
  39 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2 = call double @llvm.vector.reduce.fmax.v2f64(<2 x double> undef)
  40 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4 = call double @llvm.vector.reduce.fmax.v4f64(<4 x double> undef)
  41 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call double @llvm.vector.reduce.fmax.v8f64(<8 x double> undef)
  42 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V16 = call double @llvm.vector.reduce.fmax.v16f64(<16 x double> undef)
  43 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
  44 ;
  45   %V1  = call double @llvm.vector.reduce.fmax.v1f64(<1 x double> undef)
  46   %V2  = call double @llvm.vector.reduce.fmax.v2f64(<2 x double> undef)
  47   %V4  = call double @llvm.vector.reduce.fmax.v4f64(<4 x double> undef)
  48   %V8  = call double @llvm.vector.reduce.fmax.v8f64(<8 x double> undef)
  49   %V16 = call double @llvm.vector.reduce.fmax.v16f64(<16 x double> undef)
  50   ret i32 undef
  51 }
  52
  53 define i32 @reduce_f32(i32 %arg) {
  54 ; SSE-LABEL: 'reduce_f32'
  55 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call float @llvm.vector.reduce.fmax.v1f32(<1 x float> undef)
  56 ; SSE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2 = call float @llvm.vector.reduce.fmax.v2f32(<2 x float> undef)
  57 ; SSE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4 = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> undef)
  58 ; SSE-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call float @llvm.vector.reduce.fmax.v8f32(<8 x float> undef)
  59 ; SSE-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V16 = call float @llvm.vector.reduce.fmax.v16f32(<16 x float> undef)
  60 ; SSE-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V32 = call float @llvm.vector.reduce.fmax.v32f32(<32 x float> undef)
  61 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
  62 ;
  63 ; AVX1-LABEL: 'reduce_f32'
  64 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call float @llvm.vector.reduce.fmax.v1f32(<1 x float> undef)
  65 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call float @llvm.vector.reduce.fmax.v2f32(<2 x float> undef)
  66 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> undef)
  67 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8 = call float @llvm.vector.reduce.fmax.v8f32(<8 x float> undef)
  68 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V16 = call float @llvm.vector.reduce.fmax.v16f32(<16 x float> undef)
  69 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V32 = call float @llvm.vector.reduce.fmax.v32f32(<32 x float> undef)
  70 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
  71 ;
  72 ; AVX2-LABEL: 'reduce_f32'
  73 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call float @llvm.vector.reduce.fmax.v1f32(<1 x float> undef)
  74 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call float @llvm.vector.reduce.fmax.v2f32(<2 x float> undef)
  75 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4 = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> undef)
  76 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8 = call float @llvm.vector.reduce.fmax.v8f32(<8 x float> undef)
  77 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V16 = call float @llvm.vector.reduce.fmax.v16f32(<16 x float> undef)
  78 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V32 = call float @llvm.vector.reduce.fmax.v32f32(<32 x float> undef)
  79 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
  80 ;
  81 ; AVX512-LABEL: 'reduce_f32'
  82 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call float @llvm.vector.reduce.fmax.v1f32(<1 x float> undef)
  83 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2 = call float @llvm.vector.reduce.fmax.v2f32(<2 x float> undef)
  84 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4 = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> undef)
  85 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8 = call float @llvm.vector.reduce.fmax.v8f32(<8 x float> undef)
  86 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call float @llvm.vector.reduce.fmax.v16f32(<16 x float> undef)
  87 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V32 = call float @llvm.vector.reduce.fmax.v32f32(<32 x float> undef)
  88 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
  89 ;
  90   %V1  = call float @llvm.vector.reduce.fmax.v1f32(<1 x float> undef)
  91   %V2  = call float @llvm.vector.reduce.fmax.v2f32(<2 x float> undef)
  92   %V4  = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> undef)
  93   %V8  = call float @llvm.vector.reduce.fmax.v8f32(<8 x float> undef)
  94   %V16 = call float @llvm.vector.reduce.fmax.v16f32(<16 x float> undef)
  95   %V32 = call float @llvm.vector.reduce.fmax.v32f32(<32 x float> undef)
  96   ret i32 undef
  97 }
  98
  99 ; Fast Reductions
 100
 101 define i32 @reduce_f64_fast(i32 %arg) {
 102 ; SSE-LABEL: 'reduce_f64_fast'
 103 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call fast double @llvm.vector.reduce.fmax.v1f64(<1 x double> undef)
 104 ; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2 = call fast double @llvm.vector.reduce.fmax.v2f64(<2 x double> undef)
 105 ; SSE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4 = call fast double @llvm.vector.reduce.fmax.v4f64(<4 x double> undef)
 106 ; SSE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8 = call fast double @llvm.vector.reduce.fmax.v8f64(<8 x double> undef)
 107 ; SSE-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call fast double @llvm.vector.reduce.fmax.v16f64(<16 x double> undef)
 108 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 109 ;
 110 ; AVX-LABEL: 'reduce_f64_fast'
 111 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call fast double @llvm.vector.reduce.fmax.v1f64(<1 x double> undef)
 112 ; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2 = call fast double @llvm.vector.reduce.fmax.v2f64(<2 x double> undef)
 113 ; AVX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4 = call fast double @llvm.vector.reduce.fmax.v4f64(<4 x double> undef)
 114 ; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8 = call fast double @llvm.vector.reduce.fmax.v8f64(<8 x double> undef)
 115 ; AVX-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16 = call fast double @llvm.vector.reduce.fmax.v16f64(<16 x double> undef)
 116 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 117 ;
 118 ; AVX512-LABEL: 'reduce_f64_fast'
 119 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call fast double @llvm.vector.reduce.fmax.v1f64(<1 x double> undef)
 120 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2 = call fast double @llvm.vector.reduce.fmax.v2f64(<2 x double> undef)
 121 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4 = call fast double @llvm.vector.reduce.fmax.v4f64(<4 x double> undef)
 122 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8 = call fast double @llvm.vector.reduce.fmax.v8f64(<8 x double> undef)
 123 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16 = call fast double @llvm.vector.reduce.fmax.v16f64(<16 x double> undef)
 124 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 125 ;
 126   %V1  = call fast double @llvm.vector.reduce.fmax.v1f64(<1 x double> undef)
 127   %V2  = call fast double @llvm.vector.reduce.fmax.v2f64(<2 x double> undef)
 128   %V4  = call fast double @llvm.vector.reduce.fmax.v4f64(<4 x double> undef)
 129   %V8  = call fast double @llvm.vector.reduce.fmax.v8f64(<8 x double> undef)
 130   %V16 = call fast double @llvm.vector.reduce.fmax.v16f64(<16 x double> undef)
 131   ret i32 undef
 132 }
 133
 134 define i32 @reduce_f32_fast(i32 %arg) {
 135 ; SSE-LABEL: 'reduce_f32_fast'
 136 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call fast float @llvm.vector.reduce.fmax.v1f32(<1 x float> undef)
 137 ; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2 = call fast float @llvm.vector.reduce.fmax.v2f32(<2 x float> undef)
 138 ; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4 = call fast float @llvm.vector.reduce.fmax.v4f32(<4 x float> undef)
 139 ; SSE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8 = call fast float @llvm.vector.reduce.fmax.v8f32(<8 x float> undef)
 140 ; SSE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16 = call fast float @llvm.vector.reduce.fmax.v16f32(<16 x float> undef)
 141 ; SSE-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V32 = call fast float @llvm.vector.reduce.fmax.v32f32(<32 x float> undef)
 142 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 143 ;
 144 ; AVX-LABEL: 'reduce_f32_fast'
 145 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call fast float @llvm.vector.reduce.fmax.v1f32(<1 x float> undef)
 146 ; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2 = call fast float @llvm.vector.reduce.fmax.v2f32(<2 x float> undef)
 147 ; AVX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4 = call fast float @llvm.vector.reduce.fmax.v4f32(<4 x float> undef)
 148 ; AVX-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8 = call fast float @llvm.vector.reduce.fmax.v8f32(<8 x float> undef)
 149 ; AVX-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16 = call fast float @llvm.vector.reduce.fmax.v16f32(<16 x float> undef)
 150 ; AVX-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V32 = call fast float @llvm.vector.reduce.fmax.v32f32(<32 x float> undef)
 151 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 152 ;
 153 ; AVX512-LABEL: 'reduce_f32_fast'
 154 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call fast float @llvm.vector.reduce.fmax.v1f32(<1 x float> undef)
 155 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2 = call fast float @llvm.vector.reduce.fmax.v2f32(<2 x float> undef)
 156 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4 = call fast float @llvm.vector.reduce.fmax.v4f32(<4 x float> undef)
 157 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8 = call fast float @llvm.vector.reduce.fmax.v8f32(<8 x float> undef)
 158 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16 = call fast float @llvm.vector.reduce.fmax.v16f32(<16 x float> undef)
 159 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V32 = call fast float @llvm.vector.reduce.fmax.v32f32(<32 x float> undef)
 160 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 161 ;
 162   %V1  = call fast float @llvm.vector.reduce.fmax.v1f32(<1 x float> undef)
 163   %V2  = call fast float @llvm.vector.reduce.fmax.v2f32(<2 x float> undef)
 164   %V4  = call fast float @llvm.vector.reduce.fmax.v4f32(<4 x float> undef)
 165   %V8  = call fast float @llvm.vector.reduce.fmax.v8f32(<8 x float> undef)
 166   %V16 = call fast float @llvm.vector.reduce.fmax.v16f32(<16 x float> undef)
 167   %V32 = call fast float @llvm.vector.reduce.fmax.v32f32(<32 x float> undef)
 168   ret i32 undef
 169 }
 170
 171 declare double @llvm.vector.reduce.fmax.v1f64(<1 x double>)
 172 declare double @llvm.vector.reduce.fmax.v2f64(<2 x double>)
 173 declare double @llvm.vector.reduce.fmax.v4f64(<4 x double>)
 174 declare double @llvm.vector.reduce.fmax.v8f64(<8 x double>)
 175 declare double @llvm.vector.reduce.fmax.v16f64(<16 x double>)
 176
 177 declare float @llvm.vector.reduce.fmax.v1f32(<1 x float>)
 178 declare float @llvm.vector.reduce.fmax.v2f32(<2 x float>)
 179 declare float @llvm.vector.reduce.fmax.v4f32(<4 x float>)
 180 declare float @llvm.vector.reduce.fmax.v8f32(<8 x float>)
 181 declare float @llvm.vector.reduce.fmax.v16f32(<16 x float>)
 182 declare float @llvm.vector.reduce.fmax.v32f32(<32 x float>)