/* Tuning model description for AArch64 architecture.
   Copyright (C) 2009-2025 Free Software Foundation, Inc.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify it
   under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   GCC is distributed in the hope that it will be useful, but
   WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with GCC; see the file COPYING3.  If not see
   <http://www.gnu.org/licenses/>.  */
#ifndef GCC_AARCH64_H_NEOVERSEN2
#define GCC_AARCH64_H_NEOVERSEN2
static const struct cpu_regmove_cost neoversen2_regmove_cost =
{
  /* Spilling to int<->fp instead of memory is recommended so set
     realistic costs compared to memmov_cost.  */
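  /* The initializer fields are, in order, the GP2GP, GP2FP, FP2GP and
     FP2FP move costs (see struct cpu_regmove_cost in aarch64-protos.h).  */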
static const advsimd_vec_cost neoversen2_advsimd_vector_cost =
{
  2, /* int_stmt_cost  */
  2, /* ld2_st2_permute_cost  */
  2, /* ld3_st3_permute_cost  */
  3, /* ld4_st4_permute_cost  */
  4, /* reduc_i8_cost  */
  4, /* reduc_i16_cost  */
  2, /* reduc_i32_cost  */
  2, /* reduc_i64_cost  */
  6, /* reduc_f16_cost  */
  4, /* reduc_f32_cost  */
  2, /* reduc_f64_cost  */
  2, /* store_elt_extra_cost  */
  /* This value is just inherited from the Cortex-A57 table.  */
  8, /* vec_to_scalar_cost  */
  /* This depends very much on what the scalar value is and
     where it comes from.  E.g. some constants take two dependent
     instructions or a load, while others might be moved from a GPR.
     4 seems to be a reasonable compromise in practice.  */
  4, /* scalar_to_vec_cost  */
  4, /* align_load_cost  */
  4, /* unalign_load_cost  */
  /* Although stores have a latency of 2 and compete for the
     vector pipes, in practice it's better not to model that.  */
  1, /* unalign_store_cost  */
static const sve_vec_cost neoversen2_sve_vector_cost =
{
  2, /* int_stmt_cost  */
  2, /* ld2_st2_permute_cost  */
  3, /* ld3_st3_permute_cost  */
  3, /* ld4_st4_permute_cost  */
  /* Theoretically, a reduction involving 15 scalar ADDs could
     complete in ~5 cycles and would have a cost of 15.  [SU]ADDV
     completes in 9 cycles, so give it a cost of 15 + 4.  */
  19, /* reduc_i8_cost  */
  /* Likewise for 7 scalar ADDs (~3 cycles) vs. 8: 7 + 5.  */
  12, /* reduc_i16_cost  */
  /* Likewise for 3 scalar ADDs (~2 cycles) vs. 6: 3 + 4.  */
  7, /* reduc_i32_cost  */
  /* Likewise for 1 scalar ADD (~1 cycle) vs. 4: 1 + 3.  */
  4, /* reduc_i64_cost  */
  /* Theoretically, a reduction involving 7 scalar FADDs could
     complete in ~8 cycles and would have a cost of 14.  FADDV
     completes in 6 cycles, so give it a cost of 14 - 2.  */
  12, /* reduc_f16_cost  */
  /* Likewise for 3 scalar FADDs (~4 cycles) vs. 4: 6 + 0.  */
  6, /* reduc_f32_cost  */
  /* Likewise for 1 scalar FADD (~2 cycles) vs. 2: 2 + 0.  */
  2, /* reduc_f64_cost  */
  2, /* store_elt_extra_cost  */
  /* This value is just inherited from the Cortex-A57 table.  */
  8, /* vec_to_scalar_cost  */
  /* See the comment above the Advanced SIMD versions.  */
  4, /* scalar_to_vec_cost  */
  4, /* align_load_cost  */
  4, /* unalign_load_cost  */
  /* Although stores have a latency of 2 and compete for the
     vector pipes, in practice it's better not to model that.  */
  1, /* unalign_store_cost  */
  10, /* fadda_f16_cost  */
  6, /* fadda_f32_cost  */
  4, /* fadda_f64_cost  */
  /* A strided Advanced SIMD x64 load would take two parallel FP loads
     (8 cycles) plus an insertion (2 cycles).  Assume a 64-bit SVE gather
     is 1 cycle more.  The Advanced SIMD version is costed as 2 scalar loads
     (cost 8) and a vec_construct (cost 2).  Add a full vector operation
     (cost 2) to that, to avoid the difference being lost in rounding.

     There is no easy comparison between a strided Advanced SIMD x32 load
     and an SVE 32-bit gather, but cost an SVE 32-bit gather as 1 vector
     operation more than a 64-bit gather.  */
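  /* I.e. the Advanced SIMD equivalent is costed at 8 + 2 = 10, the
     64-bit gather at 10 + 2 = 12, and the 32-bit gather at one vector
     operation more, 12 + 2 = 14.  */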
  14, /* gather_load_x32_cost  */
  12, /* gather_load_x64_cost  */
  42, /* gather_load_x32_init_cost  */
  24, /* gather_load_x64_init_cost  */
  3 /* scatter_store_elt_cost  */
};
static const aarch64_scalar_vec_issue_info neoversen2_scalar_issue_info =
{
  3, /* loads_stores_per_cycle  */
  2, /* stores_per_cycle  */
  4, /* general_ops_per_cycle  */
  0, /* fp_simd_load_general_ops  */
  1 /* fp_simd_store_general_ops  */
};
static const aarch64_advsimd_vec_issue_info neoversen2_advsimd_issue_info =
{
  {
    3, /* loads_stores_per_cycle  */
    2, /* stores_per_cycle  */
    2, /* general_ops_per_cycle  */
    0, /* fp_simd_load_general_ops  */
    1 /* fp_simd_store_general_ops  */
  },
  2, /* ld2_st2_general_ops  */
  2, /* ld3_st3_general_ops  */
  3 /* ld4_st4_general_ops  */
};
static const aarch64_sve_vec_issue_info neoversen2_sve_issue_info =
{
  {
    {
      3, /* loads_stores_per_cycle  */
      2, /* stores_per_cycle  */
      2, /* general_ops_per_cycle  */
      0, /* fp_simd_load_general_ops  */
      1 /* fp_simd_store_general_ops  */
    },
    2, /* ld2_st2_general_ops  */
    2, /* ld3_st3_general_ops  */
    3 /* ld4_st4_general_ops  */
  },
  2, /* pred_ops_per_cycle  */
  2, /* while_pred_ops  */
  2, /* int_cmp_pred_ops  */
  1, /* fp_cmp_pred_ops  */
  1, /* gather_scatter_pair_general_ops  */
  1 /* gather_scatter_pair_pred_ops  */
};
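/* The issue information above is combined below; the aarch64 vector cost
   model uses it to estimate how many cycles per iteration scalar,
   Advanced SIMD and SVE code would need to issue when comparing the
   three.  */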
static const aarch64_vec_issue_info neoversen2_vec_issue_info =
{
  &neoversen2_scalar_issue_info,
  &neoversen2_advsimd_issue_info,
  &neoversen2_sve_issue_info
};
/* Neoversen2 costs for vector insn classes.  */
static const struct cpu_vector_cost neoversen2_vector_cost =
{
  1, /* scalar_int_stmt_cost  */
  2, /* scalar_fp_stmt_cost  */
  4, /* scalar_load_cost  */
  1, /* scalar_store_cost  */
  1, /* cond_taken_branch_cost  */
  1, /* cond_not_taken_branch_cost  */
  &neoversen2_advsimd_vector_cost, /* advsimd  */
  &neoversen2_sve_vector_cost, /* sve  */
  &neoversen2_vec_issue_info /* issue_info  */
};
static const struct tune_params neoversen2_tunings =
{
  &cortexa76_extra_costs,
  &generic_armv9_a_addrcost_table,
  &neoversen2_regmove_cost,
  &neoversen2_vector_cost,
  &generic_branch_cost,
  &generic_approx_modes,
  SVE_128, /* sve_width  */
  }, /* memmov_cost.  */
  AARCH64_FUSE_NEOVERSE_BASE, /* fusible_ops  */
  "32:16", /* function_align.  */
  "4",     /* jump_align.  */
  "32:16", /* loop_align.  */
  2, /* int_reassoc_width.  */
  4, /* fp_reassoc_width.  */
  1, /* fma_reassoc_width.  */
  2, /* vec_reassoc_width.  */
  2, /* min_div_recip_mul_sf.  */
  2, /* min_div_recip_mul_df.  */
  0, /* max_case_values.  */
  tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model.  */
  (AARCH64_EXTRA_TUNE_BASE
   | AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS
   | AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT
   | AARCH64_EXTRA_TUNE_AVOID_PRED_RMW), /* tune_flags.  */
  &generic_armv9a_prefetch_tune,
  AARCH64_LDP_STP_POLICY_ALWAYS, /* ldp_policy_model.  */
  AARCH64_LDP_STP_POLICY_ALWAYS  /* stp_policy_model.  */
};
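/* This tuning table is referenced from the core definitions in
   aarch64-cores.def and is used, for example, when compiling with
   -mcpu=neoverse-n2 or -mtune=neoverse-n2.  */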
#endif /* GCC_AARCH64_H_NEOVERSEN2.  */