/* Tuning model description for AArch64 architecture.
   Copyright (C) 2009-2025 Free Software Foundation, Inc.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify it
   under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   GCC is distributed in the hope that it will be useful, but
   WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with GCC; see the file COPYING3.  If not see
   <http://www.gnu.org/licenses/>.  */
#ifndef GCC_AARCH64_H_NEOVERSEN2
#define GCC_AARCH64_H_NEOVERSEN2
static const struct cpu_regmove_cost neoversen2_regmove_cost =
{
  /* Spilling to int<->fp instead of memory is recommended so set
     realistic costs compared to memmov_cost.  */
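  /* The initializer fields are, in order, the GP2GP, GP2FP, FP2GP and
     FP2FP move costs (see struct cpu_regmove_cost in aarch64-protos.h).  */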
static const advsimd_vec_cost neoversen2_advsimd_vector_cost =
{
  2, /* int_stmt_cost  */
  2, /* ld2_st2_permute_cost  */
  2, /* ld3_st3_permute_cost  */
  3, /* ld4_st4_permute_cost  */
  4, /* reduc_i8_cost  */
  4, /* reduc_i16_cost  */
  2, /* reduc_i32_cost  */
  2, /* reduc_i64_cost  */
  6, /* reduc_f16_cost  */
  4, /* reduc_f32_cost  */
  2, /* reduc_f64_cost  */
  2, /* store_elt_extra_cost  */
  /* This value is just inherited from the Cortex-A57 table.  */
  8, /* vec_to_scalar_cost  */
  /* This depends very much on what the scalar value is and
     where it comes from.  E.g. some constants take two dependent
     instructions or a load, while others might be moved from a GPR.
     4 seems to be a reasonable compromise in practice.  */
  4, /* scalar_to_vec_cost  */
  4, /* align_load_cost  */
  4, /* unalign_load_cost  */
  /* Although stores have a latency of 2 and compete for the
     vector pipes, in practice it's better not to model that.  */
  1, /* unalign_store_cost  */
static const sve_vec_cost neoversen2_sve_vector_cost =
{
  2, /* int_stmt_cost  */
  2, /* ld2_st2_permute_cost  */
  3, /* ld3_st3_permute_cost  */
  3, /* ld4_st4_permute_cost  */
  /* Theoretically, a reduction involving 15 scalar ADDs could
     complete in ~5 cycles and would have a cost of 15.  [SU]ADDV
     completes in 9 cycles, so give it a cost of 15 + 4.  */
  19, /* reduc_i8_cost  */
  /* Likewise for 7 scalar ADDs (~3 cycles) vs. 8: 7 + 5.  */
  12, /* reduc_i16_cost  */
  /* Likewise for 3 scalar ADDs (~2 cycles) vs. 6: 3 + 4.  */
  7, /* reduc_i32_cost  */
  /* Likewise for 1 scalar ADD (~1 cycle) vs. 4: 1 + 3.  */
  4, /* reduc_i64_cost  */
  /* Theoretically, a reduction involving 7 scalar FADDs could
     complete in ~8 cycles and would have a cost of 14.  FADDV
     completes in 6 cycles, so give it a cost of 14 - 2.  */
  12, /* reduc_f16_cost  */
  /* Likewise for 3 scalar FADDs (~4 cycles) vs. 4: 6 + 0.  */
  6, /* reduc_f32_cost  */
  /* Likewise for 1 scalar FADD (~2 cycles) vs. 2: 2 + 0.  */
  2, /* reduc_f64_cost  */
  2, /* store_elt_extra_cost  */
  /* This value is just inherited from the Cortex-A57 table.  */
  8, /* vec_to_scalar_cost  */
  /* See the comment above the Advanced SIMD versions.  */
  4, /* scalar_to_vec_cost  */
  4, /* align_load_cost  */
  4, /* unalign_load_cost  */
  /* Although stores have a latency of 2 and compete for the
     vector pipes, in practice it's better not to model that.  */
  1, /* unalign_store_cost  */
  10, /* fadda_f16_cost  */
  6, /* fadda_f32_cost  */
  4, /* fadda_f64_cost  */
  /* A strided Advanced SIMD x64 load would take two parallel FP loads
     (8 cycles) plus an insertion (2 cycles).  Assume a 64-bit SVE gather
     is 1 cycle more.  The Advanced SIMD version is costed as 2 scalar loads
     (cost 8) and a vec_construct (cost 2).  Add a full vector operation
     (cost 2) to that, to avoid the difference being lost in rounding.

     There is no easy comparison between a strided Advanced SIMD x32 load
     and an SVE 32-bit gather, but cost an SVE 32-bit gather as 1 vector
     operation more than a 64-bit gather.  */
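  /* I.e. the Advanced SIMD equivalent is costed at 8 + 2 = 10, the
     64-bit gather at 10 + 2 = 12, and the 32-bit gather at one vector
     operation more, 12 + 2 = 14.  */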
  14, /* gather_load_x32_cost  */
  12, /* gather_load_x64_cost  */
  42, /* gather_load_x32_init_cost  */
  24, /* gather_load_x64_init_cost  */
  3 /* scatter_store_elt_cost  */
};
static const aarch64_scalar_vec_issue_info neoversen2_scalar_issue_info =
{
  3, /* loads_stores_per_cycle  */
  2, /* stores_per_cycle  */
  4, /* general_ops_per_cycle  */
  0, /* fp_simd_load_general_ops  */
  1 /* fp_simd_store_general_ops  */
};
static const aarch64_advsimd_vec_issue_info neoversen2_advsimd_issue_info =
{
  {
    3, /* loads_stores_per_cycle  */
    2, /* stores_per_cycle  */
    2, /* general_ops_per_cycle  */
    0, /* fp_simd_load_general_ops  */
    1 /* fp_simd_store_general_ops  */
  },
  2, /* ld2_st2_general_ops  */
  2, /* ld3_st3_general_ops  */
  3 /* ld4_st4_general_ops  */
};
static const aarch64_sve_vec_issue_info neoversen2_sve_issue_info =
{
  {
    {
      3, /* loads_stores_per_cycle  */
      2, /* stores_per_cycle  */
      2, /* general_ops_per_cycle  */
      0, /* fp_simd_load_general_ops  */
      1 /* fp_simd_store_general_ops  */
    },
    2, /* ld2_st2_general_ops  */
    2, /* ld3_st3_general_ops  */
    3 /* ld4_st4_general_ops  */
  },
  2, /* pred_ops_per_cycle  */
  2, /* while_pred_ops  */
  2, /* int_cmp_pred_ops  */
  1, /* fp_cmp_pred_ops  */
  1, /* gather_scatter_pair_general_ops  */
  1 /* gather_scatter_pair_pred_ops  */
};
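/* The issue information above is combined below; the aarch64 vector cost
   model uses it to estimate how many cycles per iteration scalar,
   Advanced SIMD and SVE code would need to issue when comparing the
   three.  */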
static const aarch64_vec_issue_info neoversen2_vec_issue_info =
{
  &neoversen2_scalar_issue_info,
  &neoversen2_advsimd_issue_info,
  &neoversen2_sve_issue_info
};
/* Neoversen2 costs for vector insn classes.  */
static const struct cpu_vector_cost neoversen2_vector_cost =
{
  1, /* scalar_int_stmt_cost  */
  2, /* scalar_fp_stmt_cost  */
  4, /* scalar_load_cost  */
  1, /* scalar_store_cost  */
  1, /* cond_taken_branch_cost  */
  1, /* cond_not_taken_branch_cost  */
  &neoversen2_advsimd_vector_cost, /* advsimd  */
  &neoversen2_sve_vector_cost, /* sve  */
  &neoversen2_vec_issue_info /* issue_info  */
};
static const struct tune_params neoversen2_tunings =
{
  &cortexa76_extra_costs,
  &generic_armv9_a_addrcost_table,
  &neoversen2_regmove_cost,
  &neoversen2_vector_cost,
  &generic_branch_cost,
  &generic_approx_modes,
  SVE_128, /* sve_width  */
  }, /* memmov_cost.  */
  AARCH64_FUSE_NEOVERSE_BASE, /* fusible_ops  */
  "32:16", /* function_align.  */
  "4",     /* jump_align.  */
  "32:16", /* loop_align.  */
  2, /* int_reassoc_width.  */
  4, /* fp_reassoc_width.  */
  1, /* fma_reassoc_width.  */
  2, /* vec_reassoc_width.  */
  2, /* min_div_recip_mul_sf.  */
  2, /* min_div_recip_mul_df.  */
  0, /* max_case_values.  */
  tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model.  */
  (AARCH64_EXTRA_TUNE_BASE
   | AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS
   | AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT
   | AARCH64_EXTRA_TUNE_AVOID_PRED_RMW), /* tune_flags.  */
  &generic_armv9a_prefetch_tune,
  AARCH64_LDP_STP_POLICY_ALWAYS, /* ldp_policy_model.  */
  AARCH64_LDP_STP_POLICY_ALWAYS  /* stp_policy_model.  */
};
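/* This tuning table is referenced from the core definitions in
   aarch64-cores.def and is used, for example, when compiling with
   -mcpu=neoverse-n2 or -mtune=neoverse-n2.  */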
#endif /* GCC_AARCH64_H_NEOVERSEN2.  */