/* Tuning model description for AArch64 architecture.
   Copyright (C) 2009-2025 Free Software Foundation, Inc.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify it
   under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   GCC is distributed in the hope that it will be useful, but
   WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with GCC; see the file COPYING3.  If not see
   <http://www.gnu.org/licenses/>.  */

#ifndef GCC_AARCH64_H_NEOVERSE512TVB
#define GCC_AARCH64_H_NEOVERSE512TVB
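
/* These tables back the -mcpu/-mtune value "neoverse-512tvb".  Per the GCC
   documentation, that value does not name a specific core; it tunes for
   Neoverse cores that implement SVE and have a total vector bandwidth of
   512 bits per cycle (e.g. two 256-bit or four 128-bit vector pipes).  */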
static const sve_vec_cost neoverse512tvb_sve_vector_cost =
{
  {
    2, /* int_stmt_cost  */
    2, /* fp_stmt_cost  */
    4, /* ld2_st2_permute_cost  */
    5, /* ld3_st3_permute_cost  */
    5, /* ld4_st4_permute_cost  */
    3, /* permute_cost  */
    /* Theoretically, a reduction involving 15 scalar ADDs could
       complete in ~5 cycles and would have a cost of 15.  Assume that
       [SU]ADDV completes in 11 cycles and so give it a cost of 15 + 6.  */
    21, /* reduc_i8_cost  */
    /* Likewise for 7 scalar ADDs (~3 cycles) vs. 9: 7 + 6.  */
    13, /* reduc_i16_cost  */
    /* Likewise for 3 scalar ADDs (~2 cycles) vs. 8: 3 + 6.  */
    9, /* reduc_i32_cost  */
    /* Likewise for 1 scalar ADD (1 cycle) vs. 8: 1 + 7.  */
    8, /* reduc_i64_cost  */
    /* Theoretically, a reduction involving 7 scalar FADDs could
       complete in ~6 cycles and would have a cost of 14.  Assume that
       FADDV completes in 8 cycles and so give it a cost of 14 + 2.  */
    16, /* reduc_f16_cost  */
    /* Likewise for 3 scalar FADDs (~4 cycles) vs. 6: 6 + 2.  */
    8, /* reduc_f32_cost  */
    /* Likewise for 1 scalar FADD (2 cycles) vs. 4: 2 + 2.  */
    4, /* reduc_f64_cost  */
    2, /* store_elt_extra_cost  */
    /* This value is just inherited from the Cortex-A57 table.  */
    8, /* vec_to_scalar_cost  */
    /* This depends very much on what the scalar value is and
       where it comes from.  E.g. some constants take two dependent
       instructions or a load, while others might be moved from a GPR.
       4 seems to be a reasonable compromise in practice.  */
    4, /* scalar_to_vec_cost  */
    4, /* align_load_cost  */
    4, /* unalign_load_cost  */
    /* Although stores generally have a latency of 2 and compete for the
       vector pipes, in practice it's better not to model that.  */
    1, /* unalign_store_cost  */
    1  /* store_cost  */
  },
  3, /* clast_cost  */
  10, /* fadda_f16_cost  */
  6, /* fadda_f32_cost  */
  4, /* fadda_f64_cost  */
  /* A strided Advanced SIMD x64 load would take two parallel FP loads
     (6 cycles) plus an insertion (2 cycles).  Assume a 64-bit SVE gather
     is 1 cycle more.  The Advanced SIMD version is costed as 2 scalar loads
     (cost 8) and a vec_construct (cost 2).  Add a full vector operation
     (cost 2) to that, to avoid the difference being lost in rounding.

     There is no easy comparison between a strided Advanced SIMD x32 load
     and an SVE 32-bit gather, but cost an SVE 32-bit gather as 1 vector
     operation more than a 64-bit gather.  */
  14, /* gather_load_x32_cost  */
  12, /* gather_load_x64_cost  */
  42, /* gather_load_x32_init_cost  */
  24, /* gather_load_x64_init_cost  */
  3 /* scatter_store_elt_cost  */
};
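
/* To spell out the arithmetic behind the costs above: each reduction cost
   is the cost of the equivalent scalar chain plus the number of extra
   cycles the reduction instruction is assumed to need over that chain,
   e.g. reduc_i8_cost = 15 + (11 - 5) = 21.  Likewise the gather costs
   follow the strided Advanced SIMD model quoted above:
   gather_load_x64_cost = 2 scalar loads (8) + vec_construct (2)
   + 1 vector op (2) = 12, and gather_load_x32_cost = 12 + 2 = 14.  */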
static const aarch64_sve_vec_issue_info neoverse512tvb_sve_issue_info =
{
  {
    {
      3, /* loads_per_cycle  */
      2, /* stores_per_cycle  */
      4, /* general_ops_per_cycle  */
      0, /* fp_simd_load_general_ops  */
      1 /* fp_simd_store_general_ops  */
    },
    2, /* ld2_st2_general_ops  */
    2, /* ld3_st3_general_ops  */
    3 /* ld4_st4_general_ops  */
  },
  2, /* pred_ops_per_cycle  */
  2, /* while_pred_ops  */
  2, /* int_cmp_pred_ops  */
  1, /* fp_cmp_pred_ops  */
  1, /* gather_scatter_pair_general_ops  */
  1 /* gather_scatter_pair_pred_ops  */
};
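
/* Roughly speaking, the issue info above feeds the issue-rate side of the
   aarch64 vector cost model: when costing a vectorised loop, the counts of
   loads, stores, general vector ops and predicate ops are divided by these
   per-cycle rates to estimate a throughput-limited cycle count.  */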
static const aarch64_vec_issue_info neoverse512tvb_vec_issue_info =
{
  &neoversev1_scalar_issue_info,
  &neoversev1_advsimd_issue_info,
  &neoverse512tvb_sve_issue_info
};
static const struct cpu_vector_cost neoverse512tvb_vector_cost =
{
  1, /* scalar_int_stmt_cost  */
  2, /* scalar_fp_stmt_cost  */
  4, /* scalar_load_cost  */
  1, /* scalar_store_cost  */
  1, /* cond_taken_branch_cost  */
  1, /* cond_not_taken_branch_cost  */
  &neoversev1_advsimd_vector_cost, /* advsimd  */
  &neoverse512tvb_sve_vector_cost, /* sve  */
  &neoverse512tvb_vec_issue_info /* issue_info  */
};
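
/* Advanced SIMD code is costed with the shared Neoverse V1 table, and the
   scalar and Advanced SIMD issue rates also come from Neoverse V1; only
   the SVE cost and issue tables above are specific to this tuning.  */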
static const struct tune_params neoverse512tvb_tunings =
{
  &cortexa76_extra_costs,
  &neoversev1_addrcost_table,
  &neoversev1_regmove_cost,
  &neoverse512tvb_vector_cost,
  &generic_branch_cost,
  &generic_approx_modes,
  SVE_128 | SVE_256, /* sve_width  */
  { 4, /* load_int.  */
    2, /* store_int.  */
    6, /* load_fp.  */
    2, /* store_fp.  */
    6, /* load_pred.  */
    1 /* store_pred.  */
  }, /* memmov_cost.  */
  3, /* issue_rate  */
  AARCH64_FUSE_NEOVERSE_BASE, /* fusible_ops  */
  "32:16", /* function_align.  */
  "4",     /* jump_align.  */
  "32:16", /* loop_align.  */
  2, /* int_reassoc_width.  */
  4, /* fp_reassoc_width.  */
  4, /* fma_reassoc_width.  */
  2, /* vec_reassoc_width.  */
  2, /* min_div_recip_mul_sf.  */
  2, /* min_div_recip_mul_df.  */
  0, /* max_case_values.  */
  tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model.  */
  (AARCH64_EXTRA_TUNE_BASE
   | AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS
   | AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT
   | AARCH64_EXTRA_TUNE_AVOID_PRED_RMW), /* tune_flags.  */
  &generic_armv9a_prefetch_tune,
  AARCH64_LDP_STP_POLICY_ALWAYS, /* ldp_policy_model.  */
  AARCH64_LDP_STP_POLICY_ALWAYS  /* stp_policy_model.  */
};
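
/* The sve_width value of SVE_128 | SVE_256 above asks the compiler to
   treat both 128-bit and 256-bit SVE implementations as likely targets,
   rather than tuning for a single fixed vector length.  */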

#endif /* GCC_AARCH64_H_NEOVERSE512TVB.  */