/* Machine description for AArch64 architecture.
   Copyright (C) 2009-2025 Free Software Foundation, Inc.
   Contributed by ARM Ltd.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify it
   under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   GCC is distributed in the hope that it will be useful, but
   WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with GCC; see the file COPYING3.  If not see
   <http://www.gnu.org/licenses/>.  */

#define IN_TARGET_CODE 1

#define INCLUDE_STRING
#define INCLUDE_ALGORITHM
#define INCLUDE_VECTOR
#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "backend.h"
#include "target.h"
#include "rtl.h"
#include "tree.h"
#include "memmodel.h"
#include "gimple.h"
#include "cfghooks.h"
#include "cfgloop.h"
#include "df.h"
#include "tm_p.h"
#include "stringpool.h"
#include "attribs.h"
#include "optabs.h"
#include "regs.h"
#include "emit-rtl.h"
#include "recog.h"
#include "cgraph.h"
#include "diagnostic.h"
#include "insn-attr.h"
#include "alias.h"
#include "fold-const.h"
#include "stor-layout.h"
#include "calls.h"
#include "varasm.h"
#include "output.h"
#include "flags.h"
#include "explow.h"
#include "expr.h"
#include "reload.h"
#include "langhooks.h"
#include "opts.h"
#include "gimplify.h"
#include "dwarf2.h"
#include "dwarf2out.h"
#include "gimple-iterator.h"
#include "tree-vectorizer.h"
#include "aarch64-cost-tables.h"
#include "dumpfile.h"
#include "builtins.h"
#include "rtl-iter.h"
#include "tm-constrs.h"
#include "sched-int.h"
#include "target-globals.h"
#include "common/common-target.h"
#include "cfgrtl.h"
#include "selftest.h"
#include "selftest-rtl.h"
#include "rtx-vector-builder.h"
#include "intl.h"
#include "expmed.h"
#include "function-abi.h"
#include "gimple-pretty-print.h"
#include "tree-ssa-loop-niter.h"
#include "fractional-cost.h"
#include "rtlanal.h"
#include "tree-dfa.h"
#include "asan.h"
#include "aarch64-feature-deps.h"
#include "config/arm/aarch-common.h"
#include "config/arm/aarch-common-protos.h"
#include "common/config/aarch64/cpuinfo.h"
#include "ssa.h"
#include "except.h"
#include "tree-pass.h"
#include "cfgbuild.h"
#include "symbol-summary.h"
#include "sreal.h"
#include "ipa-cp.h"
#include "ipa-prop.h"
#include "ipa-fnsummary.h"
#include "hash-map.h"

/* This file should be included last.  */
#include "target-def.h"

/* Defined for convenience.  */
#define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)

/* Maximum bytes set for an inline memset expansion.  With -Os use 3 STP
   and 1 MOVI/DUP (same size as a call).  */
#define MAX_SET_SIZE(speed) (speed ? 256 : 96)

/* Flags that describe how a function shares certain architectural state
   with its callers.

   - AARCH64_STATE_SHARED indicates that the function does share the state
     with callers.

   - AARCH64_STATE_IN indicates that the function reads (or might read) the
     incoming state.  The converse is that the function ignores the incoming
     state.

   - AARCH64_STATE_OUT indicates that the function returns new state.
     The converse is that the state on return is the same as it was on entry.

   A function that partially modifies the state treats it as both IN
   and OUT (because the value on return depends to some extent on the
   value on input).  */
constexpr auto AARCH64_STATE_SHARED = 1U << 0;
constexpr auto AARCH64_STATE_IN = 1U << 1;
constexpr auto AARCH64_STATE_OUT = 1U << 2;
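
/* For example (see aarch64_attribute_shared_state_flags below), arm::in
   corresponds to AARCH64_STATE_SHARED | AARCH64_STATE_IN, arm::inout to
   all three flags, and arm::preserves to AARCH64_STATE_SHARED alone.  */
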
/* Enum to distinguish which type of check is to be done in
   aarch64_simd_valid_imm.  */
enum simd_immediate_check {
  AARCH64_CHECK_MOV,
  AARCH64_CHECK_ORR,
  AARCH64_CHECK_AND,
  AARCH64_CHECK_XOR
};

/* Information about a legitimate vector immediate operand.  */
struct simd_immediate_info
{
  enum insn_type { MOV, MVN, INDEX, PTRUE, SVE_MOV };
  enum modifier_type { LSL, MSL };

  simd_immediate_info () {}
  simd_immediate_info (scalar_float_mode, rtx);
  simd_immediate_info (scalar_int_mode, unsigned HOST_WIDE_INT,
		       insn_type = MOV, modifier_type = LSL,
		       unsigned int = 0);
  simd_immediate_info (scalar_mode, rtx, rtx);
  simd_immediate_info (scalar_int_mode, aarch64_svpattern);

  /* The mode of the elements.  */
  scalar_mode elt_mode;

  /* The instruction to use to move the immediate into a vector.  */
  insn_type insn;

  union
  {
    /* For MOV and MVN.  */
    struct
    {
      /* The value of each element.  */
      rtx value;

      /* The kind of shift modifier to use, and the number of bits to shift.
	 This is (LSL, 0) if no shift is needed.  */
      modifier_type modifier;
      unsigned int shift;
    } mov;

    /* For INDEX.  */
    struct
    {
      /* The value of the first element and the step to be added for each
	 subsequent element.  */
      rtx base, step;
    } index;

    /* For PTRUE.  */
    aarch64_svpattern pattern;
  } u;
};

/* Construct a floating-point immediate in which each element has mode
   ELT_MODE_IN and value VALUE_IN.  */
inline simd_immediate_info
::simd_immediate_info (scalar_float_mode elt_mode_in, rtx value_in)
  : elt_mode (elt_mode_in), insn (MOV)
{
  u.mov.value = value_in;
  u.mov.modifier = LSL;
  u.mov.shift = 0;
}

/* Construct an integer immediate in which each element has mode ELT_MODE_IN
   and value VALUE_IN.  The other parameters are as for the structure
   fields.  */
inline simd_immediate_info
::simd_immediate_info (scalar_int_mode elt_mode_in,
		       unsigned HOST_WIDE_INT value_in,
		       insn_type insn_in, modifier_type modifier_in,
		       unsigned int shift_in)
  : elt_mode (elt_mode_in), insn (insn_in)
{
  u.mov.value = gen_int_mode (value_in, elt_mode_in);
  u.mov.modifier = modifier_in;
  u.mov.shift = shift_in;
}

/* Construct an integer immediate in which each element has mode ELT_MODE_IN
   and where element I is equal to BASE_IN + I * STEP_IN.  */
inline simd_immediate_info
::simd_immediate_info (scalar_mode elt_mode_in, rtx base_in, rtx step_in)
  : elt_mode (elt_mode_in), insn (INDEX)
{
  u.index.base = base_in;
  u.index.step = step_in;
}

/* Construct a predicate that controls elements of mode ELT_MODE_IN
   and has PTRUE pattern PATTERN_IN.  */
inline simd_immediate_info
::simd_immediate_info (scalar_int_mode elt_mode_in,
		       aarch64_svpattern pattern_in)
  : elt_mode (elt_mode_in), insn (PTRUE)
{
  u.pattern = pattern_in;
}
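
/* Illustrative example (not taken from the original sources): a caller
   that recognizes the immediate 0x5500 in each 16-bit element could
   describe it as

     simd_immediate_info (HImode, 0x55, simd_immediate_info::MOV,
			  simd_immediate_info::LSL, 8);

   i.e. the value 0x55 with an LSL #8 modifier.  */
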
namespace {

/* Describes types that map to Pure Scalable Types (PSTs) in the AAPCS64.  */
class pure_scalable_type_info
{
public:
  /* Represents the result of analyzing a type.  All values are nonzero,
     in the possibly forlorn hope that accidental conversions to bool
     trigger a warning.  */
  enum analysis_result
  {
    /* The type does not have an ABI identity; i.e. it doesn't contain
       at least one object whose type is a Fundamental Data Type.  */
    NO_ABI_IDENTITY = 1,

    /* The type is definitely a Pure Scalable Type.  */
    IS_PST,

    /* The type is definitely not a Pure Scalable Type.  */
    ISNT_PST,

    /* It doesn't matter for PCS purposes whether the type is a Pure
       Scalable Type or not, since the type will be handled the same
       way regardless.

       Specifically, this means that if the type is a Pure Scalable Type,
       there aren't enough argument registers to hold it, and so it will
       need to be passed or returned in memory.  If the type isn't a
       Pure Scalable Type, it's too big to be passed or returned in core
       or SIMD&FP registers, and so again will need to go in memory.  */
    DOESNT_MATTER
  };

  /* Aggregates of 17 bytes or more are normally passed and returned
     in memory, so aggregates of that size can safely be analyzed as
     DOESNT_MATTER.  We need to be able to collect enough pieces to
     represent a PST that is smaller than that.  Since predicates are
     2 bytes in size for -msve-vector-bits=128, that means we need to be
     able to store at least 8 pieces.

     We also need to be able to store enough pieces to represent
     a single vector in each vector argument register and a single
     predicate in each predicate argument register.  This means that
     we need at least 12 pieces.  */
  static const unsigned int MAX_PIECES = NUM_FP_ARG_REGS + NUM_PR_ARG_REGS;
  static_assert (MAX_PIECES >= 8, "Need to store at least 8 predicates");
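
  /* With the AAPCS64 argument registers assumed here (NUM_FP_ARG_REGS == 8,
     for z0-z7, and NUM_PR_ARG_REGS == 4, for p0-p3), MAX_PIECES is 12.  */
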
  /* Describes one piece of a PST.  Each piece is one of:

     - a single Scalable Vector Type (SVT)
     - a single Scalable Predicate Type (SPT)
     - a PST containing 2, 3 or 4 SVTs, with no padding

     It either represents a single built-in type or a PST formed from
     multiple homogeneous built-in types.  */
  struct piece
  {
    rtx get_rtx (unsigned int, unsigned int) const;

    /* The number of vector and predicate registers that the piece
       occupies.  One of the two is always zero.  */
    unsigned int num_zr;
    unsigned int num_pr;

    /* The mode of the registers described above.  */
    machine_mode mode;

    /* If this piece is formed from multiple homogeneous built-in types,
       this is the mode of the built-in types, otherwise it is MODE.  */
    machine_mode orig_mode;

    /* The offset in bytes of the piece from the start of the type.  */
    poly_uint64 offset;
  };

  /* Divides types analyzed as IS_PST into individual pieces.  The pieces
     are in memory order.  */
  auto_vec<piece, MAX_PIECES> pieces;

  unsigned int num_zr () const;
  unsigned int num_pr () const;

  rtx get_rtx (machine_mode mode, unsigned int, unsigned int) const;

  analysis_result analyze (const_tree);
  bool analyze_registers (const_tree);

private:
  analysis_result analyze_array (const_tree);
  analysis_result analyze_record (const_tree);
  void add_piece (const piece &);
};
}

/* The current code model.  */
enum aarch64_code_model aarch64_cmodel;

enum aarch64_tp_reg aarch64_tpidr_register;

/* The number of 64-bit elements in an SVE vector.  */
poly_uint16 aarch64_sve_vg;

#ifdef HAVE_AS_TLS
#undef TARGET_HAVE_TLS
#define TARGET_HAVE_TLS 1
#endif

static bool aarch64_composite_type_p (const_tree, machine_mode);
static bool aarch64_return_in_memory_1 (const_tree);
static bool aarch64_vfp_is_call_or_return_candidate (machine_mode,
						     const_tree,
						     machine_mode *, int *,
						     bool *, bool);
static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED;
static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
static void aarch64_override_options_after_change (void);
static bool aarch64_vector_mode_supported_p (machine_mode);
static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool);
static bool aarch64_builtin_support_vector_misalignment (machine_mode mode,
							 const_tree type,
							 int misalignment,
							 bool is_packed);
static machine_mode aarch64_simd_container_mode (scalar_mode, poly_int64);
static bool aarch64_print_address_internal (FILE*, machine_mode, rtx,
					    aarch64_addr_query_type);

/* The processor for which instructions should be scheduled.  */
enum aarch64_cpu aarch64_tune = AARCH64_CPU_cortexa53;

/* Global flag for PC relative loads.  */
bool aarch64_pcrelative_literal_loads;

/* Global flag for whether frame pointer is enabled.  */
bool aarch64_use_frame_pointer;

/* Support for command line parsing of boolean flags in the tuning
   structures.  */
struct aarch64_flag_desc
{
  const char* name;
  unsigned int flag;
};

#define AARCH64_FUSION_PAIR(name, internal_name) \
  { name, AARCH64_FUSE_##internal_name },
static const struct aarch64_flag_desc aarch64_fusible_pairs[] =
{
  { "none", AARCH64_FUSE_NOTHING },
#include "aarch64-fusion-pairs.def"
  { "all", AARCH64_FUSE_ALL },
  { NULL, AARCH64_FUSE_NOTHING }
};

#define AARCH64_EXTRA_TUNING_OPTION(name, internal_name) \
  { name, AARCH64_EXTRA_TUNE_##internal_name },
static const struct aarch64_flag_desc aarch64_tuning_flags[] =
{
  { "none", AARCH64_EXTRA_TUNE_NONE },
#include "aarch64-tuning-flags.def"
  { "all", AARCH64_EXTRA_TUNE_ALL },
  { NULL, AARCH64_EXTRA_TUNE_NONE }
};

/* Tuning parameters.  */
#include "tuning_models/generic.h"
#include "tuning_models/generic_armv8_a.h"
#include "tuning_models/generic_armv9_a.h"
#include "tuning_models/cortexa35.h"
#include "tuning_models/cortexa53.h"
#include "tuning_models/cortexa57.h"
#include "tuning_models/cortexa72.h"
#include "tuning_models/cortexa73.h"
#include "tuning_models/cortexx925.h"
#include "tuning_models/exynosm1.h"
#include "tuning_models/thunderxt88.h"
#include "tuning_models/thunderx.h"
#include "tuning_models/tsv110.h"
#include "tuning_models/xgene1.h"
#include "tuning_models/emag.h"
#include "tuning_models/qdf24xx.h"
#include "tuning_models/saphira.h"
#include "tuning_models/thunderx2t99.h"
#include "tuning_models/thunderx3t110.h"
#include "tuning_models/neoversen1.h"
#include "tuning_models/ampere1.h"
#include "tuning_models/ampere1a.h"
#include "tuning_models/ampere1b.h"
#include "tuning_models/neoversev1.h"
#include "tuning_models/neoverse512tvb.h"
#include "tuning_models/neoversen2.h"
#include "tuning_models/neoversen3.h"
#include "tuning_models/neoversev2.h"
#include "tuning_models/neoversev3.h"
#include "tuning_models/neoversev3ae.h"
#include "tuning_models/a64fx.h"
#include "tuning_models/fujitsu_monaka.h"

/* Support for fine-grained override of the tuning structures.  */
struct aarch64_tuning_override_function
{
  const char* name;
  void (*parse_override)(const char*, struct tune_params*);
};

static void aarch64_parse_fuse_string (const char*, struct tune_params*);
static void aarch64_parse_tune_string (const char*, struct tune_params*);
static void aarch64_parse_sve_width_string (const char*, struct tune_params*);

static const struct aarch64_tuning_override_function
  aarch64_tuning_override_functions[] =
{
  { "fuse", aarch64_parse_fuse_string },
  { "tune", aarch64_parse_tune_string },
  { "sve_width", aarch64_parse_sve_width_string },
  { NULL, NULL }
};

/* A processor implementing AArch64.  */
struct processor
{
  const char *name;
  aarch64_cpu ident;
  aarch64_cpu sched_core;
  aarch64_arch arch;
  aarch64_feature_flags flags;
  const tune_params *tune;
};

/* Architectures implementing AArch64.  */
static CONSTEXPR const processor all_architectures[] =
{
#define AARCH64_ARCH(NAME, CORE, ARCH_IDENT, D, E) \
  {NAME, AARCH64_CPU_##CORE, AARCH64_CPU_##CORE, AARCH64_ARCH_##ARCH_IDENT, \
   feature_deps::ARCH_IDENT ().enable, NULL},
#include "aarch64-arches.def"
  {NULL, aarch64_no_cpu, aarch64_no_cpu, aarch64_no_arch, 0, NULL}
};

/* Processor cores implementing AArch64.  */
static const struct processor all_cores[] =
{
#define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, E, COSTS, G, H, I) \
  {NAME, AARCH64_CPU_##IDENT, AARCH64_CPU_##SCHED, AARCH64_ARCH_##ARCH, \
   feature_deps::cpu_##IDENT, &COSTS##_tunings},
#include "aarch64-cores.def"
  {NULL, aarch64_no_cpu, aarch64_no_cpu, aarch64_no_arch, 0, NULL}
};
/* Internal representation of system registers.  */
typedef struct {
  const char *name;
  /* Stringified sysreg encoding values, represented as
     s<sn>_<op1>_c<cn>_c<cm>_<op2>.  */
  const char *encoding;
  /* Flags affecting sysreg usage, such as read/write-only.  */
  unsigned properties;
  /* Architectural features implied by sysreg.  */
  aarch64_feature_flags arch_reqs;
} sysreg_t;
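
/* For example, a register encoded with op0=3, op1=3, CRn=13, CRm=0, op2=2
   (tpidr_el0 in the architecture) would be stringified as
   "s3_3_c13_c0_2".  */
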
/* An aarch64_feature_set initializer for a single feature,
   AARCH64_FEATURE_<FEAT>.  */
#define AARCH64_FEATURE(FEAT) AARCH64_FL_##FEAT

/* Used by AARCH64_FEATURES.  */
#define AARCH64_OR_FEATURES_1(X, F1) \
  AARCH64_FEATURE (F1)
#define AARCH64_OR_FEATURES_2(X, F1, F2) \
  (AARCH64_FEATURE (F1) | AARCH64_OR_FEATURES_1 (X, F2))
#define AARCH64_OR_FEATURES_3(X, F1, ...) \
  (AARCH64_FEATURE (F1) | AARCH64_OR_FEATURES_2 (X, __VA_ARGS__))

/* An aarch64_feature_set initializer for the N features listed in "...".  */
#define AARCH64_FEATURES(N, ...) \
  AARCH64_OR_FEATURES_##N (0, __VA_ARGS__)
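
/* For example, AARCH64_FEATURES (2, F1, F2) expands via
   AARCH64_OR_FEATURES_2 to (AARCH64_FEATURE (F1) | AARCH64_FEATURE (F2)),
   i.e. (AARCH64_FL_F1 | AARCH64_FL_F2).  */
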
#define AARCH64_NO_FEATURES 0

/* Flags associated with the properties of system registers.  They mainly
   serve to mark particular registers as read- or write-only.  */
#define F_DEPRECATED (1 << 1)
#define F_REG_READ (1 << 2)
#define F_REG_WRITE (1 << 3)
#define F_ARCHEXT (1 << 4)
/* Flag indicating register name is alias for another system register.  */
#define F_REG_ALIAS (1 << 5)
/* Flag indicating registers which may be implemented with 128-bits.  */
#define F_REG_128 (1 << 6)

/* Database of system registers, their encodings and architectural
   requirements.  */
const sysreg_t aarch64_sysregs[] =
{
#define CPENC(SN, OP1, CN, CM, OP2) "s"#SN"_"#OP1"_c"#CN"_c"#CM"_"#OP2
#define SYSREG(NAME, ENC, FLAGS, ARCH) \
  { NAME, ENC, FLAGS, ARCH },
#include "aarch64-sys-regs.def"
#undef CPENC
};

#undef AARCH64_NO_FEATURES

using sysreg_map_t = hash_map<nofree_string_hash, const sysreg_t *>;
static sysreg_map_t *sysreg_map = nullptr;

/* Map system register names to their hardware metadata: encoding,
   feature flags and architectural feature requirements, all of which
   are encoded in a sysreg_t struct.  */
void
aarch64_register_sysreg (const char *name, const sysreg_t *metadata)
{
  bool dup = sysreg_map->put (name, metadata);
  gcc_checking_assert (!dup);
}

/* Lazily initialize the hash table used for system register validation,
   checking the validity of each supplied register name and recording the
   register's associated metadata.  */
static void
aarch64_init_sysregs (void)
{
  gcc_assert (!sysreg_map);
  sysreg_map = new sysreg_map_t;

  for (unsigned i = 0; i < ARRAY_SIZE (aarch64_sysregs); i++)
    {
      const sysreg_t *reg = aarch64_sysregs + i;
      aarch64_register_sysreg (reg->name, reg);
    }
}

/* No direct access to the sysreg hash-map should be made: doing so risks
   trying to access an uninitialized hash-map, and dereferencing the
   returned double pointer without due care risks dereferencing a
   null pointer.  */
const sysreg_t *
aarch64_lookup_sysreg_map (const char *regname)
{
  if (!sysreg_map)
    aarch64_init_sysregs ();

  const sysreg_t **sysreg_entry = sysreg_map->get (regname);
  if (sysreg_entry != NULL)
    return *sysreg_entry;
  return NULL;
}
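
/* Illustrative use (register name assumed): a caller that needs to
   validate a user-supplied system register name can do

     const sysreg_t *sysreg = aarch64_lookup_sysreg_map ("tpidr_el0");

   and treat a null result as an unknown register.  */
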
/* The current tuning set.  */
struct tune_params aarch64_tune_params = generic_tunings;

/* If NAME is the name of an arm:: attribute that describes shared state,
   return its associated AARCH64_STATE_* flags, otherwise return 0.  */
static unsigned int
aarch64_attribute_shared_state_flags (const char *name)
{
  if (strcmp (name, "in") == 0)
    return AARCH64_STATE_SHARED | AARCH64_STATE_IN;
  if (strcmp (name, "inout") == 0)
    return AARCH64_STATE_SHARED | AARCH64_STATE_IN | AARCH64_STATE_OUT;
  if (strcmp (name, "out") == 0)
    return AARCH64_STATE_SHARED | AARCH64_STATE_OUT;
  if (strcmp (name, "preserves") == 0)
    return AARCH64_STATE_SHARED;
  return 0;
}

/* See whether attribute list ATTRS has any sharing information
   for state STATE_NAME.  Return the associated state flags if so,
   otherwise return 0.  */
static unsigned int
aarch64_lookup_shared_state_flags (tree attrs, const char *state_name)
{
  for (tree attr = attrs; attr; attr = TREE_CHAIN (attr))
    {
      if (!is_attribute_namespace_p ("arm", attr))
	continue;

      auto attr_name = IDENTIFIER_POINTER (get_attribute_name (attr));
      auto flags = aarch64_attribute_shared_state_flags (attr_name);
      if (!flags)
	continue;

      for (tree arg = TREE_VALUE (attr); arg; arg = TREE_CHAIN (arg))
	{
	  tree value = TREE_VALUE (arg);
	  if (TREE_CODE (value) == STRING_CST
	      && strcmp (TREE_STRING_POINTER (value), state_name) == 0)
	    return flags;
	}
    }
  return 0;
}

/* Return true if DECL creates a new scope for state STATE_NAME.  */
static bool
aarch64_fndecl_has_new_state (const_tree decl, const char *state_name)
{
  if (tree attr = lookup_attribute ("arm", "new", DECL_ATTRIBUTES (decl)))
    for (tree arg = TREE_VALUE (attr); arg; arg = TREE_CHAIN (arg))
      {
	tree value = TREE_VALUE (arg);
	if (TREE_CODE (value) == STRING_CST
	    && strcmp (TREE_STRING_POINTER (value), state_name) == 0)
	  return true;
      }
  return false;
}

/* Return true if attribute argument VALUE is a recognized state string,
   otherwise report an error.  NAME is the name of the attribute to which
   VALUE is being passed.  */
static bool
aarch64_check_state_string (tree name, tree value)
{
  if (TREE_CODE (value) != STRING_CST)
    {
      error ("the arguments to %qE must be constant strings", name);
      return false;
    }

  const char *state_name = TREE_STRING_POINTER (value);
  if (strcmp (state_name, "za") != 0
      && strcmp (state_name, "zt0") != 0)
    {
      error ("unrecognized state string %qs", state_name);
      return false;
    }

  return true;
}

/* qsort callback to compare two STRING_CSTs.  */
static int
cmp_string_csts (const void *a, const void *b)
{
  return strcmp (TREE_STRING_POINTER (*(const_tree const *) a),
		 TREE_STRING_POINTER (*(const_tree const *) b));
}

/* Canonicalize a list of state strings.  ARGS contains the arguments to
   a new attribute while OLD_ATTR, if nonnull, contains a previous attribute
   of the same type.  If CAN_MERGE_IN_PLACE, it is safe to adjust OLD_ATTR's
   arguments and drop the new attribute.  Otherwise, the new attribute must
   be kept and ARGS must include the information in OLD_ATTR.

   In both cases, the new arguments must be a sorted list of state strings
   with duplicates removed.

   Return true if the new attribute should be kept, false if it should be
   dropped.  */
static bool
aarch64_merge_string_arguments (tree args, tree old_attr,
				bool can_merge_in_place)
{
  /* Get a sorted list of all state strings (including duplicates).  */
  auto add_args = [](vec<tree> &strings, const_tree args)
    {
      for (const_tree arg = args; arg; arg = TREE_CHAIN (arg))
	if (TREE_CODE (TREE_VALUE (arg)) == STRING_CST)
	  strings.safe_push (TREE_VALUE (arg));
    };
  auto_vec<tree, 16> strings;
  add_args (strings, args);
  if (old_attr)
    add_args (strings, TREE_VALUE (old_attr));
  strings.qsort (cmp_string_csts);

  /* The list can be empty if there was no previous attribute and if all
     the new arguments are erroneous.  Drop the attribute in that case.  */
  if (strings.is_empty ())
    return false;

  /* Destructively modify one of the argument lists, removing duplicates
     on the fly.  */
  bool use_old_attr = old_attr && can_merge_in_place;
  tree *end = use_old_attr ? &TREE_VALUE (old_attr) : &args;
  tree prev = NULL_TREE;
  for (tree arg : strings)
    {
      if (prev && simple_cst_equal (arg, prev))
	continue;
      prev = arg;
      if (!*end)
	*end = tree_cons (NULL_TREE, arg, NULL_TREE);
      else
	TREE_VALUE (*end) = arg;
      end = &TREE_CHAIN (*end);
    }
  *end = NULL_TREE;
  return !use_old_attr;
}
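
/* For example, merging a new arm::inout("za") attribute with an existing
   arm::inout("zt0", "za") attribute leaves the single sorted argument
   list ("za", "zt0"), with the duplicate "za" removed.  */
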
/* Check whether an 'aarch64_vector_pcs' attribute is valid.  */

static tree
handle_aarch64_vector_pcs_attribute (tree *node, tree name, tree,
				     int, bool *no_add_attrs)
{
  /* Since we set fn_type_req to true, the caller should have checked
     this for us.  */
  gcc_assert (FUNC_OR_METHOD_TYPE_P (*node));
  switch ((arm_pcs) fntype_abi (*node).id ())
    {
    case ARM_PCS_AAPCS64:
    case ARM_PCS_SIMD:
      return NULL_TREE;

    case ARM_PCS_SVE:
      error ("the %qE attribute cannot be applied to an SVE function type",
	     name);
      *no_add_attrs = true;
      return NULL_TREE;

    case ARM_PCS_TLSDESC:
    case ARM_PCS_UNKNOWN:
      break;
    }
  gcc_unreachable ();
}

/* Return true if arm::new(ARGS) is compatible with the type of decl DECL,
   otherwise report an error.  */
static bool
aarch64_check_arm_new_against_type (tree args, tree decl)
{
  tree type_attrs = TYPE_ATTRIBUTES (TREE_TYPE (decl));
  for (tree arg = args; arg; arg = TREE_CHAIN (arg))
    {
      tree value = TREE_VALUE (arg);
      if (TREE_CODE (value) == STRING_CST)
	{
	  const char *state_name = TREE_STRING_POINTER (value);
	  if (aarch64_lookup_shared_state_flags (type_attrs, state_name))
	    {
	      error_at (DECL_SOURCE_LOCATION (decl),
			"cannot create a new %qs scope since %qs is shared"
			" with callers", state_name, state_name);
	      return false;
	    }
	}
    }
  return true;
}

/* Callback for arm::new attributes.  */
static tree
handle_arm_new (tree *node, tree name, tree args, int, bool *no_add_attrs)
{
  tree decl = *node;
  if (TREE_CODE (decl) != FUNCTION_DECL)
    {
      error ("%qE attribute applies only to function definitions", name);
      *no_add_attrs = true;
      return NULL_TREE;
    }
  if (TREE_TYPE (decl) == error_mark_node)
    {
      *no_add_attrs = true;
      return NULL_TREE;
    }

  for (tree arg = args; arg; arg = TREE_CHAIN (arg))
    aarch64_check_state_string (name, TREE_VALUE (arg));

  if (!aarch64_check_arm_new_against_type (args, decl))
    {
      *no_add_attrs = true;
      return NULL_TREE;
    }

  /* If there is an old attribute, we should try to update it in-place,
     so that there is only one (definitive) arm::new attribute on the decl.  */
  tree old_attr = lookup_attribute ("arm", "new", DECL_ATTRIBUTES (decl));
  if (!aarch64_merge_string_arguments (args, old_attr, true))
    *no_add_attrs = true;

  return NULL_TREE;
}

/* Callback for arm::{in,out,inout,preserves} attributes.  */
static tree
handle_arm_shared (tree *node, tree name, tree args,
		   int, bool *no_add_attrs)
{
  tree type = *node;
  tree old_attrs = TYPE_ATTRIBUTES (type);
  auto flags = aarch64_attribute_shared_state_flags (IDENTIFIER_POINTER (name));
  for (tree arg = args; arg; arg = TREE_CHAIN (arg))
    {
      tree value = TREE_VALUE (arg);
      if (aarch64_check_state_string (name, value))
	{
	  const char *state_name = TREE_STRING_POINTER (value);
	  auto old_flags = aarch64_lookup_shared_state_flags (old_attrs,
							      state_name);
	  if (old_flags && old_flags != flags)
	    {
	      error ("inconsistent attributes for state %qs", state_name);
	      *no_add_attrs = true;
	      return NULL_TREE;
	    }
	}
    }

  /* We can't update an old attribute in-place, since types are shared.
     Instead make sure that this new attribute contains all the
     information, so that the old attribute becomes redundant.  */
  tree old_attr = lookup_attribute ("arm", IDENTIFIER_POINTER (name),
				    old_attrs);
  if (!aarch64_merge_string_arguments (args, old_attr, false))
    *no_add_attrs = true;

  return NULL_TREE;
}

/* Mutually-exclusive function type attributes for controlling PSTATE.SM.  */
static const struct attribute_spec::exclusions attr_streaming_exclusions[] =
{
  /* Attribute name     exclusion applies to:
			function, type, variable */
  { "streaming", false, true, false },
  { "streaming_compatible", false, true, false },
  { NULL, false, false, false }
};

/* Table of machine attributes.  */
static const attribute_spec aarch64_gnu_attributes[] =
{
  /* { name, min_len, max_len, decl_req, type_req, fn_type_req,
       affects_type_identity, handler, exclude } */
  { "aarch64_vector_pcs", 0, 0, false, true, true, true,
    handle_aarch64_vector_pcs_attribute, NULL },
  { "indirect_return", 0, 0, false, true, true, true, NULL, NULL },
  { "arm_sve_vector_bits", 1, 1, false, true, false, true,
    aarch64_sve::handle_arm_sve_vector_bits_attribute,
    NULL },
  { "Advanced SIMD type", 1, 1, false, true, false, true, NULL, NULL },
  { "SVE type", 3, 3, false, true, false, true, NULL, NULL },
  { "SVE sizeless type", 0, 0, false, true, false, true, NULL, NULL },
#if TARGET_DLLIMPORT_DECL_ATTRIBUTES
  { "dllimport", 0, 0, false, false, false, false, handle_dll_attribute, NULL },
  { "dllexport", 0, 0, false, false, false, false, handle_dll_attribute, NULL },
#endif
#ifdef SUBTARGET_ATTRIBUTE_TABLE
  SUBTARGET_ATTRIBUTE_TABLE
#endif
};

static const scoped_attribute_specs aarch64_gnu_attribute_table =
{
  "gnu", { aarch64_gnu_attributes }
};

static const attribute_spec aarch64_arm_attributes[] =
{
  { "streaming", 0, 0, false, true, true, true,
    NULL, attr_streaming_exclusions },
  { "streaming_compatible", 0, 0, false, true, true, true,
    NULL, attr_streaming_exclusions },
  { "locally_streaming", 0, 0, true, false, false, false, NULL, NULL },
  { "new", 1, -1, true, false, false, false,
    handle_arm_new, NULL },
  { "preserves", 1, -1, false, true, true, true,
    handle_arm_shared, NULL },
  { "in", 1, -1, false, true, true, true,
    handle_arm_shared, NULL },
  { "out", 1, -1, false, true, true, true,
    handle_arm_shared, NULL },
  { "inout", 1, -1, false, true, true, true,
    handle_arm_shared, NULL }
};

static const scoped_attribute_specs aarch64_arm_attribute_table =
{
  "arm", { aarch64_arm_attributes }
};

static const scoped_attribute_specs *const aarch64_attribute_table[] =
{
  &aarch64_gnu_attribute_table,
  &aarch64_arm_attribute_table
};

typedef enum aarch64_cond_code
{
  AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
  AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
  AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV
}
aarch64_cc;

#define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
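
/* For example, AARCH64_INVERSE_CONDITION_CODE (AARCH64_EQ) is AARCH64_NE
   and AARCH64_INVERSE_CONDITION_CODE (AARCH64_GE) is AARCH64_LT: each
   even/odd pair in aarch64_cond_code is a condition and its inverse.  */
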
/* The condition codes of the processor, and the inverse function.  */
static const char * const aarch64_condition_codes[] =
{
  "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
  "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
};

/* The preferred condition codes for SVE conditions.  */
static const char *const aarch64_sve_condition_codes[] =
{
  "none", "any", "nlast", "last", "first", "nfrst", "vs", "vc",
  "pmore", "plast", "tcont", "tstop", "gt", "le", "al", "nv"
};

/* Return the assembly token for svpattern value PATTERN.  */

static const char *
svpattern_token (enum aarch64_svpattern pattern)
{
  switch (pattern)
    {
#define CASE(UPPER, LOWER, VALUE) case AARCH64_SV_##UPPER: return #LOWER;
    AARCH64_FOR_SVPATTERN (CASE)
#undef CASE
    case AARCH64_NUM_SVPATTERNS:
      break;
    }
  gcc_unreachable ();
}

/* Return the location of a piece that is known to be passed or returned
   in registers.  FIRST_ZR is the first unused vector argument register
   and FIRST_PR is the first unused predicate argument register.  */

rtx
pure_scalable_type_info::piece::get_rtx (unsigned int first_zr,
					 unsigned int first_pr) const
{
  gcc_assert (VECTOR_MODE_P (mode)
	      && first_zr + num_zr <= V0_REGNUM + NUM_FP_ARG_REGS
	      && first_pr + num_pr <= P0_REGNUM + NUM_PR_ARG_REGS);

  if (num_zr > 0 && num_pr == 0)
    return gen_rtx_REG (mode, first_zr);

  if (num_zr == 0 && num_pr > 0)
    return gen_rtx_REG (mode, first_pr);

  gcc_unreachable ();
}

/* Return the total number of vector registers required by the PST.  */

unsigned int
pure_scalable_type_info::num_zr () const
{
  unsigned int res = 0;
  for (unsigned int i = 0; i < pieces.length (); ++i)
    res += pieces[i].num_zr;
  return res;
}

/* Return the total number of predicate registers required by the PST.  */

unsigned int
pure_scalable_type_info::num_pr () const
{
  unsigned int res = 0;
  for (unsigned int i = 0; i < pieces.length (); ++i)
    res += pieces[i].num_pr;
  return res;
}

/* Return the location of a PST that is known to be passed or returned
   in registers.  FIRST_ZR is the first unused vector argument register
   and FIRST_PR is the first unused predicate argument register.  */

rtx
pure_scalable_type_info::get_rtx (machine_mode mode,
				  unsigned int first_zr,
				  unsigned int first_pr) const
{
  /* Try to return a single REG if possible.  This leads to better
     code generation; it isn't required for correctness.  */
  if (mode == pieces[0].mode)
    {
      gcc_assert (pieces.length () == 1);
      return pieces[0].get_rtx (first_zr, first_pr);
    }

  /* Build up a PARALLEL that contains the individual pieces.  */
  rtvec rtxes = rtvec_alloc (pieces.length ());
  for (unsigned int i = 0; i < pieces.length (); ++i)
    {
      rtx reg = pieces[i].get_rtx (first_zr, first_pr);
      rtx offset = gen_int_mode (pieces[i].offset, Pmode);
      RTVEC_ELT (rtxes, i) = gen_rtx_EXPR_LIST (VOIDmode, reg, offset);
      first_zr += pieces[i].num_zr;
      first_pr += pieces[i].num_pr;
    }
  return gen_rtx_PARALLEL (mode, rtxes);
}

/* Analyze whether TYPE is a Pure Scalable Type according to the rules
   in the AAPCS64.  */

pure_scalable_type_info::analysis_result
pure_scalable_type_info::analyze (const_tree type)
{
  /* Prevent accidental reuse.  */
  gcc_assert (pieces.is_empty ());

  /* No code will be generated for erroneous types, so we won't establish
     an ABI mapping.  */
  if (type == error_mark_node)
    return NO_ABI_IDENTITY;

  /* Zero-sized types disappear in the language->ABI mapping.  */
  if (TYPE_SIZE (type) && integer_zerop (TYPE_SIZE (type)))
    return NO_ABI_IDENTITY;

  /* Check for SVTs, SPTs, and built-in tuple types that map to PSTs.  */
  piece p = {};
  if (aarch64_sve::builtin_type_p (type, &p.num_zr, &p.num_pr))
    {
      machine_mode mode = TYPE_MODE_RAW (type);
      gcc_assert (VECTOR_MODE_P (mode)
		  && (!TARGET_SVE || aarch64_sve_mode_p (mode)));

      p.mode = p.orig_mode = mode;
      add_piece (p);
      return IS_PST;
    }

  /* Check for user-defined PSTs.  */
  if (TREE_CODE (type) == ARRAY_TYPE)
    return analyze_array (type);
  if (TREE_CODE (type) == RECORD_TYPE)
    return analyze_record (type);

  return ISNT_PST;
}
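
/* Illustrative example (assuming the <arm_sve.h> ACLE types): a structure
   such as

     struct pst { svfloat32_t v; svbool_t p; };

   would be analyzed as IS_PST with two pieces, one occupying a single
   vector register and one occupying a single predicate register.  */
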
/* Analyze a type that is known not to be passed or returned in memory.
   Return true if it has an ABI identity and is a Pure Scalable Type.  */

bool
pure_scalable_type_info::analyze_registers (const_tree type)
{
  analysis_result result = analyze (type);
  gcc_assert (result != DOESNT_MATTER);
  return result == IS_PST;
}

/* Subroutine of analyze for handling ARRAY_TYPEs.  */

pure_scalable_type_info::analysis_result
pure_scalable_type_info::analyze_array (const_tree type)
{
  /* Analyze the element type.  */
  pure_scalable_type_info element_info;
  analysis_result result = element_info.analyze (TREE_TYPE (type));
  if (result != IS_PST)
    return result;

  /* An array of unknown, flexible or variable length will be passed and
     returned by reference whatever we do.  */
  tree nelts_minus_one = array_type_nelts_minus_one (type);
  if (!tree_fits_uhwi_p (nelts_minus_one))
    return DOESNT_MATTER;

  /* Likewise if the array is constant-sized but too big to be interesting.
     The double checks against MAX_PIECES are to protect against overflow.  */
  unsigned HOST_WIDE_INT count = tree_to_uhwi (nelts_minus_one);
  if (count > MAX_PIECES)
    return DOESNT_MATTER;
  count += 1;
  if (count * element_info.pieces.length () > MAX_PIECES)
    return DOESNT_MATTER;

  /* The above checks should have weeded out elements of unknown size.  */
  poly_uint64 element_bytes;
  if (!poly_int_tree_p (TYPE_SIZE_UNIT (TREE_TYPE (type)), &element_bytes))
    gcc_unreachable ();

  /* Build up the list of individual vectors and predicates.  */
  gcc_assert (!element_info.pieces.is_empty ());
  for (unsigned int i = 0; i < count; ++i)
    for (unsigned int j = 0; j < element_info.pieces.length (); ++j)
      {
	piece p = element_info.pieces[j];
	p.offset += i * element_bytes;
	add_piece (p);
      }
  return IS_PST;
}

/* Subroutine of analyze for handling RECORD_TYPEs.  */

pure_scalable_type_info::analysis_result
pure_scalable_type_info::analyze_record (const_tree type)
{
  for (tree field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
    {
      if (TREE_CODE (field) != FIELD_DECL)
	continue;

      /* Zero-sized fields disappear in the language->ABI mapping.  */
      if (DECL_SIZE (field) && integer_zerop (DECL_SIZE (field)))
	continue;

      /* All fields with an ABI identity must be PSTs for the record as
	 a whole to be a PST.  If any individual field is too big to be
	 interesting then the record is too.  */
      pure_scalable_type_info field_info;
      analysis_result subresult = field_info.analyze (TREE_TYPE (field));
      if (subresult == NO_ABI_IDENTITY)
	continue;
      if (subresult != IS_PST)
	return subresult;

      /* Since all previous fields are PSTs, we ought to be able to track
	 the field offset using poly_ints.  */
      tree bitpos = bit_position (field);
      gcc_assert (poly_int_tree_p (bitpos));

      /* For the same reason, it shouldn't be possible to create a PST field
	 whose offset isn't byte-aligned.  */
      poly_widest_int wide_bytepos = exact_div (wi::to_poly_widest (bitpos),
						BITS_PER_UNIT);

      /* Punt if the record is too big to be interesting.  */
      poly_uint64 bytepos;
      if (!wide_bytepos.to_uhwi (&bytepos)
	  || pieces.length () + field_info.pieces.length () > MAX_PIECES)
	return DOESNT_MATTER;

      /* Add the individual vectors and predicates in the field to the
	 record's list.  */
      gcc_assert (!field_info.pieces.is_empty ());
      for (unsigned int i = 0; i < field_info.pieces.length (); ++i)
	{
	  piece p = field_info.pieces[i];
	  p.offset += bytepos;
	  add_piece (p);
	}
    }
  /* Empty structures disappear in the language->ABI mapping.  */
  return pieces.is_empty () ? NO_ABI_IDENTITY : IS_PST;
}

/* Add P to the list of pieces in the type.  */

void
pure_scalable_type_info::add_piece (const piece &p)
{
  /* Try to fold the new piece into the previous one to form a
     single-mode PST.  For example, if we see three consecutive vectors
     of the same mode, we can represent them using the corresponding
     3-tuple mode.

     This is purely an optimization.  */
  if (!pieces.is_empty ())
    {
      piece &prev = pieces.last ();
      gcc_assert (VECTOR_MODE_P (p.mode) && VECTOR_MODE_P (prev.mode));
      unsigned int nelems1, nelems2;
      if (prev.orig_mode == p.orig_mode
	  && GET_MODE_CLASS (p.orig_mode) != MODE_VECTOR_BOOL
	  && known_eq (prev.offset + GET_MODE_SIZE (prev.mode), p.offset)
	  && constant_multiple_p (GET_MODE_NUNITS (prev.mode),
				  GET_MODE_NUNITS (p.orig_mode), &nelems1)
	  && constant_multiple_p (GET_MODE_NUNITS (p.mode),
				  GET_MODE_NUNITS (p.orig_mode), &nelems2)
	  && targetm.array_mode (p.orig_mode,
				 nelems1 + nelems2).exists (&prev.mode))
	{
	  prev.num_zr += p.num_zr;
	  prev.num_pr += p.num_pr;
	  return;
	}
    }
  pieces.quick_push (p);
}

/* Return true if at least one possible value of type TYPE includes at
   least one object of Pure Scalable Type, in the sense of the AAPCS64.

   This is a relatively expensive test for some types, so it should
   generally be made as late as possible.  */

static bool
aarch64_some_values_include_pst_objects_p (const_tree type)
{
  if (TYPE_SIZE (type) && integer_zerop (TYPE_SIZE (type)))
    return false;

  if (aarch64_sve::builtin_type_p (type))
    return true;

  if (TREE_CODE (type) == ARRAY_TYPE || TREE_CODE (type) == COMPLEX_TYPE)
    return aarch64_some_values_include_pst_objects_p (TREE_TYPE (type));

  if (RECORD_OR_UNION_TYPE_P (type))
    for (tree field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
      if (TREE_CODE (field) == FIELD_DECL
	  && aarch64_some_values_include_pst_objects_p (TREE_TYPE (field)))
	return true;

  return false;
}

/* Return the descriptor of the SIMD ABI.  */

static const predefined_function_abi &
aarch64_simd_abi (void)
{
  predefined_function_abi &simd_abi = function_abis[ARM_PCS_SIMD];
  if (!simd_abi.initialized_p ())
    {
      HARD_REG_SET full_reg_clobbers
	= default_function_abi.full_reg_clobbers ();
      for (int regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
	if (FP_SIMD_SAVED_REGNUM_P (regno))
	  CLEAR_HARD_REG_BIT (full_reg_clobbers, regno);
      simd_abi.initialize (ARM_PCS_SIMD, full_reg_clobbers);
    }
  return simd_abi;
}

/* Return the descriptor of the SVE PCS.  */

static const predefined_function_abi &
aarch64_sve_abi (void)
{
  predefined_function_abi &sve_abi = function_abis[ARM_PCS_SVE];
  if (!sve_abi.initialized_p ())
    {
      HARD_REG_SET full_reg_clobbers
	= default_function_abi.full_reg_clobbers ();
      for (int regno = V8_REGNUM; regno <= V23_REGNUM; ++regno)
	CLEAR_HARD_REG_BIT (full_reg_clobbers, regno);
      for (int regno = P4_REGNUM; regno <= P15_REGNUM; ++regno)
	CLEAR_HARD_REG_BIT (full_reg_clobbers, regno);
      sve_abi.initialize (ARM_PCS_SVE, full_reg_clobbers);
    }
  return sve_abi;
}

/* If X is an UNSPEC_SALT_ADDR expression, return the address that it
   wraps, otherwise return X itself.  */

static rtx
strip_salt (rtx x)
{
  rtx search = x;
  if (GET_CODE (search) == CONST)
    search = XEXP (search, 0);
  if (GET_CODE (search) == UNSPEC && XINT (search, 1) == UNSPEC_SALT_ADDR)
    x = XVECEXP (search, 0, 0);
  return x;
}

/* Like strip_offset, but also strip any UNSPEC_SALT_ADDR from the
   expression.  */

static rtx
strip_offset_and_salt (rtx addr, poly_int64 *offset)
{
  return strip_salt (strip_offset (addr, offset));
}

/* Generate code to enable conditional branches in functions over 1 MiB.  */
const char *
aarch64_gen_far_branch (rtx * operands, int pos_label, const char * dest,
			const char * branch_format)
{
  rtx_code_label * tmp_label = gen_label_rtx ();
  char label_buf[256];
  char buffer[128];
  ASM_GENERATE_INTERNAL_LABEL (label_buf, dest,
			       CODE_LABEL_NUMBER (tmp_label));
  const char *label_ptr = targetm.strip_name_encoding (label_buf);
  rtx dest_label = operands[pos_label];
  operands[pos_label] = tmp_label;

  snprintf (buffer, sizeof (buffer), "%s%s", branch_format, label_ptr);
  output_asm_insn (buffer, operands);

  snprintf (buffer, sizeof (buffer), "b\t%%l%d\n%s:", pos_label, label_ptr);
  operands[pos_label] = dest_label;
  output_asm_insn (buffer, operands);
  return "";
}

void
aarch64_err_no_fpadvsimd (machine_mode mode)
{
  if (TARGET_GENERAL_REGS_ONLY)
    if (FLOAT_MODE_P (mode))
      error ("%qs is incompatible with the use of floating-point types",
	     "-mgeneral-regs-only");
    else
      error ("%qs is incompatible with the use of vector types",
	     "-mgeneral-regs-only");
  else
    if (FLOAT_MODE_P (mode))
      error ("%qs feature modifier is incompatible with the use of"
	     " floating-point types", "+nofp");
    else
      error ("%qs feature modifier is incompatible with the use of"
	     " vector types", "+nofp");
}

/* Report when we try to do something that requires SVE when SVE is disabled.
   This is an error of last resort and isn't very high-quality.  It usually
   involves attempts to measure the vector length in some way.  */
static void
aarch64_report_sve_required (void)
{
  static bool reported_p = false;

  /* Avoid reporting a slew of messages for a single oversight.  */
  if (reported_p)
    return;

  error ("this operation requires the SVE ISA extension");
  inform (input_location, "you can enable SVE using the command-line"
	  " option %<-march%>, or by using the %<target%>"
	  " attribute or pragma");
  reported_p = true;
}

/* Return true if REGNO is P0-P15 or one of the special FFR-related
   registers.  */
inline bool
pr_or_ffr_regnum_p (unsigned int regno)
{
  return PR_REGNUM_P (regno) || regno == FFR_REGNUM || regno == FFRT_REGNUM;
}

/* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS.
   The register allocator chooses POINTER_AND_FP_REGS if FP_REGS and
   GENERAL_REGS have the same cost - even if POINTER_AND_FP_REGS has a much
   higher cost.  POINTER_AND_FP_REGS is also used if the cost of both FP_REGS
   and GENERAL_REGS is lower than the memory cost (in this case the best class
   is the lowest cost one).  Using POINTER_AND_FP_REGS irrespectively of its
   cost results in bad allocations with many redundant int<->FP moves which
   are expensive on various cores.
   To avoid this we don't allow POINTER_AND_FP_REGS as the allocno class, but
   force a decision between FP_REGS and GENERAL_REGS.  We use the allocno class
   if it isn't POINTER_AND_FP_REGS.  Similarly, use the best class if it isn't
   POINTER_AND_FP_REGS.  Otherwise set the allocno class depending on the mode.
   The result of this is that it is no longer inefficient to have a higher
   memory move cost than the register move cost.
*/

static reg_class_t
aarch64_ira_change_pseudo_allocno_class (int regno, reg_class_t allocno_class,
					 reg_class_t best_class)
{
  machine_mode mode;

  if (!reg_class_subset_p (GENERAL_REGS, allocno_class)
      || !reg_class_subset_p (FP_REGS, allocno_class))
    return allocno_class;

  if (!reg_class_subset_p (GENERAL_REGS, best_class)
      || !reg_class_subset_p (FP_REGS, best_class))
    return best_class;

  mode = PSEUDO_REGNO_MODE (regno);
  return FLOAT_MODE_P (mode) || VECTOR_MODE_P (mode) ? FP_REGS : GENERAL_REGS;
}

static unsigned int
aarch64_min_divisions_for_recip_mul (machine_mode mode)
{
  if (GET_MODE_UNIT_SIZE (mode) == 4)
    return aarch64_tune_params.min_div_recip_mul_sf;
  return aarch64_tune_params.min_div_recip_mul_df;
}

/* Return the reassociation width of treeop OPC with mode MODE.  */
static int
aarch64_reassociation_width (unsigned opc, machine_mode mode)
{
  if (VECTOR_MODE_P (mode))
    return aarch64_tune_params.vec_reassoc_width;
  if (INTEGRAL_MODE_P (mode))
    return aarch64_tune_params.int_reassoc_width;
  /* Reassociation reduces the number of FMAs which may result in worse
     performance.  Use a per-CPU setting for FMA reassociation which allows
     narrow CPUs with few FP pipes to switch it off (value of 1), and wider
     CPUs with many FP pipes to enable reassociation.
     Since the reassociation pass doesn't understand FMA at all, assume
     that any FP addition might turn into FMA.  */
  if (FLOAT_MODE_P (mode))
    return opc == PLUS_EXPR ? aarch64_tune_params.fma_reassoc_width
			    : aarch64_tune_params.fp_reassoc_width;
  return 1;
}

/* Provide a mapping from gcc register numbers to dwarf register numbers.  */
unsigned
aarch64_debugger_regno (unsigned regno)
{
  if (GP_REGNUM_P (regno))
    return AARCH64_DWARF_R0 + regno - R0_REGNUM;
  else if (regno == SP_REGNUM)
    return AARCH64_DWARF_SP;
  else if (FP_REGNUM_P (regno))
    return AARCH64_DWARF_V0 + regno - V0_REGNUM;
  else if (PR_REGNUM_P (regno))
    return AARCH64_DWARF_P0 + regno - P0_REGNUM;
  else if (regno == VG_REGNUM)
    return AARCH64_DWARF_VG;

  /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
     equivalent DWARF register.  */
  return DWARF_FRAME_REGISTERS;
}
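
/* For example, with the standard AArch64 DWARF numbering assumed here,
   x0-x30 map to DWARF registers 0-30, sp to 31 and v0-v31 to 64-95.  */
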
/* Implement TARGET_DWARF_FRAME_REG_MODE.  */
static machine_mode
aarch64_dwarf_frame_reg_mode (int regno)
{
  /* Predicate registers are call-clobbered in the EH ABI (which is
     ARM_PCS_AAPCS64), so they should not be described by CFI.
     Their size changes as VL changes, so any values computed by
     __builtin_init_dwarf_reg_size_table might not be valid for
     all frames.  */
  if (PR_REGNUM_P (regno))
    return VOIDmode;
  return default_dwarf_frame_reg_mode (regno);
}

/* Implement TARGET_OUTPUT_CFI_DIRECTIVE.  */
static bool
aarch64_output_cfi_directive (FILE *f, dw_cfi_ref cfi)
{
  bool found = false;
  if (cfi->dw_cfi_opc == DW_CFA_AARCH64_negate_ra_state)
    {
      fprintf (f, "\t.cfi_negate_ra_state\n");
      found = true;
    }
  return found;
}

/* Implement TARGET_DW_CFI_OPRND1_DESC.  */
static bool
aarch64_dw_cfi_oprnd1_desc (dwarf_call_frame_info cfi_opc,
			    dw_cfi_oprnd_type &oprnd_type)
{
  if (cfi_opc == DW_CFA_AARCH64_negate_ra_state)
    {
      oprnd_type = dw_cfi_oprnd_unused;
      return true;
    }
  return false;
}

/* If X is a CONST_DOUBLE, return its bit representation as a constant
   integer, otherwise return X unmodified.  */
static rtx
aarch64_bit_representation (rtx x)
{
  if (CONST_DOUBLE_P (x))
    x = gen_lowpart (int_mode_for_mode (GET_MODE (x)).require (), x);
  return x;
}
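
/* For example, the SFmode constant 1.0 would be rewritten as the SImode
   integer 0x3f800000, its IEEE-754 bit pattern.  */
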
/* Return an estimate for the number of quadwords in an SVE vector.  This is
   equivalent to the number of Advanced SIMD vectors in an SVE vector.  */
static unsigned int
aarch64_estimated_sve_vq ()
{
  return estimated_poly_value (BITS_PER_SVE_VECTOR) / 128;
}

/* Return true if MODE is an SVE predicate mode.  */
static bool
aarch64_sve_pred_mode_p (machine_mode mode)
{
  return (TARGET_SVE
	  && (mode == VNx16BImode
	      || mode == VNx8BImode
	      || mode == VNx4BImode
	      || mode == VNx2BImode));
}

/* Three mutually-exclusive flags describing a vector or predicate type.  */
const unsigned int VEC_ADVSIMD = 1;
const unsigned int VEC_SVE_DATA = 2;
const unsigned int VEC_SVE_PRED = 4;
/* Indicates a structure of 2, 3 or 4 vectors or predicates.  */
const unsigned int VEC_STRUCT = 8;
/* Can be used in combination with VEC_SVE_DATA to indicate that the
   vector has fewer significant bytes than a full SVE vector.  */
const unsigned int VEC_PARTIAL = 16;
/* Useful combinations of the above.  */
const unsigned int VEC_ANY_SVE = VEC_SVE_DATA | VEC_SVE_PRED;
const unsigned int VEC_ANY_DATA = VEC_ADVSIMD | VEC_SVE_DATA;

/* Return a set of flags describing the vector properties of mode MODE.
   If ANY_TARGET_P is false (the default), ignore modes that are not supported
   by the current target.  Otherwise categorize the modes that can be used
   with the set of all targets supported by the port.  */

static unsigned int
aarch64_classify_vector_mode (machine_mode mode, bool any_target_p = false)
{
  if (aarch64_sve_pred_mode_p (mode))
    return VEC_SVE_PRED;

  /* Make the decision based on the mode's enum value rather than its
     properties, so that we keep the correct classification regardless
     of -msve-vector-bits.  */
  switch (mode)
    {
    /* Partial SVE QI vectors.  */
    case E_VNx2QImode:
    case E_VNx4QImode:
    case E_VNx8QImode:
    /* Partial SVE HI vectors.  */
    case E_VNx2HImode:
    case E_VNx4HImode:
    /* Partial SVE SI vector.  */
    case E_VNx2SImode:
    /* Partial SVE HF vectors.  */
    case E_VNx2HFmode:
    case E_VNx4HFmode:
    /* Partial SVE BF vectors.  */
    case E_VNx2BFmode:
    case E_VNx4BFmode:
    /* Partial SVE SF vector.  */
    case E_VNx2SFmode:
      return (TARGET_SVE || any_target_p) ? VEC_SVE_DATA | VEC_PARTIAL : 0;

    case E_VNx16QImode:
    case E_VNx8HImode:
    case E_VNx4SImode:
    case E_VNx2DImode:
    case E_VNx8BFmode:
    case E_VNx8HFmode:
    case E_VNx4SFmode:
    case E_VNx2DFmode:
      return (TARGET_SVE || any_target_p) ? VEC_SVE_DATA : 0;

    /* x2 SVE vectors.  */
    case E_VNx32QImode:
    case E_VNx16HImode:
    case E_VNx8SImode:
    case E_VNx4DImode:
    case E_VNx16BFmode:
    case E_VNx16HFmode:
    case E_VNx8SFmode:
    case E_VNx4DFmode:
    /* x3 SVE vectors.  */
    case E_VNx48QImode:
    case E_VNx24HImode:
    case E_VNx12SImode:
    case E_VNx6DImode:
    case E_VNx24BFmode:
    case E_VNx24HFmode:
    case E_VNx12SFmode:
    case E_VNx6DFmode:
    /* x4 SVE vectors.  */
    case E_VNx64QImode:
    case E_VNx32HImode:
    case E_VNx16SImode:
    case E_VNx8DImode:
    case E_VNx32BFmode:
    case E_VNx32HFmode:
    case E_VNx16SFmode:
    case E_VNx8DFmode:
      return (TARGET_SVE || any_target_p) ? VEC_SVE_DATA | VEC_STRUCT : 0;

    case E_OImode:
    case E_CImode:
    case E_XImode:
      return (TARGET_FLOAT || any_target_p) ? VEC_ADVSIMD | VEC_STRUCT : 0;

    /* Structures of 64-bit Advanced SIMD vectors.  */
    case E_V2x8QImode:
    case E_V2x4HImode:
    case E_V2x2SImode:
    case E_V2x1DImode:
    case E_V2x4BFmode:
    case E_V2x4HFmode:
    case E_V2x2SFmode:
    case E_V2x1DFmode:
    case E_V3x8QImode:
    case E_V3x4HImode:
    case E_V3x2SImode:
    case E_V3x1DImode:
    case E_V3x4BFmode:
    case E_V3x4HFmode:
    case E_V3x2SFmode:
    case E_V3x1DFmode:
    case E_V4x8QImode:
    case E_V4x4HImode:
    case E_V4x2SImode:
    case E_V4x1DImode:
    case E_V4x4BFmode:
    case E_V4x4HFmode:
    case E_V4x2SFmode:
    case E_V4x1DFmode:
      return (TARGET_FLOAT || any_target_p)
	     ? VEC_ADVSIMD | VEC_STRUCT | VEC_PARTIAL : 0;

    /* Structures of 128-bit Advanced SIMD vectors.  */
    case E_V2x16QImode:
    case E_V2x8HImode:
    case E_V2x4SImode:
    case E_V2x2DImode:
    case E_V2x8BFmode:
    case E_V2x8HFmode:
    case E_V2x4SFmode:
    case E_V2x2DFmode:
    case E_V3x16QImode:
    case E_V3x8HImode:
    case E_V3x4SImode:
    case E_V3x2DImode:
    case E_V3x8BFmode:
    case E_V3x8HFmode:
    case E_V3x4SFmode:
    case E_V3x2DFmode:
    case E_V4x16QImode:
    case E_V4x8HImode:
    case E_V4x4SImode:
    case E_V4x2DImode:
    case E_V4x8BFmode:
    case E_V4x8HFmode:
    case E_V4x4SFmode:
    case E_V4x2DFmode:
      return (TARGET_FLOAT || any_target_p) ? VEC_ADVSIMD | VEC_STRUCT : 0;

    /* 64-bit Advanced SIMD vectors.  */
    case E_V8QImode:
    case E_V4HImode:
    case E_V2SImode:
    case E_V1DImode:
    case E_V4HFmode:
    case E_V4BFmode:
    case E_V2SFmode:
    case E_V1DFmode:
    /* 128-bit Advanced SIMD vectors.  */
    case E_V16QImode:
    case E_V8HImode:
    case E_V4SImode:
    case E_V2DImode:
    case E_V8HFmode:
    case E_V8BFmode:
    case E_V4SFmode:
    case E_V2DFmode:
      return (TARGET_FLOAT || any_target_p) ? VEC_ADVSIMD : 0;

    case E_VNx32BImode:
    case E_VNx64BImode:
      return TARGET_SVE ? VEC_SVE_PRED | VEC_STRUCT : 0;

    default:
      return 0;
    }
}
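
/* For example, V4SImode is classified as VEC_ADVSIMD, VNx4SImode as
   VEC_SVE_DATA, VNx2SImode as VEC_SVE_DATA | VEC_PARTIAL, and VNx32QImode
   as VEC_SVE_DATA | VEC_STRUCT, assuming the relevant target features
   are enabled.  */
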
1695 /* Like aarch64_classify_vector_mode, but also include modes that are used
1696 for memory operands but not register operands. Such modes do not count
1697 as real vector modes; they are just an internal construct to make things
1698 easier to describe. */
1699 static unsigned int
1700 aarch64_classify_vector_memory_mode (machine_mode mode)
1702 switch (mode)
1704 case VNx1SImode:
1705 case VNx1DImode:
1706 return TARGET_SVE ? VEC_SVE_DATA | VEC_PARTIAL : 0;
1708 case VNx1TImode:
1709 return TARGET_SVE ? VEC_SVE_DATA : 0;
1711 case VNx2TImode:
1712 case VNx3TImode:
1713 case VNx4TImode:
1714 return TARGET_SVE ? VEC_SVE_DATA | VEC_STRUCT : 0;
1716 default:
1717 return aarch64_classify_vector_mode (mode);
1721 /* Return true if MODE is any of the Advanced SIMD structure modes. */
1722 bool
1723 aarch64_advsimd_struct_mode_p (machine_mode mode)
1725 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
1726 return (vec_flags & VEC_ADVSIMD) && (vec_flags & VEC_STRUCT);
1729 /* Return true if MODE is an Advanced SIMD D-register structure mode. */
1730 static bool
1731 aarch64_advsimd_partial_struct_mode_p (machine_mode mode)
1733 return (aarch64_classify_vector_mode (mode)
1734 == (VEC_ADVSIMD | VEC_STRUCT | VEC_PARTIAL));
1737 /* Return true if MODE is an Advanced SIMD Q-register structure mode. */
1738 static bool
1739 aarch64_advsimd_full_struct_mode_p (machine_mode mode)
1741 return (aarch64_classify_vector_mode (mode) == (VEC_ADVSIMD | VEC_STRUCT));
1744 /* Return true if MODE is any of the data vector modes, including
1745 structure modes. */
1746 static bool
1747 aarch64_vector_data_mode_p (machine_mode mode)
1749 return aarch64_classify_vector_mode (mode) & VEC_ANY_DATA;
1752 /* Return true if MODE is any form of SVE mode, including predicates,
1753 vectors and structures. */
1754 bool
1755 aarch64_sve_mode_p (machine_mode mode)
1757 return aarch64_classify_vector_mode (mode) & VEC_ANY_SVE;
1760 /* Return true if MODE is an SVE data vector mode; either a single vector
1761 or a structure of vectors. */
1762 static bool
1763 aarch64_sve_data_mode_p (machine_mode mode)
1765 return aarch64_classify_vector_mode (mode) & VEC_SVE_DATA;
1768 /* Return the number of defined bytes in one constituent vector of
1769 SVE mode MODE, which has vector flags VEC_FLAGS. */
1770 static poly_int64
1771 aarch64_vl_bytes (machine_mode mode, unsigned int vec_flags)
1773 if (vec_flags & VEC_PARTIAL)
1774 /* A single partial vector. */
1775 return GET_MODE_SIZE (mode);
1777 if (vec_flags & VEC_SVE_DATA)
1778 /* A single vector or a tuple. */
1779 return BYTES_PER_SVE_VECTOR;
1781 /* A single predicate. */
1782 gcc_assert (vec_flags & VEC_SVE_PRED);
1783 return BYTES_PER_SVE_PRED;
1786 /* If MODE holds an array of vectors, return the number of vectors
1787 in the array, otherwise return 1. */
1789 static unsigned int
1790 aarch64_ldn_stn_vectors (machine_mode mode)
1792 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
1793 if (vec_flags == (VEC_ADVSIMD | VEC_PARTIAL | VEC_STRUCT))
1794 return exact_div (GET_MODE_SIZE (mode), 8).to_constant ();
1795 if (vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
1796 return exact_div (GET_MODE_SIZE (mode), 16).to_constant ();
1797 if (vec_flags == (VEC_SVE_DATA | VEC_STRUCT))
1798 return exact_div (GET_MODE_SIZE (mode),
1799 BYTES_PER_SVE_VECTOR).to_constant ();
1800 return 1;
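/* Editorial aside: a minimal standalone sketch (not part of GCC) of the
   arithmetic above.  The count is simply the tuple size divided by the
   per-vector size: 8 bytes for D-register (partial) structures, 16 bytes
   for Q-register structures.  The example_* helper below is hypothetical.  */

constexpr unsigned int
example_ldn_stn_vectors (unsigned int tuple_bytes, bool q_registers)
{
  return tuple_bytes / (q_registers ? 16 : 8);
}

/* E.g. a V3x16QI-style tuple occupies 48 bytes of Q registers (3 vectors)
   and a V2x8QI-style tuple occupies 16 bytes of D registers (2 vectors).  */
static_assert (example_ldn_stn_vectors (48, true) == 3, "three Q vectors");
static_assert (example_ldn_stn_vectors (16, false) == 2, "two D vectors");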
1803 /* Given an Advanced SIMD vector mode MODE and a tuple size NELEMS, return the
1804 corresponding vector structure mode. */
1805 opt_machine_mode
1806 aarch64_advsimd_vector_array_mode (machine_mode mode,
1807 unsigned HOST_WIDE_INT nelems)
1809 unsigned int flags = VEC_ADVSIMD | VEC_STRUCT;
1810 if (known_eq (GET_MODE_SIZE (mode), 8))
1811 flags |= VEC_PARTIAL;
1813 machine_mode struct_mode;
1814 FOR_EACH_MODE_IN_CLASS (struct_mode, GET_MODE_CLASS (mode))
1815 if (aarch64_classify_vector_mode (struct_mode) == flags
1816 && GET_MODE_INNER (struct_mode) == GET_MODE_INNER (mode)
1817 && known_eq (GET_MODE_NUNITS (struct_mode),
1818 GET_MODE_NUNITS (mode) * nelems))
1819 return struct_mode;
1820 return opt_machine_mode ();
1823 /* Return the SVE vector mode that has NUNITS elements of mode INNER_MODE. */
1825 opt_machine_mode
1826 aarch64_sve_data_mode (scalar_mode inner_mode, poly_uint64 nunits)
1828 enum mode_class mclass = (is_a <scalar_float_mode> (inner_mode)
1829 ? MODE_VECTOR_FLOAT : MODE_VECTOR_INT);
1830 machine_mode mode;
1831 FOR_EACH_MODE_IN_CLASS (mode, mclass)
1832 if (inner_mode == GET_MODE_INNER (mode)
1833 && known_eq (nunits, GET_MODE_NUNITS (mode))
1834 && aarch64_sve_data_mode_p (mode))
1835 return mode;
1836 return opt_machine_mode ();
1839 /* Implement target hook TARGET_ARRAY_MODE. */
1840 static opt_machine_mode
1841 aarch64_array_mode (machine_mode mode, unsigned HOST_WIDE_INT nelems)
1843 if (TARGET_SVE && GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL)
1845 /* Use VNx32BI and VNx64BI for tuples of predicates, but explicitly
1846 reject giving a mode to other array sizes. Using integer modes
1847 requires a round trip through memory and generates terrible code. */
1848 if (nelems == 1)
1849 return mode;
1850 if (mode == VNx16BImode && nelems == 2)
1851 return VNx32BImode;
1852 if (mode == VNx16BImode && nelems == 4)
1853 return VNx64BImode;
1854 return BLKmode;
1857 auto flags = aarch64_classify_vector_mode (mode);
1858 if (flags == VEC_SVE_DATA && IN_RANGE (nelems, 2, 4))
1859 return aarch64_sve_data_mode (GET_MODE_INNER (mode),
1860 GET_MODE_NUNITS (mode) * nelems);
1862 if (flags == VEC_ADVSIMD && IN_RANGE (nelems, 2, 4))
1863 return aarch64_advsimd_vector_array_mode (mode, nelems);
1865 return opt_machine_mode ();
1868 /* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P. */
1869 static bool
1870 aarch64_array_mode_supported_p (machine_mode mode,
1871 unsigned HOST_WIDE_INT nelems)
1873 if (TARGET_BASE_SIMD
1874 && (AARCH64_VALID_SIMD_QREG_MODE (mode)
1875 || AARCH64_VALID_SIMD_DREG_MODE (mode))
1876 && (nelems >= 2 && nelems <= 4))
1877 return true;
1879 return false;
1882 /* MODE is some form of SVE vector mode. For data modes, return the number
1883 of vector register bits that each element of MODE occupies, such as 64
1884 for both VNx2DImode and VNx2SImode (where each 32-bit value is stored
1885 in a 64-bit container). For predicate modes, return the number of
1886 data bits controlled by each significant predicate bit. */
1888 static unsigned int
1889 aarch64_sve_container_bits (machine_mode mode)
1891 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
1892 poly_uint64 vector_bits = (vec_flags & (VEC_PARTIAL | VEC_SVE_PRED)
1893 ? BITS_PER_SVE_VECTOR
1894 : GET_MODE_BITSIZE (mode));
1895 return vector_element_size (vector_bits, GET_MODE_NUNITS (mode));
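/* Editorial aside: a minimal standalone sketch (not part of GCC) of the
   container-size calculation above, assuming a hypothetical fixed 128-bit
   vector length instead of the poly_int quantities GCC really uses.  */

constexpr unsigned int
example_sve_container_bits (unsigned int vector_bits, unsigned int nunits)
{
  return vector_bits / nunits;
}

/* VNx2DImode and VNx2SImode both hold 2 elements per 128 bits, so each
   element occupies a 64-bit container; VNx4SImode holds 4, so each element
   occupies a 32-bit container.  */
static_assert (example_sve_container_bits (128, 2) == 64, "64-bit containers");
static_assert (example_sve_container_bits (128, 4) == 32, "32-bit containers");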
1898 /* Return the SVE predicate mode to use for elements that have
1899 ELEM_NBYTES bytes, if such a mode exists. */
1901 opt_machine_mode
1902 aarch64_sve_pred_mode (unsigned int elem_nbytes)
1904 if (TARGET_SVE)
1906 if (elem_nbytes == 1)
1907 return VNx16BImode;
1908 if (elem_nbytes == 2)
1909 return VNx8BImode;
1910 if (elem_nbytes == 4)
1911 return VNx4BImode;
1912 if (elem_nbytes == 8)
1913 return VNx2BImode;
1915 return opt_machine_mode ();
1918 /* Return the SVE predicate mode that should be used to control
1919 SVE mode MODE. */
1921 machine_mode
1922 aarch64_sve_pred_mode (machine_mode mode)
1924 unsigned int bits = aarch64_sve_container_bits (mode);
1925 return aarch64_sve_pred_mode (bits / BITS_PER_UNIT).require ();
1928 /* Implement TARGET_VECTORIZE_GET_MASK_MODE. */
1930 static opt_machine_mode
1931 aarch64_get_mask_mode (machine_mode mode)
1933 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
1934 if (vec_flags & VEC_SVE_DATA)
1935 return aarch64_sve_pred_mode (mode);
1937 return default_get_mask_mode (mode);
1940 /* Return the integer element mode associated with SVE mode MODE. */
1942 static scalar_int_mode
1943 aarch64_sve_element_int_mode (machine_mode mode)
1945 poly_uint64 vector_bits = (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL
1946 ? BITS_PER_SVE_VECTOR
1947 : GET_MODE_BITSIZE (mode));
1948 unsigned int elt_bits = vector_element_size (vector_bits,
1949 GET_MODE_NUNITS (mode));
1950 return int_mode_for_size (elt_bits, 0).require ();
1953 /* Return an integer element mode that contains exactly
1954 aarch64_sve_container_bits (MODE) bits. This is wider than
1955 aarch64_sve_element_int_mode if MODE is a partial vector,
1956 otherwise it's the same. */
1958 static scalar_int_mode
1959 aarch64_sve_container_int_mode (machine_mode mode)
1961 return int_mode_for_size (aarch64_sve_container_bits (mode), 0).require ();
1964 /* Return the integer vector mode associated with SVE mode MODE.
1965 Unlike related_int_vector_mode, this can handle the case in which
1966 MODE is a predicate (and thus has a different total size). */
1968 machine_mode
1969 aarch64_sve_int_mode (machine_mode mode)
1971 scalar_int_mode int_mode = aarch64_sve_element_int_mode (mode);
1972 return aarch64_sve_data_mode (int_mode, GET_MODE_NUNITS (mode)).require ();
1975 /* Look for a vector mode with the same classification as VEC_MODE,
1976 but with each group of FACTOR elements coalesced into a single element.
1977 In other words, look for a mode in which the elements are FACTOR times
1978 larger and in which the number of elements is FACTOR times smaller.
1980 Return the mode found, if one exists. */
1982 static opt_machine_mode
1983 aarch64_coalesce_units (machine_mode vec_mode, unsigned int factor)
1985 auto elt_bits = vector_element_size (GET_MODE_BITSIZE (vec_mode),
1986 GET_MODE_NUNITS (vec_mode));
1987 auto vec_flags = aarch64_classify_vector_mode (vec_mode);
1988 if (vec_flags & VEC_SVE_PRED)
1990 if (known_eq (GET_MODE_SIZE (vec_mode), BYTES_PER_SVE_PRED))
1991 return aarch64_sve_pred_mode (elt_bits * factor);
1992 return {};
1995 scalar_mode new_elt_mode;
1996 if (!int_mode_for_size (elt_bits * factor, false).exists (&new_elt_mode))
1997 return {};
1999 if (vec_flags == VEC_ADVSIMD)
2001 auto mode = aarch64_simd_container_mode (new_elt_mode,
2002 GET_MODE_BITSIZE (vec_mode));
2003 if (mode != word_mode)
2004 return mode;
2006 else if (vec_flags & VEC_SVE_DATA)
2008 poly_uint64 new_nunits;
2009 if (multiple_p (GET_MODE_NUNITS (vec_mode), factor, &new_nunits))
2010 return aarch64_sve_data_mode (new_elt_mode, new_nunits);
2012 return {};
2015 /* Implement TARGET_VECTORIZE_RELATED_MODE. */
2017 static opt_machine_mode
2018 aarch64_vectorize_related_mode (machine_mode vector_mode,
2019 scalar_mode element_mode,
2020 poly_uint64 nunits)
2022 unsigned int vec_flags = aarch64_classify_vector_mode (vector_mode);
2024 /* If we're operating on SVE vectors, try to return an SVE mode. */
2025 poly_uint64 sve_nunits;
2026 if ((vec_flags & VEC_SVE_DATA)
2027 && multiple_p (BYTES_PER_SVE_VECTOR,
2028 GET_MODE_SIZE (element_mode), &sve_nunits))
2030 machine_mode sve_mode;
2031 if (maybe_ne (nunits, 0U))
2033 /* Try to find a full or partial SVE mode with exactly
2034 NUNITS units. */
2035 if (multiple_p (sve_nunits, nunits)
2036 && aarch64_sve_data_mode (element_mode,
2037 nunits).exists (&sve_mode))
2038 return sve_mode;
2040 else
2042 /* Take the preferred number of units from the number of bytes
2043 that fit in VECTOR_MODE. We always start by "autodetecting"
2044 a full vector mode with preferred_simd_mode, so vectors
2045 chosen here will also be full vector modes. Then
2046 autovectorize_vector_modes tries smaller starting modes
2047 and thus smaller preferred numbers of units. */
2048 sve_nunits = ordered_min (sve_nunits, GET_MODE_SIZE (vector_mode));
2049 if (aarch64_sve_data_mode (element_mode,
2050 sve_nunits).exists (&sve_mode))
2051 return sve_mode;
2055 /* Prefer to use 1 128-bit vector instead of 2 64-bit vectors. */
2056 if (TARGET_SIMD
2057 && (vec_flags & VEC_ADVSIMD)
2058 && known_eq (nunits, 0U)
2059 && known_eq (GET_MODE_BITSIZE (vector_mode), 64U)
2060 && maybe_ge (GET_MODE_BITSIZE (element_mode)
2061 * GET_MODE_NUNITS (vector_mode), 128U))
2063 machine_mode res = aarch64_simd_container_mode (element_mode, 128);
2064 if (VECTOR_MODE_P (res))
2065 return res;
2068 return default_vectorize_related_mode (vector_mode, element_mode, nunits);
2071 /* Implement TARGET_VECTORIZE_PREFERRED_DIV_AS_SHIFTS_OVER_MULT. */
2073 static bool
2074 aarch64_vectorize_preferred_div_as_shifts_over_mult (const_tree type)
2076 machine_mode mode = TYPE_MODE (type);
2077 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
2078 bool sve_p = (vec_flags & VEC_ANY_SVE);
2079 bool simd_p = (vec_flags & VEC_ADVSIMD);
2081 return (sve_p && TARGET_SVE2) || (simd_p && TARGET_SIMD);
2084 /* Implement TARGET_PREFERRED_ELSE_VALUE. For binary operations,
2085 prefer to use the first arithmetic operand as the else value if
2086 the else value doesn't matter, since that exactly matches the SVE
2087 destructive merging form. For ternary operations we could either
2088 pick the first operand and use FMAD-like instructions or the last
2089 operand and use FMLA-like instructions; the latter seems more
2090 natural. */
2092 static tree
2093 aarch64_preferred_else_value (unsigned, tree, unsigned int nops, tree *ops)
2095 return nops == 3 ? ops[2] : ops[0];
2098 /* Implement TARGET_HARD_REGNO_NREGS. */
2100 static unsigned int
2101 aarch64_hard_regno_nregs (unsigned regno, machine_mode mode)
2103 /* ??? Logically we should only need to provide a value when
2104 HARD_REGNO_MODE_OK says that the combination is valid,
2105 but at the moment we need to handle all modes. Just ignore
2106 any runtime parts for registers that can't store them. */
2107 HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
2108 switch (aarch64_regno_regclass (regno))
2110 case FP_REGS:
2111 case FP_LO_REGS:
2112 case FP_LO8_REGS:
2114 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
2115 if (vec_flags & VEC_SVE_DATA)
2116 return exact_div (GET_MODE_SIZE (mode),
2117 aarch64_vl_bytes (mode, vec_flags)).to_constant ();
2118 if (vec_flags == (VEC_ADVSIMD | VEC_STRUCT | VEC_PARTIAL))
2119 return GET_MODE_SIZE (mode).to_constant () / 8;
2120 return CEIL (lowest_size, UNITS_PER_VREG);
2123 case PR_REGS:
2124 case PR_LO_REGS:
2125 case PR_HI_REGS:
2126 return mode == VNx64BImode ? 4 : mode == VNx32BImode ? 2 : 1;
2128 case MOVEABLE_SYSREGS:
2129 case FFR_REGS:
2130 case PR_AND_FFR_REGS:
2131 case FAKE_REGS:
2132 return 1;
2134 default:
2135 return CEIL (lowest_size, UNITS_PER_WORD);
2137 gcc_unreachable ();
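/* Editorial aside: a minimal standalone sketch (not part of GCC) of the
   FP-register case above for non-SVE modes, assuming the usual 16-byte
   Advanced SIMD register size.  D-register tuples count one register per
   8 bytes; everything else rounds up to whole 16-byte registers.  The
   example_* helper below is hypothetical.  */

constexpr unsigned int
example_fp_regno_nregs (unsigned int mode_bytes, bool d_reg_tuple)
{
  return d_reg_tuple ? mode_bytes / 8
                     : (mode_bytes + 15) / 16;  /* CEIL (size, 16).  */
}

static_assert (example_fp_regno_nregs (48, false) == 3, "V3x16QI: 3 registers");
static_assert (example_fp_regno_nregs (24, true) == 3, "V3x8QI: 3 registers");
static_assert (example_fp_regno_nregs (8, false) == 1, "V8QI: 1 register");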
2140 /* Implement TARGET_HARD_REGNO_MODE_OK. */
2142 static bool
2143 aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode)
2145 if (mode == V8DImode)
2146 return IN_RANGE (regno, R0_REGNUM, R23_REGNUM)
2147 && multiple_p (regno - R0_REGNUM, 2);
2149 if (GET_MODE_CLASS (mode) == MODE_CC)
2150 return regno == CC_REGNUM;
2152 if (regno == VG_REGNUM)
2153 /* This must have the same size as _Unwind_Word. */
2154 return mode == DImode;
2156 if (regno == FPM_REGNUM)
2157 return mode == QImode || mode == HImode || mode == SImode || mode == DImode;
2159 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
2160 if (vec_flags == VEC_SVE_PRED)
2161 return pr_or_ffr_regnum_p (regno);
2163 if (vec_flags == (VEC_SVE_PRED | VEC_STRUCT))
2164 return PR_REGNUM_P (regno);
2166 if (pr_or_ffr_regnum_p (regno))
2167 return false;
2169 /* These registers are abstract; their modes don't matter. */
2170 if (FAKE_REGNUM_P (regno))
2171 return true;
2173 if (regno == SP_REGNUM)
2174 /* The purpose of comparing with ptr_mode is to support the
2175 global register variable associated with the stack pointer
2176 register via the syntax of asm ("wsp") in ILP32. */
2177 return mode == Pmode || mode == ptr_mode;
2179 if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM)
2180 return mode == Pmode;
2182 if (GP_REGNUM_P (regno))
2184 if (vec_flags & (VEC_ANY_SVE | VEC_STRUCT))
2185 return false;
2186 if (known_le (GET_MODE_SIZE (mode), 8))
2187 return true;
2188 if (known_le (GET_MODE_SIZE (mode), 16))
2189 return (regno & 1) == 0;
2191 else if (FP_REGNUM_P (regno))
2193 if (vec_flags & VEC_STRUCT)
2194 return end_hard_regno (mode, regno) - 1 <= V31_REGNUM;
2195 else
2196 return !VECTOR_MODE_P (mode) || vec_flags != 0;
2199 return false;
2202 /* Return true if a function with type FNTYPE returns its value in
2203 SVE vector or predicate registers. */
2205 static bool
2206 aarch64_returns_value_in_sve_regs_p (const_tree fntype)
2208 tree return_type = TREE_TYPE (fntype);
2210 pure_scalable_type_info pst_info;
2211 switch (pst_info.analyze (return_type))
2213 case pure_scalable_type_info::IS_PST:
2214 return (pst_info.num_zr () <= NUM_FP_ARG_REGS
2215 && pst_info.num_pr () <= NUM_PR_ARG_REGS);
2217 case pure_scalable_type_info::DOESNT_MATTER:
2218 gcc_assert (aarch64_return_in_memory_1 (return_type));
2219 return false;
2221 case pure_scalable_type_info::NO_ABI_IDENTITY:
2222 case pure_scalable_type_info::ISNT_PST:
2223 return false;
2225 gcc_unreachable ();
2228 /* Return true if a function with type FNTYPE takes arguments in
2229 SVE vector or predicate registers. */
2231 static bool
2232 aarch64_takes_arguments_in_sve_regs_p (const_tree fntype)
2234 CUMULATIVE_ARGS args_so_far_v;
2235 aarch64_init_cumulative_args (&args_so_far_v, NULL_TREE, NULL_RTX,
2236 NULL_TREE, 0, true);
2237 cumulative_args_t args_so_far = pack_cumulative_args (&args_so_far_v);
2239 for (tree chain = TYPE_ARG_TYPES (fntype);
2240 chain && chain != void_list_node;
2241 chain = TREE_CHAIN (chain))
2243 tree arg_type = TREE_VALUE (chain);
2244 if (arg_type == error_mark_node)
2245 return false;
2247 function_arg_info arg (arg_type, /*named=*/true);
2248 apply_pass_by_reference_rules (&args_so_far_v, arg);
2249 pure_scalable_type_info pst_info;
2250 if (pst_info.analyze_registers (arg.type))
2252 unsigned int end_zr = args_so_far_v.aapcs_nvrn + pst_info.num_zr ();
2253 unsigned int end_pr = args_so_far_v.aapcs_nprn + pst_info.num_pr ();
2254 gcc_assert (end_zr <= NUM_FP_ARG_REGS && end_pr <= NUM_PR_ARG_REGS);
2255 return true;
2258 targetm.calls.function_arg_advance (args_so_far, arg);
2260 return false;
2263 /* Implement TARGET_FNTYPE_ABI. */
2265 static const predefined_function_abi &
2266 aarch64_fntype_abi (const_tree fntype)
2268 if (lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (fntype)))
2269 return aarch64_simd_abi ();
2271 if (aarch64_returns_value_in_sve_regs_p (fntype)
2272 || aarch64_takes_arguments_in_sve_regs_p (fntype))
2273 return aarch64_sve_abi ();
2275 return default_function_abi;
2278 /* Return the state of PSTATE.SM on entry to functions of type FNTYPE. */
2280 static aarch64_isa_mode
2281 aarch64_fntype_pstate_sm (const_tree fntype)
2283 if (lookup_attribute ("arm", "streaming", TYPE_ATTRIBUTES (fntype)))
2284 return AARCH64_ISA_MODE_SM_ON;
2286 if (lookup_attribute ("arm", "streaming_compatible",
2287 TYPE_ATTRIBUTES (fntype)))
2288 return 0;
2290 return AARCH64_ISA_MODE_SM_OFF;
2293 /* Return state flags that describe whether and how functions of type
2294 FNTYPE share state STATE_NAME with their callers. */
2296 static unsigned int
2297 aarch64_fntype_shared_flags (const_tree fntype, const char *state_name)
2299 return aarch64_lookup_shared_state_flags (TYPE_ATTRIBUTES (fntype),
2300 state_name);
2303 /* Return the state of PSTATE.ZA on entry to functions of type FNTYPE. */
2305 static aarch64_isa_mode
2306 aarch64_fntype_pstate_za (const_tree fntype)
2308 if (aarch64_fntype_shared_flags (fntype, "za")
2309 || aarch64_fntype_shared_flags (fntype, "zt0"))
2310 return AARCH64_ISA_MODE_ZA_ON;
2312 return 0;
2315 /* Return the ISA mode on entry to functions of type FNTYPE. */
2317 static aarch64_isa_mode
2318 aarch64_fntype_isa_mode (const_tree fntype)
2320 return (aarch64_fntype_pstate_sm (fntype)
2321 | aarch64_fntype_pstate_za (fntype));
2324 /* Return true if FNDECL uses streaming mode internally, as an
2325 implementation choice. */
2327 static bool
2328 aarch64_fndecl_is_locally_streaming (const_tree fndecl)
2330 return lookup_attribute ("arm", "locally_streaming",
2331 DECL_ATTRIBUTES (fndecl));
2334 /* Return the state of PSTATE.SM when compiling the body of
2335 function FNDECL. This might be different from the state of
2336 PSTATE.SM on entry. */
2338 static aarch64_isa_mode
2339 aarch64_fndecl_pstate_sm (const_tree fndecl)
2341 if (aarch64_fndecl_is_locally_streaming (fndecl))
2342 return AARCH64_ISA_MODE_SM_ON;
2344 return aarch64_fntype_pstate_sm (TREE_TYPE (fndecl));
2347 /* Return true if function FNDECL has state STATE_NAME, either by creating
2348 new state itself or by sharing state with callers. */
2350 static bool
2351 aarch64_fndecl_has_state (tree fndecl, const char *state_name)
2353 return (aarch64_fndecl_has_new_state (fndecl, state_name)
2354 || aarch64_fntype_shared_flags (TREE_TYPE (fndecl),
2355 state_name) != 0);
2358 /* Return the state of PSTATE.ZA when compiling the body of function FNDECL.
2359 This might be different from the state of PSTATE.ZA on entry. */
2361 static aarch64_isa_mode
2362 aarch64_fndecl_pstate_za (const_tree fndecl)
2364 if (aarch64_fndecl_has_new_state (fndecl, "za")
2365 || aarch64_fndecl_has_new_state (fndecl, "zt0"))
2366 return AARCH64_ISA_MODE_ZA_ON;
2368 return aarch64_fntype_pstate_za (TREE_TYPE (fndecl));
2371 /* Return the ISA mode that should be used to compile the body of
2372 function FNDECL. */
2374 static aarch64_isa_mode
2375 aarch64_fndecl_isa_mode (const_tree fndecl)
2377 return (aarch64_fndecl_pstate_sm (fndecl)
2378 | aarch64_fndecl_pstate_za (fndecl));
2381 /* Return the state of PSTATE.SM on entry to the current function.
2382 This might be different from the state of PSTATE.SM in the function
2383 body. */
2385 static aarch64_isa_mode
2386 aarch64_cfun_incoming_pstate_sm ()
2388 return aarch64_fntype_pstate_sm (TREE_TYPE (cfun->decl));
2391 /* Return the state of PSTATE.ZA on entry to the current function.
2392 This might be different from the state of PSTATE.ZA in the function
2393 body. */
2395 static aarch64_isa_mode
2396 aarch64_cfun_incoming_pstate_za ()
2398 return aarch64_fntype_pstate_za (TREE_TYPE (cfun->decl));
2401 /* Return state flags that describe whether and how the current function shares
2402 state STATE_NAME with callers. */
2404 static unsigned int
2405 aarch64_cfun_shared_flags (const char *state_name)
2407 return aarch64_fntype_shared_flags (TREE_TYPE (cfun->decl), state_name);
2410 /* Return true if the current function creates new state of type STATE_NAME
2411 (as opposed to sharing the state with its callers or ignoring the state
2412 altogether). */
2414 static bool
2415 aarch64_cfun_has_new_state (const char *state_name)
2417 return aarch64_fndecl_has_new_state (cfun->decl, state_name);
2420 /* Return true if PSTATE.SM is 1 in the body of the current function,
2421 but is not guaranteed to be 1 on entry. */
2423 static bool
2424 aarch64_cfun_enables_pstate_sm ()
2426 return (aarch64_fndecl_is_locally_streaming (cfun->decl)
2427 && aarch64_cfun_incoming_pstate_sm () != AARCH64_ISA_MODE_SM_ON);
2430 /* Return true if the current function has state STATE_NAME, either by
2431 creating new state itself or by sharing state with callers. */
2433 static bool
2434 aarch64_cfun_has_state (const char *state_name)
2436 return aarch64_fndecl_has_state (cfun->decl, state_name);
2439 /* Return true if a call from the current function to a function with
2440 ISA mode CALLEE_MODE would involve a change to PSTATE.SM around
2441 the BL instruction. */
2443 static bool
2444 aarch64_call_switches_pstate_sm (aarch64_isa_mode callee_mode)
2446 return (bool) (callee_mode & ~AARCH64_ISA_MODE & AARCH64_ISA_MODE_SM_STATE);
2449 /* Implement TARGET_COMPATIBLE_VECTOR_TYPES_P. */
2451 static bool
2452 aarch64_compatible_vector_types_p (const_tree type1, const_tree type2)
2454 return (aarch64_sve::builtin_type_p (type1)
2455 == aarch64_sve::builtin_type_p (type2));
2458 /* Return true if we should emit CFI for register REGNO. */
2460 static bool
2461 aarch64_emit_cfi_for_reg_p (unsigned int regno)
2463 return (GP_REGNUM_P (regno)
2464 || !default_function_abi.clobbers_full_reg_p (regno));
2467 /* Return the mode we should use to save and restore register REGNO. */
2469 static machine_mode
2470 aarch64_reg_save_mode (unsigned int regno)
2472 if (GP_REGNUM_P (regno) || regno == VG_REGNUM)
2473 return DImode;
2475 if (FP_REGNUM_P (regno))
2476 switch (crtl->abi->id ())
2478 case ARM_PCS_AAPCS64:
2479 /* Only the low 64 bits are saved by the base PCS. */
2480 return DFmode;
2482 case ARM_PCS_SIMD:
2483 /* The vector PCS saves the low 128 bits (which is the full
2484 register on non-SVE targets). */
2485 return V16QImode;
2487 case ARM_PCS_SVE:
2488 /* Use vectors of DImode for registers that need frame
2489 information, so that the first 64 bytes of the save slot
2490 are always the equivalent of what storing D<n> would give. */
2491 if (aarch64_emit_cfi_for_reg_p (regno))
2492 return VNx2DImode;
2494 /* Use vectors of bytes otherwise, so that the layout is
2495 endian-agnostic, and so that we can use LDR and STR for
2496 big-endian targets. */
2497 return VNx16QImode;
2499 case ARM_PCS_TLSDESC:
2500 case ARM_PCS_UNKNOWN:
2501 break;
2504 if (PR_REGNUM_P (regno))
2505 /* Save the full predicate register. */
2506 return VNx16BImode;
2508 gcc_unreachable ();
2511 /* Return the CONST_INT that should be placed in an UNSPEC_CALLEE_ABI rtx.
2512 This value encodes the following information:
2513 - the ISA mode on entry to a callee (ISA_MODE)
2514 - the ABI of the callee (PCS_VARIANT)
2515 - whether the callee has an indirect_return
2516 attribute (INDIRECT_RETURN). */
2519 aarch64_gen_callee_cookie (aarch64_isa_mode isa_mode, arm_pcs pcs_variant,
2520 bool indirect_return)
2522 unsigned int im = (unsigned int) isa_mode;
2523 unsigned int ir = (indirect_return ? 1 : 0) << AARCH64_NUM_ISA_MODES;
2524 unsigned int pv = (unsigned int) pcs_variant
2525 << (AARCH64_NUM_ABI_ATTRIBUTES + AARCH64_NUM_ISA_MODES);
2526 return gen_int_mode (im | ir | pv, DImode);
2529 /* COOKIE is a CONST_INT from an UNSPEC_CALLEE_ABI rtx. Return the
2530 callee's ABI. */
2532 static const predefined_function_abi &
2533 aarch64_callee_abi (rtx cookie)
2535 return function_abis[UINTVAL (cookie)
2536 >> (AARCH64_NUM_ABI_ATTRIBUTES + AARCH64_NUM_ISA_MODES)];
2539 /* COOKIE is a CONST_INT from an UNSPEC_CALLEE_ABI rtx. Return the
2540 required ISA mode on entry to the callee, which is also the ISA
2541 mode on return from the callee. */
2543 static aarch64_isa_mode
2544 aarch64_callee_isa_mode (rtx cookie)
2546 return UINTVAL (cookie) & ((1 << AARCH64_NUM_ISA_MODES) - 1);
2549 /* COOKIE is a CONST_INT from an UNSPEC_CALLEE_ABI rtx. Return
2550 whether function was marked with an indirect_return attribute. */
2552 static bool
2553 aarch64_callee_indirect_return (rtx cookie)
2555 return ((UINTVAL (cookie) >> AARCH64_NUM_ISA_MODES) & 1) == 1;
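/* Editorial aside: a minimal standalone sketch (not part of GCC) of the bit
   layout used by the cookie above: the low AARCH64_NUM_ISA_MODES bits hold
   the ISA mode, the next AARCH64_NUM_ABI_ATTRIBUTES bits hold attribute
   flags such as indirect_return, and the remaining bits hold the PCS
   variant.  The EXAMPLE_* widths and example_* helper are hypothetical
   stand-ins for the real constants.  */

constexpr unsigned int EXAMPLE_NUM_ISA_MODES = 3;
constexpr unsigned int EXAMPLE_NUM_ABI_ATTRIBUTES = 1;

constexpr unsigned long long
example_encode_cookie (unsigned int isa_mode, bool indirect_return,
                       unsigned int pcs_variant)
{
  return (isa_mode
          | ((indirect_return ? 1ULL : 0ULL) << EXAMPLE_NUM_ISA_MODES)
          | ((unsigned long long) pcs_variant
             << (EXAMPLE_NUM_ABI_ATTRIBUTES + EXAMPLE_NUM_ISA_MODES)));
}

/* Decoding mirrors aarch64_callee_abi, aarch64_callee_isa_mode and
   aarch64_callee_indirect_return above.  */
static_assert ((example_encode_cookie (5, true, 2)
                >> (EXAMPLE_NUM_ABI_ATTRIBUTES + EXAMPLE_NUM_ISA_MODES)) == 2,
               "the PCS variant occupies the high bits");
static_assert ((example_encode_cookie (5, true, 2)
                & ((1u << EXAMPLE_NUM_ISA_MODES) - 1)) == 5,
               "the ISA mode occupies the low bits");
static_assert (((example_encode_cookie (5, true, 2)
                 >> EXAMPLE_NUM_ISA_MODES) & 1) == 1,
               "indirect_return sits just above the ISA mode bits");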
2558 /* INSN is a call instruction. Return the CONST_INT stored in its
2559 UNSPEC_CALLEE_ABI rtx. */
2561 static rtx
2562 aarch64_insn_callee_cookie (const rtx_insn *insn)
2564 rtx pat = PATTERN (insn);
2565 gcc_assert (GET_CODE (pat) == PARALLEL);
2566 rtx unspec = XVECEXP (pat, 0, 1);
2567 gcc_assert (GET_CODE (unspec) == UNSPEC
2568 && XINT (unspec, 1) == UNSPEC_CALLEE_ABI);
2569 return XVECEXP (unspec, 0, 0);
2572 /* INSN is a call instruction. Return true if the callee has an
2573 indirect_return attribute. */
2575 bool
2576 aarch_fun_is_indirect_return (rtx_insn *insn)
2578 rtx cookie = aarch64_insn_callee_cookie (insn);
2579 return aarch64_callee_indirect_return (cookie);
2582 /* Implement TARGET_INSN_CALLEE_ABI. */
2584 const predefined_function_abi &
2585 aarch64_insn_callee_abi (const rtx_insn *insn)
2587 return aarch64_callee_abi (aarch64_insn_callee_cookie (insn));
2590 /* INSN is a call instruction. Return the required ISA mode on entry to
2591 the callee, which is also the ISA mode on return from the callee. */
2593 static aarch64_isa_mode
2594 aarch64_insn_callee_isa_mode (const rtx_insn *insn)
2596 return aarch64_callee_isa_mode (aarch64_insn_callee_cookie (insn));
2599 /* Implement TARGET_HARD_REGNO_CALL_PART_CLOBBERED. The callee only saves
2600 the lower 64 bits of a 128-bit register. Tell the compiler the callee
2601 clobbers the top 64 bits when restoring the bottom 64 bits. */
2603 static bool
2604 aarch64_hard_regno_call_part_clobbered (unsigned int abi_id,
2605 unsigned int regno,
2606 machine_mode mode)
2608 if (FP_REGNUM_P (regno) && abi_id != ARM_PCS_SVE)
2610 poly_int64 per_register_size = GET_MODE_SIZE (mode);
2611 unsigned int nregs = hard_regno_nregs (regno, mode);
2612 if (nregs > 1)
2613 per_register_size = exact_div (per_register_size, nregs);
2614 if (abi_id == ARM_PCS_SIMD || abi_id == ARM_PCS_TLSDESC)
2615 return maybe_gt (per_register_size, 16);
2616 return maybe_gt (per_register_size, 8);
2618 return false;
2621 /* Implement REGMODE_NATURAL_SIZE. */
2622 poly_uint64
2623 aarch64_regmode_natural_size (machine_mode mode)
2625 /* The natural size for SVE data modes is one SVE data vector,
2626 and similarly for predicates. We can't independently modify
2627 anything smaller than that. */
2628 /* ??? For now, only do this for variable-width SVE registers.
2629 Doing it for constant-sized registers breaks lower-subreg.cc. */
2630 /* ??? And once that's fixed, we should probably have similar
2631 code for Advanced SIMD. */
2632 if (!aarch64_sve_vg.is_constant ())
2634 /* REGMODE_NATURAL_SIZE influences general subreg validity rules,
2635 so we need to handle memory-only modes as well. */
2636 unsigned int vec_flags = aarch64_classify_vector_memory_mode (mode);
2637 if (vec_flags & VEC_SVE_PRED)
2638 return BYTES_PER_SVE_PRED;
2639 if (vec_flags & VEC_SVE_DATA)
2640 return BYTES_PER_SVE_VECTOR;
2642 return UNITS_PER_WORD;
2645 /* Implement HARD_REGNO_CALLER_SAVE_MODE. */
2646 machine_mode
2647 aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned,
2648 machine_mode mode)
2650 /* The predicate mode determines which bits are significant and
2651 which are "don't care". Decreasing the number of lanes would
2652 lose data while increasing the number of lanes would make bits
2653 unnecessarily significant. */
2654 if (PR_REGNUM_P (regno))
2655 return mode;
2656 if (known_lt (GET_MODE_SIZE (mode), 4)
2657 && REG_CAN_CHANGE_MODE_P (regno, mode, SImode)
2658 && REG_CAN_CHANGE_MODE_P (regno, SImode, mode))
2659 return SImode;
2660 return mode;
2663 /* Return true if I's bits are consecutive ones from the MSB. */
2664 bool
2665 aarch64_high_bits_all_ones_p (HOST_WIDE_INT i)
2667 return exact_log2 (-i) != HOST_WIDE_INT_M1;
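/* Editorial aside: a value whose set bits form one contiguous run starting
   at the MSB is the negation of a power of two, which is what the
   exact_log2 test above checks.  A minimal standalone sketch (not part of
   GCC) using a single-set-bit test; the example_* helper is hypothetical.  */

constexpr bool
example_high_bits_all_ones_p (long long i)
{
  /* -i must be a power of two (and i must be nonzero).  */
  return i != 0 && (-(unsigned long long) i
                    & (-(unsigned long long) i - 1)) == 0;
}

static_assert (example_high_bits_all_ones_p (-16), "0xff...f0 qualifies");
static_assert (!example_high_bits_all_ones_p (0xf0), "0xf0 does not");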
2670 /* Implement TARGET_CONSTANT_ALIGNMENT. Make strings word-aligned so
2671 that strcpy from constants will be faster. */
2673 static HOST_WIDE_INT
2674 aarch64_constant_alignment (const_tree exp, HOST_WIDE_INT align)
2676 if (TREE_CODE (exp) == STRING_CST && !optimize_size)
2677 return MAX (align, BITS_PER_WORD);
2678 return align;
2681 /* Align definitions of arrays, unions and structures so that
2682 initializations and copies can be made more efficient. This is not
2683 ABI-changing, so it only affects places where we can see the
2684 definition. Increasing the alignment tends to introduce padding,
2685 so don't do this when optimizing for size/conserving stack space. */
2687 unsigned
2688 aarch64_data_alignment (const_tree type, unsigned align)
2690 if (optimize_size)
2691 return align;
2693 if (AGGREGATE_TYPE_P (type))
2695 unsigned HOST_WIDE_INT size = 0;
2697 if (TYPE_SIZE (type) && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
2698 && tree_fits_uhwi_p (TYPE_SIZE (type)))
2699 size = tree_to_uhwi (TYPE_SIZE (type));
2701 /* Align small structs/arrays to 32 bits, or 64 bits if larger. */
2702 if (align < 32 && size <= 32)
2703 align = 32;
2704 else if (align < 64)
2705 align = 64;
2708 return align;
2711 unsigned
2712 aarch64_stack_alignment (const_tree type, unsigned align)
2714 if (flag_conserve_stack)
2715 return align;
2717 if (AGGREGATE_TYPE_P (type))
2719 unsigned HOST_WIDE_INT size = 0;
2721 if (TYPE_SIZE (type) && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
2722 && tree_fits_uhwi_p (TYPE_SIZE (type)))
2723 size = tree_to_uhwi (TYPE_SIZE (type));
2725 /* Align small structs/arrays to 32 bits, or 64 bits if larger. */
2726 if (align < 32 && size <= 32)
2727 align = 32;
2728 else if (align < 64)
2729 align = 64;
2732 return align;
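/* Editorial aside: a minimal standalone sketch (not part of GCC) of the
   rule shared by the two functions above, with sizes and alignments in
   bits: aggregates of at most 32 bits get at least 32-bit alignment,
   anything else gets at least 64-bit alignment, and a stricter existing
   alignment is never reduced.  The example_* helper is hypothetical.  */

constexpr unsigned int
example_aggregate_alignment (unsigned int size_bits, unsigned int align_bits)
{
  return (align_bits < 32 && size_bits <= 32) ? 32
         : align_bits < 64 ? 64
         : align_bits;
}

static_assert (example_aggregate_alignment (24, 8) == 32,
               "a 3-byte struct becomes 32-bit aligned");
static_assert (example_aggregate_alignment (96, 8) == 64,
               "a 12-byte array becomes 64-bit aligned");
static_assert (example_aggregate_alignment (256, 128) == 128,
               "stricter alignments are kept");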
2735 /* Return true if calls to DECL should be treated as
2736 long-calls (i.e. called via a register). */
2737 static bool
2738 aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED)
2740 return false;
2743 /* Return true if calls to symbol-ref SYM should be treated as
2744 long-calls (i.e. called via a register). */
2745 bool
2746 aarch64_is_long_call_p (rtx sym)
2748 return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym));
2751 /* Return true if calls to symbol-ref SYM should not go through
2752 plt stubs. */
2754 bool
2755 aarch64_is_noplt_call_p (rtx sym)
2757 const_tree decl = SYMBOL_REF_DECL (sym);
2759 if (flag_pic
2760 && decl
2761 && (!flag_plt
2762 || lookup_attribute ("noplt", DECL_ATTRIBUTES (decl)))
2763 && !targetm.binds_local_p (decl))
2764 return true;
2766 return false;
2769 /* Emit an insn that's a simple single-set. Both the operands must be
2770 known to be valid. */
2771 inline static rtx_insn *
2772 emit_set_insn (rtx x, rtx y)
2774 return emit_insn (gen_rtx_SET (x, y));
2777 /* X and Y are two things to compare using CODE. Emit the compare insn and
2778 return the rtx for register 0 in the proper mode. */
2780 aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
2782 machine_mode cmp_mode = GET_MODE (x);
2783 machine_mode cc_mode;
2784 rtx cc_reg;
2786 if (cmp_mode == TImode)
2788 gcc_assert (code == NE);
2790 cc_mode = CCmode;
2791 cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);
2793 rtx x_lo = operand_subword (x, 0, 0, TImode);
2794 rtx y_lo = operand_subword (y, 0, 0, TImode);
2795 emit_set_insn (cc_reg, gen_rtx_COMPARE (cc_mode, x_lo, y_lo));
2797 rtx x_hi = operand_subword (x, 1, 0, TImode);
2798 rtx y_hi = operand_subword (y, 1, 0, TImode);
2799 emit_insn (gen_ccmpccdi (cc_reg, cc_reg, x_hi, y_hi,
2800 gen_rtx_EQ (cc_mode, cc_reg, const0_rtx),
2801 GEN_INT (AARCH64_EQ)));
2803 else
2805 cc_mode = SELECT_CC_MODE (code, x, y);
2806 cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);
2807 emit_set_insn (cc_reg, gen_rtx_COMPARE (cc_mode, x, y));
2809 return cc_reg;
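/* Editorial aside: the CMP/CCMP pair emitted for the TImode case above
   computes a 128-bit inequality as "the low halves differ, or, if they are
   equal, the high halves differ".  A minimal standalone sketch (not part
   of GCC) of that decomposition; the example_* helper is hypothetical.  */

constexpr bool
example_ti_ne (unsigned long long x_lo, unsigned long long x_hi,
               unsigned long long y_lo, unsigned long long y_hi)
{
  return x_lo != y_lo || x_hi != y_hi;
}

static_assert (example_ti_ne (1, 0, 1, 2), "values differing in the high half");
static_assert (!example_ti_ne (1, 2, 1, 2), "equal values compare equal");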
2812 /* Similarly, but maybe zero-extend Y if Y_MODE < SImode. */
2814 static rtx
2815 aarch64_gen_compare_reg_maybe_ze (RTX_CODE code, rtx x, rtx y,
2816 machine_mode y_mode)
2818 if (y_mode == E_QImode || y_mode == E_HImode)
2820 if (CONST_INT_P (y))
2822 y = GEN_INT (INTVAL (y) & GET_MODE_MASK (y_mode));
2823 y_mode = SImode;
2825 else
2827 rtx t, cc_reg;
2828 machine_mode cc_mode;
2830 t = gen_rtx_ZERO_EXTEND (SImode, y);
2831 t = gen_rtx_COMPARE (CC_SWPmode, t, x);
2832 cc_mode = CC_SWPmode;
2833 cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);
2834 emit_set_insn (cc_reg, t);
2835 return cc_reg;
2839 if (!aarch64_plus_operand (y, y_mode))
2840 y = force_reg (y_mode, y);
2842 return aarch64_gen_compare_reg (code, x, y);
2845 /* Generate conditional branch to LABEL, comparing X to 0 using CODE.
2846 Return the jump instruction. */
2848 static rtx
2849 aarch64_gen_compare_zero_and_branch (rtx_code code, rtx x,
2850 rtx_code_label *label)
2852 if (aarch64_track_speculation)
2854 /* Emit an explicit compare instruction, so that we can correctly
2855 track the condition codes. */
2856 rtx cc_reg = aarch64_gen_compare_reg (code, x, const0_rtx);
2857 x = gen_rtx_fmt_ee (code, GET_MODE (cc_reg), cc_reg, const0_rtx);
2859 else
2860 x = gen_rtx_fmt_ee (code, VOIDmode, x, const0_rtx);
2862 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
2863 gen_rtx_LABEL_REF (Pmode, label), pc_rtx);
2864 return gen_rtx_SET (pc_rtx, x);
2867 /* Return an rtx that branches to LABEL based on the value of bit BITNUM of X.
2868 If CODE is NE, it branches to LABEL when the bit is set; if CODE is EQ,
2869 it branches to LABEL when the bit is clear. */
2871 static rtx
2872 aarch64_gen_test_and_branch (rtx_code code, rtx x, int bitnum,
2873 rtx_code_label *label)
2875 auto mode = GET_MODE (x);
2876 if (aarch64_track_speculation)
2878 auto mask = gen_int_mode (HOST_WIDE_INT_1U << bitnum, mode);
2879 emit_insn (gen_aarch64_and3nr_compare0 (mode, x, mask));
2880 rtx cc_reg = gen_rtx_REG (CC_NZVmode, CC_REGNUM);
2881 rtx x = gen_rtx_fmt_ee (code, CC_NZVmode, cc_reg, const0_rtx);
2882 return gen_condjump (x, cc_reg, label);
2884 return gen_aarch64_tb (code, mode, mode,
2885 x, gen_int_mode (bitnum, mode), label);
2888 /* Consider the operation:
2890 OPERANDS[0] = CODE (OPERANDS[1], OPERANDS[2]) + OPERANDS[3]
2892 where:
2894 - CODE is [SU]MAX or [SU]MIN
2895 - OPERANDS[2] and OPERANDS[3] are constant integers
2896 - OPERANDS[3] is a positive or negative shifted 12-bit immediate
2897 - all operands have mode MODE
2899 Decide whether it is possible to implement the operation using:
2901 SUBS <tmp>, OPERANDS[1], -OPERANDS[3]
2903 ADDS <tmp>, OPERANDS[1], OPERANDS[3]
2905 followed by:
2907 <insn> OPERANDS[0], <tmp>, [wx]zr, <cond>
2909 where <insn> is one of CSEL, CSINV or CSINC. Return true if so.
2910 If GENERATE_P is true, also update OPERANDS as follows:
2912 OPERANDS[4] = -OPERANDS[3]
2913 OPERANDS[5] = the rtl condition representing <cond>
2914 OPERANDS[6] = <tmp>
2915 OPERANDS[7] = 0 for CSEL, -1 for CSINV or 1 for CSINC. */
2916 bool
2917 aarch64_maxmin_plus_const (rtx_code code, rtx *operands, bool generate_p)
2919 signop sgn = (code == UMAX || code == UMIN ? UNSIGNED : SIGNED);
2920 rtx dst = operands[0];
2921 rtx maxmin_op = operands[2];
2922 rtx add_op = operands[3];
2923 machine_mode mode = GET_MODE (dst);
2925 /* max (x, y) - z == (x >= y + 1 ? x : y) - z
2926 == (x >= y ? x : y) - z
2927 == (x > y ? x : y) - z
2928 == (x > y - 1 ? x : y) - z
2930 min (x, y) - z == (x <= y - 1 ? x : y) - z
2931 == (x <= y ? x : y) - z
2932 == (x < y ? x : y) - z
2933 == (x < y + 1 ? x : y) - z
2935 Check whether z is in { y - 1, y, y + 1 } and pick the form(s) for
2936 which x is compared with z. Set DIFF to y - z. Thus the supported
2937 combinations are as follows, with DIFF being the value after the ":":
2939 max (x, y) - z == x >= y + 1 ? x - (y + 1) : -1 [z == y + 1]
2940 == x >= y ? x - y : 0 [z == y]
2941 == x > y ? x - y : 0 [z == y]
2942 == x > y - 1 ? x - (y - 1) : 1 [z == y - 1]
2944 min (x, y) - z == x <= y - 1 ? x - (y - 1) : 1 [z == y - 1]
2945 == x <= y ? x - y : 0 [z == y]
2946 == x < y ? x - y : 0 [z == y]
2947 == x < y + 1 ? x - (y + 1) : -1 [z == y + 1]. */
2948 auto maxmin_val = rtx_mode_t (maxmin_op, mode);
2949 auto add_val = rtx_mode_t (add_op, mode);
2950 auto sub_val = wi::neg (add_val);
2951 auto diff = wi::sub (maxmin_val, sub_val);
2952 if (!(diff == 0
2953 || (diff == 1 && wi::gt_p (maxmin_val, sub_val, sgn))
2954 || (diff == -1 && wi::lt_p (maxmin_val, sub_val, sgn))))
2955 return false;
2957 if (!generate_p)
2958 return true;
2960 rtx_code cmp;
2961 switch (code)
2963 case SMAX:
2964 cmp = diff == 1 ? GT : GE;
2965 break;
2966 case UMAX:
2967 cmp = diff == 1 ? GTU : GEU;
2968 break;
2969 case SMIN:
2970 cmp = diff == -1 ? LT : LE;
2971 break;
2972 case UMIN:
2973 cmp = diff == -1 ? LTU : LEU;
2974 break;
2975 default:
2976 gcc_unreachable ();
2978 rtx cc = gen_rtx_REG (CCmode, CC_REGNUM);
2980 operands[4] = immed_wide_int_const (sub_val, mode);
2981 operands[5] = gen_rtx_fmt_ee (cmp, VOIDmode, cc, const0_rtx);
2982 if (can_create_pseudo_p ())
2983 operands[6] = gen_reg_rtx (mode);
2984 else
2985 operands[6] = dst;
2986 operands[7] = immed_wide_int_const (diff, mode);
2988 return true;
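/* Editorial aside: a minimal standalone sketch (not part of GCC) of the
   DIFF == 0 case above for signed max: MAX (x, y) - y equals
   (x >= y ? x - y : 0), which is what the SUBS/CSEL sequence computes.
   The example_* helpers are hypothetical.  */

constexpr long long
example_smax_minus_y (long long x, long long y)
{
  return (x > y ? x : y) - y;   /* max (x, y) + OPERANDS[3], with a == -y.  */
}

constexpr long long
example_smax_minus_y_csel (long long x, long long y)
{
  return x >= y ? x - y : 0;    /* SUBS <tmp>, x, y; CSEL dst, <tmp>, xzr, GE.  */
}

static_assert (example_smax_minus_y (7, 3) == example_smax_minus_y_csel (7, 3),
               "x > y case");
static_assert (example_smax_minus_y (2, 3) == example_smax_minus_y_csel (2, 3),
               "x < y case");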
2992 /* Build the SYMBOL_REF for __tls_get_addr. */
2994 static GTY(()) rtx tls_get_addr_libfunc;
2997 aarch64_tls_get_addr (void)
2999 if (!tls_get_addr_libfunc)
3000 tls_get_addr_libfunc = init_one_libfunc ("__tls_get_addr");
3001 return tls_get_addr_libfunc;
3004 /* Return the TLS model to use for ADDR. */
3006 static enum tls_model
3007 tls_symbolic_operand_type (rtx addr)
3009 enum tls_model tls_kind = TLS_MODEL_NONE;
3010 poly_int64 offset;
3011 addr = strip_offset_and_salt (addr, &offset);
3012 if (SYMBOL_REF_P (addr))
3013 tls_kind = SYMBOL_REF_TLS_MODEL (addr);
3015 return tls_kind;
3018 /* We allow lo_sum expressions in our legitimate addresses, so that
3019 combine can take care of combining addresses where necessary, but
3020 for generation purposes we generate the address as follows:
3022 RTL Absolute
3023 tmp = hi (symbol_ref); adrp x1, foo
3024 dest = lo_sum (tmp, symbol_ref); add dest, x1, :lo12:foo
3027 PIC TLS
3028 adrp x1, :got:foo adrp tmp, :tlsgd:foo
3029 ldr x1, [:got_lo12:foo] add dest, tmp, :tlsgd_lo12:foo
3030 bl __tls_get_addr
3033 Load TLS symbol, depending on TLS mechanism and TLS access model.
3035 Global Dynamic - Traditional TLS:
3036 adrp tmp, :tlsgd:imm
3037 add dest, tmp, #:tlsgd_lo12:imm
3038 bl __tls_get_addr
3040 Global Dynamic - TLS Descriptors:
3041 adrp dest, :tlsdesc:imm
3042 ldr tmp, [dest, #:tlsdesc_lo12:imm]
3043 add dest, dest, #:tlsdesc_lo12:imm
3044 blr tmp
3045 mrs tp, tpidr_el0
3046 add dest, dest, tp
3048 Initial Exec:
3049 mrs tp, tpidr_el0
3050 adrp tmp, :gottprel:imm
3051 ldr dest, [tmp, #:gottprel_lo12:imm]
3052 add dest, dest, tp
3054 Local Exec:
3055 mrs tp, tpidr_el0
3056 add t0, tp, #:tprel_hi12:imm, lsl #12
3057 add t0, t0, #:tprel_lo12_nc:imm
3060 static void
3061 aarch64_load_symref_appropriately (rtx dest, rtx imm,
3062 enum aarch64_symbol_type type)
3064 #if TARGET_PECOFF
3065 rtx tmp = legitimize_pe_coff_symbol (imm, true);
3066 if (tmp)
3068 emit_insn (gen_rtx_SET (dest, tmp));
3069 return;
3071 #endif
3073 switch (type)
3075 case SYMBOL_SMALL_ABSOLUTE:
3077 /* In ILP32, the mode of dest can be either SImode or DImode. */
3078 rtx tmp_reg = dest;
3079 machine_mode mode = GET_MODE (dest);
3081 gcc_assert (mode == Pmode || mode == ptr_mode);
3083 if (can_create_pseudo_p ())
3084 tmp_reg = gen_reg_rtx (mode);
3086 HOST_WIDE_INT mid_const = 0;
3087 if (TARGET_PECOFF)
3089 poly_int64 offset;
3090 strip_offset (imm, &offset);
3092 HOST_WIDE_INT const_offset;
3093 if (offset.is_constant (&const_offset))
3094 /* Written this way for the sake of negative offsets. */
3095 mid_const = const_offset / (1 << 20) * (1 << 20);
3097 imm = plus_constant (mode, imm, -mid_const);
3099 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, copy_rtx (imm)));
3100 if (mid_const)
3101 emit_set_insn (tmp_reg, plus_constant (mode, tmp_reg, mid_const));
3102 emit_insn (gen_add_losym (dest, tmp_reg, imm));
3103 return;
3106 case SYMBOL_TINY_ABSOLUTE:
3107 emit_insn (gen_rtx_SET (dest, imm));
3108 return;
3110 case SYMBOL_SMALL_GOT_28K:
3112 machine_mode mode = GET_MODE (dest);
3113 rtx gp_rtx = pic_offset_table_rtx;
3114 rtx insn;
3115 rtx mem;
3117 /* NOTE: pic_offset_table_rtx can be NULL_RTX, because we can reach
3118 here before RTL expansion.  Tree IVOPTS generates RTL patterns to
3119 decide rtx costs, in which case pic_offset_table_rtx is not
3120 initialized.  In that case there is no need to generate the first
3121 adrp instruction, as the final cost for global variable access is
3122 one instruction.  */
3123 if (gp_rtx != NULL)
3125 /* -fpic with -mcmodel=small allows a 32K GOT table size (but because
3126 we use the page base as the GOT base, the first page may be wasted;
3127 in the worst case there is only 28K of space for the GOT).
3129 The generated instruction sequence for accessing a global variable is:
3132 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym]
3134 Only one instruction is needed.  But we must initialize
3135 pic_offset_table_rtx properly.  We generate an initialization insn
3136 for every global access, and allow CSE to remove all redundant ones.
3138 The final instruction sequence will look like the following
3139 for multiple global variable accesses:
3141 adrp pic_offset_table_rtx, _GLOBAL_OFFSET_TABLE_
3143 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym1]
3144 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym2]
3145 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym3]
3146 ... */
3148 rtx s = gen_rtx_SYMBOL_REF (Pmode, "_GLOBAL_OFFSET_TABLE_");
3149 crtl->uses_pic_offset_table = 1;
3150 emit_move_insn (gp_rtx, gen_rtx_HIGH (Pmode, s));
3152 if (mode != GET_MODE (gp_rtx))
3153 gp_rtx = gen_lowpart (mode, gp_rtx);
3157 if (mode == ptr_mode)
3159 if (mode == DImode)
3160 insn = gen_ldr_got_small_28k_di (dest, gp_rtx, imm);
3161 else
3162 insn = gen_ldr_got_small_28k_si (dest, gp_rtx, imm);
3164 mem = XVECEXP (SET_SRC (insn), 0, 0);
3166 else
3168 gcc_assert (mode == Pmode);
3170 insn = gen_ldr_got_small_28k_sidi (dest, gp_rtx, imm);
3171 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
3174 /* The operand is expected to be a MEM.  Whenever the related insn
3175 pattern changes, the code above that computes MEM should be
3176 updated. */
3177 gcc_assert (MEM_P (mem));
3178 MEM_READONLY_P (mem) = 1;
3179 MEM_NOTRAP_P (mem) = 1;
3180 emit_insn (insn);
3181 return;
3184 case SYMBOL_SMALL_GOT_4G:
3185 emit_insn (gen_rtx_SET (dest, imm));
3186 return;
3188 case SYMBOL_SMALL_TLSGD:
3190 rtx_insn *insns;
3191 /* The return type of __tls_get_addr is the C pointer type
3192 so use ptr_mode. */
3193 rtx result = gen_rtx_REG (ptr_mode, R0_REGNUM);
3194 rtx tmp_reg = dest;
3196 if (GET_MODE (dest) != ptr_mode)
3197 tmp_reg = can_create_pseudo_p () ? gen_reg_rtx (ptr_mode) : result;
3199 start_sequence ();
3200 if (ptr_mode == SImode)
3201 aarch64_emit_call_insn (gen_tlsgd_small_si (result, imm));
3202 else
3203 aarch64_emit_call_insn (gen_tlsgd_small_di (result, imm));
3204 insns = get_insns ();
3205 end_sequence ();
3207 RTL_CONST_CALL_P (insns) = 1;
3208 emit_libcall_block (insns, tmp_reg, result, imm);
3209 /* Convert back to the mode of the dest adding a zero_extend
3210 from SImode (ptr_mode) to DImode (Pmode). */
3211 if (dest != tmp_reg)
3212 convert_move (dest, tmp_reg, true);
3213 return;
3216 case SYMBOL_SMALL_TLSDESC:
3218 machine_mode mode = GET_MODE (dest);
3219 rtx x0 = gen_rtx_REG (mode, R0_REGNUM);
3220 rtx tp;
3222 gcc_assert (mode == Pmode || mode == ptr_mode);
3224 /* In ILP32, the got entry is always of SImode size. Unlike
3225 small GOT, the dest is fixed at reg 0. */
3226 if (TARGET_ILP32)
3227 emit_insn (gen_tlsdesc_small_si (imm));
3228 else
3229 emit_insn (gen_tlsdesc_small_di (imm));
3230 tp = aarch64_load_tp (NULL);
3232 if (mode != Pmode)
3233 tp = gen_lowpart (mode, tp);
3235 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, x0)));
3236 if (REG_P (dest))
3237 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
3238 return;
3241 case SYMBOL_SMALL_TLSIE:
3243 /* In ILP32, the mode of dest can be either SImode or DImode,
3244 while the GOT entry is always of SImode size.  The mode of
3245 dest depends on how dest is used: if dest is assigned to a
3246 pointer (e.g. stored to memory), it has SImode; it may have
3247 DImode if dest is dereferenced to access memory.
3248 This is why we have to handle three different tlsie_small
3249 patterns here (two patterns for ILP32). */
3250 machine_mode mode = GET_MODE (dest);
3251 rtx tmp_reg = gen_reg_rtx (mode);
3252 rtx tp = aarch64_load_tp (NULL);
3254 if (mode == ptr_mode)
3256 if (mode == DImode)
3257 emit_insn (gen_tlsie_small_di (tmp_reg, imm));
3258 else
3260 emit_insn (gen_tlsie_small_si (tmp_reg, imm));
3261 tp = gen_lowpart (mode, tp);
3264 else
3266 gcc_assert (mode == Pmode);
3267 emit_insn (gen_tlsie_small_sidi (tmp_reg, imm));
3270 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, tmp_reg)));
3271 if (REG_P (dest))
3272 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
3273 return;
3276 case SYMBOL_TLSLE12:
3277 case SYMBOL_TLSLE24:
3278 case SYMBOL_TLSLE32:
3279 case SYMBOL_TLSLE48:
3281 machine_mode mode = GET_MODE (dest);
3282 rtx tp = aarch64_load_tp (NULL);
3284 if (mode != Pmode)
3285 tp = gen_lowpart (mode, tp);
3287 switch (type)
3289 case SYMBOL_TLSLE12:
3290 emit_insn ((mode == DImode ? gen_tlsle12_di : gen_tlsle12_si)
3291 (dest, tp, imm));
3292 break;
3293 case SYMBOL_TLSLE24:
3294 emit_insn ((mode == DImode ? gen_tlsle24_di : gen_tlsle24_si)
3295 (dest, tp, imm));
3296 break;
3297 case SYMBOL_TLSLE32:
3298 emit_insn ((mode == DImode ? gen_tlsle32_di : gen_tlsle32_si)
3299 (dest, imm));
3300 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
3301 (dest, dest, tp));
3302 break;
3303 case SYMBOL_TLSLE48:
3304 emit_insn ((mode == DImode ? gen_tlsle48_di : gen_tlsle48_si)
3305 (dest, imm));
3306 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
3307 (dest, dest, tp));
3308 break;
3309 default:
3310 gcc_unreachable ();
3313 if (REG_P (dest))
3314 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
3315 return;
3318 case SYMBOL_TINY_GOT:
3320 rtx insn;
3321 machine_mode mode = GET_MODE (dest);
3323 if (mode == ptr_mode)
3324 insn = gen_ldr_got_tiny (mode, dest, imm);
3325 else
3327 gcc_assert (mode == Pmode);
3328 insn = gen_ldr_got_tiny_sidi (dest, imm);
3331 emit_insn (insn);
3332 return;
3335 case SYMBOL_TINY_TLSIE:
3337 machine_mode mode = GET_MODE (dest);
3338 rtx tp = aarch64_load_tp (NULL);
3340 if (mode == ptr_mode)
3342 if (mode == DImode)
3343 emit_insn (gen_tlsie_tiny_di (dest, imm, tp));
3344 else
3346 tp = gen_lowpart (mode, tp);
3347 emit_insn (gen_tlsie_tiny_si (dest, imm, tp));
3350 else
3352 gcc_assert (mode == Pmode);
3353 emit_insn (gen_tlsie_tiny_sidi (dest, imm, tp));
3356 if (REG_P (dest))
3357 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
3358 return;
3361 default:
3362 gcc_unreachable ();
3366 /* Emit a move from SRC to DEST. Assume that the move expanders can
3367 handle all moves if !can_create_pseudo_p (). The distinction is
3368 important because, unlike emit_move_insn, the move expanders know
3369 how to force Pmode objects into the constant pool even when the
3370 constant pool address is not itself legitimate. */
3371 static rtx
3372 aarch64_emit_move (rtx dest, rtx src)
3374 return (can_create_pseudo_p ()
3375 ? emit_move_insn (dest, src)
3376 : emit_move_insn_1 (dest, src));
3379 /* Apply UNOPTAB to OP and store the result in DEST. */
3381 static void
3382 aarch64_emit_unop (rtx dest, optab unoptab, rtx op)
3384 rtx tmp = expand_unop (GET_MODE (dest), unoptab, op, dest, 0);
3385 if (dest != tmp)
3386 emit_move_insn (dest, tmp);
3389 /* Apply BINOPTAB to OP0 and OP1 and store the result in DEST. */
3391 static void
3392 aarch64_emit_binop (rtx dest, optab binoptab, rtx op0, rtx op1)
3394 rtx tmp = expand_binop (GET_MODE (dest), binoptab, op0, op1, dest, 0,
3395 OPTAB_DIRECT);
3396 if (dest != tmp)
3397 emit_move_insn (dest, tmp);
3400 /* Split a move from SRC to DST into multiple moves of mode SINGLE_MODE. */
3402 void
3403 aarch64_split_move (rtx dst, rtx src, machine_mode single_mode)
3405 machine_mode mode = GET_MODE (dst);
3406 auto npieces = exact_div (GET_MODE_SIZE (mode),
3407 GET_MODE_SIZE (single_mode)).to_constant ();
3408 auto_vec<rtx, 4> dst_pieces, src_pieces;
3410 for (unsigned int i = 0; i < npieces; ++i)
3412 auto off = i * GET_MODE_SIZE (single_mode);
3413 dst_pieces.safe_push (simplify_gen_subreg (single_mode, dst, mode, off));
3414 src_pieces.safe_push (simplify_gen_subreg (single_mode, src, mode, off));
3417 /* At most one pairing may overlap. */
3418 if (reg_overlap_mentioned_p (dst_pieces[0], src))
3419 for (unsigned int i = npieces; i-- > 0;)
3420 aarch64_emit_move (dst_pieces[i], src_pieces[i]);
3421 else
3422 for (unsigned int i = 0; i < npieces; ++i)
3423 aarch64_emit_move (dst_pieces[i], src_pieces[i]);
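/* Editorial aside: the reverse-order loop above matters when the
   destination's first piece aliases the source, e.g. when copying a
   two-word value from words {0,1} to words {1,2} of the same register
   group.  A minimal standalone sketch (not part of GCC) of the ordering
   rule, using a plain array; the example_* helper is hypothetical.  */

constexpr bool
example_overlapping_piecewise_copy ()
{
  int words[3] = {10, 20, 0};
  /* Move the two-word value at words[0..1] to words[1..2].  words[1] (the
     destination's first piece) overlaps the source, so copy the high piece
     first, as the overlap branch above does; copying forwards would
     clobber words[1] before it is read.  */
  words[2] = words[1];
  words[1] = words[0];
  return words[1] == 10 && words[2] == 20;
}

static_assert (example_overlapping_piecewise_copy (),
               "copying high-to-low preserves the value");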
3426 /* Split a 128-bit move operation into two 64-bit move operations,
3427 taking care to handle partial overlap of register to register
3428 copies. Special cases are needed when moving between GP regs and
3429 FP regs. SRC can be a register, constant or memory; DST a register
3430 or memory. If either operand is memory it must not have any side
3431 effects. */
3432 void
3433 aarch64_split_128bit_move (rtx dst, rtx src)
3435 machine_mode mode = GET_MODE (dst);
3437 gcc_assert (mode == TImode || mode == TFmode || mode == TDmode);
3438 gcc_assert (!(side_effects_p (src) || side_effects_p (dst)));
3439 gcc_assert (mode == GET_MODE (src) || GET_MODE (src) == VOIDmode);
3441 if (REG_P (dst) && REG_P (src))
3443 int src_regno = REGNO (src);
3444 int dst_regno = REGNO (dst);
3446 /* Handle FP <-> GP regs. */
3447 if (FP_REGNUM_P (dst_regno) && GP_REGNUM_P (src_regno))
3449 rtx src_lo = gen_lowpart (word_mode, src);
3450 rtx src_hi = gen_highpart (word_mode, src);
3452 emit_insn (gen_aarch64_movlow_di (mode, dst, src_lo));
3453 emit_insn (gen_aarch64_movhigh_di (mode, dst, src_hi));
3454 return;
3456 else if (GP_REGNUM_P (dst_regno) && FP_REGNUM_P (src_regno))
3458 rtx dst_lo = gen_lowpart (word_mode, dst);
3459 rtx dst_hi = gen_highpart (word_mode, dst);
3461 emit_insn (gen_aarch64_movdi_low (mode, dst_lo, src));
3462 emit_insn (gen_aarch64_movdi_high (mode, dst_hi, src));
3463 return;
3467 aarch64_split_move (dst, src, word_mode);
3470 /* Return true if we should split a move from 128-bit value SRC
3471 to 128-bit register DEST. */
3473 bool
3474 aarch64_split_128bit_move_p (rtx dst, rtx src)
3476 if (FP_REGNUM_P (REGNO (dst)))
3477 return REG_P (src) && !FP_REGNUM_P (REGNO (src));
3478 /* All moves to GPRs need to be split. */
3479 return true;
3482 /* Split a complex SIMD move. */
3484 void
3485 aarch64_split_simd_move (rtx dst, rtx src)
3487 machine_mode src_mode = GET_MODE (src);
3488 machine_mode dst_mode = GET_MODE (dst);
3490 gcc_assert (VECTOR_MODE_P (dst_mode));
3492 if (REG_P (dst) && REG_P (src))
3494 gcc_assert (VECTOR_MODE_P (src_mode));
3495 emit_insn (gen_aarch64_split_simd_mov (src_mode, dst, src));
3499 /* Return a register that contains SVE value X reinterpreted as SVE mode MODE.
3500 The semantics are those of svreinterpret rather than those of subregs;
3501 see the comment at the head of aarch64-sve.md for details about the
3502 difference. */
3505 aarch64_sve_reinterpret (machine_mode mode, rtx x)
3507 if (GET_MODE (x) == mode)
3508 return x;
3510 /* can_change_mode_class must only return true if subregs and svreinterprets
3511 have the same semantics. */
3512 if (targetm.can_change_mode_class (GET_MODE (x), mode, FP_REGS))
3513 return force_lowpart_subreg (mode, x, GET_MODE (x));
3515 rtx res = gen_reg_rtx (mode);
3516 x = force_reg (GET_MODE (x), x);
3517 emit_insn (gen_aarch64_sve_reinterpret (mode, res, x));
3518 return res;
3521 bool
3522 aarch64_zero_extend_const_eq (machine_mode xmode, rtx x,
3523 machine_mode ymode, rtx y)
3525 rtx r = simplify_const_unary_operation (ZERO_EXTEND, xmode, y, ymode);
3526 gcc_assert (r != NULL);
3527 return rtx_equal_p (x, r);
3530 /* Return TARGET if it is nonnull and a register of mode MODE.
3531 Otherwise, return a fresh register of mode MODE if we can,
3532 or TARGET reinterpreted as MODE if we can't. */
3534 static rtx
3535 aarch64_target_reg (rtx target, machine_mode mode)
3537 if (target && REG_P (target) && GET_MODE (target) == mode)
3538 return target;
3539 if (!can_create_pseudo_p ())
3541 gcc_assert (target);
3542 return gen_lowpart (mode, target);
3544 return gen_reg_rtx (mode);
3547 /* Return a register that contains the constant in BUILDER, given that
3548 the constant is a legitimate move operand. Use TARGET as the register
3549 if it is nonnull and convenient. */
3551 static rtx
3552 aarch64_emit_set_immediate (rtx target, rtx_vector_builder &builder)
3554 rtx src = builder.build ();
3555 target = aarch64_target_reg (target, GET_MODE (src));
3556 emit_insn (gen_rtx_SET (target, src));
3557 return target;
3560 static rtx
3561 aarch64_force_temporary (machine_mode mode, rtx x, rtx value)
3563 if (can_create_pseudo_p ())
3564 return force_reg (mode, value);
3565 else
3567 gcc_assert (x);
3568 aarch64_emit_move (x, value);
3569 return x;
3573 /* Return true if predicate value X is a constant in which every element
3574 is a CONST_INT. When returning true, describe X in BUILDER as a VNx16BI
3575 value, i.e. as a predicate in which all bits are significant. */
3577 static bool
3578 aarch64_get_sve_pred_bits (rtx_vector_builder &builder, rtx x)
3580 if (!CONST_VECTOR_P (x))
3581 return false;
3583 unsigned int factor = vector_element_size (GET_MODE_NUNITS (VNx16BImode),
3584 GET_MODE_NUNITS (GET_MODE (x)));
3585 unsigned int npatterns = CONST_VECTOR_NPATTERNS (x) * factor;
3586 unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (x);
3587 builder.new_vector (VNx16BImode, npatterns, nelts_per_pattern);
3589 unsigned int nelts = const_vector_encoded_nelts (x);
3590 for (unsigned int i = 0; i < nelts; ++i)
3592 rtx elt = CONST_VECTOR_ENCODED_ELT (x, i);
3593 if (!CONST_INT_P (elt))
3594 return false;
3596 builder.quick_push (elt);
3597 for (unsigned int j = 1; j < factor; ++j)
3598 builder.quick_push (const0_rtx);
3600 builder.finalize ();
3601 return true;
3604 /* BUILDER contains a predicate constant of mode VNx16BI. Return the
3605 widest predicate element size it can have (that is, the largest size
3606 for which each element would still be 0 or 1). */
3608 unsigned int
3609 aarch64_widest_sve_pred_elt_size (rtx_vector_builder &builder)
3611 /* Start with the most optimistic assumption: that we only need
3612 one bit per pattern. This is what we will use if only the first
3613 bit in each pattern is ever set. */
3614 unsigned int mask = GET_MODE_SIZE (DImode);
3615 mask |= builder.npatterns ();
3617 /* Look for set bits. */
3618 unsigned int nelts = builder.encoded_nelts ();
3619 for (unsigned int i = 1; i < nelts; ++i)
3620 if (INTVAL (builder.elt (i)) != 0)
3622 if (i & 1)
3623 return 1;
3624 mask |= i;
3626 return mask & -mask;
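/* Editorial aside: the loop above ORs together the indexes of all set bits
   (plus the pattern count and the 8-byte cap), and "mask & -mask" then
   extracts the largest power of two that divides all of them.  A minimal
   standalone sketch (not part of GCC); the example_* helper is
   hypothetical.  */

constexpr unsigned int
example_lowest_set_bit (unsigned int mask)
{
  return mask & -mask;
}

/* With 8 patterns and set bits only at indexes 0 and 4, the accumulated
   mask is 8 | 8 | 4 == 12, whose lowest set bit is 4: the predicate can be
   treated as having 4-byte elements.  An odd set-bit index would instead
   hit the early "return 1" above.  */
static_assert (example_lowest_set_bit (8 | 8 | 4) == 4, "4-byte elements");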
3629 /* If VNx16BImode rtx X is a canonical PTRUE for a predicate mode,
3630 return that predicate mode, otherwise return opt_machine_mode (). */
3632 opt_machine_mode
3633 aarch64_ptrue_all_mode (rtx x)
3635 gcc_assert (GET_MODE (x) == VNx16BImode);
3636 if (!CONST_VECTOR_P (x)
3637 || !CONST_VECTOR_DUPLICATE_P (x)
3638 || !CONST_INT_P (CONST_VECTOR_ENCODED_ELT (x, 0))
3639 || INTVAL (CONST_VECTOR_ENCODED_ELT (x, 0)) == 0)
3640 return opt_machine_mode ();
3642 unsigned int nelts = const_vector_encoded_nelts (x);
3643 for (unsigned int i = 1; i < nelts; ++i)
3644 if (CONST_VECTOR_ENCODED_ELT (x, i) != const0_rtx)
3645 return opt_machine_mode ();
3647 return aarch64_sve_pred_mode (nelts);
3650 /* BUILDER is a predicate constant of mode VNx16BI. Consider the value
3651 that the constant would have with predicate element size ELT_SIZE
3652 (ignoring the upper bits in each element) and return:
3654 * -1 if all bits are set
3655 * N if the predicate has N leading set bits followed by all clear bits
3656 * 0 if the predicate does not have any of these forms. */
3659 aarch64_partial_ptrue_length (rtx_vector_builder &builder,
3660 unsigned int elt_size)
3662 /* If nelts_per_pattern is 3, we have set bits followed by clear bits
3663 followed by set bits. */
3664 if (builder.nelts_per_pattern () == 3)
3665 return 0;
3667 /* Skip over leading set bits. */
3668 unsigned int nelts = builder.encoded_nelts ();
3669 unsigned int i = 0;
3670 for (; i < nelts; i += elt_size)
3671 if (INTVAL (builder.elt (i)) == 0)
3672 break;
3673 unsigned int vl = i / elt_size;
3675 /* Check for the all-true case. */
3676 if (i == nelts)
3677 return -1;
3679 /* If nelts_per_pattern is 1, then either VL is zero, or we have a
3680 repeating pattern of set bits followed by clear bits. */
3681 if (builder.nelts_per_pattern () != 2)
3682 return 0;
3684 /* We have a "foreground" value and a duplicated "background" value.
3685 If the background might repeat and the last set bit belongs to it,
3686 we might have set bits followed by clear bits followed by set bits. */
3687 if (i > builder.npatterns () && maybe_ne (nelts, builder.full_nelts ()))
3688 return 0;
3690 /* Make sure that the rest are all clear. */
3691 for (; i < nelts; i += elt_size)
3692 if (INTVAL (builder.elt (i)) != 0)
3693 return 0;
3695 return vl;
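/* Illustrative example: a VNx16BI constant with the first 8 bits set and
the rest clear is encoded with npatterns () == 8 and
nelts_per_pattern () == 2 (a foreground of ones over a zero background).
With ELT_SIZE == 1 the first loop stops at i == 8, giving a length of 8;
with ELT_SIZE == 2 (a .h view) only indices 0, 2, 4 and 6 are set,
giving a length of 4.  */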
3698 /* See if there is an svpattern that encodes an SVE predicate of mode
3699 PRED_MODE in which the first VL bits are set and the rest are clear.
3700 Return the pattern if so, otherwise return AARCH64_NUM_SVPATTERNS.
3701 A VL of -1 indicates an all-true vector. */
3703 aarch64_svpattern
3704 aarch64_svpattern_for_vl (machine_mode pred_mode, int vl)
3706 if (vl < 0)
3707 return AARCH64_SV_ALL;
3709 if (maybe_gt (vl, GET_MODE_NUNITS (pred_mode)))
3710 return AARCH64_NUM_SVPATTERNS;
3712 if (vl >= 1 && vl <= 8)
3713 return aarch64_svpattern (AARCH64_SV_VL1 + (vl - 1));
3715 if (vl >= 16 && vl <= 256 && pow2p_hwi (vl))
3716 return aarch64_svpattern (AARCH64_SV_VL16 + (exact_log2 (vl) - 4));
3718 int max_vl;
3719 if (GET_MODE_NUNITS (pred_mode).is_constant (&max_vl))
3721 if (vl == (max_vl / 3) * 3)
3722 return AARCH64_SV_MUL3;
3723 /* These would only trigger for non-power-of-2 lengths. */
3724 if (vl == (max_vl & -4))
3725 return AARCH64_SV_MUL4;
3726 if (vl == (1 << floor_log2 (max_vl)))
3727 return AARCH64_SV_POW2;
3728 if (vl == max_vl)
3729 return AARCH64_SV_ALL;
3731 return AARCH64_NUM_SVPATTERNS;
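/* As an illustrative example, assume -msve-vector-bits=512 and
PRED_MODE == VNx16BImode, so that MAX_VL is 64: VL 7 maps to
AARCH64_SV_VL7, VL 32 to AARCH64_SV_VL32, VL 63 to AARCH64_SV_MUL3,
while VL 42 has no single svpattern and yields AARCH64_NUM_SVPATTERNS.  */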
3734 /* Return a VNx16BImode constant in which every sequence of ELT_SIZE
3735 bits has the lowest bit set and the upper bits clear. This is the
3736 VNx16BImode equivalent of a PTRUE for controlling elements of
3737 ELT_SIZE bytes. However, because the constant is VNx16BImode,
3738 all bits are significant, even the upper zeros. */
3741 aarch64_ptrue_all (unsigned int elt_size)
3743 rtx_vector_builder builder (VNx16BImode, elt_size, 1);
3744 builder.quick_push (const1_rtx);
3745 for (unsigned int i = 1; i < elt_size; ++i)
3746 builder.quick_push (const0_rtx);
3747 return builder.build ();
3750 /* Return an all-true predicate register of mode MODE. */
3753 aarch64_ptrue_reg (machine_mode mode)
3755 gcc_assert (aarch64_sve_pred_mode_p (mode));
3756 rtx reg = force_reg (VNx16BImode, CONSTM1_RTX (VNx16BImode));
3757 return gen_lowpart (mode, reg);
3760 /* Return an all-true (restricted to the leading VL bits) predicate register of
3761 mode MODE. */
3764 aarch64_ptrue_reg (machine_mode mode, unsigned int vl)
3766 gcc_assert (aarch64_sve_pred_mode_p (mode));
3768 rtx_vector_builder builder (VNx16BImode, vl, 2);
3770 for (unsigned i = 0; i < vl; i++)
3771 builder.quick_push (CONST1_RTX (BImode));
3773 for (unsigned i = 0; i < vl; i++)
3774 builder.quick_push (CONST0_RTX (BImode));
3776 rtx const_vec = builder.build ();
3777 rtx reg = force_reg (VNx16BImode, const_vec);
3778 return gen_lowpart (mode, reg);
3781 /* Return a register of mode PRED_MODE for controlling data of mode DATA_MODE.
3783 DATA_MODE can be a scalar, an Advanced SIMD vector, or an SVE vector.
3784 If it's an N-byte scalar or an Advanced SIMD vector, the first N bits
3785 of the predicate will be active and the rest will be inactive.
3786 If DATA_MODE is an SVE mode, every bit of the predicate will be active. */
3788 aarch64_ptrue_reg (machine_mode pred_mode, machine_mode data_mode)
3790 if (aarch64_sve_mode_p (data_mode))
3791 return aarch64_ptrue_reg (pred_mode);
3793 auto size = GET_MODE_SIZE (data_mode).to_constant ();
3794 return aarch64_ptrue_reg (pred_mode, size);
3797 /* Return an all-false predicate register of mode MODE. */
3800 aarch64_pfalse_reg (machine_mode mode)
3802 gcc_assert (aarch64_sve_pred_mode_p (mode));
3803 rtx reg = force_reg (VNx16BImode, CONST0_RTX (VNx16BImode));
3804 return gen_lowpart (mode, reg);
3807 /* PRED1[0] is a PTEST predicate and PRED1[1] is an aarch64_sve_ptrue_flag
3808 for it. PRED2[0] is the predicate for the instruction whose result
3809 is tested by the PTEST and PRED2[1] is again an aarch64_sve_ptrue_flag
3810 for it. Return true if we can prove that the two predicates are
3811 equivalent for PTEST purposes; that is, if we can replace PRED2[0]
3812 with PRED1[0] without changing behavior. */
3814 bool
3815 aarch64_sve_same_pred_for_ptest_p (rtx *pred1, rtx *pred2)
3817 machine_mode mode = GET_MODE (pred1[0]);
3818 gcc_assert (aarch64_sve_pred_mode_p (mode)
3819 && mode == GET_MODE (pred2[0])
3820 && aarch64_sve_ptrue_flag (pred1[1], SImode)
3821 && aarch64_sve_ptrue_flag (pred2[1], SImode));
3823 bool ptrue1_p = (pred1[0] == CONSTM1_RTX (mode)
3824 || INTVAL (pred1[1]) == SVE_KNOWN_PTRUE);
3825 bool ptrue2_p = (pred2[0] == CONSTM1_RTX (mode)
3826 || INTVAL (pred2[1]) == SVE_KNOWN_PTRUE);
3827 return (ptrue1_p && ptrue2_p) || rtx_equal_p (pred1[0], pred2[0]);
3830 /* Emit a comparison CMP between OP1 and OP2, both of which have mode
3831 DATA_MODE, and return the result in a predicate of mode PRED_MODE.
3832 Use TARGET as the target register if nonnull and convenient. */
3834 static rtx
3835 aarch64_sve_emit_int_cmp (rtx target, machine_mode pred_mode, rtx_code cmp,
3836 machine_mode data_mode, rtx op1, rtx op2)
3838 insn_code icode = code_for_aarch64_pred_cmp (cmp, data_mode);
3839 expand_operand ops[5];
3840 create_output_operand (&ops[0], target, pred_mode);
3841 create_input_operand (&ops[1], CONSTM1_RTX (pred_mode), pred_mode);
3842 create_integer_operand (&ops[2], SVE_KNOWN_PTRUE);
3843 create_input_operand (&ops[3], op1, data_mode);
3844 create_input_operand (&ops[4], op2, data_mode);
3845 expand_insn (icode, 5, ops);
3846 return ops[0].value;
3849 /* Use a comparison to convert integer vector SRC into MODE, which is
3850 the corresponding SVE predicate mode. Use TARGET for the result
3851 if it's nonnull and convenient. */
3854 aarch64_convert_sve_data_to_pred (rtx target, machine_mode mode, rtx src)
3856 machine_mode src_mode = GET_MODE (src);
3857 return aarch64_sve_emit_int_cmp (target, mode, NE, src_mode,
3858 src, CONST0_RTX (src_mode));
3861 /* Return the assembly token for svprfop value PRFOP. */
3863 static const char *
3864 svprfop_token (enum aarch64_svprfop prfop)
3866 switch (prfop)
3868 #define CASE(UPPER, LOWER, VALUE) case AARCH64_SV_##UPPER: return #LOWER;
3869 AARCH64_FOR_SVPRFOP (CASE)
3870 #undef CASE
3871 case AARCH64_NUM_SVPRFOPS:
3872 break;
3874 gcc_unreachable ();
3877 /* Return the assembly string for an SVE prefetch operation with
3878 mnemonic MNEMONIC, given that PRFOP_RTX is the prefetch operation
3879 and that SUFFIX is the format for the remaining operands. */
3881 char *
3882 aarch64_output_sve_prefetch (const char *mnemonic, rtx prfop_rtx,
3883 const char *suffix)
3885 static char buffer[128];
3886 aarch64_svprfop prfop = (aarch64_svprfop) INTVAL (prfop_rtx);
3887 unsigned int written = snprintf (buffer, sizeof (buffer), "%s\t%s, %s",
3888 mnemonic, svprfop_token (prfop), suffix);
3889 gcc_assert (written < sizeof (buffer));
3890 return buffer;
3893 /* Check whether we can calculate the number of elements in PATTERN
3894 at compile time, given that there are NELTS_PER_VQ elements per
3895 128-bit block. Return the value if so, otherwise return -1. */
3897 HOST_WIDE_INT
3898 aarch64_fold_sve_cnt_pat (aarch64_svpattern pattern, unsigned int nelts_per_vq)
3900 unsigned int vl, const_vg;
3901 if (pattern >= AARCH64_SV_VL1 && pattern <= AARCH64_SV_VL8)
3902 vl = 1 + (pattern - AARCH64_SV_VL1);
3903 else if (pattern >= AARCH64_SV_VL16 && pattern <= AARCH64_SV_VL256)
3904 vl = 16 << (pattern - AARCH64_SV_VL16);
3905 else if (aarch64_sve_vg.is_constant (&const_vg))
3907 /* There are two vector granules per quadword. */
3908 unsigned int nelts = (const_vg / 2) * nelts_per_vq;
3909 switch (pattern)
3911 case AARCH64_SV_POW2: return 1 << floor_log2 (nelts);
3912 case AARCH64_SV_MUL4: return nelts & -4;
3913 case AARCH64_SV_MUL3: return (nelts / 3) * 3;
3914 case AARCH64_SV_ALL: return nelts;
3915 default: gcc_unreachable ();
3918 else
3919 return -1;
3921 /* There are two vector granules per quadword. */
3922 poly_uint64 nelts_all = exact_div (aarch64_sve_vg, 2) * nelts_per_vq;
3923 if (known_le (vl, nelts_all))
3924 return vl;
3926 /* Requesting more elements than are available results in a PFALSE. */
3927 if (known_gt (vl, nelts_all))
3928 return 0;
3930 return -1;
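/* Two illustrative cases: with -msve-vector-bits=256 (aarch64_sve_vg == 4),
AARCH64_SV_ALL with NELTS_PER_VQ == 2 (.d elements) folds to
(4 / 2) * 2 == 4.  With a variable vector length, AARCH64_SV_VL16 with
NELTS_PER_VQ == 2 cannot be folded, since the vector may or may not hold
16 .d elements, so the result is -1.  */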
3933 /* Return true if a single CNT[BHWD] instruction can multiply FACTOR
3934 by the number of 128-bit quadwords in an SVE vector. */
3936 static bool
3937 aarch64_sve_cnt_factor_p (HOST_WIDE_INT factor)
3939 /* The coefficient must be [1, 16] * {2, 4, 8, 16}. */
3940 return (IN_RANGE (factor, 2, 16 * 16)
3941 && (factor & 1) == 0
3942 && factor <= 16 * (factor & -factor));
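/* For example, FACTOR 6 (2 * 3) is accepted and corresponds to CNTD with
MUL #3, FACTOR 48 (16 * 3) to CNTB with MUL #3, whereas FACTOR 34 (2 * 17)
is rejected because the required multiplier 17 lies outside [1, 16].  */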
3945 /* Return true if we can move VALUE into a register using a single
3946 CNT[BHWD] instruction. */
3948 static bool
3949 aarch64_sve_cnt_immediate_p (poly_int64 value)
3951 HOST_WIDE_INT factor = value.coeffs[0];
3952 return value.coeffs[1] == factor && aarch64_sve_cnt_factor_p (factor);
3955 /* Likewise for rtx X. */
3957 bool
3958 aarch64_sve_cnt_immediate_p (rtx x)
3960 poly_int64 value;
3961 return poly_int_rtx_p (x, &value) && aarch64_sve_cnt_immediate_p (value);
3964 /* Return the asm string for an instruction with a CNT-like vector size
3965 operand (a vector pattern followed by a multiplier in the range [1, 16]).
3966 PREFIX is the mnemonic without the size suffix and OPERANDS is the
3967 first part of the operands template (the part that comes before the
3968 vector size itself). PATTERN is the pattern to use. FACTOR is the
3969 number of quadwords. NELTS_PER_VQ, if nonzero, is the number of elements
3970 in each quadword. If it is zero, we can use any element size. */
3972 static char *
3973 aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
3974 aarch64_svpattern pattern,
3975 unsigned int factor,
3976 unsigned int nelts_per_vq)
3978 static char buffer[sizeof ("sqincd\t%x0, %w0, vl256, mul #16")];
3980 if (nelts_per_vq == 0)
3981 /* There is some overlap in the ranges of the four CNT instructions.
3982 Here we always use the smallest possible element size, so that the
3983 multiplier is 1 wherever possible. */
3984 nelts_per_vq = factor & -factor;
3985 int shift = std::min (exact_log2 (nelts_per_vq), 4);
3986 gcc_assert (IN_RANGE (shift, 1, 4));
3987 char suffix = "dwhb"[shift - 1];
3989 factor >>= shift;
3990 unsigned int written;
3991 if (pattern == AARCH64_SV_ALL && factor == 1)
3992 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s",
3993 prefix, suffix, operands);
3994 else if (factor == 1)
3995 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s, %s",
3996 prefix, suffix, operands, svpattern_token (pattern));
3997 else
3998 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s, %s, mul #%d",
3999 prefix, suffix, operands, svpattern_token (pattern),
4000 factor);
4001 gcc_assert (written < sizeof (buffer));
4002 return buffer;
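/* Illustrative outputs: PREFIX "cnt", OPERANDS "%x0", AARCH64_SV_ALL,
FACTOR 2 and NELTS_PER_VQ 0 produce "cntd\t%x0", while FACTOR 32 produces
"cntb\t%x0, all, mul #2" (the byte form gives the smallest multiplier).  */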
4005 /* Return the asm string for an instruction with a CNT-like vector size
4006 operand (a vector pattern followed by a multiplier in the range [1, 16]).
4007 PREFIX is the mnemonic without the size suffix and OPERANDS is the
4008 first part of the operands template (the part that comes before the
4009 vector size itself). X is the value of the vector size operand,
4010 as a polynomial integer rtx; we need to convert this into an "all"
4011 pattern with a multiplier. */
4013 char *
4014 aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
4015 rtx x)
4017 poly_int64 value = rtx_to_poly_int64 (x);
4018 gcc_assert (aarch64_sve_cnt_immediate_p (value));
4019 return aarch64_output_sve_cnt_immediate (prefix, operands, AARCH64_SV_ALL,
4020 value.coeffs[1], 0);
4023 /* Return the asm string for an instruction with a CNT-like vector size
4024 operand (a vector pattern followed by a multiplier in the range [1, 16]).
4025 PREFIX is the mnemonic without the size suffix and OPERANDS is the
4026 first part of the operands template (the part that comes before the
4027 vector size itself). CNT_PAT[0..2] are the operands of the
4028 UNSPEC_SVE_CNT_PAT; see aarch64_sve_cnt_pat for details. */
4030 char *
4031 aarch64_output_sve_cnt_pat_immediate (const char *prefix,
4032 const char *operands, rtx *cnt_pat)
4034 aarch64_svpattern pattern = (aarch64_svpattern) INTVAL (cnt_pat[0]);
4035 unsigned int nelts_per_vq = INTVAL (cnt_pat[1]);
4036 unsigned int factor = INTVAL (cnt_pat[2]) * nelts_per_vq;
4037 return aarch64_output_sve_cnt_immediate (prefix, operands, pattern,
4038 factor, nelts_per_vq);
4041 /* Return true if we can add X using a single SVE INC or DEC instruction. */
4043 bool
4044 aarch64_sve_scalar_inc_dec_immediate_p (rtx x)
4046 poly_int64 value;
4047 return (poly_int_rtx_p (x, &value)
4048 && (aarch64_sve_cnt_immediate_p (value)
4049 || aarch64_sve_cnt_immediate_p (-value)));
4052 /* Return the asm string for adding SVE INC/DEC immediate OFFSET to
4053 operand 0. */
4055 char *
4056 aarch64_output_sve_scalar_inc_dec (rtx offset)
4058 poly_int64 offset_value = rtx_to_poly_int64 (offset);
4059 gcc_assert (offset_value.coeffs[0] == offset_value.coeffs[1]);
4060 if (offset_value.coeffs[1] > 0)
4061 return aarch64_output_sve_cnt_immediate ("inc", "%x0", AARCH64_SV_ALL,
4062 offset_value.coeffs[1], 0);
4063 else
4064 return aarch64_output_sve_cnt_immediate ("dec", "%x0", AARCH64_SV_ALL,
4065 -offset_value.coeffs[1], 0);
4068 /* Return true if a single RDVL instruction can multiply FACTOR by the
4069 number of 128-bit quadwords in an SVE vector. This is also the
4070 range of ADDVL. */
4072 static bool
4073 aarch64_sve_rdvl_addvl_factor_p (HOST_WIDE_INT factor)
4075 return (multiple_p (factor, 16)
4076 && IN_RANGE (factor, -32 * 16, 31 * 16));
4079 /* Return true if ADDPL can be used to add FACTOR multiplied by the number
4080 of quadwords in an SVE vector. */
4082 static bool
4083 aarch64_sve_addpl_factor_p (HOST_WIDE_INT factor)
4085 return (multiple_p (factor, 2)
4086 && IN_RANGE (factor, -32 * 2, 31 * 2));
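/* Worked examples of the two range checks above: FACTOR 32 can be handled
by RDVL/ADDVL #2 (32 == 2 * 16), FACTOR 6 by ADDPL #3 (6 == 3 * 2), and
FACTOR 8 only by ADDPL #4, since it is not a multiple of 16.  */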
4089 /* Return true if we can move VALUE into a register using a single
4090 RDVL instruction. */
4092 static bool
4093 aarch64_sve_rdvl_immediate_p (poly_int64 value)
4095 HOST_WIDE_INT factor = value.coeffs[0];
4096 return value.coeffs[1] == factor && aarch64_sve_rdvl_addvl_factor_p (factor);
4099 /* Likewise for rtx X. */
4101 bool
4102 aarch64_sve_rdvl_immediate_p (rtx x)
4104 poly_int64 value;
4105 return poly_int_rtx_p (x, &value) && aarch64_sve_rdvl_immediate_p (value);
4108 /* Return the asm string for moving RDVL immediate OFFSET into register
4109 operand 0. */
4111 char *
4112 aarch64_output_sve_rdvl (rtx offset)
4114 static char buffer[sizeof ("rdvl\t%x0, #-") + 3 * sizeof (int)];
4115 poly_int64 offset_value = rtx_to_poly_int64 (offset);
4116 gcc_assert (aarch64_sve_rdvl_immediate_p (offset_value));
4118 int factor = offset_value.coeffs[1];
4119 snprintf (buffer, sizeof (buffer), "rdvl\t%%x0, #%d", factor / 16);
4120 return buffer;
4123 /* Return true if we can add VALUE to a register using a single ADDVL
4124 or ADDPL instruction. */
4126 static bool
4127 aarch64_sve_addvl_addpl_immediate_p (poly_int64 value)
4129 HOST_WIDE_INT factor = value.coeffs[0];
4130 if (factor == 0 || value.coeffs[1] != factor)
4131 return false;
4132 return (aarch64_sve_rdvl_addvl_factor_p (factor)
4133 || aarch64_sve_addpl_factor_p (factor));
4136 /* Likewise for rtx X. */
4138 bool
4139 aarch64_sve_addvl_addpl_immediate_p (rtx x)
4141 poly_int64 value;
4142 return (poly_int_rtx_p (x, &value)
4143 && aarch64_sve_addvl_addpl_immediate_p (value));
4146 /* Return the asm string for adding ADDVL or ADDPL immediate OFFSET
4147 to operand 1 and storing the result in operand 0. */
4149 char *
4150 aarch64_output_sve_addvl_addpl (rtx offset)
4152 static char buffer[sizeof ("addpl\t%x0, %x1, #-") + 3 * sizeof (int)];
4153 poly_int64 offset_value = rtx_to_poly_int64 (offset);
4154 gcc_assert (aarch64_sve_addvl_addpl_immediate_p (offset_value));
4156 int factor = offset_value.coeffs[1];
4157 if ((factor & 15) == 0)
4158 snprintf (buffer, sizeof (buffer), "addvl\t%%x0, %%x1, #%d", factor / 16);
4159 else
4160 snprintf (buffer, sizeof (buffer), "addpl\t%%x0, %%x1, #%d", factor / 2);
4161 return buffer;
4164 /* Return true if X is a valid immediate for an SVE vector INC or DEC
4165 instruction. If it is, store the number of elements in each vector
4166 quadword in *NELTS_PER_VQ_OUT (if nonnull) and store the multiplication
4167 factor in *FACTOR_OUT (if nonnull). */
4169 bool
4170 aarch64_sve_vector_inc_dec_immediate_p (rtx x, int *factor_out,
4171 unsigned int *nelts_per_vq_out)
4173 rtx elt;
4174 poly_int64 value;
4176 if (!const_vec_duplicate_p (x, &elt)
4177 || !poly_int_rtx_p (elt, &value))
4178 return false;
4180 unsigned int nelts_per_vq = 128 / GET_MODE_UNIT_BITSIZE (GET_MODE (x));
4181 if (nelts_per_vq != 8 && nelts_per_vq != 4 && nelts_per_vq != 2)
4182 /* There's no vector INCB. */
4183 return false;
4185 HOST_WIDE_INT factor = value.coeffs[0];
4186 if (value.coeffs[1] != factor)
4187 return false;
4189 /* The coefficient must be [1, 16] * NELTS_PER_VQ. */
4190 if ((factor % nelts_per_vq) != 0
4191 || !IN_RANGE (abs (factor), nelts_per_vq, 16 * nelts_per_vq))
4192 return false;
4194 if (factor_out)
4195 *factor_out = factor;
4196 if (nelts_per_vq_out)
4197 *nelts_per_vq_out = nelts_per_vq;
4198 return true;
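/* Illustrative example: a VNx4SImode constant that duplicates the
poly_int64 value (12, 12) has NELTS_PER_VQ == 4 (.s elements) and
FACTOR == 12 == 3 * 4, so it is accepted and is later emitted as INCW
(or DECW for the negated value) with MUL #3.  */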
4201 /* Return true if X is a valid immediate for an SVE vector INC or DEC
4202 instruction. */
4204 bool
4205 aarch64_sve_vector_inc_dec_immediate_p (rtx x)
4207 return aarch64_sve_vector_inc_dec_immediate_p (x, NULL, NULL);
4210 /* Return the asm template for an SVE vector INC or DEC instruction.
4211 OPERANDS gives the operands before the vector count and X is the
4212 value of the vector count operand itself. */
4214 char *
4215 aarch64_output_sve_vector_inc_dec (const char *operands, rtx x)
4217 int factor;
4218 unsigned int nelts_per_vq;
4219 if (!aarch64_sve_vector_inc_dec_immediate_p (x, &factor, &nelts_per_vq))
4220 gcc_unreachable ();
4221 if (factor < 0)
4222 return aarch64_output_sve_cnt_immediate ("dec", operands, AARCH64_SV_ALL,
4223 -factor, nelts_per_vq);
4224 else
4225 return aarch64_output_sve_cnt_immediate ("inc", operands, AARCH64_SV_ALL,
4226 factor, nelts_per_vq);
4229 /* Return a constant that represents FACTOR multiplied by the
4230 number of 128-bit quadwords in an SME vector. ISA_MODE is the
4231 ISA mode in which the calculation is being performed. */
4234 aarch64_sme_vq_immediate (machine_mode mode, HOST_WIDE_INT factor,
4235 aarch64_isa_mode isa_mode)
4237 gcc_assert (aarch64_sve_rdvl_addvl_factor_p (factor));
4238 if (isa_mode & AARCH64_ISA_MODE_SM_ON)
4239 /* We're in streaming mode, so we can use normal poly-int values. */
4240 return gen_int_mode ({ factor, factor }, mode);
4242 rtvec vec = gen_rtvec (1, gen_int_mode (factor, SImode));
4243 rtx unspec = gen_rtx_UNSPEC (mode, vec, UNSPEC_SME_VQ);
4244 return gen_rtx_CONST (mode, unspec);
4247 /* Return true if X is a constant that represents some number Y
4248 multiplied by the number of quadwords in an SME vector. Store this Y
4249 in *FACTOR if so. */
4251 static bool
4252 aarch64_sme_vq_unspec_p (const_rtx x, HOST_WIDE_INT *factor)
4254 if (!TARGET_SME || GET_CODE (x) != CONST)
4255 return false;
4257 x = XEXP (x, 0);
4258 if (GET_CODE (x) != UNSPEC
4259 || XINT (x, 1) != UNSPEC_SME_VQ
4260 || XVECLEN (x, 0) != 1)
4261 return false;
4263 x = XVECEXP (x, 0, 0);
4264 if (!CONST_INT_P (x))
4265 return false;
4267 *factor = INTVAL (x);
4268 return true;
4271 /* Return true if X is a constant that represents some number Y
4272 multiplied by the number of quadwords in an SME vector, and if
4273 that Y is in the range of RDSVL. */
4275 bool
4276 aarch64_rdsvl_immediate_p (const_rtx x)
4278 HOST_WIDE_INT factor;
4279 return (aarch64_sme_vq_unspec_p (x, &factor)
4280 && aarch64_sve_rdvl_addvl_factor_p (factor));
4283 /* Return the asm string for an RDSVL instruction that calculates X,
4284 which is a constant that satisfies aarch64_rdsvl_immediate_p. */
4286 char *
4287 aarch64_output_rdsvl (const_rtx x)
4289 gcc_assert (aarch64_rdsvl_immediate_p (x));
4290 static char buffer[sizeof ("rdsvl\t%x0, #-") + 3 * sizeof (int)];
4291 x = XVECEXP (XEXP (x, 0), 0, 0);
4292 snprintf (buffer, sizeof (buffer), "rdsvl\t%%x0, #%d",
4293 (int) INTVAL (x) / 16);
4294 return buffer;
4297 /* Return true if X is a constant that can be added using ADDSVL or ADDSPL. */
4299 bool
4300 aarch64_addsvl_addspl_immediate_p (const_rtx x)
4302 HOST_WIDE_INT factor;
4303 return (aarch64_sme_vq_unspec_p (x, &factor)
4304 && (aarch64_sve_rdvl_addvl_factor_p (factor)
4305 || aarch64_sve_addpl_factor_p (factor)));
4308 /* X is a constant that satisfies aarch64_addsvl_addspl_immediate_p.
4309 Return the asm string for the associated instruction. */
4311 char *
4312 aarch64_output_addsvl_addspl (rtx x)
4314 static char buffer[sizeof ("addspl\t%x0, %x1, #-") + 3 * sizeof (int)];
4315 HOST_WIDE_INT factor;
4316 if (!aarch64_sme_vq_unspec_p (x, &factor))
4317 gcc_unreachable ();
4318 if (aarch64_sve_rdvl_addvl_factor_p (factor))
4319 snprintf (buffer, sizeof (buffer), "addsvl\t%%x0, %%x1, #%d",
4320 (int) factor / 16);
4321 else if (aarch64_sve_addpl_factor_p (factor))
4322 snprintf (buffer, sizeof (buffer), "addspl\t%%x0, %%x1, #%d",
4323 (int) factor / 2);
4324 else
4325 gcc_unreachable ();
4326 return buffer;
4329 /* Multipliers for repeating bitmasks of width 32, 16, 8, 4, and 2. */
4331 static const unsigned HOST_WIDE_INT bitmask_imm_mul[] =
4333 0x0000000100000001ull,
4334 0x0001000100010001ull,
4335 0x0101010101010101ull,
4336 0x1111111111111111ull,
4337 0x5555555555555555ull,
4342 /* Return true if 64-bit VAL is a valid bitmask immediate. */
4343 static bool
4344 aarch64_bitmask_imm (unsigned HOST_WIDE_INT val)
4346 unsigned HOST_WIDE_INT tmp, mask, first_one, next_one;
4347 int bits;
4349 /* Check for a single sequence of one bits and return quickly if so.
4350 The special cases of all ones and all zeroes return false. */
4351 tmp = val + (val & -val);
4353 if (tmp == (tmp & -tmp))
4354 return (val + 1) > 1;
4356 /* Invert if the immediate doesn't start with a zero bit - this means we
4357 only need to search for sequences of one bits. */
4358 if (val & 1)
4359 val = ~val;
4361 /* Find the first set bit and set tmp to val with the first sequence of one
4362 bits removed. Return success if there is a single sequence of ones. */
4363 first_one = val & -val;
4364 tmp = val & (val + first_one);
4366 if (tmp == 0)
4367 return true;
4369 /* Find the next set bit and compute the difference in bit position. */
4370 next_one = tmp & -tmp;
4371 bits = clz_hwi (first_one) - clz_hwi (next_one);
4372 mask = val ^ tmp;
4374 /* Check the bit position difference is a power of 2, and that the first
4375 sequence of one bits fits within 'bits' bits. */
4376 if ((mask >> bits) != 0 || bits != (bits & -bits))
4377 return false;
4379 /* Check the sequence of one bits is repeated 64/bits times. */
4380 return val == mask * bitmask_imm_mul[__builtin_clz (bits) - 26];
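/* Examples: 0x0003ffc0 (a single contiguous run of ones),
0x00ff00ff00ff00ff (an 8-bit run repeated every 16 bits) and
0x5555555555555555 (a 1-bit run repeated every 2 bits) are all valid
bitmask immediates, whereas 0, ~0 and 0x1234567812345678 are not.  */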
4384 /* Return true if VAL is a valid bitmask immediate for MODE. */
4385 bool
4386 aarch64_bitmask_imm (unsigned HOST_WIDE_INT val, machine_mode mode)
4388 if (mode == DImode)
4389 return aarch64_bitmask_imm (val);
4391 if (mode == SImode)
4392 return aarch64_bitmask_imm ((val & 0xffffffff) | (val << 32));
4394 /* Replicate small immediates to fit 64 bits. */
4395 int size = GET_MODE_UNIT_PRECISION (mode);
4396 val &= (HOST_WIDE_INT_1U << size) - 1;
4397 val *= bitmask_imm_mul[__builtin_clz (size) - 26];
4399 return aarch64_bitmask_imm (val);
4403 /* Return true if the immediate VAL can be a bitmask immediate
4404 by changing the given MASK bits in VAL to zeroes, ones or bits
4405 from the other half of VAL. Return the new immediate in VAL2. */
4406 static inline bool
4407 aarch64_check_bitmask (unsigned HOST_WIDE_INT val,
4408 unsigned HOST_WIDE_INT &val2,
4409 unsigned HOST_WIDE_INT mask)
4411 val2 = val & ~mask;
4412 if (val2 != val && aarch64_bitmask_imm (val2))
4413 return true;
4414 val2 = val | mask;
4415 if (val2 != val && aarch64_bitmask_imm (val2))
4416 return true;
4417 val = val & ~mask;
4418 val2 = val | (((val >> 32) | (val << 32)) & mask);
4419 if (val2 != val && aarch64_bitmask_imm (val2))
4420 return true;
4421 val2 = val | (((val >> 16) | (val << 48)) & mask);
4422 if (val2 != val && aarch64_bitmask_imm (val2))
4423 return true;
4424 return false;
4428 /* Return true if VAL is a valid MOVZ immediate. */
4429 static inline bool
4430 aarch64_is_movz (unsigned HOST_WIDE_INT val)
4432 return (val >> (ctz_hwi (val) & 48)) < 65536;
4436 /* Return true if immediate VAL can be created by a 64-bit MOVI/MOVN/MOVZ. */
4437 bool
4438 aarch64_is_mov_xn_imm (unsigned HOST_WIDE_INT val)
4440 return aarch64_is_movz (val) || aarch64_is_movz (~val)
4441 || aarch64_bitmask_imm (val);
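/* For example, 0x0000123400000000 can be built with a single MOVZ (one
16-bit chunk at a 16-bit-aligned position), 0xffffffffffff1234 with a
single MOVN (its complement 0xedcb fits in 16 bits), and
0x00ff00ff00ff00ff with the bitmask-immediate form of MOV.  */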
4445 /* Return true if VAL is an immediate that can be created by a single
4446 MOV instruction. */
4447 bool
4448 aarch64_move_imm (unsigned HOST_WIDE_INT val, machine_mode mode)
4450 gcc_assert (mode == SImode || mode == DImode);
4452 if (val < 65536)
4453 return true;
4455 unsigned HOST_WIDE_INT mask =
4456 (val >> 32) == 0 || mode == SImode ? 0xffffffff : HOST_WIDE_INT_M1U;
4458 if (aarch64_is_movz (val & mask) || aarch64_is_movz (~val & mask))
4459 return true;
4461 val = (val & mask) | ((val << 32) & ~mask);
4462 return aarch64_bitmask_imm (val);
4466 static int
4467 aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
4468 machine_mode mode)
4470 int i;
4471 unsigned HOST_WIDE_INT val, val2, val3, mask;
4472 int one_match, zero_match;
4473 int num_insns;
4475 gcc_assert (mode == SImode || mode == DImode);
4477 val = INTVAL (imm);
4479 if (aarch64_move_imm (val, mode))
4481 if (generate)
4482 emit_insn (gen_rtx_SET (dest, imm));
4483 return 1;
4486 if ((val >> 32) == 0 || mode == SImode)
4488 if (generate)
4490 emit_insn (gen_rtx_SET (dest, GEN_INT (val & 0xffff)));
4491 if (mode == SImode)
4492 emit_insn (gen_insv_immsi (dest, GEN_INT (16),
4493 GEN_INT ((val >> 16) & 0xffff)));
4494 else
4495 emit_insn (gen_insv_immdi (dest, GEN_INT (16),
4496 GEN_INT ((val >> 16) & 0xffff)));
4498 return 2;
4501 /* Remaining cases are all for DImode. */
4503 mask = 0xffff;
4504 zero_match = ((val & mask) == 0) + ((val & (mask << 16)) == 0) +
4505 ((val & (mask << 32)) == 0) + ((val & (mask << 48)) == 0);
4506 one_match = ((~val & mask) == 0) + ((~val & (mask << 16)) == 0) +
4507 ((~val & (mask << 32)) == 0) + ((~val & (mask << 48)) == 0);
4509 /* Try a bitmask immediate and a movk to generate the immediate
4510 in 2 instructions. */
4512 if (zero_match < 2 && one_match < 2)
4514 for (i = 0; i < 64; i += 16)
4516 if (aarch64_check_bitmask (val, val2, mask << i))
4517 break;
4519 val2 = val & ~(mask << i);
4520 if ((val2 >> 32) == 0 && aarch64_move_imm (val2, DImode))
4521 break;
4524 if (i != 64)
4526 if (generate)
4528 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
4529 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
4530 GEN_INT ((val >> i) & 0xffff)));
4532 return 2;
4535 /* Try 2 bitmask immediates which are xor'd together. */
4536 for (i = 0; i < 64; i += 16)
4538 val2 = (val >> i) & mask;
4539 val2 |= val2 << 16;
4540 val2 |= val2 << 32;
4541 if (aarch64_bitmask_imm (val2) && aarch64_bitmask_imm (val ^ val2))
4542 break;
4545 if (i != 64)
4547 if (generate)
4549 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
4550 emit_insn (gen_xordi3 (dest, dest, GEN_INT (val ^ val2)));
4552 return 2;
4556 /* Try a bitmask plus 2 movk to generate the immediate in 3 instructions. */
4557 if (zero_match + one_match == 0)
4559 for (i = 0; i < 48; i += 16)
4560 for (int j = i + 16; j < 64; j += 16)
4561 if (aarch64_check_bitmask (val, val2, (mask << i) | (mask << j)))
4563 if (generate)
4565 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
4566 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
4567 GEN_INT ((val >> i) & 0xffff)));
4568 emit_insn (gen_insv_immdi (dest, GEN_INT (j),
4569 GEN_INT ((val >> j) & 0xffff)));
4571 return 3;
4574 /* Try shifting and inserting the bottom 32 bits into the top bits. */
4575 val2 = val & 0xffffffff;
4576 val3 = 0xffffffff;
4577 val3 = val2 | (val3 << 32);
4578 for (i = 17; i < 48; i++)
4579 if ((val2 | (val2 << i)) == val)
4581 if (generate)
4583 emit_insn (gen_rtx_SET (dest, GEN_INT (val2 & 0xffff)));
4584 emit_insn (gen_insv_immdi (dest, GEN_INT (16),
4585 GEN_INT (val2 >> 16)));
4586 emit_insn (gen_ior_ashldi3 (dest, dest, GEN_INT (i), dest));
4588 return 3;
4590 else if ((val3 & ~(val3 << i)) == val)
4592 if (generate)
4594 emit_insn (gen_rtx_SET (dest, GEN_INT (val3 | 0xffff0000)));
4595 emit_insn (gen_insv_immdi (dest, GEN_INT (16),
4596 GEN_INT (val2 >> 16)));
4597 emit_insn (gen_and_one_cmpl_ashldi3 (dest, dest, GEN_INT (i),
4598 dest));
4600 return 3;
4604 /* Generate 2-4 instructions, skipping 16 bits of all zeroes or ones which
4605 are emitted by the initial mov. If one_match > zero_match, skip set bits,
4606 otherwise skip zero bits. */
4608 num_insns = 1;
4609 mask = 0xffff;
4610 val2 = one_match > zero_match ? ~val : val;
4611 i = (val2 & mask) != 0 ? 0 : (val2 & (mask << 16)) != 0 ? 16 : 32;
4613 if (generate)
4614 emit_insn (gen_rtx_SET (dest, GEN_INT (one_match > zero_match
4615 ? (val | ~(mask << i))
4616 : (val & (mask << i)))));
4617 for (i += 16; i < 64; i += 16)
4619 if ((val2 & (mask << i)) == 0)
4620 continue;
4621 if (generate)
4622 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
4623 GEN_INT ((val >> i) & 0xffff)));
4624 num_insns++;
4627 return num_insns;
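/* Illustrative instruction counts: 0x12345678 takes 2 instructions (MOV of
0x5678 followed by MOVK of 0x1234 at bit 16), as does 0x1234000000005678
(the two all-zero halfwords are skipped).  In the worst case an immediate
with no zero or 0xffff halfwords and no usable bitmask structure takes 4
instructions (MOV plus three MOVKs).  */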
4630 /* Return whether imm is a 128-bit immediate which is simple enough to
4631 expand inline. */
4632 bool
4633 aarch64_mov128_immediate (rtx imm)
4635 if (CONST_INT_P (imm))
4636 return true;
4638 gcc_assert (CONST_WIDE_INT_NUNITS (imm) == 2);
4640 rtx lo = GEN_INT (CONST_WIDE_INT_ELT (imm, 0));
4641 rtx hi = GEN_INT (CONST_WIDE_INT_ELT (imm, 1));
4643 return aarch64_internal_mov_immediate (NULL_RTX, lo, false, DImode)
4644 + aarch64_internal_mov_immediate (NULL_RTX, hi, false, DImode) <= 4;
4648 /* Return true if VAL can be encoded as a 12-bit unsigned immediate with
4649 a left shift of 0 or 12 bits. */
4650 bool
4651 aarch64_uimm12_shift (unsigned HOST_WIDE_INT val)
4653 return val < 4096 || (val & 0xfff000) == val;
4656 /* Return the largest value not exceeding VAL that will fit as a 12-bit unsigned immediate
4657 that can be created with a left shift of 0 or 12. */
4658 static HOST_WIDE_INT
4659 aarch64_clamp_to_uimm12_shift (unsigned HOST_WIDE_INT val)
4661 /* Check to see if the value fits in 24 bits, as that is the maximum we can
4662 handle correctly. */
4663 gcc_assert (val < 0x1000000);
4665 if (val < 4096)
4666 return val;
4668 return val & 0xfff000;
4672 /* Test whether:
4674 X = (X & AND_VAL) | IOR_VAL;
4676 can be implemented using:
4678 MOVK X, #(IOR_VAL >> shift), LSL #shift
4680 Return the shift if so, otherwise return -1. */
4682 aarch64_movk_shift (const wide_int_ref &and_val,
4683 const wide_int_ref &ior_val)
4685 unsigned int precision = and_val.get_precision ();
4686 unsigned HOST_WIDE_INT mask = 0xffff;
4687 for (unsigned int shift = 0; shift < precision; shift += 16)
4689 if (and_val == ~mask && (ior_val & mask) == ior_val)
4690 return shift;
4691 mask <<= 16;
4693 return -1;
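/* Worked example: AND_VAL 0xffffffff0000ffff and IOR_VAL 0x12340000 match
at SHIFT 16, since ~(0xffff << 16) equals AND_VAL and IOR_VAL only has
bits within that halfword, so the update can be performed as
MOVK x, #0x1234, LSL #16.  */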
4696 /* Create mask of ones, covering the lowest to highest bits set in VAL_IN.
4697 Assumed precondition: VAL_IN is not zero. */
4699 unsigned HOST_WIDE_INT
4700 aarch64_and_split_imm1 (HOST_WIDE_INT val_in)
4702 int lowest_bit_set = ctz_hwi (val_in);
4703 int highest_bit_set = floor_log2 (val_in);
4704 gcc_assert (val_in != 0);
4706 return ((HOST_WIDE_INT_UC (2) << highest_bit_set) -
4707 (HOST_WIDE_INT_1U << lowest_bit_set));
4710 /* Create constant where bits outside of lowest bit set to highest bit set
4711 are set to 1. */
4713 unsigned HOST_WIDE_INT
4714 aarch64_and_split_imm2 (HOST_WIDE_INT val_in)
4716 return val_in | ~aarch64_and_split_imm1 (val_in);
4719 /* Return true if VAL_IN is a valid 'and' bitmask immediate. */
4721 bool
4722 aarch64_and_bitmask_imm (unsigned HOST_WIDE_INT val_in, machine_mode mode)
4724 scalar_int_mode int_mode;
4725 if (!is_a <scalar_int_mode> (mode, &int_mode))
4726 return false;
4728 if (aarch64_bitmask_imm (val_in, int_mode))
4729 return false;
4731 if (aarch64_move_imm (val_in, int_mode))
4732 return false;
4734 unsigned HOST_WIDE_INT imm2 = aarch64_and_split_imm2 (val_in);
4736 return aarch64_bitmask_imm (imm2, int_mode);
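/* Worked example: VAL_IN 0x000000ff000000f0 is neither a bitmask nor a MOV
immediate.  aarch64_and_split_imm1 gives 0x000000fffffffff0 (ones from
bit 4 to bit 39) and aarch64_and_split_imm2 gives 0xffffffff000000ff;
both are valid bitmask immediates and their AND reproduces VAL_IN, so the
original AND can be split into two ANDs.  */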
4739 /* Return the number of temporary registers that aarch64_add_offset_1
4740 would need to add OFFSET to a register. */
4742 static unsigned int
4743 aarch64_add_offset_1_temporaries (HOST_WIDE_INT offset)
4745 return absu_hwi (offset) < 0x1000000 ? 0 : 1;
4748 /* A subroutine of aarch64_add_offset. Set DEST to SRC + OFFSET for
4749 a non-polynomial OFFSET. MODE is the mode of the addition.
4750 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
4751 be set and CFA adjustments added to the generated instructions.
4753 TEMP1, if nonnull, is a register of mode MODE that can be used as a
4754 temporary if register allocation is already complete. This temporary
4755 register may overlap DEST but must not overlap SRC. If TEMP1 is known
4756 to hold abs (OFFSET), EMIT_MOVE_IMM can be set to false to avoid emitting
4757 the immediate again.
4759 Since this function may be used to adjust the stack pointer, we must
4760 ensure that it cannot cause transient stack deallocation (for example
4761 by first incrementing SP and then decrementing when adjusting by a
4762 large immediate). */
4764 static void
4765 aarch64_add_offset_1 (scalar_int_mode mode, rtx dest,
4766 rtx src, HOST_WIDE_INT offset, rtx temp1,
4767 bool frame_related_p, bool emit_move_imm)
4769 gcc_assert (emit_move_imm || temp1 != NULL_RTX);
4770 gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
4772 unsigned HOST_WIDE_INT moffset = absu_hwi (offset);
4773 rtx_insn *insn;
4775 if (!moffset)
4777 if (!rtx_equal_p (dest, src))
4779 insn = emit_insn (gen_rtx_SET (dest, src));
4780 RTX_FRAME_RELATED_P (insn) = frame_related_p;
4782 return;
4785 /* Single instruction adjustment. */
4786 if (aarch64_uimm12_shift (moffset))
4788 insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (offset)));
4789 RTX_FRAME_RELATED_P (insn) = frame_related_p;
4790 return;
4793 /* Emit 2 additions/subtractions if the adjustment is less than 24 bits
4794 and either:
4796 a) the offset cannot be loaded by a 16-bit move or
4797 b) there is no spare register into which we can move it. */
4798 if (moffset < 0x1000000
4799 && ((!temp1 && !can_create_pseudo_p ())
4800 || !aarch64_move_imm (moffset, mode)))
4802 HOST_WIDE_INT low_off = moffset & 0xfff;
4804 low_off = offset < 0 ? -low_off : low_off;
4805 insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (low_off)));
4806 RTX_FRAME_RELATED_P (insn) = frame_related_p;
4807 insn = emit_insn (gen_add2_insn (dest, GEN_INT (offset - low_off)));
4808 RTX_FRAME_RELATED_P (insn) = frame_related_p;
4809 return;
4812 /* Emit a move immediate if required and an addition/subtraction. */
4813 if (emit_move_imm)
4815 gcc_assert (temp1 != NULL_RTX || can_create_pseudo_p ());
4816 temp1 = aarch64_force_temporary (mode, temp1,
4817 gen_int_mode (moffset, mode));
4819 insn = emit_insn (offset < 0
4820 ? gen_sub3_insn (dest, src, temp1)
4821 : gen_add3_insn (dest, src, temp1));
4822 if (frame_related_p)
4824 RTX_FRAME_RELATED_P (insn) = frame_related_p;
4825 rtx adj = plus_constant (mode, src, offset);
4826 add_reg_note (insn, REG_CFA_ADJUST_CFA, gen_rtx_SET (dest, adj));
4830 /* Return the number of temporary registers that aarch64_add_offset
4831 would need to move OFFSET into a register or add OFFSET to a register;
4832 ADD_P is true if we want the latter rather than the former. */
4834 static unsigned int
4835 aarch64_offset_temporaries (bool add_p, poly_int64 offset)
4837 /* This follows the same structure as aarch64_add_offset. */
4838 if (add_p && aarch64_sve_addvl_addpl_immediate_p (offset))
4839 return 0;
4841 unsigned int count = 0;
4842 HOST_WIDE_INT factor = offset.coeffs[1];
4843 HOST_WIDE_INT constant = offset.coeffs[0] - factor;
4844 poly_int64 poly_offset (factor, factor);
4845 if (add_p && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
4846 /* Need one register for the ADDVL/ADDPL result. */
4847 count += 1;
4848 else if (factor != 0)
4850 factor /= (HOST_WIDE_INT) least_bit_hwi (factor);
4851 if (!IN_RANGE (factor, -32, 31))
4852 /* Need one register for the CNT or RDVL result and one for the
4853 multiplication factor. If necessary, the second temporary
4854 can be reused for the constant part of the offset. */
4855 return 2;
4856 /* Need one register for the CNT or RDVL result (which might then
4857 be shifted). */
4858 count += 1;
4860 return count + aarch64_add_offset_1_temporaries (constant);
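/* Illustrative counts: a constant offset such as 0x123456 needs no
temporaries and 0x1234567 needs one, regardless of ADD_P; with ADD_P set,
an offset of three whole SVE vectors (poly_int64 (48, 48)) needs none
either, since a single ADDVL can add it.  */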
4863 /* If X can be represented as a poly_int64, return the number
4864 of temporaries that are required to add it to a register.
4865 Return -1 otherwise. */
4868 aarch64_add_offset_temporaries (rtx x)
4870 poly_int64 offset;
4871 if (!poly_int_rtx_p (x, &offset))
4872 return -1;
4873 return aarch64_offset_temporaries (true, offset);
4876 /* Set DEST to SRC + OFFSET. MODE is the mode of the addition.
4877 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
4878 be set and CFA adjustments added to the generated instructions.
4880 TEMP1, if nonnull, is a register of mode MODE that can be used as a
4881 temporary if register allocation is already complete. This temporary
4882 register may overlap DEST if !FRAME_RELATED_P but must not overlap SRC.
4883 If TEMP1 is known to hold abs (OFFSET), EMIT_MOVE_IMM can be set to
4884 false to avoid emitting the immediate again.
4886 TEMP2, if nonnull, is a second temporary register that doesn't
4887 overlap either DEST or SRC.
4889 FORCE_ISA_MODE is AARCH64_ISA_MODE_SM_ON if any variable component of OFFSET
4890 is measured relative to the SME vector length instead of the current
4891 prevailing vector length. It is 0 otherwise.
4893 Since this function may be used to adjust the stack pointer, we must
4894 ensure that it cannot cause transient stack deallocation (for example
4895 by first incrementing SP and then decrementing when adjusting by a
4896 large immediate). */
4898 static void
4899 aarch64_add_offset (scalar_int_mode mode, rtx dest, rtx src,
4900 poly_int64 offset, rtx temp1, rtx temp2,
4901 aarch64_isa_mode force_isa_mode,
4902 bool frame_related_p, bool emit_move_imm = true)
4904 gcc_assert (emit_move_imm || temp1 != NULL_RTX);
4905 gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
4906 gcc_assert (temp1 == NULL_RTX
4907 || !frame_related_p
4908 || !reg_overlap_mentioned_p (temp1, dest));
4909 gcc_assert (temp2 == NULL_RTX || !reg_overlap_mentioned_p (dest, temp2));
4911 /* Try using ADDVL or ADDPL to add the whole value. */
4912 if (src != const0_rtx && aarch64_sve_addvl_addpl_immediate_p (offset))
4914 gcc_assert (offset.coeffs[0] == offset.coeffs[1]);
4915 rtx offset_rtx;
4916 if (force_isa_mode == 0)
4917 offset_rtx = gen_int_mode (offset, mode);
4918 else
4919 offset_rtx = aarch64_sme_vq_immediate (mode, offset.coeffs[0], 0);
4920 rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
4921 RTX_FRAME_RELATED_P (insn) = frame_related_p;
4922 if (frame_related_p && (force_isa_mode & AARCH64_ISA_MODE_SM_ON))
4923 add_reg_note (insn, REG_CFA_ADJUST_CFA,
4924 gen_rtx_SET (dest, plus_constant (Pmode, src,
4925 offset)));
4926 return;
4929 /* Coefficient 1 is multiplied by the number of 128-bit blocks in an
4930 SVE vector register, over and above the minimum size of 128 bits.
4931 This is equivalent to half the value returned by CNTD with a
4932 vector shape of ALL. */
4933 HOST_WIDE_INT factor = offset.coeffs[1];
4934 HOST_WIDE_INT constant = offset.coeffs[0] - factor;
4936 /* Try using ADDVL or ADDPL to add the VG-based part. */
4937 poly_int64 poly_offset (factor, factor);
4938 if (src != const0_rtx
4939 && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
4941 rtx offset_rtx;
4942 if (force_isa_mode == 0)
4943 offset_rtx = gen_int_mode (poly_offset, mode);
4944 else
4945 offset_rtx = aarch64_sme_vq_immediate (mode, factor, 0);
4946 if (frame_related_p)
4948 rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
4949 RTX_FRAME_RELATED_P (insn) = true;
4950 if (force_isa_mode & AARCH64_ISA_MODE_SM_ON)
4951 add_reg_note (insn, REG_CFA_ADJUST_CFA,
4952 gen_rtx_SET (dest, plus_constant (Pmode, src,
4953 poly_offset)));
4954 src = dest;
4956 else
4958 rtx addr = gen_rtx_PLUS (mode, src, offset_rtx);
4959 src = aarch64_force_temporary (mode, temp1, addr);
4960 temp1 = temp2;
4961 temp2 = NULL_RTX;
4964 /* Otherwise use a CNT-based sequence. */
4965 else if (factor != 0)
4967 /* Calculate CNTB * FACTOR / 16 as CNTB * REL_FACTOR * 2**SHIFT,
4968 with negative shifts indicating a shift right. */
4969 HOST_WIDE_INT low_bit = least_bit_hwi (factor);
4970 HOST_WIDE_INT rel_factor = factor / low_bit;
4971 int shift = exact_log2 (low_bit) - 4;
4972 gcc_assert (shift >= -4 && (rel_factor & 1) != 0);
4974 /* Set CODE, VAL and SHIFT so that [+-] VAL * 2**SHIFT is
4975 equal to CNTB * FACTOR / 16, with CODE being the [+-].
4977 We can avoid a multiplication if REL_FACTOR is in the range
4978 of RDVL, although there are then various optimizations that
4979 we can try on top. */
4980 rtx_code code = PLUS;
4981 rtx val;
4982 if (IN_RANGE (rel_factor, -32, 31))
4984 if (force_isa_mode & AARCH64_ISA_MODE_SM_ON)
4986 /* Try to use an unshifted RDSVL, otherwise fall back on
4987 a shifted RDSVL #1. */
4988 if (aarch64_sve_rdvl_addvl_factor_p (factor))
4989 shift = 0;
4990 else
4991 factor = rel_factor * 16;
4992 val = aarch64_sme_vq_immediate (mode, factor, 0);
4994 /* Try to use an unshifted CNT[BHWD] or RDVL. */
4995 else if (aarch64_sve_cnt_factor_p (factor)
4996 || aarch64_sve_rdvl_addvl_factor_p (factor))
4998 val = gen_int_mode (poly_int64 (factor, factor), mode);
4999 shift = 0;
5001 /* Try to subtract an unshifted CNT[BHWD]. */
5002 else if (aarch64_sve_cnt_factor_p (-factor))
5004 code = MINUS;
5005 val = gen_int_mode (poly_int64 (-factor, -factor), mode);
5006 shift = 0;
5008 /* If subtraction is free, prefer to load a positive constant.
5009 In the best case this will fit a shifted CNTB. */
5010 else if (src != const0_rtx && rel_factor < 0)
5012 code = MINUS;
5013 val = gen_int_mode (-rel_factor * BYTES_PER_SVE_VECTOR, mode);
5015 /* Otherwise use a shifted RDVL or CNT[BHWD]. */
5016 else
5017 val = gen_int_mode (rel_factor * BYTES_PER_SVE_VECTOR, mode);
5019 else
5021 /* If we can calculate CNTB << SHIFT directly, prefer to do that,
5022 since it should increase the chances of being able to use
5023 a shift and add sequence for the multiplication.
5024 If CNTB << SHIFT is out of range, stick with the current
5025 shift factor. */
5026 if (force_isa_mode == 0
5027 && IN_RANGE (low_bit, 2, 16 * 16))
5029 val = gen_int_mode (poly_int64 (low_bit, low_bit), mode);
5030 shift = 0;
5032 else if ((force_isa_mode & AARCH64_ISA_MODE_SM_ON)
5033 && aarch64_sve_rdvl_addvl_factor_p (low_bit))
5035 val = aarch64_sme_vq_immediate (mode, low_bit, 0);
5036 shift = 0;
5038 else
5039 val = gen_int_mode (BYTES_PER_SVE_VECTOR, mode);
5041 val = aarch64_force_temporary (mode, temp1, val);
5043 /* Prefer to multiply by a positive factor and subtract rather
5044 than multiply by a negative factor and add, since positive
5045 values are usually easier to move. */
5046 if (rel_factor < 0 && src != const0_rtx)
5048 rel_factor = -rel_factor;
5049 code = MINUS;
5052 if (can_create_pseudo_p ())
5054 rtx coeff1 = gen_int_mode (rel_factor, mode);
5055 val = expand_mult (mode, val, coeff1, NULL_RTX, true, true);
5057 else
5059 rtx coeff1 = gen_int_mode (rel_factor, mode);
5060 coeff1 = aarch64_force_temporary (mode, temp2, coeff1);
5061 val = gen_rtx_MULT (mode, val, coeff1);
5065 /* Multiply by 2 ** SHIFT. */
5066 if (shift > 0)
5068 val = aarch64_force_temporary (mode, temp1, val);
5069 val = gen_rtx_ASHIFT (mode, val, GEN_INT (shift));
5071 else if (shift < 0)
5073 val = aarch64_force_temporary (mode, temp1, val);
5074 val = gen_rtx_ASHIFTRT (mode, val, GEN_INT (-shift));
5077 /* Add the result to SRC or subtract the result from SRC. */
5078 if (src != const0_rtx)
5080 val = aarch64_force_temporary (mode, temp1, val);
5081 val = gen_rtx_fmt_ee (code, mode, src, val);
5083 else if (code == MINUS)
5085 val = aarch64_force_temporary (mode, temp1, val);
5086 val = gen_rtx_NEG (mode, val);
5089 if (constant == 0 || frame_related_p)
5091 rtx_insn *insn = emit_insn (gen_rtx_SET (dest, val));
5092 if (frame_related_p)
5094 RTX_FRAME_RELATED_P (insn) = true;
5095 add_reg_note (insn, REG_CFA_ADJUST_CFA,
5096 gen_rtx_SET (dest, plus_constant (Pmode, src,
5097 poly_offset)));
5099 src = dest;
5100 if (constant == 0)
5101 return;
5103 else
5105 src = aarch64_force_temporary (mode, temp1, val);
5106 temp1 = temp2;
5107 temp2 = NULL_RTX;
5110 emit_move_imm = true;
5113 aarch64_add_offset_1 (mode, dest, src, constant, temp1,
5114 frame_related_p, emit_move_imm);
5117 /* Like aarch64_add_offset, but the offset is given as an rtx rather
5118 than a poly_int64. */
5120 void
5121 aarch64_split_add_offset (scalar_int_mode mode, rtx dest, rtx src,
5122 rtx offset_rtx, rtx temp1, rtx temp2)
5124 aarch64_add_offset (mode, dest, src, rtx_to_poly_int64 (offset_rtx),
5125 temp1, temp2, 0, false);
5128 /* Add DELTA to the stack pointer, marking the instructions frame-related.
5129 TEMP1 is available as a temporary if nonnull. FORCE_ISA_MODE is as
5130 for aarch64_add_offset. EMIT_MOVE_IMM is false if TEMP1 already
5131 contains abs (DELTA). */
5133 static inline void
5134 aarch64_add_sp (rtx temp1, rtx temp2, poly_int64 delta,
5135 aarch64_isa_mode force_isa_mode, bool emit_move_imm)
5137 aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, delta,
5138 temp1, temp2, force_isa_mode, true, emit_move_imm);
5141 /* Subtract DELTA from the stack pointer, marking the instructions
5142 frame-related if FRAME_RELATED_P. FORCE_ISA_MODE is as for
5143 aarch64_add_offset. TEMP1 is available as a temporary if nonnull. */
5145 static inline void
5146 aarch64_sub_sp (rtx temp1, rtx temp2, poly_int64 delta,
5147 aarch64_isa_mode force_isa_mode,
5148 bool frame_related_p, bool emit_move_imm = true)
5150 aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, -delta,
5151 temp1, temp2, force_isa_mode, frame_related_p,
5152 emit_move_imm);
5155 /* A streaming-compatible function needs to switch temporarily to the known
5156 PSTATE.SM mode described by LOCAL_MODE. The low bit of OLD_SVCR contains
5157 the runtime state of PSTATE.SM in the streaming-compatible code, before
5158 the start of the switch to LOCAL_MODE.
5160 Emit instructions to branch around the mode switch if PSTATE.SM already
5161 matches LOCAL_MODE. Return the label that the branch jumps to. */
5163 static rtx_insn *
5164 aarch64_guard_switch_pstate_sm (rtx old_svcr, aarch64_isa_mode local_mode)
5166 local_mode &= AARCH64_ISA_MODE_SM_STATE;
5167 gcc_assert (local_mode != 0);
5168 auto already_ok_cond = (local_mode & AARCH64_ISA_MODE_SM_ON ? NE : EQ);
5169 auto *label = gen_label_rtx ();
5170 auto branch = aarch64_gen_test_and_branch (already_ok_cond, old_svcr, 0,
5171 label);
5172 auto *jump = emit_jump_insn (branch);
5173 JUMP_LABEL (jump) = label;
5174 return label;
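/* For example, when LOCAL_MODE requires PSTATE.SM to be on, ALREADY_OK_COND
is NE, so the generated branch tests the low bit of OLD_SVCR and skips the
upcoming SMSTART when that bit is already nonzero.  */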
5177 /* Emit code to switch from the PSTATE.SM state in OLD_MODE to the PSTATE.SM
5178 state in NEW_MODE. This is known to involve either an SMSTART SM or
5179 an SMSTOP SM. */
5181 static void
5182 aarch64_switch_pstate_sm (aarch64_isa_mode old_mode, aarch64_isa_mode new_mode)
5184 old_mode &= AARCH64_ISA_MODE_SM_STATE;
5185 new_mode &= AARCH64_ISA_MODE_SM_STATE;
5186 gcc_assert (old_mode != new_mode);
5188 if ((new_mode & AARCH64_ISA_MODE_SM_ON)
5189 || (!new_mode && (old_mode & AARCH64_ISA_MODE_SM_OFF)))
5190 emit_insn (gen_aarch64_smstart_sm ());
5191 else
5192 emit_insn (gen_aarch64_smstop_sm ());
5195 /* As a side-effect, SMSTART SM and SMSTOP SM clobber the contents of all
5196 FP and predicate registers. This class emits code to preserve any
5197 necessary registers around the mode switch.
5199 The class uses four approaches to saving and restoring contents, enumerated
5200 by group_type:
5202 - GPR: save and restore the contents of FP registers using GPRs.
5203 This is used if the FP register contains no more than 64 significant
5204 bits. The registers used are FIRST_GPR onwards.
5206 - MEM_128: save and restore 128-bit SIMD registers using memory.
5208 - MEM_SVE_PRED: save and restore full SVE predicate registers using memory.
5210 - MEM_SVE_DATA: save and restore full SVE vector registers using memory.
5212 The save slots within each memory group are consecutive, with the
5213 MEM_SVE_PRED slots occupying a region below the MEM_SVE_DATA slots.
5215 There will only be two mode switches for each use of SME, so they should
5216 not be particularly performance-sensitive. It's also rare for SIMD, SVE
5217 or predicate registers to be live across mode switches. We therefore
5218 don't preallocate the save slots but instead allocate them locally on
5219 demand. This makes the code emitted by the class self-contained. */
5221 class aarch64_sme_mode_switch_regs
5223 public:
5224 static const unsigned int FIRST_GPR = R10_REGNUM;
5226 void add_reg (machine_mode, unsigned int);
5227 void add_call_args (rtx_call_insn *);
5228 void add_call_result (rtx_call_insn *);
5229 void add_call_preserved_reg (unsigned int);
5230 void add_call_preserved_regs (bitmap);
5232 void emit_prologue ();
5233 void emit_epilogue ();
5235 /* The number of GPRs needed to save FP registers, starting from
5236 FIRST_GPR. */
5237 unsigned int num_gprs () { return m_group_count[GPR]; }
5239 private:
5240 enum sequence { PROLOGUE, EPILOGUE };
5241 enum group_type { GPR, MEM_128, MEM_SVE_PRED, MEM_SVE_DATA, NUM_GROUPS };
5243 /* Information about the save location for one FP, SIMD, SVE data, or
5244 SVE predicate register. */
5245 struct save_location {
5246 /* The register to be saved. */
5247 rtx reg;
5249 /* Which group the save location belongs to. */
5250 group_type group;
5252 /* A zero-based index of the register within the group. */
5253 unsigned int index;
5256 unsigned int sve_data_headroom ();
5257 rtx get_slot_mem (machine_mode, poly_int64);
5258 void emit_stack_adjust (sequence, poly_int64);
5259 void emit_mem_move (sequence, const save_location &, poly_int64);
5261 void emit_gpr_moves (sequence);
5262 void emit_mem_128_moves (sequence);
5263 void emit_sve_sp_adjust (sequence);
5264 void emit_sve_pred_moves (sequence);
5265 void emit_sve_data_moves (sequence);
5267 /* All save locations, in no particular order. */
5268 auto_vec<save_location, 12> m_save_locations;
5270 /* The number of registers in each group. */
5271 unsigned int m_group_count[NUM_GROUPS] = {};
5274 /* Record that (reg:MODE REGNO) needs to be preserved around the mode
5275 switch. */
5277 void
5278 aarch64_sme_mode_switch_regs::add_reg (machine_mode mode, unsigned int regno)
5280 if (!FP_REGNUM_P (regno) && !PR_REGNUM_P (regno))
5281 return;
5283 unsigned int end_regno = end_hard_regno (mode, regno);
5284 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
5285 gcc_assert ((vec_flags & VEC_STRUCT) || end_regno == regno + 1);
5286 for (; regno < end_regno; regno++)
5288 /* Force the mode of SVE saves and restores even for single registers.
5289 This is necessary because big-endian targets only allow LDR Z and
5290 STR Z to be used with byte modes. */
5291 machine_mode submode = mode;
5292 if (vec_flags & VEC_SVE_PRED)
5293 submode = VNx16BImode;
5294 else if (vec_flags & VEC_SVE_DATA)
5295 submode = SVE_BYTE_MODE;
5296 else if (vec_flags & VEC_STRUCT)
5298 if (vec_flags & VEC_PARTIAL)
5299 submode = V8QImode;
5300 else
5301 submode = V16QImode;
5303 save_location loc;
5304 loc.reg = gen_rtx_REG (submode, regno);
5305 if (vec_flags & VEC_SVE_PRED)
5307 gcc_assert (PR_REGNUM_P (regno));
5308 loc.group = MEM_SVE_PRED;
5310 else
5312 gcc_assert (FP_REGNUM_P (regno));
5313 if (known_le (GET_MODE_SIZE (submode), 8))
5314 loc.group = GPR;
5315 else if (known_eq (GET_MODE_SIZE (submode), 16))
5316 loc.group = MEM_128;
5317 else
5318 loc.group = MEM_SVE_DATA;
5320 loc.index = m_group_count[loc.group]++;
5321 m_save_locations.quick_push (loc);
5325 /* Record that the arguments to CALL_INSN need to be preserved around
5326 the mode switch. */
5328 void
5329 aarch64_sme_mode_switch_regs::add_call_args (rtx_call_insn *call_insn)
5331 for (rtx node = CALL_INSN_FUNCTION_USAGE (call_insn);
5332 node; node = XEXP (node, 1))
5334 rtx item = XEXP (node, 0);
5335 if (GET_CODE (item) != USE)
5336 continue;
5337 item = XEXP (item, 0);
5338 if (!REG_P (item))
5339 continue;
5340 add_reg (GET_MODE (item), REGNO (item));
5344 /* Record that the return value from CALL_INSN (if any) needs to be
5345 preserved around the mode switch. */
5347 void
5348 aarch64_sme_mode_switch_regs::add_call_result (rtx_call_insn *call_insn)
5350 rtx pat = PATTERN (call_insn);
5351 gcc_assert (GET_CODE (pat) == PARALLEL);
5352 pat = XVECEXP (pat, 0, 0);
5353 if (GET_CODE (pat) == CALL)
5354 return;
5355 rtx dest = SET_DEST (pat);
5356 if (GET_CODE (dest) == PARALLEL)
5357 for (int i = 0; i < XVECLEN (dest, 0); ++i)
5359 rtx x = XVECEXP (dest, 0, i);
5360 gcc_assert (GET_CODE (x) == EXPR_LIST);
5361 rtx reg = XEXP (x, 0);
5362 add_reg (GET_MODE (reg), REGNO (reg));
5364 else
5365 add_reg (GET_MODE (dest), REGNO (dest));
5368 /* REGNO is a register that is call-preserved under the current function's ABI.
5369 Record that it must be preserved around the mode switch. */
5371 void
5372 aarch64_sme_mode_switch_regs::add_call_preserved_reg (unsigned int regno)
5374 if (FP_REGNUM_P (regno))
5375 switch (crtl->abi->id ())
5377 case ARM_PCS_SVE:
5378 add_reg (VNx16QImode, regno);
5379 break;
5380 case ARM_PCS_SIMD:
5381 add_reg (V16QImode, regno);
5382 break;
5383 case ARM_PCS_AAPCS64:
5384 add_reg (DImode, regno);
5385 break;
5386 default:
5387 gcc_unreachable ();
5389 else if (PR_REGNUM_P (regno))
5390 add_reg (VNx16BImode, regno);
5393 /* The hard registers in REGS are call-preserved under the current function's
5394 ABI. Record that they must be preserved around the mode switch. */
5396 void
5397 aarch64_sme_mode_switch_regs::add_call_preserved_regs (bitmap regs)
5399 bitmap_iterator bi;
5400 unsigned int regno;
5401 EXECUTE_IF_SET_IN_BITMAP (regs, 0, regno, bi)
5402 if (HARD_REGISTER_NUM_P (regno))
5403 add_call_preserved_reg (regno);
5404 else
5405 break;
5408 /* Emit code to save registers before the mode switch. */
5410 void
5411 aarch64_sme_mode_switch_regs::emit_prologue ()
5413 emit_sve_sp_adjust (PROLOGUE);
5414 emit_sve_pred_moves (PROLOGUE);
5415 emit_sve_data_moves (PROLOGUE);
5416 emit_mem_128_moves (PROLOGUE);
5417 emit_gpr_moves (PROLOGUE);
5420 /* Emit code to restore registers after the mode switch. */
5422 void
5423 aarch64_sme_mode_switch_regs::emit_epilogue ()
5425 emit_gpr_moves (EPILOGUE);
5426 emit_mem_128_moves (EPILOGUE);
5427 emit_sve_pred_moves (EPILOGUE);
5428 emit_sve_data_moves (EPILOGUE);
5429 emit_sve_sp_adjust (EPILOGUE);
5432 /* The SVE predicate registers are stored below the SVE data registers,
5433 with the predicate save area being padded to a data-register-sized
5434 boundary. Return the size of this padded area as a whole number
5435 of data register slots. */
5437 unsigned int
5438 aarch64_sme_mode_switch_regs::sve_data_headroom ()
5440 return CEIL (m_group_count[MEM_SVE_PRED], 8);
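/* For example, if three predicate registers need to be saved, the predicate
area is padded to one full data-register-sized slot (CEIL (3, 8) == 1).  */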
5443 /* Return a memory reference of mode MODE to OFFSET bytes from the
5444 stack pointer. */
5447 aarch64_sme_mode_switch_regs::get_slot_mem (machine_mode mode,
5448 poly_int64 offset)
5450 rtx addr = plus_constant (Pmode, stack_pointer_rtx, offset);
5451 return gen_rtx_MEM (mode, addr);
5454 /* Allocate or deallocate SIZE bytes of stack space: SEQ decides which. */
5456 void
5457 aarch64_sme_mode_switch_regs::emit_stack_adjust (sequence seq,
5458 poly_int64 size)
5460 if (seq == PROLOGUE)
5461 size = -size;
5462 emit_insn (gen_rtx_SET (stack_pointer_rtx,
5463 plus_constant (Pmode, stack_pointer_rtx, size)));
5466 /* Save or restore the register in LOC, whose slot is OFFSET bytes from
5467 the stack pointer. SEQ chooses between saving and restoring. */
5469 void
5470 aarch64_sme_mode_switch_regs::emit_mem_move (sequence seq,
5471 const save_location &loc,
5472 poly_int64 offset)
5474 rtx mem = get_slot_mem (GET_MODE (loc.reg), offset);
5475 if (seq == PROLOGUE)
5476 emit_move_insn (mem, loc.reg);
5477 else
5478 emit_move_insn (loc.reg, mem);
5481 /* Emit instructions to save or restore the GPR group. SEQ chooses between
5482 saving and restoring. */
5484 void
5485 aarch64_sme_mode_switch_regs::emit_gpr_moves (sequence seq)
5487 for (auto &loc : m_save_locations)
5488 if (loc.group == GPR)
5490 gcc_assert (loc.index < 8);
5491 rtx gpr = gen_rtx_REG (GET_MODE (loc.reg), FIRST_GPR + loc.index);
5492 if (seq == PROLOGUE)
5493 emit_move_insn (gpr, loc.reg);
5494 else
5495 emit_move_insn (loc.reg, gpr);
5499 /* Emit instructions to save or restore the MEM_128 group. SEQ chooses
5500 between saving and restoring. */
5502 void
5503 aarch64_sme_mode_switch_regs::emit_mem_128_moves (sequence seq)
5505 HOST_WIDE_INT count = m_group_count[MEM_128];
5506 if (count == 0)
5507 return;
5509 auto sp = stack_pointer_rtx;
5510 auto sp_adjust = (seq == PROLOGUE ? -count : count) * 16;
5512 /* Pick a common mode that supports LDR & STR with pre/post-modification
5513 and LDP & STP with pre/post-modification. */
5514 auto mode = TFmode;
5516 /* An instruction pattern that should be emitted at the end. */
5517 rtx last_pat = NULL_RTX;
5519 /* A previous MEM_128 location that hasn't been handled yet. */
5520 save_location *prev_loc = nullptr;
5522 /* Look for LDP/STPs and record any leftover LDR/STR in PREV_LOC. */
5523 for (auto &loc : m_save_locations)
5524 if (loc.group == MEM_128)
5526 if (!prev_loc)
5528 prev_loc = &loc;
5529 continue;
5531 gcc_assert (loc.index == prev_loc->index + 1);
5533 /* The offset of the base of the save area from the current
5534 stack pointer. */
5535 HOST_WIDE_INT bias = 0;
5536 if (prev_loc->index == 0 && seq == PROLOGUE)
5537 bias = sp_adjust;
5539 /* Get the two sets in the LDP/STP. */
5540 rtx ops[] = {
5541 gen_rtx_REG (mode, REGNO (prev_loc->reg)),
5542 get_slot_mem (mode, prev_loc->index * 16 + bias),
5543 gen_rtx_REG (mode, REGNO (loc.reg)),
5544 get_slot_mem (mode, loc.index * 16 + bias)
5546 unsigned int lhs = (seq == PROLOGUE);
5547 rtx set1 = gen_rtx_SET (ops[lhs], ops[1 - lhs]);
5548 rtx set2 = gen_rtx_SET (ops[lhs + 2], ops[3 - lhs]);
5550 /* Combine the sets with any stack allocation/deallocation. */
5551 rtx pat;
5552 if (prev_loc->index == 0)
5554 rtx plus_sp = plus_constant (Pmode, sp, sp_adjust);
5555 rtvec vec = gen_rtvec (3, gen_rtx_SET (sp, plus_sp), set1, set2);
5556 pat = gen_rtx_PARALLEL (VOIDmode, vec);
5558 else if (seq == PROLOGUE)
5559 pat = aarch64_gen_store_pair (ops[1], ops[0], ops[2]);
5560 else
5561 pat = aarch64_gen_load_pair (ops[0], ops[2], ops[1]);
5563 /* Queue a deallocation to the end, otherwise emit the
5564 instruction now. */
5565 if (seq == EPILOGUE && prev_loc->index == 0)
5566 last_pat = pat;
5567 else
5568 emit_insn (pat);
5569 prev_loc = nullptr;
5572 /* Handle any leftover LDR/STR. */
5573 if (prev_loc)
5575 rtx reg = gen_rtx_REG (mode, REGNO (prev_loc->reg));
5576 rtx addr;
5577 if (prev_loc->index != 0)
5578 addr = plus_constant (Pmode, sp, prev_loc->index * 16);
5579 else if (seq == PROLOGUE)
5581 rtx allocate = plus_constant (Pmode, sp, -count * 16);
5582 addr = gen_rtx_PRE_MODIFY (Pmode, sp, allocate);
5584 else
5586 rtx deallocate = plus_constant (Pmode, sp, count * 16);
5587 addr = gen_rtx_POST_MODIFY (Pmode, sp, deallocate);
5589 rtx mem = gen_rtx_MEM (mode, addr);
5590 if (seq == PROLOGUE)
5591 emit_move_insn (mem, reg);
5592 else
5593 emit_move_insn (reg, mem);
5596 if (last_pat)
5597 emit_insn (last_pat);
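/* For example (illustrative; the exact registers depend on what needs saving): with two 128-bit saves the prologue is typically a single pre-index store pair such as "stp q22, q23, [sp, #-32]!" and the epilogue the matching post-index load pair, while a single save uses a lone str/ldr with writeback instead. */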
5600 /* Allocate or deallocate the stack space needed by the SVE groups.
5601 SEQ chooses between allocating and deallocating. */
5603 void
5604 aarch64_sme_mode_switch_regs::emit_sve_sp_adjust (sequence seq)
5606 if (unsigned int count = m_group_count[MEM_SVE_DATA] + sve_data_headroom ())
5607 emit_stack_adjust (seq, count * BYTES_PER_SVE_VECTOR);
5610 /* Save or restore the MEM_SVE_DATA group. SEQ chooses between saving
5611 and restoring. */
5613 void
5614 aarch64_sme_mode_switch_regs::emit_sve_data_moves (sequence seq)
5616 for (auto &loc : m_save_locations)
5617 if (loc.group == MEM_SVE_DATA)
5619 auto index = loc.index + sve_data_headroom ();
5620 emit_mem_move (seq, loc, index * BYTES_PER_SVE_VECTOR);
5624 /* Save or restore the MEM_SVE_PRED group. SEQ chooses between saving
5625 and restoring. */
5627 void
5628 aarch64_sme_mode_switch_regs::emit_sve_pred_moves (sequence seq)
5630 for (auto &loc : m_save_locations)
5631 if (loc.group == MEM_SVE_PRED)
5632 emit_mem_move (seq, loc, loc.index * BYTES_PER_SVE_PRED);
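/* The predicate slots start at the stack pointer itself, while the data slots start sve_data_headroom () full vectors higher up, so the two groups never overlap. */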
5635 /* Set DEST to (vec_series BASE STEP). */
5637 static void
5638 aarch64_expand_vec_series (rtx dest, rtx base, rtx step)
5640 machine_mode mode = GET_MODE (dest);
5641 scalar_mode inner = GET_MODE_INNER (mode);
5643 /* Each operand can be a register or an immediate in the range [-16, 15]. */
5644 if (!aarch64_sve_index_immediate_p (base))
5645 base = force_reg (inner, base);
5646 if (!aarch64_sve_index_immediate_p (step))
5647 step = force_reg (inner, step);
5649 emit_set_insn (dest, gen_rtx_VEC_SERIES (mode, base, step));
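/* The VEC_SERIES rtx maps to the SVE INDEX instruction, e.g. (illustrative) "index z0.s, #0, #1" for the series { 0, 1, 2, ... }. */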
5652 /* Duplicate 128-bit Advanced SIMD vector SRC so that it fills an SVE
5653 register of mode MODE. Use TARGET for the result if it's nonnull
5654 and convenient.
5656 The two vector modes must have the same element mode. The behavior
5657 is to duplicate architectural lane N of SRC into architectural lanes
5658 N + I * STEP of the result. On big-endian targets, architectural
5659 lane 0 of an Advanced SIMD vector is the last element of the vector
5660 in memory layout, so for big-endian targets this operation has the
5661 effect of reversing SRC before duplicating it. Callers need to
5662 account for this. */
5664 rtx
5665 aarch64_expand_sve_dupq (rtx target, machine_mode mode, rtx src)
5667 machine_mode src_mode = GET_MODE (src);
5668 gcc_assert (GET_MODE_INNER (mode) == GET_MODE_INNER (src_mode));
5669 insn_code icode = (BYTES_BIG_ENDIAN
5670 ? code_for_aarch64_vec_duplicate_vq_be (mode)
5671 : code_for_aarch64_vec_duplicate_vq_le (mode));
5673 unsigned int i = 0;
5674 expand_operand ops[3];
5675 create_output_operand (&ops[i++], target, mode);
5676 create_input_operand (&ops[i++], src, src_mode);
5677 if (BYTES_BIG_ENDIAN)
5679 /* Create a PARALLEL describing the reversal of SRC. */
5680 unsigned int nelts_per_vq = 128 / GET_MODE_UNIT_BITSIZE (mode);
5681 rtx sel = aarch64_gen_stepped_int_parallel (nelts_per_vq,
5682 nelts_per_vq - 1, -1);
5683 create_fixed_operand (&ops[i++], sel);
5685 expand_insn (icode, i, ops);
5686 return ops[0].value;
5689 /* Try to force 128-bit vector value SRC into memory and use LD1RQ to fetch
5690 the memory image into DEST. Return true on success. */
5692 static bool
5693 aarch64_expand_sve_ld1rq (rtx dest, rtx src)
5695 src = force_const_mem (GET_MODE (src), src);
5696 if (!src)
5697 return false;
5699 /* Make sure that the address is legitimate. */
5700 if (!aarch64_sve_ld1rq_operand_p (src))
5702 rtx addr = force_reg (Pmode, XEXP (src, 0));
5703 src = replace_equiv_address (src, addr);
5706 machine_mode mode = GET_MODE (dest);
5707 machine_mode pred_mode = aarch64_sve_pred_mode (mode);
5708 rtx ptrue = aarch64_ptrue_reg (pred_mode);
5709 emit_insn (gen_aarch64_sve_ld1rq (mode, dest, src, ptrue));
5710 return true;
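/* LD1RQ loads a single 128-bit block and replicates it into every quadword of the destination, so the same sequence is correct for any SVE vector length. */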
5713 /* SRC is an SVE CONST_VECTOR that contains N "foreground" values followed
5714 by N "background" values. Try to move it into TARGET using:
5716 PTRUE PRED.<T>, VL<N>
5717 MOV TRUE.<T>, #<foreground>
5718 MOV FALSE.<T>, #<background>
5719 SEL TARGET.<T>, PRED.<T>, TRUE.<T>, FALSE.<T>
5721 The PTRUE is always a single instruction but the MOVs might need a
5722 longer sequence. If the background value is zero (as it often is),
5723 the sequence can sometimes collapse to a PTRUE followed by a
5724 zero-predicated move.
5726 Return the target on success, otherwise return null. */
5728 static rtx
5729 aarch64_expand_sve_const_vector_sel (rtx target, rtx src)
5731 gcc_assert (CONST_VECTOR_NELTS_PER_PATTERN (src) == 2);
5733 /* Make sure that the PTRUE is valid. */
5734 machine_mode mode = GET_MODE (src);
5735 machine_mode pred_mode = aarch64_sve_pred_mode (mode);
5736 unsigned int npatterns = CONST_VECTOR_NPATTERNS (src);
5737 if (aarch64_svpattern_for_vl (pred_mode, npatterns)
5738 == AARCH64_NUM_SVPATTERNS)
5739 return NULL_RTX;
5741 rtx_vector_builder pred_builder (pred_mode, npatterns, 2);
5742 rtx_vector_builder true_builder (mode, npatterns, 1);
5743 rtx_vector_builder false_builder (mode, npatterns, 1);
5744 for (unsigned int i = 0; i < npatterns; ++i)
5746 true_builder.quick_push (CONST_VECTOR_ENCODED_ELT (src, i));
5747 pred_builder.quick_push (CONST1_RTX (BImode));
5749 for (unsigned int i = 0; i < npatterns; ++i)
5751 false_builder.quick_push (CONST_VECTOR_ENCODED_ELT (src, i + npatterns));
5752 pred_builder.quick_push (CONST0_RTX (BImode));
5754 expand_operand ops[4];
5755 create_output_operand (&ops[0], target, mode);
5756 create_input_operand (&ops[1], true_builder.build (), mode);
5757 create_input_operand (&ops[2], false_builder.build (), mode);
5758 create_input_operand (&ops[3], pred_builder.build (), pred_mode);
5759 expand_insn (code_for_vcond_mask (mode, mode), 4, ops);
5760 return target;
5763 /* Return a register containing CONST_VECTOR SRC, given that SRC has an
5764 SVE data mode and isn't a legitimate constant. Use TARGET for the
5765 result if convenient.
5767 The returned register can have whatever mode seems most natural
5768 given the contents of SRC. */
5770 static rtx
5771 aarch64_expand_sve_const_vector (rtx target, rtx src)
5773 machine_mode mode = GET_MODE (src);
5774 unsigned int npatterns = CONST_VECTOR_NPATTERNS (src);
5775 unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (src);
5776 scalar_mode elt_mode = GET_MODE_INNER (mode);
5777 unsigned int elt_bits = GET_MODE_BITSIZE (elt_mode);
5778 unsigned int container_bits = aarch64_sve_container_bits (mode);
5779 unsigned int encoded_bits = npatterns * nelts_per_pattern * container_bits;
5781 if (nelts_per_pattern == 1
5782 && encoded_bits <= 128
5783 && container_bits != elt_bits)
5785 /* We have a partial vector mode and a constant whose full-vector
5786 equivalent would occupy a repeating 128-bit sequence. Build that
5787 full-vector equivalent instead, so that we have the option of
5788 using LD1RQ and Advanced SIMD operations. */
5789 unsigned int repeat = container_bits / elt_bits;
5790 machine_mode full_mode = aarch64_full_sve_mode (elt_mode).require ();
5791 rtx_vector_builder builder (full_mode, npatterns * repeat, 1);
5792 for (unsigned int i = 0; i < npatterns; ++i)
5793 for (unsigned int j = 0; j < repeat; ++j)
5794 builder.quick_push (CONST_VECTOR_ENCODED_ELT (src, i));
5795 target = aarch64_target_reg (target, full_mode);
5796 return aarch64_expand_sve_const_vector (target, builder.build ());
5799 if (nelts_per_pattern == 1 && encoded_bits == 128)
5801 /* The constant is a duplicated quadword but can't be narrowed
5802 beyond a quadword. Get the memory image of the first quadword
5803 as a 128-bit vector and try using LD1RQ to load it from memory.
5805 The effect for both endiannesses is to load memory lane N into
5806 architectural lanes N + I * STEP of the result. On big-endian
5807 targets, the layout of the 128-bit vector in an Advanced SIMD
5808 register would be different from its layout in an SVE register,
5809 but this 128-bit vector is a memory value only. */
5810 machine_mode vq_mode = aarch64_v128_mode (elt_mode).require ();
5811 rtx vq_value = simplify_gen_subreg (vq_mode, src, mode, 0);
5812 if (vq_value && aarch64_expand_sve_ld1rq (target, vq_value))
5813 return target;
5816 if (nelts_per_pattern == 1 && encoded_bits < 128)
5818 /* The vector is a repeating sequence of 64 bits or fewer.
5819 See if we can load them using an Advanced SIMD move and then
5820 duplicate it to fill a vector. This is better than using a GPR
5821 move because it keeps everything in the same register file. */
5822 machine_mode vq_mode = aarch64_v128_mode (elt_mode).require ();
5823 rtx_vector_builder builder (vq_mode, npatterns, 1);
5824 for (unsigned int i = 0; i < npatterns; ++i)
5826 /* We want memory lane N to go into architectural lane N,
5827 so reverse for big-endian targets. The DUP .Q pattern
5828 has a compensating reverse built-in. */
5829 unsigned int srci = BYTES_BIG_ENDIAN ? npatterns - i - 1 : i;
5830 builder.quick_push (CONST_VECTOR_ENCODED_ELT (src, srci));
5832 rtx vq_src = builder.build ();
5833 if (aarch64_simd_valid_mov_imm (vq_src))
5835 vq_src = force_reg (vq_mode, vq_src);
5836 return aarch64_expand_sve_dupq (target, mode, vq_src);
5839 /* Get an integer representation of the repeating part of Advanced
5840 SIMD vector VQ_SRC. This preserves the endianness of VQ_SRC,
5841 which for big-endian targets is lane-swapped wrt a normal
5842 Advanced SIMD vector. This means that for both endiannesses,
5843 memory lane N of SVE vector SRC corresponds to architectural
5844 lane N of a register holding VQ_SRC. This in turn means that
5845 memory lane 0 of SVE vector SRC is in the lsb of VQ_SRC (viewed
5846 as a single 128-bit value) and thus that memory lane 0 of SRC is
5847 in the lsb of the integer. Duplicating the integer therefore
5848 ensures that memory lane N of SRC goes into architectural lane
5849 N + I * STEP of the SVE register. */
5850 scalar_mode int_mode = int_mode_for_size (encoded_bits, 0).require ();
5851 rtx elt_value = simplify_gen_subreg (int_mode, vq_src, vq_mode, 0);
5852 if (elt_value)
5854 /* Pretend that we had a vector of INT_MODE to start with. */
5855 elt_mode = int_mode;
5856 mode = aarch64_full_sve_mode (int_mode).require ();
5858 /* If the integer can be moved into a general register by a
5859 single instruction, do that and duplicate the result. */
5860 if (CONST_INT_P (elt_value)
5861 && aarch64_move_imm (INTVAL (elt_value),
5862 encoded_bits <= 32 ? SImode : DImode))
5864 elt_value = force_reg (elt_mode, elt_value);
5865 return expand_vector_broadcast (mode, elt_value);
5868 else if (npatterns == 1)
5869 /* We're duplicating a single value, but can't do better than
5870 force it to memory and load from there. This handles things
5871 like symbolic constants. */
5872 elt_value = CONST_VECTOR_ENCODED_ELT (src, 0);
5874 if (elt_value)
5876 /* Load the element from memory if we can, otherwise move it into
5877 a register and use a DUP. */
5878 rtx op = force_const_mem (elt_mode, elt_value);
5879 if (!op)
5880 op = force_reg (elt_mode, elt_value);
5881 return expand_vector_broadcast (mode, op);
5885 /* Try using INDEX. */
5886 rtx base, step;
5887 if (const_vec_series_p (src, &base, &step))
5889 aarch64_expand_vec_series (target, base, step);
5890 return target;
5893 /* From here on, it's better to force the whole constant to memory
5894 if we can. */
5895 if (GET_MODE_NUNITS (mode).is_constant ())
5896 return NULL_RTX;
5898 if (nelts_per_pattern == 2)
5899 if (rtx res = aarch64_expand_sve_const_vector_sel (target, src))
5900 return res;
5902 /* Expand each pattern individually. */
5903 gcc_assert (npatterns > 1);
5904 rtx_vector_builder builder;
5905 auto_vec<rtx, 16> vectors (npatterns);
5906 for (unsigned int i = 0; i < npatterns; ++i)
5908 builder.new_vector (mode, 1, nelts_per_pattern);
5909 for (unsigned int j = 0; j < nelts_per_pattern; ++j)
5910 builder.quick_push (CONST_VECTOR_ELT (src, i + j * npatterns));
5911 vectors.quick_push (force_reg (mode, builder.build ()));
5914 /* Use permutes to interleave the separate vectors. */
5915 while (npatterns > 1)
5917 npatterns /= 2;
5918 for (unsigned int i = 0; i < npatterns; ++i)
5920 rtx tmp = (npatterns == 1 ? target : gen_reg_rtx (mode));
5921 rtvec v = gen_rtvec (2, vectors[i], vectors[i + npatterns]);
5922 emit_set_insn (tmp, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
5923 vectors[i] = tmp;
5926 gcc_assert (vectors[0] == target);
5927 return target;
5930 /* Use WHILE to set a predicate register of mode MODE in which the first
5931 VL bits are set and the rest are clear. Use TARGET for the register
5932 if it's nonnull and convenient. */
5934 static rtx
5935 aarch64_sve_move_pred_via_while (rtx target, machine_mode mode,
5936 unsigned int vl)
5938 rtx limit = force_reg (DImode, gen_int_mode (vl, DImode));
5939 target = aarch64_target_reg (target, mode);
5940 emit_insn (gen_while (UNSPEC_WHILELO, DImode, mode,
5941 target, const0_rtx, limit));
5942 return target;
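/* This expands to a WHILELO from zero, e.g. (illustrative) "whilelo p0.b, xzr, x1" with x1 holding VL; the element suffix follows MODE. */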
5945 static rtx
5946 aarch64_expand_sve_const_pred_1 (rtx, rtx_vector_builder &, bool);
5948 /* BUILDER is a constant predicate in which the index of every set bit
5949 is a multiple of ELT_SIZE (which is <= 8). Try to load the constant
5950 by inverting every element at a multiple of ELT_SIZE and EORing the
5951 result with an ELT_SIZE PTRUE.
5953 Return a register that contains the constant on success, otherwise
5954 return null. Use TARGET as the register if it is nonnull and
5955 convenient. */
5957 static rtx
5958 aarch64_expand_sve_const_pred_eor (rtx target, rtx_vector_builder &builder,
5959 unsigned int elt_size)
5961 /* Invert every element at a multiple of ELT_SIZE, keeping the
5962 other bits zero. */
5963 rtx_vector_builder inv_builder (VNx16BImode, builder.npatterns (),
5964 builder.nelts_per_pattern ());
5965 for (unsigned int i = 0; i < builder.encoded_nelts (); ++i)
5966 if ((i & (elt_size - 1)) == 0 && INTVAL (builder.elt (i)) == 0)
5967 inv_builder.quick_push (const1_rtx);
5968 else
5969 inv_builder.quick_push (const0_rtx);
5970 inv_builder.finalize ();
5972 /* See if we can load the constant cheaply. */
5973 rtx inv = aarch64_expand_sve_const_pred_1 (NULL_RTX, inv_builder, false);
5974 if (!inv)
5975 return NULL_RTX;
5977 /* EOR the result with an ELT_SIZE PTRUE. */
5978 rtx mask = aarch64_ptrue_all (elt_size);
5979 mask = force_reg (VNx16BImode, mask);
5980 inv = gen_lowpart (VNx16BImode, inv);
5981 target = aarch64_target_reg (target, VNx16BImode);
5982 emit_insn (gen_aarch64_pred_z (XOR, VNx16BImode, target, mask, inv, mask));
5983 return target;
5986 /* BUILDER is a constant predicate in which the index of every set bit
5987 is a multiple of ELT_SIZE (which is <= 8). Try to load the constant
5988 using a TRN1 of size PERMUTE_SIZE, which is >= ELT_SIZE. Return the
5989 register on success, otherwise return null. Use TARGET as the register
5990 if nonnull and convenient. */
5992 static rtx
5993 aarch64_expand_sve_const_pred_trn (rtx target, rtx_vector_builder &builder,
5994 unsigned int elt_size,
5995 unsigned int permute_size)
5997 /* We're going to split the constant into two new constants A and B,
5998 with element I of BUILDER going into A if (I & PERMUTE_SIZE) == 0
5999 and into B otherwise. E.g. for PERMUTE_SIZE == 4 && ELT_SIZE == 1:
6001 A: { 0, 1, 2, 3, _, _, _, _, 8, 9, 10, 11, _, _, _, _ }
6002 B: { 4, 5, 6, 7, _, _, _, _, 12, 13, 14, 15, _, _, _, _ }
6004 where _ indicates elements that will be discarded by the permute.
6006 First calculate the ELT_SIZEs for A and B. */
6007 unsigned int a_elt_size = GET_MODE_SIZE (DImode);
6008 unsigned int b_elt_size = GET_MODE_SIZE (DImode);
6009 for (unsigned int i = 0; i < builder.encoded_nelts (); i += elt_size)
6010 if (INTVAL (builder.elt (i)) != 0)
6012 if (i & permute_size)
6013 b_elt_size |= i - permute_size;
6014 else
6015 a_elt_size |= i;
6017 a_elt_size &= -a_elt_size;
6018 b_elt_size &= -b_elt_size;
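/* The x &= -x idiom isolates the lowest set bit, so A_ELT_SIZE and B_ELT_SIZE are now the largest power-of-two strides (capped at the size of DImode) at which A and B respectively can have significant elements. */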
6020 /* Now construct the vectors themselves. */
6021 rtx_vector_builder a_builder (VNx16BImode, builder.npatterns (),
6022 builder.nelts_per_pattern ());
6023 rtx_vector_builder b_builder (VNx16BImode, builder.npatterns (),
6024 builder.nelts_per_pattern ());
6025 unsigned int nelts = builder.encoded_nelts ();
6026 for (unsigned int i = 0; i < nelts; ++i)
6027 if (i & (elt_size - 1))
6029 a_builder.quick_push (const0_rtx);
6030 b_builder.quick_push (const0_rtx);
6032 else if ((i & permute_size) == 0)
6034 /* The A and B elements are significant. */
6035 a_builder.quick_push (builder.elt (i));
6036 b_builder.quick_push (builder.elt (i + permute_size));
6038 else
6040 /* The A and B elements are going to be discarded, so pick whatever
6041 is likely to give a nice constant. We are targeting element
6042 sizes A_ELT_SIZE and B_ELT_SIZE for A and B respectively,
6043 with the aim of each being a sequence of ones followed by
6044 a sequence of zeros. So:
6046 * if X_ELT_SIZE <= PERMUTE_SIZE, the best approach is to
6047 duplicate the last X_ELT_SIZE element, to extend the
6048 current sequence of ones or zeros.
6050 * if X_ELT_SIZE > PERMUTE_SIZE, the best approach is to add a
6051 zero, so that the constant really does have X_ELT_SIZE and
6052 not a smaller size. */
6053 if (a_elt_size > permute_size)
6054 a_builder.quick_push (const0_rtx);
6055 else
6056 a_builder.quick_push (a_builder.elt (i - a_elt_size));
6057 if (b_elt_size > permute_size)
6058 b_builder.quick_push (const0_rtx);
6059 else
6060 b_builder.quick_push (b_builder.elt (i - b_elt_size));
6062 a_builder.finalize ();
6063 b_builder.finalize ();
6065 /* Try loading A into a register. */
6066 rtx_insn *last = get_last_insn ();
6067 rtx a = aarch64_expand_sve_const_pred_1 (NULL_RTX, a_builder, false);
6068 if (!a)
6069 return NULL_RTX;
6071 /* Try loading B into a register. */
6072 rtx b = a;
6073 if (a_builder != b_builder)
6075 b = aarch64_expand_sve_const_pred_1 (NULL_RTX, b_builder, false);
6076 if (!b)
6078 delete_insns_since (last);
6079 return NULL_RTX;
6083 /* Emit the TRN1 itself. We emit a TRN that operates on VNx16BI
6084 operands but permutes them as though they had mode MODE. */
6085 machine_mode mode = aarch64_sve_pred_mode (permute_size).require ();
6086 target = aarch64_target_reg (target, GET_MODE (a));
6087 rtx type_reg = CONST0_RTX (mode);
6088 emit_insn (gen_aarch64_sve_trn1_conv (mode, target, a, b, type_reg));
6089 return target;
6092 /* Subroutine of aarch64_expand_sve_const_pred. Try to load the VNx16BI
6093 constant in BUILDER into an SVE predicate register. Return the register
6094 on success, otherwise return null. Use TARGET for the register if
6095 nonnull and convenient.
6097 ALLOW_RECURSE_P is true if we can use methods that would call this
6098 function recursively. */
6100 static rtx
6101 aarch64_expand_sve_const_pred_1 (rtx target, rtx_vector_builder &builder,
6102 bool allow_recurse_p)
6104 if (builder.encoded_nelts () == 1)
6105 /* A PFALSE or a PTRUE .B ALL. */
6106 return aarch64_emit_set_immediate (target, builder);
6108 unsigned int elt_size = aarch64_widest_sve_pred_elt_size (builder);
6109 if (int vl = aarch64_partial_ptrue_length (builder, elt_size))
6111 /* If we can load the constant using PTRUE, use it as-is. */
6112 machine_mode mode = aarch64_sve_pred_mode (elt_size).require ();
6113 if (aarch64_svpattern_for_vl (mode, vl) != AARCH64_NUM_SVPATTERNS)
6114 return aarch64_emit_set_immediate (target, builder);
6116 /* Otherwise use WHILE to set the first VL bits. */
6117 return aarch64_sve_move_pred_via_while (target, mode, vl);
6120 if (!allow_recurse_p)
6121 return NULL_RTX;
6123 /* Try inverting the vector in element size ELT_SIZE and then EORing
6124 the result with an ELT_SIZE PTRUE. */
6125 if (INTVAL (builder.elt (0)) == 0)
6126 if (rtx res = aarch64_expand_sve_const_pred_eor (target, builder,
6127 elt_size))
6128 return res;
6130 /* Try using TRN1 to permute two simpler constants. */
6131 for (unsigned int i = elt_size; i <= 8; i *= 2)
6132 if (rtx res = aarch64_expand_sve_const_pred_trn (target, builder,
6133 elt_size, i))
6134 return res;
6136 return NULL_RTX;
6139 /* Return an SVE predicate register that contains the VNx16BImode
6140 constant in BUILDER, without going through the move expanders.
6142 The returned register can have whatever mode seems most natural
6143 given the contents of BUILDER. Use TARGET for the result if
6144 convenient. */
6146 static rtx
6147 aarch64_expand_sve_const_pred (rtx target, rtx_vector_builder &builder)
6149 /* Try loading the constant using pure predicate operations. */
6150 if (rtx res = aarch64_expand_sve_const_pred_1 (target, builder, true))
6151 return res;
6153 /* Try forcing the constant to memory. */
6154 if (builder.full_nelts ().is_constant ())
6155 if (rtx mem = force_const_mem (VNx16BImode, builder.build ()))
6157 target = aarch64_target_reg (target, VNx16BImode);
6158 emit_move_insn (target, mem);
6159 return target;
6162 /* The last resort is to load the constant as an integer and then
6163 compare it against zero. Use -1 for set bits in order to increase
6164 the chances of using SVE DUPM or an Advanced SIMD byte mask. */
6165 rtx_vector_builder int_builder (VNx16QImode, builder.npatterns (),
6166 builder.nelts_per_pattern ());
6167 for (unsigned int i = 0; i < builder.encoded_nelts (); ++i)
6168 int_builder.quick_push (INTVAL (builder.elt (i))
6169 ? constm1_rtx : const0_rtx);
6170 return aarch64_convert_sve_data_to_pred (target, VNx16BImode,
6171 int_builder.build ());
6174 /* Set DEST to immediate IMM. */
6176 void
6177 aarch64_expand_mov_immediate (rtx dest, rtx imm)
6179 machine_mode mode = GET_MODE (dest);
6181 /* Check on what type of symbol it is. */
6182 scalar_int_mode int_mode;
6183 if ((SYMBOL_REF_P (imm)
6184 || LABEL_REF_P (imm)
6185 || GET_CODE (imm) == CONST
6186 || GET_CODE (imm) == CONST_POLY_INT)
6187 && is_a <scalar_int_mode> (mode, &int_mode))
6189 rtx mem;
6190 poly_int64 offset;
6191 HOST_WIDE_INT const_offset;
6192 enum aarch64_symbol_type sty;
6194 /* If we have (const (plus symbol offset)), separate out the offset
6195 before we start classifying the symbol. */
6196 rtx base = strip_offset (imm, &offset);
6198 /* We must always add an offset involving VL separately, rather than
6199 folding it into the relocation. */
6200 if (!offset.is_constant (&const_offset))
6202 if (!TARGET_SVE)
6204 aarch64_report_sve_required ();
6205 return;
6207 if (base == const0_rtx
6208 && (aarch64_sve_cnt_immediate_p (offset)
6209 || aarch64_sve_rdvl_immediate_p (offset)))
6210 emit_insn (gen_rtx_SET (dest, imm));
6211 else
6213 /* Do arithmetic on 32-bit values if the result is smaller
6214 than that. */
6215 if (partial_subreg_p (int_mode, SImode))
6217 /* It is invalid to do symbol calculations in modes
6218 narrower than SImode. */
6219 gcc_assert (base == const0_rtx);
6220 dest = gen_lowpart (SImode, dest);
6221 int_mode = SImode;
6223 if (base != const0_rtx)
6225 base = aarch64_force_temporary (int_mode, dest, base);
6226 aarch64_add_offset (int_mode, dest, base, offset,
6227 NULL_RTX, NULL_RTX, 0, false);
6229 else
6230 aarch64_add_offset (int_mode, dest, base, offset,
6231 dest, NULL_RTX, 0, false);
6233 return;
6236 if (aarch64_rdsvl_immediate_p (base))
6238 /* We could handle non-constant offsets if they are ever
6239 generated. */
6240 gcc_assert (const_offset == 0);
6241 emit_insn (gen_rtx_SET (dest, imm));
6242 return;
6245 sty = aarch64_classify_symbol (base, const_offset);
6246 switch (sty)
6248 case SYMBOL_FORCE_TO_MEM:
6249 if (int_mode != ptr_mode)
6250 imm = convert_memory_address (ptr_mode, imm);
6252 if (const_offset != 0
6253 && targetm.cannot_force_const_mem (ptr_mode, imm))
6255 gcc_assert (can_create_pseudo_p ());
6256 base = aarch64_force_temporary (int_mode, dest, base);
6257 aarch64_add_offset (int_mode, dest, base, const_offset,
6258 NULL_RTX, NULL_RTX, 0, false);
6259 return;
6262 mem = force_const_mem (ptr_mode, imm);
6263 gcc_assert (mem);
6265 /* If we aren't generating PC relative literals, then
6266 we need to expand the literal pool access carefully.
6267 This is something that needs to be done in a number
6268 of places, so could well live as a separate function. */
6269 if (!aarch64_pcrelative_literal_loads)
6271 gcc_assert (can_create_pseudo_p ());
6272 base = gen_reg_rtx (ptr_mode);
6273 aarch64_expand_mov_immediate (base, XEXP (mem, 0));
6274 if (ptr_mode != Pmode)
6275 base = convert_memory_address (Pmode, base);
6276 mem = gen_rtx_MEM (ptr_mode, base);
6279 if (int_mode != ptr_mode)
6280 mem = gen_rtx_ZERO_EXTEND (int_mode, mem);
6282 emit_insn (gen_rtx_SET (dest, mem));
6284 return;
6286 case SYMBOL_SMALL_TLSGD:
6287 case SYMBOL_SMALL_TLSDESC:
6288 case SYMBOL_SMALL_TLSIE:
6289 case SYMBOL_SMALL_GOT_28K:
6290 case SYMBOL_SMALL_GOT_4G:
6291 case SYMBOL_TINY_GOT:
6292 case SYMBOL_TINY_TLSIE:
6293 if (const_offset != 0)
6295 gcc_assert (can_create_pseudo_p ());
6296 base = aarch64_force_temporary (int_mode, dest, base);
6297 aarch64_add_offset (int_mode, dest, base, const_offset,
6298 NULL_RTX, NULL_RTX, 0, false);
6299 return;
6301 /* FALLTHRU */
6303 case SYMBOL_SMALL_ABSOLUTE:
6304 case SYMBOL_TINY_ABSOLUTE:
6305 case SYMBOL_TLSLE12:
6306 case SYMBOL_TLSLE24:
6307 case SYMBOL_TLSLE32:
6308 case SYMBOL_TLSLE48:
6309 aarch64_load_symref_appropriately (dest, imm, sty);
6310 return;
6312 default:
6313 gcc_unreachable ();
6317 if (!CONST_INT_P (imm))
6319 if (aarch64_sve_pred_mode_p (mode))
6321 /* Only the low bit of each .H, .S and .D element is defined,
6322 so we can set the upper bits to whatever we like. If the
6323 predicate is all-true in MODE, prefer to set all the undefined
6324 bits as well, so that we can share a single .B predicate for
6325 all modes. */
6326 if (imm == CONSTM1_RTX (mode))
6327 imm = CONSTM1_RTX (VNx16BImode);
6329 /* All methods for constructing predicate modes wider than VNx16BI
6330 will set the upper bits of each element to zero. Expose this
6331 by moving such constants as a VNx16BI, so that all bits are
6332 significant and so that constants for different modes can be
6333 shared. The wider constant will still be available as a
6334 REG_EQUAL note. */
6335 rtx_vector_builder builder;
6336 if (aarch64_get_sve_pred_bits (builder, imm))
6338 rtx res = aarch64_expand_sve_const_pred (dest, builder);
6339 if (dest != res)
6340 emit_move_insn (dest, gen_lowpart (mode, res));
6341 return;
6345 if (GET_CODE (imm) == HIGH || aarch64_simd_valid_mov_imm (imm))
6347 emit_insn (gen_rtx_SET (dest, imm));
6348 return;
6351 if (CONST_VECTOR_P (imm) && aarch64_sve_data_mode_p (mode))
6352 if (rtx res = aarch64_expand_sve_const_vector (dest, imm))
6354 if (dest != res)
6355 emit_insn (gen_aarch64_sve_reinterpret (mode, dest, res));
6356 return;
6359 rtx mem = force_const_mem (mode, imm);
6360 gcc_assert (mem);
6361 emit_move_insn (dest, mem);
6362 return;
6365 aarch64_internal_mov_immediate (dest, imm, true, mode);
6368 /* Return the MEM rtx that provides the canary value that should be used
6369 for stack-smashing protection. MODE is the mode of the memory.
6370 For SSP_GLOBAL, DECL_RTL is the MEM rtx for the canary variable
6371 (__stack_chk_guard), otherwise it has no useful value. SALT_TYPE
6372 indicates whether the caller is performing a SET or a TEST operation. */
6374 rtx
6375 aarch64_stack_protect_canary_mem (machine_mode mode, rtx decl_rtl,
6376 aarch64_salt_type salt_type)
6378 rtx addr;
6379 if (aarch64_stack_protector_guard == SSP_GLOBAL)
6381 gcc_assert (MEM_P (decl_rtl));
6382 addr = XEXP (decl_rtl, 0);
6383 poly_int64 offset;
6384 rtx base = strip_offset_and_salt (addr, &offset);
6385 if (!SYMBOL_REF_P (base))
6386 return decl_rtl;
6388 rtvec v = gen_rtvec (2, base, GEN_INT (salt_type));
6389 addr = gen_rtx_UNSPEC (Pmode, v, UNSPEC_SALT_ADDR);
6390 addr = gen_rtx_CONST (Pmode, addr);
6391 addr = plus_constant (Pmode, addr, offset);
6393 else
6395 /* Calculate the address from the system register. */
6396 rtx salt = GEN_INT (salt_type);
6397 addr = gen_reg_rtx (mode);
6398 if (mode == DImode)
6399 emit_insn (gen_reg_stack_protect_address_di (addr, salt));
6400 else
6402 emit_insn (gen_reg_stack_protect_address_si (addr, salt));
6403 addr = convert_memory_address (Pmode, addr);
6405 addr = plus_constant (Pmode, addr, aarch64_stack_protector_guard_offset);
6407 return gen_rtx_MEM (mode, force_reg (Pmode, addr));
6410 /* Emit an SVE predicated move from SRC to DEST. PRED is a predicate
6411 that is known to contain PTRUE. */
6413 void
6414 aarch64_emit_sve_pred_move (rtx dest, rtx pred, rtx src)
6416 expand_operand ops[3];
6417 machine_mode mode = GET_MODE (dest);
6418 create_output_operand (&ops[0], dest, mode);
6419 create_input_operand (&ops[1], pred, GET_MODE (pred));
6420 create_input_operand (&ops[2], src, mode);
6421 temporary_volatile_ok v (true);
6422 expand_insn (code_for_aarch64_pred_mov (mode), 3, ops);
6425 /* Expand a pre-RA SVE data move from SRC to DEST in which at least one
6426 operand is in memory. In this case we need to use the predicated LD1
6427 and ST1 instead of LDR and STR, both for correctness on big-endian
6428 targets and because LD1 and ST1 support a wider range of addressing modes.
6429 PRED_MODE is the mode of the predicate.
6431 See the comment at the head of aarch64-sve.md for details about the
6432 big-endian handling. */
6434 void
6435 aarch64_expand_sve_mem_move (rtx dest, rtx src, machine_mode pred_mode)
6437 machine_mode mode = GET_MODE (dest);
6438 rtx ptrue = aarch64_ptrue_reg (pred_mode);
6439 if (!register_operand (src, mode)
6440 && !register_operand (dest, mode))
6442 rtx tmp = gen_reg_rtx (mode);
6443 if (MEM_P (src))
6444 aarch64_emit_sve_pred_move (tmp, ptrue, src);
6445 else
6446 emit_move_insn (tmp, src);
6447 src = tmp;
6449 aarch64_emit_sve_pred_move (dest, ptrue, src);
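/* Neither LD1 nor ST1 can move data directly between two memory locations, which is why a memory-to-memory move is staged through the temporary register above. */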
6452 /* Called only on big-endian targets. See whether an SVE vector move
6453 from SRC to DEST is effectively a REV[BHW] instruction, because at
6454 least one operand is a subreg of an SVE vector that has wider or
6455 narrower elements. Return true and emit the instruction if so.
6457 For example:
6459 (set (reg:VNx8HI R1) (subreg:VNx8HI (reg:VNx16QI R2) 0))
6461 represents a VIEW_CONVERT between the following vectors, viewed
6462 in memory order:
6464 R2: { [0].high, [0].low, [1].high, [1].low, ... }
6465 R1: { [0], [1], [2], [3], ... }
6467 The high part of lane X in R2 should therefore correspond to lane X*2
6468 of R1, but the register representations are:
6470 msb lsb
6471 R2: ...... [1].high [1].low [0].high [0].low
6472 R1: ...... [3] [2] [1] [0]
6474 where the low part of lane X in R2 corresponds to lane X*2 in R1.
6475 We therefore need a reverse operation to swap the high and low values
6476 around.
6478 This is purely an optimization. Without it we would spill the
6479 subreg operand to the stack in one mode and reload it in the
6480 other mode, which has the same effect as the REV. */
6482 bool
6483 aarch64_maybe_expand_sve_subreg_move (rtx dest, rtx src)
6485 gcc_assert (BYTES_BIG_ENDIAN);
6487 /* Do not try to optimize subregs that LRA has created for matched
6488 reloads. These subregs only exist as a temporary measure to make
6489 the RTL well-formed, but they are exempt from the usual
6490 TARGET_CAN_CHANGE_MODE_CLASS rules.
6492 For example, if we have:
6494 (set (reg:VNx8HI R1) (foo:VNx8HI (reg:VNx4SI R2)))
6496 and the constraints require R1 and R2 to be in the same register,
6497 LRA may need to create RTL such as:
6499 (set (subreg:VNx4SI (reg:VNx8HI TMP) 0) (reg:VNx4SI R2))
6500 (set (reg:VNx8HI TMP) (foo:VNx8HI (subreg:VNx4SI (reg:VNx8HI TMP) 0)))
6501 (set (reg:VNx8HI R1) (reg:VNx8HI TMP))
6503 which forces both the input and output of the original instruction
6504 to use the same hard register. But for this to work, the normal
6505 rules have to be suppressed on the subreg input, otherwise LRA
6506 would need to reload that input too, meaning that the process
6507 would never terminate. To compensate for this, the normal rules
6508 are also suppressed for the subreg output of the first move.
6509 Ignoring the special case and handling the first move normally
6510 would therefore generate wrong code: we would reverse the elements
6511 for the first subreg but not reverse them back for the second subreg. */
6512 if (SUBREG_P (dest) && !LRA_SUBREG_P (dest))
6513 dest = SUBREG_REG (dest);
6514 if (SUBREG_P (src) && !LRA_SUBREG_P (src))
6515 src = SUBREG_REG (src);
6517 /* The optimization handles two single SVE REGs with different element
6518 sizes. */
6519 if (!REG_P (dest)
6520 || !REG_P (src)
6521 || aarch64_classify_vector_mode (GET_MODE (dest)) != VEC_SVE_DATA
6522 || aarch64_classify_vector_mode (GET_MODE (src)) != VEC_SVE_DATA
6523 || (GET_MODE_UNIT_SIZE (GET_MODE (dest))
6524 == GET_MODE_UNIT_SIZE (GET_MODE (src))))
6525 return false;
6527 /* Generate *aarch64_sve_mov<mode>_subreg_be. */
6528 rtx ptrue = aarch64_ptrue_reg (VNx16BImode);
6529 rtx unspec = gen_rtx_UNSPEC (GET_MODE (dest), gen_rtvec (2, ptrue, src),
6530 UNSPEC_REV_SUBREG);
6531 emit_insn (gen_rtx_SET (dest, unspec));
6532 return true;
6535 /* Return a copy of X with mode MODE, without changing its other
6536 attributes. Unlike gen_lowpart, this doesn't care whether the
6537 mode change is valid. */
6539 rtx
6540 aarch64_replace_reg_mode (rtx x, machine_mode mode)
6542 if (GET_MODE (x) == mode)
6543 return x;
6545 x = shallow_copy_rtx (x);
6546 set_mode_and_regno (x, mode, REGNO (x));
6547 return x;
6550 /* Return the SVE REV[BHW] unspec for reversing quantities of mode MODE
6551 stored in wider integer containers. */
6553 static unsigned int
6554 aarch64_sve_rev_unspec (machine_mode mode)
6556 switch (GET_MODE_UNIT_SIZE (mode))
6558 case 1: return UNSPEC_REVB;
6559 case 2: return UNSPEC_REVH;
6560 case 4: return UNSPEC_REVW;
6562 gcc_unreachable ();
6565 /* Split a *aarch64_sve_mov<mode>_subreg_be pattern with the given
6566 operands. */
6568 void
6569 aarch64_split_sve_subreg_move (rtx dest, rtx ptrue, rtx src)
6571 /* Decide which REV operation we need. The mode with wider elements
6572 determines the mode of the operands and the mode with the narrower
6573 elements determines the reverse width. */
6574 machine_mode mode_with_wider_elts = aarch64_sve_int_mode (GET_MODE (dest));
6575 machine_mode mode_with_narrower_elts = aarch64_sve_int_mode (GET_MODE (src));
6576 if (GET_MODE_UNIT_SIZE (mode_with_wider_elts)
6577 < GET_MODE_UNIT_SIZE (mode_with_narrower_elts))
6578 std::swap (mode_with_wider_elts, mode_with_narrower_elts);
6580 unsigned int unspec = aarch64_sve_rev_unspec (mode_with_narrower_elts);
6581 machine_mode pred_mode = aarch64_sve_pred_mode (mode_with_wider_elts);
6583 /* Get the operands in the appropriate modes and emit the instruction. */
6584 ptrue = gen_lowpart (pred_mode, ptrue);
6585 dest = aarch64_replace_reg_mode (dest, mode_with_wider_elts);
6586 src = aarch64_replace_reg_mode (src, mode_with_wider_elts);
6587 emit_insn (gen_aarch64_pred (unspec, mode_with_wider_elts,
6588 dest, ptrue, src));
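/* For example (illustrative): on big-endian targets, a move between a VNx8HI register and a VNx4SI subreg of it becomes a predicated "revh z0.s, p0/m, z1.s", reversing the 16-bit halves within each 32-bit container. */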
6591 static bool
6592 aarch64_function_ok_for_sibcall (tree, tree exp)
6594 auto from_abi = crtl->abi->id ();
6595 auto to_abi = expr_callee_abi (exp).id ();
6597 /* ARM_PCS_SVE preserves strictly more than ARM_PCS_SIMD, which in
6598 turn preserves strictly more than the base PCS. The callee must
6599 preserve everything that the caller is required to preserve. */
6600 if (from_abi != to_abi && to_abi == ARM_PCS_SVE)
6601 to_abi = ARM_PCS_SIMD;
6602 if (from_abi != to_abi && to_abi == ARM_PCS_SIMD)
6603 to_abi = ARM_PCS_AAPCS64;
6604 if (from_abi != to_abi)
6605 return false;
6607 tree fntype = TREE_TYPE (TREE_TYPE (CALL_EXPR_FN (exp)));
6608 if (aarch64_fntype_pstate_sm (fntype) & ~aarch64_cfun_incoming_pstate_sm ())
6609 return false;
6610 for (auto state : { "za", "zt0" })
6611 if (bool (aarch64_cfun_shared_flags (state))
6612 != bool (aarch64_fntype_shared_flags (fntype, state)))
6613 return false;
6615 /* BTI J is needed where indirect_return functions may return
6616 if bti is enabled there. */
6617 if (lookup_attribute ("indirect_return", TYPE_ATTRIBUTES (fntype))
6618 && !lookup_attribute ("indirect_return",
6619 TYPE_ATTRIBUTES (TREE_TYPE (cfun->decl))))
6620 return false;
6622 return true;
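/* At this point the callee preserves at least as much register state as the caller must, its streaming-mode and ZA/ZT0 requirements are compatible, and the indirect_return/BTI requirement is satisfied, so a sibling call is safe. */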
6625 /* Subroutine of aarch64_pass_by_reference for arguments that are not
6626 passed in SVE registers. */
6628 static bool
6629 aarch64_pass_by_reference_1 (CUMULATIVE_ARGS *pcum,
6630 const function_arg_info &arg)
6632 HOST_WIDE_INT size;
6633 machine_mode dummymode;
6634 int nregs;
6636 /* GET_MODE_SIZE (BLKmode) is useless since it is 0. */
6637 if (arg.mode == BLKmode && arg.type)
6638 size = int_size_in_bytes (arg.type);
6639 else
6640 /* No frontends can create types with variable-sized modes, so we
6641 shouldn't be asked to pass or return them. */
6642 size = GET_MODE_SIZE (arg.mode).to_constant ();
6644 /* Aggregates are passed by reference based on their size. */
6645 if (arg.aggregate_type_p ())
6646 size = int_size_in_bytes (arg.type);
6648 /* Variable sized arguments are always returned by reference. */
6649 if (size < 0)
6650 return true;
6652 /* Can this be a candidate to be passed in fp/simd register(s)? */
6653 if (aarch64_vfp_is_call_or_return_candidate (arg.mode, arg.type,
6654 &dummymode, &nregs, NULL,
6655 !pcum || pcum->silent_p))
6656 return false;
6658 /* Arguments which are variable sized or larger than 2 registers are
6659 passed by reference unless they are a homogeneous floating-point
6660 aggregate. */
6661 return size > 2 * UNITS_PER_WORD;
6664 /* Implement TARGET_PASS_BY_REFERENCE. */
6666 static bool
6667 aarch64_pass_by_reference (cumulative_args_t pcum_v,
6668 const function_arg_info &arg)
6670 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
6672 if (!arg.type)
6673 return aarch64_pass_by_reference_1 (pcum, arg);
6675 pure_scalable_type_info pst_info;
6676 switch (pst_info.analyze (arg.type))
6678 case pure_scalable_type_info::IS_PST:
6679 if (pcum && !pcum->silent_p && !TARGET_SVE)
6680 /* We can't gracefully recover at this point, so make this a
6681 fatal error. */
6682 fatal_error (input_location, "arguments of type %qT require"
6683 " the SVE ISA extension", arg.type);
6685 /* Variadic SVE types are passed by reference. Normal non-variadic
6686 arguments are too if we've run out of registers. */
6687 return (!arg.named
6688 || pcum->aapcs_nvrn + pst_info.num_zr () > NUM_FP_ARG_REGS
6689 || pcum->aapcs_nprn + pst_info.num_pr () > NUM_PR_ARG_REGS);
6691 case pure_scalable_type_info::DOESNT_MATTER:
6692 gcc_assert (aarch64_pass_by_reference_1 (pcum, arg));
6693 return true;
6695 case pure_scalable_type_info::NO_ABI_IDENTITY:
6696 case pure_scalable_type_info::ISNT_PST:
6697 return aarch64_pass_by_reference_1 (pcum, arg);
6699 gcc_unreachable ();
6702 /* Return TRUE if VALTYPE is padded to its least significant bits. */
6703 static bool
6704 aarch64_return_in_msb (const_tree valtype)
6706 machine_mode dummy_mode;
6707 int dummy_int;
6709 /* Never happens in little-endian mode. */
6710 if (!BYTES_BIG_ENDIAN)
6711 return false;
6713 /* Only composite types smaller than or equal to 16 bytes can
6714 be potentially returned in registers. */
6715 if (!aarch64_composite_type_p (valtype, TYPE_MODE (valtype))
6716 || int_size_in_bytes (valtype) <= 0
6717 || int_size_in_bytes (valtype) > 16)
6718 return false;
6720 /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
6721 or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
6722 is always passed/returned in the least significant bits of fp/simd
6723 register(s). */
6724 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype), valtype,
6725 &dummy_mode, &dummy_int, NULL,
6726 false))
6727 return false;
6729 /* Likewise pure scalable types for SVE vector and predicate registers. */
6730 pure_scalable_type_info pst_info;
6731 if (pst_info.analyze_registers (valtype))
6732 return false;
6734 return true;
6737 /* Implement TARGET_FUNCTION_VALUE.
6738 Define how to find the value returned by a function. */
6740 static rtx
6741 aarch64_function_value (const_tree type, const_tree func,
6742 bool outgoing ATTRIBUTE_UNUSED)
6744 machine_mode mode;
6745 int unsignedp;
6747 mode = TYPE_MODE (type);
6748 if (INTEGRAL_TYPE_P (type))
6749 mode = promote_function_mode (type, mode, &unsignedp, func, 1);
6751 pure_scalable_type_info pst_info;
6752 if (type && pst_info.analyze_registers (type))
6753 return pst_info.get_rtx (mode, V0_REGNUM, P0_REGNUM);
6755 /* Generic vectors that map to full SVE modes with -msve-vector-bits=N
6756 are returned in memory, not by value. */
6757 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
6758 bool sve_p = (vec_flags & VEC_ANY_SVE);
6760 if (aarch64_return_in_msb (type))
6762 HOST_WIDE_INT size = int_size_in_bytes (type);
6764 if (size % UNITS_PER_WORD != 0)
6766 size += UNITS_PER_WORD - size % UNITS_PER_WORD;
6767 mode = int_mode_for_size (size * BITS_PER_UNIT, 0).require ();
6771 int count;
6772 machine_mode ag_mode;
6773 if (aarch64_vfp_is_call_or_return_candidate (mode, type, &ag_mode, &count,
6774 NULL, false))
6776 gcc_assert (!sve_p);
6777 if (!aarch64_composite_type_p (type, mode))
6779 gcc_assert (count == 1 && mode == ag_mode);
6780 return gen_rtx_REG (mode, V0_REGNUM);
6782 else if (aarch64_advsimd_full_struct_mode_p (mode)
6783 && known_eq (GET_MODE_SIZE (ag_mode), 16))
6784 return gen_rtx_REG (mode, V0_REGNUM);
6785 else if (aarch64_advsimd_partial_struct_mode_p (mode)
6786 && known_eq (GET_MODE_SIZE (ag_mode), 8))
6787 return gen_rtx_REG (mode, V0_REGNUM);
6788 else
6790 int i;
6791 rtx par;
6793 par = gen_rtx_PARALLEL (mode, rtvec_alloc (count));
6794 for (i = 0; i < count; i++)
6796 rtx tmp = gen_rtx_REG (ag_mode, V0_REGNUM + i);
6797 rtx offset = gen_int_mode (i * GET_MODE_SIZE (ag_mode), Pmode);
6798 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
6799 XVECEXP (par, 0, i) = tmp;
6801 return par;
6804 else
6806 if (sve_p)
6808 /* Vector types can acquire a partial SVE mode using things like
6809 __attribute__((vector_size(N))), and this is potentially useful.
6810 However, the choice of mode doesn't affect the type's ABI
6811 identity, so we should treat the types as though they had
6812 the associated integer mode, just like they did before SVE
6813 was introduced.
6815 We know that the vector must be 128 bits or smaller,
6816 otherwise we'd have returned it in memory instead. */
6817 gcc_assert (type
6818 && (aarch64_some_values_include_pst_objects_p (type)
6819 || (vec_flags & VEC_PARTIAL)));
6821 scalar_int_mode int_mode = int_mode_for_mode (mode).require ();
6822 rtx reg = gen_rtx_REG (int_mode, R0_REGNUM);
6823 rtx pair = gen_rtx_EXPR_LIST (VOIDmode, reg, const0_rtx);
6824 return gen_rtx_PARALLEL (mode, gen_rtvec (1, pair));
6826 return gen_rtx_REG (mode, R0_REGNUM);
6830 /* Implements TARGET_FUNCTION_VALUE_REGNO_P.
6831 Return true if REGNO is the number of a hard register in which the values
6832 of called function may come back. */
6834 static bool
6835 aarch64_function_value_regno_p (const unsigned int regno)
6837 /* Maximum of 16 bytes can be returned in the general registers. Examples
6838 of 16-byte return values are: 128-bit integers and 16-byte small
6839 structures (excluding homogeneous floating-point aggregates). */
6840 if (regno == R0_REGNUM || regno == R1_REGNUM)
6841 return true;
6843 /* Up to four fp/simd registers can return a function value, e.g. a
6844 homogeneous floating-point aggregate having four members. */
6845 if (regno >= V0_REGNUM && regno < V0_REGNUM + HA_MAX_NUM_FLDS)
6846 return TARGET_FLOAT;
6848 if (regno >= P0_REGNUM && regno < P0_REGNUM + HA_MAX_NUM_FLDS)
6849 return TARGET_SVE;
6851 return false;
6854 /* Subroutine for aarch64_return_in_memory for types that are not returned
6855 in SVE registers. */
6857 static bool
6858 aarch64_return_in_memory_1 (const_tree type)
6860 HOST_WIDE_INT size;
6861 machine_mode ag_mode;
6862 int count;
6864 if (!AGGREGATE_TYPE_P (type)
6865 && TREE_CODE (type) != BITINT_TYPE
6866 && TREE_CODE (type) != COMPLEX_TYPE
6867 && TREE_CODE (type) != VECTOR_TYPE)
6868 /* Simple scalar types are always returned in registers. */
6869 return false;
6871 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type,
6872 &ag_mode, &count, NULL, false))
6873 return false;
6875 /* Types larger than 2 registers are returned in memory. */
6876 size = int_size_in_bytes (type);
6877 return (size < 0 || size > 2 * UNITS_PER_WORD);
6880 /* Implement TARGET_RETURN_IN_MEMORY.
6882 If the type T of the result of a function is such that
6883 void func (T arg)
6884 would require that arg be passed as a value in a register (or set of
6885 registers) according to the parameter passing rules, then the result
6886 is returned in the same registers as would be used for such an
6887 argument. */
6889 static bool
6890 aarch64_return_in_memory (const_tree type, const_tree fndecl ATTRIBUTE_UNUSED)
6892 pure_scalable_type_info pst_info;
6893 switch (pst_info.analyze (type))
6895 case pure_scalable_type_info::IS_PST:
6896 return (pst_info.num_zr () > NUM_FP_ARG_REGS
6897 || pst_info.num_pr () > NUM_PR_ARG_REGS);
6899 case pure_scalable_type_info::DOESNT_MATTER:
6900 gcc_assert (aarch64_return_in_memory_1 (type));
6901 return true;
6903 case pure_scalable_type_info::NO_ABI_IDENTITY:
6904 case pure_scalable_type_info::ISNT_PST:
6905 return aarch64_return_in_memory_1 (type);
6907 gcc_unreachable ();
6910 static bool
6911 aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v, machine_mode mode,
6912 const_tree type, int *nregs)
6914 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
6915 return aarch64_vfp_is_call_or_return_candidate (mode, type,
6916 &pcum->aapcs_vfp_rmode,
6917 nregs, NULL, pcum->silent_p);
6920 /* Given MODE and TYPE of a function argument, return the alignment in
6921 bits. The idea is to suppress any stronger alignment requested by
6922 the user and opt for the natural alignment (specified in AAPCS64 \S
6923 4.1). ABI_BREAK_GCC_9 is set to the old alignment if the alignment
6924 was incorrectly calculated in versions of GCC prior to GCC 9.
6925 ABI_BREAK_GCC_13 is set to the old alignment if it was incorrectly
6926 calculated in versions between GCC 9 and GCC 13. If the alignment
6927 might have changed between GCC 13 and GCC 14, ABI_BREAK_GCC_14
6928 is the old GCC 13 alignment, otherwise it is zero.
6930 This is a helper function for local use only. */
6932 static unsigned int
6933 aarch64_function_arg_alignment (machine_mode mode, const_tree type,
6934 unsigned int *abi_break_gcc_9,
6935 unsigned int *abi_break_gcc_13,
6936 unsigned int *abi_break_gcc_14)
6938 *abi_break_gcc_9 = 0;
6939 *abi_break_gcc_13 = 0;
6940 *abi_break_gcc_14 = 0;
6941 if (!type)
6942 return GET_MODE_ALIGNMENT (mode);
6944 if (integer_zerop (TYPE_SIZE (type)))
6945 return 0;
6947 gcc_assert (TYPE_MODE (type) == mode);
6949 if (!AGGREGATE_TYPE_P (type))
6951 /* The ABI alignment is the natural alignment of the type, without
6952 any attributes applied. Normally this is the alignment of the
6953 TYPE_MAIN_VARIANT, but not always; see PR108910 for a counterexample.
6954 For now we just handle the known exceptions explicitly. */
6955 type = TYPE_MAIN_VARIANT (type);
6956 if (POINTER_TYPE_P (type))
6958 gcc_assert (known_eq (POINTER_SIZE, GET_MODE_BITSIZE (mode)));
6959 return POINTER_SIZE;
6961 if (TREE_CODE (type) == ENUMERAL_TYPE && TREE_TYPE (type))
6963 *abi_break_gcc_14 = TYPE_ALIGN (type);
6964 type = TYPE_MAIN_VARIANT (TREE_TYPE (type));
6966 gcc_assert (!TYPE_USER_ALIGN (type));
6967 return TYPE_ALIGN (type);
6970 if (TREE_CODE (type) == ARRAY_TYPE)
6971 return TYPE_ALIGN (TREE_TYPE (type));
6973 unsigned int alignment = 0;
6974 unsigned int bitfield_alignment_with_packed = 0;
6975 unsigned int bitfield_alignment = 0;
6976 for (tree field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
6977 if (TREE_CODE (field) == FIELD_DECL)
6979 /* Note that we explicitly consider zero-sized fields here,
6980 even though they don't map to AAPCS64 machine types.
6981 For example, in:
6983 struct __attribute__((aligned(8))) empty {};
6985 struct s {
6986 [[no_unique_address]] empty e;
6987 int x;
6990 "s" contains only one Fundamental Data Type (the int field)
6991 but gains 8-byte alignment and size thanks to "e". */
6992 alignment = std::max (alignment, DECL_ALIGN (field));
6993 if (DECL_BIT_FIELD_TYPE (field))
6995 /* Take the bit-field type's alignment into account only
6996 if the user didn't reduce this field's alignment with
6997 the packed attribute. */
6998 if (!DECL_PACKED (field))
6999 bitfield_alignment
7000 = std::max (bitfield_alignment,
7001 TYPE_ALIGN (DECL_BIT_FIELD_TYPE (field)));
7003 /* Compute the alignment even if the bit-field is
7004 packed, so that we can emit a warning in case the
7005 alignment changed between GCC versions. */
7006 bitfield_alignment_with_packed
7007 = std::max (bitfield_alignment_with_packed,
7008 TYPE_ALIGN (DECL_BIT_FIELD_TYPE (field)));
7012 /* Emit a warning if the alignment is different when taking the
7013 'packed' attribute into account. */
7014 if (bitfield_alignment != bitfield_alignment_with_packed
7015 && bitfield_alignment_with_packed > alignment)
7016 *abi_break_gcc_13 = bitfield_alignment_with_packed;
7018 if (bitfield_alignment > alignment)
7020 *abi_break_gcc_9 = alignment;
7021 return bitfield_alignment;
7024 return alignment;
7027 /* Return true if TYPE describes a _BitInt(N) or an aggregate that uses the
7028 _BitInt(N) type. These include ARRAY_TYPE's with an element that is a
7029 _BitInt(N) or an aggregate that uses it, and a RECORD_TYPE or a UNION_TYPE
7030 with a field member that is a _BitInt(N) or an aggregate that uses it.
7031 Return false otherwise. */
7033 static bool
7034 bitint_or_aggr_of_bitint_p (tree type)
7036 if (!type)
7037 return false;
7039 if (TREE_CODE (type) == BITINT_TYPE)
7040 return true;
7042 /* If ARRAY_TYPE, check its element type. */
7043 if (TREE_CODE (type) == ARRAY_TYPE)
7044 return bitint_or_aggr_of_bitint_p (TREE_TYPE (type));
7046 /* If RECORD_TYPE or UNION_TYPE, check the fields' types. */
7047 if (RECORD_OR_UNION_TYPE_P (type))
7048 for (tree field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
7050 if (TREE_CODE (field) != FIELD_DECL)
7051 continue;
7052 if (bitint_or_aggr_of_bitint_p (TREE_TYPE (field)))
7053 return true;
7055 return false;
7058 /* Layout a function argument according to the AAPCS64 rules. The rule
7059 numbers refer to the rule numbers in the AAPCS64. ORIG_MODE is the
7060 mode that was originally given to us by the target hook, whereas the
7061 mode in ARG might be the result of replacing partial SVE modes with
7062 the equivalent integer mode. */
7064 static void
7065 aarch64_layout_arg (cumulative_args_t pcum_v, const function_arg_info &arg)
7067 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
7068 tree type = arg.type;
7069 machine_mode mode = arg.mode;
7070 int ncrn, nvrn, nregs;
7071 bool allocate_ncrn, allocate_nvrn;
7072 HOST_WIDE_INT size;
7073 unsigned int abi_break_gcc_9;
7074 unsigned int abi_break_gcc_13;
7075 unsigned int abi_break_gcc_14;
7077 /* We need to do this once per argument. */
7078 if (pcum->aapcs_arg_processed)
7079 return;
7081 bool warn_pcs_change
7082 = (warn_psabi
7083 && !pcum->silent_p
7084 && (currently_expanding_function_start
7085 || currently_expanding_gimple_stmt));
7087 /* HFAs and HVAs can have an alignment greater than 16 bytes. For example:
7089 typedef struct foo {
7090 __Int8x16_t foo[2] __attribute__((aligned(32)));
7091 } foo;
7093 is still an HVA despite its larger-than-normal alignment.
7094 However, such over-aligned HFAs and HVAs are guaranteed to have
7095 no padding.
7097 If we exclude HFAs and HVAs from the discussion below, then there
7098 are several things to note:
7100 - Both the C and AAPCS64 interpretations of a type's alignment should
7101 give a value that is no greater than the type's size.
7103 - Types bigger than 16 bytes are passed indirectly.
7105 - If an argument of type T is passed indirectly, TYPE and MODE describe
7106 a pointer to T rather than T itself.
7108 It follows that the AAPCS64 alignment of TYPE must be no greater
7109 than 16 bytes.
7111 Versions prior to GCC 9.1 ignored a bitfield's underlying type
7112 and so could calculate an alignment that was too small. If this
7113 happened for TYPE then ABI_BREAK_GCC_9 is this older, too-small alignment.
7115 Although GCC 9.1 fixed that bug, it introduced a different one:
7116 it would consider the alignment of a bitfield's underlying type even
7117 if the field was packed (which should have the effect of overriding
7118 the alignment of the underlying type). This was fixed in GCC 13.1.
7120 As a result of this bug, GCC 9 to GCC 12 could calculate an alignment
7121 that was too big. If this happened for TYPE, ABI_BREAK_GCC_13 is
7122 this older, too-big alignment.
7124 Also, the fact that GCC 9 to GCC 12 considered irrelevant
7125 alignments meant they could calculate type alignments that were
7126 bigger than the type's size, contrary to the assumption above.
7127 The handling of register arguments was nevertheless (and justifiably)
7128 written to follow the assumption that the alignment can never be
7129 greater than the size. The same was not true for stack arguments;
7130 their alignment was instead handled by MIN bounds in
7131 aarch64_function_arg_boundary.
7133 The net effect is that, if GCC 9 to GCC 12 incorrectly calculated
7134 an alignment of more than 16 bytes for TYPE then:
7136 - If the argument was passed in registers, these GCC versions
7137 would treat the alignment as though it was *less than* 16 bytes.
7139 - If the argument was passed on the stack, these GCC versions
7140 would treat the alignment as though it was *equal to* 16 bytes.
7142 Both behaviors were wrong, but in different cases. */
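/* As a rough illustration (the type below is purely illustrative):

     struct bf { __int128 x : 64; };

   has an AAPCS64 alignment of 16 bytes once the bit-field's underlying
   __int128 type is taken into account, whereas GCC versions before 9.1
   ignored that type and computed a smaller alignment (the
   ABI_BREAK_GCC_9 case).  Conversely, packing such a field should
   remove the underlying type's influence, but GCC 9 to GCC 12 still
   honoured it (the ABI_BREAK_GCC_13 case).  */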
7144 pcum->aapcs_arg_processed = true;
7146 pure_scalable_type_info pst_info;
7147 if (type && pst_info.analyze_registers (type))
7149 /* aarch64_function_arg_alignment has never had an effect on
7150 this case. */
7152 /* The PCS says that it is invalid to pass an SVE value to an
7153 unprototyped function. There is no ABI-defined location we
7154 can return in this case, so we have no real choice but to raise
7155 an error immediately, even though this is only a query function. */
7156 if (arg.named && pcum->pcs_variant != ARM_PCS_SVE)
7158 gcc_assert (!pcum->silent_p);
7159 error ("SVE type %qT cannot be passed to an unprototyped function",
7160 arg.type);
7161 /* Avoid repeating the message, and avoid tripping the assert
7162 below. */
7163 pcum->pcs_variant = ARM_PCS_SVE;
7166 /* We would have converted the argument into pass-by-reference
7167 form if it didn't fit in registers. */
7168 pcum->aapcs_nextnvrn = pcum->aapcs_nvrn + pst_info.num_zr ();
7169 pcum->aapcs_nextnprn = pcum->aapcs_nprn + pst_info.num_pr ();
7170 gcc_assert (arg.named
7171 && pcum->pcs_variant == ARM_PCS_SVE
7172 && pcum->aapcs_nextnvrn <= NUM_FP_ARG_REGS
7173 && pcum->aapcs_nextnprn <= NUM_PR_ARG_REGS);
7174 pcum->aapcs_reg = pst_info.get_rtx (mode, V0_REGNUM + pcum->aapcs_nvrn,
7175 P0_REGNUM + pcum->aapcs_nprn);
7176 return;
7179 /* Generic vectors that map to full SVE modes with -msve-vector-bits=N
7180 are passed by reference, not by value. */
7181 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
7182 bool sve_p = (vec_flags & VEC_ANY_SVE);
7183 if (sve_p)
7184 /* Vector types can acquire a partial SVE mode using things like
7185 __attribute__((vector_size(N))), and this is potentially useful.
7186 However, the choice of mode doesn't affect the type's ABI
7187 identity, so we should treat the types as though they had
7188 the associated integer mode, just like they did before SVE
7189 was introduced.
7191 We know that the vector must be 128 bits or smaller,
7192 otherwise we'd have passed it in memory instead. */
7193 gcc_assert (type
7194 && (aarch64_some_values_include_pst_objects_p (type)
7195 || (vec_flags & VEC_PARTIAL)));
7197 /* Size in bytes, rounded up to the nearest multiple of 8 bytes. */
7198 if (type)
7199 size = int_size_in_bytes (type);
7200 else
7201 /* No frontends can create types with variable-sized modes, so we
7202 shouldn't be asked to pass or return them. */
7203 size = GET_MODE_SIZE (mode).to_constant ();
7204 size = ROUND_UP (size, UNITS_PER_WORD);
7206 allocate_ncrn = (type) ? !(FLOAT_TYPE_P (type)) : !FLOAT_MODE_P (mode);
7207 allocate_nvrn = aarch64_vfp_is_call_candidate (pcum_v,
7208 mode,
7209 type,
7210 &nregs);
7211 gcc_assert (!sve_p || !allocate_nvrn);
7213 unsigned int alignment
7214 = aarch64_function_arg_alignment (mode, type, &abi_break_gcc_9,
7215 &abi_break_gcc_13, &abi_break_gcc_14);
7217 gcc_assert ((allocate_nvrn || alignment <= 16 * BITS_PER_UNIT)
7218 && (!alignment || abi_break_gcc_9 < alignment)
7219 && (!abi_break_gcc_13 || alignment < abi_break_gcc_13));
7221 /* _BitInt(N) was only added in GCC 14. */
7222 bool warn_pcs_change_le_gcc14
7223 = warn_pcs_change && !bitint_or_aggr_of_bitint_p (type);
7225 /* allocate_ncrn may be a false positive, but allocate_nvrn is quite reliable.
7226 The following code thus handles passing by SIMD/FP registers first. */
7228 nvrn = pcum->aapcs_nvrn;
7230 /* C1 - C5 for floating point, homogeneous floating-point aggregates (HFA)
7231 and homogeneous short-vector aggregates (HVA). */
7232 if (allocate_nvrn)
7234 /* aarch64_function_arg_alignment has never had an effect on
7235 this case. */
7236 if (!pcum->silent_p && !TARGET_FLOAT)
7237 aarch64_err_no_fpadvsimd (mode);
7239 if (nvrn + nregs <= NUM_FP_ARG_REGS)
7241 pcum->aapcs_nextnvrn = nvrn + nregs;
7242 if (!aarch64_composite_type_p (type, mode))
7244 gcc_assert (nregs == 1);
7245 pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
7247 else if (aarch64_advsimd_full_struct_mode_p (mode)
7248 && known_eq (GET_MODE_SIZE (pcum->aapcs_vfp_rmode), 16))
7249 pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
7250 else if (aarch64_advsimd_partial_struct_mode_p (mode)
7251 && known_eq (GET_MODE_SIZE (pcum->aapcs_vfp_rmode), 8))
7252 pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
7253 else
7255 rtx par;
7256 int i;
7257 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
7258 for (i = 0; i < nregs; i++)
7260 rtx tmp = gen_rtx_REG (pcum->aapcs_vfp_rmode,
7261 V0_REGNUM + nvrn + i);
7262 rtx offset = gen_int_mode
7263 (i * GET_MODE_SIZE (pcum->aapcs_vfp_rmode), Pmode);
7264 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
7265 XVECEXP (par, 0, i) = tmp;
7267 pcum->aapcs_reg = par;
7269 return;
7271 else
7273 /* C.3 NSRN is set to 8. */
7274 pcum->aapcs_nextnvrn = NUM_FP_ARG_REGS;
7275 goto on_stack;
7279 ncrn = pcum->aapcs_ncrn;
7280 nregs = size / UNITS_PER_WORD;
7282 /* C6 - C9, though the sign and zero extension semantics are
7283 handled elsewhere. This is the case where the argument fits
7284 entirely in general registers. */
7285 if (allocate_ncrn && (ncrn + nregs <= NUM_ARG_REGS))
7287 gcc_assert (nregs == 0 || nregs == 1 || nregs == 2);
7289 /* C.8 if the argument has an alignment of 16 then the NGRN is
7290 rounded up to the next even number. */
7291 if (nregs == 2
7292 && ncrn % 2)
7294 /* Emit a warning if the alignment changed when taking the
7295 'packed' attribute into account. */
7296 if (warn_pcs_change_le_gcc14
7297 && abi_break_gcc_13
7298 && ((abi_break_gcc_13 == 16 * BITS_PER_UNIT)
7299 != (alignment == 16 * BITS_PER_UNIT)))
7300 inform (input_location, "parameter passing for argument of type "
7301 "%qT changed in GCC 13.1", type);
7303 if (warn_pcs_change_le_gcc14
7304 && abi_break_gcc_14
7305 && ((abi_break_gcc_14 == 16 * BITS_PER_UNIT)
7306 != (alignment == 16 * BITS_PER_UNIT)))
7307 inform (input_location, "parameter passing for argument of type "
7308 "%qT changed in GCC 14.1", type);
7310 /* The == 16 * BITS_PER_UNIT instead of >= 16 * BITS_PER_UNIT
7311 comparison is there because for > 16 * BITS_PER_UNIT
7312 alignment nregs should be > 2 and therefore it should be
7313 passed by reference rather than value. */
7314 if (alignment == 16 * BITS_PER_UNIT)
7316 if (warn_pcs_change_le_gcc14
7317 && abi_break_gcc_9)
7318 inform (input_location, "parameter passing for argument of type "
7319 "%qT changed in GCC 9.1", type);
7320 ++ncrn;
7321 gcc_assert (ncrn + nregs <= NUM_ARG_REGS);
7325 /* If an argument with an SVE mode needs to be shifted up to the
7326 high part of the register, treat it as though it had an integer mode.
7327 Using the normal (parallel [...]) would suppress the shifting. */
7328 if (sve_p
7329 && BYTES_BIG_ENDIAN
7330 && maybe_ne (GET_MODE_SIZE (mode), nregs * UNITS_PER_WORD)
7331 && aarch64_pad_reg_upward (mode, type, false))
7333 mode = int_mode_for_mode (mode).require ();
7334 sve_p = false;
7337 /* NREGS can be 0 when e.g. an empty structure is to be passed.
7338 A reg is still generated for it, but the caller should be smart
7339 enough not to use it. */
7340 if (nregs == 0
7341 || (nregs == 1 && !sve_p)
7342 || GET_MODE_CLASS (mode) == MODE_INT)
7343 pcum->aapcs_reg = gen_rtx_REG (mode, R0_REGNUM + ncrn);
7344 else
7346 rtx par;
7347 int i;
7349 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
7350 for (i = 0; i < nregs; i++)
7352 scalar_int_mode reg_mode = word_mode;
7353 if (nregs == 1)
7354 reg_mode = int_mode_for_mode (mode).require ();
7355 rtx tmp = gen_rtx_REG (reg_mode, R0_REGNUM + ncrn + i);
7356 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
7357 GEN_INT (i * UNITS_PER_WORD));
7358 XVECEXP (par, 0, i) = tmp;
7360 pcum->aapcs_reg = par;
7363 pcum->aapcs_nextncrn = ncrn + nregs;
7364 return;
7367 /* C.11 */
7368 pcum->aapcs_nextncrn = NUM_ARG_REGS;
7370 /* The argument is passed on the stack; record the needed number of words
7371 for this argument and align the total size if necessary. */
7372 on_stack:
7373 pcum->aapcs_stack_words = size / UNITS_PER_WORD;
7375 if (warn_pcs_change_le_gcc14
7376 && abi_break_gcc_13
7377 && ((abi_break_gcc_13 >= 16 * BITS_PER_UNIT)
7378 != (alignment >= 16 * BITS_PER_UNIT)))
7379 inform (input_location, "parameter passing for argument of type "
7380 "%qT changed in GCC 13.1", type);
7382 if (warn_pcs_change_le_gcc14
7383 && abi_break_gcc_14
7384 && ((abi_break_gcc_14 >= 16 * BITS_PER_UNIT)
7385 != (alignment >= 16 * BITS_PER_UNIT)))
7386 inform (input_location, "parameter passing for argument of type "
7387 "%qT changed in GCC 14.1", type);
7389 if (alignment == 16 * BITS_PER_UNIT)
7391 int new_size = ROUND_UP (pcum->aapcs_stack_size, 16 / UNITS_PER_WORD);
7392 if (pcum->aapcs_stack_size != new_size)
7394 if (warn_pcs_change_le_gcc14
7395 && abi_break_gcc_9)
7396 inform (input_location, "parameter passing for argument of type "
7397 "%qT changed in GCC 9.1", type);
7398 pcum->aapcs_stack_size = new_size;
7401 return;
7404 /* Add the current argument register to the set of those that need
7405 to be saved and restored around a change to PSTATE.SM. */
7407 static void
7408 aarch64_record_sme_mode_switch_args (CUMULATIVE_ARGS *pcum)
7410 subrtx_var_iterator::array_type array;
7411 FOR_EACH_SUBRTX_VAR (iter, array, pcum->aapcs_reg, NONCONST)
7413 rtx x = *iter;
7414 if (REG_P (x) && (FP_REGNUM_P (REGNO (x)) || PR_REGNUM_P (REGNO (x))))
7416 unsigned int i = pcum->num_sme_mode_switch_args++;
7417 gcc_assert (i < ARRAY_SIZE (pcum->sme_mode_switch_args));
7418 pcum->sme_mode_switch_args[i] = x;
7423 /* Return a parallel that contains all the registers that need to be
7424 saved around a change to PSTATE.SM. Return const0_rtx if there is
7425 no such mode switch, or if no registers need to be saved. */
7427 static rtx
7428 aarch64_finish_sme_mode_switch_args (CUMULATIVE_ARGS *pcum)
7430 if (!pcum->num_sme_mode_switch_args)
7431 return const0_rtx;
7433 auto argvec = gen_rtvec_v (pcum->num_sme_mode_switch_args,
7434 pcum->sme_mode_switch_args);
7435 return gen_rtx_PARALLEL (VOIDmode, argvec);
7438 /* Implement TARGET_FUNCTION_ARG. */
7440 static rtx
7441 aarch64_function_arg (cumulative_args_t pcum_v, const function_arg_info &arg)
7443 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
7444 gcc_assert (pcum->pcs_variant == ARM_PCS_AAPCS64
7445 || pcum->pcs_variant == ARM_PCS_SIMD
7446 || pcum->pcs_variant == ARM_PCS_SVE);
7448 if (arg.end_marker_p ())
7450 rtx abi_cookie = aarch64_gen_callee_cookie (pcum->isa_mode,
7451 pcum->pcs_variant,
7452 pcum->indirect_return);
7453 rtx sme_mode_switch_args = aarch64_finish_sme_mode_switch_args (pcum);
7454 rtx shared_za_flags = gen_int_mode (pcum->shared_za_flags, SImode);
7455 rtx shared_zt0_flags = gen_int_mode (pcum->shared_zt0_flags, SImode);
7456 return gen_rtx_PARALLEL (VOIDmode, gen_rtvec (4, abi_cookie,
7457 sme_mode_switch_args,
7458 shared_za_flags,
7459 shared_zt0_flags));
7462 aarch64_layout_arg (pcum_v, arg);
7463 return pcum->aapcs_reg;
7466 void
7467 aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum,
7468 const_tree fntype,
7469 rtx libname ATTRIBUTE_UNUSED,
7470 const_tree fndecl,
7471 unsigned n_named ATTRIBUTE_UNUSED,
7472 bool silent_p)
7474 pcum->aapcs_ncrn = 0;
7475 pcum->aapcs_nvrn = 0;
7476 pcum->aapcs_nprn = 0;
7477 pcum->aapcs_nextncrn = 0;
7478 pcum->aapcs_nextnvrn = 0;
7479 pcum->aapcs_nextnprn = 0;
7480 if (fntype)
7482 pcum->pcs_variant = (arm_pcs) fntype_abi (fntype).id ();
7483 pcum->isa_mode = aarch64_fntype_isa_mode (fntype);
7484 pcum->indirect_return = lookup_attribute ("indirect_return",
7485 TYPE_ATTRIBUTES (fntype));
7487 else
7489 pcum->pcs_variant = ARM_PCS_AAPCS64;
7490 pcum->isa_mode = AARCH64_DEFAULT_ISA_MODE;
7491 pcum->indirect_return = false;
7493 pcum->aapcs_reg = NULL_RTX;
7494 pcum->aapcs_arg_processed = false;
7495 pcum->aapcs_stack_words = 0;
7496 pcum->aapcs_stack_size = 0;
7497 pcum->silent_p = silent_p;
7498 pcum->shared_za_flags
7499 = (fntype ? aarch64_fntype_shared_flags (fntype, "za") : 0U);
7500 pcum->shared_zt0_flags
7501 = (fntype ? aarch64_fntype_shared_flags (fntype, "zt0") : 0U);
7502 pcum->num_sme_mode_switch_args = 0;
7504 if (!silent_p
7505 && !TARGET_FLOAT
7506 && fntype && fntype != error_mark_node)
7508 const_tree type = TREE_TYPE (fntype);
7509 machine_mode mode ATTRIBUTE_UNUSED; /* To pass pointer as argument. */
7510 int nregs ATTRIBUTE_UNUSED; /* Likewise. */
7511 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type,
7512 &mode, &nregs, NULL, false))
7513 aarch64_err_no_fpadvsimd (TYPE_MODE (type));
7516 if (!silent_p
7517 && !TARGET_SVE
7518 && pcum->pcs_variant == ARM_PCS_SVE)
7520 /* We can't gracefully recover at this point, so make this a
7521 fatal error. */
7522 if (fndecl)
7523 fatal_error (input_location, "%qE requires the SVE ISA extension",
7524 fndecl);
7525 else
7526 fatal_error (input_location, "calls to functions of type %qT require"
7527 " the SVE ISA extension", fntype);
7531 static void
7532 aarch64_function_arg_advance (cumulative_args_t pcum_v,
7533 const function_arg_info &arg)
7535 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
7536 if (pcum->pcs_variant == ARM_PCS_AAPCS64
7537 || pcum->pcs_variant == ARM_PCS_SIMD
7538 || pcum->pcs_variant == ARM_PCS_SVE)
7540 aarch64_layout_arg (pcum_v, arg);
7541 gcc_assert ((pcum->aapcs_reg != NULL_RTX)
7542 != (pcum->aapcs_stack_words != 0));
7543 if (pcum->aapcs_reg
7544 && aarch64_call_switches_pstate_sm (pcum->isa_mode))
7545 aarch64_record_sme_mode_switch_args (pcum);
7547 pcum->aapcs_arg_processed = false;
7548 pcum->aapcs_ncrn = pcum->aapcs_nextncrn;
7549 pcum->aapcs_nvrn = pcum->aapcs_nextnvrn;
7550 pcum->aapcs_nprn = pcum->aapcs_nextnprn;
7551 pcum->aapcs_stack_size += pcum->aapcs_stack_words;
7552 pcum->aapcs_stack_words = 0;
7553 pcum->aapcs_reg = NULL_RTX;
7557 bool
7558 aarch64_function_arg_regno_p (unsigned regno)
7560 return ((GP_REGNUM_P (regno) && regno < R0_REGNUM + NUM_ARG_REGS)
7561 || (FP_REGNUM_P (regno) && regno < V0_REGNUM + NUM_FP_ARG_REGS)
7562 || (PR_REGNUM_P (regno) && regno < P0_REGNUM + NUM_PR_ARG_REGS));
7565 /* Implement FUNCTION_ARG_BOUNDARY. Every parameter gets at least
7566 PARM_BOUNDARY bits of alignment, but will be given anything up
7567 to STACK_BOUNDARY bits if the type requires it. This makes sure
7568 that both before and after the layout of each argument, the Next
7569 Stacked Argument Address (NSAA) will have a minimum alignment of
7570 8 bytes. */
7572 static unsigned int
7573 aarch64_function_arg_boundary (machine_mode mode, const_tree type)
7575 unsigned int abi_break_gcc_9;
7576 unsigned int abi_break_gcc_13;
7577 unsigned int abi_break_gcc_14;
7578 unsigned int alignment = aarch64_function_arg_alignment (mode, type,
7579 &abi_break_gcc_9,
7580 &abi_break_gcc_13,
7581 &abi_break_gcc_14);
7582 /* We rely on aarch64_layout_arg and aarch64_gimplify_va_arg_expr
7583 to emit warnings about ABI incompatibility. */
7584 alignment = MIN (MAX (alignment, PARM_BOUNDARY), STACK_BOUNDARY);
7585 return alignment;
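/* For example, a plain "char" argument still gets a full 64-bit
   (PARM_BOUNDARY) stack slot, while even a heavily over-aligned type is
   never given more than 128-bit (STACK_BOUNDARY) alignment here.  */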
7588 /* Implement TARGET_GET_RAW_RESULT_MODE and TARGET_GET_RAW_ARG_MODE. */
7590 static fixed_size_mode
7591 aarch64_get_reg_raw_mode (int regno)
7593 /* Don't use any non-GP registers for __builtin_apply and
7594 __builtin_return if general registers only mode is requested. */
7595 if (TARGET_GENERAL_REGS_ONLY && !GP_REGNUM_P (regno))
7596 return as_a <fixed_size_mode> (VOIDmode);
7597 if (TARGET_SVE && FP_REGNUM_P (regno))
7598 /* Don't use the SVE part of the register for __builtin_apply and
7599 __builtin_return. The SVE registers aren't used by the normal PCS,
7600 so using them there would be a waste of time. The PCS extensions
7601 for SVE types are fundamentally incompatible with the
7602 __builtin_return/__builtin_apply interface. */
7603 return as_a <fixed_size_mode> (V16QImode);
7604 if (PR_REGNUM_P (regno))
7605 /* For SVE PR regs, indicate that they should be ignored for
7606 __builtin_apply/__builtin_return. */
7607 return as_a <fixed_size_mode> (VOIDmode);
7608 return default_get_reg_raw_mode (regno);
7611 /* Implement TARGET_FUNCTION_ARG_PADDING.
7613 Small aggregate types are placed at the lowest memory address.
7615 The related parameter passing rules are B.4, C.3, C.5 and C.14. */
7617 static pad_direction
7618 aarch64_function_arg_padding (machine_mode mode, const_tree type)
7620 /* On little-endian targets, the least significant byte of every stack
7621 argument is passed at the lowest byte address of the stack slot. */
7622 if (!BYTES_BIG_ENDIAN)
7623 return PAD_UPWARD;
7625 /* Otherwise, integral, floating-point and pointer types are padded downward:
7626 the least significant byte of a stack argument is passed at the highest
7627 byte address of the stack slot. */
7628 if (type
7629 ? (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type)
7630 || POINTER_TYPE_P (type))
7631 : (SCALAR_INT_MODE_P (mode) || SCALAR_FLOAT_MODE_P (mode)))
7632 return PAD_DOWNWARD;
7634 /* Everything else is padded upward, i.e. data goes in the first byte of the stack slot. */
7635 return PAD_UPWARD;
7638 /* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
7640 It specifies padding for the last (possibly the only)
7641 element of a block move between registers and memory. Assuming
7642 the block is in memory, padding upward means that the last
7643 element is padded after its most significant byte, while with
7644 downward padding the last element is padded on its least
7645 significant byte side.
7647 Small aggregates and small complex types are always padded
7648 upwards.
7650 We don't need to worry about homogeneous floating-point or
7651 short-vector aggregates; their move is not affected by the
7652 padding direction determined here. Regardless of endianness,
7653 each element of such an aggregate is put in the least
7654 significant bits of a fp/simd register.
7656 Return !BYTES_BIG_ENDIAN if the least significant byte of the
7657 register has useful data, and return the opposite if the most
7658 significant byte does. */
7660 bool
7661 aarch64_pad_reg_upward (machine_mode mode, const_tree type,
7662 bool first ATTRIBUTE_UNUSED)
7665 /* Aside from pure scalable types, small composite types are always
7666 padded upward. */
7667 if (BYTES_BIG_ENDIAN && aarch64_composite_type_p (type, mode))
7669 HOST_WIDE_INT size;
7670 if (type)
7671 size = int_size_in_bytes (type);
7672 else
7673 /* No frontends can create types with variable-sized modes, so we
7674 shouldn't be asked to pass or return them. */
7675 size = GET_MODE_SIZE (mode).to_constant ();
7676 if (size < 2 * UNITS_PER_WORD)
7678 pure_scalable_type_info pst_info;
7679 if (pst_info.analyze_registers (type))
7680 return false;
7681 return true;
7685 /* Otherwise, use the default padding. */
7686 return !BYTES_BIG_ENDIAN;
7689 static scalar_int_mode
7690 aarch64_libgcc_cmp_return_mode (void)
7692 return SImode;
7695 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
7697 /* We use the 12-bit shifted immediate arithmetic instructions so values
7698 must be a multiple of (1 << 12), i.e. 4096. */
7699 #define ARITH_FACTOR 4096
7701 #if (PROBE_INTERVAL % ARITH_FACTOR) != 0
7702 #error Cannot use simple address calculation for stack probing
7703 #endif
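/* For instance, a step of 4096 bytes can be applied with a single
   "sub x9, x9, #4096" (the immediate encodes as 1, LSL #12; the
   register number is illustrative), whereas a step that is not a
   multiple of 4096 would need additional instructions.  */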
7705 /* Emit code to probe a range of stack addresses from FIRST to FIRST+POLY_SIZE,
7706 inclusive. These are offsets from the current stack pointer. */
7708 static void
7709 aarch64_emit_probe_stack_range (HOST_WIDE_INT first, poly_int64 poly_size)
7711 HOST_WIDE_INT size;
7712 if (!poly_size.is_constant (&size))
7714 sorry ("stack probes for SVE frames");
7715 return;
7718 rtx reg1 = gen_rtx_REG (Pmode, PROBE_STACK_FIRST_REGNUM);
7720 /* See the same assertion on PROBE_INTERVAL above. */
7721 gcc_assert ((first % ARITH_FACTOR) == 0);
7723 /* See if we have a constant small number of probes to generate. If so,
7724 that's the easy case. */
7725 if (size <= PROBE_INTERVAL)
7727 const HOST_WIDE_INT base = ROUND_UP (size, ARITH_FACTOR);
7729 emit_set_insn (reg1,
7730 plus_constant (Pmode,
7731 stack_pointer_rtx, -(first + base)));
7732 emit_stack_probe (plus_constant (Pmode, reg1, base - size));
7735 /* The run-time loop is made up of 8 insns in the generic case while the
7736 compile-time loop is made up of 4+2*(n-2) insns for n intervals. */
7737 else if (size <= 4 * PROBE_INTERVAL)
7739 HOST_WIDE_INT i, rem;
7741 emit_set_insn (reg1,
7742 plus_constant (Pmode,
7743 stack_pointer_rtx,
7744 -(first + PROBE_INTERVAL)));
7745 emit_stack_probe (reg1);
7747 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 2 until
7748 it exceeds SIZE. If only two probes are needed, this will not
7749 generate any code. Then probe at FIRST + SIZE. */
7750 for (i = 2 * PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
7752 emit_set_insn (reg1,
7753 plus_constant (Pmode, reg1, -PROBE_INTERVAL));
7754 emit_stack_probe (reg1);
7757 rem = size - (i - PROBE_INTERVAL);
7758 if (rem > 256)
7760 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
7762 emit_set_insn (reg1, plus_constant (Pmode, reg1, -base));
7763 emit_stack_probe (plus_constant (Pmode, reg1, base - rem));
7765 else
7766 emit_stack_probe (plus_constant (Pmode, reg1, -rem));
7769 /* Otherwise, do the same as above, but in a loop. Note that we must be
7770 extra careful with variables wrapping around because we might be at
7771 the very top (or the very bottom) of the address space and we have
7772 to be able to handle this case properly; in particular, we use an
7773 equality test for the loop condition. */
7774 else
7776 rtx reg2 = gen_rtx_REG (Pmode, PROBE_STACK_SECOND_REGNUM);
7778 /* Step 1: round SIZE to the previous multiple of the interval. */
7780 HOST_WIDE_INT rounded_size = size & -PROBE_INTERVAL;
7783 /* Step 2: compute initial and final value of the loop counter. */
7785 /* TEST_ADDR = SP + FIRST. */
7786 emit_set_insn (reg1,
7787 plus_constant (Pmode, stack_pointer_rtx, -first));
7789 /* LAST_ADDR = SP + FIRST + ROUNDED_SIZE. */
7790 HOST_WIDE_INT adjustment = - (first + rounded_size);
7791 if (! aarch64_uimm12_shift (adjustment))
7793 aarch64_internal_mov_immediate (reg2, GEN_INT (adjustment),
7794 true, Pmode);
7795 emit_set_insn (reg2, gen_rtx_PLUS (Pmode, stack_pointer_rtx, reg2));
7797 else
7798 emit_set_insn (reg2,
7799 plus_constant (Pmode, stack_pointer_rtx, adjustment));
7801 /* Step 3: the loop
7805 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
7806 probe at TEST_ADDR
7808 while (TEST_ADDR != LAST_ADDR)
7810 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
7811 until it is equal to ROUNDED_SIZE. */
7813 emit_insn (gen_probe_stack_range (reg1, reg1, reg2));
7816 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
7817 that SIZE is equal to ROUNDED_SIZE. */
7819 if (size != rounded_size)
7821 HOST_WIDE_INT rem = size - rounded_size;
7823 if (rem > 256)
7825 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
7827 emit_set_insn (reg2, plus_constant (Pmode, reg2, -base));
7828 emit_stack_probe (plus_constant (Pmode, reg2, base - rem));
7830 else
7831 emit_stack_probe (plus_constant (Pmode, reg2, -rem));
7835 /* Make sure nothing is scheduled before we are done. */
7836 emit_insn (gen_blockage ());
7839 /* Probe a range of stack addresses from REG1 to REG2 inclusive. These are
7840 absolute addresses. */
7842 const char *
7843 aarch64_output_probe_stack_range (rtx reg1, rtx reg2)
7845 static int labelno = 0;
7846 char loop_lab[32];
7847 rtx xops[2];
7849 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
7851 /* Loop. */
7852 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
7854 HOST_WIDE_INT stack_clash_probe_interval
7855 = 1 << param_stack_clash_protection_guard_size;
7857 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
7858 xops[0] = reg1;
7859 HOST_WIDE_INT interval;
7860 if (flag_stack_clash_protection)
7861 interval = stack_clash_probe_interval;
7862 else
7863 interval = PROBE_INTERVAL;
7865 gcc_assert (aarch64_uimm12_shift (interval));
7866 xops[1] = GEN_INT (interval);
7868 output_asm_insn ("sub\t%0, %0, %1", xops);
7870 /* If doing stack clash protection then we probe up by the ABI-specified
7871 amount. We do this because we're dropping full pages at a time in the
7872 loop. But if we're doing non-stack clash probing, probe at offset 0 from SP. */
7873 if (flag_stack_clash_protection)
7874 xops[1] = GEN_INT (STACK_CLASH_CALLER_GUARD);
7875 else
7876 xops[1] = CONST0_RTX (GET_MODE (xops[1]));
7878 /* Probe at TEST_ADDR. If we're inside the loop it is always safe to probe
7879 by this amount for each iteration. */
7880 output_asm_insn ("str\txzr, [%0, %1]", xops);
7882 /* Test if TEST_ADDR == LAST_ADDR. */
7883 xops[1] = reg2;
7884 output_asm_insn ("cmp\t%0, %1", xops);
7886 /* Branch. */
7887 fputs ("\tb.ne\t", asm_out_file);
7888 assemble_name_raw (asm_out_file, loop_lab);
7889 fputc ('\n', asm_out_file);
7891 return "";
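/* The sequence printed above therefore looks roughly like this
   (register numbers and the 4096-byte interval are illustrative):

	.LPSRL0:
	sub	x9, x9, 4096
	str	xzr, [x9, 0]
	cmp	x9, x10
	b.ne	.LPSRL0

   with the str offset being STACK_CLASH_CALLER_GUARD rather than 0
   when stack clash protection is enabled.  */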
7894 /* Emit the probe loop for doing stack clash probes and stack adjustments for
7895 SVE. This emits probes from BASE to BASE - ADJUSTMENT based on a guard size
7896 of GUARD_SIZE. When a probe is emitted it is done at most
7897 MIN_PROBE_THRESHOLD bytes from the current BASE at an interval of
7898 at most MIN_PROBE_THRESHOLD. By the end of this function
7899 BASE = BASE - ADJUSTMENT. */
7901 const char *
7902 aarch64_output_probe_sve_stack_clash (rtx base, rtx adjustment,
7903 rtx min_probe_threshold, rtx guard_size)
7905 /* This function is not allowed to use any instruction generation function
7906 like gen_ and friends. If you do, you'll likely ICE during CFG validation,
7907 so instead emit the code you want using output_asm_insn. */
7908 gcc_assert (flag_stack_clash_protection);
7909 gcc_assert (CONST_INT_P (min_probe_threshold) && CONST_INT_P (guard_size));
7910 gcc_assert (INTVAL (guard_size) > INTVAL (min_probe_threshold));
7912 /* The minimum required allocation before the residual requires probing. */
7913 HOST_WIDE_INT residual_probe_guard = INTVAL (min_probe_threshold);
7915 /* Clamp the value down to the nearest value that can be used with a cmp. */
7916 residual_probe_guard = aarch64_clamp_to_uimm12_shift (residual_probe_guard);
7917 rtx probe_offset_value_rtx = gen_int_mode (residual_probe_guard, Pmode);
7919 gcc_assert (INTVAL (min_probe_threshold) >= residual_probe_guard);
7920 gcc_assert (aarch64_uimm12_shift (residual_probe_guard));
7922 static int labelno = 0;
7923 char loop_start_lab[32];
7924 char loop_end_lab[32];
7925 rtx xops[2];
7927 ASM_GENERATE_INTERNAL_LABEL (loop_start_lab, "SVLPSPL", labelno);
7928 ASM_GENERATE_INTERNAL_LABEL (loop_end_lab, "SVLPEND", labelno++);
7930 /* Emit loop start label. */
7931 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_start_lab);
7933 /* ADJUSTMENT < RESIDUAL_PROBE_GUARD. */
7934 xops[0] = adjustment;
7935 xops[1] = probe_offset_value_rtx;
7936 output_asm_insn ("cmp\t%0, %1", xops);
7938 /* Branch to end if not enough adjustment to probe. */
7939 fputs ("\tb.lt\t", asm_out_file);
7940 assemble_name_raw (asm_out_file, loop_end_lab);
7941 fputc ('\n', asm_out_file);
7943 /* BASE = BASE - RESIDUAL_PROBE_GUARD. */
7944 xops[0] = base;
7945 xops[1] = probe_offset_value_rtx;
7946 output_asm_insn ("sub\t%0, %0, %1", xops);
7948 /* Probe at BASE. */
7949 xops[1] = const0_rtx;
7950 output_asm_insn ("str\txzr, [%0, %1]", xops);
7952 /* ADJUSTMENT = ADJUSTMENT - RESIDUAL_PROBE_GUARD. */
7953 xops[0] = adjustment;
7954 xops[1] = probe_offset_value_rtx;
7955 output_asm_insn ("sub\t%0, %0, %1", xops);
7957 /* Branch to start if still more bytes to allocate. */
7958 fputs ("\tb\t", asm_out_file);
7959 assemble_name_raw (asm_out_file, loop_start_lab);
7960 fputc ('\n', asm_out_file);
7962 /* Not enough left to need a probe; leave the loop. */
7963 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_end_lab);
7965 /* BASE = BASE - ADJUSTMENT. */
7966 xops[0] = base;
7967 xops[1] = adjustment;
7968 output_asm_insn ("sub\t%0, %0, %1", xops);
7969 return "";
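/* The sequence printed above is therefore roughly (register numbers
   and the guard value are illustrative):

	.SVLPSPL0:
	cmp	x11, 4096	// enough left to need a probe?
	b.lt	.SVLPEND0
	sub	x10, x10, 4096	// BASE -= RESIDUAL_PROBE_GUARD
	str	xzr, [x10, 0]	// probe at BASE
	sub	x11, x11, 4096	// ADJUSTMENT -= RESIDUAL_PROBE_GUARD
	b	.SVLPSPL0
	.SVLPEND0:
	sub	x10, x10, x11	// BASE -= remaining ADJUSTMENT  */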
7972 /* Determine whether a frame chain needs to be generated. */
7973 static bool
7974 aarch64_needs_frame_chain (void)
7976 if (frame_pointer_needed)
7977 return true;
7979 /* A leaf function cannot have calls or write LR. */
7980 bool is_leaf = crtl->is_leaf && !df_regs_ever_live_p (LR_REGNUM);
7982 /* Don't use a frame chain in leaf functions if leaf frame pointers
7983 are disabled. */
7984 if (flag_omit_leaf_frame_pointer && is_leaf)
7985 return false;
7987 return aarch64_use_frame_pointer;
7990 /* Return true if the current function should save registers above
7991 the locals area, rather than below it. */
7993 static bool
7994 aarch64_save_regs_above_locals_p ()
7996 /* When using stack smash protection, make sure that the canary slot
7997 comes between the locals and the saved registers. Otherwise,
7998 it would be possible for a carefully sized smash attack to change
7999 the saved registers (particularly LR and FP) without reaching the
8000 canary. */
8001 return crtl->stack_protect_guard;
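/* The intended layout with the guard active is therefore roughly:

	| callee-saved registers (LR, FP, ...) |  higher addresses
	| stack-protector canary               |
	| locals                               |
	| outgoing arguments                   |  lower addresses  */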
8004 /* Return true if the current function needs to record the incoming
8005 value of PSTATE.SM. */
8006 static bool
8007 aarch64_need_old_pstate_sm ()
8009 /* Exit early if the incoming value of PSTATE.SM is known at
8010 compile time. */
8011 if (aarch64_cfun_incoming_pstate_sm () != 0)
8012 return false;
8014 if (aarch64_cfun_enables_pstate_sm ())
8015 return true;
8017 /* Non-local goto receivers are entered with PSTATE.SM equal to 0,
8018 but the function needs to return with PSTATE.SM unchanged. */
8019 if (nonlocal_goto_handler_labels)
8020 return true;
8022 /* Likewise for exception handlers. */
8023 eh_landing_pad lp;
8024 for (unsigned int i = 1; vec_safe_iterate (cfun->eh->lp_array, i, &lp); ++i)
8025 if (lp && lp->post_landing_pad)
8026 return true;
8028 /* Non-local gotos need to set PSTATE.SM to zero. It's possible to call
8029 streaming-compatible functions without SME being available, so PSTATE.SM
8030 should only be changed if it is currently set to one. */
8031 if (crtl->has_nonlocal_goto)
8032 return true;
8034 if (cfun->machine->call_switches_pstate_sm)
8035 for (auto insn = get_insns (); insn; insn = NEXT_INSN (insn))
8036 if (auto *call = dyn_cast<rtx_call_insn *> (insn))
8037 if (!SIBLING_CALL_P (call))
8039 /* Return true if there is a call to a non-streaming-compatible
8040 function. */
8041 auto callee_isa_mode = aarch64_insn_callee_isa_mode (call);
8042 if (aarch64_call_switches_pstate_sm (callee_isa_mode))
8043 return true;
8045 return false;
8048 /* Mark the registers that need to be saved by the callee and calculate
8049 the size of the callee-saved registers area and frame record (both FP
8050 and LR may be omitted). */
8051 static void
8052 aarch64_layout_frame (void)
8054 unsigned regno, last_fp_reg = INVALID_REGNUM;
8055 machine_mode vector_save_mode = aarch64_reg_save_mode (V8_REGNUM);
8056 poly_int64 vector_save_size = GET_MODE_SIZE (vector_save_mode);
8057 bool frame_related_fp_reg_p = false;
8058 aarch64_frame &frame = cfun->machine->frame;
8059 poly_int64 top_of_locals = -1;
8060 bool enables_pstate_sm = aarch64_cfun_enables_pstate_sm ();
8062 vec_safe_truncate (frame.saved_gprs, 0);
8063 vec_safe_truncate (frame.saved_fprs, 0);
8064 vec_safe_truncate (frame.saved_prs, 0);
8066 frame.emit_frame_chain = aarch64_needs_frame_chain ();
8068 /* Adjust the outgoing arguments size if required. Keep it in sync with what
8069 the mid-end is doing. */
8070 crtl->outgoing_args_size = STACK_DYNAMIC_OFFSET (cfun);
8072 #define SLOT_NOT_REQUIRED (-2)
8073 #define SLOT_REQUIRED (-1)
8075 frame.wb_push_candidate1 = INVALID_REGNUM;
8076 frame.wb_push_candidate2 = INVALID_REGNUM;
8077 frame.spare_pred_reg = INVALID_REGNUM;
8079 /* First mark all the registers that really need to be saved... */
8080 for (regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
8081 frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
8082 frame.old_svcr_offset = SLOT_NOT_REQUIRED;
8084 /* ... that includes the eh data registers (if needed)... */
8085 if (crtl->calls_eh_return)
8086 for (regno = 0; EH_RETURN_DATA_REGNO (regno) != INVALID_REGNUM; regno++)
8087 frame.reg_offset[EH_RETURN_DATA_REGNO (regno)] = SLOT_REQUIRED;
8089 /* ... and any callee saved register that dataflow says is live. */
8090 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
8091 if (df_regs_ever_live_p (regno)
8092 && !fixed_regs[regno]
8093 && (regno == R30_REGNUM
8094 || !crtl->abi->clobbers_full_reg_p (regno)))
8095 frame.reg_offset[regno] = SLOT_REQUIRED;
8097 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
8098 if ((enables_pstate_sm || df_regs_ever_live_p (regno))
8099 && !fixed_regs[regno]
8100 && !crtl->abi->clobbers_full_reg_p (regno))
8102 frame.reg_offset[regno] = SLOT_REQUIRED;
8103 last_fp_reg = regno;
8104 if (aarch64_emit_cfi_for_reg_p (regno))
8105 frame_related_fp_reg_p = true;
8108 /* Big-endian SVE frames need a spare predicate register in order
8109 to save Z8-Z15. Decide which register they should use. Prefer
8110 an unused argument register if possible, so that we don't force P4
8111 to be saved unnecessarily. */
8112 if (frame_related_fp_reg_p
8113 && crtl->abi->id () == ARM_PCS_SVE
8114 && BYTES_BIG_ENDIAN)
8116 bitmap live1 = df_get_live_out (ENTRY_BLOCK_PTR_FOR_FN (cfun));
8117 bitmap live2 = df_get_live_in (EXIT_BLOCK_PTR_FOR_FN (cfun));
8118 for (regno = P0_REGNUM; regno <= P7_REGNUM; regno++)
8119 if (!bitmap_bit_p (live1, regno) && !bitmap_bit_p (live2, regno))
8120 break;
8121 gcc_assert (regno <= P7_REGNUM);
8122 frame.spare_pred_reg = regno;
8123 df_set_regs_ever_live (regno, true);
8126 for (regno = P0_REGNUM; regno <= P15_REGNUM; regno++)
8127 if ((enables_pstate_sm || df_regs_ever_live_p (regno))
8128 && !fixed_regs[regno]
8129 && !crtl->abi->clobbers_full_reg_p (regno))
8130 frame.reg_offset[regno] = SLOT_REQUIRED;
8132 bool regs_at_top_p = aarch64_save_regs_above_locals_p ();
8134 poly_int64 offset = crtl->outgoing_args_size;
8135 gcc_assert (multiple_p (offset, STACK_BOUNDARY / BITS_PER_UNIT));
8136 if (regs_at_top_p)
8138 offset += get_frame_size ();
8139 offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT);
8140 top_of_locals = offset;
8142 frame.bytes_below_saved_regs = offset;
8143 frame.sve_save_and_probe = INVALID_REGNUM;
8145 /* Now assign stack slots for the registers. Start with the predicate
8146 registers, since predicate LDR and STR have a relatively small
8147 offset range. These saves happen below the hard frame pointer. */
8148 for (regno = P0_REGNUM; regno <= P15_REGNUM; regno++)
8149 if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED))
8151 vec_safe_push (frame.saved_prs, regno);
8152 if (frame.sve_save_and_probe == INVALID_REGNUM)
8153 frame.sve_save_and_probe = regno;
8154 frame.reg_offset[regno] = offset;
8155 offset += BYTES_PER_SVE_PRED;
8158 poly_int64 saved_prs_size = offset - frame.bytes_below_saved_regs;
8159 if (maybe_ne (saved_prs_size, 0))
8161 /* If we have any vector registers to save above the predicate registers,
8162 the offset of the vector register save slots needs to be a multiple
8163 of the vector size. This lets us use the immediate forms of LDR/STR
8164 (or LD1/ST1 for big-endian).
8166 A vector register is 8 times the size of a predicate register,
8167 and we need to save a maximum of 12 predicate registers, so the
8168 first vector register will be at either #1, MUL VL or #2, MUL VL.
8170 If we don't have any vector registers to save, and we know how
8171 big the predicate save area is, we can just round it up to the
8172 next 16-byte boundary. */
8173 if (last_fp_reg == INVALID_REGNUM && offset.is_constant ())
8174 offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT);
8175 else
8177 if (known_le (saved_prs_size, vector_save_size))
8178 offset = frame.bytes_below_saved_regs + vector_save_size;
8179 else if (known_le (saved_prs_size, vector_save_size * 2))
8180 offset = frame.bytes_below_saved_regs + vector_save_size * 2;
8181 else
8182 gcc_unreachable ();
8186 /* If we need to save any SVE vector registers, add them next. */
8187 if (last_fp_reg != INVALID_REGNUM && crtl->abi->id () == ARM_PCS_SVE)
8188 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
8189 if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED))
8191 vec_safe_push (frame.saved_fprs, regno);
8192 if (frame.sve_save_and_probe == INVALID_REGNUM)
8193 frame.sve_save_and_probe = regno;
8194 frame.reg_offset[regno] = offset;
8195 offset += vector_save_size;
8198 /* OFFSET is now the offset of the hard frame pointer from the bottom
8199 of the callee save area. */
8200 auto below_hard_fp_saved_regs_size = offset - frame.bytes_below_saved_regs;
8201 bool saves_below_hard_fp_p = maybe_ne (below_hard_fp_saved_regs_size, 0);
8202 gcc_assert (!saves_below_hard_fp_p
8203 || (frame.sve_save_and_probe != INVALID_REGNUM
8204 && known_eq (frame.reg_offset[frame.sve_save_and_probe],
8205 frame.bytes_below_saved_regs)));
8207 frame.bytes_below_hard_fp = offset;
8208 frame.hard_fp_save_and_probe = INVALID_REGNUM;
8210 auto allocate_gpr_slot = [&](unsigned int regno)
8212 vec_safe_push (frame.saved_gprs, regno);
8213 frame.reg_offset[regno] = offset;
8214 offset += UNITS_PER_WORD;
8217 if (frame.emit_frame_chain)
8219 /* FP and LR are placed in the linkage record. */
8220 allocate_gpr_slot (R29_REGNUM);
8221 allocate_gpr_slot (R30_REGNUM);
8223 else if ((flag_stack_clash_protection || !frame.is_scs_enabled)
8224 && known_eq (frame.reg_offset[R30_REGNUM], SLOT_REQUIRED))
8225 /* Put the LR save slot first, since it makes a good choice of probe
8226 for stack clash purposes. The idea is that the link register usually
8227 has to be saved before a call anyway, and so we lose little by
8228 stopping it from being individually shrink-wrapped. */
8229 allocate_gpr_slot (R30_REGNUM);
8231 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
8232 if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED))
8233 allocate_gpr_slot (regno);
8235 if (aarch64_need_old_pstate_sm ())
8237 frame.old_svcr_offset = offset;
8238 offset += UNITS_PER_WORD;
8241 /* If the current function changes the SVE vector length, ensure that the
8242 old value of the DWARF VG register is saved and available in the CFI,
8243 so that outer frames with VL-sized offsets can be processed correctly. */
8244 if (cfun->machine->call_switches_pstate_sm
8245 || aarch64_cfun_enables_pstate_sm ())
8247 frame.reg_offset[VG_REGNUM] = offset;
8248 offset += UNITS_PER_WORD;
8251 poly_int64 max_int_offset = offset;
8252 offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT);
8253 bool has_align_gap = maybe_ne (offset, max_int_offset);
8255 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
8256 if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED))
8258 vec_safe_push (frame.saved_fprs, regno);
8259 /* If there is an alignment gap between integer and fp callee-saves,
8260 allocate the last fp register to it if possible. */
8261 if (regno == last_fp_reg
8262 && has_align_gap
8263 && known_eq (vector_save_size, 8)
8264 && multiple_p (offset, 16))
8266 frame.reg_offset[regno] = max_int_offset;
8267 break;
8270 frame.reg_offset[regno] = offset;
8271 offset += vector_save_size;
8274 offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT);
8275 auto saved_regs_size = offset - frame.bytes_below_saved_regs;
8277 array_slice<unsigned int> push_regs = (!vec_safe_is_empty (frame.saved_gprs)
8278 ? frame.saved_gprs
8279 : frame.saved_fprs);
8280 if (!push_regs.empty ()
8281 && known_eq (frame.reg_offset[push_regs[0]], frame.bytes_below_hard_fp))
8283 frame.hard_fp_save_and_probe = push_regs[0];
8284 frame.wb_push_candidate1 = push_regs[0];
8285 if (push_regs.size () > 1)
8286 frame.wb_push_candidate2 = push_regs[1];
8289 /* With stack-clash, a register must be saved in non-leaf functions.
8290 The saving of the bottommost register counts as an implicit probe,
8291 which allows us to maintain the invariant described in the comment
8292 at expand_prologue. */
8293 gcc_assert (crtl->is_leaf || maybe_ne (saved_regs_size, 0));
8295 if (!regs_at_top_p)
8297 offset += get_frame_size ();
8298 offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT);
8299 top_of_locals = offset;
8301 offset += frame.saved_varargs_size;
8302 gcc_assert (multiple_p (offset, STACK_BOUNDARY / BITS_PER_UNIT));
8303 frame.frame_size = offset;
8305 frame.bytes_above_hard_fp = frame.frame_size - frame.bytes_below_hard_fp;
8306 gcc_assert (known_ge (top_of_locals, 0));
8307 frame.bytes_above_locals = frame.frame_size - top_of_locals;
8309 frame.initial_adjust = 0;
8310 frame.final_adjust = 0;
8311 frame.callee_adjust = 0;
8312 frame.sve_callee_adjust = 0;
8314 frame.wb_pop_candidate1 = frame.wb_push_candidate1;
8315 frame.wb_pop_candidate2 = frame.wb_push_candidate2;
8317 /* Shadow call stack only deals with functions where the LR is pushed
8318 onto the stack and that do not specify the "no_sanitize" attribute
8319 with the argument "shadow-call-stack". */
8320 frame.is_scs_enabled
8321 = (!crtl->calls_eh_return
8322 && sanitize_flags_p (SANITIZE_SHADOW_CALL_STACK)
8323 && known_ge (frame.reg_offset[LR_REGNUM], 0));
8325 /* When shadow call stack is enabled, the scs_pop in the epilogue will
8326 restore x30, and we don't need to pop x30 again in the traditional
8327 way. Pop candidates record the registers that need to be popped
8328 eventually. */
8329 if (frame.is_scs_enabled)
8331 if (frame.wb_pop_candidate2 == R30_REGNUM)
8332 frame.wb_pop_candidate2 = INVALID_REGNUM;
8333 else if (frame.wb_pop_candidate1 == R30_REGNUM)
8334 frame.wb_pop_candidate1 = INVALID_REGNUM;
8337 /* If candidate2 is INVALID_REGNUM, we need to adjust max_push_offset to
8338 256 to ensure that the offset meets the requirements of emit_move_insn.
8339 Similarly, if candidate1 is INVALID_REGNUM, we need to set
8340 max_push_offset to 0, because no registers are popped at this time,
8341 so callee_adjust cannot be adjusted. */
8342 HOST_WIDE_INT max_push_offset = 0;
8343 if (frame.wb_pop_candidate1 != INVALID_REGNUM)
8345 if (frame.wb_pop_candidate2 != INVALID_REGNUM)
8346 max_push_offset = 512;
8347 else
8348 max_push_offset = 256;
8351 HOST_WIDE_INT const_size, const_below_saved_regs, const_above_fp;
8352 HOST_WIDE_INT const_saved_regs_size;
8353 if (known_eq (saved_regs_size, 0))
8354 frame.initial_adjust = frame.frame_size;
8355 else if (frame.frame_size.is_constant (&const_size)
8356 && const_size < max_push_offset
8357 && known_eq (frame.bytes_above_hard_fp, const_size))
8359 /* Simple, small frame with no data below the saved registers.
8361 stp reg1, reg2, [sp, -frame_size]!
8362 stp reg3, reg4, [sp, 16] */
8363 frame.callee_adjust = const_size;
8365 else if (frame.bytes_below_saved_regs.is_constant (&const_below_saved_regs)
8366 && saved_regs_size.is_constant (&const_saved_regs_size)
8367 && const_below_saved_regs + const_saved_regs_size < 512
8368 /* We could handle this case even with data below the saved
8369 registers, provided that that data left us with valid offsets
8370 for all predicate and vector save slots. It's such a rare
8371 case that it hardly seems worth the effort though. */
8372 && (!saves_below_hard_fp_p || const_below_saved_regs == 0)
8373 && !(cfun->calls_alloca
8374 && frame.bytes_above_hard_fp.is_constant (&const_above_fp)
8375 && const_above_fp < max_push_offset))
8377 /* Frame with small area below the saved registers:
8379 sub sp, sp, frame_size
8380 stp reg1, reg2, [sp, bytes_below_saved_regs]
8381 stp reg3, reg4, [sp, bytes_below_saved_regs + 16] */
8382 frame.initial_adjust = frame.frame_size;
8384 else if (saves_below_hard_fp_p
8385 && known_eq (saved_regs_size, below_hard_fp_saved_regs_size))
8387 /* Frame in which all saves are SVE saves:
8389 sub sp, sp, frame_size - bytes_below_saved_regs
8390 save SVE registers relative to SP
8391 sub sp, sp, bytes_below_saved_regs */
8392 frame.initial_adjust = frame.frame_size - frame.bytes_below_saved_regs;
8393 frame.final_adjust = frame.bytes_below_saved_regs;
8395 else if (frame.wb_push_candidate1 != INVALID_REGNUM
8396 && frame.bytes_above_hard_fp.is_constant (&const_above_fp)
8397 && const_above_fp < max_push_offset)
8399 /* Frame with large area below the saved registers, or with SVE saves,
8400 but with a small area above:
8402 stp reg1, reg2, [sp, -hard_fp_offset]!
8403 stp reg3, reg4, [sp, 16]
8404 [sub sp, sp, below_hard_fp_saved_regs_size]
8405 [save SVE registers relative to SP]
8406 sub sp, sp, bytes_below_saved_regs */
8407 frame.callee_adjust = const_above_fp;
8408 frame.sve_callee_adjust = below_hard_fp_saved_regs_size;
8409 frame.final_adjust = frame.bytes_below_saved_regs;
8411 else
8413 /* General case:
8415 sub sp, sp, hard_fp_offset
8416 stp x29, x30, [sp, 0]
8417 add x29, sp, 0
8418 stp reg3, reg4, [sp, 16]
8419 [sub sp, sp, below_hard_fp_saved_regs_size]
8420 [save SVE registers relative to SP]
8421 sub sp, sp, bytes_below_saved_regs */
8422 frame.initial_adjust = frame.bytes_above_hard_fp;
8423 frame.sve_callee_adjust = below_hard_fp_saved_regs_size;
8424 frame.final_adjust = frame.bytes_below_saved_regs;
8427 /* The frame is allocated in pieces, with each non-final piece
8428 including a register save at offset 0 that acts as a probe for
8429 the following piece. In addition, the save of the bottommost register
8430 acts as a probe for callees and allocas. Roll back any probes that
8431 aren't needed.
8433 A probe isn't needed if it is associated with the final allocation
8434 (including callees and allocas) that happens before the epilogue is
8435 executed. */
8436 if (crtl->is_leaf
8437 && !cfun->calls_alloca
8438 && known_eq (frame.final_adjust, 0))
8440 if (maybe_ne (frame.sve_callee_adjust, 0))
8441 frame.sve_save_and_probe = INVALID_REGNUM;
8442 else
8443 frame.hard_fp_save_and_probe = INVALID_REGNUM;
8446 /* Make sure the individual adjustments add up to the full frame size. */
8447 gcc_assert (known_eq (frame.initial_adjust
8448 + frame.callee_adjust
8449 + frame.sve_callee_adjust
8450 + frame.final_adjust, frame.frame_size));
8452 if (frame.callee_adjust == 0)
8454 /* We've decided not to do a "real" push and pop. However,
8455 setting up the frame chain is treated as being essentially
8456 a multi-instruction push. */
8457 frame.wb_pop_candidate1 = frame.wb_pop_candidate2 = INVALID_REGNUM;
8458 if (!frame.emit_frame_chain)
8459 frame.wb_push_candidate1 = frame.wb_push_candidate2 = INVALID_REGNUM;
8462 frame.laid_out = true;
8465 /* Return true if the register REGNO is saved on entry to
8466 the current function. */
8468 static bool
8469 aarch64_register_saved_on_entry (int regno)
8471 return known_ge (cfun->machine->frame.reg_offset[regno], 0);
8474 /* Push the register number REGNO of mode MODE to the stack with write-back
8475 adjusting the stack by ADJUSTMENT. */
8477 static void
8478 aarch64_pushwb_single_reg (machine_mode mode, unsigned regno,
8479 HOST_WIDE_INT adjustment)
8481 rtx base_rtx = stack_pointer_rtx;
8482 rtx insn, reg, mem;
8484 reg = gen_rtx_REG (mode, regno);
8485 mem = gen_rtx_PRE_MODIFY (Pmode, base_rtx,
8486 plus_constant (Pmode, base_rtx, -adjustment));
8487 mem = gen_frame_mem (mode, mem);
8489 insn = emit_move_insn (mem, reg);
8490 RTX_FRAME_RELATED_P (insn) = 1;
8493 /* Generate and return an instruction to store the pair of registers
8494 REG and REG2 of mode MODE to location BASE with write-back adjusting
8495 the stack location BASE by ADJUSTMENT. */
8497 static rtx
8498 aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
8499 HOST_WIDE_INT adjustment)
8501 rtx new_base = plus_constant (Pmode, base, -adjustment);
8502 rtx mem = gen_frame_mem (mode, new_base);
8503 rtx mem2 = adjust_address_nv (mem, mode, GET_MODE_SIZE (mode));
8505 return gen_rtx_PARALLEL (VOIDmode,
8506 gen_rtvec (3,
8507 gen_rtx_SET (base, new_base),
8508 gen_rtx_SET (mem, reg),
8509 gen_rtx_SET (mem2, reg2)));
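/* The parallel above corresponds to a single pre-indexed store pair,
   e.g. "stp x19, x20, [sp, #-32]!" (operands illustrative).  */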
8512 /* Push registers numbered REGNO1 and REGNO2 to the stack, adjusting the
8513 stack pointer by ADJUSTMENT. */
8515 static void
8516 aarch64_push_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment)
8518 rtx_insn *insn;
8519 machine_mode mode = aarch64_reg_save_mode (regno1);
8521 if (regno2 == INVALID_REGNUM)
8522 return aarch64_pushwb_single_reg (mode, regno1, adjustment);
8524 rtx reg1 = gen_rtx_REG (mode, regno1);
8525 rtx reg2 = gen_rtx_REG (mode, regno2);
8527 insn = emit_insn (aarch64_gen_storewb_pair (mode, stack_pointer_rtx, reg1,
8528 reg2, adjustment));
8529 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1;
8530 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
8531 RTX_FRAME_RELATED_P (insn) = 1;
8534 /* Load the pair of registers REG, REG2 of mode MODE from stack location BASE,
8535 adjusting it by ADJUSTMENT afterwards. */
8537 static rtx
8538 aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
8539 HOST_WIDE_INT adjustment)
8541 rtx mem = gen_frame_mem (mode, base);
8542 rtx mem2 = adjust_address_nv (mem, mode, GET_MODE_SIZE (mode));
8543 rtx new_base = plus_constant (Pmode, base, adjustment);
8545 return gen_rtx_PARALLEL (VOIDmode,
8546 gen_rtvec (3,
8547 gen_rtx_SET (base, new_base),
8548 gen_rtx_SET (reg, mem),
8549 gen_rtx_SET (reg2, mem2)));
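/* The parallel above corresponds to a single post-indexed load pair,
   e.g. "ldp x19, x20, [sp], #32" (operands illustrative).  */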
8552 /* Pop the two registers numbered REGNO1, REGNO2 from the stack, adjusting it
8553 afterwards by ADJUSTMENT and writing the appropriate REG_CFA_RESTORE notes
8554 into CFI_OPS. */
8556 static void
8557 aarch64_pop_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment,
8558 rtx *cfi_ops)
8560 machine_mode mode = aarch64_reg_save_mode (regno1);
8561 rtx reg1 = gen_rtx_REG (mode, regno1);
8563 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg1, *cfi_ops);
8565 if (regno2 == INVALID_REGNUM)
8567 rtx mem = plus_constant (Pmode, stack_pointer_rtx, adjustment);
8568 mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem);
8569 emit_move_insn (reg1, gen_frame_mem (mode, mem));
8571 else
8573 rtx reg2 = gen_rtx_REG (mode, regno2);
8574 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
8575 emit_insn (aarch64_gen_loadwb_pair (mode, stack_pointer_rtx, reg1,
8576 reg2, adjustment));
8580 /* Given an ldp/stp register operand mode MODE, return a suitable mode to use
8581 for a mem rtx representing the entire pair. */
8583 static machine_mode
8584 aarch64_pair_mode_for_mode (machine_mode mode)
8586 if (known_eq (GET_MODE_SIZE (mode), 4))
8587 return V2x4QImode;
8588 else if (known_eq (GET_MODE_SIZE (mode), 8))
8589 return V2x8QImode;
8590 else if (known_eq (GET_MODE_SIZE (mode), 16))
8591 return V2x16QImode;
8592 else
8593 gcc_unreachable ();
8596 /* Given a base mem MEM with mode and address suitable for a single ldp/stp
8597 operand, return an rtx like MEM which instead represents the entire pair. */
8599 static rtx
8600 aarch64_pair_mem_from_base (rtx mem)
8602 auto pair_mode = aarch64_pair_mode_for_mode (GET_MODE (mem));
8603 mem = adjust_bitfield_address_nv (mem, pair_mode, 0);
8604 gcc_assert (aarch64_mem_pair_lanes_operand (mem, pair_mode));
8605 return mem;
8608 /* Generate and return a store pair instruction to store REG1 and REG2
8609 into memory starting at BASE_MEM. All three rtxes should have modes of the
8610 same size. */
8613 aarch64_gen_store_pair (rtx base_mem, rtx reg1, rtx reg2)
8615 rtx pair_mem = aarch64_pair_mem_from_base (base_mem);
8617 return gen_rtx_SET (pair_mem,
8618 gen_rtx_UNSPEC (GET_MODE (pair_mem),
8619 gen_rtvec (2, reg1, reg2),
8620 UNSPEC_STP));
8623 /* Generate and return a load pair instruction to load a pair of
8624 registers starting at BASE_MEM into REG1 and REG2. If CODE is
8625 UNKNOWN, all three rtxes should have modes of the same size.
8626 Otherwise, CODE is {SIGN,ZERO}_EXTEND, base_mem should be in SImode,
8627 and REG{1,2} should be in DImode. */
8630 aarch64_gen_load_pair (rtx reg1, rtx reg2, rtx base_mem, enum rtx_code code)
8632 rtx pair_mem = aarch64_pair_mem_from_base (base_mem);
8634 const bool any_extend_p = (code == ZERO_EXTEND || code == SIGN_EXTEND);
8635 if (any_extend_p)
8636 gcc_checking_assert (GET_MODE (base_mem) == SImode
8637 && GET_MODE (reg1) == DImode
8638 && GET_MODE (reg2) == DImode);
8639 else
8640 gcc_assert (code == UNKNOWN);
8642 rtx unspecs[2] = {
8643 gen_rtx_UNSPEC (any_extend_p ? SImode : GET_MODE (reg1),
8644 gen_rtvec (1, pair_mem),
8645 UNSPEC_LDP_FST),
8646 gen_rtx_UNSPEC (any_extend_p ? SImode : GET_MODE (reg2),
8647 gen_rtvec (1, copy_rtx (pair_mem)),
8648 UNSPEC_LDP_SND)
8651 if (any_extend_p)
8652 for (int i = 0; i < 2; i++)
8653 unspecs[i] = gen_rtx_fmt_e (code, DImode, unspecs[i]);
8655 return gen_rtx_PARALLEL (VOIDmode,
8656 gen_rtvec (2,
8657 gen_rtx_SET (reg1, unspecs[0]),
8658 gen_rtx_SET (reg2, unspecs[1])));
8661 /* Return TRUE if return address signing should be enabled for the current
8662 function, otherwise return FALSE. */
8664 bool
8665 aarch64_return_address_signing_enabled (void)
8667 /* This function should only be called after the frame is laid out. */
8668 gcc_assert (cfun->machine->frame.laid_out);
8670 /* If signing scope is AARCH_FUNCTION_NON_LEAF, we only sign a leaf function
8671 if its LR is pushed onto the stack. */
8672 return (aarch_ra_sign_scope == AARCH_FUNCTION_ALL
8673 || (aarch_ra_sign_scope == AARCH_FUNCTION_NON_LEAF
8674 && known_ge (cfun->machine->frame.reg_offset[LR_REGNUM], 0)));
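/* For example, under -mbranch-protection=pac-ret only functions that
   save LR are signed, whereas -mbranch-protection=pac-ret+leaf signs
   every function.  */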
8677 /* Only used by the arm backend. */
8678 void aarch_bti_arch_check (void)
8681 /* Return TRUE if Branch Target Identification Mechanism is enabled. */
8682 bool
8683 aarch_bti_enabled (void)
8685 return (aarch_enable_bti == 1);
8688 /* Check if INSN is a BTI J insn. */
8689 bool
8690 aarch_bti_j_insn_p (rtx_insn *insn)
8692 if (!insn || !INSN_P (insn))
8693 return false;
8695 rtx pat = PATTERN (insn);
8696 return GET_CODE (pat) == UNSPEC_VOLATILE && XINT (pat, 1) == UNSPECV_BTI_J;
8699 /* Return TRUE if Guarded Control Stack is enabled. */
8700 bool
8701 aarch64_gcs_enabled (void)
8703 return (aarch64_enable_gcs == 1);
8706 /* Check if X (or any sub-rtx of X) is a PACIASP/PACIBSP instruction. */
8707 bool
8708 aarch_pac_insn_p (rtx x)
8710 if (!INSN_P (x))
8711 return false;
8713 subrtx_var_iterator::array_type array;
8714 FOR_EACH_SUBRTX_VAR (iter, array, PATTERN (x), ALL)
8716 rtx sub = *iter;
8717 if (sub && GET_CODE (sub) == UNSPEC)
8719 int unspec_val = XINT (sub, 1);
8720 switch (unspec_val)
8722 case UNSPEC_PACIASP:
8723 case UNSPEC_PACIBSP:
8724 return true;
8726 default:
8727 return false;
8729 iter.skip_subrtxes ();
8732 return false;
8735 rtx aarch_gen_bti_c (void)
8737 return gen_bti_c ();
8740 rtx aarch_gen_bti_j (void)
8742 return gen_bti_j ();
8745 /* The caller is going to use ST1D or LD1D to save or restore an SVE
8746 register in mode MODE at BASE_RTX + OFFSET, where OFFSET is in
8747 the range [1, 16] * GET_MODE_SIZE (MODE). Prepare for this by:
8749 (1) updating BASE_RTX + OFFSET so that it is a legitimate ST1D
8750 or LD1D address
8752 (2) setting PTRUE to a valid predicate register for the ST1D or LD1D,
8753 if the variable isn't already nonnull
8755 (1) is needed when OFFSET is in the range [8, 16] * GET_MODE_SIZE (MODE).
8756 Handle this case using a temporary base register that is suitable for
8757 all offsets in that range. Use ANCHOR_REG as this base register if it
8758 is nonnull, otherwise create a new register and store it in ANCHOR_REG. */
8760 static inline void
8761 aarch64_adjust_sve_callee_save_base (machine_mode mode, rtx &base_rtx,
8762 rtx &anchor_reg, poly_int64 &offset,
8763 rtx &ptrue)
8765 if (maybe_ge (offset, 8 * GET_MODE_SIZE (mode)))
8767 /* This is the maximum valid offset of the anchor from the base.
8768 Lower values would be valid too. */
8769 poly_int64 anchor_offset = 16 * GET_MODE_SIZE (mode);
8770 if (!anchor_reg)
8772 anchor_reg = gen_rtx_REG (Pmode, STACK_CLASH_SVE_CFA_REGNUM);
8773 emit_insn (gen_add3_insn (anchor_reg, base_rtx,
8774 gen_int_mode (anchor_offset, Pmode)));
8776 base_rtx = anchor_reg;
8777 offset -= anchor_offset;
8779 if (!ptrue)
8781 int pred_reg = cfun->machine->frame.spare_pred_reg;
8782 emit_move_insn (gen_rtx_REG (VNx16BImode, pred_reg),
8783 CONSTM1_RTX (VNx16BImode));
8784 ptrue = gen_rtx_REG (VNx2BImode, pred_reg);
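/* For instance, once ANCHOR_REG = BASE_RTX + 16 * GET_MODE_SIZE (MODE),
   offsets that were in [8, 16] * GET_MODE_SIZE (MODE) become multiples
   of the mode size in [-8, 0], which fits the signed immediate range
   of the ST1D/LD1D addressing mode.  */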
8788 /* Add a REG_CFA_EXPRESSION note to INSN to say that register REG
8789 is saved at BASE + OFFSET. */
8791 static void
8792 aarch64_add_cfa_expression (rtx_insn *insn, rtx reg,
8793 rtx base, poly_int64 offset)
8795 rtx mem = gen_frame_mem (GET_MODE (reg),
8796 plus_constant (Pmode, base, offset));
8797 add_reg_note (insn, REG_CFA_EXPRESSION, gen_rtx_SET (mem, reg));
8800 /* Emit code to save the callee-saved registers in REGS. Skip any
8801 write-back candidates if SKIP_WB is true, otherwise consider only
8802 write-back candidates.
8804 The stack pointer is currently BYTES_BELOW_SP bytes above the bottom
8805 of the static frame. HARD_FP_VALID_P is true if the hard frame pointer
8806 has been set up. */
8808 static void
8809 aarch64_save_callee_saves (poly_int64 bytes_below_sp,
8810 array_slice<unsigned int> regs, bool skip_wb,
8811 bool hard_fp_valid_p)
8813 aarch64_frame &frame = cfun->machine->frame;
8814 rtx_insn *insn;
8815 rtx anchor_reg = NULL_RTX, ptrue = NULL_RTX;
8817 auto skip_save_p = [&](unsigned int regno)
8819 if (cfun->machine->reg_is_wrapped_separately[regno])
8820 return true;
8822 if (skip_wb == (regno == frame.wb_push_candidate1
8823 || regno == frame.wb_push_candidate2))
8824 return true;
8826 return false;
8829 for (unsigned int i = 0; i < regs.size (); ++i)
8831 unsigned int regno = regs[i];
8832 poly_int64 offset;
8833 bool frame_related_p = aarch64_emit_cfi_for_reg_p (regno);
8835 if (skip_save_p (regno))
8836 continue;
8838 machine_mode mode = aarch64_reg_save_mode (regno);
8839 rtx reg = gen_rtx_REG (mode, regno);
8840 rtx move_src = reg;
8841 offset = frame.reg_offset[regno] - bytes_below_sp;
8842 if (regno == VG_REGNUM)
8844 move_src = gen_rtx_REG (DImode, IP0_REGNUM);
8845 emit_move_insn (move_src, gen_int_mode (aarch64_sve_vg, DImode));
8847 rtx base_rtx = stack_pointer_rtx;
8848 poly_int64 sp_offset = offset;
8850 HOST_WIDE_INT const_offset;
8851 if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
8852 aarch64_adjust_sve_callee_save_base (mode, base_rtx, anchor_reg,
8853 offset, ptrue);
8854 else if (GP_REGNUM_P (REGNO (reg))
8855 && (!offset.is_constant (&const_offset) || const_offset >= 512))
8857 poly_int64 fp_offset = frame.bytes_below_hard_fp - bytes_below_sp;
8858 if (hard_fp_valid_p)
8859 base_rtx = hard_frame_pointer_rtx;
8860 else
8862 if (!anchor_reg)
8864 anchor_reg = gen_rtx_REG (Pmode, STACK_CLASH_SVE_CFA_REGNUM);
8865 emit_insn (gen_add3_insn (anchor_reg, base_rtx,
8866 gen_int_mode (fp_offset, Pmode)));
8868 base_rtx = anchor_reg;
8870 offset -= fp_offset;
8872 rtx mem = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
8873 rtx cfi_mem = gen_frame_mem (mode, plus_constant (Pmode,
8874 stack_pointer_rtx,
8875 sp_offset));
8876 rtx cfi_set = gen_rtx_SET (cfi_mem, reg);
8877 bool need_cfi_note_p = (base_rtx != stack_pointer_rtx);
8879 unsigned int regno2;
8880 if (!aarch64_sve_mode_p (mode)
8881 && reg == move_src
8882 && i + 1 < regs.size ()
8883 && (regno2 = regs[i + 1], !skip_save_p (regno2))
8884 && known_eq (GET_MODE_SIZE (mode),
8885 frame.reg_offset[regno2] - frame.reg_offset[regno]))
8887 rtx reg2 = gen_rtx_REG (mode, regno2);
8889 offset += GET_MODE_SIZE (mode);
8890 insn = emit_insn (aarch64_gen_store_pair (mem, reg, reg2));
8892 rtx cfi_mem2
8893 = gen_frame_mem (mode,
8894 plus_constant (Pmode,
8895 stack_pointer_rtx,
8896 sp_offset + GET_MODE_SIZE (mode)));
8897 rtx cfi_set2 = gen_rtx_SET (cfi_mem2, reg2);
8899 /* The first part of a frame-related parallel insn is always
8900 assumed to be relevant to the frame calculations;
8901 subsequent parts are only frame-related if
8902 explicitly marked. */
8903 if (aarch64_emit_cfi_for_reg_p (regno2))
8904 RTX_FRAME_RELATED_P (cfi_set2) = 1;
8906 /* Add a REG_FRAME_RELATED_EXPR note since the unspec
8907 representation of stp cannot be understood directly by
8908 dwarf2cfi. */
8909 rtx par = gen_rtx_PARALLEL (VOIDmode,
8910 gen_rtvec (2, cfi_set, cfi_set2));
8911 add_reg_note (insn, REG_FRAME_RELATED_EXPR, par);
8913 regno = regno2;
8914 ++i;
8916 else
8918 if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
8920 insn = emit_insn (gen_aarch64_pred_mov (mode, mem,
8921 ptrue, move_src));
8922 need_cfi_note_p = true;
8924 else if (aarch64_sve_mode_p (mode))
8925 insn = emit_insn (gen_rtx_SET (mem, move_src));
8926 else
8927 insn = emit_move_insn (mem, move_src);
8929 if (frame_related_p && (need_cfi_note_p || move_src != reg))
8930 add_reg_note (insn, REG_FRAME_RELATED_EXPR, cfi_set);
8933 RTX_FRAME_RELATED_P (insn) = frame_related_p;
8935 /* Emit a fake instruction to indicate that the VG save slot has
8936 been initialized. */
8937 if (regno == VG_REGNUM)
8938 emit_insn (gen_aarch64_old_vg_saved (move_src, mem));
8942 /* Emit code to restore the callee registers in REGS, ignoring pop candidates
8943 and any other registers that are handled separately. Write the appropriate
8944 REG_CFA_RESTORE notes into CFI_OPS.
8946 The stack pointer is currently BYTES_BELOW_SP bytes above the bottom
8947 of the static frame. */
8949 static void
8950 aarch64_restore_callee_saves (poly_int64 bytes_below_sp,
8951 array_slice<unsigned int> regs, rtx *cfi_ops)
8953 aarch64_frame &frame = cfun->machine->frame;
8954 poly_int64 offset;
8955 rtx anchor_reg = NULL_RTX, ptrue = NULL_RTX;
8957 auto skip_restore_p = [&](unsigned int regno)
8959 if (cfun->machine->reg_is_wrapped_separately[regno])
8960 return true;
8962 if (regno == frame.wb_pop_candidate1
8963 || regno == frame.wb_pop_candidate2)
8964 return true;
8966 /* The shadow call stack code restores LR separately. */
8967 if (frame.is_scs_enabled && regno == LR_REGNUM)
8968 return true;
8970 return false;
8973 for (unsigned int i = 0; i < regs.size (); ++i)
8975 unsigned int regno = regs[i];
8976 bool frame_related_p = aarch64_emit_cfi_for_reg_p (regno);
8977 if (skip_restore_p (regno))
8978 continue;
8980 machine_mode mode = aarch64_reg_save_mode (regno);
8981 rtx reg = gen_rtx_REG (mode, regno);
8982 offset = frame.reg_offset[regno] - bytes_below_sp;
8983 rtx base_rtx = stack_pointer_rtx;
8984 if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
8985 aarch64_adjust_sve_callee_save_base (mode, base_rtx, anchor_reg,
8986 offset, ptrue);
8987 rtx mem = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
8989 unsigned int regno2;
8990 if (!aarch64_sve_mode_p (mode)
8991 && i + 1 < regs.size ()
8992 && (regno2 = regs[i + 1], !skip_restore_p (regno2))
8993 && known_eq (GET_MODE_SIZE (mode),
8994 frame.reg_offset[regno2] - frame.reg_offset[regno]))
8996 rtx reg2 = gen_rtx_REG (mode, regno2);
8998 offset += GET_MODE_SIZE (mode);
8999 emit_insn (aarch64_gen_load_pair (reg, reg2, mem));
9001 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
9002 regno = regno2;
9003 ++i;
9005 else if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
9006 emit_insn (gen_aarch64_pred_mov (mode, reg, ptrue, mem));
9007 else if (aarch64_sve_mode_p (mode))
9008 emit_insn (gen_rtx_SET (reg, mem));
9009 else
9010 emit_move_insn (reg, mem);
9011 if (frame_related_p)
9012 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg, *cfi_ops);
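/* As an illustration of the pairing logic above: if x19 and x20 are both
   being restored and their save slots are exactly 8 bytes apart, the two
   restores are folded into a single LDP; SVE registers, and registers
   whose slots are not exactly GET_MODE_SIZE (mode) apart, are restored
   one at a time instead.  */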
9016 /* Return true if OFFSET is a signed 4-bit value multiplied by the size
9017 of MODE. */
9019 static inline bool
9020 offset_4bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
9022 HOST_WIDE_INT multiple;
9023 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
9024 && IN_RANGE (multiple, -8, 7));
9027 /* Return true if OFFSET is a signed 6-bit value multiplied by the size
9028 of MODE. */
9030 static inline bool
9031 offset_6bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
9033 HOST_WIDE_INT multiple;
9034 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
9035 && IN_RANGE (multiple, -32, 31));
9038 /* Return true if OFFSET is an unsigned 6-bit value multiplied by the size
9039 of MODE. */
9041 static inline bool
9042 offset_6bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
9044 HOST_WIDE_INT multiple;
9045 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
9046 && IN_RANGE (multiple, 0, 63));
9049 /* Return true if OFFSET is a signed 7-bit value multiplied by the size
9050 of MODE. */
9052 bool
9053 aarch64_offset_7bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
9055 HOST_WIDE_INT multiple;
9056 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
9057 && IN_RANGE (multiple, -64, 63));
9060 /* Return true if OFFSET is a signed 9-bit value. */
9062 bool
9063 aarch64_offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
9064 poly_int64 offset)
9066 HOST_WIDE_INT const_offset;
9067 return (offset.is_constant (&const_offset)
9068 && IN_RANGE (const_offset, -256, 255));
9071 /* Return true if OFFSET is a signed 9-bit value multiplied by the size
9072 of MODE. */
9074 static inline bool
9075 offset_9bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
9077 HOST_WIDE_INT multiple;
9078 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
9079 && IN_RANGE (multiple, -256, 255));
9082 /* Return true if OFFSET is an unsigned 12-bit value multiplied by the size
9083 of MODE. */
9085 static inline bool
9086 offset_12bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
9088 HOST_WIDE_INT multiple;
9089 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
9090 && IN_RANGE (multiple, 0, 4095));
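/* The scaled-offset predicates above all follow the same pattern: the
   offset must be an exact multiple of the access size, and that multiple
   must fit in a signed or unsigned immediate field of the stated width.
   A minimal standalone sketch of the pattern, using plain 64-bit integers
   instead of poly_int64 (an illustrative simplification, not the actual
   implementation):

     #include <cstdint>

     // True if OFFSET is M * SIZE for some integer M with LO <= M <= HI.
     static bool
     scaled_offset_in_range (int64_t offset, int64_t size,
                             int64_t lo, int64_t hi)
     {
       if (size <= 0 || offset % size != 0)
         return false;
       int64_t multiple = offset / size;
       return multiple >= lo && multiple <= hi;
     }

   For instance, the 7-bit signed scaled check for 8-byte accesses accepts
   byte offsets in [-512, 504]:
     scaled_offset_in_range (504, 8, -64, 63) is true, while
     scaled_offset_in_range (512, 8, -64, 63) is false.  */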
9093 /* Implement TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS. */
9095 static sbitmap
9096 aarch64_get_separate_components (void)
9098 aarch64_frame &frame = cfun->machine->frame;
9099 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
9100 bitmap_clear (components);
9102 /* The registers we need saved to the frame. */
9103 bool enables_pstate_sm = aarch64_cfun_enables_pstate_sm ();
9104 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
9105 if (aarch64_register_saved_on_entry (regno))
9107 /* Disallow shrink wrapping for registers that will be clobbered
9108 by an SMSTART SM in the prologue. */
9109 if (enables_pstate_sm
9110 && (FP_REGNUM_P (regno) || PR_REGNUM_P (regno)))
9111 continue;
9113 /* Punt on saves and restores that use ST1D and LD1D. We could
9114 try to be smarter, but it would involve making sure that the
9115 spare predicate register itself is safe to use at the save
9116 and restore points. Also, when a frame pointer is being used,
9117 the slots are often out of reach of ST1D and LD1D anyway. */
9118 machine_mode mode = aarch64_reg_save_mode (regno);
9119 if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
9120 continue;
9122 poly_int64 offset = frame.reg_offset[regno];
9124 /* Get the offset relative to the register we'll use. */
9125 if (frame_pointer_needed)
9126 offset -= frame.bytes_below_hard_fp;
9128 /* Check that we can access the stack slot of the register with one
9129 direct load with no adjustments needed. */
9130 if (aarch64_sve_mode_p (mode)
9131 ? offset_9bit_signed_scaled_p (mode, offset)
9132 : offset_12bit_unsigned_scaled_p (mode, offset))
9133 bitmap_set_bit (components, regno);
9136 /* Don't mess with the hard frame pointer. */
9137 if (frame_pointer_needed)
9138 bitmap_clear_bit (components, HARD_FRAME_POINTER_REGNUM);
9140 /* If the spare predicate register used by big-endian SVE code
9141 is call-preserved, it must be saved in the main prologue
9142 before any saves that use it. */
9143 if (frame.spare_pred_reg != INVALID_REGNUM)
9144 bitmap_clear_bit (components, frame.spare_pred_reg);
9146 unsigned reg1 = frame.wb_push_candidate1;
9147 unsigned reg2 = frame.wb_push_candidate2;
9148 /* If registers have been chosen to be stored/restored with
9149 writeback, don't interfere with them, to avoid having to output explicit
9150 stack adjustment instructions. */
9151 if (reg2 != INVALID_REGNUM)
9152 bitmap_clear_bit (components, reg2);
9153 if (reg1 != INVALID_REGNUM)
9154 bitmap_clear_bit (components, reg1);
9156 bitmap_clear_bit (components, LR_REGNUM);
9157 bitmap_clear_bit (components, SP_REGNUM);
9158 if (flag_stack_clash_protection)
9160 if (frame.sve_save_and_probe != INVALID_REGNUM)
9161 bitmap_clear_bit (components, frame.sve_save_and_probe);
9162 if (frame.hard_fp_save_and_probe != INVALID_REGNUM)
9163 bitmap_clear_bit (components, frame.hard_fp_save_and_probe);
9166 /* The VG save sequence needs a temporary GPR. Punt for now on trying
9167 to find one. */
9168 bitmap_clear_bit (components, VG_REGNUM);
9170 return components;
9173 /* Implement TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB. */
9175 static sbitmap
9176 aarch64_components_for_bb (basic_block bb)
9178 bitmap in = DF_LIVE_IN (bb);
9179 bitmap gen = &DF_LIVE_BB_INFO (bb)->gen;
9180 bitmap kill = &DF_LIVE_BB_INFO (bb)->kill;
9182 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
9183 bitmap_clear (components);
9185 /* Clobbered registers don't generate values in any meaningful sense,
9186 since nothing after the clobber can rely on their value. And we can't
9187 say that partially-clobbered registers are unconditionally killed,
9188 because whether they're killed or not depends on the mode of the
9189 value they're holding. Thus partially call-clobbered registers
9190 appear in neither the kill set nor the gen set.
9192 Check manually for any calls that clobber more of a register than the
9193 current function can. */
9194 function_abi_aggregator callee_abis;
9195 rtx_insn *insn;
9196 FOR_BB_INSNS (bb, insn)
9197 if (CALL_P (insn))
9198 callee_abis.note_callee_abi (insn_callee_abi (insn));
9199 HARD_REG_SET extra_caller_saves = callee_abis.caller_save_regs (*crtl->abi);
9201 /* GPRs are used in a bb if they are in the IN, GEN, or KILL sets. */
9202 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
9203 if (!fixed_regs[regno]
9204 && !crtl->abi->clobbers_full_reg_p (regno)
9205 && (TEST_HARD_REG_BIT (extra_caller_saves, regno)
9206 || bitmap_bit_p (in, regno)
9207 || bitmap_bit_p (gen, regno)
9208 || bitmap_bit_p (kill, regno)))
9210 bitmap_set_bit (components, regno);
9212 /* If there is a callee-save at an adjacent offset, add it too
9213 to increase the use of LDP/STP. */
9214 poly_int64 offset = cfun->machine->frame.reg_offset[regno];
9215 unsigned regno2 = multiple_p (offset, 16) ? regno + 1 : regno - 1;
9217 if (regno2 <= LAST_SAVED_REGNUM)
9219 poly_int64 offset2 = cfun->machine->frame.reg_offset[regno2];
9220 if (regno < regno2
9221 ? known_eq (offset + 8, offset2)
9222 : multiple_p (offset2, 16) && known_eq (offset2 + 8, offset))
9223 bitmap_set_bit (components, regno2);
9227 return components;
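/* For instance, if x22 is live in the block, its save slot sits at a
   16-byte-aligned offset, and x23's slot is the 8 bytes directly above,
   then x23 is added to the component set as well so that the two saves
   and restores can share a single STP/LDP.  */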
9230 /* Implement TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS.
9231 Nothing to do for aarch64. */
9233 static void
9234 aarch64_disqualify_components (sbitmap, edge, sbitmap, bool)
9238 /* Return the next set bit in BMP from START onwards. Return the total number
9239 of bits in BMP if no set bit is found at or after START. */
9241 static unsigned int
9242 aarch64_get_next_set_bit (sbitmap bmp, unsigned int start)
9244 unsigned int nbits = SBITMAP_SIZE (bmp);
9245 if (start == nbits)
9246 return start;
9248 gcc_assert (start < nbits);
9249 for (unsigned int i = start; i < nbits; i++)
9250 if (bitmap_bit_p (bmp, i))
9251 return i;
9253 return nbits;
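/* A sketch of the iteration idiom this helper supports (the loop shape
   used by aarch64_process_components below):

     unsigned last = SBITMAP_SIZE (components);
     unsigned regno = aarch64_get_next_set_bit (components, R0_REGNUM);
     while (regno != last)
       {
         ...handle REGNO...;
         regno = aarch64_get_next_set_bit (components, regno + 1);
       }
*/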
9256 /* Do the work for aarch64_emit_prologue_components and
9257 aarch64_emit_epilogue_components. COMPONENTS is the bitmap of registers
9258 to save/restore, PROLOGUE_P indicates whether to emit the prologue sequence
9259 for these components or the epilogue sequence. That is, it determines
9260 whether we should emit stores or loads and what kind of CFA notes to attach
9261 to the insns. Otherwise the logic for the two sequences is very
9262 similar. */
9264 static void
9265 aarch64_process_components (sbitmap components, bool prologue_p)
9267 aarch64_frame &frame = cfun->machine->frame;
9268 rtx ptr_reg = gen_rtx_REG (Pmode, frame_pointer_needed
9269 ? HARD_FRAME_POINTER_REGNUM
9270 : STACK_POINTER_REGNUM);
9272 unsigned last_regno = SBITMAP_SIZE (components);
9273 unsigned regno = aarch64_get_next_set_bit (components, R0_REGNUM);
9274 rtx_insn *insn = NULL;
9276 while (regno != last_regno)
9278 bool frame_related_p = aarch64_emit_cfi_for_reg_p (regno);
9279 machine_mode mode = aarch64_reg_save_mode (regno);
9281 rtx reg = gen_rtx_REG (mode, regno);
9282 poly_int64 offset = frame.reg_offset[regno];
9283 if (frame_pointer_needed)
9284 offset -= frame.bytes_below_hard_fp;
9286 rtx addr = plus_constant (Pmode, ptr_reg, offset);
9287 rtx mem = gen_frame_mem (mode, addr);
9289 rtx set = prologue_p ? gen_rtx_SET (mem, reg) : gen_rtx_SET (reg, mem);
9290 unsigned regno2 = aarch64_get_next_set_bit (components, regno + 1);
9291 /* No more registers to handle after REGNO.
9292 Emit a single save/restore and exit. */
9293 if (regno2 == last_regno)
9295 insn = emit_insn (set);
9296 if (frame_related_p)
9298 RTX_FRAME_RELATED_P (insn) = 1;
9299 if (prologue_p)
9300 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
9301 else
9302 add_reg_note (insn, REG_CFA_RESTORE, reg);
9304 break;
9307 poly_int64 offset2 = frame.reg_offset[regno2];
9308 /* The next register is not of the same class or its offset is not
9309 mergeable with the current one into a pair. */
9310 if (aarch64_sve_mode_p (mode)
9311 || !satisfies_constraint_Ump (mem)
9312 || GP_REGNUM_P (regno) != GP_REGNUM_P (regno2)
9313 || (crtl->abi->id () == ARM_PCS_SIMD && FP_REGNUM_P (regno))
9314 || maybe_ne ((offset2 - frame.reg_offset[regno]),
9315 GET_MODE_SIZE (mode)))
9317 insn = emit_insn (set);
9318 if (frame_related_p)
9320 RTX_FRAME_RELATED_P (insn) = 1;
9321 if (prologue_p)
9322 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
9323 else
9324 add_reg_note (insn, REG_CFA_RESTORE, reg);
9327 regno = regno2;
9328 continue;
9331 bool frame_related2_p = aarch64_emit_cfi_for_reg_p (regno2);
9333 /* REGNO2 can be saved/restored in a pair with REGNO. */
9334 rtx reg2 = gen_rtx_REG (mode, regno2);
9335 if (frame_pointer_needed)
9336 offset2 -= frame.bytes_below_hard_fp;
9337 rtx addr2 = plus_constant (Pmode, ptr_reg, offset2);
9338 rtx mem2 = gen_frame_mem (mode, addr2);
9339 rtx set2 = prologue_p ? gen_rtx_SET (mem2, reg2)
9340 : gen_rtx_SET (reg2, mem2);
9342 if (prologue_p)
9343 insn = emit_insn (aarch64_gen_store_pair (mem, reg, reg2));
9344 else
9345 insn = emit_insn (aarch64_gen_load_pair (reg, reg2, mem));
9347 if (frame_related_p || frame_related2_p)
9349 RTX_FRAME_RELATED_P (insn) = 1;
9350 if (prologue_p)
9352 if (frame_related_p)
9353 add_reg_note (insn, REG_CFA_OFFSET, set);
9354 if (frame_related2_p)
9355 add_reg_note (insn, REG_CFA_OFFSET, set2);
9357 else
9359 if (frame_related_p)
9360 add_reg_note (insn, REG_CFA_RESTORE, reg);
9361 if (frame_related2_p)
9362 add_reg_note (insn, REG_CFA_RESTORE, reg2);
9366 regno = aarch64_get_next_set_bit (components, regno2 + 1);
9370 /* Implement TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS. */
9372 static void
9373 aarch64_emit_prologue_components (sbitmap components)
9375 aarch64_process_components (components, true);
9378 /* Implement TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS. */
9380 static void
9381 aarch64_emit_epilogue_components (sbitmap components)
9383 aarch64_process_components (components, false);
9386 /* Implement TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS. */
9388 static void
9389 aarch64_set_handled_components (sbitmap components)
9391 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
9392 if (bitmap_bit_p (components, regno))
9393 cfun->machine->reg_is_wrapped_separately[regno] = true;
9396 /* On AArch64 we have an ABI defined safe buffer. This constant is used to
9397 determine the probe offset for alloca.  */
9399 static HOST_WIDE_INT
9400 aarch64_stack_clash_protection_alloca_probe_range (void)
9402 return STACK_CLASH_CALLER_GUARD;
9405 /* Emit a stack tie that acts as a scheduling barrier for all previous and
9406 subsequent memory accesses and that requires the stack pointer and REG
9407 to have their current values. REG can be stack_pointer_rtx if no
9408 other register's value needs to be fixed. */
9410 static void
9411 aarch64_emit_stack_tie (rtx reg)
9413 emit_insn (gen_stack_tie (reg, gen_int_mode (REGNO (reg), DImode)));
9416 /* Allocate POLY_SIZE bytes of stack space using TEMP1 and TEMP2 as scratch
9417 registers. If POLY_SIZE is not large enough to require a probe this function
9418 will only adjust the stack. When allocating the stack space
9419 FRAME_RELATED_P is then used to indicate if the allocation is frame related.
9420 FINAL_ADJUSTMENT_P indicates whether we are allocating the area below
9421 the saved registers. If we are then we ensure that any allocation
9422 larger than the ABI defined buffer needs a probe so that the
9423 invariant of having a 1KB buffer is maintained.
9425 We emit barriers after each stack adjustment to prevent optimizations from
9426 breaking the invariant that we never drop the stack more than a page. This
9427 invariant is needed to make it easier to correctly handle asynchronous
9428 events: if we were to drop the stack by more than a page and only
9429 probe the pages afterwards, a signal taken somewhere in between would
9430 leave the signal handler unaware of the state of the stack and unable to make any
9431 assumptions about which pages have been probed.
9433 FORCE_ISA_MODE is AARCH64_ISA_MODE_SM_ON if any variable component of
9434 POLY_SIZE is measured relative to the SME vector length instead of the
9435 current prevailing vector length. It is 0 otherwise. */
9437 static void
9438 aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
9439 poly_int64 poly_size,
9440 aarch64_isa_mode force_isa_mode,
9441 bool frame_related_p,
9442 bool final_adjustment_p)
9444 aarch64_frame &frame = cfun->machine->frame;
9445 HOST_WIDE_INT guard_size
9446 = 1 << param_stack_clash_protection_guard_size;
9447 HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD;
9448 HOST_WIDE_INT byte_sp_alignment = STACK_BOUNDARY / BITS_PER_UNIT;
9449 gcc_assert (multiple_p (poly_size, byte_sp_alignment));
9450 HOST_WIDE_INT min_probe_threshold
9451 = (final_adjustment_p
9452 ? guard_used_by_caller + byte_sp_alignment
9453 : guard_size - guard_used_by_caller);
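/* With the default 64KB guard and the 1KB caller-reserved region, this
   works out to a threshold of 64KB - 1KB = 63KB for the main allocations
   and 1KB + 16 bytes for the final adjustment below the saved registers,
   matching the "up to 63KB without probing" guarantee described in the
   frame-layout comment further down.  */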
9454 poly_int64 frame_size = frame.frame_size;
9456 /* We should always have a positive probe threshold. */
9457 gcc_assert (min_probe_threshold > 0);
9459 if (flag_stack_clash_protection && !final_adjustment_p)
9461 poly_int64 initial_adjust = frame.initial_adjust;
9462 poly_int64 sve_callee_adjust = frame.sve_callee_adjust;
9463 poly_int64 final_adjust = frame.final_adjust;
9465 if (known_eq (frame_size, 0))
9467 dump_stack_clash_frame_info (NO_PROBE_NO_FRAME, false);
9469 else if (known_lt (initial_adjust + sve_callee_adjust,
9470 guard_size - guard_used_by_caller)
9471 && known_lt (final_adjust, guard_used_by_caller))
9473 dump_stack_clash_frame_info (NO_PROBE_SMALL_FRAME, true);
9477 /* If SIZE is not large enough to require probing, just adjust the stack and
9478 exit. */
9479 if (known_lt (poly_size, min_probe_threshold)
9480 || !flag_stack_clash_protection)
9482 aarch64_sub_sp (temp1, temp2, poly_size, force_isa_mode,
9483 frame_related_p);
9484 return;
9487 HOST_WIDE_INT size;
9488 /* Handle the SVE non-constant case first. */
9489 if (!poly_size.is_constant (&size))
9491 if (dump_file)
9493 fprintf (dump_file, "Stack clash SVE prologue: ");
9494 print_dec (poly_size, dump_file);
9495 fprintf (dump_file, " bytes, dynamic probing will be required.\n");
9498 /* First calculate the number of bytes we're actually spilling.  */
9499 aarch64_add_offset (Pmode, temp1, CONST0_RTX (Pmode),
9500 poly_size, temp1, temp2, force_isa_mode,
9501 false, true);
9503 rtx_insn *insn = get_last_insn ();
9505 if (frame_related_p)
9507 /* This is done to provide unwinding information for the stack
9508 adjustments we're about to do; however, to prevent the optimizers
9509 from removing the R11 move and leaving the CFA note (which would be
9510 very wrong) we tie the old and new stack pointer together.
9511 The tie will expand to nothing but the optimizers will not touch
9512 the instruction. */
9513 rtx stack_ptr_copy = gen_rtx_REG (Pmode, STACK_CLASH_SVE_CFA_REGNUM);
9514 emit_move_insn (stack_ptr_copy, stack_pointer_rtx);
9515 aarch64_emit_stack_tie (stack_ptr_copy);
9517 /* We want the CFA independent of the stack pointer for the
9518 duration of the loop. */
9519 add_reg_note (insn, REG_CFA_DEF_CFA, stack_ptr_copy);
9520 RTX_FRAME_RELATED_P (insn) = 1;
9523 rtx probe_const = gen_int_mode (min_probe_threshold, Pmode);
9524 rtx guard_const = gen_int_mode (guard_size, Pmode);
9526 insn = emit_insn (gen_probe_sve_stack_clash (Pmode, stack_pointer_rtx,
9527 stack_pointer_rtx, temp1,
9528 probe_const, guard_const));
9530 /* Now reset the CFA register if needed. */
9531 if (frame_related_p)
9533 add_reg_note (insn, REG_CFA_DEF_CFA,
9534 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
9535 gen_int_mode (poly_size, Pmode)));
9536 RTX_FRAME_RELATED_P (insn) = 1;
9539 return;
9542 if (dump_file)
9543 fprintf (dump_file,
9544 "Stack clash AArch64 prologue: " HOST_WIDE_INT_PRINT_DEC
9545 " bytes, probing will be required.\n", size);
9547 /* Round size to the nearest multiple of guard_size, and calculate the
9548 residual as the difference between the original size and the rounded
9549 size. */
9550 HOST_WIDE_INT rounded_size = ROUND_DOWN (size, guard_size);
9551 HOST_WIDE_INT residual = size - rounded_size;
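/* Worked example, assuming the default 64KB guard and the 1KB
   caller-reserved region: for size == 200000 bytes, rounded_size is
   196608 (3 * 65536) and residual is 3392.  For a page count like this,
   the inline path below allocates each 64KB page and probes 1KB above
   the new stack pointer (three probes), provided the count does not
   exceed STACK_CLASH_MAX_UNROLL_PAGES; larger counts fall through to the
   loop-based code further down.  The 3392-byte residual is then handled
   at the end.  */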
9553 /* We can handle a small number of allocations/probes inline. Otherwise
9554 punt to a loop. */
9555 if (rounded_size <= STACK_CLASH_MAX_UNROLL_PAGES * guard_size)
9557 for (HOST_WIDE_INT i = 0; i < rounded_size; i += guard_size)
9559 aarch64_sub_sp (NULL, temp2, guard_size, force_isa_mode, true);
9560 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
9561 guard_used_by_caller));
9562 emit_insn (gen_blockage ());
9564 dump_stack_clash_frame_info (PROBE_INLINE, size != rounded_size);
9566 else
9568 /* Compute the ending address. */
9569 aarch64_add_offset (Pmode, temp1, stack_pointer_rtx, -rounded_size,
9570 temp1, NULL, force_isa_mode, false, true);
9571 rtx_insn *insn = get_last_insn ();
9573 /* For the initial allocation, we don't have a frame pointer
9574 set up, so we always need CFI notes. If we're doing the
9575 final allocation, then we may have a frame pointer, in which
9576 case it is the CFA, otherwise we need CFI notes.
9578 We can determine which allocation we are doing by looking at
9579 the value of FRAME_RELATED_P since the final allocations are not
9580 frame related. */
9581 if (frame_related_p)
9583 /* We want the CFA independent of the stack pointer for the
9584 duration of the loop. */
9585 add_reg_note (insn, REG_CFA_DEF_CFA,
9586 plus_constant (Pmode, temp1, rounded_size));
9587 RTX_FRAME_RELATED_P (insn) = 1;
9590 /* This allocates and probes the stack. Note that this re-uses some of
9591 the existing Ada stack protection code. However we are guaranteed not
9592 to enter the non-loop or residual branches of that code.
9594 The non-loop part won't be entered because if our allocation amount
9595 doesn't require a loop, the case above would handle it.
9597 The residual branch won't be entered because TEMP1 is a multiple of
9598 the allocation size, so the residual will always be 0.  As such, the only
9599 part we are actually using from that code is the loop setup. The
9600 actual probing is done in aarch64_output_probe_stack_range. */
9601 insn = emit_insn (gen_probe_stack_range (stack_pointer_rtx,
9602 stack_pointer_rtx, temp1));
9604 /* Now reset the CFA register if needed. */
9605 if (frame_related_p)
9607 add_reg_note (insn, REG_CFA_DEF_CFA,
9608 plus_constant (Pmode, stack_pointer_rtx, rounded_size));
9609 RTX_FRAME_RELATED_P (insn) = 1;
9612 emit_insn (gen_blockage ());
9613 dump_stack_clash_frame_info (PROBE_LOOP, size != rounded_size);
9616 /* Handle any residuals. Residuals of at least MIN_PROBE_THRESHOLD have to
9617 be probed. This maintains the requirement that each page is probed at
9618 least once. For initial probing we probe only if the allocation is
9619 more than GUARD_SIZE - buffer, and below the saved registers we probe
9620 if the amount is larger than buffer. GUARD_SIZE - buffer + buffer ==
9621 GUARD_SIZE.  This means that any allocation large enough to trigger
9622 a probe here gets at least one, and any allocation too small for this
9623 code to emit anything will already have had its page probed by the
9624 saving of FP/LR, either by this function or by a callee.  If
9625 we don't have any callees then we won't have more stack adjustments and so
9626 are still safe. */
9627 if (residual)
9629 gcc_assert (guard_used_by_caller + byte_sp_alignment <= size);
9631 /* If we're doing final adjustments, and we've done any full page
9632 allocations then any residual needs to be probed. */
9633 if (final_adjustment_p && rounded_size != 0)
9634 min_probe_threshold = 0;
9636 aarch64_sub_sp (temp1, temp2, residual, force_isa_mode, frame_related_p);
9637 if (residual >= min_probe_threshold)
9639 if (dump_file)
9640 fprintf (dump_file,
9641 "Stack clash AArch64 prologue residuals: "
9642 HOST_WIDE_INT_PRINT_DEC " bytes, probing will be required."
9643 "\n", residual);
9645 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
9646 guard_used_by_caller));
9647 emit_insn (gen_blockage ());
9652 /* Implement TARGET_EXTRA_LIVE_ON_ENTRY. */
9654 void
9655 aarch64_extra_live_on_entry (bitmap regs)
9657 if (TARGET_ZA)
9659 bitmap_set_bit (regs, LOWERING_REGNUM);
9660 bitmap_set_bit (regs, SME_STATE_REGNUM);
9661 bitmap_set_bit (regs, TPIDR2_SETUP_REGNUM);
9662 bitmap_set_bit (regs, ZA_FREE_REGNUM);
9663 bitmap_set_bit (regs, ZA_SAVED_REGNUM);
9665 /* The only time ZA can't have live contents on entry is when
9666 the function explicitly treats it as a pure output. */
9667 auto za_flags = aarch64_cfun_shared_flags ("za");
9668 if (za_flags != (AARCH64_STATE_SHARED | AARCH64_STATE_OUT))
9669 bitmap_set_bit (regs, ZA_REGNUM);
9671 /* Since ZT0 is call-clobbered, it is only live on input if
9672 it is explicitly shared, and is not a pure output. */
9673 auto zt0_flags = aarch64_cfun_shared_flags ("zt0");
9674 if (zt0_flags != 0
9675 && zt0_flags != (AARCH64_STATE_SHARED | AARCH64_STATE_OUT))
9676 bitmap_set_bit (regs, ZT0_REGNUM);
9680 /* Return 1 if the register is used by the epilogue. We need to say the
9681 return register is used, but only after epilogue generation is complete.
9682 Note that in the case of sibcalls, the values "used by the epilogue" are
9683 considered live at the start of the called function. */
9685 int
9686 aarch64_epilogue_uses (int regno)
9688 if (epilogue_completed)
9690 if (regno == LR_REGNUM)
9691 return 1;
9693 if (regno == LOWERING_REGNUM && TARGET_ZA)
9694 return 1;
9695 if (regno == SME_STATE_REGNUM && TARGET_ZA)
9696 return 1;
9697 if (regno == TPIDR2_SETUP_REGNUM && TARGET_ZA)
9698 return 1;
9699 /* If the function shares SME state with its caller, ensure that that
9700 data is not in the lazy save buffer on exit. */
9701 if (regno == ZA_SAVED_REGNUM && aarch64_cfun_incoming_pstate_za () != 0)
9702 return 1;
9703 if (regno == ZA_REGNUM && aarch64_cfun_shared_flags ("za") != 0)
9704 return 1;
9705 if (regno == ZT0_REGNUM && aarch64_cfun_shared_flags ("zt0") != 0)
9706 return 1;
9707 return 0;
9710 /* Implement TARGET_USE_LATE_PROLOGUE_EPILOGUE. */
9712 static bool
9713 aarch64_use_late_prologue_epilogue ()
9715 return aarch64_cfun_enables_pstate_sm ();
9718 /* The current function's frame has a save slot for the incoming state
9719 of SVCR. Return a legitimate memory for the slot, based on the hard
9720 frame pointer. */
9722 static rtx
9723 aarch64_old_svcr_mem ()
9725 gcc_assert (frame_pointer_needed
9726 && known_ge (cfun->machine->frame.old_svcr_offset, 0));
9727 rtx base = hard_frame_pointer_rtx;
9728 poly_int64 offset = (0
9729 /* hard fp -> bottom of frame. */
9730 - cfun->machine->frame.bytes_below_hard_fp
9731 /* bottom of frame -> save slot. */
9732 + cfun->machine->frame.old_svcr_offset);
9733 return gen_frame_mem (DImode, plus_constant (Pmode, base, offset));
9736 /* The current function's frame has a save slot for the incoming state
9737 of SVCR. Load the slot into register REGNO and return the register. */
9739 static rtx
9740 aarch64_read_old_svcr (unsigned int regno)
9742 rtx svcr = gen_rtx_REG (DImode, regno);
9743 emit_move_insn (svcr, aarch64_old_svcr_mem ());
9744 return svcr;
9747 /* Like the rtx version of aarch64_guard_switch_pstate_sm, but first
9748 load the incoming value of SVCR from its save slot into temporary
9749 register REGNO. */
9751 static rtx_insn *
9752 aarch64_guard_switch_pstate_sm (unsigned int regno,
9753 aarch64_isa_mode local_mode)
9755 rtx old_svcr = aarch64_read_old_svcr (regno);
9756 return aarch64_guard_switch_pstate_sm (old_svcr, local_mode);
9759 /* AArch64 stack frames generated by this compiler look like:
9761 +-------------------------------+
9763 | incoming stack arguments |
9765 +-------------------------------+
9766 | | <-- incoming stack pointer (aligned)
9767 | callee-allocated save area |
9768 | for register varargs |
9770 +-------------------------------+
9771 | local variables (1) | <-- frame_pointer_rtx
9773 +-------------------------------+
9774 | padding (1) |
9775 +-------------------------------+
9776 | callee-saved registers |
9777 +-------------------------------+
9778 | LR' |
9779 +-------------------------------+
9780 | FP' |
9781 +-------------------------------+ <-- hard_frame_pointer_rtx (aligned)
9782 | SVE vector registers |
9783 +-------------------------------+
9784 | SVE predicate registers |
9785 +-------------------------------+
9786 | local variables (2) |
9787 +-------------------------------+
9788 | padding (2) |
9789 +-------------------------------+
9790 | dynamic allocation |
9791 +-------------------------------+
9792 | padding |
9793 +-------------------------------+
9794 | outgoing stack arguments | <-- arg_pointer
9796 +-------------------------------+
9797 | | <-- stack_pointer_rtx (aligned)
9799 The regions marked (1) and (2) are mutually exclusive. (2) is used
9800 when aarch64_save_regs_above_locals_p is true.
9802 Dynamic stack allocations via alloca() decrease stack_pointer_rtx
9803 but leave frame_pointer_rtx and hard_frame_pointer_rtx
9804 unchanged.
9806 By default for stack-clash we assume the guard is at least 64KB, but this
9807 value is configurable to either 4KB or 64KB. We also force the guard size to
9808 be the same as the probing interval and both values are kept in sync.
9810 With those assumptions the callee can allocate up to 63KB (or 3KB depending
9811 on the guard size) of stack space without probing.
9813 When probing is needed, we emit a probe at the start of the prologue
9814 and every PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE bytes thereafter.
9816 We can also use register saves as probes. These are stored in
9817 sve_save_and_probe and hard_fp_save_and_probe.
9819 For outgoing arguments we probe if the size is larger than 1KB, such that
9820 the ABI specified buffer is maintained for the next callee.
9822 The following registers are reserved during frame layout and should not be
9823 used for any other purpose:
9825 - r11: Used by stack clash protection when SVE is enabled, and also
9826 as an anchor register when saving and restoring registers
9827 - r12(EP0) and r13(EP1): Used as temporaries for stack adjustment.
9828 - r14 and r15: Used for speculation tracking.
9829 - r16(IP0), r17(IP1): Used by indirect tailcalls.
9830 - r30(LR), r29(FP): Used by standard frame layout.
9832 These registers must be avoided in frame layout related code unless the
9833 explicit intention is to interact with one of the features listed above. */
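/* As a rough example of the default layout above (region (1) in use, no
   SVE state, padding ignored): a function that needs a frame pointer,
   saves x19 and x20, and has 32 bytes of locals plus 16 bytes of outgoing
   arguments would place, from high to low addresses, the 32 bytes of
   locals, the x19/x20 save slots, then the frame record with LR' above
   FP' (the hard frame pointer pointing at the saved FP'), and finally the
   16 bytes of outgoing arguments immediately above the final stack
   pointer.  */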
9835 /* Generate the prologue instructions for entry into a function.
9836 Establish the stack frame by decreasing the stack pointer with a
9837 properly calculated size and, if necessary, create a frame record
9838 filled with the values of LR and previous frame pointer. The
9839 current FP is also set up if it is in use. */
9841 void
9842 aarch64_expand_prologue (void)
9844 aarch64_frame &frame = cfun->machine->frame;
9845 poly_int64 frame_size = frame.frame_size;
9846 poly_int64 initial_adjust = frame.initial_adjust;
9847 HOST_WIDE_INT callee_adjust = frame.callee_adjust;
9848 poly_int64 final_adjust = frame.final_adjust;
9849 poly_int64 sve_callee_adjust = frame.sve_callee_adjust;
9850 unsigned reg1 = frame.wb_push_candidate1;
9851 unsigned reg2 = frame.wb_push_candidate2;
9852 bool emit_frame_chain = frame.emit_frame_chain;
9853 rtx_insn *insn;
9854 aarch64_isa_mode force_isa_mode = 0;
9855 if (aarch64_cfun_enables_pstate_sm ())
9856 force_isa_mode = AARCH64_ISA_MODE_SM_ON;
9858 if (flag_stack_clash_protection
9859 && known_eq (callee_adjust, 0)
9860 && known_lt (frame.reg_offset[VG_REGNUM], 0))
9862 /* Fold the SVE allocation into the initial allocation.
9863 We don't do this in aarch64_layout_frame to avoid pessimizing
9864 the epilogue code. */
9865 initial_adjust += sve_callee_adjust;
9866 sve_callee_adjust = 0;
9869 /* Sign return address for functions. */
9870 if (aarch64_return_address_signing_enabled ())
9872 switch (aarch64_ra_sign_key)
9874 case AARCH64_KEY_A:
9875 insn = emit_insn (gen_paciasp ());
9876 break;
9877 case AARCH64_KEY_B:
9878 insn = emit_insn (gen_pacibsp ());
9879 break;
9880 default:
9881 gcc_unreachable ();
9883 add_reg_note (insn, REG_CFA_NEGATE_RA_STATE, const0_rtx);
9884 RTX_FRAME_RELATED_P (insn) = 1;
9887 /* Push return address to shadow call stack. */
9888 if (frame.is_scs_enabled)
9889 emit_insn (gen_scs_push ());
9891 if (flag_stack_usage_info)
9892 current_function_static_stack_size = constant_lower_bound (frame_size);
9894 if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
9896 if (crtl->is_leaf && !cfun->calls_alloca)
9898 if (maybe_gt (frame_size, PROBE_INTERVAL)
9899 && maybe_gt (frame_size, get_stack_check_protect ()))
9900 aarch64_emit_probe_stack_range (get_stack_check_protect (),
9901 (frame_size
9902 - get_stack_check_protect ()));
9904 else if (maybe_gt (frame_size, 0))
9905 aarch64_emit_probe_stack_range (get_stack_check_protect (), frame_size);
9908 rtx tmp0_rtx = gen_rtx_REG (Pmode, EP0_REGNUM);
9909 rtx tmp1_rtx = gen_rtx_REG (Pmode, EP1_REGNUM);
9911 /* In theory we should never have both an initial adjustment
9912 and a callee save adjustment.  Verify that this is the case since the
9913 code below does not handle it for -fstack-clash-protection. */
9914 gcc_assert (known_eq (initial_adjust, 0) || callee_adjust == 0);
9916 /* Will only probe if the initial adjustment is larger than the guard
9917 less the amount of the guard reserved for use by the caller's
9918 outgoing args. */
9919 aarch64_allocate_and_probe_stack_space (tmp0_rtx, tmp1_rtx, initial_adjust,
9920 force_isa_mode, true, false);
9922 if (callee_adjust != 0)
9923 aarch64_push_regs (reg1, reg2, callee_adjust);
9925 /* The offset of the current SP from the bottom of the static frame. */
9926 poly_int64 bytes_below_sp = frame_size - initial_adjust - callee_adjust;
9928 if (emit_frame_chain)
9930 /* The offset of the frame chain record (if any) from the current SP. */
9931 poly_int64 chain_offset = (initial_adjust + callee_adjust
9932 - frame.bytes_above_hard_fp);
9933 gcc_assert (known_ge (chain_offset, 0));
9935 gcc_assert (reg1 == R29_REGNUM && reg2 == R30_REGNUM);
9936 if (callee_adjust == 0)
9937 aarch64_save_callee_saves (bytes_below_sp, frame.saved_gprs,
9938 false, false);
9939 else
9940 gcc_assert (known_eq (chain_offset, 0));
9941 aarch64_add_offset (Pmode, hard_frame_pointer_rtx,
9942 stack_pointer_rtx, chain_offset,
9943 tmp1_rtx, tmp0_rtx, force_isa_mode,
9944 frame_pointer_needed);
9945 if (frame_pointer_needed && !frame_size.is_constant ())
9947 /* Variable-sized frames need to describe the save slot
9948 address using DW_CFA_expression rather than DW_CFA_offset.
9949 This means that, without taking further action, the
9950 locations of the registers that we've already saved would
9951 remain based on the stack pointer even after we redefine
9952 the CFA based on the frame pointer. We therefore need new
9953 DW_CFA_expressions to re-express the save slots with addresses
9954 based on the frame pointer. */
9955 rtx_insn *insn = get_last_insn ();
9956 gcc_assert (RTX_FRAME_RELATED_P (insn));
9958 /* Add an explicit CFA definition if this was previously
9959 implicit. */
9960 if (!find_reg_note (insn, REG_CFA_ADJUST_CFA, NULL_RTX))
9962 rtx src = plus_constant (Pmode, stack_pointer_rtx, chain_offset);
9963 add_reg_note (insn, REG_CFA_ADJUST_CFA,
9964 gen_rtx_SET (hard_frame_pointer_rtx, src));
9967 /* Change the save slot expressions for the registers that
9968 we've already saved. */
9969 aarch64_add_cfa_expression (insn, regno_reg_rtx[reg2],
9970 hard_frame_pointer_rtx, UNITS_PER_WORD);
9971 aarch64_add_cfa_expression (insn, regno_reg_rtx[reg1],
9972 hard_frame_pointer_rtx, 0);
9974 aarch64_emit_stack_tie (hard_frame_pointer_rtx);
9977 aarch64_save_callee_saves (bytes_below_sp, frame.saved_gprs, true,
9978 emit_frame_chain);
9979 if (maybe_ge (frame.reg_offset[VG_REGNUM], 0))
9981 unsigned int saved_regs[] = { VG_REGNUM };
9982 aarch64_save_callee_saves (bytes_below_sp, saved_regs, true,
9983 emit_frame_chain);
9985 if (maybe_ne (sve_callee_adjust, 0))
9987 gcc_assert (!flag_stack_clash_protection
9988 || known_eq (initial_adjust, 0)
9989 /* The VG save isn't shrink-wrapped and so serves as
9990 a probe of the initial allocation. */
9991 || known_eq (frame.reg_offset[VG_REGNUM], bytes_below_sp));
9992 aarch64_allocate_and_probe_stack_space (tmp1_rtx, tmp0_rtx,
9993 sve_callee_adjust,
9994 force_isa_mode,
9995 !frame_pointer_needed, false);
9996 bytes_below_sp -= sve_callee_adjust;
9998 aarch64_save_callee_saves (bytes_below_sp, frame.saved_prs, true,
9999 emit_frame_chain);
10000 aarch64_save_callee_saves (bytes_below_sp, frame.saved_fprs, true,
10001 emit_frame_chain);
10003 /* We may need to probe the final adjustment if it is larger than the guard
10004 that is assumed by the callee.  */
10005 gcc_assert (known_eq (bytes_below_sp, final_adjust));
10006 aarch64_allocate_and_probe_stack_space (tmp1_rtx, tmp0_rtx, final_adjust,
10007 force_isa_mode,
10008 !frame_pointer_needed, true);
10009 if (emit_frame_chain && maybe_ne (final_adjust, 0))
10010 aarch64_emit_stack_tie (hard_frame_pointer_rtx);
10012 /* Save the incoming value of PSTATE.SM, if required. Code further
10013 down does this for locally-streaming functions. */
10014 if (known_ge (frame.old_svcr_offset, 0)
10015 && !aarch64_cfun_enables_pstate_sm ())
10017 rtx mem = aarch64_old_svcr_mem ();
10018 MEM_VOLATILE_P (mem) = 1;
10019 if (TARGET_SME)
10021 rtx reg = gen_rtx_REG (DImode, IP0_REGNUM);
10022 emit_insn (gen_aarch64_read_svcr (reg));
10023 emit_move_insn (mem, reg);
10025 else
10027 rtx old_r0 = NULL_RTX, old_r1 = NULL_RTX;
10028 auto &args = crtl->args.info;
10029 if (args.aapcs_ncrn > 0)
10031 old_r0 = gen_rtx_REG (DImode, PROBE_STACK_FIRST_REGNUM);
10032 emit_move_insn (old_r0, gen_rtx_REG (DImode, R0_REGNUM));
10034 if (args.aapcs_ncrn > 1)
10036 old_r1 = gen_rtx_REG (DImode, PROBE_STACK_SECOND_REGNUM);
10037 emit_move_insn (old_r1, gen_rtx_REG (DImode, R1_REGNUM));
10039 emit_insn (gen_aarch64_get_sme_state ());
10040 emit_move_insn (mem, gen_rtx_REG (DImode, R0_REGNUM));
10041 if (old_r0)
10042 emit_move_insn (gen_rtx_REG (DImode, R0_REGNUM), old_r0);
10043 if (old_r1)
10044 emit_move_insn (gen_rtx_REG (DImode, R1_REGNUM), old_r1);
10048 /* Enable PSTATE.SM, if required. */
10049 if (aarch64_cfun_enables_pstate_sm ())
10051 rtx_insn *guard_label = nullptr;
10052 if (known_ge (cfun->machine->frame.old_svcr_offset, 0))
10054 /* The current function is streaming-compatible. Save the
10055 original state of PSTATE.SM. */
10056 rtx svcr = gen_rtx_REG (DImode, IP0_REGNUM);
10057 emit_insn (gen_aarch64_read_svcr (svcr));
10058 emit_move_insn (aarch64_old_svcr_mem (), svcr);
10059 guard_label = aarch64_guard_switch_pstate_sm (svcr,
10060 AARCH64_ISA_MODE);
10062 aarch64_sme_mode_switch_regs args_switch;
10063 auto &args = crtl->args.info;
10064 for (unsigned int i = 0; i < args.num_sme_mode_switch_args; ++i)
10066 rtx x = args.sme_mode_switch_args[i];
10067 args_switch.add_reg (GET_MODE (x), REGNO (x));
10069 args_switch.emit_prologue ();
10070 emit_insn (gen_aarch64_smstart_sm ());
10071 args_switch.emit_epilogue ();
10072 if (guard_label)
10073 emit_label (guard_label);
10077 /* Return TRUE if we can use a simple_return insn.
10079 This function checks whether the function's stack frame is empty, which
10080 means no restore actions are needed.  The pro_and_epilogue pass uses
10081 this to check whether the shrink-wrapping optimization is feasible.  */
10083 bool
10084 aarch64_use_return_insn_p (void)
10086 if (!reload_completed)
10087 return false;
10089 if (crtl->profile)
10090 return false;
10092 return known_eq (cfun->machine->frame.frame_size, 0);
10095 /* Generate the epilogue instructions for returning from a function.
10096 This is almost exactly the reverse of the prolog sequence, except
10097 that we need to insert barriers to avoid scheduling loads that read
10098 from a deallocated stack, and we optimize the unwind records by
10099 emitting them all together if possible. */
10100 void
10101 aarch64_expand_epilogue (rtx_call_insn *sibcall)
10103 aarch64_frame &frame = cfun->machine->frame;
10104 poly_int64 initial_adjust = frame.initial_adjust;
10105 HOST_WIDE_INT callee_adjust = frame.callee_adjust;
10106 poly_int64 final_adjust = frame.final_adjust;
10107 poly_int64 sve_callee_adjust = frame.sve_callee_adjust;
10108 poly_int64 bytes_below_hard_fp = frame.bytes_below_hard_fp;
10109 unsigned reg1 = frame.wb_pop_candidate1;
10110 unsigned reg2 = frame.wb_pop_candidate2;
10111 rtx cfi_ops = NULL;
10112 rtx_insn *insn;
10113 /* A stack clash protection prologue may not have left EP0_REGNUM or
10114 EP1_REGNUM in a usable state. The same is true for allocations
10115 with an SVE component, since we then need both temporary registers
10116 for each allocation. For stack clash we are in a usable state if
10117 the adjustment is less than GUARD_SIZE - GUARD_USED_BY_CALLER. */
10118 HOST_WIDE_INT guard_size
10119 = 1 << param_stack_clash_protection_guard_size;
10120 HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD;
10121 aarch64_isa_mode force_isa_mode = 0;
10122 if (aarch64_cfun_enables_pstate_sm ())
10123 force_isa_mode = AARCH64_ISA_MODE_SM_ON;
10125 /* We can re-use the registers when:
10127 (a) the deallocation amount is the same as the corresponding
10128 allocation amount (which is false if we combine the initial
10129 and SVE callee save allocations in the prologue); and
10131 (b) the allocation amount doesn't need a probe (which is false
10132 if the amount is guard_size - guard_used_by_caller or greater).
10134 In such situations the register should remain live with the correct
10135 value. */
10136 bool can_inherit_p = (initial_adjust.is_constant ()
10137 && final_adjust.is_constant ()
10138 && (!flag_stack_clash_protection
10139 || (known_lt (initial_adjust,
10140 guard_size - guard_used_by_caller)
10141 && known_eq (sve_callee_adjust, 0))));
10143 /* We need to add a memory barrier to prevent reads from the deallocated stack.  */
10144 bool need_barrier_p
10145 = maybe_ne (get_frame_size ()
10146 + frame.saved_varargs_size, 0);
10148 /* Reset PSTATE.SM, if required. */
10149 if (aarch64_cfun_enables_pstate_sm ())
10151 rtx_insn *guard_label = nullptr;
10152 if (known_ge (cfun->machine->frame.old_svcr_offset, 0))
10153 guard_label = aarch64_guard_switch_pstate_sm (IP0_REGNUM,
10154 AARCH64_ISA_MODE);
10155 aarch64_sme_mode_switch_regs return_switch;
10156 if (sibcall)
10157 return_switch.add_call_args (sibcall);
10158 else if (crtl->return_rtx && REG_P (crtl->return_rtx))
10159 return_switch.add_reg (GET_MODE (crtl->return_rtx),
10160 REGNO (crtl->return_rtx));
10161 return_switch.emit_prologue ();
10162 emit_insn (gen_aarch64_smstop_sm ());
10163 return_switch.emit_epilogue ();
10164 if (guard_label)
10165 emit_label (guard_label);
10168 /* Emit a barrier to prevent loads from a deallocated stack. */
10169 if (maybe_gt (final_adjust, crtl->outgoing_args_size)
10170 || cfun->calls_alloca
10171 || crtl->calls_eh_return)
10173 aarch64_emit_stack_tie (stack_pointer_rtx);
10174 need_barrier_p = false;
10177 /* Restore the stack pointer from the frame pointer if it may not
10178 be the same as the stack pointer. */
10179 rtx tmp0_rtx = gen_rtx_REG (Pmode, EP0_REGNUM);
10180 rtx tmp1_rtx = gen_rtx_REG (Pmode, EP1_REGNUM);
10181 if (frame_pointer_needed
10182 && (maybe_ne (final_adjust, 0) || cfun->calls_alloca))
10183 /* If writeback is used when restoring callee-saves, the CFA
10184 is restored on the instruction doing the writeback. */
10185 aarch64_add_offset (Pmode, stack_pointer_rtx,
10186 hard_frame_pointer_rtx,
10187 -bytes_below_hard_fp + final_adjust,
10188 tmp1_rtx, tmp0_rtx, force_isa_mode,
10189 callee_adjust == 0);
10190 else
10191 /* The case where we need to re-use the register here is very rare, so
10192 avoid the complicated condition and just always emit a move if the
10193 immediate doesn't fit. */
10194 aarch64_add_sp (tmp1_rtx, tmp0_rtx, final_adjust, force_isa_mode, true);
10196 /* Restore the vector registers before the predicate registers,
10197 so that we can use P4 as a temporary for big-endian SVE frames. */
10198 aarch64_restore_callee_saves (final_adjust, frame.saved_fprs, &cfi_ops);
10199 aarch64_restore_callee_saves (final_adjust, frame.saved_prs, &cfi_ops);
10200 if (maybe_ne (sve_callee_adjust, 0))
10201 aarch64_add_sp (NULL_RTX, NULL_RTX, sve_callee_adjust,
10202 force_isa_mode, true);
10204 /* When the shadow call stack is enabled, the scs_pop in the epilogue
10205 restores x30, so we don't need to restore x30 again in the
10206 traditional way.  */
10207 aarch64_restore_callee_saves (final_adjust + sve_callee_adjust,
10208 frame.saved_gprs, &cfi_ops);
10210 if (need_barrier_p)
10211 aarch64_emit_stack_tie (stack_pointer_rtx);
10213 if (callee_adjust != 0)
10214 aarch64_pop_regs (reg1, reg2, callee_adjust, &cfi_ops);
10216 /* If we have no register restore information, the CFA must have been
10217 defined in terms of the stack pointer since the end of the prologue. */
10218 gcc_assert (cfi_ops || !frame_pointer_needed);
10220 if (cfi_ops && (callee_adjust != 0 || maybe_gt (initial_adjust, 65536)))
10222 /* Emit delayed restores and set the CFA to be SP + initial_adjust. */
10223 insn = get_last_insn ();
10224 rtx new_cfa = plus_constant (Pmode, stack_pointer_rtx, initial_adjust);
10225 REG_NOTES (insn) = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops);
10226 RTX_FRAME_RELATED_P (insn) = 1;
10227 cfi_ops = NULL;
10230 /* The liveness of EP0_REGNUM cannot be trusted across function calls either,
10231 so restrict the emit_move optimization to leaf functions.  */
10232 aarch64_add_sp (tmp0_rtx, tmp1_rtx, initial_adjust, force_isa_mode,
10233 (!can_inherit_p || !crtl->is_leaf
10234 || df_regs_ever_live_p (EP0_REGNUM)));
10236 if (cfi_ops)
10238 /* Emit delayed restores and reset the CFA to be SP. */
10239 insn = get_last_insn ();
10240 cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, stack_pointer_rtx, cfi_ops);
10241 REG_NOTES (insn) = cfi_ops;
10242 RTX_FRAME_RELATED_P (insn) = 1;
10245 /* Pop return address from shadow call stack. */
10246 if (frame.is_scs_enabled)
10248 machine_mode mode = aarch64_reg_save_mode (R30_REGNUM);
10249 rtx reg = gen_rtx_REG (mode, R30_REGNUM);
10251 insn = emit_insn (gen_scs_pop ());
10252 add_reg_note (insn, REG_CFA_RESTORE, reg);
10253 RTX_FRAME_RELATED_P (insn) = 1;
10256 /* Stack adjustment for exception handler. */
10257 if (crtl->calls_eh_return && !sibcall)
10259 /* If the EH_RETURN_TAKEN_RTX flag is set then we need
10260 to unwind the stack and jump to the handler; otherwise we
10261 skip this eh_return logic and continue with normal
10262 return after the label. We have already reset the CFA
10263 to be SP; letting the CFA move during this adjustment
10264 is just as correct as retaining the CFA from the body
10265 of the function. Therefore, do nothing special. */
10266 rtx_code_label *label = gen_label_rtx ();
10267 rtx x = aarch64_gen_compare_zero_and_branch (EQ, EH_RETURN_TAKEN_RTX,
10268 label);
10269 rtx jump = emit_jump_insn (x);
10270 JUMP_LABEL (jump) = label;
10271 LABEL_NUSES (label)++;
10272 emit_insn (gen_add2_insn (stack_pointer_rtx,
10273 EH_RETURN_STACKADJ_RTX));
10274 emit_jump_insn (gen_indirect_jump (EH_RETURN_HANDLER_RTX));
10275 emit_barrier ();
10276 emit_label (label);
10279 /* We prefer to emit the combined return/authenticate instruction RETAA,
10280 however there are two cases in which we must instead emit an explicit
10281 authentication instruction.
10283 1) Sibcalls don't return in a normal way, so if we're about to call one
10284 we must authenticate.
10286 2) The RETAA instruction is not available without FEAT_PAuth, so if we
10287 are generating code for !TARGET_PAUTH we can't use it and must
10288 explicitly authenticate.  */
10290 if (aarch64_return_address_signing_enabled ()
10291 && (sibcall || !TARGET_PAUTH))
10293 switch (aarch64_ra_sign_key)
10295 case AARCH64_KEY_A:
10296 insn = emit_insn (gen_autiasp ());
10297 break;
10298 case AARCH64_KEY_B:
10299 insn = emit_insn (gen_autibsp ());
10300 break;
10301 default:
10302 gcc_unreachable ();
10304 add_reg_note (insn, REG_CFA_NEGATE_RA_STATE, const0_rtx);
10305 RTX_FRAME_RELATED_P (insn) = 1;
10308 emit_use (gen_rtx_REG (DImode, LR_REGNUM));
10309 if (!sibcall)
10310 emit_jump_insn (ret_rtx);
10313 /* Output code to add DELTA to the first argument, and then jump
10314 to FUNCTION. Used for C++ multiple inheritance. */
10315 static void
10316 aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
10317 HOST_WIDE_INT delta,
10318 HOST_WIDE_INT vcall_offset,
10319 tree function)
10321 /* The this pointer is always in x0. Note that this differs from
10322 Arm where the this pointer may be bumped to r1 if r0 is required
10323 to return a pointer to an aggregate. On AArch64 a result value
10324 pointer will be in x8. */
10325 int this_regno = R0_REGNUM;
10326 rtx this_rtx, temp0, temp1, addr, funexp;
10327 rtx_insn *insn;
10328 const char *fnname = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (thunk));
10330 if (aarch_bti_enabled ())
10331 emit_insn (gen_bti_c());
10333 reload_completed = 1;
10334 emit_note (NOTE_INSN_PROLOGUE_END);
10336 this_rtx = gen_rtx_REG (Pmode, this_regno);
10337 temp0 = gen_rtx_REG (Pmode, EP0_REGNUM);
10338 temp1 = gen_rtx_REG (Pmode, EP1_REGNUM);
10340 if (vcall_offset == 0)
10341 aarch64_add_offset (Pmode, this_rtx, this_rtx, delta, temp1, temp0,
10342 0, false);
10343 else
10345 gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);
10347 addr = this_rtx;
10348 if (delta != 0)
10350 if (delta >= -256 && delta < 256)
10351 addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx,
10352 plus_constant (Pmode, this_rtx, delta));
10353 else
10354 aarch64_add_offset (Pmode, this_rtx, this_rtx, delta,
10355 temp1, temp0, 0, false);
10358 if (Pmode == ptr_mode)
10359 aarch64_emit_move (temp0, gen_rtx_MEM (ptr_mode, addr));
10360 else
10361 aarch64_emit_move (temp0,
10362 gen_rtx_ZERO_EXTEND (Pmode,
10363 gen_rtx_MEM (ptr_mode, addr)));
10365 if (vcall_offset >= -256 && vcall_offset < 4096 * POINTER_BYTES)
10366 addr = plus_constant (Pmode, temp0, vcall_offset);
10367 else
10369 aarch64_internal_mov_immediate (temp1, GEN_INT (vcall_offset), true,
10370 Pmode);
10371 addr = gen_rtx_PLUS (Pmode, temp0, temp1);
10374 if (Pmode == ptr_mode)
10375 aarch64_emit_move (temp1, gen_rtx_MEM (ptr_mode,addr));
10376 else
10377 aarch64_emit_move (temp1,
10378 gen_rtx_SIGN_EXTEND (Pmode,
10379 gen_rtx_MEM (ptr_mode, addr)));
10381 emit_insn (gen_add2_insn (this_rtx, temp1));
10384 /* Generate a tail call to the target function. */
10385 if (!TREE_USED (function))
10387 assemble_external (function);
10388 TREE_USED (function) = 1;
10390 funexp = XEXP (DECL_RTL (function), 0);
10391 funexp = gen_rtx_MEM (FUNCTION_MODE, funexp);
10392 auto isa_mode = aarch64_fntype_isa_mode (TREE_TYPE (function));
10393 auto pcs_variant = arm_pcs (fndecl_abi (function).id ());
10394 bool ir = lookup_attribute ("indirect_return",
10395 TYPE_ATTRIBUTES (TREE_TYPE (function)));
10396 rtx callee_abi = aarch64_gen_callee_cookie (isa_mode, pcs_variant, ir);
10397 insn = emit_call_insn (gen_sibcall (funexp, const0_rtx, callee_abi));
10398 SIBLING_CALL_P (insn) = 1;
10400 insn = get_insns ();
10401 shorten_branches (insn);
10403 assemble_start_function (thunk, fnname);
10404 final_start_function (insn, file, 1);
10405 final (insn, file, 1);
10406 final_end_function ();
10407 assemble_end_function (thunk, fnname);
10409 /* Stop pretending to be a post-reload pass. */
10410 reload_completed = 0;
10413 static bool
10414 aarch64_tls_referenced_p (rtx x)
10416 if (!TARGET_HAVE_TLS)
10417 return false;
10418 subrtx_iterator::array_type array;
10419 FOR_EACH_SUBRTX (iter, array, x, ALL)
10421 const_rtx x = *iter;
10422 if (SYMBOL_REF_P (x) && SYMBOL_REF_TLS_MODEL (x) != 0)
10423 return true;
10424 /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
10425 TLS offsets, not real symbol references. */
10426 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
10427 iter.skip_subrtxes ();
10429 return false;
10433 static bool
10434 aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
10436 if (GET_CODE (x) == HIGH)
10437 return true;
10439 /* There's no way to calculate VL-based values using relocations. */
10440 subrtx_iterator::array_type array;
10441 HOST_WIDE_INT factor;
10442 FOR_EACH_SUBRTX (iter, array, x, ALL)
10443 if (GET_CODE (*iter) == CONST_POLY_INT
10444 || aarch64_sme_vq_unspec_p (x, &factor))
10445 return true;
10447 poly_int64 offset;
10448 rtx base = strip_offset_and_salt (x, &offset);
10449 if (SYMBOL_REF_P (base) || LABEL_REF_P (base))
10451 /* We checked for POLY_INT_CST offsets above. */
10452 if (aarch64_classify_symbol (base, offset.to_constant ())
10453 != SYMBOL_FORCE_TO_MEM)
10454 return true;
10455 else
10456 /* Avoid generating a 64-bit relocation in ILP32; leave it
10457 to aarch64_expand_mov_immediate to handle properly. */
10458 return mode != ptr_mode;
10461 return aarch64_tls_referenced_p (x);
10464 /* Implement TARGET_CASE_VALUES_THRESHOLD.
10465 The expansion for a table switch is quite expensive due to the number
10466 of instructions, the table lookup and the hard-to-predict indirect jump.
10467 When optimizing for speed with -O3, use the per-core tuning if it is
10468 set; otherwise use tables for >= 11 cases as a trade-off between size and
10469 performance. When optimizing for size, use 8 for the smallest code size. */
10471 static unsigned int
10472 aarch64_case_values_threshold (void)
10474 /* Use the specified limit for the number of cases before using jump
10475 tables at higher optimization levels. */
10476 if (optimize > 2
10477 && aarch64_tune_params.max_case_values != 0)
10478 return aarch64_tune_params.max_case_values;
10479 else
10480 return optimize_size ? 8 : 11;
10483 /* Return true if register REGNO is a valid index register.
10484 STRICT_P is true if REG_OK_STRICT is in effect. */
10486 bool
10487 aarch64_regno_ok_for_index_p (int regno, bool strict_p)
10489 if (!HARD_REGISTER_NUM_P (regno))
10491 if (!strict_p)
10492 return true;
10494 if (!reg_renumber)
10495 return false;
10497 regno = reg_renumber[regno];
10499 return GP_REGNUM_P (regno);
10502 /* Return true if register REGNO is a valid base register for mode MODE.
10503 STRICT_P is true if REG_OK_STRICT is in effect. */
10505 bool
10506 aarch64_regno_ok_for_base_p (int regno, bool strict_p)
10508 if (!HARD_REGISTER_NUM_P (regno))
10510 if (!strict_p)
10511 return true;
10513 if (!reg_renumber)
10514 return false;
10516 regno = reg_renumber[regno];
10519 /* The fake registers will be eliminated to either the stack or
10520 hard frame pointer, both of which are usually valid base registers.
10521 Reload deals with the cases where the eliminated form isn't valid. */
10522 return (GP_REGNUM_P (regno)
10523 || regno == SP_REGNUM
10524 || regno == FRAME_POINTER_REGNUM
10525 || regno == ARG_POINTER_REGNUM);
10528 /* Return true if X is a valid base register for mode MODE.
10529 STRICT_P is true if REG_OK_STRICT is in effect. */
10531 static bool
10532 aarch64_base_register_rtx_p (rtx x, bool strict_p)
10534 if (!strict_p
10535 && SUBREG_P (x)
10536 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (x))])
10537 x = SUBREG_REG (x);
10539 return (REG_P (x) && aarch64_regno_ok_for_base_p (REGNO (x), strict_p));
10542 /* Return true if address offset is a valid index. If it is, fill in INFO
10543 appropriately. STRICT_P is true if REG_OK_STRICT is in effect. */
10545 static bool
10546 aarch64_classify_index (struct aarch64_address_info *info, rtx x,
10547 machine_mode mode, bool strict_p)
10549 enum aarch64_address_type type;
10550 rtx index;
10551 int shift;
10553 /* (reg:P) */
10554 if ((REG_P (x) || SUBREG_P (x))
10555 && GET_MODE (x) == Pmode)
10557 type = ADDRESS_REG_REG;
10558 index = x;
10559 shift = 0;
10561 /* (sign_extend:DI (reg:SI)) */
10562 else if ((GET_CODE (x) == SIGN_EXTEND
10563 || GET_CODE (x) == ZERO_EXTEND)
10564 && GET_MODE (x) == DImode
10565 && GET_MODE (XEXP (x, 0)) == SImode)
10567 type = (GET_CODE (x) == SIGN_EXTEND)
10568 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
10569 index = XEXP (x, 0);
10570 shift = 0;
10572 /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
10573 else if (GET_CODE (x) == MULT
10574 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
10575 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
10576 && GET_MODE (XEXP (x, 0)) == DImode
10577 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
10578 && CONST_INT_P (XEXP (x, 1)))
10580 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
10581 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
10582 index = XEXP (XEXP (x, 0), 0);
10583 shift = exact_log2 (INTVAL (XEXP (x, 1)));
10585 /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
10586 else if (GET_CODE (x) == ASHIFT
10587 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
10588 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
10589 && GET_MODE (XEXP (x, 0)) == DImode
10590 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
10591 && CONST_INT_P (XEXP (x, 1)))
10593 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
10594 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
10595 index = XEXP (XEXP (x, 0), 0);
10596 shift = INTVAL (XEXP (x, 1));
10598 /* (and:DI (mult:DI (reg:DI) (const_int scale))
10599 (const_int 0xffffffff<<shift)) */
10600 else if (GET_CODE (x) == AND
10601 && GET_MODE (x) == DImode
10602 && GET_CODE (XEXP (x, 0)) == MULT
10603 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
10604 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
10605 && CONST_INT_P (XEXP (x, 1)))
10607 type = ADDRESS_REG_UXTW;
10608 index = XEXP (XEXP (x, 0), 0);
10609 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
10610 /* Avoid undefined code dealing with shift being -1. */
10611 if (shift != -1
10612 && INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
10613 shift = -1;
10615 /* (and:DI (ashift:DI (reg:DI) (const_int shift))
10616 (const_int 0xffffffff<<shift)) */
10617 else if (GET_CODE (x) == AND
10618 && GET_MODE (x) == DImode
10619 && GET_CODE (XEXP (x, 0)) == ASHIFT
10620 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
10621 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
10622 && CONST_INT_P (XEXP (x, 1)))
10624 type = ADDRESS_REG_UXTW;
10625 index = XEXP (XEXP (x, 0), 0);
10626 shift = INTVAL (XEXP (XEXP (x, 0), 1));
10627 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
10628 shift = -1;
10630 /* (mult:P (reg:P) (const_int scale)) */
10631 else if (GET_CODE (x) == MULT
10632 && GET_MODE (x) == Pmode
10633 && GET_MODE (XEXP (x, 0)) == Pmode
10634 && CONST_INT_P (XEXP (x, 1)))
10636 type = ADDRESS_REG_REG;
10637 index = XEXP (x, 0);
10638 shift = exact_log2 (INTVAL (XEXP (x, 1)));
10640 /* (ashift:P (reg:P) (const_int shift)) */
10641 else if (GET_CODE (x) == ASHIFT
10642 && GET_MODE (x) == Pmode
10643 && GET_MODE (XEXP (x, 0)) == Pmode
10644 && CONST_INT_P (XEXP (x, 1)))
10646 type = ADDRESS_REG_REG;
10647 index = XEXP (x, 0);
10648 shift = INTVAL (XEXP (x, 1));
10650 else
10651 return false;
10653 if (!strict_p
10654 && SUBREG_P (index)
10655 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (index))])
10656 index = SUBREG_REG (index);
10658 auto vec_flags = aarch64_classify_vector_memory_mode (mode);
10659 if (vec_flags & VEC_SVE_DATA)
10661 if (type != ADDRESS_REG_REG
10662 || (1 << shift) != GET_MODE_UNIT_SIZE (mode))
10663 return false;
10665 else
10667 if (shift != 0
10668 && !(IN_RANGE (shift, 1, 3)
10669 && known_eq (1 << shift, GET_MODE_SIZE (mode))))
10670 return false;
10673 if (REG_P (index)
10674 && aarch64_regno_ok_for_index_p (REGNO (index), strict_p))
10676 info->type = type;
10677 info->offset = index;
10678 info->shift = shift;
10679 return true;
10682 return false;
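/* For example, for an SImode access the index expression
	(mult:DI (sign_extend:DI (reg:SI <Rm>)) (const_int 4))
   is classified above as ADDRESS_REG_SXTW with shift 2, which corresponds
   to an address of the form [<Rn>, <Rm>, sxtw #2] once it is combined with
   a base register by aarch64_classify_address.  */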
10685 /* Return true if MODE is one of the modes for which we
10686 support LDP/STP operations. */
10688 static bool
10689 aarch64_mode_valid_for_sched_fusion_p (machine_mode mode)
10691 return mode == SImode || mode == DImode
10692 || mode == SFmode || mode == DFmode
10693 || mode == SDmode || mode == DDmode
10694 || (aarch64_vector_mode_supported_p (mode)
10695 && (known_eq (GET_MODE_SIZE (mode), 8)
10696 || known_eq (GET_MODE_SIZE (mode), 16)));
10699 /* Return true if REGNO is a virtual pointer register, or an eliminable
10700 "soft" frame register. Like REGNO_PTR_FRAME_P except that we don't
10701 include stack_pointer or hard_frame_pointer. */
10702 static bool
10703 virt_or_elim_regno_p (unsigned regno)
10705 return ((regno >= FIRST_VIRTUAL_REGISTER
10706 && regno <= LAST_VIRTUAL_POINTER_REGISTER)
10707 || regno == FRAME_POINTER_REGNUM
10708 || regno == ARG_POINTER_REGNUM);
10711 /* Return true if X is a valid address of type TYPE for machine mode MODE.
10712 If it is, fill in INFO appropriately. STRICT_P is true if
10713 REG_OK_STRICT is in effect. */
10715 bool
10716 aarch64_classify_address (struct aarch64_address_info *info,
10717 rtx x, machine_mode mode, bool strict_p,
10718 aarch64_addr_query_type type)
10720 enum rtx_code code = GET_CODE (x);
10721 rtx op0, op1;
10722 poly_int64 offset;
10724 HOST_WIDE_INT const_size;
10726 /* Whether a vector mode is partial doesn't affect address legitimacy.
10727 Partial vectors like VNx8QImode allow the same indexed addressing
10728 mode and MUL VL addressing mode as full vectors like VNx16QImode;
10729 in both cases, MUL VL counts multiples of GET_MODE_SIZE. */
10730 unsigned int vec_flags = aarch64_classify_vector_memory_mode (mode);
10731 vec_flags &= ~VEC_PARTIAL;
10733 /* We use load/store pair for all large int mode load/stores.
10734 TI/TF/TDmode may also use a load/store pair. */
10735 bool advsimd_struct_p = (vec_flags == (VEC_ADVSIMD | VEC_STRUCT));
10736 bool load_store_pair_p = (type == ADDR_QUERY_LDP_STP
10737 || type == ADDR_QUERY_LDP_STP_N
10738 || mode == TImode
10739 || mode == TFmode
10740 || mode == TDmode
10741 || advsimd_struct_p);
10742 /* For ADDR_QUERY_LDP_STP_N, the incoming mode corresponds to the actual
10743 size of the memory being loaded/stored, and the mode used for the
10744 addressing-mode check is half of that size. */
10745 if (type == ADDR_QUERY_LDP_STP_N)
10747 if (known_eq (GET_MODE_SIZE (mode), 32))
10748 mode = V16QImode;
10749 else if (known_eq (GET_MODE_SIZE (mode), 16))
10750 mode = DFmode;
10751 else if (known_eq (GET_MODE_SIZE (mode), 8))
10752 mode = SFmode;
10753 else
10754 return false;
10756 /* This isn't really an Advanced SIMD struct mode, but a mode
10757 used to represent the complete mem in a load/store pair. */
10758 advsimd_struct_p = false;
10761 bool allow_reg_index_p = (!load_store_pair_p
10762 && ((vec_flags == 0
10763 && known_lt (GET_MODE_SIZE (mode), 16))
10764 || vec_flags == VEC_ADVSIMD
10765 || vec_flags & VEC_SVE_DATA));
10767 /* For SVE, only accept [Rn], [Rn, #offset, MUL VL] and [Rn, Rm, LSL #shift].
10768 The latter is not valid for SVE predicates, and that's rejected through
10769 allow_reg_index_p above. */
10770 if ((vec_flags & (VEC_SVE_DATA | VEC_SVE_PRED)) != 0
10771 && (code != REG && code != PLUS))
10772 return false;
10774 gcc_checking_assert (GET_MODE (x) == VOIDmode
10775 || SCALAR_INT_MODE_P (GET_MODE (x)));
10777 switch (code)
10779 case REG:
10780 case SUBREG:
10781 info->type = ADDRESS_REG_IMM;
10782 info->base = x;
10783 info->offset = const0_rtx;
10784 info->const_offset = 0;
10785 return aarch64_base_register_rtx_p (x, strict_p);
10787 case PLUS:
10788 op0 = XEXP (x, 0);
10789 op1 = XEXP (x, 1);
10791 if (! strict_p
10792 && REG_P (op0)
10793 && virt_or_elim_regno_p (REGNO (op0))
10794 && poly_int_rtx_p (op1, &offset))
10796 info->type = ADDRESS_REG_IMM;
10797 info->base = op0;
10798 info->offset = op1;
10799 info->const_offset = offset;
10801 return true;
10804 if (maybe_ne (GET_MODE_SIZE (mode), 0)
10805 && aarch64_base_register_rtx_p (op0, strict_p)
10806 && poly_int_rtx_p (op1, &offset))
10808 info->type = ADDRESS_REG_IMM;
10809 info->base = op0;
10810 info->offset = op1;
10811 info->const_offset = offset;
10813 /* TImode, TFmode and TDmode values are allowed in both pairs of X
10814 registers and individual Q registers. The available
10815 address modes are:
10816 X,X: 7-bit signed scaled offset
10817 Q: 9-bit signed offset
10818 We conservatively require an offset representable in either mode.
10819 When performing the check for pairs of X registers i.e. LDP/STP
10820 pass down DImode since that is the natural size of the LDP/STP
10821 instruction memory accesses. */
10822 if (mode == TImode || mode == TFmode || mode == TDmode)
10823 return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
10824 && (aarch64_offset_9bit_signed_unscaled_p (mode, offset)
10825 || offset_12bit_unsigned_scaled_p (mode, offset)));
10827 if (mode == V8DImode)
10828 return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
10829 && aarch64_offset_7bit_signed_scaled_p (DImode, offset + 48));
10831 /* A 7-bit offset check because OImode will emit an ldp/stp
10832 instruction (only !TARGET_SIMD or big-endian will get here).
10833 For ldp/stp instructions, the offset is scaled by the size of a
10834 single element of the pair. */
10835 if (aarch64_advsimd_partial_struct_mode_p (mode)
10836 && known_eq (GET_MODE_SIZE (mode), 16))
10837 return aarch64_offset_7bit_signed_scaled_p (DImode, offset);
10838 if (aarch64_advsimd_full_struct_mode_p (mode)
10839 && known_eq (GET_MODE_SIZE (mode), 32))
10840 return aarch64_offset_7bit_signed_scaled_p (TImode, offset);
10842 /* Three 9/12-bit offset checks because CImode will emit three
10843 ldr/str instructions (only !TARGET_SIMD or big-endian will
10844 get here). */
10845 if (aarch64_advsimd_partial_struct_mode_p (mode)
10846 && known_eq (GET_MODE_SIZE (mode), 24))
10847 return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
10848 && (aarch64_offset_9bit_signed_unscaled_p (DImode,
10849 offset + 16)
10850 || offset_12bit_unsigned_scaled_p (DImode,
10851 offset + 16)));
10852 if (aarch64_advsimd_full_struct_mode_p (mode)
10853 && known_eq (GET_MODE_SIZE (mode), 48))
10854 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
10855 && (aarch64_offset_9bit_signed_unscaled_p (TImode,
10856 offset + 32)
10857 || offset_12bit_unsigned_scaled_p (TImode,
10858 offset + 32)));
10860 /* Two 7-bit offset checks because XImode will emit two ldp/stp
10861 instructions (only big-endian will get here). */
10862 if (aarch64_advsimd_partial_struct_mode_p (mode)
10863 && known_eq (GET_MODE_SIZE (mode), 32))
10864 return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
10865 && aarch64_offset_7bit_signed_scaled_p (DImode,
10866 offset + 16));
10867 if (aarch64_advsimd_full_struct_mode_p (mode)
10868 && known_eq (GET_MODE_SIZE (mode), 64))
10869 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
10870 && aarch64_offset_7bit_signed_scaled_p (TImode,
10871 offset + 32));
10873 /* Make "m" use the LD1 offset range for SVE data modes, so
10874 that pre-RTL optimizers like ivopts will work to that range
10875 instead of the wider LDR/STR range. */
10876 if (vec_flags == VEC_SVE_DATA)
10877 return (type == ADDR_QUERY_M
10878 ? offset_4bit_signed_scaled_p (mode, offset)
10879 : offset_9bit_signed_scaled_p (mode, offset));
10881 if (vec_flags == (VEC_SVE_DATA | VEC_STRUCT))
10883 poly_int64 end_offset = (offset
10884 + GET_MODE_SIZE (mode)
10885 - BYTES_PER_SVE_VECTOR);
10886 return (type == ADDR_QUERY_M
10887 ? offset_4bit_signed_scaled_p (mode, offset)
10888 : (offset_9bit_signed_scaled_p (SVE_BYTE_MODE, offset)
10889 && offset_9bit_signed_scaled_p (SVE_BYTE_MODE,
10890 end_offset)));
10893 if (vec_flags == VEC_SVE_PRED)
10894 return offset_9bit_signed_scaled_p (mode, offset);
10896 if (vec_flags == (VEC_SVE_PRED | VEC_STRUCT))
10898 poly_int64 end_offset = (offset
10899 + GET_MODE_SIZE (mode)
10900 - BYTES_PER_SVE_PRED);
10901 return (offset_9bit_signed_scaled_p (VNx16BImode, end_offset)
10902 && offset_9bit_signed_scaled_p (VNx16BImode, offset));
10905 if (load_store_pair_p)
10906 return ((known_eq (GET_MODE_SIZE (mode), 4)
10907 || known_eq (GET_MODE_SIZE (mode), 8)
10908 || known_eq (GET_MODE_SIZE (mode), 16))
10909 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
10910 else
10911 return (aarch64_offset_9bit_signed_unscaled_p (mode, offset)
10912 || offset_12bit_unsigned_scaled_p (mode, offset));
10915 if (allow_reg_index_p)
10917 /* Look for base + (scaled/extended) index register. */
10918 if (aarch64_base_register_rtx_p (op0, strict_p)
10919 && aarch64_classify_index (info, op1, mode, strict_p))
10921 info->base = op0;
10922 return true;
10924 if (aarch64_base_register_rtx_p (op1, strict_p)
10925 && aarch64_classify_index (info, op0, mode, strict_p))
10927 info->base = op1;
10928 return true;
10932 return false;
10934 case POST_INC:
10935 case POST_DEC:
10936 case PRE_INC:
10937 case PRE_DEC:
10938 info->type = ADDRESS_REG_WB;
10939 info->base = XEXP (x, 0);
10940 info->offset = NULL_RTX;
10941 return aarch64_base_register_rtx_p (info->base, strict_p);
10943 case POST_MODIFY:
10944 case PRE_MODIFY:
10945 info->type = ADDRESS_REG_WB;
10946 info->base = XEXP (x, 0);
10947 if (GET_CODE (XEXP (x, 1)) == PLUS
10948 && poly_int_rtx_p (XEXP (XEXP (x, 1), 1), &offset)
10949 && rtx_equal_p (XEXP (XEXP (x, 1), 0), info->base)
10950 && aarch64_base_register_rtx_p (info->base, strict_p))
10952 info->offset = XEXP (XEXP (x, 1), 1);
10953 info->const_offset = offset;
10955 /* TImode, TFmode and TDmode values are allowed in both pairs of X
10956 registers and individual Q registers. The available
10957 address modes are:
10958 X,X: 7-bit signed scaled offset
10959 Q: 9-bit signed offset
10960 We conservatively require an offset representable in either mode.
10962 if (mode == TImode || mode == TFmode || mode == TDmode)
10963 return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
10964 && aarch64_offset_9bit_signed_unscaled_p (mode, offset));
10966 if (load_store_pair_p)
10967 return ((known_eq (GET_MODE_SIZE (mode), 4)
10968 || known_eq (GET_MODE_SIZE (mode), 8)
10969 || known_eq (GET_MODE_SIZE (mode), 16))
10970 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
10971 else
10972 return aarch64_offset_9bit_signed_unscaled_p (mode, offset);
10974 return false;
10976 case CONST:
10977 case SYMBOL_REF:
10978 case LABEL_REF:
10979 /* load literal: pc-relative constant pool entry. Only supported
10980 for SI mode or larger. */
10981 info->type = ADDRESS_SYMBOLIC;
10983 if (!load_store_pair_p
10984 && GET_MODE_SIZE (mode).is_constant (&const_size)
10985 && const_size >= 4)
10987 poly_int64 offset;
10988 rtx sym = strip_offset_and_salt (x, &offset);
10989 return ((LABEL_REF_P (sym)
10990 || (SYMBOL_REF_P (sym)
10991 && CONSTANT_POOL_ADDRESS_P (sym)
10992 && aarch64_pcrelative_literal_loads)));
10994 return false;
10996 case LO_SUM:
10997 info->type = ADDRESS_LO_SUM;
10998 info->base = XEXP (x, 0);
10999 info->offset = XEXP (x, 1);
11000 if (allow_reg_index_p
11001 && aarch64_base_register_rtx_p (info->base, strict_p))
11003 poly_int64 offset;
11004 HOST_WIDE_INT const_offset;
11005 rtx sym = strip_offset_and_salt (info->offset, &offset);
11006 if (SYMBOL_REF_P (sym)
11007 && offset.is_constant (&const_offset)
11008 && (aarch64_classify_symbol (sym, const_offset)
11009 == SYMBOL_SMALL_ABSOLUTE))
11011 /* The symbol and offset must be aligned to the access size. */
11012 unsigned int align;
11014 if (CONSTANT_POOL_ADDRESS_P (sym))
11015 align = GET_MODE_ALIGNMENT (get_pool_mode (sym));
11016 else if (TREE_CONSTANT_POOL_ADDRESS_P (sym))
11018 tree exp = SYMBOL_REF_DECL (sym);
11019 align = TYPE_ALIGN (TREE_TYPE (exp));
11020 align = aarch64_constant_alignment (exp, align);
11022 else if (SYMBOL_REF_DECL (sym))
11023 align = DECL_ALIGN (SYMBOL_REF_DECL (sym));
11024 else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym)
11025 && SYMBOL_REF_BLOCK (sym) != NULL)
11026 align = SYMBOL_REF_BLOCK (sym)->alignment;
11027 else
11028 align = BITS_PER_UNIT;
11030 poly_int64 ref_size = GET_MODE_SIZE (mode);
11031 if (known_eq (ref_size, 0))
11032 ref_size = GET_MODE_SIZE (DImode);
11034 return (multiple_p (const_offset, ref_size)
11035 && multiple_p (align / BITS_PER_UNIT, ref_size));
11038 return false;
11040 default:
11041 return false;
11045 /* Return true if the address X is valid for a PRFM instruction.
11046 STRICT_P is true if we should do strict checking with
11047 aarch64_classify_address. */
11049 bool
11050 aarch64_address_valid_for_prefetch_p (rtx x, bool strict_p)
11052 struct aarch64_address_info addr;
11054 /* PRFM accepts the same addresses as DImode... */
11055 bool res = aarch64_classify_address (&addr, x, DImode, strict_p);
11056 if (!res)
11057 return false;
11059 /* ... except writeback forms. */
11060 return addr.type != ADDRESS_REG_WB;
11063 bool
11064 aarch64_symbolic_address_p (rtx x)
11066 poly_int64 offset;
11067 x = strip_offset_and_salt (x, &offset);
11068 return SYMBOL_REF_P (x) || LABEL_REF_P (x);
11071 /* Classify the base of symbolic expression X. */
11073 enum aarch64_symbol_type
11074 aarch64_classify_symbolic_expression (rtx x)
11076 rtx offset;
11078 split_const (x, &x, &offset);
11079 return aarch64_classify_symbol (x, INTVAL (offset));
11083 /* Return TRUE if X is a legitimate address for accessing memory in
11084 mode MODE. */
11085 static bool
11086 aarch64_legitimate_address_hook_p (machine_mode mode, rtx x, bool strict_p,
11087 code_helper = ERROR_MARK)
11089 struct aarch64_address_info addr;
11091 return aarch64_classify_address (&addr, x, mode, strict_p);
11094 /* Return TRUE if X is a legitimate address of type TYPE for accessing
11095 memory in mode MODE. STRICT_P is true if REG_OK_STRICT is in effect. */
11096 bool
11097 aarch64_legitimate_address_p (machine_mode mode, rtx x, bool strict_p,
11098 aarch64_addr_query_type type)
11100 struct aarch64_address_info addr;
11102 return aarch64_classify_address (&addr, x, mode, strict_p, type);
11105 /* Implement TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT. */
11107 static bool
11108 aarch64_legitimize_address_displacement (rtx *offset1, rtx *offset2,
11109 poly_int64 orig_offset,
11110 machine_mode mode)
11112 HOST_WIDE_INT size;
11113 if (GET_MODE_SIZE (mode).is_constant (&size))
11115 HOST_WIDE_INT const_offset, second_offset;
11117 /* A general SVE offset is A * VQ + B. Remove the A component from
11118 coefficient 0 in order to get the constant B. */
11119 const_offset = orig_offset.coeffs[0] - orig_offset.coeffs[1];
11121 /* Split an out-of-range address displacement into a base and an
11122 offset. Use a 4KB range for 1- and 2-byte accesses and a 16KB
11123 range otherwise, to increase opportunities for sharing the base
11124 address between accesses of different sizes. Unaligned accesses
11125 use the signed 9-bit range; TImode/TFmode/TDmode use the intersection
11126 of the signed scaled 7-bit and signed 9-bit offset ranges. */
11127 if (mode == TImode || mode == TFmode || mode == TDmode)
11128 second_offset = ((const_offset + 0x100) & 0x1f8) - 0x100;
11129 else if ((const_offset & (size - 1)) != 0)
11130 second_offset = ((const_offset + 0x100) & 0x1ff) - 0x100;
11131 else
11132 second_offset = const_offset & (size < 4 ? 0xfff : 0x3ffc);
11134 if (second_offset == 0 || known_eq (orig_offset, second_offset))
11135 return false;
11137 /* Split the offset into second_offset and the rest. */
11138 *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
11139 *offset2 = gen_int_mode (second_offset, Pmode);
11140 return true;
11142 else
11144 /* Get the mode we should use as the basis of the range. For structure
11145 modes this is the mode of one vector. */
11146 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
11147 machine_mode step_mode
11148 = (vec_flags & VEC_STRUCT) != 0 ? SVE_BYTE_MODE : mode;
11150 /* Get the "mul vl" multiplier we'd like to use. */
11151 HOST_WIDE_INT factor = GET_MODE_SIZE (step_mode).coeffs[1];
11152 HOST_WIDE_INT vnum = orig_offset.coeffs[1] / factor;
11153 if (vec_flags & VEC_SVE_DATA)
11154 /* LDR supports a 9-bit range, but the move patterns for
11155 structure modes require all vectors to be in range of the
11156 same base. The simplest way of accommodating that while still
11157 promoting reuse of anchor points between different modes is
11158 to use an 8-bit range unconditionally. */
11159 vnum = ((vnum + 128) & 255) - 128;
11160 else
11161 /* Predicates are only handled singly, so we might as well use
11162 the full range. */
11163 vnum = ((vnum + 256) & 511) - 256;
11164 if (vnum == 0)
11165 return false;
11167 /* Convert the "mul vl" multiplier into a byte offset. */
11168 poly_int64 second_offset = GET_MODE_SIZE (step_mode) * vnum;
11169 if (known_eq (second_offset, orig_offset))
11170 return false;
11172 /* Split the offset into second_offset and the rest. */
11173 *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
11174 *offset2 = gen_int_mode (second_offset, Pmode);
11175 return true;
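/* A worked example of the constant-offset split above: for an SImode access
   at a word-aligned displacement of 0x12344, second_offset is
   0x12344 & 0x3ffc = 0x2344, so the displacement is split into an anchor
   offset of 0x10000 (offset1) plus 0x2344 (offset2); the latter fits the
   unsigned scaled 12-bit immediate range and the anchor can be shared with
   neighbouring accesses.  */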
11179 /* Return the binary representation of floating point constant VALUE in INTVAL.
11180 If the value cannot be converted, return false without setting INTVAL.
11181 The conversion is done in the floating-point mode of VALUE. */
11182 bool
11183 aarch64_reinterpret_float_as_int (rtx value, unsigned HOST_WIDE_INT *intval)
11186 /* We make a general exception for 0. */
11187 if (aarch64_float_const_zero_rtx_p (value))
11189 *intval = 0;
11190 return true;
11193 scalar_float_mode mode;
11194 if (!CONST_DOUBLE_P (value)
11195 || !is_a <scalar_float_mode> (GET_MODE (value), &mode)
11196 || GET_MODE_BITSIZE (mode) > HOST_BITS_PER_WIDE_INT
11197 /* Only support up to DF mode. */
11198 || GET_MODE_BITSIZE (mode) > GET_MODE_BITSIZE (DFmode))
11199 return false;
11201 unsigned HOST_WIDE_INT ival = 0;
11203 long res[2];
11204 real_to_target (res,
11205 CONST_DOUBLE_REAL_VALUE (value),
11206 REAL_MODE_FORMAT (mode));
11208 if (mode == DFmode || mode == DDmode)
11210 int order = BYTES_BIG_ENDIAN ? 1 : 0;
11211 ival = zext_hwi (res[order], 32);
11212 ival |= (zext_hwi (res[1 - order], 32) << 32);
11214 else
11215 ival = zext_hwi (res[0], 32);
11217 *intval = ival;
11218 return true;
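/* For example, reinterpreting the DFmode constant 1.0 through this function
   yields the IEEE double bit pattern 0x3ff0000000000000, and the SFmode
   constant 1.0 yields 0x3f800000.  */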
11221 /* Return TRUE if rtx X is an immediate constant that can be moved using a
11222 single MOV(+MOVK) followed by an FMOV. */
11223 bool
11224 aarch64_float_const_rtx_p (rtx x)
11226 machine_mode mode = GET_MODE (x);
11227 if (mode == VOIDmode)
11228 return false;
11230 /* Determine whether it's cheaper to write float constants as
11231 mov/movk pairs rather than ldr/adrp pairs. */
11232 unsigned HOST_WIDE_INT ival;
11234 if (CONST_DOUBLE_P (x)
11235 && SCALAR_FLOAT_MODE_P (mode)
11236 && aarch64_reinterpret_float_as_int (x, &ival))
11238 machine_mode imode = known_eq (GET_MODE_SIZE (mode), 8) ? DImode : SImode;
11239 int num_instr = aarch64_internal_mov_immediate
11240 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
11241 return num_instr < 3;
11244 return false;
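/* For instance, the DFmode constant 1.5 has the bit pattern
   0x3ff8000000000000, which aarch64_internal_mov_immediate can build with a
   single MOVZ, so the constant can be materialised as a MOV followed by an
   FMOV instead of a literal load.  */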
11247 /* Return TRUE if rtx X is immediate constant 0.0 (but not in Decimal
11248 Floating Point). */
11249 bool
11250 aarch64_float_const_zero_rtx_p (rtx x)
11252 /* 0.0 in Decimal Floating Point cannot be represented by #0 or
11253 zr as our callers expect, so no need to check the actual
11254 value if X is of Decimal Floating Point type. */
11255 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_DECIMAL_FLOAT)
11256 return false;
11258 if (REAL_VALUE_MINUS_ZERO (*CONST_DOUBLE_REAL_VALUE (x)))
11259 return !HONOR_SIGNED_ZEROS (GET_MODE (x));
11260 return real_equal (CONST_DOUBLE_REAL_VALUE (x), &dconst0);
11263 /* Return true if X is any kind of constant zero rtx. */
11265 bool
11266 aarch64_const_zero_rtx_p (rtx x)
11268 return (x == CONST0_RTX (GET_MODE (x))
11269 || (CONST_DOUBLE_P (x) && aarch64_float_const_zero_rtx_p (x)));
11272 /* Return TRUE if rtx X is an immediate constant that fits in a single
11273 MOVI immediate operation. */
11274 bool
11275 aarch64_can_const_movi_rtx_p (rtx x, machine_mode mode)
11277 if (!TARGET_SIMD)
11278 return false;
11280 machine_mode vmode;
11281 scalar_int_mode imode;
11282 unsigned HOST_WIDE_INT ival;
11284 if (CONST_DOUBLE_P (x)
11285 && SCALAR_FLOAT_MODE_P (mode))
11287 if (!aarch64_reinterpret_float_as_int (x, &ival))
11288 return false;
11290 /* We make a general exception for 0. */
11291 if (aarch64_float_const_zero_rtx_p (x))
11292 return true;
11294 imode = int_mode_for_mode (mode).require ();
11296 else if (CONST_INT_P (x)
11297 && is_a <scalar_int_mode> (mode, &imode))
11298 ival = INTVAL (x);
11299 else
11300 return false;
11302 /* Use a 64-bit vector mode for everything except DI/DF/DD modes, where we
11303 use a 128-bit vector mode. */
11304 int width = GET_MODE_BITSIZE (imode) == 64 ? 128 : 64;
11306 vmode = aarch64_simd_container_mode (imode, width);
11307 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, ival);
11309 return aarch64_simd_valid_mov_imm (v_op);
11312 /* Return TRUE if DST and SRC with mode MODE form a valid FP move. */
11313 bool
11314 aarch64_valid_fp_move (rtx dst, rtx src, machine_mode mode)
11316 if (!TARGET_FLOAT)
11317 return false;
11319 if (aarch64_reg_or_fp_zero (src, mode))
11320 return true;
11322 if (!register_operand (dst, mode))
11323 return false;
11325 if (MEM_P (src))
11326 return true;
11328 if (!DECIMAL_FLOAT_MODE_P (mode))
11330 if (aarch64_can_const_movi_rtx_p (src, mode)
11331 || aarch64_float_const_representable_p (src)
11332 || aarch64_float_const_zero_rtx_p (src))
11333 return true;
11335 /* Block FP immediates which are split during expand. */
11336 if (aarch64_float_const_rtx_p (src))
11337 return false;
11340 return can_create_pseudo_p ();
11343 /* Return the fixed registers used for condition codes. */
11345 static bool
11346 aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
11348 *p1 = CC_REGNUM;
11349 *p2 = INVALID_REGNUM;
11350 return true;
11353 /* Return a fresh memory reference to the current function's TPIDR2 block,
11354 creating a block if necessary. */
11356 static rtx
11357 aarch64_get_tpidr2_block ()
11359 if (!cfun->machine->tpidr2_block)
11360 /* The TPIDR2 block is 16 bytes in size and must be aligned to a 128-bit
11361 boundary. */
11362 cfun->machine->tpidr2_block = assign_stack_local (V16QImode, 16, 128);
11363 return copy_rtx (cfun->machine->tpidr2_block);
11366 /* Return a fresh register that points to the current function's
11367 TPIDR2 block, creating a block if necessary. */
11369 static rtx
11370 aarch64_get_tpidr2_ptr ()
11372 rtx block = aarch64_get_tpidr2_block ();
11373 return force_reg (Pmode, XEXP (block, 0));
11376 /* Emit instructions to allocate a ZA lazy save buffer and initialize the
11377 current function's TPIDR2 block. */
11379 static void
11380 aarch64_init_tpidr2_block ()
11382 rtx block = aarch64_get_tpidr2_block ();
11384 /* The ZA save buffer is SVL.B*SVL.B bytes in size. */
11385 rtx svl_bytes = aarch64_sme_vq_immediate (Pmode, 16, AARCH64_ISA_MODE);
11386 rtx svl_bytes_reg = force_reg (DImode, svl_bytes);
11387 rtx za_size = expand_simple_binop (Pmode, MULT, svl_bytes_reg,
11388 svl_bytes_reg, NULL, 0, OPTAB_LIB_WIDEN);
11389 rtx za_save_buffer = allocate_dynamic_stack_space (za_size, 128,
11390 BITS_PER_UNIT, -1, true);
11391 za_save_buffer = force_reg (Pmode, za_save_buffer);
11392 cfun->machine->za_save_buffer = za_save_buffer;
11394 /* The first word of the block points to the save buffer and the second
11395 word is the number of ZA slices to save. */
11396 rtx block_0 = adjust_address (block, DImode, 0);
11397 emit_insn (aarch64_gen_store_pair (block_0, za_save_buffer, svl_bytes_reg));
11399 if (!memory_operand (block, V16QImode))
11400 block = replace_equiv_address (block, force_reg (Pmode, XEXP (block, 0)));
11401 emit_insn (gen_aarch64_setup_local_tpidr2 (block));
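/* For reference, the TPIDR2 block initialised above has the layout:
	byte offset 0:	pointer to the SVL.B x SVL.B ZA save buffer
	byte offset 8:	number of ZA slices to save (SVL.B)
   with the 16-byte block itself aligned to a 128-bit boundary.  */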
11404 /* Restore the contents of ZA from the lazy save buffer, given that
11405 register TPIDR2_BLOCK points to the current function's TPIDR2 block.
11406 PSTATE.ZA is known to be 0 and TPIDR2_EL0 is known to be null. */
11408 void
11409 aarch64_restore_za (rtx tpidr2_block)
11411 emit_insn (gen_aarch64_smstart_za ());
11412 if (REGNO (tpidr2_block) != R0_REGNUM)
11413 emit_move_insn (gen_rtx_REG (Pmode, R0_REGNUM), tpidr2_block);
11414 emit_insn (gen_aarch64_tpidr2_restore ());
11417 /* Return the ZT0 save buffer, creating one if necessary. */
11419 static rtx
11420 aarch64_get_zt0_save_buffer ()
11422 if (!cfun->machine->zt0_save_buffer)
11423 cfun->machine->zt0_save_buffer = assign_stack_local (V8DImode, 64, 128);
11424 return cfun->machine->zt0_save_buffer;
11427 /* Save ZT0 to the current function's save buffer. */
11429 static void
11430 aarch64_save_zt0 ()
11432 rtx mem = aarch64_get_zt0_save_buffer ();
11433 mem = replace_equiv_address (mem, force_reg (Pmode, XEXP (mem, 0)));
11434 emit_insn (gen_aarch64_sme_str_zt0 (mem));
11437 /* Restore ZT0 from the current function's save buffer. FROM_LAZY_SAVE_P
11438 is true if the load is happening after a call to a private-ZA function,
11439 false if it can be treated as a normal load. */
11441 static void
11442 aarch64_restore_zt0 (bool from_lazy_save_p)
11444 rtx mem = aarch64_get_zt0_save_buffer ();
11445 mem = replace_equiv_address (mem, force_reg (Pmode, XEXP (mem, 0)));
11446 emit_insn (from_lazy_save_p
11447 ? gen_aarch64_restore_zt0 (mem)
11448 : gen_aarch64_sme_ldr_zt0 (mem));
11451 /* Implement TARGET_START_CALL_ARGS. */
11453 static void
11454 aarch64_start_call_args (cumulative_args_t ca_v)
11456 CUMULATIVE_ARGS *ca = get_cumulative_args (ca_v);
11458 if (!TARGET_SME && (ca->isa_mode & AARCH64_ISA_MODE_SM_ON))
11460 error ("calling a streaming function requires the ISA extension %qs",
11461 "sme");
11462 inform (input_location, "you can enable %qs using the command-line"
11463 " option %<-march%>, or by using the %<target%>"
11464 " attribute or pragma", "sme");
11467 if ((ca->shared_za_flags & (AARCH64_STATE_IN | AARCH64_STATE_OUT))
11468 && !aarch64_cfun_has_state ("za"))
11469 error ("call to a function that shares %qs state from a function"
11470 " that has no %qs state", "za", "za");
11471 else if ((ca->shared_zt0_flags & (AARCH64_STATE_IN | AARCH64_STATE_OUT))
11472 && !aarch64_cfun_has_state ("zt0"))
11473 error ("call to a function that shares %qs state from a function"
11474 " that has no %qs state", "zt0", "zt0");
11475 else if (!TARGET_ZA && (ca->isa_mode & AARCH64_ISA_MODE_ZA_ON))
11476 error ("call to a function that shares SME state from a function"
11477 " that has no SME state");
11479 /* If this is a call to a private ZA function, emit a marker to
11480 indicate where any necessary set-up code could be inserted.
11481 The code itself is inserted by the mode-switching pass. */
11482 if (TARGET_ZA && !(ca->isa_mode & AARCH64_ISA_MODE_ZA_ON))
11483 emit_insn (gen_aarch64_start_private_za_call ());
11485 /* If this is a call to a shared-ZA function that doesn't share ZT0,
11486 save and restore ZT0 around the call. */
11487 if (aarch64_cfun_has_state ("zt0")
11488 && (ca->isa_mode & AARCH64_ISA_MODE_ZA_ON)
11489 && ca->shared_zt0_flags == 0)
11490 aarch64_save_zt0 ();
11493 /* This function is used by the call expanders of the machine description.
11494 RESULT is the register in which the result is returned. It's NULL for
11495 "call" and "sibcall".
11496 MEM is the location of the function call.
11497 COOKIE is either:
11498 - a const_int that gives the argument to the call's UNSPEC_CALLEE_ABI.
11499 - a PARALLEL that contains such a const_int as its first element.
11500 The second element is a PARALLEL that lists all the argument
11501 registers that need to be saved and restored around a change
11502 in PSTATE.SM, or const0_rtx if no such switch is needed.
11503 The third and fourth elements are const_ints that contain the
11504 sharing flags for ZA and ZT0 respectively.
11505 SIBCALL indicates whether this function call is a normal call or a sibling
11506 call; a different pattern is generated accordingly. */
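/* For example, a COOKIE for a call that needs an SME mode switch might have
   the shape:
	(parallel [(const_int <abi-id>)
		   (parallel [(reg ...) ...])	;; registers live across the
						;; PSTATE.SM switch
		   (const_int <za-flags>)
		   (const_int <zt0-flags>)])
   whereas a simple call passes just (const_int <abi-id>).  */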
11508 void
11509 aarch64_expand_call (rtx result, rtx mem, rtx cookie, bool sibcall)
11511 rtx call, callee, tmp;
11512 rtvec vec;
11513 machine_mode mode;
11515 rtx callee_abi = cookie;
11516 rtx sme_mode_switch_args = const0_rtx;
11517 unsigned int shared_za_flags = 0;
11518 unsigned int shared_zt0_flags = 0;
11519 if (GET_CODE (cookie) == PARALLEL)
11521 callee_abi = XVECEXP (cookie, 0, 0);
11522 sme_mode_switch_args = XVECEXP (cookie, 0, 1);
11523 shared_za_flags = INTVAL (XVECEXP (cookie, 0, 2));
11524 shared_zt0_flags = INTVAL (XVECEXP (cookie, 0, 3));
11527 gcc_assert (CONST_INT_P (callee_abi));
11528 auto callee_isa_mode = aarch64_callee_isa_mode (callee_abi);
11530 if (aarch64_cfun_has_state ("za")
11531 && (callee_isa_mode & AARCH64_ISA_MODE_ZA_ON)
11532 && !shared_za_flags)
11534 sorry ("call to a function that shares state other than %qs"
11535 " from a function that has %qs state", "za", "za");
11536 inform (input_location, "use %<__arm_preserves(\"za\")%> if the"
11537 " callee preserves ZA");
11540 gcc_assert (MEM_P (mem));
11541 callee = XEXP (mem, 0);
11543 #if TARGET_PECOFF
11544 tmp = legitimize_pe_coff_symbol (callee, false);
11545 if (tmp)
11546 callee = tmp;
11547 #endif
11549 mode = GET_MODE (callee);
11550 gcc_assert (mode == Pmode);
11552 /* Decide if we should generate indirect calls by loading the
11553 address of the callee into a register before performing
11554 the branch-and-link. */
11555 if (SYMBOL_REF_P (callee)
11556 ? (aarch64_is_long_call_p (callee)
11557 || aarch64_is_noplt_call_p (callee))
11558 : !REG_P (callee))
11559 XEXP (mem, 0) = force_reg (mode, callee);
11561 /* Accumulate the return values, including state that is shared via
11562 attributes. */
11563 auto_vec<rtx, 8> return_values;
11564 if (result)
11566 if (GET_CODE (result) == PARALLEL)
11567 for (int i = 0; i < XVECLEN (result, 0); ++i)
11568 return_values.safe_push (XVECEXP (result, 0, i));
11569 else
11570 return_values.safe_push (result);
11572 unsigned int orig_num_return_values = return_values.length ();
11573 if (shared_za_flags & AARCH64_STATE_OUT)
11574 return_values.safe_push (gen_rtx_REG (VNx16BImode, ZA_REGNUM));
11575 /* When calling private-ZA functions from functions with ZA state,
11576 we want to know whether the call committed a lazy save. */
11577 if (TARGET_ZA && !shared_za_flags)
11578 return_values.safe_push (gen_rtx_REG (VNx16BImode, ZA_SAVED_REGNUM));
11579 if (shared_zt0_flags & AARCH64_STATE_OUT)
11580 return_values.safe_push (gen_rtx_REG (V8DImode, ZT0_REGNUM));
11582 /* Create the new return value, if necessary. */
11583 if (orig_num_return_values != return_values.length ())
11585 if (return_values.length () == 1)
11586 result = return_values[0];
11587 else
11589 for (rtx &x : return_values)
11590 if (GET_CODE (x) != EXPR_LIST)
11591 x = gen_rtx_EXPR_LIST (VOIDmode, x, const0_rtx);
11592 rtvec v = gen_rtvec_v (return_values.length (),
11593 return_values.address ());
11594 result = gen_rtx_PARALLEL (VOIDmode, v);
11598 call = gen_rtx_CALL (VOIDmode, mem, const0_rtx);
11600 if (result != NULL_RTX)
11601 call = gen_rtx_SET (result, call);
11603 if (sibcall)
11604 tmp = ret_rtx;
11605 else
11606 tmp = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (Pmode, LR_REGNUM));
11608 callee_abi = gen_rtx_UNSPEC (DImode, gen_rtvec (1, callee_abi),
11609 UNSPEC_CALLEE_ABI);
11611 vec = gen_rtvec (3, call, callee_abi, tmp);
11612 call = gen_rtx_PARALLEL (VOIDmode, vec);
11614 auto call_insn = aarch64_emit_call_insn (call);
11616 /* Check whether the call requires a change to PSTATE.SM. We can't
11617 emit the instructions to change PSTATE.SM yet, since they involve
11618 a change in vector length and a change in instruction set, which
11619 cannot be represented in RTL.
11621 For now, just record which registers will be clobbered and used
11622 by the changes to PSTATE.SM. */
11623 if (!sibcall && aarch64_call_switches_pstate_sm (callee_isa_mode))
11625 aarch64_sme_mode_switch_regs args_switch;
11626 if (sme_mode_switch_args != const0_rtx)
11628 unsigned int num_args = XVECLEN (sme_mode_switch_args, 0);
11629 for (unsigned int i = 0; i < num_args; ++i)
11631 rtx x = XVECEXP (sme_mode_switch_args, 0, i);
11632 args_switch.add_reg (GET_MODE (x), REGNO (x));
11636 aarch64_sme_mode_switch_regs result_switch;
11637 if (result)
11638 result_switch.add_call_result (call_insn);
11640 unsigned int num_gprs = MAX (args_switch.num_gprs (),
11641 result_switch.num_gprs ());
11642 for (unsigned int i = 0; i < num_gprs; ++i)
11643 clobber_reg (&CALL_INSN_FUNCTION_USAGE (call_insn),
11644 gen_rtx_REG (DImode, args_switch.FIRST_GPR + i));
11646 for (int regno = V0_REGNUM; regno < V0_REGNUM + 32; regno += 4)
11647 clobber_reg (&CALL_INSN_FUNCTION_USAGE (call_insn),
11648 gen_rtx_REG (V4x16QImode, regno));
11650 for (int regno = P0_REGNUM; regno < P0_REGNUM + 16; regno += 1)
11651 clobber_reg (&CALL_INSN_FUNCTION_USAGE (call_insn),
11652 gen_rtx_REG (VNx16BImode, regno));
11654 /* Ensure that the VG save slot has been initialized. Also emit
11655 an instruction to model the effect of the temporary clobber
11656 of VG, so that the prologue/epilogue pass sees the need to
11657 save the old value. */
11658 use_reg (&CALL_INSN_FUNCTION_USAGE (call_insn),
11659 gen_rtx_REG (DImode, VG_REGNUM));
11660 emit_insn_before (gen_aarch64_update_vg (), call_insn);
11662 cfun->machine->call_switches_pstate_sm = true;
11665 /* Add any ZA-related information.
11667 ZA_REGNUM represents the current function's ZA state, rather than
11668 the contents of the ZA register itself. We ensure that the function's
11669 ZA state is preserved by private-ZA call sequences, so the call itself
11670 does not use or clobber ZA_REGNUM. The same thing applies to
11671 ZT0_REGNUM. */
11672 if (TARGET_ZA)
11674 /* The callee requires ZA to be active if the callee is shared-ZA,
11675 otherwise it requires ZA to be dormant or off. The state of ZA is
11676 captured by a combination of SME_STATE_REGNUM, TPIDR2_SETUP_REGNUM,
11677 and ZA_SAVED_REGNUM. */
11678 use_reg (&CALL_INSN_FUNCTION_USAGE (call_insn),
11679 gen_rtx_REG (DImode, SME_STATE_REGNUM));
11680 use_reg (&CALL_INSN_FUNCTION_USAGE (call_insn),
11681 gen_rtx_REG (DImode, TPIDR2_SETUP_REGNUM));
11682 use_reg (&CALL_INSN_FUNCTION_USAGE (call_insn),
11683 gen_rtx_REG (VNx16BImode, ZA_SAVED_REGNUM));
11685 /* Keep the aarch64_start/end_private_za_call markers live. */
11686 if (!(callee_isa_mode & AARCH64_ISA_MODE_ZA_ON))
11687 use_reg (&CALL_INSN_FUNCTION_USAGE (call_insn),
11688 gen_rtx_REG (VNx16BImode, LOWERING_REGNUM));
11690 /* If the callee is a shared-ZA function, record whether it uses the
11691 current value of ZA and ZT0. */
11692 if (shared_za_flags & AARCH64_STATE_IN)
11693 use_reg (&CALL_INSN_FUNCTION_USAGE (call_insn),
11694 gen_rtx_REG (VNx16BImode, ZA_REGNUM));
11696 if (shared_zt0_flags & AARCH64_STATE_IN)
11697 use_reg (&CALL_INSN_FUNCTION_USAGE (call_insn),
11698 gen_rtx_REG (V8DImode, ZT0_REGNUM));
11702 /* Implement TARGET_END_CALL_ARGS. */
11704 static void
11705 aarch64_end_call_args (cumulative_args_t ca_v)
11707 CUMULATIVE_ARGS *ca = get_cumulative_args (ca_v);
11709 /* If this is a call to a private ZA function, emit a marker to
11710 indicate where any necessary restoration code could be inserted.
11711 The code itself is inserted by the mode-switching pass. */
11712 if (TARGET_ZA && !(ca->isa_mode & AARCH64_ISA_MODE_ZA_ON))
11713 emit_insn (gen_aarch64_end_private_za_call ());
11715 /* If this is a call to a shared-ZA function that doesn't share ZT0,
11716 save and restore ZT0 around the call. */
11717 if (aarch64_cfun_has_state ("zt0")
11718 && (ca->isa_mode & AARCH64_ISA_MODE_ZA_ON)
11719 && ca->shared_zt0_flags == 0)
11720 aarch64_restore_zt0 (false);
11723 /* Emit call insn with PAT and do aarch64-specific handling. */
11725 rtx_call_insn *
11726 aarch64_emit_call_insn (rtx pat)
11728 auto insn = emit_call_insn (pat);
11730 rtx *fusage = &CALL_INSN_FUNCTION_USAGE (insn);
11731 clobber_reg (fusage, gen_rtx_REG (word_mode, IP0_REGNUM));
11732 clobber_reg (fusage, gen_rtx_REG (word_mode, IP1_REGNUM));
11733 return as_a<rtx_call_insn *> (insn);
11736 machine_mode
11737 aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
11739 machine_mode mode_x = GET_MODE (x);
11740 rtx_code code_x = GET_CODE (x);
11742 /* All floating point compares return CCFP if it is an equality
11743 comparison, and CCFPE otherwise. */
11744 if (GET_MODE_CLASS (mode_x) == MODE_FLOAT)
11746 switch (code)
11748 case EQ:
11749 case NE:
11750 case UNORDERED:
11751 case ORDERED:
11752 case UNLT:
11753 case UNLE:
11754 case UNGT:
11755 case UNGE:
11756 case UNEQ:
11757 return CCFPmode;
11759 case LT:
11760 case LE:
11761 case GT:
11762 case GE:
11763 case LTGT:
11764 return CCFPEmode;
11766 default:
11767 gcc_unreachable ();
11771 /* Equality comparisons of short modes against zero can be performed
11772 using the TST instruction with the appropriate bitmask. */
11773 if (y == const0_rtx && (REG_P (x) || SUBREG_P (x))
11774 && (code == EQ || code == NE)
11775 && (mode_x == HImode || mode_x == QImode))
11776 return CC_Zmode;
11778 /* Similarly, comparisons of zero_extends from shorter modes can
11779 be performed using an ANDS with an immediate mask. */
11780 if (y == const0_rtx && code_x == ZERO_EXTEND
11781 && (mode_x == SImode || mode_x == DImode)
11782 && (GET_MODE (XEXP (x, 0)) == HImode || GET_MODE (XEXP (x, 0)) == QImode)
11783 && (code == EQ || code == NE))
11784 return CC_Zmode;
11786 /* Zero extracts support equality comparisons. */
11787 if ((mode_x == SImode || mode_x == DImode)
11788 && y == const0_rtx
11789 && (code_x == ZERO_EXTRACT && CONST_INT_P (XEXP (x, 1))
11790 && CONST_INT_P (XEXP (x, 2)))
11791 && (code == EQ || code == NE))
11792 return CC_Zmode;
11794 /* ANDS/BICS/TST support equality and all signed comparisons. */
11795 if ((mode_x == SImode || mode_x == DImode)
11796 && y == const0_rtx
11797 && (code_x == AND)
11798 && (code == EQ || code == NE || code == LT || code == GE
11799 || code == GT || code == LE))
11800 return CC_NZVmode;
11802 /* ADDS/SUBS correctly set N and Z flags. */
11803 if ((mode_x == SImode || mode_x == DImode)
11804 && y == const0_rtx
11805 && (code == EQ || code == NE || code == LT || code == GE)
11806 && (code_x == PLUS || code_x == MINUS || code_x == NEG))
11807 return CC_NZmode;
11809 /* A compare with a shifted operand. Because of canonicalization,
11810 the comparison will have to be swapped when we emit the assembly
11811 code. */
11812 if ((mode_x == SImode || mode_x == DImode)
11813 && (REG_P (y) || SUBREG_P (y) || y == const0_rtx)
11814 && (code_x == ASHIFT || code_x == ASHIFTRT
11815 || code_x == LSHIFTRT
11816 || code_x == ZERO_EXTEND || code_x == SIGN_EXTEND))
11817 return CC_SWPmode;
11819 /* Similarly for a negated operand, but we can only do this for
11820 equalities. */
11821 if ((mode_x == SImode || mode_x == DImode)
11822 && (REG_P (y) || SUBREG_P (y))
11823 && (code == EQ || code == NE)
11824 && code_x == NEG)
11825 return CC_Zmode;
11827 /* A test for unsigned overflow from an addition. */
11828 if ((mode_x == DImode || mode_x == TImode)
11829 && (code == LTU || code == GEU)
11830 && code_x == PLUS
11831 && rtx_equal_p (XEXP (x, 0), y))
11832 return CC_Cmode;
11834 /* A test for unsigned overflow from an add with carry. */
11835 if ((mode_x == DImode || mode_x == TImode)
11836 && (code == LTU || code == GEU)
11837 && code_x == PLUS
11838 && CONST_SCALAR_INT_P (y)
11839 && (rtx_mode_t (y, mode_x)
11840 == (wi::shwi (1, mode_x)
11841 << (GET_MODE_BITSIZE (mode_x).to_constant () / 2))))
11842 return CC_ADCmode;
11844 /* A test for signed overflow. */
11845 if ((mode_x == DImode || mode_x == TImode)
11846 && code == NE
11847 && code_x == PLUS
11848 && GET_CODE (y) == SIGN_EXTEND)
11849 return CC_Vmode;
11851 /* For everything else, return CCmode. */
11852 return CCmode;
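/* As a concrete case of the selection above, an equality comparison of a
   QImode register against zero selects CC_Zmode, which is typically matched
   by a TST instruction with an immediate mask of 0xff rather than a full
   compare.  */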
11855 static int
11856 aarch64_get_condition_code_1 (machine_mode, enum rtx_code);
11859 aarch64_get_condition_code (rtx x)
11861 machine_mode mode = GET_MODE (XEXP (x, 0));
11862 enum rtx_code comp_code = GET_CODE (x);
11864 if (GET_MODE_CLASS (mode) != MODE_CC)
11865 mode = SELECT_CC_MODE (comp_code, XEXP (x, 0), XEXP (x, 1));
11866 return aarch64_get_condition_code_1 (mode, comp_code);
11869 static int
11870 aarch64_get_condition_code_1 (machine_mode mode, enum rtx_code comp_code)
11872 switch (mode)
11874 case E_CCFPmode:
11875 case E_CCFPEmode:
11876 switch (comp_code)
11878 case GE: return AARCH64_GE;
11879 case GT: return AARCH64_GT;
11880 case LE: return AARCH64_LS;
11881 case LT: return AARCH64_MI;
11882 case NE: return AARCH64_NE;
11883 case EQ: return AARCH64_EQ;
11884 case ORDERED: return AARCH64_VC;
11885 case UNORDERED: return AARCH64_VS;
11886 case UNLT: return AARCH64_LT;
11887 case UNLE: return AARCH64_LE;
11888 case UNGT: return AARCH64_HI;
11889 case UNGE: return AARCH64_PL;
11890 default: return -1;
11892 break;
11894 case E_CCmode:
11895 switch (comp_code)
11897 case NE: return AARCH64_NE;
11898 case EQ: return AARCH64_EQ;
11899 case GE: return AARCH64_GE;
11900 case GT: return AARCH64_GT;
11901 case LE: return AARCH64_LE;
11902 case LT: return AARCH64_LT;
11903 case GEU: return AARCH64_CS;
11904 case GTU: return AARCH64_HI;
11905 case LEU: return AARCH64_LS;
11906 case LTU: return AARCH64_CC;
11907 default: return -1;
11909 break;
11911 case E_CC_SWPmode:
11912 switch (comp_code)
11914 case NE: return AARCH64_NE;
11915 case EQ: return AARCH64_EQ;
11916 case GE: return AARCH64_LE;
11917 case GT: return AARCH64_LT;
11918 case LE: return AARCH64_GE;
11919 case LT: return AARCH64_GT;
11920 case GEU: return AARCH64_LS;
11921 case GTU: return AARCH64_CC;
11922 case LEU: return AARCH64_CS;
11923 case LTU: return AARCH64_HI;
11924 default: return -1;
11926 break;
11928 case E_CC_NZCmode:
11929 switch (comp_code)
11931 case NE: return AARCH64_NE; /* = any */
11932 case EQ: return AARCH64_EQ; /* = none */
11933 case GE: return AARCH64_PL; /* = nfrst */
11934 case LT: return AARCH64_MI; /* = first */
11935 case GEU: return AARCH64_CS; /* = nlast */
11936 case GTU: return AARCH64_HI; /* = pmore */
11937 case LEU: return AARCH64_LS; /* = plast */
11938 case LTU: return AARCH64_CC; /* = last */
11939 default: return -1;
11941 break;
11943 case E_CC_NZVmode:
11944 switch (comp_code)
11946 case NE: return AARCH64_NE;
11947 case EQ: return AARCH64_EQ;
11948 case GE: return AARCH64_PL;
11949 case LT: return AARCH64_MI;
11950 case GT: return AARCH64_GT;
11951 case LE: return AARCH64_LE;
11952 default: return -1;
11954 break;
11956 case E_CC_NZmode:
11957 switch (comp_code)
11959 case NE: return AARCH64_NE;
11960 case EQ: return AARCH64_EQ;
11961 case GE: return AARCH64_PL;
11962 case LT: return AARCH64_MI;
11963 default: return -1;
11965 break;
11967 case E_CC_Zmode:
11968 switch (comp_code)
11970 case NE: return AARCH64_NE;
11971 case EQ: return AARCH64_EQ;
11972 default: return -1;
11974 break;
11976 case E_CC_Cmode:
11977 switch (comp_code)
11979 case LTU: return AARCH64_CS;
11980 case GEU: return AARCH64_CC;
11981 default: return -1;
11983 break;
11985 case E_CC_ADCmode:
11986 switch (comp_code)
11988 case GEU: return AARCH64_CS;
11989 case LTU: return AARCH64_CC;
11990 default: return -1;
11992 break;
11994 case E_CC_Vmode:
11995 switch (comp_code)
11997 case NE: return AARCH64_VS;
11998 case EQ: return AARCH64_VC;
11999 default: return -1;
12001 break;
12003 default:
12004 return -1;
12007 return -1;
12010 /* Return true if X is a CONST_INT, CONST_WIDE_INT or a constant vector
12011 duplicate of such constants. If so, store in RET_WI the wide_int
12012 representation of the constant, using the inner mode of the vector mode
12013 for vector X or MODE for scalar X constants. If MODE is not provided
12014 then TImode is used. */
12016 static bool
12017 aarch64_extract_vec_duplicate_wide_int (rtx x, wide_int *ret_wi,
12018 scalar_mode mode = TImode)
12020 rtx elt = unwrap_const_vec_duplicate (x);
12021 if (!CONST_SCALAR_INT_P (elt))
12022 return false;
12023 scalar_mode smode
12024 = CONST_SCALAR_INT_P (x) ? mode : GET_MODE_INNER (GET_MODE (x));
12025 *ret_wi = rtx_mode_t (elt, smode);
12026 return true;
12029 /* Return true if X is a scalar or a constant vector of integer
12030 immediates that represent the rounding constant used in the fixed-point
12031 arithmetic instructions.
12032 The accepted form of the constant is (1 << (C - 1)) where C is in the range
12033 [1, MODE_WIDTH/2]. */
12035 bool
12036 aarch64_rnd_imm_p (rtx x)
12038 wide_int rnd_cst;
12039 if (!aarch64_extract_vec_duplicate_wide_int (x, &rnd_cst))
12040 return false;
12041 int log2 = wi::exact_log2 (rnd_cst);
12042 if (log2 < 0)
12043 return false;
12044 return IN_RANGE (log2, 0, rnd_cst.get_precision () / 2 - 1);
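/* For example, for a vector of 16-bit elements the accepted rounding
   constants are 1, 2, 4, ..., 0x80, i.e. (1 << (C - 1)) for C in [1, 8].  */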
12047 /* Return true if RND is a constant vector of integer rounding constants
12048 corresponding to a constant vector of shifts, SHIFT.
12049 The relationship should be RND == (1 << (SHIFT - 1)). */
12051 bool
12052 aarch64_const_vec_rnd_cst_p (rtx rnd, rtx shift)
12054 wide_int rnd_cst, shft_cst;
12055 if (!aarch64_extract_vec_duplicate_wide_int (rnd, &rnd_cst)
12056 || !aarch64_extract_vec_duplicate_wide_int (shift, &shft_cst))
12057 return false;
12059 return rnd_cst == (wi::shwi (1, rnd_cst.get_precision ()) << (shft_cst - 1));
12062 bool
12063 aarch64_const_vec_all_same_in_range_p (rtx x,
12064 HOST_WIDE_INT minval,
12065 HOST_WIDE_INT maxval)
12067 rtx elt;
12068 return (const_vec_duplicate_p (x, &elt)
12069 && CONST_INT_P (elt)
12070 && IN_RANGE (INTVAL (elt), minval, maxval));
12073 /* Some constants can't be made using normal mov instructions in Advanced SIMD
12074 but we can still create them in various ways. If the constant in VAL can be
12075 created using such alternate methods, return true and, if TARGET is not
12076 NULL, additionally set TARGET to the rtx for the sequence.
12077 Otherwise return false. */
12079 bool
12080 aarch64_maybe_generate_simd_constant (rtx target, rtx val, machine_mode mode)
12082 wide_int wval;
12083 auto smode = GET_MODE_INNER (mode);
12084 if (!aarch64_extract_vec_duplicate_wide_int (val, &wval, smode))
12085 return false;
12087 /* For Advanced SIMD we can create an integer with only the top bit set
12088 using fneg (0.0f). */
12089 if (TARGET_SIMD
12090 && !TARGET_SVE
12091 && smode == DImode
12092 && wi::only_sign_bit_p (wval))
12094 if (!target)
12095 return true;
12097 /* Use the same base type as aarch64_gen_shareable_zero. */
12098 rtx zero = CONST0_RTX (V4SImode);
12099 emit_move_insn (lowpart_subreg (V4SImode, target, mode), zero);
12100 rtx neg = lowpart_subreg (V2DImode, target, mode);
12101 emit_insn (gen_aarch64_fnegv2di2 (neg, copy_rtx (neg)));
12102 return true;
12105 return false;
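/* For example, a V2DImode vector in which every element is
   0x8000000000000000 (only the sign bit set) can be generated this way:
   the register is first zeroed and then negated as a V2DImode float vector,
   which is typically just a MOVI #0 followed by an FNEG.  */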
12108 /* Check if the value in VAL with mode MODE can be created using special
12109 instruction sequences. */
12111 bool aarch64_simd_special_constant_p (rtx val, machine_mode mode)
12113 return aarch64_maybe_generate_simd_constant (NULL_RTX, val, mode);
12116 bool
12117 aarch64_const_vec_all_same_int_p (rtx x, HOST_WIDE_INT val)
12119 return aarch64_const_vec_all_same_in_range_p (x, val, val);
12122 /* Return true if VEC is a constant in which every element is in the range
12123 [MINVAL, MAXVAL]. The elements do not need to have the same value. */
12125 static bool
12126 aarch64_const_vec_all_in_range_p (rtx vec,
12127 HOST_WIDE_INT minval,
12128 HOST_WIDE_INT maxval)
12130 if (!CONST_VECTOR_P (vec)
12131 || GET_MODE_CLASS (GET_MODE (vec)) != MODE_VECTOR_INT)
12132 return false;
12134 int nunits;
12135 if (!CONST_VECTOR_STEPPED_P (vec))
12136 nunits = const_vector_encoded_nelts (vec);
12137 else if (!CONST_VECTOR_NUNITS (vec).is_constant (&nunits))
12138 return false;
12140 for (int i = 0; i < nunits; i++)
12142 rtx vec_elem = CONST_VECTOR_ELT (vec, i);
12143 if (!CONST_INT_P (vec_elem)
12144 || !IN_RANGE (INTVAL (vec_elem), minval, maxval))
12145 return false;
12147 return true;
12150 /* N Z C V. */
12151 #define AARCH64_CC_V 1
12152 #define AARCH64_CC_C (1 << 1)
12153 #define AARCH64_CC_Z (1 << 2)
12154 #define AARCH64_CC_N (1 << 3)
12156 /* N Z C V flags for ccmp. Indexed by AARCH64_COND_CODE. */
12157 static const int aarch64_nzcv_codes[] =
12159 0, /* EQ, Z == 1. */
12160 AARCH64_CC_Z, /* NE, Z == 0. */
12161 0, /* CS, C == 1. */
12162 AARCH64_CC_C, /* CC, C == 0. */
12163 0, /* MI, N == 1. */
12164 AARCH64_CC_N, /* PL, N == 0. */
12165 0, /* VS, V == 1. */
12166 AARCH64_CC_V, /* VC, V == 0. */
12167 0, /* HI, C == 1 && Z == 0. */
12168 AARCH64_CC_C, /* LS, !(C == 1 && Z == 0). */
12169 AARCH64_CC_V, /* GE, N == V. */
12170 0, /* LT, N != V. */
12171 AARCH64_CC_Z, /* GT, Z == 0 && N == V. */
12172 0, /* LE, !(Z == 0 && N == V). */
12173 0, /* AL, Any. */
12174 0 /* NV, Any. */
12177 /* Print floating-point vector immediate operand X to F, negating it
12178 first if NEGATE is true. Return true on success, false if it isn't
12179 a constant we can handle. */
12181 static bool
12182 aarch64_print_vector_float_operand (FILE *f, rtx x, bool negate)
12184 rtx elt;
12186 if (!const_vec_duplicate_p (x, &elt))
12187 return false;
12189 REAL_VALUE_TYPE r = *CONST_DOUBLE_REAL_VALUE (elt);
12190 if (negate)
12191 r = real_value_negate (&r);
12193 /* Handle the SVE single-bit immediates specially, since they have a
12194 fixed form in the assembly syntax. */
12195 if (real_equal (&r, &dconst0))
12196 asm_fprintf (f, "0.0");
12197 else if (real_equal (&r, &dconst2))
12198 asm_fprintf (f, "2.0");
12199 else if (real_equal (&r, &dconst1))
12200 asm_fprintf (f, "1.0");
12201 else if (real_equal (&r, &dconsthalf))
12202 asm_fprintf (f, "0.5");
12203 else
12205 const int buf_size = 20;
12206 char float_buf[buf_size] = {'\0'};
12207 real_to_decimal_for_mode (float_buf, &r, buf_size, buf_size,
12208 1, GET_MODE (elt));
12209 asm_fprintf (f, "%s", float_buf);
12212 return true;
12215 /* Return the equivalent letter for size. */
12216 static char
12217 sizetochar (int size)
12219 switch (size)
12221 case 64: return 'd';
12222 case 32: return 's';
12223 case 16: return 'h';
12224 case 8: return 'b';
12225 default: gcc_unreachable ();
12229 /* Print operand X to file F in a target specific manner according to CODE.
12230 The acceptable formatting commands given by CODE are:
12231 'c': An integer or symbol address without a preceding #
12232 sign.
12233 'C': Take the duplicated element in a vector constant
12234 and print it in hex.
12235 'D': Take the duplicated element in a vector constant
12236 and print it as an unsigned integer, in decimal.
12237 'e': Print the sign/zero-extend size as a character 8->b,
12238 16->h, 32->w. Can also be used for masks:
12239 0xff->b, 0xffff->h, 0xffffffff->w.
12240 'I': If the operand is a duplicated vector constant,
12241 replace it with the duplicated scalar. If the
12242 operand is then a floating-point constant, replace
12243 it with the integer bit representation. Print the
12244 transformed constant as a signed decimal number.
12245 'p': Prints N such that 2^N == X (X must be power of 2 and
12246 const int).
12247 'P': Print the number of non-zero bits in X (a const_int).
12248 'H': Print the higher numbered register of a pair (TImode)
12249 of regs.
12250 'm': Print a condition (eq, ne, etc).
12251 'M': Same as 'm', but invert condition.
12252 'N': Take the duplicated element in a vector constant
12253 and print the negative of it in decimal.
12254 'b/h/s/d/q': Print a scalar FP/SIMD register name.
12255 'Z': Same for SVE registers. ('z' was already taken.)
12256 Note that it is not necessary to use %Z for operands
12257 that have SVE modes. The convention is to use %Z
12258 only for non-SVE (or potentially non-SVE) modes.
12259 'S/T/U/V': Print a FP/SIMD register name for a register list.
12260 The register printed is the FP/SIMD register name
12261 of X + 0/1/2/3 for S/T/U/V.
12262 'R': Print a scalar Integer/FP/SIMD register name + 1.
12263 'X': Print bottom 16 bits of integer constant in hex.
12264 'w/x': Print a general register name or the zero register
12265 (32-bit or 64-bit).
12266 '0': Print a normal operand, if it's a general register,
12267 then we assume DImode.
12268 'k': Print NZCV for conditional compare instructions.
12269 'K': Print a predicate register as pn<N> rather than p<N>.
12270 'A': Output address constant representing the first
12271 argument of X, specifying a relocation offset
12272 if appropriate.
12273 'L': Output constant address specified by X
12274 with a relocation offset if appropriate.
12275 'G': Prints address of X, specifying a PC relative
12276 relocation mode if appropriate.
12277 'y': Output address of LDP or STP - this is used for
12278 some LDP/STPs which don't use a PARALLEL in their
12279 pattern (so the mode needs to be adjusted).
12280 'z': Output address of a typical LDP or STP. */
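/* As an illustration (register numbers chosen arbitrarily): if operand 0
   is general register x3, operand 1 is const0_rtx and operand 2 is
   FP/SIMD register v5, then "%w0" prints "w3", "%x0" prints "x3",
   "%w1" prints "wzr", "%d2" prints "d5", "%q2" prints "q5" and
   "%Z2" prints "z5".  A bare "%0" prints a general register under its
   DImode name, here "x3". */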
12282 static void
12283 aarch64_print_operand (FILE *f, rtx x, int code)
12285 rtx elt;
12286 switch (code)
12288 case 'c':
12289 if (CONST_INT_P (x))
12290 fprintf (f, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
12291 else
12293 poly_int64 offset;
12294 rtx base = strip_offset_and_salt (x, &offset);
12295 if (SYMBOL_REF_P (base))
12296 output_addr_const (f, x);
12297 else
12298 output_operand_lossage ("unsupported operand for code '%c'", code);
12300 break;
12302 case 'e':
12304 x = unwrap_const_vec_duplicate (x);
12305 if (!CONST_INT_P (x))
12307 output_operand_lossage ("invalid operand for '%%%c'", code);
12308 return;
12311 HOST_WIDE_INT val = INTVAL (x);
12312 if ((val & ~7) == 8 || val == 0xff)
12313 fputc ('b', f);
12314 else if ((val & ~7) == 16 || val == 0xffff)
12315 fputc ('h', f);
12316 else if ((val & ~7) == 32 || val == 0xffffffff)
12317 fputc ('w', f);
12318 else
12320 output_operand_lossage ("invalid operand for '%%%c'", code);
12321 return;
12324 break;
12326 case 'p':
12328 int n;
12330 if (!CONST_INT_P (x) || (n = exact_log2 (INTVAL (x))) < 0)
12332 output_operand_lossage ("invalid operand for '%%%c'", code);
12333 return;
12336 asm_fprintf (f, "%d", n);
12338 break;
12340 case 'P':
12341 if (!CONST_INT_P (x))
12343 output_operand_lossage ("invalid operand for '%%%c'", code);
12344 return;
12347 asm_fprintf (f, "%u", popcount_hwi (INTVAL (x)));
12348 break;
12350 case 'H':
12351 if (x == const0_rtx)
12353 asm_fprintf (f, "xzr");
12354 break;
12357 if (!REG_P (x) || !GP_REGNUM_P (REGNO (x) + 1))
12359 output_operand_lossage ("invalid operand for '%%%c'", code);
12360 return;
12363 asm_fprintf (f, "%s", reg_names [REGNO (x) + 1]);
12364 break;
12366 case 'I':
12368 x = aarch64_bit_representation (unwrap_const_vec_duplicate (x));
12369 if (CONST_INT_P (x))
12370 asm_fprintf (f, "%wd", INTVAL (x));
12371 else
12373 output_operand_lossage ("invalid operand for '%%%c'", code);
12374 return;
12376 break;
12379 case 'M':
12380 case 'm':
12382 int cond_code;
12383 /* CONST_TRUE_RTX means al/nv (al is the default, don't print it). */
12384 if (x == const_true_rtx)
12386 if (code == 'M')
12387 fputs ("nv", f);
12388 return;
12391 if (!COMPARISON_P (x))
12393 output_operand_lossage ("invalid operand for '%%%c'", code);
12394 return;
12397 cond_code = aarch64_get_condition_code (x);
12398 gcc_assert (cond_code >= 0);
12399 if (code == 'M')
12400 cond_code = AARCH64_INVERSE_CONDITION_CODE (cond_code);
12401 if (GET_MODE (XEXP (x, 0)) == CC_NZCmode)
12402 fputs (aarch64_sve_condition_codes[cond_code], f);
12403 else
12404 fputs (aarch64_condition_codes[cond_code], f);
12406 break;
12408 case 'N':
12409 if (!const_vec_duplicate_p (x, &elt))
12411 output_operand_lossage ("invalid vector constant");
12412 return;
12415 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
12416 asm_fprintf (f, "%wd", (HOST_WIDE_INT) -UINTVAL (elt));
12417 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
12418 && aarch64_print_vector_float_operand (f, x, true))
12420 else
12422 output_operand_lossage ("invalid vector constant");
12423 return;
12425 break;
12427 case 'b':
12428 case 'h':
12429 case 's':
12430 case 'd':
12431 case 'q':
12432 case 'Z':
12433 code = TOLOWER (code);
12434 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
12436 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
12437 return;
12439 asm_fprintf (f, "%c%d", code, REGNO (x) - V0_REGNUM);
12440 break;
12442 case 'S':
12443 case 'T':
12444 case 'U':
12445 case 'V':
12446 if (!REG_P (x) || (!FP_REGNUM_P (REGNO (x)) && !PR_REGNUM_P (REGNO (x))))
12448 output_operand_lossage ("incompatible operand for '%%%c'", code);
12449 return;
12451 if (PR_REGNUM_P (REGNO (x)))
12452 asm_fprintf (f, "p%d", REGNO (x) - P0_REGNUM + (code - 'S'));
12453 else
12454 asm_fprintf (f, "%c%d",
12455 aarch64_sve_data_mode_p (GET_MODE (x)) ? 'z' : 'v',
12456 REGNO (x) - V0_REGNUM + (code - 'S'));
12457 break;
12459 case 'R':
12460 if (REG_P (x) && FP_REGNUM_P (REGNO (x))
12461 && (aarch64_advsimd_partial_struct_mode_p (GET_MODE (x))))
12462 asm_fprintf (f, "d%d", REGNO (x) - V0_REGNUM + 1);
12463 else if (REG_P (x) && FP_REGNUM_P (REGNO (x)))
12464 asm_fprintf (f, "q%d", REGNO (x) - V0_REGNUM + 1);
12465 else if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
12466 asm_fprintf (f, "x%d", REGNO (x) - R0_REGNUM + 1);
12467 else
12468 output_operand_lossage ("incompatible register operand for '%%%c'",
12469 code);
12470 break;
12472 case 'X':
12473 if (!CONST_INT_P (x))
12475 output_operand_lossage ("invalid operand for '%%%c'", code);
12476 return;
12478 asm_fprintf (f, "0x%wx", UINTVAL (x) & 0xffff);
12479 break;
12481 case 'C':
12483 /* Print a replicated constant in hex. */
12484 if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
12486 output_operand_lossage ("invalid operand for '%%%c'", code);
12487 return;
12489 scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
12490 asm_fprintf (f, "0x%wx", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
12492 break;
12494 case 'D':
12496 /* Print a replicated constant in decimal, treating it as
12497 unsigned. */
12498 if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
12500 output_operand_lossage ("invalid operand for '%%%c'", code);
12501 return;
12503 scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
12504 asm_fprintf (f, "%wd", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
12506 break;
12508 case 'w':
12509 case 'x':
12510 if (aarch64_const_zero_rtx_p (x))
12512 asm_fprintf (f, "%czr", code);
12513 break;
12516 if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
12518 asm_fprintf (f, "%c%d", code, REGNO (x) - R0_REGNUM);
12519 break;
12522 if (REG_P (x) && REGNO (x) == SP_REGNUM)
12524 asm_fprintf (f, "%ssp", code == 'w' ? "w" : "");
12525 break;
12528 /* Fall through */
12530 case 0:
12531 if (x == NULL)
12533 output_operand_lossage ("missing operand");
12534 return;
12537 switch (GET_CODE (x))
12539 case CONST_STRING:
12541 asm_fprintf (f, "%s", XSTR (x, 0));
12542 break;
12544 case REG:
12545 if (aarch64_sve_data_mode_p (GET_MODE (x)))
12547 if (REG_NREGS (x) == 1)
12548 asm_fprintf (f, "z%d", REGNO (x) - V0_REGNUM);
12549 else
12551 char suffix
12552 = sizetochar (GET_MODE_UNIT_BITSIZE (GET_MODE (x)));
12553 asm_fprintf (f, "{z%d.%c - z%d.%c}",
12554 REGNO (x) - V0_REGNUM, suffix,
12555 END_REGNO (x) - V0_REGNUM - 1, suffix);
12558 else
12559 asm_fprintf (f, "%s", reg_names [REGNO (x)]);
12560 break;
12562 case MEM:
12563 output_address (GET_MODE (x), XEXP (x, 0));
12564 break;
12566 case LABEL_REF:
12567 case SYMBOL_REF:
12568 output_addr_const (asm_out_file, x);
12569 break;
12571 case CONST_INT:
12572 asm_fprintf (f, "%wd", INTVAL (x));
12573 break;
12575 case CONST:
12576 if (!VECTOR_MODE_P (GET_MODE (x)))
12578 output_addr_const (asm_out_file, x);
12579 break;
12581 /* fall through */
12583 case CONST_VECTOR:
12584 if (!const_vec_duplicate_p (x, &elt))
12586 output_operand_lossage ("invalid vector constant");
12587 return;
12590 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
12591 asm_fprintf (f, "%wd", INTVAL (elt));
12592 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
12593 && aarch64_print_vector_float_operand (f, x, false))
12595 else
12597 output_operand_lossage ("invalid vector constant");
12598 return;
12600 break;
12602 case CONST_DOUBLE:
12603 /* Since we define TARGET_SUPPORTS_WIDE_INT we shouldn't ever
12604 be getting CONST_DOUBLEs holding integers. */
12605 gcc_assert (GET_MODE (x) != VOIDmode);
12606 if (aarch64_float_const_zero_rtx_p (x))
12608 fputc ('0', f);
12609 break;
12611 else if (aarch64_float_const_representable_p (x))
12613 #define buf_size 20
12614 char float_buf[buf_size] = {'\0'};
12615 real_to_decimal_for_mode (float_buf,
12616 CONST_DOUBLE_REAL_VALUE (x),
12617 buf_size, buf_size,
12618 1, GET_MODE (x));
12619 asm_fprintf (asm_out_file, "%s", float_buf);
12620 break;
12621 #undef buf_size
12623 output_operand_lossage ("invalid constant");
12624 return;
12625 default:
12626 output_operand_lossage ("invalid operand");
12627 return;
12629 break;
12631 case 'A':
12632 if (GET_CODE (x) == HIGH)
12633 x = XEXP (x, 0);
12635 switch (aarch64_classify_symbolic_expression (x))
12637 case SYMBOL_SMALL_GOT_4G:
12638 asm_fprintf (asm_out_file, ":got:");
12639 break;
12641 case SYMBOL_SMALL_TLSGD:
12642 asm_fprintf (asm_out_file, ":tlsgd:");
12643 break;
12645 case SYMBOL_SMALL_TLSDESC:
12646 asm_fprintf (asm_out_file, ":tlsdesc:");
12647 break;
12649 case SYMBOL_SMALL_TLSIE:
12650 asm_fprintf (asm_out_file, ":gottprel:");
12651 break;
12653 case SYMBOL_TLSLE24:
12654 asm_fprintf (asm_out_file, ":tprel:");
12655 break;
12657 case SYMBOL_TINY_GOT:
12658 gcc_unreachable ();
12659 break;
12661 default:
12662 break;
12664 output_addr_const (asm_out_file, x);
12665 break;
12667 case 'L':
12668 switch (aarch64_classify_symbolic_expression (x))
12670 case SYMBOL_SMALL_GOT_4G:
12671 asm_fprintf (asm_out_file, ":got_lo12:");
12672 break;
12674 case SYMBOL_SMALL_TLSGD:
12675 asm_fprintf (asm_out_file, ":tlsgd_lo12:");
12676 break;
12678 case SYMBOL_SMALL_TLSDESC:
12679 asm_fprintf (asm_out_file, ":tlsdesc_lo12:");
12680 break;
12682 case SYMBOL_SMALL_TLSIE:
12683 asm_fprintf (asm_out_file, ":gottprel_lo12:");
12684 break;
12686 case SYMBOL_TLSLE12:
12687 asm_fprintf (asm_out_file, ":tprel_lo12:");
12688 break;
12690 case SYMBOL_TLSLE24:
12691 asm_fprintf (asm_out_file, ":tprel_lo12_nc:");
12692 break;
12694 case SYMBOL_TINY_GOT:
12695 asm_fprintf (asm_out_file, ":got:");
12696 break;
12698 case SYMBOL_TINY_TLSIE:
12699 asm_fprintf (asm_out_file, ":gottprel:");
12700 break;
12702 default:
12703 break;
12705 output_addr_const (asm_out_file, x);
12706 break;
12708 case 'G':
12709 switch (aarch64_classify_symbolic_expression (x))
12711 case SYMBOL_TLSLE24:
12712 asm_fprintf (asm_out_file, ":tprel_hi12:");
12713 break;
12714 default:
12715 break;
12717 output_addr_const (asm_out_file, x);
12718 break;
12720 case 'k':
12722 HOST_WIDE_INT cond_code;
12724 if (!CONST_INT_P (x))
12726 output_operand_lossage ("invalid operand for '%%%c'", code);
12727 return;
12730 cond_code = INTVAL (x);
12731 gcc_assert (cond_code >= 0 && cond_code <= AARCH64_NV);
12732 asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code]);
12734 break;
12736 case 'K':
12737 if (!REG_P (x) || !PR_REGNUM_P (REGNO (x)))
12739 output_operand_lossage ("invalid operand for '%%%c'", code);
12740 return;
12742 asm_fprintf (f, "pn%d", REGNO (x) - P0_REGNUM);
12743 break;
12745 case 'y':
12746 case 'z':
12748 machine_mode mode = GET_MODE (x);
12750 if (!MEM_P (x)
12751 || (code == 'y'
12752 && maybe_ne (GET_MODE_SIZE (mode), 8)
12753 && maybe_ne (GET_MODE_SIZE (mode), 16)
12754 && maybe_ne (GET_MODE_SIZE (mode), 32)))
12756 output_operand_lossage ("invalid operand for '%%%c'", code);
12757 return;
12760 if (!aarch64_print_address_internal (f, mode, XEXP (x, 0),
12761 code == 'y'
12762 ? ADDR_QUERY_LDP_STP_N
12763 : ADDR_QUERY_LDP_STP))
12764 output_operand_lossage ("invalid operand prefix '%%%c'", code);
12766 break;
12768 default:
12769 output_operand_lossage ("invalid operand prefix '%%%c'", code);
12770 return;
12774 /* Print address 'x' of a memory access with mode 'mode'.
12775 TYPE is the aarch64_addr_query_type context required by aarch64_classify_address,
12776 e.g. ADDR_QUERY_ANY for a normal memory access or ADDR_QUERY_LDP_STP for LDP/STP. */
12777 static bool
12778 aarch64_print_address_internal (FILE *f, machine_mode mode, rtx x,
12779 aarch64_addr_query_type type)
12781 struct aarch64_address_info addr;
12782 unsigned int size, vec_flags;
12784 /* Check all addresses are Pmode - including ILP32. */
12785 if (GET_MODE (x) != Pmode
12786 && (!CONST_INT_P (x)
12787 || trunc_int_for_mode (INTVAL (x), Pmode) != INTVAL (x)))
12789 output_operand_lossage ("invalid address mode");
12790 return false;
12793 const bool load_store_pair_p = (type == ADDR_QUERY_LDP_STP
12794 || type == ADDR_QUERY_LDP_STP_N);
12796 if (aarch64_classify_address (&addr, x, mode, true, type))
12797 switch (addr.type)
12799 case ADDRESS_REG_IMM:
12800 if (known_eq (addr.const_offset, 0))
12802 asm_fprintf (f, "[%s]", reg_names[REGNO (addr.base)]);
12803 return true;
12806 vec_flags = aarch64_classify_vector_memory_mode (mode);
12807 if ((vec_flags & VEC_ANY_SVE) && !load_store_pair_p)
12809 HOST_WIDE_INT vnum
12810 = exact_div (addr.const_offset,
12811 aarch64_vl_bytes (mode, vec_flags)).to_constant ();
12812 asm_fprintf (f, "[%s, #%wd, mul vl]",
12813 reg_names[REGNO (addr.base)], vnum);
12814 return true;
12817 if (!CONST_INT_P (addr.offset))
12818 return false;
12820 asm_fprintf (f, "[%s, %wd]", reg_names[REGNO (addr.base)],
12821 INTVAL (addr.offset));
12822 return true;
12824 case ADDRESS_REG_REG:
12825 if (addr.shift == 0)
12826 asm_fprintf (f, "[%s, %s]", reg_names [REGNO (addr.base)],
12827 reg_names [REGNO (addr.offset)]);
12828 else
12829 asm_fprintf (f, "[%s, %s, lsl %u]", reg_names [REGNO (addr.base)],
12830 reg_names [REGNO (addr.offset)], addr.shift);
12831 return true;
12833 case ADDRESS_REG_UXTW:
12834 if (addr.shift == 0)
12835 asm_fprintf (f, "[%s, w%d, uxtw]", reg_names [REGNO (addr.base)],
12836 REGNO (addr.offset) - R0_REGNUM);
12837 else
12838 asm_fprintf (f, "[%s, w%d, uxtw %u]", reg_names [REGNO (addr.base)],
12839 REGNO (addr.offset) - R0_REGNUM, addr.shift);
12840 return true;
12842 case ADDRESS_REG_SXTW:
12843 if (addr.shift == 0)
12844 asm_fprintf (f, "[%s, w%d, sxtw]", reg_names [REGNO (addr.base)],
12845 REGNO (addr.offset) - R0_REGNUM);
12846 else
12847 asm_fprintf (f, "[%s, w%d, sxtw %u]", reg_names [REGNO (addr.base)],
12848 REGNO (addr.offset) - R0_REGNUM, addr.shift);
12849 return true;
12851 case ADDRESS_REG_WB:
12852 /* Writeback is only supported for fixed-width modes. */
12853 size = GET_MODE_SIZE (mode).to_constant ();
12854 switch (GET_CODE (x))
12856 case PRE_INC:
12857 asm_fprintf (f, "[%s, %d]!", reg_names [REGNO (addr.base)], size);
12858 return true;
12859 case POST_INC:
12860 asm_fprintf (f, "[%s], %d", reg_names [REGNO (addr.base)], size);
12861 return true;
12862 case PRE_DEC:
12863 asm_fprintf (f, "[%s, -%d]!", reg_names [REGNO (addr.base)], size);
12864 return true;
12865 case POST_DEC:
12866 asm_fprintf (f, "[%s], -%d", reg_names [REGNO (addr.base)], size);
12867 return true;
12868 case PRE_MODIFY:
12869 asm_fprintf (f, "[%s, %wd]!", reg_names[REGNO (addr.base)],
12870 INTVAL (addr.offset));
12871 return true;
12872 case POST_MODIFY:
12873 asm_fprintf (f, "[%s], %wd", reg_names[REGNO (addr.base)],
12874 INTVAL (addr.offset));
12875 return true;
12876 default:
12877 break;
12879 break;
12881 case ADDRESS_LO_SUM:
12882 asm_fprintf (f, "[%s, #:lo12:", reg_names [REGNO (addr.base)]);
12883 output_addr_const (f, addr.offset);
12884 asm_fprintf (f, "]");
12885 return true;
12887 case ADDRESS_SYMBOLIC:
12888 output_addr_const (f, x);
12889 return true;
12892 return false;
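/* Some example renderings of the cases above (x0/x1 illustrative):
   a register base with zero offset prints as "[x0]"; a base plus
   immediate as "[x0, 16]"; an SVE base plus two vector lengths as
   "[x0, #2, mul vl]"; a scaled register index as "[x0, x1, lsl 3]";
   and a pre-modify writeback as "[x0, 32]!". */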
12895 /* Print address 'x' of a memory access with mode 'mode'. */
12896 static void
12897 aarch64_print_operand_address (FILE *f, machine_mode mode, rtx x)
12899 if (!aarch64_print_address_internal (f, mode, x, ADDR_QUERY_ANY))
12900 output_addr_const (f, x);
12903 /* Implement TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA. */
12905 static bool
12906 aarch64_output_addr_const_extra (FILE *file, rtx x)
12908 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_SALT_ADDR)
12910 output_addr_const (file, XVECEXP (x, 0, 0));
12911 return true;
12913 return false;
12916 bool
12917 aarch64_label_mentioned_p (rtx x)
12919 const char *fmt;
12920 int i;
12922 if (LABEL_REF_P (x))
12923 return true;
12925 /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
12926 referencing instruction, but they are constant offsets, not
12927 symbols. */
12928 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
12929 return false;
12931 fmt = GET_RTX_FORMAT (GET_CODE (x));
12932 for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
12934 if (fmt[i] == 'E')
12936 int j;
12938 for (j = XVECLEN (x, i) - 1; j >= 0; j--)
12939 if (aarch64_label_mentioned_p (XVECEXP (x, i, j)))
12940 return 1;
12942 else if (fmt[i] == 'e' && aarch64_label_mentioned_p (XEXP (x, i)))
12943 return 1;
12946 return 0;
12949 /* Implement REGNO_REG_CLASS. */
12951 enum reg_class
12952 aarch64_regno_regclass (unsigned regno)
12954 if (W8_W11_REGNUM_P (regno))
12955 return W8_W11_REGS;
12957 if (W12_W15_REGNUM_P (regno))
12958 return W12_W15_REGS;
12960 if (STUB_REGNUM_P (regno))
12961 return STUB_REGS;
12963 if (GP_REGNUM_P (regno))
12964 return GENERAL_REGS;
12966 if (regno == SP_REGNUM)
12967 return STACK_REG;
12969 if (regno == FRAME_POINTER_REGNUM
12970 || regno == ARG_POINTER_REGNUM)
12971 return POINTER_REGS;
12973 if (FP_REGNUM_P (regno))
12974 return (FP_LO8_REGNUM_P (regno) ? FP_LO8_REGS
12975 : FP_LO_REGNUM_P (regno) ? FP_LO_REGS : FP_REGS);
12977 if (PR_REGNUM_P (regno))
12978 return PR_LO_REGNUM_P (regno) ? PR_LO_REGS : PR_HI_REGS;
12980 if (regno == FPM_REGNUM)
12981 return MOVEABLE_SYSREGS;
12983 if (regno == FFR_REGNUM || regno == FFRT_REGNUM)
12984 return FFR_REGS;
12986 if (FAKE_REGNUM_P (regno))
12987 return FAKE_REGS;
12989 return NO_REGS;
12992 /* OFFSET is an address offset for mode MODE, which has SIZE bytes.
12993 If OFFSET is out of range, return an offset of an anchor point
12994 that is in range. Return 0 otherwise. */
12996 static HOST_WIDE_INT
12997 aarch64_anchor_offset (HOST_WIDE_INT offset, HOST_WIDE_INT size,
12998 machine_mode mode)
13000 /* Does it look like we'll need a 16-byte load/store-pair operation? */
13001 if (size > 16)
13002 return (offset + 0x400) & ~0x7f0;
13004 /* For offsets that aren't a multiple of the access size, the limit is
13005 -256...255. */
13006 if (offset & (size - 1))
13008 /* BLKmode typically uses LDP of X-registers. */
13009 if (mode == BLKmode)
13010 return (offset + 512) & ~0x3ff;
13011 return (offset + 0x100) & ~0x1ff;
13014 /* Small negative offsets are supported. */
13015 if (IN_RANGE (offset, -256, 0))
13016 return 0;
13018 if (mode == TImode || mode == TFmode || mode == TDmode)
13019 return (offset + 0x100) & ~0x1ff;
13021 /* Use a 12-bit offset, scaled by the access size. */
13022 return offset & (~0xfff * size);
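/* A worked example: for a DImode access (SIZE == 8) at OFFSET 40000,
   the offset is a multiple of the access size, is not small and
   negative, and the mode is not a 16-byte one, so the result is
   40000 & ~0x7fff == 32768.  The caller then rebases the address on an
   anchor at +32768, leaving a residual offset of 7232 == 904 * 8,
   which fits the scaled unsigned 12-bit immediate range of a plain
   LDR/STR. */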
13025 static rtx
13026 aarch64_legitimize_address (rtx x, rtx /* orig_x */, machine_mode mode)
13028 #if TARGET_PECOFF
13029 rtx tmp = legitimize_pe_coff_symbol (x, true);
13030 if (tmp)
13031 return tmp;
13032 #endif
13034 /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
13035 where mask is selected by alignment and size of the offset.
13036 We try to pick as large a range for the offset as possible to
13037 maximize the chance of a CSE. However, for aligned addresses
13038 we limit the range to 4k so that structures with different sized
13039 elements are likely to use the same base. We need to be careful
13040 not to split a CONST for some forms of address expression, otherwise
13041 it will generate sub-optimal code. */
13043 /* First split X + CONST (base, offset) into (base + X) + offset. */
13044 if (GET_CODE (x) == PLUS && GET_CODE (XEXP (x, 1)) == CONST)
13046 poly_int64 offset;
13047 rtx base = strip_offset (XEXP (x, 1), &offset);
13049 base = expand_binop (Pmode, add_optab, base, XEXP (x, 0),
13050 NULL_RTX, true, OPTAB_DIRECT);
13051 x = plus_constant (Pmode, base, offset);
13054 if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1)))
13056 rtx base = XEXP (x, 0);
13057 rtx offset_rtx = XEXP (x, 1);
13058 HOST_WIDE_INT offset = INTVAL (offset_rtx);
13060 if (GET_CODE (base) == PLUS)
13062 rtx op0 = XEXP (base, 0);
13063 rtx op1 = XEXP (base, 1);
13065 /* Force any scaling into a temp for CSE. */
13066 op0 = force_reg (Pmode, op0);
13067 op1 = force_reg (Pmode, op1);
13069 /* Let the pointer register be in op0. */
13070 if (REG_POINTER (op1))
13071 std::swap (op0, op1);
13073 /* If the pointer is virtual or frame related, then we know that
13074 virtual register instantiation or register elimination is going
13075 to apply a second constant. We want the two constants folded
13076 together easily. Therefore, emit as (OP0 + CONST) + OP1. */
13077 if (virt_or_elim_regno_p (REGNO (op0)))
13079 base = expand_binop (Pmode, add_optab, op0, offset_rtx,
13080 NULL_RTX, true, OPTAB_DIRECT);
13081 return gen_rtx_PLUS (Pmode, base, op1);
13084 /* Otherwise, in order to encourage CSE (and thence loop strength
13085 reduce) scaled addresses, emit as (OP0 + OP1) + CONST. */
13086 base = expand_binop (Pmode, add_optab, op0, op1,
13087 NULL_RTX, true, OPTAB_DIRECT);
13088 x = gen_rtx_PLUS (Pmode, base, offset_rtx);
13091 HOST_WIDE_INT size;
13092 if (GET_MODE_SIZE (mode).is_constant (&size))
13094 HOST_WIDE_INT base_offset = aarch64_anchor_offset (offset, size,
13095 mode);
13096 if (base_offset != 0)
13098 base = plus_constant (Pmode, base, base_offset);
13099 base = force_operand (base, NULL_RTX);
13100 return plus_constant (Pmode, base, offset - base_offset);
13105 return x;
13108 static reg_class_t
13109 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x,
13110 reg_class_t rclass,
13111 machine_mode mode,
13112 secondary_reload_info *sri)
13114 /* Use aarch64_sve_reload_mem for SVE memory reloads that cannot use
13115 LDR and STR. See the comment at the head of aarch64-sve.md for
13116 more details about the big-endian handling. */
13117 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
13118 if (reg_class_subset_p (rclass, FP_REGS)
13119 && !((REG_P (x) && HARD_REGISTER_P (x))
13120 || aarch64_simd_valid_mov_imm (x))
13121 && mode != VNx16QImode
13122 && (vec_flags & VEC_SVE_DATA)
13123 && ((vec_flags & VEC_PARTIAL) || BYTES_BIG_ENDIAN))
13125 sri->icode = CODE_FOR_aarch64_sve_reload_mem;
13126 return NO_REGS;
13129 /* If we have to disable direct literal pool loads and stores because the
13130 function is too big, then we need a scratch register. */
13131 if (MEM_P (x) && SYMBOL_REF_P (x) && CONSTANT_POOL_ADDRESS_P (x)
13132 && (SCALAR_FLOAT_MODE_P (GET_MODE (x))
13133 || targetm.vector_mode_supported_p (GET_MODE (x)))
13134 && !aarch64_pcrelative_literal_loads)
13136 sri->icode = code_for_aarch64_reload_movcp (mode, DImode);
13137 return NO_REGS;
13140 /* Without the TARGET_SIMD or TARGET_SVE instructions we cannot move a
13141 Q register to a Q register directly. We need a scratch. */
13142 if (REG_P (x)
13143 && (mode == TFmode
13144 || mode == TImode
13145 || mode == TDmode
13146 || (vec_flags == VEC_ADVSIMD && known_eq (GET_MODE_SIZE (mode), 16)))
13147 && mode == GET_MODE (x)
13148 && !TARGET_SIMD
13149 && FP_REGNUM_P (REGNO (x))
13150 && reg_class_subset_p (rclass, FP_REGS))
13152 sri->icode = code_for_aarch64_reload_mov (mode);
13153 return NO_REGS;
13156 /* A TFmode, TImode or TDmode memory access should be handled via an FP_REGS
13157 because AArch64 has richer addressing modes for LDR/STR instructions
13158 than LDP/STP instructions. */
13159 if (TARGET_FLOAT && rclass == GENERAL_REGS
13160 && known_eq (GET_MODE_SIZE (mode), 16) && MEM_P (x))
13161 return FP_REGS;
13163 if (rclass == FP_REGS
13164 && (mode == TImode || mode == TFmode || mode == TDmode)
13165 && CONSTANT_P (x))
13166 return GENERAL_REGS;
13168 return NO_REGS;
13171 /* Implement TARGET_SECONDARY_MEMORY_NEEDED. */
13173 static bool
13174 aarch64_secondary_memory_needed (machine_mode mode, reg_class_t class1,
13175 reg_class_t class2)
13177 if (!TARGET_SIMD
13178 && reg_classes_intersect_p (class1, FP_REGS)
13179 && reg_classes_intersect_p (class2, FP_REGS))
13181 /* We can't do a 128-bit FPR-to-FPR move without TARGET_SIMD,
13182 so we can't easily split a move involving tuples of 128-bit
13183 vectors. Force the copy through memory instead.
13185 (Tuples of 64-bit vectors are fine.) */
13186 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
13187 if (vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
13188 return true;
13190 return false;
13193 /* Implement TARGET_FRAME_POINTER_REQUIRED. */
13195 static bool
13196 aarch64_frame_pointer_required ()
13198 /* If the function needs to record the incoming value of PSTATE.SM,
13199 make sure that the slot is accessible from the frame pointer. */
13200 return aarch64_need_old_pstate_sm ();
13203 static bool
13204 aarch64_can_eliminate (const int from ATTRIBUTE_UNUSED, const int to)
13206 gcc_assert (from == ARG_POINTER_REGNUM || from == FRAME_POINTER_REGNUM);
13208 /* If we need a frame pointer, ARG_POINTER_REGNUM and FRAME_POINTER_REGNUM
13209 can only eliminate to HARD_FRAME_POINTER_REGNUM. */
13210 if (frame_pointer_needed)
13211 return to == HARD_FRAME_POINTER_REGNUM;
13212 return true;
13215 poly_int64
13216 aarch64_initial_elimination_offset (unsigned from, unsigned to)
13218 aarch64_frame &frame = cfun->machine->frame;
13220 if (to == HARD_FRAME_POINTER_REGNUM)
13222 if (from == ARG_POINTER_REGNUM)
13223 return frame.bytes_above_hard_fp;
13225 if (from == FRAME_POINTER_REGNUM)
13226 return frame.bytes_above_hard_fp - frame.bytes_above_locals;
13229 if (to == STACK_POINTER_REGNUM)
13231 if (from == FRAME_POINTER_REGNUM)
13232 return frame.frame_size - frame.bytes_above_locals;
13235 return frame.frame_size;
13239 /* Get return address without mangling. */
13242 aarch64_return_addr_rtx (void)
13244 rtx val = get_hard_reg_initial_val (Pmode, LR_REGNUM);
13245 /* Note: aarch64_return_address_signing_enabled only
13246 works after cfun->machine->frame.laid_out is set,
13247 so here we don't know if the return address will
13248 be signed or not. */
13249 rtx lr = gen_rtx_REG (Pmode, LR_REGNUM);
13250 emit_move_insn (lr, val);
13251 emit_insn (GEN_FCN (CODE_FOR_xpaclri) ());
13252 return lr;
13256 /* Implement RETURN_ADDR_RTX. We do not support moving back to a
13257 previous frame. */
13260 aarch64_return_addr (int count, rtx frame ATTRIBUTE_UNUSED)
13262 if (count != 0)
13263 return const0_rtx;
13264 return aarch64_return_addr_rtx ();
13267 static void
13268 aarch64_asm_trampoline_template (FILE *f)
13270 /* Even if the current function doesn't have branch protection, some
13271 later function might, so since this template is only generated once
13272 we have to add a BTI just in case. */
13273 asm_fprintf (f, "\thint\t34 // bti c\n");
13275 if (TARGET_ILP32)
13277 asm_fprintf (f, "\tldr\tw%d, .+20\n", IP1_REGNUM - R0_REGNUM);
13278 asm_fprintf (f, "\tldr\tw%d, .+20\n", STATIC_CHAIN_REGNUM - R0_REGNUM);
13280 else
13282 asm_fprintf (f, "\tldr\t%s, .+20\n", reg_names [IP1_REGNUM]);
13283 asm_fprintf (f, "\tldr\t%s, .+24\n", reg_names [STATIC_CHAIN_REGNUM]);
13285 asm_fprintf (f, "\tbr\t%s\n", reg_names [IP1_REGNUM]);
13287 /* We always emit a speculation barrier.
13288 This is because the same trampoline template is used for every nested
13289 function. Since nested functions are not particularly common or
13290 performant we don't worry too much about the extra instructions to copy
13291 around.
13292 This is not yet a problem, since we have not yet implemented function
13293 specific attributes to choose between hardening against straight line
13294 speculation or not, but such function specific attributes are likely to
13295 happen in the future. */
13296 asm_fprintf (f, "\tdsb\tsy\n\tisb\n");
13298 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
13299 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
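/* With the usual register assignments (IP1 == x17, static chain == x18),
   the non-ILP32 template laid out above is roughly:

	0:  hint	34		// bti c
	4:  ldr	x17, .+20	// loads the data word at offset 24
	8:  ldr	x18, .+24	// loads the data word at offset 32
	12: br	x17
	16: dsb	sy
	20: isb
	24: <function address>	// patched by aarch64_trampoline_init
	32: <static chain value>

   which is why aarch64_trampoline_init below copies only the first
   24 bytes of code and writes the two pointer-sized data words
   itself. */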
13302 static void
13303 aarch64_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
13305 rtx fnaddr, mem, a_tramp;
13306 const int tramp_code_sz = 24;
13308 /* We don't need to copy the trailing D-words; we fill those in below. */
13309 /* We create our own memory address in Pmode so that `emit_block_move` can
13310 use parts of the backend which expect Pmode addresses. */
13311 rtx temp = convert_memory_address (Pmode, XEXP (m_tramp, 0));
13312 emit_block_move (gen_rtx_MEM (BLKmode, temp),
13313 assemble_trampoline_template (),
13314 GEN_INT (tramp_code_sz), BLOCK_OP_NORMAL);
13315 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz);
13316 fnaddr = XEXP (DECL_RTL (fndecl), 0);
13317 if (GET_MODE (fnaddr) != ptr_mode)
13318 fnaddr = convert_memory_address (ptr_mode, fnaddr);
13319 emit_move_insn (mem, fnaddr);
13321 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz + POINTER_BYTES);
13322 emit_move_insn (mem, chain_value);
13324 /* XXX We should really define a "clear_cache" pattern and use
13325 gen_clear_cache(). */
13326 a_tramp = XEXP (m_tramp, 0);
13327 maybe_emit_call_builtin___clear_cache (a_tramp,
13328 plus_constant (ptr_mode,
13329 a_tramp,
13330 TRAMPOLINE_SIZE));
13333 static unsigned char
13334 aarch64_class_max_nregs (reg_class_t regclass, machine_mode mode)
13336 /* ??? Logically we should only need to provide a value when
13337 HARD_REGNO_MODE_OK says that at least one register in REGCLASS
13338 can hold MODE, but at the moment we need to handle all modes.
13339 Just ignore any runtime parts for registers that can't store them. */
13340 HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
13341 unsigned int nregs, vec_flags;
13342 switch (regclass)
13344 case W8_W11_REGS:
13345 case W12_W15_REGS:
13346 case STUB_REGS:
13347 case TAILCALL_ADDR_REGS:
13348 case POINTER_REGS:
13349 case GENERAL_REGS:
13350 case ALL_REGS:
13351 case POINTER_AND_FP_REGS:
13352 case FP_REGS:
13353 case FP_LO_REGS:
13354 case FP_LO8_REGS:
13355 vec_flags = aarch64_classify_vector_mode (mode);
13356 if ((vec_flags & VEC_SVE_DATA)
13357 && constant_multiple_p (GET_MODE_SIZE (mode),
13358 aarch64_vl_bytes (mode, vec_flags), &nregs))
13359 return nregs;
13360 if (vec_flags == (VEC_ADVSIMD | VEC_STRUCT | VEC_PARTIAL))
13361 return GET_MODE_SIZE (mode).to_constant () / 8;
13362 return (vec_flags & VEC_ADVSIMD
13363 ? CEIL (lowest_size, UNITS_PER_VREG)
13364 : CEIL (lowest_size, UNITS_PER_WORD));
13366 case PR_REGS:
13367 case PR_LO_REGS:
13368 case PR_HI_REGS:
13369 return mode == VNx64BImode ? 4 : mode == VNx32BImode ? 2 : 1;
13371 case MOVEABLE_SYSREGS:
13372 case STACK_REG:
13373 case FFR_REGS:
13374 case PR_AND_FFR_REGS:
13375 case FAKE_REGS:
13376 return 1;
13378 case NO_REGS:
13379 return 0;
13381 default:
13382 break;
13384 gcc_unreachable ();
13387 static reg_class_t
13388 aarch64_preferred_reload_class (rtx x, reg_class_t regclass)
13390 if (regclass == POINTER_REGS)
13391 return GENERAL_REGS;
13393 if (regclass == STACK_REG)
13395 if (REG_P (x)
13396 && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x)), POINTER_REGS))
13397 return regclass;
13399 return NO_REGS;
13402 /* Register elimination can result in a request for
13403 SP+constant->FP_REGS. We cannot support such operations, which
13404 use SP as the source and an FP_REG as the destination, so reject
13405 them outright. */
13406 if (! reg_class_subset_p (regclass, GENERAL_REGS) && GET_CODE (x) == PLUS)
13408 rtx lhs = XEXP (x, 0);
13410 /* Look through a possible SUBREG introduced by ILP32. */
13411 if (SUBREG_P (lhs))
13412 lhs = SUBREG_REG (lhs);
13414 gcc_assert (REG_P (lhs));
13415 gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs)),
13416 POINTER_REGS));
13417 return NO_REGS;
13420 return regclass;
13423 void
13424 aarch64_asm_output_labelref (FILE* f, const char *name)
13426 asm_fprintf (f, "%U%s", name);
13429 static void
13430 aarch64_elf_asm_constructor (rtx symbol, int priority)
13432 if (priority == DEFAULT_INIT_PRIORITY)
13433 default_ctor_section_asm_out_constructor (symbol, priority);
13434 else
13436 section *s;
13437 /* Priority is known to be in the range [0, 65535], so 18 bytes
13438 would be enough, but the compiler might not know that. To avoid
13439 a -Wformat-truncation false positive, use a larger size. */
13440 char buf[23];
13441 snprintf (buf, sizeof (buf), ".init_array.%.5u", priority);
13442 s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
13443 switch_to_section (s);
13444 assemble_align (POINTER_SIZE);
13445 assemble_aligned_integer (POINTER_BYTES, symbol);
13449 static void
13450 aarch64_elf_asm_destructor (rtx symbol, int priority)
13452 if (priority == DEFAULT_INIT_PRIORITY)
13453 default_dtor_section_asm_out_destructor (symbol, priority);
13454 else
13456 section *s;
13457 /* Priority is known to be in the range [0, 65535], so 18 bytes
13458 would be enough, but the compiler might not know that. To avoid
13459 a -Wformat-truncation false positive, use a larger size. */
13460 char buf[23];
13461 snprintf (buf, sizeof (buf), ".fini_array.%.5u", priority);
13462 s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
13463 switch_to_section (s);
13464 assemble_align (POINTER_SIZE);
13465 assemble_aligned_integer (POINTER_BYTES, symbol);
13469 const char*
13470 aarch64_output_casesi (rtx *operands)
13472 char buf[100];
13473 char label[100];
13474 rtx diff_vec = PATTERN (NEXT_INSN (as_a <rtx_insn *> (operands[2])));
13475 int index;
13476 static const char *const patterns[4][2] =
13479 "ldrb\t%w3, [%0,%w1,uxtw]",
13480 "add\t%3, %4, %w3, sxtb #2"
13483 "ldrh\t%w3, [%0,%w1,uxtw #1]",
13484 "add\t%3, %4, %w3, sxth #2"
13487 "ldr\t%w3, [%0,%w1,uxtw #2]",
13488 "add\t%3, %4, %w3, sxtw #2"
13490 /* We assume that DImode is only generated when not optimizing and
13491 that we don't really need 64-bit address offsets. That would
13492 imply an object file with 8GB of code in a single function! */
13494 "ldr\t%w3, [%0,%w1,uxtw #2]",
13495 "add\t%3, %4, %w3, sxtw #2"
13499 gcc_assert (GET_CODE (diff_vec) == ADDR_DIFF_VEC);
13501 scalar_int_mode mode = as_a <scalar_int_mode> (GET_MODE (diff_vec));
13502 index = exact_log2 (GET_MODE_SIZE (mode));
13504 gcc_assert (index >= 0 && index <= 3);
13506 /* Need to implement table size reduction, by changing the code below. */
13507 output_asm_insn (patterns[index][0], operands);
13508 ASM_GENERATE_INTERNAL_LABEL (label, "Lrtx", CODE_LABEL_NUMBER (operands[2]));
13509 snprintf (buf, sizeof (buf),
13510 "adr\t%%4, %s", targetm.strip_name_encoding (label));
13511 output_asm_insn (buf, operands);
13512 output_asm_insn (patterns[index][1], operands);
13513 output_asm_insn ("br\t%3", operands);
13514 output_asm_insn (aarch64_sls_barrier (aarch64_harden_sls_retbr_p ()),
13515 operands);
13516 assemble_label (asm_out_file, label);
13517 return "";
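/* For a HImode dispatch table (index 1 above), and assuming the operands
   land in x0/w1 with x3/x4 as scratch registers, the emitted sequence is
   along the lines of:

	ldrh	w3, [x0, w1, uxtw #1]
	adr	x4, .Lrtx<N>
	add	x3, x4, w3, sxth #2
	br	x3
	<optional SLS speculation barrier>
   .Lrtx<N>:

   where the loaded table entry is scaled by 4 via the #2 shift before
   being added to the address of the table label. */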
13520 /* Return the asm string for an SME ZERO instruction whose 8-bit mask
13521 operand is MASK. */
13522 const char *
13523 aarch64_output_sme_zero_za (rtx mask)
13525 auto mask_val = UINTVAL (mask);
13526 if (mask_val == 0)
13527 return "zero\t{}";
13529 if (mask_val == 0xff)
13530 return "zero\t{ za }";
13532 static constexpr struct { unsigned char mask; char letter; } tiles[] = {
13533 { 0xff, 'b' },
13534 { 0x55, 'h' },
13535 { 0x11, 's' },
13536 { 0x01, 'd' }
13538 /* The last entry in the list has the form "za7.d }", but that's the
13539 same length as "za7.d, ". */
13540 static char buffer[sizeof("zero\t{ ") + sizeof ("za7.d, ") * 8 + 1];
13541 for (auto &tile : tiles)
13543 unsigned int tile_mask = tile.mask;
13544 unsigned int tile_index = 0;
13545 unsigned int i = snprintf (buffer, sizeof (buffer), "zero\t");
13546 const char *prefix = "{ ";
13547 auto remaining_mask = mask_val;
13548 while (tile_mask < 0x100)
13550 if ((remaining_mask & tile_mask) == tile_mask)
13552 i += snprintf (buffer + i, sizeof (buffer) - i, "%sza%d.%c",
13553 prefix, tile_index, tile.letter);
13554 prefix = ", ";
13555 remaining_mask &= ~tile_mask;
13557 tile_mask <<= 1;
13558 tile_index += 1;
13560 if (remaining_mask == 0)
13562 gcc_assert (i + 3 <= sizeof (buffer));
13563 snprintf (buffer + i, sizeof (buffer) - i, " }");
13564 return buffer;
13567 gcc_unreachable ();
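/* A couple of example expansions of the loop above: a MASK of 0x55
   matches the 'h' tile row exactly and gives "zero\t{ za0.h }", while
   0x05 falls through to the 'd' tiles and gives
   "zero\t{ za0.d, za2.d }". */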
13570 /* Return size in bits of an arithmetic operand which is shifted/scaled and
13571 masked such that it is suitable for a UXTB, UXTH, or UXTW extend
13572 operator. */
13575 aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
13577 if (shift >= 0 && shift <= 4)
13579 int size;
13580 for (size = 8; size <= 32; size *= 2)
13582 HOST_WIDE_INT bits = ((HOST_WIDE_INT)1U << size) - 1;
13583 if (mask == bits << shift)
13584 return size;
13587 return 0;
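/* For instance, aarch64_uxt_size (2, 0x3fc) returns 8, since
   0x3fc == 0xff << 2: the operand behaves as a UXTB extended register
   with an LSL of 2 in an ADD/SUB (extended register) instruction. */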
13590 /* Constant pools are per-function only when PC-relative
13591 literal loads are enabled or we are using the large memory
13592 model. */
13594 static inline bool
13595 aarch64_can_use_per_function_literal_pools_p (void)
13597 return (aarch64_pcrelative_literal_loads
13598 || aarch64_cmodel == AARCH64_CMODEL_LARGE);
13601 static bool
13602 aarch64_use_blocks_for_constant_p (machine_mode, const_rtx)
13604 /* We can't use blocks for constants when we're using a per-function
13605 constant pool. */
13606 return !aarch64_can_use_per_function_literal_pools_p ();
13609 /* Select appropriate section for constants depending
13610 on where we place literal pools. */
13612 static section *
13613 aarch64_select_rtx_section (machine_mode mode,
13614 rtx x,
13615 unsigned HOST_WIDE_INT align)
13617 if (aarch64_can_use_per_function_literal_pools_p ())
13618 return function_section (current_function_decl);
13620 return default_elf_select_rtx_section (mode, x, align);
13623 /* Implement ASM_OUTPUT_POOL_EPILOGUE. */
13624 void
13625 aarch64_asm_output_pool_epilogue (FILE *f, const char *, tree,
13626 HOST_WIDE_INT offset)
13628 /* When using per-function literal pools, we must ensure that any code
13629 section is aligned to the minimal instruction length, lest we get
13630 errors from the assembler re "unaligned instructions". */
13631 if ((offset & 3) && aarch64_can_use_per_function_literal_pools_p ())
13632 ASM_OUTPUT_ALIGN (f, 2);
13635 /* Costs. */
13637 /* Helper function for rtx cost calculation. Strip a shift expression
13638 from X. Returns the inner operand if successful, or the original
13639 expression on failure. */
13640 static rtx
13641 aarch64_strip_shift (rtx x)
13643 rtx op = x;
13645 /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
13646 we can convert both to ROR during final output. */
13647 if ((GET_CODE (op) == ASHIFT
13648 || GET_CODE (op) == ASHIFTRT
13649 || GET_CODE (op) == LSHIFTRT
13650 || GET_CODE (op) == ROTATERT
13651 || GET_CODE (op) == ROTATE)
13652 && CONST_INT_P (XEXP (op, 1)))
13653 return XEXP (op, 0);
13655 if (GET_CODE (op) == MULT
13656 && CONST_INT_P (XEXP (op, 1))
13657 && ((unsigned) exact_log2 (INTVAL (XEXP (op, 1)))) < 64)
13658 return XEXP (op, 0);
13660 return x;
13663 /* Helper function for rtx cost calculation. Strip an extend
13664 expression from X. Returns the inner operand if successful, or the
13665 original expression on failure. We deal with a number of possible
13666 canonicalization variations here. If STRIP_SHIFT is true, then
13667 we can strip off a shift also. */
13668 static rtx
13669 aarch64_strip_extend (rtx x, bool strip_shift)
13671 scalar_int_mode mode;
13672 rtx op = x;
13674 if (!is_a <scalar_int_mode> (GET_MODE (op), &mode))
13675 return op;
13677 if (GET_CODE (op) == AND
13678 && GET_CODE (XEXP (op, 0)) == MULT
13679 && CONST_INT_P (XEXP (XEXP (op, 0), 1))
13680 && CONST_INT_P (XEXP (op, 1))
13681 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op, 0), 1))),
13682 INTVAL (XEXP (op, 1))) != 0)
13683 return XEXP (XEXP (op, 0), 0);
13685 /* Now handle extended register, as this may also have an optional
13686 left shift by 1..4. */
13687 if (strip_shift
13688 && GET_CODE (op) == ASHIFT
13689 && CONST_INT_P (XEXP (op, 1))
13690 && ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) <= 4)
13691 op = XEXP (op, 0);
13693 if (GET_CODE (op) == ZERO_EXTEND
13694 || GET_CODE (op) == SIGN_EXTEND)
13695 op = XEXP (op, 0);
13697 if (op != x)
13698 return op;
13700 return x;
13703 /* Helper function for rtx cost calculation. Strip extension as well as any
13704 inner VEC_SELECT high-half from X. Returns the inner vector operand if
13705 successful, or the original expression on failure. */
13706 static rtx
13707 aarch64_strip_extend_vec_half (rtx x)
13709 if (GET_CODE (x) == ZERO_EXTEND || GET_CODE (x) == SIGN_EXTEND)
13711 x = XEXP (x, 0);
13712 if (GET_CODE (x) == VEC_SELECT
13713 && vec_series_highpart_p (GET_MODE (x), GET_MODE (XEXP (x, 0)),
13714 XEXP (x, 1)))
13715 x = XEXP (x, 0);
13717 return x;
13720 /* Helper function for rtx cost calculation. Strip VEC_DUPLICATE as well as
13721 any subsequent extend and VEC_SELECT from X. Returns the inner scalar
13722 operand if successful, or the original expression on failure. */
13723 static rtx
13724 aarch64_strip_duplicate_vec_elt (rtx x)
13726 if (GET_CODE (x) == VEC_DUPLICATE
13727 && is_a<scalar_mode> (GET_MODE (XEXP (x, 0))))
13729 x = XEXP (x, 0);
13730 if (GET_CODE (x) == VEC_SELECT)
13731 x = XEXP (x, 0);
13732 else if ((GET_CODE (x) == ZERO_EXTEND || GET_CODE (x) == SIGN_EXTEND)
13733 && GET_CODE (XEXP (x, 0)) == VEC_SELECT)
13734 x = XEXP (XEXP (x, 0), 0);
13736 return x;
13739 /* Return true iff CODE is a shift supported in combination
13740 with arithmetic instructions. */
13742 static bool
13743 aarch64_shift_p (enum rtx_code code)
13745 return code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT;
13749 /* Return true iff X is a cheap shift without a sign extend. */
13751 static bool
13752 aarch64_cheap_mult_shift_p (rtx x)
13754 rtx op0, op1;
13756 op0 = XEXP (x, 0);
13757 op1 = XEXP (x, 1);
13759 if (!(aarch64_tune_params.extra_tuning_flags
13760 & AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND))
13761 return false;
13763 if (GET_CODE (op0) == SIGN_EXTEND)
13764 return false;
13766 if (GET_CODE (x) == ASHIFT && CONST_INT_P (op1)
13767 && UINTVAL (op1) <= 4)
13768 return true;
13770 if (GET_CODE (x) != MULT || !CONST_INT_P (op1))
13771 return false;
13773 HOST_WIDE_INT l2 = exact_log2 (INTVAL (op1));
13775 if (l2 > 0 && l2 <= 4)
13776 return true;
13778 return false;
13781 /* Helper function for rtx cost calculation. Calculate the cost of
13782 a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
13783 Return the calculated cost of the expression, recursing manually into
13784 operands where needed. */
13786 static int
13787 aarch64_rtx_mult_cost (rtx x, enum rtx_code code, int outer, bool speed)
13789 rtx op0, op1;
13790 const struct cpu_cost_table *extra_cost
13791 = aarch64_tune_params.insn_extra_cost;
13792 int cost = 0;
13793 bool compound_p = (outer == PLUS || outer == MINUS);
13794 machine_mode mode = GET_MODE (x);
13796 gcc_checking_assert (code == MULT);
13798 op0 = XEXP (x, 0);
13799 op1 = XEXP (x, 1);
13801 if (VECTOR_MODE_P (mode))
13803 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
13804 if (TARGET_SIMD && (vec_flags & VEC_ADVSIMD))
13806 /* The select-operand-high-half versions of the instruction have the
13807 same cost as the three vector version - don't add the costs of the
13808 extension or selection into the costs of the multiply. */
13809 op0 = aarch64_strip_extend_vec_half (op0);
13810 op1 = aarch64_strip_extend_vec_half (op1);
13811 /* The by-element versions of the instruction have the same costs as
13812 the normal 3-vector version. We make an assumption that the input
13813 to the VEC_DUPLICATE is already on the FP & SIMD side. This means
13814 costing of a MUL by element pre RA is a bit optimistic. */
13815 op0 = aarch64_strip_duplicate_vec_elt (op0);
13816 op1 = aarch64_strip_duplicate_vec_elt (op1);
13818 cost += rtx_cost (op0, mode, MULT, 0, speed);
13819 cost += rtx_cost (op1, mode, MULT, 1, speed);
13820 if (speed)
13822 if (GET_CODE (x) == MULT)
13823 cost += extra_cost->vect.mult;
13824 /* This is to catch the SSRA costing currently flowing here. */
13825 else
13826 cost += extra_cost->vect.alu;
13828 return cost;
13831 /* Integer multiply/fma. */
13832 if (GET_MODE_CLASS (mode) == MODE_INT)
13834 /* The multiply will be canonicalized as a shift, so cost it as such. */
13835 if (aarch64_shift_p (GET_CODE (x))
13836 || (CONST_INT_P (op1)
13837 && exact_log2 (INTVAL (op1)) > 0))
13839 bool is_extend = GET_CODE (op0) == ZERO_EXTEND
13840 || GET_CODE (op0) == SIGN_EXTEND;
13841 if (speed)
13843 if (compound_p)
13845 /* If the shift is considered cheap,
13846 then don't add any cost. */
13847 if (aarch64_cheap_mult_shift_p (x))
13849 else if (REG_P (op1))
13850 /* ARITH + shift-by-register. */
13851 cost += extra_cost->alu.arith_shift_reg;
13852 else if (is_extend)
13853 /* ARITH + extended register. We don't have a cost field
13854 for ARITH+EXTEND+SHIFT, so use extend_arith here. */
13855 cost += extra_cost->alu.extend_arith;
13856 else
13857 /* ARITH + shift-by-immediate. */
13858 cost += extra_cost->alu.arith_shift;
13860 else
13861 /* LSL (immediate). */
13862 cost += extra_cost->alu.shift;
13865 /* Strip extends as we will have costed them in the case above. */
13866 if (is_extend)
13867 op0 = aarch64_strip_extend (op0, true);
13869 cost += rtx_cost (op0, VOIDmode, code, 0, speed);
13871 return cost;
13874 /* MNEG or [US]MNEGL. Extract the NEG operand and indicate that it's a
13875 compound and let the below cases handle it. After all, MNEG is a
13876 special-case alias of MSUB. */
13877 if (GET_CODE (op0) == NEG)
13879 op0 = XEXP (op0, 0);
13880 compound_p = true;
13883 /* Integer multiplies or FMAs have zero/sign extending variants. */
13884 if ((GET_CODE (op0) == ZERO_EXTEND
13885 && GET_CODE (op1) == ZERO_EXTEND)
13886 || (GET_CODE (op0) == SIGN_EXTEND
13887 && GET_CODE (op1) == SIGN_EXTEND))
13889 cost += rtx_cost (XEXP (op0, 0), VOIDmode, MULT, 0, speed);
13890 cost += rtx_cost (XEXP (op1, 0), VOIDmode, MULT, 1, speed);
13892 if (speed)
13894 if (compound_p)
13895 /* SMADDL/UMADDL/UMSUBL/SMSUBL. */
13896 cost += extra_cost->mult[0].extend_add;
13897 else
13898 /* MUL/SMULL/UMULL. */
13899 cost += extra_cost->mult[0].extend;
13902 return cost;
13905 /* This is either an integer multiply or a MADD. In both cases
13906 we want to recurse and cost the operands. */
13907 cost += rtx_cost (op0, mode, MULT, 0, speed);
13908 cost += rtx_cost (op1, mode, MULT, 1, speed);
13910 if (speed)
13912 if (compound_p)
13913 /* MADD/MSUB. */
13914 cost += extra_cost->mult[mode == DImode].add;
13915 else
13916 /* MUL. */
13917 cost += extra_cost->mult[mode == DImode].simple;
13920 return cost;
13922 else
13924 if (speed)
13926 /* Floating-point FMA/FMUL can also support negations of the
13927 operands, unless the rounding mode is upward or downward in
13928 which case FNMUL is different than FMUL with operand negation. */
13929 bool neg0 = GET_CODE (op0) == NEG;
13930 bool neg1 = GET_CODE (op1) == NEG;
13931 if (compound_p || !flag_rounding_math || (neg0 && neg1))
13933 if (neg0)
13934 op0 = XEXP (op0, 0);
13935 if (neg1)
13936 op1 = XEXP (op1, 0);
13939 if (compound_p)
13940 /* FMADD/FNMADD/FNMSUB/FMSUB. */
13941 cost += extra_cost->fp[mode == DFmode].fma;
13942 else
13943 /* FMUL/FNMUL. */
13944 cost += extra_cost->fp[mode == DFmode].mult;
13947 cost += rtx_cost (op0, mode, MULT, 0, speed);
13948 cost += rtx_cost (op1, mode, MULT, 1, speed);
13949 return cost;
13953 static int
13954 aarch64_address_cost (rtx x,
13955 machine_mode mode,
13956 addr_space_t as ATTRIBUTE_UNUSED,
13957 bool speed)
13959 enum rtx_code c = GET_CODE (x);
13960 const struct cpu_addrcost_table *addr_cost = aarch64_tune_params.addr_cost;
13961 struct aarch64_address_info info;
13962 int cost = 0;
13963 info.shift = 0;
13965 if (!aarch64_classify_address (&info, x, mode, false))
13967 if (GET_CODE (x) == CONST || SYMBOL_REF_P (x))
13969 /* This is a CONST or SYMBOL ref which will be split
13970 in a different way depending on the code model in use.
13971 Cost it through the generic infrastructure. */
13972 int cost_symbol_ref = rtx_cost (x, Pmode, MEM, 1, speed);
13973 /* Divide through by the cost of one instruction to
13974 bring it to the same units as the address costs. */
13975 cost_symbol_ref /= COSTS_N_INSNS (1);
13976 /* The cost is then the cost of preparing the address,
13977 followed by an immediate (possibly 0) offset. */
13978 return cost_symbol_ref + addr_cost->imm_offset;
13980 else
13982 /* This is most likely a jump table from a case
13983 statement. */
13984 return addr_cost->register_offset;
13988 switch (info.type)
13990 case ADDRESS_LO_SUM:
13991 case ADDRESS_SYMBOLIC:
13992 case ADDRESS_REG_IMM:
13993 cost += addr_cost->imm_offset;
13994 break;
13996 case ADDRESS_REG_WB:
13997 if (c == PRE_INC || c == PRE_DEC || c == PRE_MODIFY)
13998 cost += addr_cost->pre_modify;
13999 else if (c == POST_INC || c == POST_DEC || c == POST_MODIFY)
14001 unsigned int nvectors = aarch64_ldn_stn_vectors (mode);
14002 if (nvectors == 3)
14003 cost += addr_cost->post_modify_ld3_st3;
14004 else if (nvectors == 4)
14005 cost += addr_cost->post_modify_ld4_st4;
14006 else
14007 cost += addr_cost->post_modify;
14009 else
14010 gcc_unreachable ();
14012 break;
14014 case ADDRESS_REG_REG:
14015 cost += addr_cost->register_offset;
14016 break;
14018 case ADDRESS_REG_SXTW:
14019 cost += addr_cost->register_sextend;
14020 break;
14022 case ADDRESS_REG_UXTW:
14023 cost += addr_cost->register_zextend;
14024 break;
14026 default:
14027 gcc_unreachable ();
14031 if (info.shift > 0)
14033 /* For the sake of calculating the cost of the shifted register
14034 component, we can treat same sized modes in the same way. */
14035 if (known_eq (GET_MODE_BITSIZE (mode), 16))
14036 cost += addr_cost->addr_scale_costs.hi;
14037 else if (known_eq (GET_MODE_BITSIZE (mode), 32))
14038 cost += addr_cost->addr_scale_costs.si;
14039 else if (known_eq (GET_MODE_BITSIZE (mode), 64))
14040 cost += addr_cost->addr_scale_costs.di;
14041 else
14042 /* We can't tell, or this is a 128-bit vector. */
14043 cost += addr_cost->addr_scale_costs.ti;
14046 return cost;
14049 /* Return the cost of a branch. If SPEED_P is true then the compiler is
14050 optimizing for speed. If PREDICTABLE_P is true then the branch is predicted
14051 to be taken. */
14054 aarch64_branch_cost (bool speed_p, bool predictable_p)
14056 /* When optimizing for speed, use the cost of unpredictable branches. */
14057 const struct cpu_branch_cost *branch_costs =
14058 aarch64_tune_params.branch_costs;
14060 if (!speed_p || predictable_p)
14061 return branch_costs->predictable;
14062 else
14063 return branch_costs->unpredictable;
14066 /* Return true if X is a zero or sign extract
14067 usable in an ADD or SUB (extended register) instruction. */
14068 static bool
14069 aarch64_rtx_arith_op_extract_p (rtx x)
14071 /* The simple case <ARITH>, XD, XN, XM, [us]xt.
14072 No shift. */
14073 if (GET_CODE (x) == SIGN_EXTEND
14074 || GET_CODE (x) == ZERO_EXTEND)
14075 return REG_P (XEXP (x, 0));
14077 return false;
14080 static bool
14081 aarch64_frint_unspec_p (unsigned int u)
14083 switch (u)
14085 case UNSPEC_FRINTZ:
14086 case UNSPEC_FRINTP:
14087 case UNSPEC_FRINTM:
14088 case UNSPEC_FRINTA:
14089 case UNSPEC_FRINTN:
14090 case UNSPEC_FRINTX:
14091 case UNSPEC_FRINTI:
14092 return true;
14094 default:
14095 return false;
14099 /* Return true iff X is an rtx that will match an extr instruction
14100 i.e. as described in the *extr<mode>5_insn family of patterns.
14101 *RES_OP0 and *RES_OP1 will be set to the operands of the shifts involved
14102 on success and will be NULL_RTX otherwise. */
14104 static bool
14105 aarch64_extr_rtx_p (rtx x, rtx *res_op0, rtx *res_op1)
14107 rtx op0, op1;
14108 scalar_int_mode mode;
14109 if (!is_a <scalar_int_mode> (GET_MODE (x), &mode))
14110 return false;
14112 *res_op0 = NULL_RTX;
14113 *res_op1 = NULL_RTX;
14115 if (GET_CODE (x) != IOR)
14116 return false;
14118 op0 = XEXP (x, 0);
14119 op1 = XEXP (x, 1);
14121 if ((GET_CODE (op0) == ASHIFT && GET_CODE (op1) == LSHIFTRT)
14122 || (GET_CODE (op1) == ASHIFT && GET_CODE (op0) == LSHIFTRT))
14124 /* Canonicalise locally to ashift in op0, lshiftrt in op1. */
14125 if (GET_CODE (op1) == ASHIFT)
14126 std::swap (op0, op1);
14128 if (!CONST_INT_P (XEXP (op0, 1)) || !CONST_INT_P (XEXP (op1, 1)))
14129 return false;
14131 unsigned HOST_WIDE_INT shft_amnt_0 = UINTVAL (XEXP (op0, 1));
14132 unsigned HOST_WIDE_INT shft_amnt_1 = UINTVAL (XEXP (op1, 1));
14134 if (shft_amnt_0 < GET_MODE_BITSIZE (mode)
14135 && shft_amnt_0 + shft_amnt_1 == GET_MODE_BITSIZE (mode))
14137 *res_op0 = XEXP (op0, 0);
14138 *res_op1 = XEXP (op1, 0);
14139 return true;
14143 return false;
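/* As an example, in DImode
     (ior:DI (ashift:DI (reg:DI x1) (const_int 48))
	     (lshiftrt:DI (reg:DI x2) (const_int 16)))
   satisfies the check above (48 + 16 == 64), with *RES_OP0 set to x1
   and *RES_OP1 set to x2; the pattern it feeds would typically be
   output as something like "extr x0, x1, x2, 16" (destination register
   illustrative only). */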
14146 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
14147 storing it in *COST. Result is true if the total cost of the operation
14148 has now been calculated. */
14149 static bool
14150 aarch64_if_then_else_costs (rtx op0, rtx op1, rtx op2, int *cost, bool speed)
14152 rtx inner;
14153 rtx comparator;
14154 enum rtx_code cmpcode;
14155 const struct cpu_cost_table *extra_cost
14156 = aarch64_tune_params.insn_extra_cost;
14158 if (COMPARISON_P (op0))
14160 inner = XEXP (op0, 0);
14161 comparator = XEXP (op0, 1);
14162 cmpcode = GET_CODE (op0);
14164 else
14166 inner = op0;
14167 comparator = const0_rtx;
14168 cmpcode = NE;
14171 if (GET_CODE (op1) == PC || GET_CODE (op2) == PC)
14173 /* Conditional branch. */
14174 if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
14175 return true;
14176 else
14178 if (cmpcode == NE || cmpcode == EQ)
14180 if (comparator == const0_rtx)
14182 /* TBZ/TBNZ/CBZ/CBNZ. */
14183 if (GET_CODE (inner) == ZERO_EXTRACT)
14184 /* TBZ/TBNZ. */
14185 *cost += rtx_cost (XEXP (inner, 0), VOIDmode,
14186 ZERO_EXTRACT, 0, speed);
14187 else
14188 /* CBZ/CBNZ. */
14189 *cost += rtx_cost (inner, VOIDmode, cmpcode, 0, speed);
14191 return true;
14193 if (register_operand (inner, VOIDmode)
14194 && aarch64_imm24 (comparator, VOIDmode))
14196 /* SUB and SUBS. */
14197 *cost += COSTS_N_INSNS (2);
14198 if (speed)
14199 *cost += extra_cost->alu.arith * 2;
14200 return true;
14203 else if (cmpcode == LT || cmpcode == GE)
14205 /* TBZ/TBNZ. */
14206 if (comparator == const0_rtx)
14207 return true;
14211 else if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
14213 /* CCMP. */
14214 if (GET_CODE (op1) == COMPARE)
14216 /* Increase cost of CCMP reg, 0, imm, CC to prefer CMP reg, 0. */
14217 if (XEXP (op1, 1) == const0_rtx)
14218 *cost += 1;
14219 if (speed)
14221 machine_mode mode = GET_MODE (XEXP (op1, 0));
14223 if (GET_MODE_CLASS (mode) == MODE_INT)
14224 *cost += extra_cost->alu.arith;
14225 else
14226 *cost += extra_cost->fp[mode == DFmode].compare;
14228 return true;
14231 /* It's a conditional operation based on the status flags,
14232 so it must be some flavor of CSEL. */
14234 /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL. */
14235 if (GET_CODE (op1) == NEG
14236 || GET_CODE (op1) == NOT
14237 || (GET_CODE (op1) == PLUS && XEXP (op1, 1) == const1_rtx))
14238 op1 = XEXP (op1, 0);
14239 else if (GET_CODE (op1) == ZERO_EXTEND && GET_CODE (op2) == ZERO_EXTEND)
14241 /* CSEL with zero-extension (*cmovdi_insn_uxtw). */
14242 op1 = XEXP (op1, 0);
14243 op2 = XEXP (op2, 0);
14245 else if (GET_CODE (op1) == ZERO_EXTEND && op2 == const0_rtx)
14247 inner = XEXP (op1, 0);
14248 if (GET_CODE (inner) == NEG || GET_CODE (inner) == NOT)
14249 /* CSINV/NEG with zero extend + const 0 (*csinv3_uxtw_insn3). */
14250 op1 = XEXP (inner, 0);
14252 else if (op1 == constm1_rtx || op1 == const1_rtx)
14254 /* Use CSINV or CSINC. */
14255 *cost += rtx_cost (op2, VOIDmode, IF_THEN_ELSE, 2, speed);
14256 return true;
14258 else if (op2 == constm1_rtx || op2 == const1_rtx)
14260 /* Use CSINV or CSINC. */
14261 *cost += rtx_cost (op1, VOIDmode, IF_THEN_ELSE, 1, speed);
14262 return true;
14265 *cost += rtx_cost (op1, VOIDmode, IF_THEN_ELSE, 1, speed);
14266 *cost += rtx_cost (op2, VOIDmode, IF_THEN_ELSE, 2, speed);
14267 return true;
14270 /* We don't know what this is, cost all operands. */
14271 return false;
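/* For reference, a C-level illustration of the conditional-select family
   whose costing is handled above.  These idioms typically map to a single
   CSEL/CSNEG/CSINV/CSINC (exact operand placement and condition inversion
   are decided by the aarch64.md patterns, not shown here); the *_demo names
   are made up for illustration.  */
static long csel_demo  (bool c, long a, long b) { return c ? a : b; }     /* CSEL  */
static long csneg_demo (bool c, long a, long b) { return c ? -a : b; }    /* CSNEG */
static long csinv_demo (bool c, long a, long b) { return c ? ~a : b; }    /* CSINV */
static long csinc_demo (bool c, long a, long b) { return c ? a + 1 : b; } /* CSINC */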
14274 /* Check whether X is a bitfield operation of the form shift + extend that
14275 maps down to a UBFIZ/SBFIZ/UBFX/SBFX instruction. If so, return the
14276 operand to which the bitfield operation is applied. Otherwise return
14277 NULL_RTX. */
14279 static rtx
14280 aarch64_extend_bitfield_pattern_p (rtx x)
14282 rtx_code outer_code = GET_CODE (x);
14283 machine_mode outer_mode = GET_MODE (x);
14285 if (outer_code != ZERO_EXTEND && outer_code != SIGN_EXTEND
14286 && outer_mode != SImode && outer_mode != DImode)
14287 return NULL_RTX;
14289 rtx inner = XEXP (x, 0);
14290 rtx_code inner_code = GET_CODE (inner);
14291 machine_mode inner_mode = GET_MODE (inner);
14292 rtx op = NULL_RTX;
14294 switch (inner_code)
14296 case ASHIFT:
14297 if (CONST_INT_P (XEXP (inner, 1))
14298 && (inner_mode == QImode || inner_mode == HImode))
14299 op = XEXP (inner, 0);
14300 break;
14301 case LSHIFTRT:
14302 if (outer_code == ZERO_EXTEND && CONST_INT_P (XEXP (inner, 1))
14303 && (inner_mode == QImode || inner_mode == HImode))
14304 op = XEXP (inner, 0);
14305 break;
14306 case ASHIFTRT:
14307 if (outer_code == SIGN_EXTEND && CONST_INT_P (XEXP (inner, 1))
14308 && (inner_mode == QImode || inner_mode == HImode))
14309 op = XEXP (inner, 0);
14310 break;
14311 default:
14312 break;
14315 return op;
14318 /* Return true if the mask and a shift amount from an RTX of the form
14319 (x << SHFT_AMNT) & MASK are valid to combine into a UBFIZ instruction of
14320 mode MODE. See the *andim_ashift<mode>_bfiz pattern. */
14322 bool
14323 aarch64_mask_and_shift_for_ubfiz_p (scalar_int_mode mode, rtx mask,
14324 rtx shft_amnt)
14326 return CONST_INT_P (mask) && CONST_INT_P (shft_amnt)
14327 && INTVAL (mask) > 0
14328 && UINTVAL (shft_amnt) < GET_MODE_BITSIZE (mode)
14329 && exact_log2 ((UINTVAL (mask) >> UINTVAL (shft_amnt)) + 1) >= 0
14330 && (UINTVAL (mask)
14331 & ((HOST_WIDE_INT_1U << UINTVAL (shft_amnt)) - 1)) == 0;
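/* A standalone restatement of the predicate above on plain 64-bit values;
   ubfiz_mask_shift_ok is a hypothetical name used only for illustration.
   For example, with a 32-bit mode, SHFT_AMNT = 8 and MASK = 0xff00:
   0xff00 >> 8 == 0xff, 0xff + 1 is a power of two, and no mask bit lies
   below bit 8, so the pair is valid for UBFIZ.  */
static bool
ubfiz_mask_shift_ok (unsigned long long mask, unsigned shift, unsigned bits)
{
  if (mask == 0 || shift >= bits)
    return false;
  /* The shifted-down mask must be a contiguous run of ones starting at
     bit 0, i.e. adding one to it yields a power of two.  */
  unsigned long long down = mask >> shift;
  bool contiguous = ((down + 1) & down) == 0;
  /* No mask bit may sit below the shift amount.  */
  bool no_low_bits = (mask & ((1ULL << shift) - 1)) == 0;
  return contiguous && no_low_bits;
}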
14334 /* Return true if the masks and a shift amount from an RTX of the form
14335 ((x & MASK1) | ((y << SHIFT_AMNT) & MASK2)) are valid to combine into
14336 a BFI instruction of mode MODE. See *aarch64_bfi patterns. */
14338 bool
14339 aarch64_masks_and_shift_for_bfi_p (scalar_int_mode mode,
14340 unsigned HOST_WIDE_INT mask1,
14341 unsigned HOST_WIDE_INT shft_amnt,
14342 unsigned HOST_WIDE_INT mask2)
14344 unsigned HOST_WIDE_INT t;
14346 /* Verify that there is no overlap in what bits are set in the two masks. */
14347 if (mask1 != ~mask2)
14348 return false;
14350 /* Verify that mask2 is not all zeros or ones. */
14351 if (mask2 == 0 || mask2 == HOST_WIDE_INT_M1U)
14352 return false;
14354 /* The shift amount should always be less than the mode size. */
14355 gcc_assert (shft_amnt < GET_MODE_BITSIZE (mode));
14357 /* Verify that the mask being shifted is contiguous and would be in the
14358 least significant bits after shifting by shft_amnt. */
14359 t = mask2 + (HOST_WIDE_INT_1U << shft_amnt);
14360 return (t == (t & -t));
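/* The final check above relies on a small bit trick: adding
   (1 << SHFT_AMNT) to a mask that is a contiguous run of ones starting at
   bit SHFT_AMNT carries all the way up and leaves a single set bit, so the
   sum is a power of two (t == (t & -t)).  A hypothetical standalone
   version, with bfi_mask2_ok as an illustrative name:  */
static bool
bfi_mask2_ok (unsigned long long mask2, unsigned shift)
{
  unsigned long long t = mask2 + (1ULL << shift);
  return t == (t & -t);  /* True iff t is zero or a power of two.  */
}

/* e.g. shift = 8, mask2 = 0xff00 gives t = 0xff00 + 0x100 = 0x10000, a power
   of two, so the pair can form a BFI; shift = 8, mask2 = 0xf0f00 fails.  */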
14363 /* Return true if X is an RTX representing an operation in the ABD family
14364 of instructions. */
14366 static bool
14367 aarch64_abd_rtx_p (rtx x)
14369 if (GET_CODE (x) != MINUS)
14370 return false;
14371 rtx max_arm = XEXP (x, 0);
14372 rtx min_arm = XEXP (x, 1);
14373 if (GET_CODE (max_arm) != SMAX && GET_CODE (max_arm) != UMAX)
14374 return false;
14375 bool signed_p = GET_CODE (max_arm) == SMAX;
14376 if (signed_p && GET_CODE (min_arm) != SMIN)
14377 return false;
14378 else if (!signed_p && GET_CODE (min_arm) != UMIN)
14379 return false;
14381 rtx maxop0 = XEXP (max_arm, 0);
14382 rtx maxop1 = XEXP (max_arm, 1);
14383 rtx minop0 = XEXP (min_arm, 0);
14384 rtx minop1 = XEXP (min_arm, 1);
14385 return rtx_equal_p (maxop0, minop0) && rtx_equal_p (maxop1, minop1);
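/* What the test above recognises, written as a scalar C sketch (the real
   check is on vector RTL); scalar_uabd is an illustrative name only.  */
static unsigned
scalar_uabd (unsigned a, unsigned b)
{
  unsigned mx = a > b ? a : b;  /* UMAX (a, b)  */
  unsigned mn = a > b ? b : a;  /* UMIN (a, b)  */
  return mx - mn;               /* (minus (umax a b) (umin a b)) == |a - b|  */
}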
14388 /* Calculate the cost of calculating X, storing it in *COST. Result
14389 is true if the total cost of the operation has now been calculated. */
14390 static bool
14391 aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED,
14392 int param ATTRIBUTE_UNUSED, int *cost, bool speed)
14394 rtx op0, op1, op2;
14395 const struct cpu_cost_table *extra_cost
14396 = aarch64_tune_params.insn_extra_cost;
14397 rtx_code code = GET_CODE (x);
14398 scalar_int_mode int_mode;
14400 /* By default, assume that everything has equivalent cost to the
14401 cheapest instruction. Any additional costs are applied as a delta
14402 above this default. */
14403 *cost = COSTS_N_INSNS (1);
14405 switch (code)
14407 case SET:
14408 /* The cost depends entirely on the operands to SET. */
14409 *cost = 0;
14410 op0 = SET_DEST (x);
14411 op1 = SET_SRC (x);
14413 switch (GET_CODE (op0))
14415 case MEM:
14416 if (speed)
14418 rtx address = XEXP (op0, 0);
14419 if (VECTOR_MODE_P (mode))
14420 *cost += extra_cost->ldst.storev;
14421 else if (GET_MODE_CLASS (mode) == MODE_INT)
14422 *cost += extra_cost->ldst.store;
14423 else if (mode == SFmode || mode == SDmode)
14424 *cost += extra_cost->ldst.storef;
14425 else if (mode == DFmode || mode == DDmode)
14426 *cost += extra_cost->ldst.stored;
14428 *cost +=
14429 COSTS_N_INSNS (aarch64_address_cost (address, mode,
14430 0, speed));
14433 *cost += rtx_cost (op1, mode, SET, 1, speed);
14434 return true;
14436 case SUBREG:
14437 if (! REG_P (SUBREG_REG (op0)))
14438 *cost += rtx_cost (SUBREG_REG (op0), VOIDmode, SET, 0, speed);
14440 /* Fall through. */
14441 case REG:
14442 /* The cost is one per vector-register copied. */
14443 if (VECTOR_MODE_P (GET_MODE (op0)) && REG_P (op1))
14445 int nregs = aarch64_hard_regno_nregs (V0_REGNUM, GET_MODE (op0));
14446 *cost = COSTS_N_INSNS (nregs);
14448 /* const0_rtx is in general free, but we will use an
14449 instruction to set a register to 0. */
14450 else if (REG_P (op1) || op1 == const0_rtx)
14452 /* The cost is 1 per register copied. */
14453 int nregs = aarch64_hard_regno_nregs (R0_REGNUM, GET_MODE (op0));
14454 *cost = COSTS_N_INSNS (nregs);
14456 else
14457 /* Cost is just the cost of the RHS of the set. */
14458 *cost += rtx_cost (op1, mode, SET, 1, speed);
14459 return true;
14461 case ZERO_EXTRACT:
14462 case SIGN_EXTRACT:
14463 /* Bit-field insertion. Strip any redundant widening of
14464 the RHS to meet the width of the target. */
14465 if (SUBREG_P (op1))
14466 op1 = SUBREG_REG (op1);
14467 if ((GET_CODE (op1) == ZERO_EXTEND
14468 || GET_CODE (op1) == SIGN_EXTEND)
14469 && CONST_INT_P (XEXP (op0, 1))
14470 && is_a <scalar_int_mode> (GET_MODE (XEXP (op1, 0)), &int_mode)
14471 && GET_MODE_BITSIZE (int_mode) >= INTVAL (XEXP (op0, 1)))
14472 op1 = XEXP (op1, 0);
14474 if (CONST_INT_P (op1))
14476 /* MOV immediate is assumed to always be cheap. */
14477 *cost = COSTS_N_INSNS (1);
14479 else
14481 /* BFM. */
14482 if (speed)
14483 *cost += extra_cost->alu.bfi;
14484 *cost += rtx_cost (op1, VOIDmode, code, 1, speed);
14487 return true;
14489 default:
14490 /* We can't make sense of this, assume default cost. */
14491 *cost = COSTS_N_INSNS (1);
14492 return false;
14494 return false;
14496 case CONST_INT:
14497 /* If an instruction can incorporate a constant within the
14498 instruction, the instruction's expression avoids calling
14499 rtx_cost() on the constant. If rtx_cost() is called on a
14500 constant, then it is usually because the constant must be
14501 moved into a register by one or more instructions.
14503 The exception is constant 0, which can be expressed
14504 as XZR/WZR and is therefore free. The exception to this is
14505 if we have (set (reg) (const0_rtx)) in which case we must cost
14506 the move. However, we can catch that when we cost the SET, so
14507 we don't need to consider that here. */
14508 if (x == const0_rtx)
14509 *cost = 0;
14510 else
14512 /* To an approximation, building any other constant is
14513 proportionally expensive to the number of instructions
14514 required to build that constant. This is true whether we
14515 are compiling for SPEED or otherwise. */
14516 machine_mode imode = known_le (GET_MODE_SIZE (mode), 4)
14517 ? SImode : DImode;
14518 *cost = COSTS_N_INSNS (aarch64_internal_mov_immediate
14519 (NULL_RTX, x, false, imode));
14521 return true;
14523 case CONST_DOUBLE:
14525 /* First determine number of instructions to do the move
14526 as an integer constant. */
14527 if (!aarch64_float_const_representable_p (x)
14528 && !aarch64_can_const_movi_rtx_p (x, mode)
14529 && aarch64_float_const_rtx_p (x))
14531 unsigned HOST_WIDE_INT ival;
14532 bool succeed = aarch64_reinterpret_float_as_int (x, &ival);
14533 gcc_assert (succeed);
14535 machine_mode imode = known_eq (GET_MODE_SIZE (mode), 8)
14536 ? DImode : SImode;
14537 int ncost = aarch64_internal_mov_immediate
14538 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
14539 *cost += COSTS_N_INSNS (ncost);
14540 return true;
14543 if (speed)
14545 /* mov[df,sf]_aarch64. */
14546 if (aarch64_float_const_representable_p (x))
14547 /* FMOV (scalar immediate). */
14548 *cost += extra_cost->fp[mode == DFmode || mode == DDmode].fpconst;
14549 else if (!aarch64_float_const_zero_rtx_p (x))
14551 /* This will be a load from memory. */
14552 if (mode == DFmode || mode == DDmode)
14553 *cost += extra_cost->ldst.loadd;
14554 else
14555 *cost += extra_cost->ldst.loadf;
14557 else
14558 /* Otherwise this is +0.0. We get this using MOVI d0, #0
14559 or MOV v0.s[0], wzr - neither of which is modeled by the
14560 cost tables. Just use the default cost. */
14565 return true;
14567 case MEM:
14568 if (speed)
14570 /* For loads we want the base cost of a load, plus an
14571 approximation for the additional cost of the addressing
14572 mode. */
14573 rtx address = XEXP (x, 0);
14574 if (VECTOR_MODE_P (mode))
14575 *cost += extra_cost->ldst.loadv;
14576 else if (GET_MODE_CLASS (mode) == MODE_INT)
14577 *cost += extra_cost->ldst.load;
14578 else if (mode == SFmode || mode == SDmode)
14579 *cost += extra_cost->ldst.loadf;
14580 else if (mode == DFmode || mode == DDmode)
14581 *cost += extra_cost->ldst.loadd;
14583 *cost +=
14584 COSTS_N_INSNS (aarch64_address_cost (address, mode,
14585 0, speed));
14588 return true;
14590 case NEG:
14591 op0 = XEXP (x, 0);
14593 if (VECTOR_MODE_P (mode))
14595 /* Many vector comparison operations are represented as NEG
14596 of a comparison. */
14597 if (COMPARISON_P (op0))
14599 rtx op00 = XEXP (op0, 0);
14600 rtx op01 = XEXP (op0, 1);
14601 machine_mode inner_mode = GET_MODE (op00);
14602 /* FACGE/FACGT. */
14603 if (GET_MODE_CLASS (inner_mode) == MODE_VECTOR_FLOAT
14604 && GET_CODE (op00) == ABS
14605 && GET_CODE (op01) == ABS)
14607 op00 = XEXP (op00, 0);
14608 op01 = XEXP (op01, 0);
14610 *cost += rtx_cost (op00, inner_mode, GET_CODE (op0), 0, speed);
14611 *cost += rtx_cost (op01, inner_mode, GET_CODE (op0), 1, speed);
14612 if (speed)
14613 *cost += extra_cost->vect.alu;
14614 return true;
14616 if (speed)
14618 /* FNEG. */
14619 *cost += extra_cost->vect.alu;
14621 return false;
14624 if (GET_MODE_CLASS (mode) == MODE_INT)
14626 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
14627 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
14629 /* CSETM. */
14630 *cost += rtx_cost (XEXP (op0, 0), VOIDmode, NEG, 0, speed);
14631 return true;
14634 /* Cost this as SUB wzr, X. */
14635 op0 = CONST0_RTX (mode);
14636 op1 = XEXP (x, 0);
14637 goto cost_minus;
14640 if (GET_MODE_CLASS (mode) == MODE_FLOAT)
14642 /* Support (neg(fma...)) as a single instruction only if
14643 sign of zeros is unimportant. This matches the decision
14644 making in aarch64.md. */
14645 if (GET_CODE (op0) == FMA && !HONOR_SIGNED_ZEROS (GET_MODE (op0)))
14647 /* FNMADD. */
14648 *cost = rtx_cost (op0, mode, NEG, 0, speed);
14649 return true;
14651 if (GET_CODE (op0) == MULT)
14653 /* FNMUL. */
14654 *cost = rtx_cost (op0, mode, NEG, 0, speed);
14655 return true;
14657 if (speed)
14658 /* FNEG. */
14659 *cost += extra_cost->fp[mode == DFmode].neg;
14660 return false;
14663 return false;
14665 case CLRSB:
14666 case CLZ:
14667 if (speed)
14669 if (VECTOR_MODE_P (mode))
14670 *cost += extra_cost->vect.alu;
14671 else
14672 *cost += extra_cost->alu.clz;
14675 return false;
14677 case CTZ:
14678 if (VECTOR_MODE_P (mode))
14680 *cost = COSTS_N_INSNS (3);
14681 if (speed)
14682 *cost += extra_cost->vect.alu * 3;
14684 else if (TARGET_CSSC)
14686 *cost = COSTS_N_INSNS (1);
14687 if (speed)
14688 *cost += extra_cost->alu.clz;
14690 else
14692 *cost = COSTS_N_INSNS (2);
14693 if (speed)
14694 *cost += extra_cost->alu.clz + extra_cost->alu.rev;
14696 return false;
14698 case COMPARE:
14699 op0 = XEXP (x, 0);
14700 op1 = XEXP (x, 1);
14702 if (op1 == const0_rtx
14703 && GET_CODE (op0) == AND)
14705 x = op0;
14706 mode = GET_MODE (op0);
14707 goto cost_logic;
14710 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT)
14712 /* TODO: A write to the CC flags possibly costs extra, this
14713 needs encoding in the cost tables. */
14715 mode = GET_MODE (op0);
14716 /* ANDS. */
14717 if (GET_CODE (op0) == AND)
14719 x = op0;
14720 goto cost_logic;
14723 if (GET_CODE (op0) == PLUS)
14725 /* ADDS (and CMN alias). */
14726 x = op0;
14727 goto cost_plus;
14730 if (GET_CODE (op0) == MINUS)
14732 /* SUBS. */
14733 x = op0;
14734 goto cost_minus;
14737 if (GET_CODE (op0) == ZERO_EXTRACT && op1 == const0_rtx
14738 && GET_MODE (x) == CC_NZmode && CONST_INT_P (XEXP (op0, 1))
14739 && CONST_INT_P (XEXP (op0, 2)))
14741 /* COMPARE of ZERO_EXTRACT form of TST-immediate.
14742 Handle it here directly rather than going to cost_logic
14743 since we know the immediate generated for the TST is valid
14744 so we can avoid creating an intermediate rtx for it only
14745 for costing purposes. */
14746 if (speed)
14747 *cost += extra_cost->alu.logical;
14749 *cost += rtx_cost (XEXP (op0, 0), GET_MODE (op0),
14750 ZERO_EXTRACT, 0, speed);
14751 return true;
14754 if (GET_CODE (op1) == NEG)
14756 /* CMN. */
14757 if (speed)
14758 *cost += extra_cost->alu.arith;
14760 *cost += rtx_cost (op0, mode, COMPARE, 0, speed);
14761 *cost += rtx_cost (XEXP (op1, 0), mode, NEG, 1, speed);
14762 return true;
14765 /* CMP.
14767 Compare can freely swap the order of operands, and
14768 canonicalization puts the more complex operation first.
14769 But the integer MINUS logic expects the shift/extend
14770 operation in op1. */
14771 if (! (REG_P (op0)
14772 || (SUBREG_P (op0) && REG_P (SUBREG_REG (op0)))))
14774 op0 = XEXP (x, 1);
14775 op1 = XEXP (x, 0);
14777 goto cost_minus;
14780 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_FLOAT)
14782 /* FCMP. */
14783 if (speed)
14784 *cost += extra_cost->fp[mode == DFmode].compare;
14786 if (CONST_DOUBLE_P (op1) && aarch64_float_const_zero_rtx_p (op1))
14788 *cost += rtx_cost (op0, VOIDmode, COMPARE, 0, speed);
14789 /* FCMP supports constant 0.0 for no extra cost. */
14790 return true;
14792 return false;
14795 if (VECTOR_MODE_P (mode))
14797 /* Vector compare. */
14798 if (speed)
14799 *cost += extra_cost->vect.alu;
14801 if (aarch64_float_const_zero_rtx_p (op1))
14803 /* Vector cm (eq|ge|gt|lt|le) supports constant 0.0 for no extra
14804 cost. */
14805 return true;
14807 return false;
14809 return false;
14811 case MINUS:
14813 op0 = XEXP (x, 0);
14814 op1 = XEXP (x, 1);
14816 cost_minus:
14817 if (VECTOR_MODE_P (mode))
14819 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
14820 if (TARGET_SIMD && (vec_flags & VEC_ADVSIMD))
14822 /* Recognise the SABD and UABD operation here.
14823 Recursion from the PLUS case will catch the accumulating
14824 forms. */
14825 if (aarch64_abd_rtx_p (x))
14827 if (speed)
14828 *cost += extra_cost->vect.alu;
14829 return true;
14831 /* SUBL2 and SUBW2.
14832 The select-operand-high-half versions of the sub instruction
14833 have the same cost as the regular three-register vector version -
14834 don't add the costs of the select into the costs of the sub.
14836 op0 = aarch64_strip_extend_vec_half (op0);
14837 op1 = aarch64_strip_extend_vec_half (op1);
14841 *cost += rtx_cost (op0, mode, MINUS, 0, speed);
14843 /* Detect valid immediates. */
14844 if ((GET_MODE_CLASS (mode) == MODE_INT
14845 || (GET_MODE_CLASS (mode) == MODE_CC
14846 && GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT))
14847 && CONST_INT_P (op1)
14848 && aarch64_uimm12_shift (INTVAL (op1)))
14850 if (speed)
14851 /* SUB(S) (immediate). */
14852 *cost += extra_cost->alu.arith;
14853 return true;
14856 /* Look for SUB (extended register). */
14857 if (is_a <scalar_int_mode> (mode)
14858 && aarch64_rtx_arith_op_extract_p (op1))
14860 if (speed)
14861 *cost += extra_cost->alu.extend_arith;
14863 op1 = aarch64_strip_extend (op1, true);
14864 *cost += rtx_cost (op1, VOIDmode, GET_CODE (op1), 0, speed);
14865 return true;
14868 rtx new_op1 = aarch64_strip_extend (op1, false);
14870 /* Cost this as an FMA-alike operation. */
14871 if ((GET_CODE (new_op1) == MULT
14872 || aarch64_shift_p (GET_CODE (new_op1)))
14873 && code != COMPARE)
14875 *cost += aarch64_rtx_mult_cost (new_op1, MULT, code, speed);
14876 return true;
14879 *cost += rtx_cost (new_op1, VOIDmode, MINUS, 1, speed);
14881 if (speed)
14883 if (VECTOR_MODE_P (mode))
14885 /* Vector SUB. */
14886 *cost += extra_cost->vect.alu;
14888 else if (GET_MODE_CLASS (mode) == MODE_INT)
14890 /* SUB(S). */
14891 *cost += extra_cost->alu.arith;
14893 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
14895 /* FSUB. */
14896 *cost += extra_cost->fp[mode == DFmode].addsub;
14899 return true;
14902 case PLUS:
14904 rtx new_op0;
14906 op0 = XEXP (x, 0);
14907 op1 = XEXP (x, 1);
14909 cost_plus:
14910 if (VECTOR_MODE_P (mode))
14912 /* ADDL2 and ADDW2. */
14913 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
14914 if (TARGET_SIMD && (vec_flags & VEC_ADVSIMD))
14916 /* The select-operand-high-half versions of the add instruction
14917 have the same cost as the regular three-register vector version -
14918 don't add the costs of the select into the costs of the add.
14920 op0 = aarch64_strip_extend_vec_half (op0);
14921 op1 = aarch64_strip_extend_vec_half (op1);
14925 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
14926 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
14928 /* CSINC. */
14929 *cost += rtx_cost (XEXP (op0, 0), mode, PLUS, 0, speed);
14930 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
14931 return true;
14934 if (GET_MODE_CLASS (mode) == MODE_INT
14935 && (aarch64_plus_immediate (op1, mode)
14936 || aarch64_sve_addvl_addpl_immediate (op1, mode)))
14938 *cost += rtx_cost (op0, mode, PLUS, 0, speed);
14940 if (speed)
14942 /* ADD (immediate). */
14943 *cost += extra_cost->alu.arith;
14945 /* Some tunings prefer to not use the VL-based scalar ops.
14946 Increase the cost of the poly immediate to prevent their
14947 formation. */
14948 if (GET_CODE (op1) == CONST_POLY_INT
14949 && (aarch64_tune_params.extra_tuning_flags
14950 & AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS))
14951 *cost += COSTS_N_INSNS (1);
14953 return true;
14956 if (aarch64_pluslong_immediate (op1, mode))
14958 /* 24-bit add in 2 instructions or 12-bit shifted add. */
14959 if ((INTVAL (op1) & 0xfff) != 0)
14960 *cost += COSTS_N_INSNS (1);
14962 *cost += rtx_cost (op0, mode, PLUS, 0, speed);
14963 return true;
14966 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
14968 /* Look for ADD (extended register). */
14969 if (is_a <scalar_int_mode> (mode)
14970 && aarch64_rtx_arith_op_extract_p (op0))
14972 if (speed)
14973 *cost += extra_cost->alu.extend_arith;
14975 op0 = aarch64_strip_extend (op0, true);
14976 *cost += rtx_cost (op0, VOIDmode, GET_CODE (op0), 0, speed);
14977 return true;
14980 /* Strip any extend, leave shifts behind as we will
14981 cost them through mult_cost. */
14982 new_op0 = aarch64_strip_extend (op0, false);
14984 if (GET_CODE (new_op0) == MULT
14985 || aarch64_shift_p (GET_CODE (new_op0)))
14987 *cost += aarch64_rtx_mult_cost (new_op0, MULT, PLUS,
14988 speed);
14989 return true;
14992 *cost += rtx_cost (new_op0, VOIDmode, PLUS, 0, speed);
14994 if (speed)
14996 if (VECTOR_MODE_P (mode))
14998 /* Vector ADD. */
14999 *cost += extra_cost->vect.alu;
15001 else if (GET_MODE_CLASS (mode) == MODE_INT)
15003 /* ADD. */
15004 *cost += extra_cost->alu.arith;
15006 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
15008 /* FADD. */
15009 *cost += extra_cost->fp[mode == DFmode].addsub;
15012 return true;
15015 case BITREVERSE:
15016 case BSWAP:
15017 *cost = COSTS_N_INSNS (1);
15019 if (speed)
15021 if (VECTOR_MODE_P (mode))
15022 *cost += extra_cost->vect.alu;
15023 else
15024 *cost += extra_cost->alu.rev;
15026 return false;
15028 case IOR:
15029 if (aarch_rev16_p (x))
15031 *cost = COSTS_N_INSNS (1);
15033 if (speed)
15035 if (VECTOR_MODE_P (mode))
15036 *cost += extra_cost->vect.alu;
15037 else
15038 *cost += extra_cost->alu.rev;
15040 return true;
15043 if (aarch64_extr_rtx_p (x, &op0, &op1))
15045 *cost += rtx_cost (op0, mode, IOR, 0, speed);
15046 *cost += rtx_cost (op1, mode, IOR, 1, speed);
15047 if (speed)
15048 *cost += extra_cost->alu.shift;
15050 return true;
15052 /* Fall through. */
15053 case XOR:
15054 case AND:
15055 cost_logic:
15056 op0 = XEXP (x, 0);
15057 op1 = XEXP (x, 1);
15059 if (VECTOR_MODE_P (mode))
15061 if (speed)
15062 *cost += extra_cost->vect.alu;
15063 return true;
15066 if (code == AND
15067 && GET_CODE (op0) == MULT
15068 && CONST_INT_P (XEXP (op0, 1))
15069 && CONST_INT_P (op1)
15070 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0, 1))),
15071 INTVAL (op1)) != 0)
15073 /* This is a UBFM/SBFM. */
15074 *cost += rtx_cost (XEXP (op0, 0), mode, ZERO_EXTRACT, 0, speed);
15075 if (speed)
15076 *cost += extra_cost->alu.bfx;
15077 return true;
15080 if (is_int_mode (mode, &int_mode))
15082 if (CONST_INT_P (op1))
15084 /* We have a mask + shift version of a UBFIZ
15085 i.e. the *andim_ashift<mode>_bfiz pattern. */
15086 if (GET_CODE (op0) == ASHIFT
15087 && aarch64_mask_and_shift_for_ubfiz_p (int_mode, op1,
15088 XEXP (op0, 1)))
15090 *cost += rtx_cost (XEXP (op0, 0), int_mode, code, 0, speed);
15091 if (speed)
15092 *cost += extra_cost->alu.bfx;
15094 return true;
15096 else if (aarch64_bitmask_imm (INTVAL (op1), int_mode))
15098 /* We possibly get the immediate for free; this is not
15099 modelled. */
15100 *cost += rtx_cost (op0, int_mode, code, 0, speed);
15101 if (speed)
15102 *cost += extra_cost->alu.logical;
15104 return true;
15107 else
15109 rtx new_op0 = op0;
15111 /* Handle ORN, EON, or BIC. */
15112 if (GET_CODE (op0) == NOT)
15113 op0 = XEXP (op0, 0);
15115 new_op0 = aarch64_strip_shift (op0);
15117 /* If we had a shift on op0 then this is a logical-shift-
15118 by-register/immediate operation. Otherwise, this is just
15119 a logical operation. */
15120 if (speed)
15122 if (new_op0 != op0)
15124 /* Shift by immediate. */
15125 if (CONST_INT_P (XEXP (op0, 1)))
15126 *cost += extra_cost->alu.log_shift;
15127 else
15128 *cost += extra_cost->alu.log_shift_reg;
15130 else
15131 *cost += extra_cost->alu.logical;
15134 /* In both cases we want to cost both operands. */
15135 *cost += rtx_cost (new_op0, int_mode, code, 0, speed);
15136 *cost += rtx_cost (op1, int_mode, code, 1, speed);
15138 return true;
15141 return false;
15143 case NOT:
15144 x = XEXP (x, 0);
15145 op0 = aarch64_strip_shift (x);
15147 if (VECTOR_MODE_P (mode))
15149 /* Vector NOT. */
15150 *cost += extra_cost->vect.alu;
15151 return false;
15154 /* MVN-shifted-reg. */
15155 if (op0 != x)
15157 *cost += rtx_cost (op0, mode, code, 0, speed);
15159 if (speed)
15160 *cost += extra_cost->alu.log_shift;
15162 return true;
15164 /* EON can have two forms: (xor (not a) b) but also (not (xor a b)).
15165 Handle the second form here taking care that 'a' in the above can
15166 be a shift. */
15167 else if (GET_CODE (op0) == XOR)
15169 rtx newop0 = XEXP (op0, 0);
15170 rtx newop1 = XEXP (op0, 1);
15171 rtx op0_stripped = aarch64_strip_shift (newop0);
15173 *cost += rtx_cost (newop1, mode, code, 1, speed);
15174 *cost += rtx_cost (op0_stripped, mode, XOR, 0, speed);
15176 if (speed)
15178 if (op0_stripped != newop0)
15179 *cost += extra_cost->alu.log_shift;
15180 else
15181 *cost += extra_cost->alu.logical;
15184 return true;
15186 /* MVN. */
15187 if (speed)
15188 *cost += extra_cost->alu.logical;
15190 return false;
15192 case ZERO_EXTEND:
15194 op0 = XEXP (x, 0);
15195 /* If a value is written in SI mode, then zero extended to DI
15196 mode, the operation will in general be free as a write to
15197 a 'w' register implicitly zeroes the upper bits of an 'x'
15198 register. However, if this is
15200 (set (reg) (zero_extend (reg)))
15202 we must cost the explicit register move. */
15203 if (mode == DImode
15204 && GET_MODE (op0) == SImode)
15206 int op_cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, 0, speed);
15208 /* If OP_COST is non-zero, then the cost of the zero extend
15209 is effectively the cost of the inner operation. Otherwise
15210 we have a MOV instruction and we take the cost from the MOV
15211 itself. This is true independently of whether we are
15212 optimizing for space or time. */
15213 if (op_cost)
15214 *cost = op_cost;
15216 return true;
15218 else if (MEM_P (op0))
15220 /* All loads can zero extend to any size for free. */
15221 *cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, param, speed);
15222 return true;
15225 op0 = aarch64_extend_bitfield_pattern_p (x);
15226 if (op0)
15228 *cost += rtx_cost (op0, mode, ZERO_EXTEND, 0, speed);
15229 if (speed)
15230 *cost += extra_cost->alu.bfx;
15231 return true;
15234 if (speed)
15236 if (VECTOR_MODE_P (mode))
15238 /* UMOV. */
15239 *cost += extra_cost->vect.alu;
15241 else
15243 /* We generate an AND instead of UXTB/UXTH. */
15244 *cost += extra_cost->alu.logical;
15247 return false;
15249 case SIGN_EXTEND:
15250 if (MEM_P (XEXP (x, 0)))
15252 /* LDRSH. */
15253 if (speed)
15255 rtx address = XEXP (XEXP (x, 0), 0);
15256 *cost += extra_cost->ldst.load_sign_extend;
15258 *cost +=
15259 COSTS_N_INSNS (aarch64_address_cost (address, mode,
15260 0, speed));
15262 return true;
15265 op0 = aarch64_extend_bitfield_pattern_p (x);
15266 if (op0)
15268 *cost += rtx_cost (op0, mode, SIGN_EXTEND, 0, speed);
15269 if (speed)
15270 *cost += extra_cost->alu.bfx;
15271 return true;
15274 if (speed)
15276 if (VECTOR_MODE_P (mode))
15277 *cost += extra_cost->vect.alu;
15278 else
15279 *cost += extra_cost->alu.extend;
15281 return false;
15283 case ROTATE:
15284 case ROTATERT:
15285 case LSHIFTRT:
15286 case ASHIFTRT:
15287 case ASHIFT:
15288 op0 = XEXP (x, 0);
15289 op1 = XEXP (x, 1);
15291 if (CONST_INT_P (op1))
15293 if (speed)
15295 if (VECTOR_MODE_P (mode))
15297 /* Vector shift (immediate). */
15298 *cost += extra_cost->vect.alu;
15300 else
15302 /* LSL (immediate), ASR (immediate), UBFM, UBFIZ and friends.
15303 These are all aliases. */
15304 *cost += extra_cost->alu.shift;
15308 /* We can incorporate zero/sign extend for free. */
15309 if (GET_CODE (op0) == ZERO_EXTEND
15310 || GET_CODE (op0) == SIGN_EXTEND)
15311 op0 = XEXP (op0, 0);
15313 *cost += rtx_cost (op0, VOIDmode, ASHIFT, 0, speed);
15314 return true;
15316 else
15318 if (VECTOR_MODE_P (mode))
15320 if (speed)
15321 /* Vector shift (register). */
15322 *cost += extra_cost->vect.alu;
15324 else
15326 if (speed)
15327 /* LSLV, ASRV. */
15328 *cost += extra_cost->alu.shift_reg;
15330 /* The register shift amount may be in a shorter mode expressed
15331 as a lowpart SUBREG. For costing purposes just look inside. */
15332 if (SUBREG_P (op1) && subreg_lowpart_p (op1))
15333 op1 = SUBREG_REG (op1);
15334 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
15335 && CONST_INT_P (XEXP (op1, 1))
15336 && known_eq (INTVAL (XEXP (op1, 1)),
15337 GET_MODE_BITSIZE (mode) - 1))
15339 *cost += rtx_cost (op0, mode, code, 0, speed);
15340 /* We already demanded XEXP (op1, 0) to be REG_P, so
15341 don't recurse into it. */
15342 return true;
15345 return false; /* All arguments need to be in registers. */
15348 case SYMBOL_REF:
15350 if (aarch64_cmodel == AARCH64_CMODEL_LARGE
15351 || aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC)
15353 /* LDR. */
15354 if (speed)
15355 *cost += extra_cost->ldst.load;
15357 else if (aarch64_cmodel == AARCH64_CMODEL_SMALL
15358 || aarch64_cmodel == AARCH64_CMODEL_SMALL_PIC)
15360 /* ADRP, followed by ADD. */
15361 *cost += COSTS_N_INSNS (1);
15362 if (speed)
15363 *cost += 2 * extra_cost->alu.arith;
15365 else if (aarch64_cmodel == AARCH64_CMODEL_TINY
15366 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
15368 /* ADR. */
15369 if (speed)
15370 *cost += extra_cost->alu.arith;
15373 if (flag_pic)
15375 /* One extra load instruction, after accessing the GOT. */
15376 *cost += COSTS_N_INSNS (1);
15377 if (speed)
15378 *cost += extra_cost->ldst.load;
15380 return true;
15382 case HIGH:
15383 case LO_SUM:
15384 /* ADRP/ADD (immediate). */
15385 if (speed)
15386 *cost += extra_cost->alu.arith;
15387 return true;
15389 case ZERO_EXTRACT:
15390 case SIGN_EXTRACT:
15391 /* UBFX/SBFX. */
15392 if (speed)
15394 if (VECTOR_MODE_P (mode))
15395 *cost += extra_cost->vect.alu;
15396 else
15397 *cost += extra_cost->alu.bfx;
15400 /* We can trust that the immediates used will be correct (there
15401 are no by-register forms), so we need only cost op0. */
15402 *cost += rtx_cost (XEXP (x, 0), VOIDmode, code, 0, speed);
15403 return true;
15405 case MULT:
15406 *cost += aarch64_rtx_mult_cost (x, MULT, 0, speed);
15407 /* aarch64_rtx_mult_cost always handles recursion to its
15408 operands. */
15409 return true;
15411 case MOD:
15412 /* We can expand signed mod by power of 2 using a NEGS, two parallel
15413 ANDs and a CSNEG. Assume here that a CSNEG has the same cost as
15414 an unconditional negate. This case should only ever be reached through
15415 the set_smod_pow2_cheap check in expmed.cc. */
15416 if (CONST_INT_P (XEXP (x, 1))
15417 && exact_log2 (INTVAL (XEXP (x, 1))) > 0
15418 && (mode == SImode || mode == DImode))
15420 /* We expand to 4 instructions. Reset the baseline. */
15421 *cost = COSTS_N_INSNS (4);
15423 if (speed)
15424 *cost += 2 * extra_cost->alu.logical
15425 + 2 * extra_cost->alu.arith;
15427 return true;
15430 /* Fall-through. */
15431 case UMOD:
15432 if (speed)
15434 /* Slightly prefer UMOD over SMOD. */
15435 if (VECTOR_MODE_P (mode))
15436 *cost += extra_cost->vect.alu;
15437 else if (GET_MODE_CLASS (mode) == MODE_INT)
15438 *cost += (extra_cost->mult[mode == DImode].add
15439 + extra_cost->mult[mode == DImode].idiv
15440 + (code == MOD ? 1 : 0));
15442 return false; /* All arguments need to be in registers. */
15444 case DIV:
15445 case UDIV:
15446 case SQRT:
15447 if (speed)
15449 if (VECTOR_MODE_P (mode))
15450 *cost += extra_cost->vect.alu;
15451 else if (GET_MODE_CLASS (mode) == MODE_INT)
15452 /* There is no integer SQRT, so only DIV and UDIV can get
15453 here. */
15454 *cost += (extra_cost->mult[mode == DImode].idiv
15455 /* Slightly prefer UDIV over SDIV. */
15456 + (code == DIV ? 1 : 0));
15457 else
15458 *cost += extra_cost->fp[mode == DFmode].div;
15460 return false; /* All arguments need to be in registers. */
15462 case IF_THEN_ELSE:
15463 return aarch64_if_then_else_costs (XEXP (x, 0), XEXP (x, 1),
15464 XEXP (x, 2), cost, speed);
15466 case EQ:
15467 case NE:
15468 case GT:
15469 case GTU:
15470 case LT:
15471 case LTU:
15472 case GE:
15473 case GEU:
15474 case LE:
15475 case LEU:
15477 return false; /* All arguments must be in registers. */
15479 case FMA:
15480 op0 = XEXP (x, 0);
15481 op1 = XEXP (x, 1);
15482 op2 = XEXP (x, 2);
15484 if (speed)
15486 if (VECTOR_MODE_P (mode))
15487 *cost += extra_cost->vect.alu;
15488 else
15489 *cost += extra_cost->fp[mode == DFmode].fma;
15492 /* FMSUB, FNMADD, and FNMSUB are free. */
15493 if (GET_CODE (op0) == NEG)
15494 op0 = XEXP (op0, 0);
15496 if (GET_CODE (op2) == NEG)
15497 op2 = XEXP (op2, 0);
15499 /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
15500 and the by-element operand as operand 0. */
15501 if (GET_CODE (op1) == NEG)
15502 op1 = XEXP (op1, 0);
15504 /* Catch vector-by-element operations. The by-element operand can
15505 either be (vec_duplicate (vec_select (x))) or just
15506 (vec_select (x)), depending on whether we are multiplying by
15507 a vector or a scalar.
15509 Canonicalization is not very good in these cases, FMA4 will put the
15510 by-element operand as operand 0, FNMA4 will have it as operand 1. */
15511 if (GET_CODE (op0) == VEC_DUPLICATE)
15512 op0 = XEXP (op0, 0);
15513 else if (GET_CODE (op1) == VEC_DUPLICATE)
15514 op1 = XEXP (op1, 0);
15516 if (GET_CODE (op0) == VEC_SELECT)
15517 op0 = XEXP (op0, 0);
15518 else if (GET_CODE (op1) == VEC_SELECT)
15519 op1 = XEXP (op1, 0);
15521 /* If the remaining parameters are not registers,
15522 get the cost to put them into registers. */
15523 *cost += rtx_cost (op0, mode, FMA, 0, speed);
15524 *cost += rtx_cost (op1, mode, FMA, 1, speed);
15525 *cost += rtx_cost (op2, mode, FMA, 2, speed);
15526 return true;
15528 case FLOAT:
15529 case UNSIGNED_FLOAT:
15530 if (speed)
15531 *cost += extra_cost->fp[mode == DFmode].fromint;
15532 return false;
15534 case FLOAT_EXTEND:
15535 if (speed)
15537 if (VECTOR_MODE_P (mode))
15539 /* Vector widening conversion. */
15540 *cost += extra_cost->vect.alu;
15542 else
15543 *cost += extra_cost->fp[mode == DFmode].widen;
15545 return false;
15547 case FLOAT_TRUNCATE:
15548 if (speed)
15550 if (VECTOR_MODE_P (mode))
15552 /* Vector narrowing conversion. */
15553 *cost += extra_cost->vect.alu;
15555 else
15556 *cost += extra_cost->fp[mode == DFmode].narrow;
15558 return false;
15560 case FIX:
15561 case UNSIGNED_FIX:
15562 x = XEXP (x, 0);
15563 /* Strip the rounding part. They will all be implemented
15564 by the fcvt* family of instructions anyway. */
15565 if (GET_CODE (x) == UNSPEC)
15567 unsigned int uns_code = XINT (x, 1);
15569 if (uns_code == UNSPEC_FRINTA
15570 || uns_code == UNSPEC_FRINTM
15571 || uns_code == UNSPEC_FRINTN
15572 || uns_code == UNSPEC_FRINTP
15573 || uns_code == UNSPEC_FRINTZ)
15574 x = XVECEXP (x, 0, 0);
15577 if (speed)
15579 if (VECTOR_MODE_P (mode))
15580 *cost += extra_cost->vect.alu;
15581 else
15582 *cost += extra_cost->fp[GET_MODE (x) == DFmode].toint;
15585 /* We can combine fmul by a power of 2 followed by a fcvt into a single
15586 fixed-point fcvt. */
15587 if (GET_CODE (x) == MULT
15588 && ((VECTOR_MODE_P (mode)
15589 && aarch64_vec_fpconst_pow_of_2 (XEXP (x, 1)) > 0)
15590 || aarch64_fpconst_pow_of_2 (XEXP (x, 1)) > 0))
15592 *cost += rtx_cost (XEXP (x, 0), VOIDmode, code, 0, speed);
15593 return true;
15596 *cost += rtx_cost (x, VOIDmode, code, 0, speed);
15597 return true;
15599 case ABS:
15600 if (VECTOR_MODE_P (mode))
15602 /* ABS (vector). */
15603 if (speed)
15604 *cost += extra_cost->vect.alu;
15606 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
15608 op0 = XEXP (x, 0);
15610 /* FABD, which is analogous to FADD. */
15611 if (GET_CODE (op0) == MINUS)
15613 *cost += rtx_cost (XEXP (op0, 0), mode, MINUS, 0, speed);
15614 *cost += rtx_cost (XEXP (op0, 1), mode, MINUS, 1, speed);
15615 if (speed)
15616 *cost += extra_cost->fp[mode == DFmode].addsub;
15618 return true;
15620 /* Simple FABS is analogous to FNEG. */
15621 if (speed)
15622 *cost += extra_cost->fp[mode == DFmode].neg;
15624 else
15626 /* Integer ABS will either be split to
15627 two arithmetic instructions, or will be an ABS
15628 (scalar), which we don't model. */
15629 *cost = COSTS_N_INSNS (2);
15630 if (speed)
15631 *cost += 2 * extra_cost->alu.arith;
15633 return false;
15635 case SMAX:
15636 case SMIN:
15637 if (speed)
15639 if (VECTOR_MODE_P (mode))
15640 *cost += extra_cost->vect.alu;
15641 else
15643 /* FMAXNM/FMINNM/FMAX/FMIN.
15644 TODO: This may not be accurate for all implementations, but
15645 we do not model this in the cost tables. */
15646 *cost += extra_cost->fp[mode == DFmode].addsub;
15649 return false;
15651 case UNSPEC:
15652 /* The floating point round to integer frint* instructions. */
15653 if (aarch64_frint_unspec_p (XINT (x, 1)))
15655 if (speed)
15656 *cost += extra_cost->fp[mode == DFmode].roundint;
15658 return false;
15660 break;
15662 case TRUNCATE:
15664 /* Decompose <su>muldi3_highpart. */
15665 if (/* (truncate:DI */
15666 mode == DImode
15667 /* (lshiftrt:TI */
15668 && GET_MODE (XEXP (x, 0)) == TImode
15669 && GET_CODE (XEXP (x, 0)) == LSHIFTRT
15670 /* (mult:TI */
15671 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
15672 /* (ANY_EXTEND:TI (reg:DI))
15673 (ANY_EXTEND:TI (reg:DI))) */
15674 && ((GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND
15675 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == ZERO_EXTEND)
15676 || (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND
15677 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == SIGN_EXTEND))
15678 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0)) == DImode
15679 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0)) == DImode
15680 /* (const_int 64) */
15681 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
15682 && UINTVAL (XEXP (XEXP (x, 0), 1)) == 64)
15684 /* UMULH/SMULH. */
15685 if (speed)
15686 *cost += extra_cost->mult[mode == DImode].extend;
15687 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0),
15688 mode, MULT, 0, speed);
15689 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0),
15690 mode, MULT, 1, speed);
15691 return true;
15693 break;
15694 case CONST_VECTOR:
15696 /* Load using MOVI/MVNI. */
15697 if (aarch64_simd_valid_mov_imm (x))
15698 *cost = extra_cost->vect.movi;
15699 else /* Load using constant pool. */
15700 *cost = extra_cost->ldst.load;
15701 break;
15703 case VEC_CONCAT:
15704 /* Depending on the operation, either DUP or INS.
15705 For now, keep default costing. */
15706 break;
15707 case VEC_DUPLICATE:
15708 /* Load using a DUP. */
15709 *cost = extra_cost->vect.dup;
15710 return false;
15711 case VEC_SELECT:
15713 rtx op0 = XEXP (x, 0);
15714 *cost = rtx_cost (op0, GET_MODE (op0), VEC_SELECT, 0, speed);
15716 /* Cost subreg of 0 as free, otherwise as DUP. */
15717 rtx op1 = XEXP (x, 1);
15718 if (vec_series_lowpart_p (mode, GET_MODE (op1), op1))
15720 else if (vec_series_highpart_p (mode, GET_MODE (op1), op1))
15721 *cost = extra_cost->vect.dup;
15722 else
15723 *cost = extra_cost->vect.extract;
15724 return true;
15726 default:
15727 break;
15730 if (dump_file
15731 && flag_aarch64_verbose_cost)
15732 fprintf (dump_file,
15733 "\nFailed to cost RTX. Assuming default cost.\n");
15735 return true;
15738 /* Wrapper around aarch64_rtx_costs, dumps the partial, or total cost
15739 calculated for X. This cost is stored in *COST. Returns true
15740 if the total cost of X was calculated. */
15741 static bool
15742 aarch64_rtx_costs_wrapper (rtx x, machine_mode mode, int outer,
15743 int param, int *cost, bool speed)
15745 bool result = aarch64_rtx_costs (x, mode, outer, param, cost, speed);
15747 if (dump_file
15748 && flag_aarch64_verbose_cost)
15750 print_rtl_single (dump_file, x);
15751 fprintf (dump_file, "\n%s cost: %d (%s)\n",
15752 speed ? "Hot" : "Cold",
15753 *cost, result ? "final" : "partial");
15756 return result;
15759 static int
15760 aarch64_register_move_cost (machine_mode mode,
15761 reg_class_t from_i, reg_class_t to_i)
15763 enum reg_class from = (enum reg_class) from_i;
15764 enum reg_class to = (enum reg_class) to_i;
15765 const struct cpu_regmove_cost *regmove_cost
15766 = aarch64_tune_params.regmove_cost;
15768 /* Treat any subset of POINTER_REGS as though it were GENERAL_REGS. */
15769 if (reg_class_subset_p (to, POINTER_REGS))
15770 to = GENERAL_REGS;
15772 if (reg_class_subset_p (from, POINTER_REGS))
15773 from = GENERAL_REGS;
15775 /* Make RDFFR very expensive. In particular, if we know that the FFR
15776 contains a PTRUE (e.g. after a SETFFR), we must never use RDFFR
15777 as a way of obtaining a PTRUE. */
15778 if (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL
15779 && hard_reg_set_subset_p (reg_class_contents[from_i],
15780 reg_class_contents[FFR_REGS]))
15781 return 80;
15783 /* Moves to/from sysregs are expensive, and must go via GPR. */
15784 if (from == MOVEABLE_SYSREGS)
15785 return 80 + aarch64_register_move_cost (mode, GENERAL_REGS, to);
15786 if (to == MOVEABLE_SYSREGS)
15787 return 80 + aarch64_register_move_cost (mode, from, GENERAL_REGS);
15789 /* Moving between GPR and stack cost is the same as GP2GP. */
15790 if ((from == GENERAL_REGS && to == STACK_REG)
15791 || (to == GENERAL_REGS && from == STACK_REG))
15792 return regmove_cost->GP2GP;
15794 /* To/From the stack register, we move via the gprs. */
15795 if (to == STACK_REG || from == STACK_REG)
15796 return aarch64_register_move_cost (mode, from, GENERAL_REGS)
15797 + aarch64_register_move_cost (mode, GENERAL_REGS, to);
15799 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
15800 if (vec_flags != (VEC_ADVSIMD | VEC_STRUCT | VEC_PARTIAL)
15801 && known_eq (GET_MODE_SIZE (mode), 16))
15803 /* 128-bit operations on general registers require 2 instructions. */
15804 if (from == GENERAL_REGS && to == GENERAL_REGS)
15805 return regmove_cost->GP2GP * 2;
15806 else if (from == GENERAL_REGS)
15807 return regmove_cost->GP2FP * 2;
15808 else if (to == GENERAL_REGS)
15809 return regmove_cost->FP2GP * 2;
15811 /* When AdvSIMD instructions are disabled it is not possible to move
15812 a 128-bit value directly between Q registers. This is handled in
15813 secondary reload. A general register is used as a scratch to move
15814 the upper DI value and the lower DI value is moved directly,
15815 hence the cost is the sum of three moves. */
15816 if (!TARGET_SIMD && !TARGET_SVE)
15817 return regmove_cost->GP2FP + regmove_cost->FP2GP + regmove_cost->FP2FP;
15819 return regmove_cost->FP2FP;
15822 if (from == GENERAL_REGS && to == GENERAL_REGS)
15823 return regmove_cost->GP2GP;
15824 else if (from == GENERAL_REGS)
15825 return regmove_cost->GP2FP;
15826 else if (to == GENERAL_REGS)
15827 return regmove_cost->FP2GP;
15829 if (!TARGET_SIMD && vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
15831 /* Needs a round-trip through memory, which can use LDP/STP for pairs.
15832 The cost must be greater than 2 units to indicate that direct
15833 moves aren't possible. */
15834 auto per_vector = (aarch64_tune_params.memmov_cost.load_fp
15835 + aarch64_tune_params.memmov_cost.store_fp);
15836 return MIN (CEIL (per_vector, 2), 4);
15839 return regmove_cost->FP2FP;
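/* A worked instance of the clamp above, using hypothetical tuning numbers
   rather than values from any real cost table; q_struct_move_cost_demo is
   an illustrative name.  */
static int
q_struct_move_cost_demo (int load_fp, int store_fp)
{
  int per_vector = load_fp + store_fp;
  int ceil_half = (per_vector + 1) / 2;  /* CEIL (per_vector, 2)  */
  return ceil_half < 4 ? ceil_half : 4;  /* MIN (..., 4)  */
}

/* e.g. load_fp = 5 and store_fp = 3 give per_vector = 8 and a cost of 4:
   above the 2-unit threshold that signals direct moves are unavailable,
   without being punitive.  */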
15842 /* Implements TARGET_MEMORY_MOVE_COST. */
15843 static int
15844 aarch64_memory_move_cost (machine_mode mode, reg_class_t rclass_i, bool in)
15846 enum reg_class rclass = (enum reg_class) rclass_i;
15847 if (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL
15848 ? reg_classes_intersect_p (rclass, PR_REGS)
15849 : reg_class_subset_p (rclass, PR_REGS))
15850 return (in
15851 ? aarch64_tune_params.memmov_cost.load_pred
15852 : aarch64_tune_params.memmov_cost.store_pred);
15854 if (VECTOR_MODE_P (mode) || FLOAT_MODE_P (mode)
15855 ? reg_classes_intersect_p (rclass, FP_REGS)
15856 : reg_class_subset_p (rclass, FP_REGS))
15857 return (in
15858 ? aarch64_tune_params.memmov_cost.load_fp
15859 : aarch64_tune_params.memmov_cost.store_fp);
15861 /* If the move needs to go through GPRs, add the cost of doing that. */
15862 int base = 0;
15863 if (rclass_i == MOVEABLE_SYSREGS)
15864 base += (in
15865 ? aarch64_register_move_cost (DImode, GENERAL_REGS, rclass_i)
15866 : aarch64_register_move_cost (DImode, rclass_i, GENERAL_REGS));
15868 return (in
15869 ? base + aarch64_tune_params.memmov_cost.load_int
15870 : base + aarch64_tune_params.memmov_cost.store_int);
15873 /* Implement TARGET_INSN_COST. We have the opportunity to do something
15874 much more productive here, such as using insn attributes to cost things.
15875 But we don't, not yet.
15877 The main point of this current definition is to make calling insn_cost
15878 on one instruction equivalent to calling seq_cost on a sequence that
15879 contains only that instruction. The default definition would instead
15880 only look at SET_SRCs, ignoring SET_DESTs.
15882 This ensures that, for example, storing a 128-bit zero vector is more
15883 expensive than storing a 128-bit vector register. A move of zero
15884 into a 128-bit vector register followed by multiple stores of that
15885 register is then cheaper than multiple stores of zero (which would
15886 use STP of XZR). This in turn allows STP Qs to be formed. */
15887 static int
15888 aarch64_insn_cost (rtx_insn *insn, bool speed)
15890 if (rtx set = single_set (insn))
15891 return set_rtx_cost (set, speed);
15892 return pattern_cost (PATTERN (insn), speed);
15895 /* Implement TARGET_INIT_BUILTINS. */
15896 static void
15897 aarch64_init_builtins ()
15899 aarch64_general_init_builtins ();
15900 aarch64_sve::init_builtins ();
15901 #ifdef SUBTARGET_INIT_BUILTINS
15902 SUBTARGET_INIT_BUILTINS;
15903 #endif
15906 /* Implement TARGET_FOLD_BUILTIN. */
15907 static tree
15908 aarch64_fold_builtin (tree fndecl, int nargs, tree *args, bool)
15910 unsigned int code = DECL_MD_FUNCTION_CODE (fndecl);
15911 unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
15912 tree type = TREE_TYPE (TREE_TYPE (fndecl));
15913 switch (code & AARCH64_BUILTIN_CLASS)
15915 case AARCH64_BUILTIN_GENERAL:
15916 return aarch64_general_fold_builtin (subcode, type, nargs, args);
15918 case AARCH64_BUILTIN_SVE:
15919 return NULL_TREE;
15921 gcc_unreachable ();
15924 /* Implement TARGET_GIMPLE_FOLD_BUILTIN. */
15925 static bool
15926 aarch64_gimple_fold_builtin (gimple_stmt_iterator *gsi)
15928 gcall *stmt = as_a <gcall *> (gsi_stmt (*gsi));
15929 tree fndecl = gimple_call_fndecl (stmt);
15930 unsigned int code = DECL_MD_FUNCTION_CODE (fndecl);
15931 unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
15932 gimple *new_stmt = NULL;
15933 switch (code & AARCH64_BUILTIN_CLASS)
15935 case AARCH64_BUILTIN_GENERAL:
15936 new_stmt = aarch64_general_gimple_fold_builtin (subcode, stmt, gsi);
15937 break;
15939 case AARCH64_BUILTIN_SVE:
15940 new_stmt = aarch64_sve::gimple_fold_builtin (subcode, gsi, stmt);
15941 break;
15944 if (!new_stmt)
15945 return false;
15947 gsi_replace (gsi, new_stmt, false);
15948 return true;
15951 /* Implement TARGET_EXPAND_BUILTIN. */
15952 static rtx
15953 aarch64_expand_builtin (tree exp, rtx target, rtx, machine_mode, int ignore)
15955 tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
15956 unsigned int code = DECL_MD_FUNCTION_CODE (fndecl);
15957 unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
15958 switch (code & AARCH64_BUILTIN_CLASS)
15960 case AARCH64_BUILTIN_GENERAL:
15961 return aarch64_general_expand_builtin (subcode, exp, target, ignore);
15963 case AARCH64_BUILTIN_SVE:
15964 return aarch64_sve::expand_builtin (subcode, exp, target);
15966 gcc_unreachable ();
15969 /* Implement TARGET_BUILTIN_DECL. */
15970 static tree
15971 aarch64_builtin_decl (unsigned int code, bool initialize_p)
15973 unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
15974 switch (code & AARCH64_BUILTIN_CLASS)
15976 case AARCH64_BUILTIN_GENERAL:
15977 return aarch64_general_builtin_decl (subcode, initialize_p);
15979 case AARCH64_BUILTIN_SVE:
15980 return aarch64_sve::builtin_decl (subcode, initialize_p);
15982 gcc_unreachable ();
15985 /* Return true if it is safe and beneficial to use the approximate rsqrt optabs
15986 to optimize 1.0/sqrt. */
15988 static bool
15989 use_rsqrt_p (machine_mode mode)
15991 return (!flag_trapping_math
15992 && flag_unsafe_math_optimizations
15993 && ((aarch64_tune_params.approx_modes->recip_sqrt
15994 & AARCH64_APPROX_MODE (mode))
15995 || flag_mrecip_low_precision_sqrt));
15998 /* Function to decide when to use the approximate reciprocal square root
15999 builtin. */
16001 static tree
16002 aarch64_builtin_reciprocal (tree fndecl)
16004 machine_mode mode = TYPE_MODE (TREE_TYPE (fndecl));
16006 if (!use_rsqrt_p (mode))
16007 return NULL_TREE;
16008 unsigned int code = DECL_MD_FUNCTION_CODE (fndecl);
16009 unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
16010 switch (code & AARCH64_BUILTIN_CLASS)
16012 case AARCH64_BUILTIN_GENERAL:
16013 return aarch64_general_builtin_rsqrt (subcode);
16015 case AARCH64_BUILTIN_SVE:
16016 return NULL_TREE;
16018 gcc_unreachable ();
16021 /* Emit code to perform the floating-point operation:
16023 DST = SRC1 * SRC2
16025 where all three operands are already known to be registers.
16026 If the operation is an SVE one, PTRUE is a suitable all-true
16027 predicate. */
16029 static void
16030 aarch64_emit_mult (rtx dst, rtx ptrue, rtx src1, rtx src2)
16032 if (ptrue)
16033 emit_insn (gen_aarch64_pred (UNSPEC_COND_FMUL, GET_MODE (dst),
16034 dst, ptrue, src1, src2,
16035 gen_int_mode (SVE_RELAXED_GP, SImode)));
16036 else
16037 emit_set_insn (dst, gen_rtx_MULT (GET_MODE (dst), src1, src2));
16040 /* Emit instruction sequence to compute either the approximate square root
16041 or its approximate reciprocal, depending on the flag RECP, and return
16042 whether the sequence was emitted or not. */
16044 bool
16045 aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp)
16047 machine_mode mode = GET_MODE (dst);
16049 if (GET_MODE_INNER (mode) == HFmode)
16051 gcc_assert (!recp);
16052 return false;
16055 if (!recp)
16057 if (!(flag_mlow_precision_sqrt
16058 || (aarch64_tune_params.approx_modes->sqrt
16059 & AARCH64_APPROX_MODE (mode))))
16060 return false;
16062 if (!flag_finite_math_only
16063 || flag_trapping_math
16064 || !flag_unsafe_math_optimizations
16065 || optimize_function_for_size_p (cfun))
16066 return false;
16068 else
16069 /* Caller assumes we cannot fail. */
16070 gcc_assert (use_rsqrt_p (mode));
16072 rtx pg = NULL_RTX;
16073 if (aarch64_sve_mode_p (mode))
16074 pg = aarch64_ptrue_reg (aarch64_sve_pred_mode (mode));
16075 machine_mode mmsk = (VECTOR_MODE_P (mode)
16076 ? related_int_vector_mode (mode).require ()
16077 : int_mode_for_mode (mode).require ());
16078 rtx xmsk = NULL_RTX;
16079 if (!recp)
16081 /* When calculating the approximate square root, compare the
16082 argument with 0.0 and create a mask. */
16083 rtx zero = CONST0_RTX (mode);
16084 if (pg)
16086 xmsk = gen_reg_rtx (GET_MODE (pg));
16087 rtx hint = gen_int_mode (SVE_KNOWN_PTRUE, SImode);
16088 emit_insn (gen_aarch64_pred_fcm (UNSPEC_COND_FCMNE, mode,
16089 xmsk, pg, hint, src, zero));
16091 else
16093 xmsk = gen_reg_rtx (mmsk);
16094 emit_insn (gen_rtx_SET (xmsk,
16095 gen_rtx_NEG (mmsk,
16096 gen_rtx_EQ (mmsk, src, zero))));
16100 /* Estimate the approximate reciprocal square root. */
16101 rtx xdst = gen_reg_rtx (mode);
16102 emit_insn (gen_aarch64_rsqrte (mode, xdst, src));
16104 /* Iterate over the series twice for SF and thrice for DF. */
16105 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
16107 /* Optionally do one fewer iteration of the series, trading a little
16108 accuracy for faster performance. */
16109 if ((recp && flag_mrecip_low_precision_sqrt)
16110 || (!recp && flag_mlow_precision_sqrt))
16111 iterations--;
16113 /* Iterate over the series to calculate the approximate reciprocal square
16114 root. */
16115 rtx x1 = gen_reg_rtx (mode);
16116 while (iterations--)
16118 rtx x2 = gen_reg_rtx (mode);
16119 aarch64_emit_mult (x2, pg, xdst, xdst);
16121 emit_insn (gen_aarch64_rsqrts (mode, x1, src, x2));
16123 if (iterations > 0)
16124 aarch64_emit_mult (xdst, pg, xdst, x1);
16127 if (!recp)
16129 if (pg)
16130 /* Multiply nonzero source values by the corresponding intermediate
16131 result elements, so that the final calculation is the approximate
16132 square root rather than its reciprocal. Select a zero result for
16133 zero source values, to avoid the Inf * 0 -> NaN that we'd get
16134 otherwise. */
16135 emit_insn (gen_cond (UNSPEC_COND_FMUL, mode,
16136 xdst, xmsk, xdst, src, CONST0_RTX (mode)));
16137 else
16139 /* Qualify the approximate reciprocal square root when the
16140 argument is 0.0 by squashing the intermediary result to 0.0. */
16141 rtx xtmp = gen_reg_rtx (mmsk);
16142 emit_set_insn (xtmp, gen_rtx_AND (mmsk, gen_rtx_NOT (mmsk, xmsk),
16143 gen_rtx_SUBREG (mmsk, xdst, 0)));
16144 emit_move_insn (xdst, gen_rtx_SUBREG (mode, xtmp, 0));
16146 /* Calculate the approximate square root. */
16147 aarch64_emit_mult (xdst, pg, xdst, src);
16151 /* Finalize the approximation. */
16152 aarch64_emit_mult (dst, pg, xdst, x1);
16154 return true;
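/* A scalar sketch of one refinement step of the loop above, assuming the
   architectural definition of FRSQRTS: frsqrts (a, b) = (3 - a * b) / 2.
   The real code emits (optionally predicated) RTL rather than doing
   arithmetic; rsqrt_newton_step is an illustrative name only.  */
static double
rsqrt_newton_step (double src, double x)
{
  double x2 = x * x;                     /* aarch64_emit_mult (x2, pg, xdst, xdst)  */
  double step = (3.0 - src * x2) / 2.0;  /* gen_aarch64_rsqrts (mode, x1, src, x2)  */
  return x * step;                       /* aarch64_emit_mult (xdst, pg, xdst, x1)  */
}

/* Two such steps are used for single precision and three for double,
   matching the iteration counts chosen above.  */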
16157 /* Emit the instruction sequence to compute the approximation for the division
16158 of NUM by DEN in QUO and return whether the sequence was emitted or not. */
16160 bool
16161 aarch64_emit_approx_div (rtx quo, rtx num, rtx den)
16163 machine_mode mode = GET_MODE (quo);
16165 if (GET_MODE_INNER (mode) == HFmode)
16166 return false;
16168 bool use_approx_division_p = (flag_mlow_precision_div
16169 || (aarch64_tune_params.approx_modes->division
16170 & AARCH64_APPROX_MODE (mode)));
16172 if (!flag_finite_math_only
16173 || flag_trapping_math
16174 || !flag_unsafe_math_optimizations
16175 || optimize_function_for_size_p (cfun)
16176 || !use_approx_division_p)
16177 return false;
16179 if (!TARGET_SIMD && VECTOR_MODE_P (mode))
16180 return false;
16182 rtx pg = NULL_RTX;
16183 if (aarch64_sve_mode_p (mode))
16184 pg = aarch64_ptrue_reg (aarch64_sve_pred_mode (mode));
16186 /* Estimate the approximate reciprocal. */
16187 rtx xrcp = gen_reg_rtx (mode);
16188 emit_insn (gen_aarch64_frecpe (mode, xrcp, den));
16190 /* Iterate over the series twice for SF and thrice for DF. */
16191 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
16193 /* Optionally do fewer iterations of the series, trading accuracy for
16194 faster performance. The default is 2 for DF and 1 for SF. */
16195 if (flag_mlow_precision_div)
16196 iterations = (GET_MODE_INNER (mode) == DFmode
16197 ? aarch64_double_recp_precision
16198 : aarch64_float_recp_precision);
16200 /* Iterate over the series to calculate the approximate reciprocal. */
16201 rtx xtmp = gen_reg_rtx (mode);
16202 while (iterations--)
16204 emit_insn (gen_aarch64_frecps (mode, xtmp, xrcp, den));
16206 if (iterations > 0)
16207 aarch64_emit_mult (xrcp, pg, xrcp, xtmp);
16210 if (num != CONST1_RTX (mode))
16212 /* As the approximate reciprocal of DEN is already calculated, only
16213 calculate the approximate division when NUM is not 1.0. */
16214 rtx xnum = force_reg (mode, num);
16215 aarch64_emit_mult (xrcp, pg, xrcp, xnum);
16218 /* Finalize the approximation. */
16219 aarch64_emit_mult (quo, pg, xrcp, xtmp);
16220 return true;
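/* The analogous scalar sketch for the reciprocal refinement loop above,
   assuming the architectural definition of FRECPS: frecps (a, b) = 2 - a * b.
   recip_newton_step is an illustrative name only.  */
static double
recip_newton_step (double den, double x)
{
  double step = 2.0 - den * x;  /* gen_aarch64_frecps (mode, xtmp, xrcp, den)  */
  return x * step;              /* aarch64_emit_mult (xrcp, pg, xrcp, xtmp)  */
}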
16223 /* Emit an optimized sequence to perform a vector rotate
16224 of REG by the vector constant amount AMNT_VEC and place the result
16225 in DST. Return true iff successful. */
16227 bool
16228 aarch64_emit_opt_vec_rotate (rtx dst, rtx reg, rtx amnt_vec)
16230 rtx amnt = unwrap_const_vec_duplicate (amnt_vec);
16231 gcc_assert (CONST_INT_P (amnt));
16232 HOST_WIDE_INT rotamnt = UINTVAL (amnt);
16233 machine_mode mode = GET_MODE (reg);
16234 /* Don't end up here after reload. */
16235 gcc_assert (can_create_pseudo_p ());
16236 /* Rotates by half the element width map down to REV* instructions and should
16237 always be preferred when possible. */
16238 if (rotamnt == GET_MODE_UNIT_BITSIZE (mode) / 2
16239 && expand_rotate_as_vec_perm (mode, dst, reg, amnt))
16240 return true;
16241 /* 64 and 128-bit vector modes can use the XAR instruction
16242 when available. */
16243 else if ((TARGET_SHA3 && mode == V2DImode)
16244 || (TARGET_SVE2
16245 && (known_eq (GET_MODE_SIZE (mode), 8)
16246 || known_eq (GET_MODE_SIZE (mode), 16))))
16248 rtx zeroes = aarch64_gen_shareable_zero (mode);
16249 rtx xar_op
16250 = gen_rtx_ROTATE (mode, gen_rtx_XOR (mode, reg, zeroes),
16251 amnt_vec);
16252 emit_set_insn (dst, xar_op);
16253 return true;
16255 /* If none of the above, try to expand rotates by any byte amount as
16256 permutes. */
16257 else if (expand_rotate_as_vec_perm (mode, dst, reg, amnt))
16258 return true;
16259 return false;
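/* Illustrative, standalone sketch (not part of this file): the two scalar
   identities the function above relies on.  XAR rotates the XOR of its
   inputs, so XAR with a zero operand degenerates to a plain rotate; and
   rotating a lane by half its width is the same as swapping its two halves,
   which is what the REV* instructions do per element.  */

#include <cstdint>
#include <cstdio>

static uint16_t
rot16 (uint16_t x, unsigned amt)
{
  amt &= 15;
  return (uint16_t) ((x >> amt) | (x << (16 - amt)));
}

int
main ()
{
  uint16_t lane = 0x1234;
  /* XAR-style rotate with a zero second operand is just a rotate.  */
  std::printf ("%#x\n", (unsigned) rot16 (lane ^ 0, 8));
  /* Rotating by half the element width swaps the two bytes (cf. REV16).  */
  std::printf ("%#x\n", (unsigned) (uint16_t) ((lane >> 8) | (lane << 8)));
  return 0;
}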
16262 /* Return the number of instructions that can be issued per cycle. */
16263 static int
16264 aarch64_sched_issue_rate (void)
16266 return aarch64_tune_params.issue_rate;
16269 /* Implement TARGET_SCHED_VARIABLE_ISSUE. */
16270 static int
16271 aarch64_sched_variable_issue (FILE *, int, rtx_insn *insn, int more)
16273 if (DEBUG_INSN_P (insn))
16274 return more;
16276 rtx_code code = GET_CODE (PATTERN (insn));
16277 if (code == USE || code == CLOBBER)
16278 return more;
16280 if (get_attr_type (insn) == TYPE_NO_INSN)
16281 return more;
16283 return more - 1;
16286 static int
16287 aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
16289 int issue_rate = aarch64_sched_issue_rate ();
16291 return issue_rate > 1 && !sched_fusion ? issue_rate : 0;
16295 /* Implement TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD as
16296 autopref_multipass_dfa_lookahead_guard from haifa-sched.cc. It only
16297 has an effect if PARAM_SCHED_AUTOPREF_QUEUE_DEPTH > 0. */
16299 static int
16300 aarch64_first_cycle_multipass_dfa_lookahead_guard (rtx_insn *insn,
16301 int ready_index)
16303 return autopref_multipass_dfa_lookahead_guard (insn, ready_index);
16307 /* Vectorizer cost model target hooks. */
16309 /* If a vld1 from address ADDR should be recorded in vector_load_decls,
16310 return the decl that should be recorded. Return null otherwise. */
16311 tree
16312 aarch64_vector_load_decl (tree addr)
16314 if (TREE_CODE (addr) != ADDR_EXPR)
16315 return NULL_TREE;
16316 tree base = get_base_address (TREE_OPERAND (addr, 0));
16317 if (TREE_CODE (base) != VAR_DECL)
16318 return NULL_TREE;
16319 return base;
16322 /* Return true if STMT_INFO accesses a decl that is known to be the
16323 argument to a vld1 in the same function. */
16324 static bool
16325 aarch64_accesses_vector_load_decl_p (stmt_vec_info stmt_info)
16327 if (!cfun->machine->vector_load_decls)
16328 return false;
16329 auto dr = STMT_VINFO_DATA_REF (stmt_info);
16330 if (!dr)
16331 return false;
16332 tree decl = aarch64_vector_load_decl (DR_BASE_ADDRESS (dr));
16333 return decl && cfun->machine->vector_load_decls->contains (decl);
16336 /* Information about how the CPU would issue the scalar, Advanced SIMD
16337 or SVE version of a vector loop, using the scheme defined by the
16338 aarch64_base_vec_issue_info hierarchy of structures. */
16339 class aarch64_vec_op_count
16341 public:
16342 aarch64_vec_op_count () = default;
16343 aarch64_vec_op_count (const aarch64_vec_issue_info *, unsigned int,
16344 unsigned int = 1);
16346 unsigned int vec_flags () const { return m_vec_flags; }
16347 unsigned int vf_factor () const { return m_vf_factor; }
16349 const aarch64_base_vec_issue_info *base_issue_info () const;
16350 const aarch64_simd_vec_issue_info *simd_issue_info () const;
16351 const aarch64_sve_vec_issue_info *sve_issue_info () const;
16353 fractional_cost rename_cycles_per_iter () const;
16354 fractional_cost min_nonpred_cycles_per_iter () const;
16355 fractional_cost min_pred_cycles_per_iter () const;
16356 fractional_cost min_cycles_per_iter () const;
16358 void dump () const;
16360 /* The number of individual "general" operations. See the comments
16361 in aarch64_base_vec_issue_info for details. */
16362 unsigned int general_ops = 0;
16364 /* The number of load and store operations, under the same scheme
16365 as above. */
16366 unsigned int loads = 0;
16367 unsigned int stores = 0;
16369 /* The minimum number of cycles needed to execute all loop-carried
16370 operations, which in the vector code become associated with
16371 reductions. */
16372 unsigned int reduction_latency = 0;
16374 /* The number of individual predicate operations. See the comments
16375 in aarch64_sve_vec_issue_info for details. */
16376 unsigned int pred_ops = 0;
16378 private:
16379 /* The issue information for the core. */
16380 const aarch64_vec_issue_info *m_issue_info = nullptr;
16382 /* - If M_VEC_FLAGS is zero then this structure describes scalar code.
16383 - If M_VEC_FLAGS & VEC_ADVSIMD is nonzero then this structure describes
16384 Advanced SIMD code.
16385 - If M_VEC_FLAGS & VEC_ANY_SVE is nonzero then this structure describes
16386 SVE code. */
16387 unsigned int m_vec_flags = 0;
16389 /* Assume that, when the code is executing on the core described
16390 by M_ISSUE_INFO, one iteration of the loop will handle M_VF_FACTOR
16391 times more data than the vectorizer anticipates.
16393 This is only ever different from 1 for SVE. It allows us to consider
16394 what would happen on a 256-bit SVE target even when the -mtune
16395 parameters say that the “likely” SVE length is 128 bits. */
16396 unsigned int m_vf_factor = 1;
16399 aarch64_vec_op_count::
16400 aarch64_vec_op_count (const aarch64_vec_issue_info *issue_info,
16401 unsigned int vec_flags, unsigned int vf_factor)
16402 : m_issue_info (issue_info),
16403 m_vec_flags (vec_flags),
16404 m_vf_factor (vf_factor)
16408 /* Return the base issue information (i.e. the parts that make sense
16409 for both scalar and vector code). Return null if we have no issue
16410 information. */
16411 const aarch64_base_vec_issue_info *
16412 aarch64_vec_op_count::base_issue_info () const
16414 if (auto *ret = simd_issue_info ())
16415 return ret;
16416 return m_issue_info->scalar;
16419 /* If the structure describes vector code and we have associated issue
16420 information, return that issue information, otherwise return null. */
16421 const aarch64_simd_vec_issue_info *
16422 aarch64_vec_op_count::simd_issue_info () const
16424 if (auto *ret = sve_issue_info ())
16425 return ret;
16426 if (m_vec_flags)
16427 return m_issue_info->advsimd;
16428 return nullptr;
16431 /* If the structure describes SVE code and we have associated issue
16432 information, return that issue information, otherwise return null. */
16433 const aarch64_sve_vec_issue_info *
16434 aarch64_vec_op_count::sve_issue_info () const
16436 if (m_vec_flags & VEC_ANY_SVE)
16437 return m_issue_info->sve;
16438 return nullptr;
16441 /* Estimate the minimum number of cycles per iteration needed to rename
16442 the instructions.
16444 ??? For now this is done inline rather than via cost tables, since it
16445 isn't clear how it should be parameterized for the general case. */
16446 fractional_cost
16447 aarch64_vec_op_count::rename_cycles_per_iter () const
16449 if (sve_issue_info () == &neoverse512tvb_sve_issue_info
16450 || sve_issue_info () == &neoversen2_sve_issue_info
16451 || sve_issue_info () == &neoversev2_sve_issue_info)
16452 /* + 1 for an addition. We've already counted a general op for each
16453 store, so we don't need to account for stores separately. The branch
16454 reads no registers and so does not need to be counted either.
16456 ??? This value is very much on the pessimistic side, but seems to work
16457 pretty well in practice. */
16458 return { general_ops + loads + pred_ops + 1, 5 };
16460 return 0;
16463 /* Like min_cycles_per_iter, but excluding predicate operations. */
16464 fractional_cost
16465 aarch64_vec_op_count::min_nonpred_cycles_per_iter () const
16467 auto *issue_info = base_issue_info ();
16469 fractional_cost cycles = MAX (reduction_latency, 1);
16470 cycles = std::max (cycles, { stores, issue_info->stores_per_cycle });
16471 cycles = std::max (cycles, { loads + stores,
16472 issue_info->loads_stores_per_cycle });
16473 cycles = std::max (cycles, { general_ops,
16474 issue_info->general_ops_per_cycle });
16475 cycles = std::max (cycles, rename_cycles_per_iter ());
16476 return cycles;
16479 /* Like min_cycles_per_iter, but including only the predicate operations. */
16480 fractional_cost
16481 aarch64_vec_op_count::min_pred_cycles_per_iter () const
16483 if (auto *issue_info = sve_issue_info ())
16484 return { pred_ops, issue_info->pred_ops_per_cycle };
16485 return 0;
16488 /* Estimate the minimum number of cycles needed to issue the operations.
16489 This is a very simplistic model! */
16490 fractional_cost
16491 aarch64_vec_op_count::min_cycles_per_iter () const
16493 return std::max (min_nonpred_cycles_per_iter (),
16494 min_pred_cycles_per_iter ());
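/* Illustrative, standalone sketch (not part of this file): the throughput
   bound computed by the min_*_cycles_per_iter functions above, using plain
   doubles instead of fractional_cost.  The op counts and issue rates below
   are hypothetical, not any real core's tuning values; the real code also
   folds in the rename and predicate bounds.  */

#include <algorithm>
#include <cstdio>

int
main ()
{
  /* Hypothetical op counts for one loop iteration.  */
  double loads = 4, stores = 2, general_ops = 6, reduction_latency = 2;
  /* Hypothetical per-cycle issue rates.  */
  double stores_per_cycle = 1, loads_stores_per_cycle = 2,
	 general_ops_per_cycle = 3;

  double cycles = std::max ({ reduction_latency,
			      stores / stores_per_cycle,
			      (loads + stores) / loads_stores_per_cycle,
			      general_ops / general_ops_per_cycle });
  std::printf ("min cycles per iteration ~ %.2f\n", cycles);
  return 0;
}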
16497 /* Dump information about the structure. */
16498 void
16499 aarch64_vec_op_count::dump () const
16501 dump_printf_loc (MSG_NOTE, vect_location,
16502 " load operations = %d\n", loads);
16503 dump_printf_loc (MSG_NOTE, vect_location,
16504 " store operations = %d\n", stores);
16505 dump_printf_loc (MSG_NOTE, vect_location,
16506 " general operations = %d\n", general_ops);
16507 if (sve_issue_info ())
16508 dump_printf_loc (MSG_NOTE, vect_location,
16509 " predicate operations = %d\n", pred_ops);
16510 dump_printf_loc (MSG_NOTE, vect_location,
16511 " reduction latency = %d\n", reduction_latency);
16512 if (auto rcpi = rename_cycles_per_iter ())
16513 dump_printf_loc (MSG_NOTE, vect_location,
16514 " estimated cycles per iteration to rename = %f\n",
16515 rcpi.as_double ());
16516 if (auto pred_cpi = min_pred_cycles_per_iter ())
16518 dump_printf_loc (MSG_NOTE, vect_location,
16519 " estimated min cycles per iteration"
16520 " without predication = %f\n",
16521 min_nonpred_cycles_per_iter ().as_double ());
16522 dump_printf_loc (MSG_NOTE, vect_location,
16523 " estimated min cycles per iteration"
16524 " for predication = %f\n", pred_cpi.as_double ());
16526 if (auto cpi = min_cycles_per_iter ())
16527 dump_printf_loc (MSG_NOTE, vect_location,
16528 " estimated min cycles per iteration = %f\n",
16529 cpi.as_double ());
16532 /* Information about vector code that we're in the process of costing. */
16533 class aarch64_vector_costs : public vector_costs
16535 public:
16536 aarch64_vector_costs (vec_info *, bool);
16538 unsigned int add_stmt_cost (int count, vect_cost_for_stmt kind,
16539 stmt_vec_info stmt_info, slp_tree, tree vectype,
16540 int misalign,
16541 vect_cost_model_location where) override;
16542 void finish_cost (const vector_costs *) override;
16543 bool better_main_loop_than_p (const vector_costs *other) const override;
16545 private:
16546 void record_potential_advsimd_unrolling (loop_vec_info);
16547 void analyze_loop_vinfo (loop_vec_info);
16548 void count_ops (unsigned int, vect_cost_for_stmt, stmt_vec_info, slp_tree,
16549 aarch64_vec_op_count *);
16550 fractional_cost adjust_body_cost_sve (const aarch64_vec_op_count *,
16551 fractional_cost, unsigned int,
16552 unsigned int *, bool *);
16553 unsigned int adjust_body_cost (loop_vec_info, const aarch64_vector_costs *,
16554 unsigned int);
16555 bool prefer_unrolled_loop () const;
16556 unsigned int determine_suggested_unroll_factor ();
16558 /* True if we have performed one-time initialization based on the
16559 vec_info. */
16560 bool m_analyzed_vinfo = false;
16562 /* This loop uses an average operation that is not supported by SVE, but is
16563 supported by Advanced SIMD and SVE2. */
16564 bool m_has_avg = false;
16566 /* Additional initialization costs for using gather or scatter operations in
16567 the current loop. */
16568 unsigned int m_sve_gather_scatter_init_cost = 0;
16570 /* True if the vector body contains a store to a decl and if the
16571 function is known to have a vld1 from the same decl.
16573 In the Advanced SIMD ACLE, the recommended endian-agnostic way of
16574 initializing a vector is:
16576 float f[4] = { elts };
16577 float32x4_t x = vld1q_f32(f);
16579 We should strongly prefer vectorization of the initialization of f,
16580 so that the store to f and the load back can be optimized away,
16581 leaving a vectorization of { elts }. */
16582 bool m_stores_to_vector_load_decl = false;
16584 /* Non-zero if the last operation we costed is a vector promotion or demotion.
16585 In this case the value is the number of insns in the last operation.
16587 On AArch64 vector promotions and demotions require us to first widen or
16588 narrow the input and only after that emit conversion instructions. For
16589 costing this means we need to include the cost of the final conversions as
16590 well. */
16591 unsigned int m_num_last_promote_demote = 0;
16593 /* - If M_VEC_FLAGS is zero then we're costing the original scalar code.
16594 - If M_VEC_FLAGS & VEC_ADVSIMD is nonzero then we're costing Advanced
16595 SIMD code.
16596 - If M_VEC_FLAGS & VEC_ANY_SVE is nonzero then we're costing SVE code. */
16597 unsigned int m_vec_flags = 0;
16599 /* At the moment, we do not model LDP and STP in the vector and scalar costs.
16600 This means that code such as:
16602 a[0] = x;
16603 a[1] = x;
16605 will be costed as two scalar instructions and two vector instructions
16606 (a scalar_to_vec and an unaligned_store). For SLP, the vector form
16607 wins if the costs are equal, because of the fact that the vector costs
16608 include constant initializations whereas the scalar costs don't.
16609 We would therefore tend to vectorize the code above, even though
16610 the scalar version can use a single STP.
16612 We should eventually fix this and model LDP and STP in the main costs;
16613 see the comment in aarch64_sve_adjust_stmt_cost for some of the problems.
16614 Until then, we look specifically for code that does nothing more than
16615 STP-like operations. We cost them on that basis in addition to the
16616 normal latency-based costs.
16618 If the scalar or vector code could be a sequence of STPs +
16619 initialization, this variable counts the cost of the sequence,
16620 with 2 units per instruction. The variable is ~0U for other
16621 kinds of code. */
16622 unsigned int m_stp_sequence_cost = 0;
16624 /* On some CPUs, SVE and Advanced SIMD provide the same theoretical vector
16625 throughput, such as 4x128 Advanced SIMD vs. 2x256 SVE. In those
16626 situations, we try to predict whether an Advanced SIMD implementation
16627 of the loop could be completely unrolled and become straight-line code.
16628 If so, it is generally better to use the Advanced SIMD version rather
16629 than length-agnostic SVE, since the SVE loop would execute an unknown
16630 number of times and so could not be completely unrolled in the same way.
16632 If we're applying this heuristic, M_UNROLLED_ADVSIMD_NITERS is the
16633 number of Advanced SIMD loop iterations that would be unrolled and
16634 M_UNROLLED_ADVSIMD_STMTS estimates the total number of statements
16635 in the unrolled loop. Both values are zero if we're not applying
16636 the heuristic. */
16637 unsigned HOST_WIDE_INT m_unrolled_advsimd_niters = 0;
16638 unsigned HOST_WIDE_INT m_unrolled_advsimd_stmts = 0;
16640 /* If we're vectorizing a loop that executes a constant number of times,
16641 this variable gives the number of times that the vector loop would
16642 iterate, otherwise it is zero. */
16643 uint64_t m_num_vector_iterations = 0;
16645 /* Used only when vectorizing loops. Estimates the number and kind of
16646 operations that would be needed by one iteration of the scalar
16647 or vector loop. There is one entry for each tuning option of
16648 interest. */
16649 auto_vec<aarch64_vec_op_count, 2> m_ops;
16652 aarch64_vector_costs::aarch64_vector_costs (vec_info *vinfo,
16653 bool costing_for_scalar)
16654 : vector_costs (vinfo, costing_for_scalar),
16655 m_vec_flags (costing_for_scalar ? 0
16656 : aarch64_classify_vector_mode (vinfo->vector_mode))
16658 if (auto *issue_info = aarch64_tune_params.vec_costs->issue_info)
16660 m_ops.quick_push ({ issue_info, m_vec_flags });
16661 if (aarch64_tune_params.vec_costs == &neoverse512tvb_vector_cost)
16663 unsigned int vf_factor = (m_vec_flags & VEC_ANY_SVE) ? 2 : 1;
16664 m_ops.quick_push ({ &neoversev1_vec_issue_info, m_vec_flags,
16665 vf_factor });
16670 /* Implement TARGET_VECTORIZE_CREATE_COSTS. */
16671 vector_costs *
16672 aarch64_vectorize_create_costs (vec_info *vinfo, bool costing_for_scalar)
16674 return new aarch64_vector_costs (vinfo, costing_for_scalar);
16677 /* Return the appropriate SIMD costs for vectors of type VECTYPE. */
16678 static const simd_vec_cost *
16679 aarch64_simd_vec_costs (tree vectype)
16681 const cpu_vector_cost *costs = aarch64_tune_params.vec_costs;
16682 if (vectype != NULL
16683 && aarch64_sve_mode_p (TYPE_MODE (vectype))
16684 && costs->sve != NULL)
16685 return costs->sve;
16686 return costs->advsimd;
16689 /* Return the appropriate SIMD costs for vectors with VEC_* flags FLAGS. */
16690 static const simd_vec_cost *
16691 aarch64_simd_vec_costs_for_flags (unsigned int flags)
16693 const cpu_vector_cost *costs = aarch64_tune_params.vec_costs;
16694 if ((flags & VEC_ANY_SVE) && costs->sve)
16695 return costs->sve;
16696 return costs->advsimd;
16699 /* If STMT_INFO is a memory reference, return the scalar memory type,
16700 otherwise return null. */
16701 static tree
16702 aarch64_dr_type (stmt_vec_info stmt_info)
16704 if (auto dr = STMT_VINFO_DATA_REF (stmt_info))
16705 return TREE_TYPE (DR_REF (dr));
16706 return NULL_TREE;
16709 /* Decide whether to use the unrolling heuristic described above
16710 m_unrolled_advsimd_niters, updating that field if so. LOOP_VINFO
16711 describes the loop that we're vectorizing. */
16712 void
16713 aarch64_vector_costs::
16714 record_potential_advsimd_unrolling (loop_vec_info loop_vinfo)
16716 /* The heuristic only makes sense on targets that have the same
16717 vector throughput for SVE and Advanced SIMD. */
16718 if (!(aarch64_tune_params.extra_tuning_flags
16719 & AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT))
16720 return;
16722 /* We only want to apply the heuristic if LOOP_VINFO is being
16723 vectorized for SVE. */
16724 if (!(m_vec_flags & VEC_ANY_SVE))
16725 return;
16727 /* Check whether it is possible in principle to use Advanced SIMD
16728 instead. */
16729 if (aarch64_autovec_preference == AARCH64_AUTOVEC_SVE_ONLY)
16730 return;
16732 /* We don't want to apply the heuristic to outer loops, since it's
16733 harder to track two levels of unrolling. */
16734 if (LOOP_VINFO_LOOP (loop_vinfo)->inner)
16735 return;
16737 /* Only handle cases in which the number of Advanced SIMD iterations
16738 would be known at compile time but the number of SVE iterations
16739 would not. */
16740 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
16741 || aarch64_sve_vg.is_constant ())
16742 return;
16744 /* Guess how many times the Advanced SIMD loop would iterate and make
16745 sure that it is within the complete unrolling limit. Even if the
16746 number of iterations is small enough, the number of statements might
16747 not be, which is why we need to estimate the number of statements too. */
16748 unsigned int estimated_vq = aarch64_estimated_sve_vq ();
16749 unsigned int advsimd_vf = CEIL (vect_vf_for_cost (loop_vinfo), estimated_vq);
16750 unsigned HOST_WIDE_INT unrolled_advsimd_niters
16751 = LOOP_VINFO_INT_NITERS (loop_vinfo) / advsimd_vf;
16752 if (unrolled_advsimd_niters > (unsigned int) param_max_completely_peel_times)
16753 return;
16755 /* Record that we're applying the heuristic and should try to estimate
16756 the number of statements in the Advanced SIMD loop. */
16757 m_unrolled_advsimd_niters = unrolled_advsimd_niters;
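/* Illustrative, standalone sketch (not part of this file): the arithmetic
   behind the heuristic above, with hypothetical numbers - an SVE loop whose
   estimated VF is 8 elements on a target tuned for 256-bit vectors
   (estimated_vq == 2), a known trip count of 32, and a stand-in for
   param_max_completely_peel_times.  */

#include <cstdio>

int
main ()
{
  unsigned sve_vf_for_cost = 8;	/* estimated elements per SVE iteration */
  unsigned estimated_vq = 2;	/* 128-bit quadwords per SVE vector */
  unsigned niters = 32;		/* known scalar iteration count */
  unsigned peel_limit = 16;	/* hypothetical peeling limit */

  unsigned advsimd_vf = (sve_vf_for_cost + estimated_vq - 1) / estimated_vq;
  unsigned unrolled_advsimd_niters = niters / advsimd_vf;
  std::printf ("Advanced SIMD would need %u iterations: %s\n",
	       unrolled_advsimd_niters,
	       unrolled_advsimd_niters <= peel_limit
	       ? "apply the heuristic" : "too many to unroll");
  return 0;
}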
16760 /* Do one-time initialization of the aarch64_vector_costs given that we're
16761 costing the loop vectorization described by LOOP_VINFO. */
16762 void
16763 aarch64_vector_costs::analyze_loop_vinfo (loop_vec_info loop_vinfo)
16765 /* Record the number of times that the vector loop would execute,
16766 if known. */
16767 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
16768 auto scalar_niters = max_stmt_executions_int (loop);
16769 if (scalar_niters >= 0)
16771 unsigned int vf = vect_vf_for_cost (loop_vinfo);
16772 if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
16773 m_num_vector_iterations = scalar_niters / vf;
16774 else
16775 m_num_vector_iterations = CEIL (scalar_niters, vf);
16778 /* Detect whether we're vectorizing for SVE and should apply the unrolling
16779 heuristic described above m_unrolled_advsimd_niters. */
16780 record_potential_advsimd_unrolling (loop_vinfo);
16783 /* Implement targetm.vectorize.builtin_vectorization_cost. */
16784 static int
16785 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
16786 tree vectype,
16787 int misalign ATTRIBUTE_UNUSED)
16789 unsigned elements;
16790 const cpu_vector_cost *costs = aarch64_tune_params.vec_costs;
16791 bool fp = false;
16793 if (vectype != NULL)
16794 fp = FLOAT_TYPE_P (vectype);
16796 const simd_vec_cost *simd_costs = aarch64_simd_vec_costs (vectype);
16798 switch (type_of_cost)
16800 case scalar_stmt:
16801 return fp ? costs->scalar_fp_stmt_cost : costs->scalar_int_stmt_cost;
16803 case scalar_load:
16804 return costs->scalar_load_cost;
16806 case scalar_store:
16807 return costs->scalar_store_cost;
16809 case vector_stmt:
16810 return fp ? simd_costs->fp_stmt_cost
16811 : simd_costs->int_stmt_cost;
16813 case vector_load:
16814 return simd_costs->align_load_cost;
16816 case vector_store:
16817 return simd_costs->store_cost;
16819 case vec_to_scalar:
16820 return simd_costs->vec_to_scalar_cost;
16822 case scalar_to_vec:
16823 return simd_costs->scalar_to_vec_cost;
16825 case unaligned_load:
16826 case vector_gather_load:
16827 return simd_costs->unalign_load_cost;
16829 case unaligned_store:
16830 case vector_scatter_store:
16831 return simd_costs->unalign_store_cost;
16833 case cond_branch_taken:
16834 return costs->cond_taken_branch_cost;
16836 case cond_branch_not_taken:
16837 return costs->cond_not_taken_branch_cost;
16839 case vec_perm:
16840 return simd_costs->permute_cost;
16842 case vec_promote_demote:
16843 return fp ? simd_costs->fp_stmt_cost
16844 : simd_costs->int_stmt_cost;
16846 case vec_construct:
16847 elements = estimated_poly_value (TYPE_VECTOR_SUBPARTS (vectype));
16848 return elements / 2 + 1;
16850 default:
16851 gcc_unreachable ();
16855 /* Check whether an access of kind KIND for STMT_INFO (or NODE if SLP)
16856 represents one vector of an LD[234] or ST[234] operation. Return the total
16857 number of vectors (2, 3 or 4) if so, otherwise return a value outside that
16858 range. */
16859 static int
16860 aarch64_ld234_st234_vectors (vect_cost_for_stmt kind, stmt_vec_info stmt_info,
16861 slp_tree node)
16863 if ((kind == vector_load
16864 || kind == unaligned_load
16865 || kind == vector_store
16866 || kind == unaligned_store)
16867 && STMT_VINFO_DATA_REF (stmt_info))
16869 stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
16870 if (stmt_info
16871 && vect_mem_access_type (stmt_info, node) == VMAT_LOAD_STORE_LANES)
16872 return DR_GROUP_SIZE (stmt_info);
16874 return 0;
16877 /* Return true if creating multiple copies of STMT_INFO for Advanced SIMD
16878 vectors would produce a series of LDP or STP operations. KIND is the
16879 kind of statement that STMT_INFO represents. */
16880 static bool
16881 aarch64_advsimd_ldp_stp_p (enum vect_cost_for_stmt kind,
16882 stmt_vec_info stmt_info)
16884 switch (kind)
16886 case vector_load:
16887 case vector_store:
16888 case unaligned_load:
16889 case unaligned_store:
16890 break;
16892 default:
16893 return false;
16896 return is_gimple_assign (stmt_info->stmt);
16899 /* Return true if STMT_INFO is the second part of a two-statement multiply-add
16900 or multiply-subtract sequence that might be suitable for fusing into a
16901 single instruction. If VEC_FLAGS is zero, analyze the operation as
16902 a scalar one, otherwise analyze it as an operation on vectors with those
16903 VEC_* flags. */
16904 static bool
16905 aarch64_multiply_add_p (vec_info *vinfo, stmt_vec_info stmt_info,
16906 unsigned int vec_flags)
16908 gassign *assign = dyn_cast<gassign *> (stmt_info->stmt);
16909 if (!assign)
16910 return false;
16911 tree_code code = gimple_assign_rhs_code (assign);
16912 if (code != PLUS_EXPR && code != MINUS_EXPR)
16913 return false;
16915 auto is_mul_result = [&](int i)
16917 tree rhs = gimple_op (assign, i);
16918 /* ??? Should we try to check for a single use as well? */
16919 if (TREE_CODE (rhs) != SSA_NAME)
16920 return false;
16922 stmt_vec_info def_stmt_info = vinfo->lookup_def (rhs);
16923 if (!def_stmt_info
16924 || STMT_VINFO_DEF_TYPE (def_stmt_info) != vect_internal_def)
16925 return false;
16926 gassign *rhs_assign = dyn_cast<gassign *> (def_stmt_info->stmt);
16927 if (!rhs_assign || gimple_assign_rhs_code (rhs_assign) != MULT_EXPR)
16928 return false;
16930 if (vec_flags & VEC_ADVSIMD)
16932 /* Scalar and SVE code can tie the result to any FMLA input (or none,
16933 although that requires a MOVPRFX for SVE). However, Advanced SIMD
16934 only supports MLA forms, so will require a move if the result
16935 cannot be tied to the accumulator. The most important case in
16936 which this is true is when the accumulator input is invariant. */
16937 rhs = gimple_op (assign, 3 - i);
16938 if (TREE_CODE (rhs) != SSA_NAME)
16939 return false;
16940 def_stmt_info = vinfo->lookup_def (rhs);
16941 if (!def_stmt_info
16942 || STMT_VINFO_DEF_TYPE (def_stmt_info) == vect_external_def
16943 || STMT_VINFO_DEF_TYPE (def_stmt_info) == vect_constant_def)
16944 return false;
16947 return true;
16950 if (code == MINUS_EXPR && (vec_flags & VEC_ADVSIMD))
16951 /* Advanced SIMD doesn't have FNMADD/FNMSUB/FNMLA/FNMLS, so the
16952 multiplication must be on the second operand (to form an FMLS).
16953 But if both operands are multiplications and the second operand
16954 is used more than once, we'll instead negate the second operand
16955 and use it as an accumulator for the first operand. */
16956 return (is_mul_result (2)
16957 && (has_single_use (gimple_assign_rhs2 (assign))
16958 || !is_mul_result (1)));
16960 return is_mul_result (1) || is_mul_result (2);
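/* Illustrative, standalone sketch (not part of this file): the kind of
   two-statement source pattern the predicate above looks for.  The multiply
   feeding the add (or the second operand of the subtract) is expected to
   fuse into a single FMLA/FMLS (or MLA/MLS), which is why the costing later
   treats the pair as one instruction.  */

#include <cstddef>

void
fmla_pattern (float *a, const float *b, const float *c, size_t n)
{
  for (size_t i = 0; i < n; ++i)
    {
      float t = b[i] * c[i];	/* MULT_EXPR feeding ...  */
      a[i] = a[i] + t;		/* ... a PLUS_EXPR: candidate for FMLA.  */
    }
}

void
fmls_pattern (float *a, const float *b, const float *c, size_t n)
{
  for (size_t i = 0; i < n; ++i)
    /* Multiplication on the second operand of the MINUS_EXPR: FMLS.  */
    a[i] = a[i] - b[i] * c[i];
}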
16963 /* Return true if STMT_INFO is the second part of a two-statement boolean AND
16964 expression sequence that might be suitable for fusing into a
16965 single instruction. If VEC_FLAGS is zero, analyze the operation as
16966 a scalar one, otherwise analyze it as an operation on vectors with those
16967 VEC_* flags. */
16969 static bool
16970 aarch64_bool_compound_p (vec_info *vinfo, stmt_vec_info stmt_info,
16971 unsigned int vec_flags)
16973 gassign *assign = dyn_cast<gassign *> (stmt_info->stmt);
16974 if (!assign
16975 || gimple_assign_rhs_code (assign) != BIT_AND_EXPR
16976 || !STMT_VINFO_VECTYPE (stmt_info)
16977 || !VECTOR_BOOLEAN_TYPE_P (STMT_VINFO_VECTYPE (stmt_info)))
16978 return false;
16980 for (int i = 1; i < 3; ++i)
16982 tree rhs = gimple_op (assign, i);
16984 if (TREE_CODE (rhs) != SSA_NAME)
16985 continue;
16987 stmt_vec_info def_stmt_info = vinfo->lookup_def (rhs);
16988 if (!def_stmt_info
16989 || STMT_VINFO_DEF_TYPE (def_stmt_info) != vect_internal_def)
16990 continue;
16992 gassign *rhs_assign = dyn_cast<gassign *> (def_stmt_info->stmt);
16993 if (!rhs_assign
16994 || TREE_CODE_CLASS (gimple_assign_rhs_code (rhs_assign))
16995 != tcc_comparison)
16996 continue;
16998 if (vec_flags & VEC_ADVSIMD)
16999 return false;
17001 return true;
17003 return false;
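/* Illustrative, standalone sketch (not part of this file): the source shape
   the predicate above matches - two comparisons feeding a single
   BIT_AND_EXPR on booleans.  When vectorized for SVE the second compare can
   run under the predicate produced by the first, so the AND needs no
   separate instruction; Advanced SIMD has no such predicated compare, hence
   the early return above.  */

#include <cstddef>

int
count_in_band (const int *x, const int *lo, const int *hi, size_t n)
{
  int count = 0;
  for (size_t i = 0; i < n; ++i)
    /* Two tcc_comparison operands combined with a boolean AND.  */
    count += (x[i] > lo[i]) & (x[i] < hi[i]);
  return count;
}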
17006 /* We are considering implementing STMT_INFO using SVE. If STMT_INFO is an
17007 in-loop reduction that SVE supports directly, return its latency in cycles,
17008 otherwise return zero. SVE_COSTS specifies the latencies of the relevant
17009 instructions. */
17010 static unsigned int
17011 aarch64_sve_in_loop_reduction_latency (vec_info *vinfo,
17012 stmt_vec_info stmt_info,
17013 const sve_vec_cost *sve_costs)
17015 switch (vect_reduc_type (vinfo, stmt_info))
17017 case EXTRACT_LAST_REDUCTION:
17018 return sve_costs->clast_cost;
17020 case FOLD_LEFT_REDUCTION:
17021 switch (TYPE_MODE (TREE_TYPE (gimple_get_lhs (stmt_info->stmt))))
17023 case E_HFmode:
17024 case E_BFmode:
17025 return sve_costs->fadda_f16_cost;
17027 case E_SFmode:
17028 return sve_costs->fadda_f32_cost;
17030 case E_DFmode:
17031 return sve_costs->fadda_f64_cost;
17033 default:
17034 break;
17036 break;
17039 return 0;
17042 /* STMT_INFO describes a loop-carried operation in the original scalar code
17043 that we are considering implementing as a reduction. Return one of the
17044 following values, depending on VEC_FLAGS:
17046 - If VEC_FLAGS is zero, return the loop carry latency of the original
17047 scalar operation.
17049 - If VEC_FLAGS & VEC_ADVSIMD, return the loop carry latency of the
17050 Advanced SIMD implementation.
17052 - If VEC_FLAGS & VEC_ANY_SVE, return the loop carry latency of the
17053 SVE implementation. */
17054 static unsigned int
17055 aarch64_in_loop_reduction_latency (vec_info *vinfo, stmt_vec_info stmt_info,
17056 unsigned int vec_flags)
17058 const cpu_vector_cost *vec_costs = aarch64_tune_params.vec_costs;
17059 const sve_vec_cost *sve_costs = nullptr;
17060 if (vec_flags & VEC_ANY_SVE)
17061 sve_costs = aarch64_tune_params.vec_costs->sve;
17063 /* If the caller is asking for the SVE latency, check for forms of reduction
17064 that only SVE can handle directly. */
17065 if (sve_costs)
17067 unsigned int latency
17068 = aarch64_sve_in_loop_reduction_latency (vinfo, stmt_info, sve_costs);
17069 if (latency)
17070 return latency;
17073 /* Handle scalar costs. */
17074 bool is_float = FLOAT_TYPE_P (TREE_TYPE (gimple_get_lhs (stmt_info->stmt)));
17075 if (vec_flags == 0)
17077 if (is_float)
17078 return vec_costs->scalar_fp_stmt_cost;
17079 return vec_costs->scalar_int_stmt_cost;
17082 /* Otherwise, the loop body just contains normal integer or FP operations,
17083 with a vector reduction outside the loop. */
17084 const simd_vec_cost *simd_costs
17085 = aarch64_simd_vec_costs_for_flags (vec_flags);
17086 if (is_float)
17087 return simd_costs->fp_stmt_cost;
17088 return simd_costs->int_stmt_cost;
17091 /* STMT_COST is the cost calculated by aarch64_builtin_vectorization_cost
17092 for STMT_INFO, which has cost kind KIND. If this is a scalar operation,
17093 try to subdivide the target-independent categorization provided by KIND
17094 to get a more accurate cost. */
17095 static fractional_cost
17096 aarch64_detect_scalar_stmt_subtype (vec_info *vinfo, vect_cost_for_stmt kind,
17097 stmt_vec_info stmt_info,
17098 fractional_cost stmt_cost)
17100 /* Detect an extension of a loaded value. In general, we'll be able to fuse
17101 the extension with the load. */
17102 if (kind == scalar_stmt && vect_is_extending_load (vinfo, stmt_info))
17103 return 0;
17105 return stmt_cost;
17108 /* STMT_COST is the cost calculated by aarch64_builtin_vectorization_cost
17109 for the vectorized form of STMT_INFO possibly using SLP node NODE, which has
17110 cost kind KIND and which when vectorized would operate on vector type
17111 VECTYPE. Try to subdivide the target-independent categorization provided by
17112 KIND to get a more accurate cost. WHERE specifies where the cost associated
17113 with KIND occurs. */
17114 static fractional_cost
17115 aarch64_detect_vector_stmt_subtype (vec_info *vinfo, vect_cost_for_stmt kind,
17116 stmt_vec_info stmt_info, slp_tree node,
17117 tree vectype,
17118 enum vect_cost_model_location where,
17119 fractional_cost stmt_cost)
17121 const simd_vec_cost *simd_costs = aarch64_simd_vec_costs (vectype);
17122 const sve_vec_cost *sve_costs = nullptr;
17123 if (aarch64_sve_mode_p (TYPE_MODE (vectype)))
17124 sve_costs = aarch64_tune_params.vec_costs->sve;
17126 /* It's generally better to avoid costing inductions, since the induction
17127 will usually be hidden by other operations. This is particularly true
17128 for things like COND_REDUCTIONS. */
17129 if (is_a<gphi *> (stmt_info->stmt))
17130 return 0;
17132 /* Detect cases in which vec_to_scalar is describing the extraction of a
17133 vector element in preparation for a scalar store. The store itself is
17134 costed separately. */
17135 if (vect_is_store_elt_extraction (kind, stmt_info))
17136 return simd_costs->store_elt_extra_cost;
17138 /* Detect SVE gather loads, which are costed as a single scalar_load
17139 for each element. We therefore need to divide the full-instruction
17140 cost by the number of elements in the vector. */
17141 if (kind == scalar_load
17142 && sve_costs
17143 && vect_mem_access_type (stmt_info, node) == VMAT_GATHER_SCATTER)
17145 unsigned int nunits = vect_nunits_for_cost (vectype);
17146 /* Test for VNx2 modes, which have 64-bit containers. */
17147 if (known_eq (GET_MODE_NUNITS (TYPE_MODE (vectype)), aarch64_sve_vg))
17148 return { sve_costs->gather_load_x64_cost, nunits };
17149 return { sve_costs->gather_load_x32_cost, nunits };
17152 /* Detect cases in which a scalar_store is really storing one element
17153 in a scatter operation. */
17154 if (kind == scalar_store
17155 && sve_costs
17156 && vect_mem_access_type (stmt_info, node) == VMAT_GATHER_SCATTER)
17157 return sve_costs->scatter_store_elt_cost;
17159 /* Detect cases in which vec_to_scalar represents an in-loop reduction. */
17160 if (kind == vec_to_scalar
17161 && where == vect_body
17162 && sve_costs)
17164 unsigned int latency
17165 = aarch64_sve_in_loop_reduction_latency (vinfo, stmt_info, sve_costs);
17166 if (latency)
17167 return latency;
17170 /* Detect cases in which vec_to_scalar represents a single reduction
17171 instruction like FADDP or MAXV. */
17172 if (kind == vec_to_scalar
17173 && where == vect_epilogue
17174 && vect_is_reduction (stmt_info))
17175 switch (GET_MODE_INNER (TYPE_MODE (vectype)))
17177 case E_QImode:
17178 return simd_costs->reduc_i8_cost;
17180 case E_HImode:
17181 return simd_costs->reduc_i16_cost;
17183 case E_SImode:
17184 return simd_costs->reduc_i32_cost;
17186 case E_DImode:
17187 return simd_costs->reduc_i64_cost;
17189 case E_HFmode:
17190 case E_BFmode:
17191 return simd_costs->reduc_f16_cost;
17193 case E_SFmode:
17194 return simd_costs->reduc_f32_cost;
17196 case E_DFmode:
17197 return simd_costs->reduc_f64_cost;
17199 default:
17200 break;
17203 /* Otherwise stick with the original categorization. */
17204 return stmt_cost;
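/* Illustrative, standalone sketch (not part of this file): the per-element
   gather-load costing above.  The tuning tables give the cost of a whole
   gather instruction, but the vectorizer asks for one scalar_load per
   element, so the full-instruction cost is spread across the elements.
   The numbers below are hypothetical.  */

#include <cstdio>

int
main ()
{
  unsigned gather_load_x32_cost = 12;	/* hypothetical whole-gather cost */
  unsigned nunits = 4;			/* 32-bit elements being gathered */
  std::printf ("cost charged per element = %u/%u = %.2f\n",
	       gather_load_x32_cost, nunits,
	       (double) gather_load_x32_cost / nunits);
  return 0;
}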
17207 /* STMT_COST is the cost calculated by aarch64_builtin_vectorization_cost
17208 for STMT_INFO, which has cost kind KIND and which when vectorized would
17209 operate on vector type VECTYPE. Adjust the cost as necessary for SVE
17210 targets. */
17211 static fractional_cost
17212 aarch64_sve_adjust_stmt_cost (class vec_info *vinfo, vect_cost_for_stmt kind,
17213 stmt_vec_info stmt_info, tree vectype,
17214 fractional_cost stmt_cost)
17216 /* Unlike vec_promote_demote, vector_stmt conversions do not change the
17217 vector register size or number of units. Integer promotions of this
17218 type therefore map to SXT[BHW] or UXT[BHW].
17220 Most loads have extending forms that can do the sign or zero extension
17221 on the fly. Optimistically assume that a load followed by an extension
17222 will fold to this form during combine, and that the extension therefore
17223 comes for free. */
17224 if (kind == vector_stmt && vect_is_extending_load (vinfo, stmt_info))
17225 stmt_cost = 0;
17227 /* For similar reasons, vector_stmt integer truncations are a no-op,
17228 because we can just ignore the unused upper bits of the source. */
17229 if (kind == vector_stmt && vect_is_integer_truncation (stmt_info))
17230 stmt_cost = 0;
17232 /* Advanced SIMD can load and store pairs of registers using LDP and STP,
17233 but there are no equivalent instructions for SVE. This means that
17234 (all other things being equal) 128-bit SVE needs twice as many load
17235 and store instructions as Advanced SIMD in order to process vector pairs.
17237 Also, scalar code can often use LDP and STP to access pairs of values,
17238 so it is too simplistic to say that one SVE load or store replaces
17239 VF scalar loads and stores.
17241 Ideally we would account for this in the scalar and Advanced SIMD
17242 costs by making suitable load/store pairs as cheap as a single
17243 load/store. However, that would be a very invasive change and in
17244 practice it tends to stress other parts of the cost model too much.
17245 E.g. stores of scalar constants currently count just a store,
17246 whereas stores of vector constants count a store and a vec_init.
17247 This is an artificial distinction for AArch64, where stores of
17248 nonzero scalar constants need the same kind of register invariant
17249 as vector stores.
17251 An alternative would be to double the cost of any SVE loads and stores
17252 that could be paired in Advanced SIMD (and possibly also paired in
17253 scalar code). But this tends to stress other parts of the cost model
17254 in the same way. It also means that we can fall back to Advanced SIMD
17255 even if full-loop predication would have been useful.
17257 Here we go for a more conservative version: double the costs of SVE
17258 loads and stores if one iteration of the scalar loop processes enough
17259 elements for it to use a whole number of Advanced SIMD LDP or STP
17260 instructions. This makes it very likely that the VF would be 1 for
17261 Advanced SIMD, and so no epilogue should be needed. */
17262 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
17264 stmt_vec_info first = DR_GROUP_FIRST_ELEMENT (stmt_info);
17265 unsigned int count = DR_GROUP_SIZE (first) - DR_GROUP_GAP (first);
17266 unsigned int elt_bits = GET_MODE_UNIT_BITSIZE (TYPE_MODE (vectype));
17267 if (multiple_p (count * elt_bits, 256)
17268 && aarch64_advsimd_ldp_stp_p (kind, stmt_info))
17269 stmt_cost *= 2;
17272 return stmt_cost;
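/* Illustrative, standalone sketch (not part of this file): the "whole
   number of Advanced SIMD LDP/STP pairs" test above, with plain integers
   standing in for the poly-int values.  A grouped access is charged double
   on SVE when one scalar iteration touches a multiple of 256 bits, i.e.
   a whole number of 128-bit register pairs.  */

#include <cstdio>

static bool
doubles_sve_mem_cost (unsigned count, unsigned elt_bits)
{
  return count * elt_bits % 256 == 0;
}

int
main ()
{
  /* 8 x 32-bit elements = 256 bits: doubled.  4 x 32-bit = 128 bits: not.  */
  std::printf ("%d %d\n",
	       (int) doubles_sve_mem_cost (8, 32),
	       (int) doubles_sve_mem_cost (4, 32));
  return 0;
}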
17275 /* STMT_COST is the cost calculated for STMT_INFO, which has cost kind KIND
17276 and which when vectorized would operate on vector type VECTYPE. Add the
17277 cost of any embedded operations. */
17278 static fractional_cost
17279 aarch64_adjust_stmt_cost (vec_info *vinfo, vect_cost_for_stmt kind,
17280 stmt_vec_info stmt_info, slp_tree node, tree vectype,
17281 unsigned vec_flags, fractional_cost stmt_cost)
17283 if (vectype)
17285 const simd_vec_cost *simd_costs = aarch64_simd_vec_costs (vectype);
17287 /* Detect cases in which a vector load or store represents an
17288 LD[234] or ST[234] instruction. */
17289 switch (aarch64_ld234_st234_vectors (kind, stmt_info, node))
17291 case 2:
17292 stmt_cost += simd_costs->ld2_st2_permute_cost;
17293 break;
17295 case 3:
17296 stmt_cost += simd_costs->ld3_st3_permute_cost;
17297 break;
17299 case 4:
17300 stmt_cost += simd_costs->ld4_st4_permute_cost;
17301 break;
17304 gassign *assign = dyn_cast<gassign *> (STMT_VINFO_STMT (stmt_info));
17305 if ((kind == scalar_stmt || kind == vector_stmt) && assign)
17307 /* For MLA we need to reduce the cost since MLA is 1 instruction. */
17308 if (!vect_is_reduction (stmt_info)
17309 && aarch64_multiply_add_p (vinfo, stmt_info, vec_flags))
17310 return 0;
17312 /* For vector boolean ANDs with a compare operand we just need
17313 one insn. */
17314 if (aarch64_bool_compound_p (vinfo, stmt_info, vec_flags))
17315 return 0;
17318 if (kind == vector_stmt || kind == vec_to_scalar)
17319 if (tree cmp_type = vect_embedded_comparison_type (stmt_info))
17321 if (FLOAT_TYPE_P (cmp_type))
17322 stmt_cost += simd_costs->fp_stmt_cost;
17323 else
17324 stmt_cost += simd_costs->int_stmt_cost;
17328 if (kind == scalar_stmt)
17329 if (tree cmp_type = vect_embedded_comparison_type (stmt_info))
17331 if (FLOAT_TYPE_P (cmp_type))
17332 stmt_cost += aarch64_tune_params.vec_costs->scalar_fp_stmt_cost;
17333 else
17334 stmt_cost += aarch64_tune_params.vec_costs->scalar_int_stmt_cost;
17337 return stmt_cost;
17340 /* Return true if STMT_INFO is part of a reduction that has the form:
17342 r = r op ...;
17343 r = r op ...;
17345 with the single accumulator being read and written multiple times. */
17346 static bool
17347 aarch64_force_single_cycle (vec_info *vinfo, stmt_vec_info stmt_info)
17349 if (!STMT_VINFO_REDUC_DEF (stmt_info))
17350 return false;
17352 auto reduc_info = info_for_reduction (vinfo, stmt_info);
17353 return STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info);
17356 /* COUNT, KIND and STMT_INFO are the same as for vector_costs::add_stmt_cost
17357 and they describe an operation in the body of a vector loop. Record issue
17358 information relating to the vector operation in OPS. */
17359 void
17360 aarch64_vector_costs::count_ops (unsigned int count, vect_cost_for_stmt kind,
17361 stmt_vec_info stmt_info, slp_tree node,
17362 aarch64_vec_op_count *ops)
17364 const aarch64_base_vec_issue_info *base_issue = ops->base_issue_info ();
17365 if (!base_issue)
17366 return;
17367 const aarch64_simd_vec_issue_info *simd_issue = ops->simd_issue_info ();
17368 const aarch64_sve_vec_issue_info *sve_issue = ops->sve_issue_info ();
17370 /* Calculate the minimum cycles per iteration imposed by a reduction
17371 operation. */
17372 if ((kind == scalar_stmt || kind == vector_stmt || kind == vec_to_scalar)
17373 && vect_is_reduction (stmt_info))
17375 unsigned int base
17376 = aarch64_in_loop_reduction_latency (m_vinfo, stmt_info, m_vec_flags);
17377 if (aarch64_force_single_cycle (m_vinfo, stmt_info))
17378 /* ??? Ideally we'd use a tree to reduce the copies down to 1 vector,
17379 and then accumulate that, but at the moment the loop-carried
17380 dependency includes all copies. */
17381 ops->reduction_latency = MAX (ops->reduction_latency, base * count);
17382 else
17383 ops->reduction_latency = MAX (ops->reduction_latency, base);
17386 if (stmt_info && (kind == scalar_stmt || kind == vector_stmt))
17388 /* Assume that multiply-adds will become a single operation. */
17389 if (aarch64_multiply_add_p (m_vinfo, stmt_info, m_vec_flags))
17390 return;
17392 /* Assume that bool AND with compare operands will become a single
17393 operation. */
17394 if (aarch64_bool_compound_p (m_vinfo, stmt_info, m_vec_flags))
17395 return;
17398 /* Detect the case where we are using an emulated gather/scatter. When a
17399 target does not support gathers and scatters directly the vectorizer
17400 emulates these by constructing an index vector and then issuing an
17401 extraction for every lane in the vector. If the index vector is loaded
17402 from memory, the vector load and extractions are subsequently lowered by
17403 veclower into a series of scalar index loads. After the final loads are
17404 done it issues a vec_construct to recreate the vector from the scalar. For
17405 costing when we see a vec_to_scalar on a stmt with VMAT_GATHER_SCATTER we
17406 are dealing with an emulated instruction and should adjust costing
17407 properly. */
17408 if (kind == vec_to_scalar
17409 && (m_vec_flags & VEC_ADVSIMD)
17410 && vect_mem_access_type (stmt_info, node) == VMAT_GATHER_SCATTER)
17412 auto dr = STMT_VINFO_DATA_REF (stmt_info);
17413 tree dr_ref = DR_REF (dr);
17414 while (handled_component_p (dr_ref))
17416 if (TREE_CODE (dr_ref) == ARRAY_REF)
17418 tree offset = TREE_OPERAND (dr_ref, 1);
17419 if (SSA_VAR_P (offset))
17421 if (gimple_vuse (SSA_NAME_DEF_STMT (offset)))
17423 if (STMT_VINFO_TYPE (stmt_info) == load_vec_info_type)
17424 ops->loads += count - 1;
17425 else
17426 /* Stores need to count both the index into the array and the data
17427 being stored using vec_to_scalar. However we have index stores in
17428 in Adv.SIMD and so we only want to adjust the index
17429 loads. */
17430 ops->loads += count / 2;
17431 return;
17433 break;
17436 dr_ref = TREE_OPERAND (dr_ref, 0);
17440 /* Count the basic operation cost associated with KIND. */
17441 switch (kind)
17443 case cond_branch_taken:
17444 case cond_branch_not_taken:
17445 case vector_gather_load:
17446 case vector_scatter_store:
17447 /* We currently don't expect these to be used in a loop body. */
17448 break;
17450 case vec_perm:
17451 case vec_promote_demote:
17452 case vec_construct:
17453 case vec_to_scalar:
17454 case scalar_to_vec:
17455 case vector_stmt:
17456 case scalar_stmt:
17457 ops->general_ops += count;
17458 break;
17460 case scalar_load:
17461 case vector_load:
17462 case unaligned_load:
17463 ops->loads += count;
17464 if (m_vec_flags || FLOAT_TYPE_P (aarch64_dr_type (stmt_info)))
17465 ops->general_ops += base_issue->fp_simd_load_general_ops * count;
17466 break;
17468 case vector_store:
17469 case unaligned_store:
17470 case scalar_store:
17471 ops->stores += count;
17472 if (m_vec_flags || FLOAT_TYPE_P (aarch64_dr_type (stmt_info)))
17473 ops->general_ops += base_issue->fp_simd_store_general_ops * count;
17474 break;
17477 /* Add any embedded comparison operations. */
17478 if ((kind == scalar_stmt || kind == vector_stmt || kind == vec_to_scalar)
17479 && vect_embedded_comparison_type (stmt_info))
17480 ops->general_ops += count;
17482 /* COND_REDUCTIONS need two sets of VEC_COND_EXPRs, whereas so far we
17483 have only accounted for one. */
17484 if ((kind == vector_stmt || kind == vec_to_scalar)
17485 && vect_reduc_type (m_vinfo, stmt_info) == COND_REDUCTION)
17486 ops->general_ops += count;
17488 /* Count the predicate operations needed by an SVE comparison. */
17489 if (sve_issue && (kind == vector_stmt || kind == vec_to_scalar))
17490 if (tree type = vect_comparison_type (stmt_info))
17492 unsigned int base = (FLOAT_TYPE_P (type)
17493 ? sve_issue->fp_cmp_pred_ops
17494 : sve_issue->int_cmp_pred_ops);
17495 ops->pred_ops += base * count;
17498 /* Add any extra overhead associated with LD[234] and ST[234] operations. */
17499 if (simd_issue)
17500 switch (aarch64_ld234_st234_vectors (kind, stmt_info, node))
17502 case 2:
17503 ops->general_ops += simd_issue->ld2_st2_general_ops * count;
17504 break;
17506 case 3:
17507 ops->general_ops += simd_issue->ld3_st3_general_ops * count;
17508 break;
17510 case 4:
17511 ops->general_ops += simd_issue->ld4_st4_general_ops * count;
17512 break;
17515 /* Add any overhead associated with gather loads and scatter stores. */
17516 if (sve_issue
17517 && (kind == scalar_load || kind == scalar_store)
17518 && vect_mem_access_type (stmt_info, node) == VMAT_GATHER_SCATTER)
17520 unsigned int pairs = CEIL (count, 2);
17521 ops->pred_ops += sve_issue->gather_scatter_pair_pred_ops * pairs;
17522 ops->general_ops += sve_issue->gather_scatter_pair_general_ops * pairs;
17526 /* Return true if STMT_INFO contains a memory access and if the constant
17527 component of the memory address is aligned to SIZE bytes. */
17528 static bool
17529 aarch64_aligned_constant_offset_p (stmt_vec_info stmt_info,
17530 poly_uint64 size)
17532 if (!STMT_VINFO_DATA_REF (stmt_info))
17533 return false;
17535 if (auto first_stmt = DR_GROUP_FIRST_ELEMENT (stmt_info))
17536 stmt_info = first_stmt;
17537 tree constant_offset = DR_INIT (STMT_VINFO_DATA_REF (stmt_info));
17538 /* Needed for gathers & scatters, for example. */
17539 if (!constant_offset)
17540 return false;
17542 return multiple_p (wi::to_poly_offset (constant_offset), size);
17545 /* Check if a scalar or vector stmt could be part of a region of code
17546 that does nothing more than store values to memory, in the scalar
17547 case using STP. Return the cost of the stmt if so, counting 2 for
17548 one instruction. Return ~0U otherwise.
17550 The arguments are a subset of those passed to add_stmt_cost. */
17551 unsigned int
17552 aarch64_stp_sequence_cost (unsigned int count, vect_cost_for_stmt kind,
17553 stmt_vec_info stmt_info, tree vectype)
17555 /* Code that stores vector constants uses a vector_load to create
17556 the constant. We don't apply the heuristic to that case for two
17557 main reasons:
17559 - At the moment, STPs are only formed via peephole2, and the
17560 constant scalar moves would often come between STRs and so
17561 prevent STP formation.
17563 - The scalar code also has to load the constant somehow, and that
17564 isn't costed. */
17565 switch (kind)
17567 case scalar_to_vec:
17568 /* Count 2 insns for a GPR->SIMD dup and 1 insn for a FPR->SIMD dup. */
17569 return (FLOAT_TYPE_P (vectype) ? 2 : 4) * count;
17571 case vec_construct:
17572 if (FLOAT_TYPE_P (vectype))
17573 /* Count 1 insn for the maximum number of FP->SIMD INS
17574 instructions. */
17575 return (vect_nunits_for_cost (vectype) - 1) * 2 * count;
17577 /* Count 2 insns for a GPR->SIMD move and 2 insns for the
17578 maximum number of GPR->SIMD INS instructions. */
17579 return vect_nunits_for_cost (vectype) * 4 * count;
17581 case vector_store:
17582 case unaligned_store:
17583 /* Count 1 insn per vector if we can't form STP Q pairs. */
17584 if (aarch64_sve_mode_p (TYPE_MODE (vectype)))
17585 return count * 2;
17587 if (stmt_info)
17589 /* Assume we won't be able to use STP if the constant offset
17590 component of the address is misaligned. ??? This could be
17591 removed if we formed STP pairs earlier, rather than relying
17592 on peephole2. */
17593 auto size = GET_MODE_SIZE (TYPE_MODE (vectype));
17594 if (!aarch64_aligned_constant_offset_p (stmt_info, size))
17595 return count * 2;
17597 return CEIL (count, 2) * 2;
17599 case scalar_store:
17600 if (stmt_info && STMT_VINFO_DATA_REF (stmt_info))
17602 /* Check for a mode in which STP pairs can be formed. */
17603 auto size = GET_MODE_SIZE (TYPE_MODE (aarch64_dr_type (stmt_info)));
17604 if (maybe_ne (size, 4) && maybe_ne (size, 8))
17605 return ~0U;
17607 /* Assume we won't be able to use STP if the constant offset
17608 component of the address is misaligned. ??? This could be
17609 removed if we formed STP pairs earlier, rather than relying
17610 on peephole2. */
17611 if (!aarch64_aligned_constant_offset_p (stmt_info, size))
17612 return ~0U;
17614 return count;
17616 default:
17617 return ~0U;
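/* Illustrative, standalone sketch (not part of this file): the
   2-units-per-instruction bookkeeping above, applied to the
   "a[0] = x; a[1] = x" example from the comment above m_stp_sequence_cost,
   assuming a 64-bit GPR value and an aligned, non-SVE vector store.  The
   scalar total (two stores that pair into one STP) comes out lower, which
   is roughly what later penalizes the vector loop body.  */

#include <cstdio>

int
main ()
{
  /* Scalar version: two 64-bit scalar_store entries, 1 unit each.  */
  unsigned scalar_total = 1 + 1;
  /* Vector version: one GPR->SIMD scalar_to_vec (4 units) plus one
     paired vector store (2 units).  */
  unsigned vector_total = 4 + 2;
  std::printf ("scalar %u vs vector %u -> %s\n", scalar_total, vector_total,
	       scalar_total < vector_total ? "prefer the STP sequence"
					   : "no penalty");
  return 0;
}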
17621 unsigned
17622 aarch64_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
17623 stmt_vec_info stmt_info, slp_tree node,
17624 tree vectype, int misalign,
17625 vect_cost_model_location where)
17627 fractional_cost stmt_cost
17628 = aarch64_builtin_vectorization_cost (kind, vectype, misalign);
17630 bool in_inner_loop_p = (where == vect_body
17631 && stmt_info
17632 && stmt_in_inner_loop_p (m_vinfo, stmt_info));
17634 /* Do one-time initialization based on the vinfo. */
17635 loop_vec_info loop_vinfo = dyn_cast<loop_vec_info> (m_vinfo);
17636 if (!m_analyzed_vinfo)
17638 if (loop_vinfo)
17639 analyze_loop_vinfo (loop_vinfo);
17641 m_analyzed_vinfo = true;
17644 /* Apply the heuristic described above m_stp_sequence_cost. */
17645 if (m_stp_sequence_cost != ~0U)
17647 uint64_t cost = aarch64_stp_sequence_cost (count, kind,
17648 stmt_info, vectype);
17649 m_stp_sequence_cost = MIN (m_stp_sequence_cost + cost, ~0U);
17652 /* Try to get a more accurate cost by looking at STMT_INFO instead
17653 of just looking at KIND. */
17654 if (stmt_info)
17656 /* If we scalarize a strided store, the vectorizer costs one
17657 vec_to_scalar for each element. However, we can store the first
17658 element using an FP store without a separate extract step. */
17659 if (vect_is_store_elt_extraction (kind, stmt_info))
17660 count -= 1;
17662 stmt_cost = aarch64_detect_scalar_stmt_subtype (m_vinfo, kind,
17663 stmt_info, stmt_cost);
17665 if (vectype && m_vec_flags)
17666 stmt_cost = aarch64_detect_vector_stmt_subtype (m_vinfo, kind,
17667 stmt_info, node,
17668 vectype, where,
17669 stmt_cost);
17671 /* Check if we've seen an SVE gather/scatter operation and which size. */
17672 if (kind == scalar_load
17673 && aarch64_sve_mode_p (TYPE_MODE (vectype))
17674 && vect_mem_access_type (stmt_info, node) == VMAT_GATHER_SCATTER)
17676 const sve_vec_cost *sve_costs = aarch64_tune_params.vec_costs->sve;
17677 if (sve_costs)
17679 /* Test for VNx2 modes, which have 64-bit containers. */
17680 if (known_eq (GET_MODE_NUNITS (TYPE_MODE (vectype)),
17681 aarch64_sve_vg))
17682 m_sve_gather_scatter_init_cost
17683 += sve_costs->gather_load_x64_init_cost;
17684 else
17685 m_sve_gather_scatter_init_cost
17686 += sve_costs->gather_load_x32_init_cost;
17691 /* Do any SVE-specific adjustments to the cost. */
17692 if (stmt_info && vectype && aarch64_sve_mode_p (TYPE_MODE (vectype)))
17693 stmt_cost = aarch64_sve_adjust_stmt_cost (m_vinfo, kind, stmt_info,
17694 vectype, stmt_cost);
17696 /* Vector promotion and demotion requires us to widen the operation first
17697 and only after that perform the conversion. Unfortunately the mid-end
17698 expects this to be doable as a single operation and doesn't pass on
17699 enough context here for us to tell which operation is happening. To
17700 account for this we count every promote-demote operation twice and if
17701 the previously costed operation was also a promote-demote we reduce
17702 the cost of the currently being costed operation to simulate the final
17703 conversion cost. Note that for SVE we can do better here if the converted
17704 value comes from a load since the widening load would consume the widening
17705 operations. However since we're in stage 3 we can't change the helper
17706 vect_is_extending_load and duplicating the code seems not useful. */
17707 gassign *assign = NULL;
17708 if (kind == vec_promote_demote
17709 && (assign = dyn_cast <gassign *> (STMT_VINFO_STMT (stmt_info)))
17710 && gimple_assign_rhs_code (assign) == FLOAT_EXPR)
17712 auto new_count = count * 2 - m_num_last_promote_demote;
17713 m_num_last_promote_demote = count;
17714 count = new_count;
17716 else
17717 m_num_last_promote_demote = 0;
17719 if (stmt_info)
17721 /* Account for any extra "embedded" costs that apply additively
17722 to the base cost calculated above. */
17723 stmt_cost = aarch64_adjust_stmt_cost (m_vinfo, kind, stmt_info, node,
17724 vectype, m_vec_flags, stmt_cost);
17726 /* If we're recording a nonzero vector loop body cost for the
17727 innermost loop, also estimate the operations that would need
17728 to be issued by all relevant implementations of the loop. */
17729 if (loop_vinfo
17730 && (m_costing_for_scalar || where == vect_body)
17731 && (!LOOP_VINFO_LOOP (loop_vinfo)->inner || in_inner_loop_p)
17732 && stmt_cost != 0)
17733 for (auto &ops : m_ops)
17734 count_ops (count, kind, stmt_info, node, &ops);
17736 /* If we're applying the SVE vs. Advanced SIMD unrolling heuristic,
17737 estimate the number of statements in the unrolled Advanced SIMD
17738 loop. For simplicity, we assume that one iteration of the
17739 Advanced SIMD loop would need the same number of statements
17740 as one iteration of the SVE loop. */
17741 if (where == vect_body && m_unrolled_advsimd_niters)
17742 m_unrolled_advsimd_stmts += count * m_unrolled_advsimd_niters;
17744 /* Detect the use of an averaging operation. */
17745 gimple *stmt = stmt_info->stmt;
17746 if (is_gimple_call (stmt)
17747 && gimple_call_internal_p (stmt))
17749 switch (gimple_call_internal_fn (stmt))
17751 case IFN_AVG_FLOOR:
17752 case IFN_AVG_CEIL:
17753 m_has_avg = true;
17754 default:
17755 break;
17760 /* If the statement stores to a decl that is known to be the argument
17761 to a vld1 in the same function, ignore the store for costing purposes.
17762 See the comment above m_stores_to_vector_load_decl for more details. */
17763 if (stmt_info
17764 && (kind == vector_store || kind == unaligned_store)
17765 && aarch64_accesses_vector_load_decl_p (stmt_info))
17767 stmt_cost = 0;
17768 m_stores_to_vector_load_decl = true;
17771 return record_stmt_cost (stmt_info, where, (count * stmt_cost).ceil ());
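/* Illustrative, standalone sketch (not part of this file): the
   promote/demote double-counting above, applied to two back-to-back
   vec_promote_demote groups of two copies each.  The first group is
   charged 2 * 2 = 4 copies, the second 2 * 2 - 2 = 2, simulating the
   shared final conversion step.  */

#include <cstdio>

int
main ()
{
  unsigned counts[] = { 2, 2 };	/* copies costed for consecutive groups */
  unsigned last = 0;
  for (unsigned count : counts)
    {
      unsigned charged = count * 2 - last;
      last = count;
      std::printf ("count %u -> charged %u\n", count, charged);
    }
  return 0;
}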
17774 /* Return true if (a) we're applying the Advanced SIMD vs. SVE unrolling
17775 heuristic described above m_unrolled_advsimd_niters and (b) the heuristic
17776 says that we should prefer the Advanced SIMD loop. */
17777 bool
17778 aarch64_vector_costs::prefer_unrolled_loop () const
17780 if (!m_unrolled_advsimd_stmts)
17781 return false;
17783 if (dump_enabled_p ())
17784 dump_printf_loc (MSG_NOTE, vect_location, "Number of insns in"
17785 " unrolled Advanced SIMD loop = "
17786 HOST_WIDE_INT_PRINT_UNSIGNED "\n",
17787 m_unrolled_advsimd_stmts);
17789 /* The balance here is tricky. On the one hand, we can't be sure whether
17790 the code is vectorizable with Advanced SIMD or not. However, even if
17791 it isn't vectorizable with Advanced SIMD, there's a possibility that
17792 the scalar code could also be unrolled. Some of the code might then
17793 benefit from SLP, or from using LDP and STP. We therefore apply
17794 the heuristic regardless of can_use_advsimd_p. */
17795 return (m_unrolled_advsimd_stmts
17796 && (m_unrolled_advsimd_stmts
17797 <= (unsigned int) param_max_completely_peeled_insns));
17800 /* Subroutine of adjust_body_cost for handling SVE. Use the issue
17801 information in OPS to work out how fast the SVE code can be issued and
17802 compare it to the equivalent value for scalar code
17803 (SCALAR_CYCLES_PER_ITER).
17806 ORIG_BODY_COST is the cost originally passed to adjust_body_cost and
17807 *BODY_COST is the current value of the adjusted cost. *SHOULD_DISPARAGE
17808 is true if we think the loop body is too expensive. */
17810 fractional_cost
17811 aarch64_vector_costs::
17812 adjust_body_cost_sve (const aarch64_vec_op_count *ops,
17813 fractional_cost scalar_cycles_per_iter,
17814 unsigned int orig_body_cost, unsigned int *body_cost,
17815 bool *should_disparage)
17817 if (dump_enabled_p ())
17818 ops->dump ();
17820 fractional_cost sve_pred_cycles_per_iter = ops->min_pred_cycles_per_iter ();
17821 fractional_cost sve_cycles_per_iter = ops->min_cycles_per_iter ();
17823 /* If the scalar version of the loop could issue at least as
17824 quickly as the predicate parts of the SVE loop, make the SVE loop
17825 prohibitively expensive. In this case vectorization is adding an
17826 overhead that the original scalar code didn't have.
17828 This is mostly intended to detect cases in which WHILELOs dominate
17829 for very tight loops, which is something that normal latency-based
17830    costs would not model.  Adding this kind of cliff edge would be
17831 too drastic for scalar_cycles_per_iter vs. sve_cycles_per_iter;
17832 code in the caller handles that case in a more conservative way. */
17833 fractional_cost sve_estimate = sve_pred_cycles_per_iter + 1;
17834 if (scalar_cycles_per_iter < sve_estimate)
17836 unsigned int min_cost
17837 = orig_body_cost * estimated_poly_value (BYTES_PER_SVE_VECTOR);
17838 if (*body_cost < min_cost)
17840 if (dump_enabled_p ())
17841 dump_printf_loc (MSG_NOTE, vect_location,
17842 "Increasing body cost to %d because the"
17843 " scalar code could issue within the limit"
17844 " imposed by predicate operations\n",
17845 min_cost);
17846 *body_cost = min_cost;
17847 *should_disparage = true;
17851 return sve_cycles_per_iter;
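/* Worked example (hypothetical numbers, for illustration only): with
   sve_pred_cycles_per_iter == 2 the estimate above becomes 3, so a scalar
   loop that issues in 2 cycles per (VF-weighted) iteration triggers the
   penalty.  With orig_body_cost == 100 on a target whose estimated SVE
   vector length is 128 bits (16 bytes), the body cost is raised to at
   least 100 * 16 == 1600.  */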
17854 unsigned int
17855 aarch64_vector_costs::determine_suggested_unroll_factor ()
17857 bool sve = m_vec_flags & VEC_ANY_SVE;
17858 /* If we are trying to unroll an Advanced SIMD main loop that contains
17859 an averaging operation that we do not support with SVE and we might use a
17860 predicated epilogue, we need to be conservative and block unrolling as
17861 this might lead to a less optimal loop for the first and only epilogue
17862 using the original loop's vectorization factor.
17863 TODO: Remove this constraint when we add support for multiple epilogue
17864 vectorization. */
17865 if (!sve && !TARGET_SVE2 && m_has_avg)
17866 return 1;
17868 unsigned int max_unroll_factor = 1;
17869 for (auto vec_ops : m_ops)
17871 aarch64_simd_vec_issue_info const *vec_issue
17872 = vec_ops.simd_issue_info ();
17873 if (!vec_issue)
17874 return 1;
17875       /* Limit the unroll factor to a value adjustable by the user; the
17876	 default value is 4.  */
17877 unsigned int unroll_factor = aarch64_vect_unroll_limit;
17878 unsigned int factor
17879 = vec_ops.reduction_latency > 1 ? vec_ops.reduction_latency : 1;
17880 unsigned int temp;
17882 /* Sanity check, this should never happen. */
17883 if ((vec_ops.stores + vec_ops.loads + vec_ops.general_ops) == 0)
17884 return 1;
17886 /* Check stores. */
17887 if (vec_ops.stores > 0)
17889 temp = CEIL (factor * vec_issue->stores_per_cycle,
17890 vec_ops.stores);
17891 unroll_factor = MIN (unroll_factor, temp);
17894 /* Check loads + stores. */
17895 if (vec_ops.loads > 0)
17897 temp = CEIL (factor * vec_issue->loads_stores_per_cycle,
17898 vec_ops.loads + vec_ops.stores);
17899 unroll_factor = MIN (unroll_factor, temp);
17902 /* Check general ops. */
17903 if (vec_ops.general_ops > 0)
17905 temp = CEIL (factor * vec_issue->general_ops_per_cycle,
17906 vec_ops.general_ops);
17907 unroll_factor = MIN (unroll_factor, temp);
17909 max_unroll_factor = MAX (max_unroll_factor, unroll_factor);
17912 /* Make sure unroll factor is power of 2. */
17913 return 1 << ceil_log2 (max_unroll_factor);
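/* A minimal sketch of the calculation above, using hypothetical issue rates
   and operation counts; it is guarded out and only meant to illustrate the
   formulas, not to be part of the backend.  */
#if 0
static unsigned int
aarch64_unroll_factor_example (void)
{
  /* Hypothetical tuning: 2 stores, 3 loads/stores and 4 general ops per
     cycle.  Hypothetical loop body: 1 store, 3 loads, 4 general ops and
     no multi-cycle reduction (factor == 1).  */
  unsigned int unroll = 4;			/* aarch64_vect_unroll_limit.  */
  unroll = MIN (unroll, CEIL (1 * 2, 1));	/* Stores limit: 2.  */
  unroll = MIN (unroll, CEIL (1 * 3, 3 + 1));	/* Loads + stores limit: 1.  */
  unroll = MIN (unroll, CEIL (1 * 4, 4));	/* General ops limit: 1.  */
  return 1 << ceil_log2 (unroll);		/* Rounded to a power of 2: 1.  */
}
#endif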
17916 /* BODY_COST is the cost of a vector loop body. Adjust the cost as necessary
17917 and return the new cost. */
17918 unsigned int
17919 aarch64_vector_costs::
17920 adjust_body_cost (loop_vec_info loop_vinfo,
17921 const aarch64_vector_costs *scalar_costs,
17922 unsigned int body_cost)
17924 if (scalar_costs->m_ops.is_empty () || m_ops.is_empty ())
17925 return body_cost;
17927 const auto &scalar_ops = scalar_costs->m_ops[0];
17928 const auto &vector_ops = m_ops[0];
17929 unsigned int estimated_vf = vect_vf_for_cost (loop_vinfo);
17930 unsigned int orig_body_cost = body_cost;
17931 bool should_disparage = false;
17933 if (dump_enabled_p ())
17934 dump_printf_loc (MSG_NOTE, vect_location,
17935 "Original vector body cost = %d\n", body_cost);
17937 /* If we know we have a single partial vector iteration, cap the VF
17938 to the number of scalar iterations for costing purposes. */
17939 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
17941 auto niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
17942 if (niters < estimated_vf && dump_enabled_p ())
17943 dump_printf_loc (MSG_NOTE, vect_location,
17944 "Scalar loop iterates at most %wd times. Capping VF "
17945 " from %d to %wd\n", niters, estimated_vf, niters);
17947 estimated_vf = MIN (estimated_vf, niters);
17950 fractional_cost scalar_cycles_per_iter
17951 = scalar_ops.min_cycles_per_iter () * estimated_vf;
17953 fractional_cost vector_cycles_per_iter = vector_ops.min_cycles_per_iter ();
17955 if (dump_enabled_p ())
17957 if (IN_RANGE (m_num_vector_iterations, 0, 65536))
17958 dump_printf_loc (MSG_NOTE, vect_location,
17959 "Vector loop iterates at most %wd times\n",
17960 m_num_vector_iterations);
17961 dump_printf_loc (MSG_NOTE, vect_location, "Scalar issue estimate:\n");
17962 scalar_ops.dump ();
17963 dump_printf_loc (MSG_NOTE, vect_location,
17964 " estimated cycles per vector iteration"
17965 " (for VF %d) = %f\n",
17966 estimated_vf, scalar_cycles_per_iter.as_double ());
17969 if (vector_ops.sve_issue_info ())
17971 if (dump_enabled_p ())
17972 dump_printf_loc (MSG_NOTE, vect_location, "SVE issue estimate:\n");
17973 vector_cycles_per_iter
17974 = adjust_body_cost_sve (&vector_ops, scalar_cycles_per_iter,
17975 orig_body_cost, &body_cost, &should_disparage);
17977 if (aarch64_tune_params.vec_costs == &neoverse512tvb_vector_cost)
17979 /* Also take Neoverse V1 tuning into account, doubling the
17980 scalar and Advanced SIMD estimates to account for the
17981 doubling in SVE vector length. */
17982 if (dump_enabled_p ())
17983 dump_printf_loc (MSG_NOTE, vect_location,
17984 "Neoverse V1 estimate:\n");
17985 auto vf_factor = m_ops[1].vf_factor ();
17986 adjust_body_cost_sve (&m_ops[1], scalar_cycles_per_iter * vf_factor,
17987 orig_body_cost, &body_cost, &should_disparage);
17990 else
17992 if (dump_enabled_p ())
17994 dump_printf_loc (MSG_NOTE, vect_location,
17995 "Vector issue estimate:\n");
17996 vector_ops.dump ();
18000 /* Decide whether to stick to latency-based costs or whether to try to
18001 take issue rates into account. */
18002 unsigned int threshold = aarch64_loop_vect_issue_rate_niters;
18003 if (m_vec_flags & VEC_ANY_SVE)
18004 threshold = CEIL (threshold, aarch64_estimated_sve_vq ());
18006 if (m_num_vector_iterations >= 1
18007 && m_num_vector_iterations < threshold)
18009 if (dump_enabled_p ())
18010 dump_printf_loc (MSG_NOTE, vect_location,
18011 "Low iteration count, so using pure latency"
18012 " costs\n");
18014 /* Increase the cost of the vector code if it looks like the scalar code
18015 could issue more quickly. These values are only rough estimates,
18016 so minor differences should only result in minor changes. */
18017 else if (scalar_cycles_per_iter < vector_cycles_per_iter)
18019 body_cost = fractional_cost::scale (body_cost, vector_cycles_per_iter,
18020 scalar_cycles_per_iter);
18021 if (dump_enabled_p ())
18022 dump_printf_loc (MSG_NOTE, vect_location,
18023 "Increasing body cost to %d because scalar code"
18024 " would issue more quickly\n", body_cost);
18026 /* In general, it's expected that the proposed vector code would be able
18027 to issue more quickly than the original scalar code. This should
18028 already be reflected to some extent in the latency-based costs.
18030 However, the latency-based costs effectively assume that the scalar
18031 code and the vector code execute serially, which tends to underplay
18032 one important case: if the real (non-serialized) execution time of
18033 a scalar iteration is dominated by loop-carried dependencies,
18034 and if the vector code is able to reduce both the length of
18035 the loop-carried dependencies *and* the number of cycles needed
18036 to issue the code in general, we can be more confident that the
18037 vector code is an improvement, even if adding the other (non-loop-carried)
18038 latencies tends to hide this saving. We therefore reduce the cost of the
18039 vector loop body in proportion to the saving. */
18040 else if (scalar_ops.reduction_latency > vector_ops.reduction_latency
18041 && scalar_ops.reduction_latency == scalar_cycles_per_iter
18042 && scalar_cycles_per_iter > vector_cycles_per_iter
18043 && !should_disparage)
18045 body_cost = fractional_cost::scale (body_cost, vector_cycles_per_iter,
18046 scalar_cycles_per_iter);
18047 if (dump_enabled_p ())
18048 dump_printf_loc (MSG_NOTE, vect_location,
18049 "Decreasing body cost to %d account for smaller"
18050 " reduction latency\n", body_cost);
18053 return body_cost;
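/* A minimal sketch of the issue-rate scaling above, with made-up numbers;
   guarded out, purely illustrative.  */
#if 0
static unsigned int
aarch64_body_cost_scaling_example (void)
{
  fractional_cost scalar_cycles_per_iter = 4;	/* VF-weighted scalar rate.  */
  fractional_cost vector_cycles_per_iter = 6;	/* Vector rate.  */
  /* The scalar code issues more quickly, so a body cost of 100 is scaled
     up to 100 * 6 / 4 == 150.  */
  return fractional_cost::scale (100, vector_cycles_per_iter,
				 scalar_cycles_per_iter);
}
#endif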
18056 void
18057 aarch64_vector_costs::finish_cost (const vector_costs *uncast_scalar_costs)
18059 /* Record the issue information for any SVE WHILE instructions that the
18060 loop needs. */
18061 loop_vec_info loop_vinfo = dyn_cast<loop_vec_info> (m_vinfo);
18062 if (!m_ops.is_empty ()
18063 && loop_vinfo
18064 && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
18066 unsigned int num_masks = 0;
18067 rgroup_controls *rgm;
18068 unsigned int num_vectors_m1;
18069 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo).rgc_vec,
18070 num_vectors_m1, rgm)
18071 if (rgm->type)
18072 num_masks += num_vectors_m1 + 1;
18073 for (auto &ops : m_ops)
18074 if (auto *issue = ops.sve_issue_info ())
18075 ops.pred_ops += num_masks * issue->while_pred_ops;
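  /* Illustrative arithmetic, not from the original source: a fully-masked
     loop with two mask rgroups that need one and two vectors respectively
     gives num_masks == (0 + 1) + (1 + 1) == 3, so a subtuning whose
     (hypothetical) while_pred_ops is 1 sees its pred_ops grow by 3.  */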
18078 auto *scalar_costs
18079 = static_cast<const aarch64_vector_costs *> (uncast_scalar_costs);
18080 if (loop_vinfo && m_vec_flags)
18082 m_costs[vect_body] = adjust_body_cost (loop_vinfo, scalar_costs,
18083 m_costs[vect_body]);
18084 m_suggested_unroll_factor = determine_suggested_unroll_factor ();
18086       /* For gathers and scatters there's an additional overhead for the
18087	 first iteration.  For low-count loops they're not beneficial, so model
18088	 the overhead as loop prologue costs.  */
18089 m_costs[vect_prologue] += m_sve_gather_scatter_init_cost;
18092 /* Apply the heuristic described above m_stp_sequence_cost. Prefer
18093 the scalar code in the event of a tie, since there is more chance
18094 of scalar code being optimized with surrounding operations.
18096 In addition, if the vector body is a simple store to a decl that
18097 is elsewhere loaded using vld1, strongly prefer the vector form,
18098 to the extent of giving the prologue a zero cost. See the comment
18099 above m_stores_to_vector_load_decl for details. */
18100 if (!loop_vinfo
18101 && scalar_costs
18102 && m_stp_sequence_cost != ~0U)
18104 if (m_stores_to_vector_load_decl)
18105 m_costs[vect_prologue] = 0;
18106 else if (m_stp_sequence_cost >= scalar_costs->m_stp_sequence_cost)
18107 m_costs[vect_body] = 2 * scalar_costs->total_cost ();
18110 vector_costs::finish_cost (scalar_costs);
18113 bool
18114 aarch64_vector_costs::
18115 better_main_loop_than_p (const vector_costs *uncast_other) const
18117 auto other = static_cast<const aarch64_vector_costs *> (uncast_other);
18119 auto this_loop_vinfo = as_a<loop_vec_info> (this->m_vinfo);
18120 auto other_loop_vinfo = as_a<loop_vec_info> (other->m_vinfo);
18122 if (dump_enabled_p ())
18123 dump_printf_loc (MSG_NOTE, vect_location,
18124 "Comparing two main loops (%s at VF %d vs %s at VF %d)\n",
18125 GET_MODE_NAME (this_loop_vinfo->vector_mode),
18126 vect_vf_for_cost (this_loop_vinfo),
18127 GET_MODE_NAME (other_loop_vinfo->vector_mode),
18128 vect_vf_for_cost (other_loop_vinfo));
18130 /* Apply the unrolling heuristic described above
18131 m_unrolled_advsimd_niters. */
18132 if (bool (m_unrolled_advsimd_stmts)
18133 != bool (other->m_unrolled_advsimd_stmts))
18135 bool this_prefer_unrolled = this->prefer_unrolled_loop ();
18136 bool other_prefer_unrolled = other->prefer_unrolled_loop ();
18137 if (this_prefer_unrolled != other_prefer_unrolled)
18139 if (dump_enabled_p ())
18140 dump_printf_loc (MSG_NOTE, vect_location,
18141 "Preferring Advanced SIMD loop because"
18142 " it can be unrolled\n");
18143 return other_prefer_unrolled;
18147 for (unsigned int i = 0; i < m_ops.length (); ++i)
18149 if (dump_enabled_p ())
18151 if (i)
18152 dump_printf_loc (MSG_NOTE, vect_location,
18153 "Reconsidering with subtuning %d\n", i);
18154 dump_printf_loc (MSG_NOTE, vect_location,
18155 "Issue info for %s loop:\n",
18156 GET_MODE_NAME (this_loop_vinfo->vector_mode));
18157 this->m_ops[i].dump ();
18158 dump_printf_loc (MSG_NOTE, vect_location,
18159 "Issue info for %s loop:\n",
18160 GET_MODE_NAME (other_loop_vinfo->vector_mode));
18161 other->m_ops[i].dump ();
18164 auto this_estimated_vf = (vect_vf_for_cost (this_loop_vinfo)
18165 * this->m_ops[i].vf_factor ());
18166 auto other_estimated_vf = (vect_vf_for_cost (other_loop_vinfo)
18167 * other->m_ops[i].vf_factor ());
18169 /* If it appears that one loop could process the same amount of data
18170 in fewer cycles, prefer that loop over the other one. */
18171 fractional_cost this_cost
18172 = this->m_ops[i].min_cycles_per_iter () * other_estimated_vf;
18173 fractional_cost other_cost
18174 = other->m_ops[i].min_cycles_per_iter () * this_estimated_vf;
18175 if (dump_enabled_p ())
18177 dump_printf_loc (MSG_NOTE, vect_location,
18178 "Weighted cycles per iteration of %s loop ~= %f\n",
18179 GET_MODE_NAME (this_loop_vinfo->vector_mode),
18180 this_cost.as_double ());
18181 dump_printf_loc (MSG_NOTE, vect_location,
18182 "Weighted cycles per iteration of %s loop ~= %f\n",
18183 GET_MODE_NAME (other_loop_vinfo->vector_mode),
18184 other_cost.as_double ());
18186 if (this_cost != other_cost)
18188 if (dump_enabled_p ())
18189 dump_printf_loc (MSG_NOTE, vect_location,
18190 "Preferring loop with lower cycles"
18191 " per iteration\n");
18192 return this_cost < other_cost;
18195 /* If the issue rate of SVE code is limited by predicate operations
18196 (i.e. if sve_pred_cycles_per_iter > sve_nonpred_cycles_per_iter),
18197 and if Advanced SIMD code could issue within the limit imposed
18198 by the predicate operations, the predicate operations are adding an
18199 overhead that the original code didn't have and so we should prefer
18200 the Advanced SIMD version. */
18201 auto better_pred_limit_p = [](const aarch64_vec_op_count &a,
18202 const aarch64_vec_op_count &b) -> bool
18204 if (a.pred_ops == 0
18205 && (b.min_pred_cycles_per_iter ()
18206 > b.min_nonpred_cycles_per_iter ()))
18208 if (dump_enabled_p ())
18209 dump_printf_loc (MSG_NOTE, vect_location,
18210 "Preferring Advanced SIMD loop since"
18211 " SVE loop is predicate-limited\n");
18212 return true;
18214 return false;
18216 if (better_pred_limit_p (this->m_ops[i], other->m_ops[i]))
18217 return true;
18218 if (better_pred_limit_p (other->m_ops[i], this->m_ops[i]))
18219 return false;
18222 return vector_costs::better_main_loop_than_p (other);
18225 static void initialize_aarch64_code_model (struct gcc_options *);
18227 /* Parse TOKEN, which has length LENGTH, to see if it is an option
18228    described in FLAG.  If it is, return the index bit for that fusion type.
18229 If not, error (printing OPTION_NAME) and return zero. */
18231 static unsigned int
18232 aarch64_parse_one_option_token (const char *token,
18233 size_t length,
18234 const struct aarch64_flag_desc *flag,
18235 const char *option_name)
18237 for (; flag->name != NULL; flag++)
18239 if (length == strlen (flag->name)
18240 && !strncmp (flag->name, token, length))
18241 return flag->flag;
18244 error ("unknown flag passed in %<-moverride=%s%> (%s)", option_name, token);
18245 return 0;
18248 /* Parse OPTION, which is a '.'-separated list of flags to enable.
18249 FLAGS gives the list of flags we understand, INITIAL_STATE gives any
18250 default state we inherit from the CPU tuning structures. OPTION_NAME
18251 gives the top-level option we are parsing in the -moverride string,
18252 for use in error messages. */
18254 static unsigned int
18255 aarch64_parse_boolean_options (const char *option,
18256 const struct aarch64_flag_desc *flags,
18257 unsigned int initial_state,
18258 const char *option_name)
18260 const char separator = '.';
18261 const char* specs = option;
18262 const char* ntoken = option;
18263 unsigned int found_flags = initial_state;
18265 while ((ntoken = strchr (specs, separator)))
18267 size_t token_length = ntoken - specs;
18268 unsigned token_ops = aarch64_parse_one_option_token (specs,
18269 token_length,
18270 flags,
18271 option_name);
18272 /* If we find "none" (or, for simplicity's sake, an error) anywhere
18273 in the token stream, reset the supported operations. So:
18275 adrp+add.cmp+branch.none.adrp+add
18277 would have the result of turning on only adrp+add fusion. */
18278 if (!token_ops)
18279 found_flags = 0;
18281 found_flags |= token_ops;
18282 specs = ++ntoken;
18285   /* The string ended with a trailing separator; diagnose the ill-formed
	 option.  */
18286 if (!(*specs))
18288 error ("%qs string ill-formed", option_name);
18289 return 0;
18292 /* We still have one more token to parse. */
18293 size_t token_length = strlen (specs);
18294 unsigned token_ops = aarch64_parse_one_option_token (specs,
18295 token_length,
18296 flags,
18297 option_name);
18298 if (!token_ops)
18299 found_flags = 0;
18301 found_flags |= token_ops;
18302 return found_flags;
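/* Worked example (illustrative only): for -moverride=fuse=adrp+add.cmp+branch
   the while loop consumes "adrp+add", the trailing-token code consumes
   "cmp+branch", and the returned value is INITIAL_STATE with the flag bits
   for both fusion types ORed in.  */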
18305 /* Support for overriding instruction fusion. */
18307 static void
18308 aarch64_parse_fuse_string (const char *fuse_string,
18309 struct tune_params *tune)
18311 tune->fusible_ops = aarch64_parse_boolean_options (fuse_string,
18312 aarch64_fusible_pairs,
18313 tune->fusible_ops,
18314 "fuse=");
18317 /* Support for overriding other tuning flags. */
18319 static void
18320 aarch64_parse_tune_string (const char *tune_string,
18321 struct tune_params *tune)
18323 tune->extra_tuning_flags
18324 = aarch64_parse_boolean_options (tune_string,
18325 aarch64_tuning_flags,
18326 tune->extra_tuning_flags,
18327 "tune=");
18330 /* Parse the sve_width tuning moverride string in TUNE_STRING.
18331 Accept the valid SVE vector widths allowed by
18332 aarch64_sve_vector_bits_enum and use it to override sve_width
18333 in TUNE. */
18335 static void
18336 aarch64_parse_sve_width_string (const char *tune_string,
18337 struct tune_params *tune)
18339 int width = -1;
18341 int n = sscanf (tune_string, "%d", &width);
18342 if (n == EOF)
18344 error ("invalid format for %<sve_width%>");
18345 return;
18347 switch (width)
18349 case SVE_128:
18350 case SVE_256:
18351 case SVE_512:
18352 case SVE_1024:
18353 case SVE_2048:
18354 break;
18355 default:
18356 error ("invalid %<sve_width%> value: %d", width);
18358 tune->sve_width = (enum aarch64_sve_vector_bits_enum) width;
18361 /* Parse TOKEN, which has length LENGTH, to see if it is a tuning option
18362    we understand.  If it is, extract the option string and hand off to
18363 the appropriate function. */
18365 void
18366 aarch64_parse_one_override_token (const char* token,
18367 size_t length,
18368 struct tune_params *tune)
18370 const struct aarch64_tuning_override_function *fn
18371 = aarch64_tuning_override_functions;
18373 const char *option_part = strchr (token, '=');
18374 if (!option_part)
18376 error ("tuning string missing in option (%s)", token);
18377 return;
18380 /* Get the length of the option name. */
18381 length = option_part - token;
18382 /* Skip the '=' to get to the option string. */
18383 option_part++;
18385 for (; fn->name != NULL; fn++)
18387 if (!strncmp (fn->name, token, length))
18389 fn->parse_override (option_part, tune);
18390 return;
18394   error ("unknown tuning option (%s)", token);
18395 return;
18398 /* Validate and clamp the TLS size according to the code model in OPTS.  */
18400 static void
18401 initialize_aarch64_tls_size (struct gcc_options *opts)
18403 if (aarch64_tls_size == 0)
18404 aarch64_tls_size = 24;
18406 switch (opts->x_aarch64_cmodel_var)
18408 case AARCH64_CMODEL_TINY:
18409       /* Both the default and the maximum TLS size allowed under tiny are 1M,
18410	 which needs two instructions to address, so we clamp the size to 24.  */
18411 if (aarch64_tls_size > 24)
18412 aarch64_tls_size = 24;
18413 break;
18414 case AARCH64_CMODEL_SMALL:
18415 /* The maximum TLS size allowed under small is 4G. */
18416 if (aarch64_tls_size > 32)
18417 aarch64_tls_size = 32;
18418 break;
18419 case AARCH64_CMODEL_LARGE:
18420       /* The maximum TLS size allowed under large is 16E.
18421	 FIXME: 16E needs a 64-bit offset; we only support a 48-bit offset now.  */
18422 if (aarch64_tls_size > 48)
18423 aarch64_tls_size = 48;
18424 break;
18425 default:
18426 gcc_unreachable ();
18429 return;
18432 /* Return the CPU corresponding to the enum CPU. */
18434 static const struct processor *
18435 aarch64_get_tune_cpu (enum aarch64_cpu cpu)
18437 gcc_assert (cpu != aarch64_no_cpu);
18439 return &all_cores[cpu];
18442 /* Return the architecture corresponding to the enum ARCH. */
18444 static const struct processor *
18445 aarch64_get_arch (enum aarch64_arch arch)
18447 gcc_assert (arch != aarch64_no_arch);
18449 return &all_architectures[arch];
18452 /* Parse STRING looking for options in the format:
18453 string :: option:string
18454 option :: name=substring
18455 name :: {a-z}
18456 substring :: defined by option. */
18458 static void
18459 aarch64_parse_override_string (const char* input_string,
18460 struct tune_params* tune)
18462 const char separator = ':';
18463 size_t string_length = strlen (input_string) + 1;
18464 char *string_root = (char *) xmalloc (sizeof (*string_root) * string_length);
18465 char *string = string_root;
18466 strncpy (string, input_string, string_length);
18467 string[string_length - 1] = '\0';
18469 char* ntoken = string;
18471 while ((ntoken = strchr (string, separator)))
18473 size_t token_length = ntoken - string;
18474       /* NUL-terminate this substring so it can be treated as a string.  */
18475 *ntoken = '\0';
18476 aarch64_parse_one_override_token (string, token_length, tune);
18477 string = ++ntoken;
18480 /* One last option to parse. */
18481 aarch64_parse_one_override_token (string, strlen (string), tune);
18482 free (string_root);
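/* Worked example (illustrative only): -moverride=sve_width=256:fuse=adrp+add
   is split on ':' into "sve_width=256" and "fuse=adrp+add", and each token
   is handed to aarch64_parse_one_override_token in turn.  */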
18485 /* Adjust CURRENT_TUNE (a generic tuning struct) with settings that
18486 are best for a generic target with the currently-enabled architecture
18487 extensions. */
18488 static void
18489 aarch64_adjust_generic_arch_tuning (struct tune_params &current_tune)
18491 /* Neoverse V1 is the only core that is known to benefit from
18492 AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS. There is therefore no
18493 point enabling it for SVE2 and above. */
18494 if (TARGET_SVE2)
18495 current_tune.extra_tuning_flags
18496 &= ~AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS;
18499 static void
18500 aarch64_override_options_after_change_1 (struct gcc_options *opts)
18502 /* PR 70044: We have to be careful about being called multiple times for the
18503 same function. This means all changes should be repeatable. */
18505 /* Set aarch64_use_frame_pointer based on -fno-omit-frame-pointer.
18506 Disable the frame pointer flag so the mid-end will not use a frame
18507 pointer in leaf functions in order to support -fomit-leaf-frame-pointer.
18508 Set x_flag_omit_frame_pointer to the special value 2 to differentiate
18509 between -fomit-frame-pointer (1) and -fno-omit-frame-pointer (2). */
18510 aarch64_use_frame_pointer = opts->x_flag_omit_frame_pointer != 1;
18511 if (opts->x_flag_omit_frame_pointer == 0)
18512 opts->x_flag_omit_frame_pointer = 2;
18514 /* If not optimizing for size, set the default
18515 alignment to what the target wants. */
18516 if (!opts->x_optimize_size)
18518 if (opts->x_flag_align_loops && !opts->x_str_align_loops)
18519 opts->x_str_align_loops = aarch64_tune_params.loop_align;
18520 if (opts->x_flag_align_jumps && !opts->x_str_align_jumps)
18521 opts->x_str_align_jumps = aarch64_tune_params.jump_align;
18522 if (opts->x_flag_align_functions && !opts->x_str_align_functions)
18523 opts->x_str_align_functions = aarch64_tune_params.function_align;
18526 /* We default to no pc-relative literal loads. */
18528 aarch64_pcrelative_literal_loads = false;
18530 /* If -mpc-relative-literal-loads is set on the command line, this
18531 implies that the user asked for PC relative literal loads. */
18532 if (opts->x_pcrelative_literal_loads == 1)
18533 aarch64_pcrelative_literal_loads = true;
18535 /* In the tiny memory model it makes no sense to disallow PC relative
18536 literal pool loads. */
18537 if (aarch64_cmodel == AARCH64_CMODEL_TINY
18538 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
18539 aarch64_pcrelative_literal_loads = true;
18541 /* When enabling the lower precision Newton series for the square root, also
18542 enable it for the reciprocal square root, since the latter is an
18543 intermediary step for the former. */
18544 if (flag_mlow_precision_sqrt)
18545 flag_mrecip_low_precision_sqrt = true;
18548 /* 'Unpack' the internal tuning structs and update the options
18549    in OPTS.  The caller must have set up selected_tune and selected_arch,
18550    as all the other target-specific codegen decisions are
18551    derived from them.  */
18553 void
18554 aarch64_override_options_internal (struct gcc_options *opts)
18556 const struct processor *tune = aarch64_get_tune_cpu (opts->x_selected_tune);
18557 aarch64_tune = tune->sched_core;
18558 /* Make a copy of the tuning parameters attached to the core, which
18559 we may later overwrite. */
18560 aarch64_tune_params = *(tune->tune);
18561 if (tune->tune == &generic_tunings)
18562 aarch64_adjust_generic_arch_tuning (aarch64_tune_params);
18564 if (opts->x_aarch64_override_tune_string)
18565 aarch64_parse_override_string (opts->x_aarch64_override_tune_string,
18566 &aarch64_tune_params);
18568 if (opts->x_aarch64_ldp_policy_param)
18569 aarch64_tune_params.ldp_policy_model = opts->x_aarch64_ldp_policy_param;
18571 if (opts->x_aarch64_stp_policy_param)
18572 aarch64_tune_params.stp_policy_model = opts->x_aarch64_stp_policy_param;
18574 /* This target defaults to strict volatile bitfields. */
18575 if (opts->x_flag_strict_volatile_bitfields < 0 && abi_version_at_least (2))
18576 opts->x_flag_strict_volatile_bitfields = 1;
18578 if (aarch64_stack_protector_guard == SSP_GLOBAL
18579 && opts->x_aarch64_stack_protector_guard_offset_str)
18581 error ("incompatible options %<-mstack-protector-guard=global%> and "
18582 "%<-mstack-protector-guard-offset=%s%>",
18583 aarch64_stack_protector_guard_offset_str);
18586 if (aarch64_stack_protector_guard == SSP_SYSREG
18587 && !(opts->x_aarch64_stack_protector_guard_offset_str
18588 && opts->x_aarch64_stack_protector_guard_reg_str))
18590 error ("both %<-mstack-protector-guard-offset%> and "
18591 "%<-mstack-protector-guard-reg%> must be used "
18592 "with %<-mstack-protector-guard=sysreg%>");
18595 if (opts->x_aarch64_stack_protector_guard_reg_str)
18597 if (strlen (opts->x_aarch64_stack_protector_guard_reg_str) > 100)
18598 error ("specify a system register with a small string length");
18601 if (opts->x_aarch64_stack_protector_guard_offset_str)
18603 char *end;
18604 const char *str = aarch64_stack_protector_guard_offset_str;
18605 errno = 0;
18606 long offs = strtol (aarch64_stack_protector_guard_offset_str, &end, 0);
18607 if (!*str || *end || errno)
18608 error ("%qs is not a valid offset in %qs", str,
18609 "-mstack-protector-guard-offset=");
18610 aarch64_stack_protector_guard_offset = offs;
18613 if ((flag_sanitize & SANITIZE_SHADOW_CALL_STACK)
18614 && !fixed_regs[R18_REGNUM])
18615 error ("%<-fsanitize=shadow-call-stack%> requires %<-ffixed-x18%>");
18617 aarch64_feature_flags isa_flags = aarch64_get_isa_flags (opts);
18618 if ((isa_flags & (AARCH64_FL_SM_ON | AARCH64_FL_ZA_ON))
18619 && !(isa_flags & AARCH64_FL_SME))
18621 if (isa_flags & AARCH64_FL_SM_ON)
18622 error ("streaming functions require the ISA extension %qs", "sme");
18623 else
18624 error ("functions with SME state require the ISA extension %qs",
18625 "sme");
18626 inform (input_location, "you can enable %qs using the command-line"
18627 " option %<-march%>, or by using the %<target%>"
18628 " attribute or pragma", "sme");
18629 opts->x_target_flags &= ~MASK_GENERAL_REGS_ONLY;
18630 auto new_flags = isa_flags | feature_deps::SME ().enable;
18631 aarch64_set_asm_isa_flags (opts, new_flags);
18634 initialize_aarch64_code_model (opts);
18635 initialize_aarch64_tls_size (opts);
18636 aarch64_tpidr_register = opts->x_aarch64_tpidr_reg;
18638 int queue_depth = 0;
18639 switch (aarch64_tune_params.autoprefetcher_model)
18641 case tune_params::AUTOPREFETCHER_OFF:
18642 queue_depth = -1;
18643 break;
18644 case tune_params::AUTOPREFETCHER_WEAK:
18645 queue_depth = 0;
18646 break;
18647 case tune_params::AUTOPREFETCHER_STRONG:
18648 queue_depth = max_insn_queue_index + 1;
18649 break;
18650 default:
18651 gcc_unreachable ();
18654 /* We don't mind passing in global_options_set here as we don't use
18655 the *options_set structs anyway. */
18656 SET_OPTION_IF_UNSET (opts, &global_options_set,
18657 param_sched_autopref_queue_depth, queue_depth);
18659 /* Set up parameters to be used in prefetching algorithm. Do not
18660 override the defaults unless we are tuning for a core we have
18661 researched values for. */
18662 if (aarch64_tune_params.prefetch->num_slots > 0)
18663 SET_OPTION_IF_UNSET (opts, &global_options_set,
18664 param_simultaneous_prefetches,
18665 aarch64_tune_params.prefetch->num_slots);
18666 if (aarch64_tune_params.prefetch->l1_cache_size >= 0)
18667 SET_OPTION_IF_UNSET (opts, &global_options_set,
18668 param_l1_cache_size,
18669 aarch64_tune_params.prefetch->l1_cache_size);
18670 if (aarch64_tune_params.prefetch->l1_cache_line_size >= 0)
18671 SET_OPTION_IF_UNSET (opts, &global_options_set,
18672 param_l1_cache_line_size,
18673 aarch64_tune_params.prefetch->l1_cache_line_size);
18675 if (aarch64_tune_params.prefetch->l1_cache_line_size >= 0)
18677 SET_OPTION_IF_UNSET (opts, &global_options_set,
18678 param_destruct_interfere_size,
18679 aarch64_tune_params.prefetch->l1_cache_line_size);
18680 SET_OPTION_IF_UNSET (opts, &global_options_set,
18681 param_construct_interfere_size,
18682 aarch64_tune_params.prefetch->l1_cache_line_size);
18684 else
18686 /* For a generic AArch64 target, cover the current range of cache line
18687 sizes. */
18688 SET_OPTION_IF_UNSET (opts, &global_options_set,
18689 param_destruct_interfere_size,
18690 256);
18691 SET_OPTION_IF_UNSET (opts, &global_options_set,
18692 param_construct_interfere_size,
18693 64);
18696 if (aarch64_tune_params.prefetch->l2_cache_size >= 0)
18697 SET_OPTION_IF_UNSET (opts, &global_options_set,
18698 param_l2_cache_size,
18699 aarch64_tune_params.prefetch->l2_cache_size);
18700 if (!aarch64_tune_params.prefetch->prefetch_dynamic_strides)
18701 SET_OPTION_IF_UNSET (opts, &global_options_set,
18702 param_prefetch_dynamic_strides, 0);
18703 if (aarch64_tune_params.prefetch->minimum_stride >= 0)
18704 SET_OPTION_IF_UNSET (opts, &global_options_set,
18705 param_prefetch_minimum_stride,
18706 aarch64_tune_params.prefetch->minimum_stride);
18708 /* Use the alternative scheduling-pressure algorithm by default. */
18709 SET_OPTION_IF_UNSET (opts, &global_options_set,
18710 param_sched_pressure_algorithm,
18711 SCHED_PRESSURE_MODEL);
18713 /* Validate the guard size. */
18714 int guard_size = param_stack_clash_protection_guard_size;
18716 if (guard_size != 12 && guard_size != 16)
18717 error ("only values 12 (4 KB) and 16 (64 KB) are supported for guard "
18718 "size. Given value %d (%llu KB) is out of range",
18719 guard_size, (1ULL << guard_size) / 1024ULL);
18721   /* Enforce that the probing interval is the same as the guard size so
18722      the mid-end does the right thing.  */
18723 SET_OPTION_IF_UNSET (opts, &global_options_set,
18724 param_stack_clash_protection_probe_interval,
18725 guard_size);
18727   /* The maybe_set calls won't update the value if the user has explicitly
18728      set one, which means we need to validate that the probing interval and
18729      the guard size are equal.  */
18730 int probe_interval
18731 = param_stack_clash_protection_probe_interval;
18732 if (guard_size != probe_interval)
18733 error ("stack clash guard size %<%d%> must be equal to probing interval "
18734 "%<%d%>", guard_size, probe_interval);
18736   /* Enable software prefetching at the specified optimization level for
18737      CPUs that have prefetch tuning data.  Lower the optimization level
18738      threshold by 1 when profiling is enabled.  */
18739 if (opts->x_flag_prefetch_loop_arrays < 0
18740 && !opts->x_optimize_size
18741 && aarch64_tune_params.prefetch->default_opt_level >= 0
18742 && opts->x_optimize >= aarch64_tune_params.prefetch->default_opt_level)
18743 opts->x_flag_prefetch_loop_arrays = 1;
18745   /* Avoid loop-dependent FMA chains.  */
18746 if (aarch64_tune_params.extra_tuning_flags
18747 & AARCH64_EXTRA_TUNE_AVOID_CROSS_LOOP_FMA)
18748 SET_OPTION_IF_UNSET (opts, &global_options_set, param_avoid_fma_max_bits,
18749 512);
18751 /* Consider fully pipelined FMA in reassociation. */
18752 if (aarch64_tune_params.extra_tuning_flags
18753 & AARCH64_EXTRA_TUNE_FULLY_PIPELINED_FMA)
18754     SET_OPTION_IF_UNSET (opts, &global_options_set, param_fully_pipelined_fma, 1);
18757 aarch64_override_options_after_change_1 (opts);
18760 /* Straight line speculation indicators. */
18761 enum aarch64_sls_hardening_type
18763 SLS_NONE = 0,
18764 SLS_RETBR = 1,
18765 SLS_BLR = 2,
18766 SLS_ALL = 3,
18768 static enum aarch64_sls_hardening_type aarch64_sls_hardening;
18770 /* Return whether we should mitigate Straight Line Speculation for the RET
18771    and BR instructions.  */
18772 bool
18773 aarch64_harden_sls_retbr_p (void)
18775 return aarch64_sls_hardening & SLS_RETBR;
18778 /* Return whether we should mitigate Straight Line Speculation for the BLR
18779    instruction.  */
18780 bool
18781 aarch64_harden_sls_blr_p (void)
18783 return aarch64_sls_hardening & SLS_BLR;
18786 /* For now we only allow setting these options globally; in the future we may
18787    allow setting them per function.  */
18788 static void
18789 aarch64_validate_sls_mitigation (const char *const_str)
18791 char *token_save = NULL;
18792 char *str = NULL;
18794 if (strcmp (const_str, "none") == 0)
18796 aarch64_sls_hardening = SLS_NONE;
18797 return;
18799 if (strcmp (const_str, "all") == 0)
18801 aarch64_sls_hardening = SLS_ALL;
18802 return;
18805 char *str_root = xstrdup (const_str);
18806 str = strtok_r (str_root, ",", &token_save);
18807 if (!str)
18808 error ("invalid argument given to %<-mharden-sls=%>");
18810 int temp = SLS_NONE;
18811 while (str)
18813 if (strcmp (str, "blr") == 0)
18814 temp |= SLS_BLR;
18815 else if (strcmp (str, "retbr") == 0)
18816 temp |= SLS_RETBR;
18817 else if (strcmp (str, "none") == 0 || strcmp (str, "all") == 0)
18819 error ("%qs must be by itself for %<-mharden-sls=%>", str);
18820 break;
18822 else
18824 error ("invalid argument %qs for %<-mharden-sls=%>", str);
18825 break;
18827 str = strtok_r (NULL, ",", &token_save);
18829 aarch64_sls_hardening = (aarch64_sls_hardening_type) temp;
18830 free (str_root);
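/* Worked example (illustrative only): -mharden-sls=retbr,blr sets
   aarch64_sls_hardening to SLS_RETBR | SLS_BLR == SLS_ALL, so both
   aarch64_harden_sls_retbr_p and aarch64_harden_sls_blr_p return true,
   while -mharden-sls=retbr enables only the RET/BR mitigation.  */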
18833 /* Return the VG value associated with -msve-vector-bits= value VALUE. */
18835 static poly_uint16
18836 aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits_enum value)
18838 /* 128-bit SVE and Advanced SIMD modes use different register layouts
18839 on big-endian targets, so we would need to forbid subregs that convert
18840 from one to the other. By default a reinterpret sequence would then
18841 involve a store to memory in one mode and a load back in the other.
18842 Even if we optimize that sequence using reverse instructions,
18843 it would still be a significant potential overhead.
18845 For now, it seems better to generate length-agnostic code for that
18846 case instead. */
18847 if (value == SVE_SCALABLE
18848 || (value == SVE_128 && BYTES_BIG_ENDIAN))
18849 return poly_uint16 (2, 2);
18850 else
18851 return (int) value / 64;
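/* Worked example (illustrative only): -msve-vector-bits=256 gives
   256 / 64 == 4 64-bit granules, i.e. a constant VG of 4, whereas
   -msve-vector-bits=scalable (and 128 on big-endian) yields the
   length-agnostic poly_uint16 (2, 2).  */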
18854 /* Set the global aarch64_asm_isa_flags to FLAGS and update
18855 aarch64_isa_flags accordingly. */
18857 void
18858 aarch64_set_asm_isa_flags (aarch64_feature_flags flags)
18860 aarch64_set_asm_isa_flags (&global_options, flags);
18863 static void
18864 aarch64_handle_no_branch_protection (void)
18866 aarch_ra_sign_scope = AARCH_FUNCTION_NONE;
18867 aarch_enable_bti = 0;
18868 aarch64_enable_gcs = 0;
18871 static void
18872 aarch64_handle_standard_branch_protection (void)
18874 aarch_ra_sign_scope = AARCH_FUNCTION_NON_LEAF;
18875 aarch64_ra_sign_key = AARCH64_KEY_A;
18876 aarch_enable_bti = 1;
18877 aarch64_enable_gcs = 1;
18880 static void
18881 aarch64_handle_pac_ret_protection (void)
18883 aarch_ra_sign_scope = AARCH_FUNCTION_NON_LEAF;
18884 aarch64_ra_sign_key = AARCH64_KEY_A;
18887 static void
18888 aarch64_handle_pac_ret_leaf (void)
18890 aarch_ra_sign_scope = AARCH_FUNCTION_ALL;
18893 static void
18894 aarch64_handle_pac_ret_b_key (void)
18896 aarch64_ra_sign_key = AARCH64_KEY_B;
18899 static void
18900 aarch64_handle_bti_protection (void)
18902 aarch_enable_bti = 1;
18904 static void
18905 aarch64_handle_gcs_protection (void)
18907 aarch64_enable_gcs = 1;
18910 static const struct aarch_branch_protect_type aarch64_pac_ret_subtypes[] = {
18911 { "leaf", false, aarch64_handle_pac_ret_leaf, NULL, 0 },
18912 { "b-key", false, aarch64_handle_pac_ret_b_key, NULL, 0 },
18913 { NULL, false, NULL, NULL, 0 }
18916 static const struct aarch_branch_protect_type aarch64_branch_protect_types[] =
18918 { "none", true, aarch64_handle_no_branch_protection, NULL, 0 },
18919 { "standard", true, aarch64_handle_standard_branch_protection, NULL, 0 },
18920 { "pac-ret", false, aarch64_handle_pac_ret_protection,
18921 aarch64_pac_ret_subtypes, ARRAY_SIZE (aarch64_pac_ret_subtypes) },
18922 { "bti", false, aarch64_handle_bti_protection, NULL, 0 },
18923 { "gcs", false, aarch64_handle_gcs_protection, NULL, 0 },
18924 { NULL, false, NULL, NULL, 0 }
18927 /* Implement TARGET_OPTION_OVERRIDE. This is called once in the beginning
18928 and is used to parse the -m{cpu,tune,arch} strings and setup the initial
18929 tuning structs. In particular it must set selected_tune and
18930 aarch64_asm_isa_flags that define the available ISA features and tuning
18931 decisions. It must also set selected_arch as this will be used to
18932 output the .arch asm tags for each function. */
18934 static void
18935 aarch64_override_options (void)
18937 aarch64_feature_flags cpu_isa = 0;
18938 aarch64_feature_flags arch_isa = 0;
18939 aarch64_set_asm_isa_flags (0);
18941 aarch64_cpu cpu = aarch64_no_cpu;
18942 aarch64_arch arch = aarch64_no_arch;
18943 aarch64_cpu tune = aarch64_no_cpu;
18945 if (aarch64_harden_sls_string)
18946 aarch64_validate_sls_mitigation (aarch64_harden_sls_string);
18948 if (aarch64_branch_protection_string)
18949 aarch_validate_mbranch_protection (aarch64_branch_protect_types,
18950 aarch64_branch_protection_string,
18951 "-mbranch-protection=");
18953 /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
18954 If either of -march or -mtune is given, they override their
18955 respective component of -mcpu. */
18956 if (aarch64_cpu_string)
18957 aarch64_validate_mcpu (aarch64_cpu_string, &cpu, &cpu_isa);
18959 if (aarch64_arch_string)
18960 aarch64_validate_march (aarch64_arch_string, &arch, &arch_isa);
18962 if (aarch64_tune_string)
18963 aarch64_validate_mtune (aarch64_tune_string, &tune);
18965 #ifdef SUBTARGET_OVERRIDE_OPTIONS
18966 SUBTARGET_OVERRIDE_OPTIONS;
18967 #endif
18969 if (cpu != aarch64_no_cpu && arch != aarch64_no_arch)
18971       /* If both -mcpu and -march are specified, warn if they are not
18972	 feature compatible.  Feature compatible means that selecting the
18973	 CPU features would not end up disabling an architecture feature.
18974	 In other words, the CPU features need to be a strict superset of
18975	 the arch features; where they differ, the -march ISA flags are preferred.  */
18976 if (~cpu_isa & arch_isa)
18978 std::string ext_diff
18979 = aarch64_get_extension_string_for_isa_flags (arch_isa, cpu_isa);
18980 warning (0, "switch %<-mcpu=%s%> conflicts with %<-march=%s%> switch "
18981 "and resulted in options %qs being added",
18982 aarch64_cpu_string,
18983 aarch64_arch_string,
18984 ext_diff.c_str ());
18987 selected_arch = arch;
18988 aarch64_set_asm_isa_flags (arch_isa | AARCH64_FL_DEFAULT_ISA_MODE);
18990 else if (cpu != aarch64_no_cpu)
18992 selected_arch = aarch64_get_tune_cpu (cpu)->arch;
18993 aarch64_set_asm_isa_flags (cpu_isa | AARCH64_FL_DEFAULT_ISA_MODE);
18995 else if (arch != aarch64_no_arch)
18997 cpu = aarch64_get_arch (arch)->ident;
18998 selected_arch = arch;
18999 aarch64_set_asm_isa_flags (arch_isa | AARCH64_FL_DEFAULT_ISA_MODE);
19001 else
19003 /* No -mcpu or -march specified, so use the default CPU. */
19004 cpu = TARGET_CPU_DEFAULT;
19005 const processor *cpu_info = aarch64_get_tune_cpu (cpu);
19006 selected_arch = cpu_info->arch;
19007 aarch64_set_asm_isa_flags (cpu_info->flags
19008 | AARCH64_FL_DEFAULT_ISA_MODE);
19011 selected_tune = (tune != aarch64_no_cpu) ? tune : cpu;
19013 if (aarch_enable_bti == 2)
19015 #ifdef TARGET_ENABLE_BTI
19016 aarch_enable_bti = 1;
19017 #else
19018 aarch_enable_bti = 0;
19019 #endif
19022 if (aarch64_enable_gcs == 2)
19024 #ifdef TARGET_ENABLE_GCS
19025 aarch64_enable_gcs = 1;
19026 #else
19027 aarch64_enable_gcs = 0;
19028 #endif
19031 /* Return address signing is currently not supported for ILP32 targets. For
19032 LP64 targets use the configured option in the absence of a command-line
19033 option for -mbranch-protection. */
19034 if (!TARGET_ILP32 && aarch64_branch_protection_string == NULL)
19036 #ifdef TARGET_ENABLE_PAC_RET
19037 aarch_ra_sign_scope = AARCH_FUNCTION_NON_LEAF;
19038 #else
19039 aarch_ra_sign_scope = AARCH_FUNCTION_NONE;
19040 #endif
19043 #ifndef HAVE_AS_MABI_OPTION
19044 /* The compiler may have been configured with 2.23.* binutils, which does
19045 not have support for ILP32. */
19046 if (TARGET_ILP32)
19047 error ("assembler does not support %<-mabi=ilp32%>");
19048 #endif
19049 if (TARGET_ILP32)
19050 warning (OPT_Wdeprecated, "%<-mabi=ilp32%> is deprecated");
19052 /* Convert -msve-vector-bits to a VG count. */
19053 aarch64_sve_vg = aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits);
19055 if (aarch_ra_sign_scope != AARCH_FUNCTION_NONE && TARGET_ILP32)
19056 sorry ("return address signing is only supported for %<-mabi=lp64%>");
19058 /* The pass to insert speculation tracking runs before
19059 shrink-wrapping and the latter does not know how to update the
19060 tracking status. So disable it in this case. */
19061 if (aarch64_track_speculation)
19062 flag_shrink_wrap = 0;
19064 aarch64_override_options_internal (&global_options);
19066 /* Save these options as the default ones in case we push and pop them later
19067 while processing functions with potential target attributes. */
19068 target_option_default_node = target_option_current_node
19069 = build_target_option_node (&global_options, &global_options_set);
19072 /* Implement targetm.override_options_after_change. */
19074 static void
19075 aarch64_override_options_after_change (void)
19077 aarch64_override_options_after_change_1 (&global_options);
19080 /* Implement the TARGET_OFFLOAD_OPTIONS hook. */
19081 static char *
19082 aarch64_offload_options (void)
19084 if (TARGET_ILP32)
19085 return xstrdup ("-foffload-abi=ilp32 -foffload-abi-host-opts=-mabi=ilp32");
19086 else
19087 return xstrdup ("-foffload-abi=lp64 -foffload-abi-host-opts=-mabi=lp64");
19090 static struct machine_function *
19091 aarch64_init_machine_status (void)
19093 struct machine_function *machine;
19094 machine = ggc_cleared_alloc<machine_function> ();
19095 return machine;
19098 void
19099 aarch64_init_expanders (void)
19101 init_machine_status = aarch64_init_machine_status;
19104 /* Derive the final code model from OPTS, adjusting for PIC where needed.  */
19105 static void
19106 initialize_aarch64_code_model (struct gcc_options *opts)
19108 aarch64_cmodel = opts->x_aarch64_cmodel_var;
19109 switch (opts->x_aarch64_cmodel_var)
19111 case AARCH64_CMODEL_TINY:
19112 if (opts->x_flag_pic)
19113 aarch64_cmodel = AARCH64_CMODEL_TINY_PIC;
19114 break;
19115 case AARCH64_CMODEL_SMALL:
19116 if (opts->x_flag_pic)
19118 #ifdef HAVE_AS_SMALL_PIC_RELOCS
19119 aarch64_cmodel = (flag_pic == 2
19120 ? AARCH64_CMODEL_SMALL_PIC
19121 : AARCH64_CMODEL_SMALL_SPIC);
19122 #else
19123 aarch64_cmodel = AARCH64_CMODEL_SMALL_PIC;
19124 #endif
19126 break;
19127 case AARCH64_CMODEL_LARGE:
19128 if (opts->x_flag_pic)
19129 sorry ("code model %qs with %<-f%s%>", "large",
19130 opts->x_flag_pic > 1 ? "PIC" : "pic");
19131 if (opts->x_aarch64_abi == AARCH64_ABI_ILP32)
19132 sorry ("code model %qs not supported in ilp32 mode", "large");
19133 break;
19134 case AARCH64_CMODEL_TINY_PIC:
19135 case AARCH64_CMODEL_SMALL_PIC:
19136 case AARCH64_CMODEL_SMALL_SPIC:
19137 gcc_unreachable ();
19141 /* Implements TARGET_OPTION_RESTORE. Restore the backend codegen decisions
19142 using the information saved in PTR. */
19144 static void
19145 aarch64_option_restore (struct gcc_options *opts,
19146 struct gcc_options * /* opts_set */,
19147 struct cl_target_option * /* ptr */)
19149 aarch64_override_options_internal (opts);
19152 /* Implement TARGET_OPTION_PRINT. */
19154 static void
19155 aarch64_option_print (FILE *file, int indent, struct cl_target_option *ptr)
19157 const struct processor *cpu
19158 = aarch64_get_tune_cpu (ptr->x_selected_tune);
19159 const struct processor *arch = aarch64_get_arch (ptr->x_selected_arch);
19160 aarch64_feature_flags isa_flags = aarch64_get_asm_isa_flags(ptr);
19161 std::string extension
19162 = aarch64_get_extension_string_for_isa_flags (isa_flags, arch->flags);
19164 fprintf (file, "%*sselected tune = %s\n", indent, "", cpu->name);
19165 fprintf (file, "%*sselected arch = %s%s\n", indent, "",
19166 arch->name, extension.c_str ());
19169 static GTY(()) tree aarch64_previous_fndecl;
19171 void
19172 aarch64_reset_previous_fndecl (void)
19174 aarch64_previous_fndecl = NULL;
19177 /* Restore or save the TREE_TARGET_GLOBALS from or to NEW_TREE.
19178 Used by aarch64_set_current_function and aarch64_pragma_target_parse to
19179 make sure optab availability predicates are recomputed when necessary. */
19181 void
19182 aarch64_save_restore_target_globals (tree new_tree)
19184 if (TREE_TARGET_GLOBALS (new_tree))
19185 restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
19186 else if (new_tree == target_option_default_node)
19187 restore_target_globals (&default_target_globals);
19188 else
19189 TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
19192 /* Return the target_option_node for FNDECL, or the current options
19193 if FNDECL is null. */
19195 static tree
19196 aarch64_fndecl_options (tree fndecl)
19198 if (!fndecl)
19199 return target_option_current_node;
19201 if (tree options = DECL_FUNCTION_SPECIFIC_TARGET (fndecl))
19202 return options;
19204 return target_option_default_node;
19207 /* Implement TARGET_SET_CURRENT_FUNCTION. Unpack the codegen decisions
19208 like tuning and ISA features from the DECL_FUNCTION_SPECIFIC_TARGET
19209 of the function, if such exists. This function may be called multiple
19210 times on a single function so use aarch64_previous_fndecl to avoid
19211 setting up identical state. */
19213 static void
19214 aarch64_set_current_function (tree fndecl)
19216 tree old_tree = aarch64_fndecl_options (aarch64_previous_fndecl);
19217 tree new_tree = aarch64_fndecl_options (fndecl);
19219 auto new_isa_mode = (fndecl
19220 ? aarch64_fndecl_isa_mode (fndecl)
19221 : AARCH64_DEFAULT_ISA_MODE);
19222 auto isa_flags = aarch64_get_isa_flags (TREE_TARGET_OPTION (new_tree));
19224 static bool reported_zt0_p;
19225 if (!reported_zt0_p
19226 && !(isa_flags & AARCH64_FL_SME2)
19227 && fndecl
19228 && aarch64_fndecl_has_state (fndecl, "zt0"))
19230 error ("functions with %qs state require the ISA extension %qs",
19231 "zt0", "sme2");
19232 inform (input_location, "you can enable %qs using the command-line"
19233 " option %<-march%>, or by using the %<target%>"
19234 " attribute or pragma", "sme2");
19235 reported_zt0_p = true;
19238 /* If nothing to do, return. #pragma GCC reset or #pragma GCC pop to
19239 the default have been handled by aarch64_save_restore_target_globals from
19240 aarch64_pragma_target_parse. */
19241 if (old_tree == new_tree
19242 && (!fndecl || aarch64_previous_fndecl)
19243 && (isa_flags & AARCH64_FL_ISA_MODES).val[0] == new_isa_mode)
19245 gcc_assert (AARCH64_ISA_MODE == new_isa_mode);
19246 return;
19249 aarch64_previous_fndecl = fndecl;
19251 /* First set the target options. */
19252 cl_target_option_restore (&global_options, &global_options_set,
19253 TREE_TARGET_OPTION (new_tree));
19255 /* The ISA mode can vary based on function type attributes and
19256 function declaration attributes. Make sure that the target
19257 options correctly reflect these attributes. */
19258 if ((isa_flags & AARCH64_FL_ISA_MODES).val[0] != new_isa_mode)
19260 auto base_flags = (aarch64_asm_isa_flags & ~AARCH64_FL_ISA_MODES);
19261 aarch64_set_asm_isa_flags (base_flags
19262 | aarch64_feature_flags (new_isa_mode));
19264 aarch64_override_options_internal (&global_options);
19265 new_tree = build_target_option_node (&global_options,
19266 &global_options_set);
19267 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_tree;
19269 tree new_optimize = build_optimization_node (&global_options,
19270 &global_options_set);
19271 if (new_optimize != optimization_default_node)
19272 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
19275 aarch64_save_restore_target_globals (new_tree);
19277 gcc_assert (AARCH64_ISA_MODE == new_isa_mode);
19280 /* Enum describing the various ways we can handle attributes.
19281 In many cases we can reuse the generic option handling machinery. */
19283 enum aarch64_attr_opt_type
19285 aarch64_attr_mask, /* Attribute should set a bit in target_flags. */
19286 aarch64_attr_bool, /* Attribute sets or unsets a boolean variable. */
19287 aarch64_attr_enum, /* Attribute sets an enum variable. */
19288 aarch64_attr_custom /* Attribute requires a custom handling function. */
19291 /* All the information needed to handle a target attribute.
19292 NAME is the name of the attribute.
19293 ATTR_TYPE specifies the type of behavior of the attribute as described
19294 in the definition of enum aarch64_attr_opt_type.
19295 ALLOW_NEG is true if the attribute supports a "no-" form.
19296    HANDLER is the function that takes the attribute string as an argument.
19297 It is needed only when the ATTR_TYPE is aarch64_attr_custom.
19298 OPT_NUM is the enum specifying the option that the attribute modifies.
19299 This is needed for attributes that mirror the behavior of a command-line
19300    option, that is, one that has ATTR_TYPE aarch64_attr_mask, aarch64_attr_bool or
19301 aarch64_attr_enum. */
19303 struct aarch64_attribute_info
19305 const char *name;
19306 enum aarch64_attr_opt_type attr_type;
19307 bool allow_neg;
19308 bool (*handler) (const char *);
19309 enum opt_code opt_num;
19312 /* Handle the ARCH_STR argument to the arch= target attribute. */
19314 static bool
19315 aarch64_handle_attr_arch (const char *str)
19317 aarch64_arch tmp_arch = aarch64_no_arch;
19318 std::string invalid_extension;
19319 aarch64_feature_flags tmp_flags;
19320 enum aarch_parse_opt_result parse_res
19321 = aarch64_parse_arch (str, &tmp_arch, &tmp_flags, &invalid_extension);
19323 if (parse_res == AARCH_PARSE_OK)
19325 gcc_assert (tmp_arch != aarch64_no_arch);
19326 selected_arch = tmp_arch;
19327 aarch64_set_asm_isa_flags (tmp_flags | (aarch64_asm_isa_flags
19328 & AARCH64_FL_ISA_MODES));
19329 return true;
19332 switch (parse_res)
19334 case AARCH_PARSE_MISSING_ARG:
19335 error ("missing name in %<target(\"arch=\")%> pragma or attribute");
19336 break;
19337 case AARCH_PARSE_INVALID_ARG:
19338 error ("invalid name %qs in %<target(\"arch=\")%> pragma or attribute", str);
19339 aarch64_print_hint_for_arch (str);
19340 break;
19341 case AARCH_PARSE_INVALID_FEATURE:
19342 error ("invalid feature modifier %s of value %qs in "
19343 "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
19344 aarch64_print_hint_for_extensions (invalid_extension.c_str ());
19345 break;
19346 default:
19347 gcc_unreachable ();
19350 return false;
19353 /* Handle the argument CPU_STR to the cpu= target attribute. */
19355 static bool
19356 aarch64_handle_attr_cpu (const char *str)
19358 aarch64_cpu tmp_cpu = aarch64_no_cpu;
19359 std::string invalid_extension;
19360 aarch64_feature_flags tmp_flags;
19361 enum aarch_parse_opt_result parse_res
19362 = aarch64_parse_cpu (str, &tmp_cpu, &tmp_flags, &invalid_extension);
19364 if (parse_res == AARCH_PARSE_OK)
19366 gcc_assert (tmp_cpu != aarch64_no_cpu);
19367 selected_tune = tmp_cpu;
19368 selected_arch = aarch64_get_tune_cpu (tmp_cpu)->arch;
19369 aarch64_set_asm_isa_flags (tmp_flags | (aarch64_asm_isa_flags
19370 & AARCH64_FL_ISA_MODES));
19371 return true;
19374 switch (parse_res)
19376 case AARCH_PARSE_MISSING_ARG:
19377 error ("missing name in %<target(\"cpu=\")%> pragma or attribute");
19378 break;
19379 case AARCH_PARSE_INVALID_ARG:
19380 error ("invalid name %qs in %<target(\"cpu=\")%> pragma or attribute", str);
19381 aarch64_print_hint_for_core (str);
19382 break;
19383 case AARCH_PARSE_INVALID_FEATURE:
19384 error ("invalid feature modifier %qs of value %qs in "
19385 "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
19386 aarch64_print_hint_for_extensions (invalid_extension.c_str ());
19387 break;
19388 default:
19389 gcc_unreachable ();
19392 return false;
19395 /* Handle the argument STR to the branch-protection= attribute. */
19397 static bool
19398 aarch64_handle_attr_branch_protection (const char* str)
19400 return aarch_validate_mbranch_protection (aarch64_branch_protect_types, str,
19401 "target(\"branch-protection=\")");
19404 /* Handle the argument STR to the tune= target attribute. */
19406 static bool
19407 aarch64_handle_attr_tune (const char *str)
19409 aarch64_cpu tmp_tune = aarch64_no_cpu;
19410 enum aarch_parse_opt_result parse_res
19411 = aarch64_parse_tune (str, &tmp_tune);
19413 if (parse_res == AARCH_PARSE_OK)
19415 gcc_assert (tmp_tune != aarch64_no_cpu);
19416 selected_tune = tmp_tune;
19417 return true;
19420 switch (parse_res)
19422 case AARCH_PARSE_INVALID_ARG:
19423 error ("invalid name %qs in %<target(\"tune=\")%> pragma or attribute", str);
19424 aarch64_print_hint_for_core (str);
19425 break;
19426 default:
19427 gcc_unreachable ();
19430 return false;
19433 /* Parse an architecture extensions target attribute string specified in STR.
19434 For example "+fp+nosimd". Show any errors if needed. Return TRUE
19435 if successful. Update aarch64_isa_flags to reflect the ISA features
19436 modified. */
19438 static bool
19439 aarch64_handle_attr_isa_flags (char *str)
19441 enum aarch_parse_opt_result parse_res;
19442 auto isa_flags = aarch64_asm_isa_flags;
19444   /* We allow "+nothing" at the beginning to clear out all architectural
19445 features if the user wants to handpick specific features. */
19446 if (strncmp ("+nothing", str, 8) == 0)
19448 isa_flags &= AARCH64_FL_ISA_MODES;
19449 str += 8;
19452 std::string invalid_extension;
19453 parse_res = aarch64_parse_extension (str, &isa_flags, &invalid_extension);
19455 if (parse_res == AARCH_PARSE_OK)
19457 aarch64_set_asm_isa_flags (isa_flags);
19458 return true;
19461 switch (parse_res)
19463 case AARCH_PARSE_MISSING_ARG:
19464 error ("missing value in %<target()%> pragma or attribute");
19465 break;
19467 case AARCH_PARSE_INVALID_FEATURE:
19468 error ("invalid feature modifier %qs of value %qs in "
19469 "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
19470 break;
19472 default:
19473 gcc_unreachable ();
19476 return false;
19479 /* The target attributes that we support. On top of these we also support just
19480 ISA extensions, like __attribute__ ((target ("+crc"))), but that case is
19481 handled explicitly in aarch64_process_one_target_attr. */
19483 static const struct aarch64_attribute_info aarch64_attributes[] =
19485 { "general-regs-only", aarch64_attr_mask, false, NULL,
19486 OPT_mgeneral_regs_only },
19487 { "fix-cortex-a53-835769", aarch64_attr_bool, true, NULL,
19488 OPT_mfix_cortex_a53_835769 },
19489 { "fix-cortex-a53-843419", aarch64_attr_bool, true, NULL,
19490 OPT_mfix_cortex_a53_843419 },
19491 { "cmodel", aarch64_attr_enum, false, NULL, OPT_mcmodel_ },
19492 { "strict-align", aarch64_attr_mask, true, NULL, OPT_mstrict_align },
19493 { "omit-leaf-frame-pointer", aarch64_attr_bool, true, NULL,
19494 OPT_momit_leaf_frame_pointer },
19495 { "tls-dialect", aarch64_attr_enum, false, NULL, OPT_mtls_dialect_ },
19496 { "arch", aarch64_attr_custom, false, aarch64_handle_attr_arch,
19497 OPT_march_ },
19498 { "cpu", aarch64_attr_custom, false, aarch64_handle_attr_cpu, OPT_mcpu_ },
19499 { "tune", aarch64_attr_custom, false, aarch64_handle_attr_tune,
19500 OPT_mtune_ },
19501 { "branch-protection", aarch64_attr_custom, false,
19502 aarch64_handle_attr_branch_protection, OPT_mbranch_protection_ },
19503 { "sign-return-address", aarch64_attr_enum, false, NULL,
19504 OPT_msign_return_address_ },
19505 { "outline-atomics", aarch64_attr_bool, true, NULL,
19506 OPT_moutline_atomics},
19507 { NULL, aarch64_attr_custom, false, NULL, OPT____ }
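/* For illustration, a declaration such as

     __attribute__ ((target ("tune=cortex-a57,no-omit-leaf-frame-pointer")))
     int foo (int);

   exercises this table: the string is split on ',', "tune=cortex-a57" is
   routed to aarch64_handle_attr_tune, and the "no-" prefix on the boolean
   omit-leaf-frame-pointer entry is accepted via its allow_neg flag.  The
   CPU name here is only an example.  */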
19510 /* Parse ARG_STR which contains the definition of one target attribute.
19511 Show appropriate errors if any or return true if the attribute is valid. */
19513 static bool
19514 aarch64_process_one_target_attr (char *arg_str)
19516 bool invert = false;
19518 size_t len = strlen (arg_str);
19520 if (len == 0)
19522 error ("malformed %<target()%> pragma or attribute");
19523 return false;
19526 auto_vec<char, 32> buffer;
19527 buffer.safe_grow (len + 1);
19528 char *str_to_check = buffer.address ();
19529 memcpy (str_to_check, arg_str, len + 1);
19531 /* We have something like __attribute__ ((target ("+fp+nosimd"))).
19532 It is easier to detect and handle it explicitly here rather than going
19533 through the machinery for the rest of the target attributes in this
19534 function. */
19535 if (*str_to_check == '+')
19536 return aarch64_handle_attr_isa_flags (str_to_check);
19538 if (len > 3 && startswith (str_to_check, "no-"))
19540 invert = true;
19541 str_to_check += 3;
19543 char *arg = strchr (str_to_check, '=');
19545 /* If we found opt=foo then terminate STR_TO_CHECK at the '='
19546 and point ARG to "foo". */
19547 if (arg)
19549 *arg = '\0';
19550 arg++;
19552 const struct aarch64_attribute_info *p_attr;
19553 bool found = false;
19554 for (p_attr = aarch64_attributes; p_attr->name; p_attr++)
19556 /* If the names don't match up, or the user has given an argument
19557 to an attribute that doesn't accept one, or didn't give an argument
19558 to an attribute that expects one, fail to match. */
19559 if (strcmp (str_to_check, p_attr->name) != 0)
19560 continue;
19562 found = true;
19563 bool attr_need_arg_p = p_attr->attr_type == aarch64_attr_custom
19564 || p_attr->attr_type == aarch64_attr_enum;
19566 if (attr_need_arg_p ^ (arg != NULL))
19568 error ("pragma or attribute %<target(\"%s\")%> does not accept an argument", str_to_check);
19569 return false;
19572 /* If the name matches but the attribute does not allow "no-" versions
19573 then we can't match. */
19574 if (invert && !p_attr->allow_neg)
19576 error ("pragma or attribute %<target(\"%s\")%> does not allow a negated form", str_to_check);
19577 return false;
19580 switch (p_attr->attr_type)
19582 /* Has a custom handler registered.
19583 For example, cpu=, arch=, tune=. */
19584 case aarch64_attr_custom:
19585 gcc_assert (p_attr->handler);
19586 if (!p_attr->handler (arg))
19587 return false;
19588 break;
19590 /* Either set or unset a boolean option. */
19591 case aarch64_attr_bool:
19593 struct cl_decoded_option decoded;
19595 generate_option (p_attr->opt_num, NULL, !invert,
19596 CL_TARGET, &decoded);
19597 aarch64_handle_option (&global_options, &global_options_set,
19598 &decoded, input_location);
19599 break;
19601 /* Set or unset a bit in the target_flags. aarch64_handle_option
19602 should know what mask to apply given the option number. */
19603 case aarch64_attr_mask:
19605 struct cl_decoded_option decoded;
19606 /* We only need to specify the option number.
19607 aarch64_handle_option will know which mask to apply. */
19608 decoded.opt_index = p_attr->opt_num;
19609 decoded.value = !invert;
19610 aarch64_handle_option (&global_options, &global_options_set,
19611 &decoded, input_location);
19612 break;
19614 /* Use the option setting machinery to set an option to an enum. */
19615 case aarch64_attr_enum:
19617 gcc_assert (arg);
19618 bool valid;
19619 int value;
19620 valid = opt_enum_arg_to_value (p_attr->opt_num, arg,
19621 &value, CL_TARGET);
19622 if (valid)
19624 set_option (&global_options, NULL, p_attr->opt_num, value,
19625 NULL, DK_UNSPECIFIED, input_location,
19626 global_dc);
19628 else
19630 error ("pragma or attribute %<target(\"%s=%s\")%> is not valid", str_to_check, arg);
19632 break;
19634 default:
19635 gcc_unreachable ();
19639   /* If we reach here, we have either found an attribute and validated
19640      it, or found no match at all.  If we matched an attribute but its
19641      arguments were malformed, we will have returned false already.  */
19642 return found;
19645 /* Count how many times the character C appears in
19646 NULL-terminated string STR. */
19648 static unsigned int
19649 num_occurences_in_str (char c, char *str)
19651 unsigned int res = 0;
19652 while (*str != '\0')
19654 if (*str == c)
19655 res++;
19657 str++;
19660 return res;
19663 /* Parse the tree in ARGS that contains the target attribute information
19664 and update the global target options space. */
19666 bool
19667 aarch64_process_target_attr (tree args)
19669 if (TREE_CODE (args) == TREE_LIST)
19673 tree head = TREE_VALUE (args);
19674 if (head)
19676 if (!aarch64_process_target_attr (head))
19677 return false;
19679 args = TREE_CHAIN (args);
19680 } while (args);
19682 return true;
19685 if (TREE_CODE (args) != STRING_CST)
19687 error ("attribute %<target%> argument not a string");
19688 return false;
19691 size_t len = strlen (TREE_STRING_POINTER (args));
19692 auto_vec<char, 32> buffer;
19693 buffer.safe_grow (len + 1);
19694 char *str_to_check = buffer.address ();
19695 memcpy (str_to_check, TREE_STRING_POINTER (args), len + 1);
19697 if (len == 0)
19699 error ("malformed %<target()%> pragma or attribute");
19700 return false;
19703   /* Used to catch empty entries between commas, i.e.
19704 attribute ((target ("attr1,,attr2"))). */
19705 unsigned int num_commas = num_occurences_in_str (',', str_to_check);
19707 /* Handle multiple target attributes separated by ','. */
19708 char *token = strtok_r (str_to_check, ",", &str_to_check);
19710 unsigned int num_attrs = 0;
19711 while (token)
19713 num_attrs++;
19714 if (!aarch64_process_one_target_attr (token))
19716 /* Check if token is possibly an arch extension without
19717 leading '+'. */
19718 aarch64_feature_flags isa_temp = 0;
19719 auto with_plus = std::string ("+") + token;
19720 enum aarch_parse_opt_result ext_res
19721 = aarch64_parse_extension (with_plus.c_str (), &isa_temp, nullptr);
19723 if (ext_res == AARCH_PARSE_OK)
19724 error ("arch extension %qs should be prefixed by %<+%>",
19725 token);
19726 else
19727 error ("pragma or attribute %<target(\"%s\")%> is not valid", token);
19728 return false;
19731 token = strtok_r (NULL, ",", &str_to_check);
19734 if (num_attrs != num_commas + 1)
19736 error ("malformed %<target(\"%s\")%> pragma or attribute", TREE_STRING_POINTER (args));
19737 return false;
19740 return true;
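/* For illustration, the loop above is what turns
   target ("arch=armv8.2-a,+sve") into two calls to
   aarch64_process_one_target_attr, and what turns the common mistake
   target ("sve") into a hint along the lines of "arch extension 'sve'
   should be prefixed by '+'", since the token parses as an extension once
   a '+' is prepended.  The architecture and extension names are examples
   only.  */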
19743 static bool aarch64_process_target_version_attr (tree args);
19745 /* Implement TARGET_OPTION_VALID_ATTRIBUTE_P. This is used to
19746 process attribute ((target ("..."))). */
19748 static bool
19749 aarch64_option_valid_attribute_p (tree fndecl, tree, tree args, int)
19751 struct cl_target_option cur_target;
19752 bool ret;
19753 tree old_optimize;
19754 tree new_target, new_optimize;
19755 tree existing_target = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
19757 /* If what we're processing is the current pragma string then the
19758 target option node is already stored in target_option_current_node
19759 by aarch64_pragma_target_parse in aarch64-c.cc. Use that to avoid
19760 having to re-parse the string. This is especially useful to keep
19761 arm_neon.h compile times down since that header contains a lot
19762 of intrinsics enclosed in pragmas. */
19763 if (!existing_target && args == current_target_pragma)
19765 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = target_option_current_node;
19766 return true;
19768 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
19770 old_optimize
19771 = build_optimization_node (&global_options, &global_options_set);
19772 func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
19774 /* If the function changed the optimization levels as well as setting
19775 target options, start with the optimizations specified. */
19776 if (func_optimize && func_optimize != old_optimize)
19777 cl_optimization_restore (&global_options, &global_options_set,
19778 TREE_OPTIMIZATION (func_optimize));
19780 /* Save the current target options to restore at the end. */
19781 cl_target_option_save (&cur_target, &global_options, &global_options_set);
19783 /* If fndecl already has some target attributes applied to it, unpack
19784 them so that we add this attribute on top of them, rather than
19785 overwriting them. */
19786 if (existing_target)
19788 struct cl_target_option *existing_options
19789 = TREE_TARGET_OPTION (existing_target);
19791 if (existing_options)
19792 cl_target_option_restore (&global_options, &global_options_set,
19793 existing_options);
19795 else
19796 cl_target_option_restore (&global_options, &global_options_set,
19797 TREE_TARGET_OPTION (target_option_current_node));
19799 ret = aarch64_process_target_attr (args);
19800 if (ret)
19802 tree version_attr = lookup_attribute ("target_version",
19803 DECL_ATTRIBUTES (fndecl));
19804 if (version_attr != NULL_TREE)
19806 /* Reapply any target_version attribute after target attribute.
19807 This should be equivalent to applying the target_version once
19808 after processing all target attributes. */
19809 tree version_args = TREE_VALUE (version_attr);
19810 ret = aarch64_process_target_version_attr (version_args);
19814 /* Set up any additional state. */
19815 if (ret)
19817 aarch64_override_options_internal (&global_options);
19818 new_target = build_target_option_node (&global_options,
19819 &global_options_set);
19821 else
19822 new_target = NULL;
19824 new_optimize = build_optimization_node (&global_options,
19825 &global_options_set);
19827 if (fndecl && ret)
19829 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
19831 if (old_optimize != new_optimize)
19832 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
19835 cl_target_option_restore (&global_options, &global_options_set, &cur_target);
19837 if (old_optimize != new_optimize)
19838 cl_optimization_restore (&global_options, &global_options_set,
19839 TREE_OPTIMIZATION (old_optimize));
19840 return ret;
19843 typedef unsigned long long aarch64_fmv_feature_mask;
19845 typedef struct
19847 const char *name;
19848 aarch64_fmv_feature_mask feature_mask;
19849 aarch64_feature_flags opt_flags;
19850 } aarch64_fmv_feature_datum;
19852 #define AARCH64_FMV_FEATURE(NAME, FEAT_NAME, C) \
19853 {NAME, 1ULL << FEAT_##FEAT_NAME, ::feature_deps::fmv_deps_##FEAT_NAME},
19855 /* The "rdma" alias uses a different FEAT_NAME to avoid a duplicate
19856 feature_deps name. */
19857 #define FEAT_RDMA FEAT_RDM
19859 /* FMV features are listed in priority order, to make it easier to sort target
19860 strings. */
19861 static aarch64_fmv_feature_datum aarch64_fmv_feature_data[] = {
19862 #include "config/aarch64/aarch64-option-extensions.def"
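/* For illustration (assuming the .def file contains an entry for "sve"),
   each AARCH64_FMV_FEATURE line expands via the macro above into an
   initializer of the form

     { "sve", 1ULL << FEAT_SVE, ::feature_deps::fmv_deps_SVE },

   pairing the FMV feature name with its priority bit and the ISA flags it
   implies.  */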
19865 /* Parse a function multiversioning feature string STR, as found in a
19866 target_version or target_clones attribute.
19868 If ISA_FLAGS is nonnull, then update it with the specified architecture
19869 features turned on. If FEATURE_MASK is nonnull, then assign to it a bitmask
19870 representing the set of features explicitly specified in the feature string.
19871 Return an aarch_parse_opt_result describing the result.
19873 When the STR string contains an invalid or duplicate extension, a copy of
19874 the extension string is created and stored to INVALID_EXTENSION. */
19876 static enum aarch_parse_opt_result
19877 aarch64_parse_fmv_features (const char *str, aarch64_feature_flags *isa_flags,
19878 aarch64_fmv_feature_mask *feature_mask,
19879 std::string *invalid_extension)
19881 if (feature_mask)
19882 *feature_mask = 0ULL;
19884 if (strcmp (str, "default") == 0)
19885 return AARCH_PARSE_OK;
19887 while (str != NULL && *str != 0)
19889 const char *ext;
19890 size_t len;
19892 ext = strchr (str, '+');
19894 if (ext != NULL)
19895 len = ext - str;
19896 else
19897 len = strlen (str);
19899 if (len == 0)
19900 return AARCH_PARSE_MISSING_ARG;
19902 int num_features = ARRAY_SIZE (aarch64_fmv_feature_data);
19903 int i;
19904 for (i = 0; i < num_features; i++)
19906 if (strlen (aarch64_fmv_feature_data[i].name) == len
19907 && strncmp (aarch64_fmv_feature_data[i].name, str, len) == 0)
19909 if (isa_flags)
19910 *isa_flags |= aarch64_fmv_feature_data[i].opt_flags;
19911 if (feature_mask)
19913 auto old_feature_mask = *feature_mask;
19914 *feature_mask |= aarch64_fmv_feature_data[i].feature_mask;
19915 if (*feature_mask == old_feature_mask)
19917 /* Duplicate feature. */
19918 if (invalid_extension)
19919 *invalid_extension = std::string (str, len);
19920 return AARCH_PARSE_DUPLICATE_FEATURE;
19923 break;
19927 if (i == num_features)
19929 /* Feature not found in list. */
19930 if (invalid_extension)
19931 *invalid_extension = std::string (str, len);
19932 return AARCH_PARSE_INVALID_FEATURE;
19935 str = ext;
19936 if (str)
19937 /* Skip over the next '+'. */
19938 str++;
19941 return AARCH_PARSE_OK;
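/* For illustration, a string such as "sve+sve2" makes the loop above
   accumulate the opt_flags of both names into *ISA_FLAGS and their
   priority bits into *FEATURE_MASK, whereas "sve+sve" is rejected with
   AARCH_PARSE_DUPLICATE_FEATURE because the repeated name leaves the
   accumulated mask unchanged.  The feature names are examples only.  */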
19944 /* Parse the tree in ARGS that contains the target_version attribute
19945 information and update the global target options space. */
19947 static bool
19948 aarch64_process_target_version_attr (tree args)
19950 static bool issued_warning = false;
19951 if (!issued_warning)
19953 warning (OPT_Wexperimental_fmv_target,
19954 "Function Multi Versioning support is experimental, and the "
19955 "behavior is likely to change");
19956 issued_warning = true;
19959 if (TREE_CODE (args) == TREE_LIST)
19961 if (TREE_CHAIN (args))
19963 error ("attribute %<target_version%> has multiple values");
19964 return false;
19966 args = TREE_VALUE (args);
19969 if (!args || TREE_CODE (args) != STRING_CST)
19971 error ("attribute %<target_version%> argument not a string");
19972 return false;
19975 const char *str = TREE_STRING_POINTER (args);
19977 enum aarch_parse_opt_result parse_res;
19978 auto isa_flags = aarch64_asm_isa_flags;
19980 std::string invalid_extension;
19981 parse_res = aarch64_parse_fmv_features (str, &isa_flags, NULL,
19982 &invalid_extension);
19984 if (parse_res == AARCH_PARSE_OK)
19986 aarch64_set_asm_isa_flags (isa_flags);
19987 return true;
19990 switch (parse_res)
19992 case AARCH_PARSE_MISSING_ARG:
19993 error ("missing value in %<target_version%> attribute");
19994 break;
19996 case AARCH_PARSE_INVALID_FEATURE:
19997 error ("invalid feature modifier %qs of value %qs in "
19998 "%<target_version%> attribute", invalid_extension.c_str (),
19999 str);
20000 break;
20002 case AARCH_PARSE_DUPLICATE_FEATURE:
20003 error ("duplicate feature modifier %qs of value %qs in "
20004 "%<target_version%> attribute", invalid_extension.c_str (),
20005 str);
20006 break;
20008 default:
20009 gcc_unreachable ();
20012 return false;
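/* For illustration, this is the routine behind declarations such as

     __attribute__ ((target_version ("sve2"))) int foo (void);

   The string is handed to aarch64_parse_fmv_features above, and an
   unknown or repeated feature name is reported through the diagnostics
   just seen.  The feature name is an example only.  */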
20015 /* Implement TARGET_OPTION_VALID_VERSION_ATTRIBUTE_P. This is used to
20016 process attribute ((target_version ("..."))). */
20018 static bool
20019 aarch64_option_valid_version_attribute_p (tree fndecl, tree, tree args, int)
20021 struct cl_target_option cur_target;
20022 bool ret;
20023 tree new_target;
20024 tree existing_target = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
20026 /* Save the current target options to restore at the end. */
20027 cl_target_option_save (&cur_target, &global_options, &global_options_set);
20029 /* If fndecl already has some target attributes applied to it, unpack
20030 them so that we add this attribute on top of them, rather than
20031 overwriting them. */
20032 if (existing_target)
20034 struct cl_target_option *existing_options
20035 = TREE_TARGET_OPTION (existing_target);
20037 if (existing_options)
20038 cl_target_option_restore (&global_options, &global_options_set,
20039 existing_options);
20041 else
20042 cl_target_option_restore (&global_options, &global_options_set,
20043 TREE_TARGET_OPTION (target_option_current_node));
20045 ret = aarch64_process_target_version_attr (args);
20047 /* Set up any additional state. */
20048 if (ret)
20050 aarch64_override_options_internal (&global_options);
20051 new_target = build_target_option_node (&global_options,
20052 &global_options_set);
20054 else
20055 new_target = NULL;
20057 if (fndecl && ret)
20058 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
20060 cl_target_option_restore (&global_options, &global_options_set, &cur_target);
20062 return ret;
20065 /* This parses the attribute arguments to target_version in DECL and the
20066 feature mask required to select those targets. No adjustments are made to
20067 add or remove redundant feature requirements. */
20069 static aarch64_fmv_feature_mask
20070 get_feature_mask_for_version (tree decl)
20072 tree version_attr = lookup_attribute ("target_version",
20073 DECL_ATTRIBUTES (decl));
20074 if (version_attr == NULL)
20075 return 0;
20077 const char *version_string = TREE_STRING_POINTER (TREE_VALUE (TREE_VALUE
20078 (version_attr)));
20079 enum aarch_parse_opt_result parse_res;
20080 aarch64_fmv_feature_mask feature_mask;
20082 parse_res = aarch64_parse_fmv_features (version_string, NULL, &feature_mask,
20083 NULL);
20085 /* We should have detected any errors before getting here. */
20086 gcc_assert (parse_res == AARCH_PARSE_OK);
20088 return feature_mask;
20091 /* Compare priorities of two feature masks. Return:
20092 1: mask1 is higher priority
20093 -1: mask2 is higher priority
20094 0: masks are equal. */
20096 static int
20097 compare_feature_masks (aarch64_fmv_feature_mask mask1,
20098 aarch64_fmv_feature_mask mask2)
20100 int pop1 = popcount_hwi (mask1);
20101 int pop2 = popcount_hwi (mask2);
20102 if (pop1 > pop2)
20103 return 1;
20104 if (pop2 > pop1)
20105 return -1;
20107 auto diff_mask = mask1 ^ mask2;
20108 if (diff_mask == 0ULL)
20109 return 0;
20110 int num_features = ARRAY_SIZE (aarch64_fmv_feature_data);
20111 for (int i = num_features - 1; i >= 0; i--)
20113 auto bit_mask = aarch64_fmv_feature_data[i].feature_mask;
20114 if (diff_mask & bit_mask)
20115 return (mask1 & bit_mask) ? 1 : -1;
20117 gcc_unreachable();
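/* For illustration: masks are compared first by the number of features
   they require, so a version needing two features outranks one needing a
   single feature.  On a tie, the loop above scans the table from its last
   entry downwards and awards priority to whichever mask contains the
   first differing feature it finds.  */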
20120 /* Compare priorities of two version decls. */
20123 aarch64_compare_version_priority (tree decl1, tree decl2)
20125 auto mask1 = get_feature_mask_for_version (decl1);
20126 auto mask2 = get_feature_mask_for_version (decl2);
20128 return compare_feature_masks (mask1, mask2);
20131 /* Build the struct __ifunc_arg_t type:
20133 struct __ifunc_arg_t
20135 unsigned long _size; // Size of the struct, so it can grow.
20136 unsigned long _hwcap;
20137 unsigned long _hwcap2;
20141 static tree
20142 build_ifunc_arg_type ()
20144 tree ifunc_arg_type = lang_hooks.types.make_type (RECORD_TYPE);
20145 tree field1 = build_decl (UNKNOWN_LOCATION, FIELD_DECL,
20146 get_identifier ("_size"),
20147 long_unsigned_type_node);
20148 tree field2 = build_decl (UNKNOWN_LOCATION, FIELD_DECL,
20149 get_identifier ("_hwcap"),
20150 long_unsigned_type_node);
20151 tree field3 = build_decl (UNKNOWN_LOCATION, FIELD_DECL,
20152 get_identifier ("_hwcap2"),
20153 long_unsigned_type_node);
20155 DECL_FIELD_CONTEXT (field1) = ifunc_arg_type;
20156 DECL_FIELD_CONTEXT (field2) = ifunc_arg_type;
20157 DECL_FIELD_CONTEXT (field3) = ifunc_arg_type;
20159 TYPE_FIELDS (ifunc_arg_type) = field1;
20160 DECL_CHAIN (field1) = field2;
20161 DECL_CHAIN (field2) = field3;
20163 layout_type (ifunc_arg_type);
20165 tree const_type = build_qualified_type (ifunc_arg_type, TYPE_QUAL_CONST);
20166 tree pointer_type = build_pointer_type (const_type);
20168 return pointer_type;
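/* For illustration: the tree returned here is the pointer type
   "const struct __ifunc_arg_t *", so at the C level a resolver that takes
   this type would be declared roughly as

     void *foo_resolver (uint64_t hwcap, const struct __ifunc_arg_t *arg);

   where "foo_resolver" stands in for the name later chosen by
   make_resolver_func.  */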
20171 /* Implement TARGET_MANGLE_DECL_ASSEMBLER_NAME, to add function multiversioning
20172 suffixes. */
20174 tree
20175 aarch64_mangle_decl_assembler_name (tree decl, tree id)
20177 /* For function version, add the target suffix to the assembler name. */
20178 if (TREE_CODE (decl) == FUNCTION_DECL
20179 && DECL_FUNCTION_VERSIONED (decl))
20181 aarch64_fmv_feature_mask feature_mask = get_feature_mask_for_version (decl);
20183 std::string name = IDENTIFIER_POINTER (id);
20185 /* For the default version, append ".default". */
20186 if (feature_mask == 0ULL)
20188 name += ".default";
20189 return get_identifier (name.c_str());
20192 name += "._";
20194 int num_features = ARRAY_SIZE (aarch64_fmv_feature_data);
20195 for (int i = 0; i < num_features; i++)
20197 if (feature_mask & aarch64_fmv_feature_data[i].feature_mask)
20199 name += "M";
20200 name += aarch64_fmv_feature_data[i].name;
20204 if (DECL_ASSEMBLER_NAME_SET_P (decl))
20205 SET_DECL_RTL (decl, NULL);
20207 id = get_identifier (name.c_str());
20209 return id;
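/* For illustration, for a versioned function whose base assembler name is
   "foo", the default version becomes "foo.default" while a version whose
   mask includes, say, the "sve" feature becomes "foo._Msve"; additional
   features append further "M<name>" components in table order.  The
   feature name is an example only.  */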
20212 /* Return an identifier for the base assembler name of a versioned function.
20213 This is computed by taking the default version's assembler name, and
20214 stripping off the ".default" suffix if it's already been appended. */
20216 static tree
20217 get_suffixed_assembler_name (tree default_decl, const char *suffix)
20219 std::string name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (default_decl));
20221 auto size = name.size ();
20222 if (size >= 8 && name.compare (size - 8, 8, ".default") == 0)
20223 name.resize (size - 8);
20224 name += suffix;
20225 return get_identifier (name.c_str());
20228 /* Make the resolver function decl to dispatch the versions of
20229 a multi-versioned function, DEFAULT_DECL. IFUNC_ALIAS_DECL is
20230 ifunc alias that will point to the created resolver. Create an
20231 empty basic block in the resolver and store the pointer in
20232 EMPTY_BB. Return the decl of the resolver function. */
20234 static tree
20235 make_resolver_func (const tree default_decl,
20236 const tree ifunc_alias_decl,
20237 basic_block *empty_bb)
20239 tree decl, type, t;
20241 /* Create resolver function name based on default_decl. We need to remove an
20242 existing ".default" suffix if this has already been appended. */
20243 tree decl_name = get_suffixed_assembler_name (default_decl, ".resolver");
20244 const char *resolver_name = IDENTIFIER_POINTER (decl_name);
20246 /* The resolver function should have signature
20247 (void *) resolver (uint64_t, const __ifunc_arg_t *) */
20248 type = build_function_type_list (ptr_type_node,
20249 uint64_type_node,
20250 build_ifunc_arg_type (),
20251 NULL_TREE);
20253 decl = build_fn_decl (resolver_name, type);
20254 SET_DECL_ASSEMBLER_NAME (decl, decl_name);
20256 DECL_NAME (decl) = decl_name;
20257 TREE_USED (decl) = 1;
20258 DECL_ARTIFICIAL (decl) = 1;
20259 DECL_IGNORED_P (decl) = 1;
20260 TREE_PUBLIC (decl) = 0;
20261 DECL_UNINLINABLE (decl) = 1;
20263 /* Resolver is not external, body is generated. */
20264 DECL_EXTERNAL (decl) = 0;
20265 DECL_EXTERNAL (ifunc_alias_decl) = 0;
20267 DECL_CONTEXT (decl) = NULL_TREE;
20268 DECL_INITIAL (decl) = make_node (BLOCK);
20269 DECL_STATIC_CONSTRUCTOR (decl) = 0;
20271 if (DECL_COMDAT_GROUP (default_decl)
20272 || TREE_PUBLIC (default_decl))
20274 /* In this case, each translation unit with a call to this
20275 versioned function will put out a resolver. Ensure it
20276 is comdat to keep just one copy. */
20277 DECL_COMDAT (decl) = 1;
20278 make_decl_one_only (decl, DECL_ASSEMBLER_NAME (decl));
20280 else
20281 TREE_PUBLIC (ifunc_alias_decl) = 0;
20283 /* Build result decl and add to function_decl. */
20284 t = build_decl (UNKNOWN_LOCATION, RESULT_DECL, NULL_TREE, ptr_type_node);
20285 DECL_CONTEXT (t) = decl;
20286 DECL_ARTIFICIAL (t) = 1;
20287 DECL_IGNORED_P (t) = 1;
20288 DECL_RESULT (decl) = t;
20290 /* Build parameter decls and add to function_decl. */
20291 tree arg1 = build_decl (UNKNOWN_LOCATION, PARM_DECL,
20292 get_identifier ("hwcap"),
20293 uint64_type_node);
20294 tree arg2 = build_decl (UNKNOWN_LOCATION, PARM_DECL,
20295 get_identifier ("arg"),
20296 build_ifunc_arg_type());
20297 DECL_CONTEXT (arg1) = decl;
20298 DECL_CONTEXT (arg2) = decl;
20299 DECL_ARTIFICIAL (arg1) = 1;
20300 DECL_ARTIFICIAL (arg2) = 1;
20301 DECL_IGNORED_P (arg1) = 1;
20302 DECL_IGNORED_P (arg2) = 1;
20303 DECL_ARG_TYPE (arg1) = uint64_type_node;
20304 DECL_ARG_TYPE (arg2) = build_ifunc_arg_type ();
20305 DECL_ARGUMENTS (decl) = arg1;
20306 TREE_CHAIN (arg1) = arg2;
20308 gimplify_function_tree (decl);
20309 push_cfun (DECL_STRUCT_FUNCTION (decl));
20310 *empty_bb = init_lowered_empty_function (decl, false,
20311 profile_count::uninitialized ());
20313 cgraph_node::add_new_function (decl, true);
20314 symtab->call_cgraph_insertion_hooks (cgraph_node::get_create (decl));
20316 pop_cfun ();
20318 gcc_assert (ifunc_alias_decl != NULL);
20319 /* Mark ifunc_alias_decl as "ifunc" with resolver as resolver_name. */
20320 DECL_ATTRIBUTES (ifunc_alias_decl)
20321 = make_attribute ("ifunc", resolver_name,
20322 DECL_ATTRIBUTES (ifunc_alias_decl));
20324 /* Create the alias for dispatch to resolver here. */
20325 cgraph_node::create_same_body_alias (ifunc_alias_decl, decl);
20326 return decl;
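/* For illustration, for a versioned function "foo" this creates a
   "foo.resolver" function with the signature shown above (made comdat
   when the default version is public or comdat), and tags
   IFUNC_ALIAS_DECL with __attribute__ ((ifunc ("foo.resolver"))) so that
   the dynamic loader binds calls to "foo" to whichever version the
   resolver returns.  */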
20329 /* This adds a condition to the basic_block NEW_BB in function FUNCTION_DECL
20330 to return a pointer to VERSION_DECL if all feature bits specified in
20331 FEATURE_MASK are not set in MASK_VAR. This function will be called during
20332 version dispatch to decide which function version to execute. It returns
20333 the basic block at the end, to which more conditions can be added. */
20334 static basic_block
20335 add_condition_to_bb (tree function_decl, tree version_decl,
20336 aarch64_fmv_feature_mask feature_mask,
20337 tree mask_var, basic_block new_bb)
20339 gimple *return_stmt;
20340 tree convert_expr, result_var;
20341 gimple *convert_stmt;
20342 gimple *if_else_stmt;
20344 basic_block bb1, bb2, bb3;
20345 edge e12, e23;
20347 gimple_seq gseq;
20349 push_cfun (DECL_STRUCT_FUNCTION (function_decl));
20351 gcc_assert (new_bb != NULL);
20352 gseq = bb_seq (new_bb);
20354 convert_expr = build1 (CONVERT_EXPR, ptr_type_node,
20355 build_fold_addr_expr (version_decl));
20356 result_var = create_tmp_var (ptr_type_node);
20357 convert_stmt = gimple_build_assign (result_var, convert_expr);
20358 return_stmt = gimple_build_return (result_var);
20360 if (feature_mask == 0ULL)
20362 /* Default version. */
20363 gimple_seq_add_stmt (&gseq, convert_stmt);
20364 gimple_seq_add_stmt (&gseq, return_stmt);
20365 set_bb_seq (new_bb, gseq);
20366 gimple_set_bb (convert_stmt, new_bb);
20367 gimple_set_bb (return_stmt, new_bb);
20368 pop_cfun ();
20369 return new_bb;
20372 tree and_expr_var = create_tmp_var (long_long_unsigned_type_node);
20373 tree and_expr = build2 (BIT_AND_EXPR,
20374 long_long_unsigned_type_node,
20375 mask_var,
20376 build_int_cst (long_long_unsigned_type_node,
20377 feature_mask));
20378 gimple *and_stmt = gimple_build_assign (and_expr_var, and_expr);
20379 gimple_set_block (and_stmt, DECL_INITIAL (function_decl));
20380 gimple_set_bb (and_stmt, new_bb);
20381 gimple_seq_add_stmt (&gseq, and_stmt);
20383 tree zero_llu = build_int_cst (long_long_unsigned_type_node, 0);
20384 if_else_stmt = gimple_build_cond (EQ_EXPR, and_expr_var, zero_llu,
20385 NULL_TREE, NULL_TREE);
20386 gimple_set_block (if_else_stmt, DECL_INITIAL (function_decl));
20387 gimple_set_bb (if_else_stmt, new_bb);
20388 gimple_seq_add_stmt (&gseq, if_else_stmt);
20390 gimple_seq_add_stmt (&gseq, convert_stmt);
20391 gimple_seq_add_stmt (&gseq, return_stmt);
20392 set_bb_seq (new_bb, gseq);
20394 bb1 = new_bb;
20395 e12 = split_block (bb1, if_else_stmt);
20396 bb2 = e12->dest;
20397 e12->flags &= ~EDGE_FALLTHRU;
20398 e12->flags |= EDGE_TRUE_VALUE;
20400 e23 = split_block (bb2, return_stmt);
20402 gimple_set_bb (convert_stmt, bb2);
20403 gimple_set_bb (return_stmt, bb2);
20405 bb3 = e23->dest;
20406 make_edge (bb1, bb3, EDGE_FALSE_VALUE);
20408 remove_edge (e23);
20409 make_edge (bb2, EXIT_BLOCK_PTR_FOR_FN (cfun), 0);
20411 pop_cfun ();
20413 return bb3;
20416 /* This function generates the dispatch function for
20417 multi-versioned functions. DISPATCH_DECL is the function which will
20418 contain the dispatch logic. FNDECLS are the function choices for
20419 dispatch, and is a tree chain. EMPTY_BB is the basic block pointer
20420 in DISPATCH_DECL in which the dispatch code is generated. */
20422 static int
20423 dispatch_function_versions (tree dispatch_decl,
20424 void *fndecls_p,
20425 basic_block *empty_bb)
20427 gimple *ifunc_cpu_init_stmt;
20428 gimple_seq gseq;
20429 vec<tree> *fndecls;
20431 gcc_assert (dispatch_decl != NULL
20432 && fndecls_p != NULL
20433 && empty_bb != NULL);
20435 push_cfun (DECL_STRUCT_FUNCTION (dispatch_decl));
20437 gseq = bb_seq (*empty_bb);
20438 /* Function version dispatch is via IFUNC. IFUNC resolvers fire before
20439      constructors, so explicitly call __init_cpu_features_resolver here.  */
20440 tree init_fn_type = build_function_type_list (void_type_node,
20441 long_unsigned_type_node,
20442 build_ifunc_arg_type(),
20443 NULL);
20444 tree init_fn_id = get_identifier ("__init_cpu_features_resolver");
20445 tree init_fn_decl = build_decl (UNKNOWN_LOCATION, FUNCTION_DECL,
20446 init_fn_id, init_fn_type);
20447 DECL_EXTERNAL (init_fn_decl) = 1;
20448 TREE_PUBLIC (init_fn_decl) = 1;
20449 DECL_VISIBILITY (init_fn_decl) = VISIBILITY_HIDDEN;
20450 DECL_VISIBILITY_SPECIFIED (init_fn_decl) = 1;
20451 tree arg1 = DECL_ARGUMENTS (dispatch_decl);
20452 tree arg2 = TREE_CHAIN (arg1);
20453 ifunc_cpu_init_stmt = gimple_build_call (init_fn_decl, 2, arg1, arg2);
20454 gimple_seq_add_stmt (&gseq, ifunc_cpu_init_stmt);
20455 gimple_set_bb (ifunc_cpu_init_stmt, *empty_bb);
20457 /* Build the struct type for __aarch64_cpu_features. */
20458 tree global_type = lang_hooks.types.make_type (RECORD_TYPE);
20459 tree field1 = build_decl (UNKNOWN_LOCATION, FIELD_DECL,
20460 get_identifier ("features"),
20461 long_long_unsigned_type_node);
20462 DECL_FIELD_CONTEXT (field1) = global_type;
20463 TYPE_FIELDS (global_type) = field1;
20464 layout_type (global_type);
20466 tree global_var = build_decl (UNKNOWN_LOCATION, VAR_DECL,
20467 get_identifier ("__aarch64_cpu_features"),
20468 global_type);
20469 DECL_EXTERNAL (global_var) = 1;
20470 TREE_PUBLIC (global_var) = 1;
20471 DECL_VISIBILITY (global_var) = VISIBILITY_HIDDEN;
20472 DECL_VISIBILITY_SPECIFIED (global_var) = 1;
20473 tree mask_var = create_tmp_var (long_long_unsigned_type_node);
20475 tree component_expr = build3 (COMPONENT_REF, long_long_unsigned_type_node,
20476 global_var, field1, NULL_TREE);
20477 gimple *component_stmt = gimple_build_assign (mask_var, component_expr);
20478 gimple_set_block (component_stmt, DECL_INITIAL (dispatch_decl));
20479 gimple_set_bb (component_stmt, *empty_bb);
20480 gimple_seq_add_stmt (&gseq, component_stmt);
20482 tree not_expr = build1 (BIT_NOT_EXPR, long_long_unsigned_type_node, mask_var);
20483 gimple *not_stmt = gimple_build_assign (mask_var, not_expr);
20484 gimple_set_block (not_stmt, DECL_INITIAL (dispatch_decl));
20485 gimple_set_bb (not_stmt, *empty_bb);
20486 gimple_seq_add_stmt (&gseq, not_stmt);
20488 set_bb_seq (*empty_bb, gseq);
20490 pop_cfun ();
20492 /* fndecls_p is actually a vector. */
20493 fndecls = static_cast<vec<tree> *> (fndecls_p);
20495 /* At least one more version other than the default. */
20496 unsigned int num_versions = fndecls->length ();
20497 gcc_assert (num_versions >= 2);
20499 struct function_version_info
20501 tree version_decl;
20502 aarch64_fmv_feature_mask feature_mask;
20503 } *function_versions;
20505 function_versions = (struct function_version_info *)
20506 XNEWVEC (struct function_version_info, (num_versions));
20508 unsigned int actual_versions = 0;
20510 for (tree version_decl : *fndecls)
20512 aarch64_fmv_feature_mask feature_mask;
20513 /* Get attribute string, parse it and find the right features. */
20514 feature_mask = get_feature_mask_for_version (version_decl);
20515 function_versions [actual_versions].version_decl = version_decl;
20516 function_versions [actual_versions].feature_mask = feature_mask;
20517 actual_versions++;
20520 auto compare_feature_version_info = [](const void *p1, const void *p2) {
20521 const function_version_info v1 = *(const function_version_info *)p1;
20522 const function_version_info v2 = *(const function_version_info *)p2;
20523 return - compare_feature_masks (v1.feature_mask, v2.feature_mask);
20526 /* Sort the versions according to descending order of dispatch priority. */
20527 qsort (function_versions, actual_versions,
20528 sizeof (struct function_version_info), compare_feature_version_info);
20530 for (unsigned int i = 0; i < actual_versions; ++i)
20531 *empty_bb = add_condition_to_bb (dispatch_decl,
20532 function_versions[i].version_decl,
20533 function_versions[i].feature_mask,
20534 mask_var,
20535 *empty_bb);
20537 free (function_versions);
20538 return 0;
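/* For illustration, the resolver body built above behaves roughly like
   the following C, with one test per non-default version in descending
   priority order:

     void *foo_resolver (uint64_t hwcap, const __ifunc_arg_t *arg)
     {
       __init_cpu_features_resolver (hwcap, arg);
       unsigned long long mask = ~__aarch64_cpu_features.features;
       if ((mask & MASK_VERSION_1) == 0)
         return foo_version_1;
       return foo_default;
     }

   MASK_VERSION_1, foo_version_1 and foo_default stand in for the feature
   masks and decls collected from the target_version attributes.  */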
20541 /* Implement TARGET_GENERATE_VERSION_DISPATCHER_BODY. */
20543 tree
20544 aarch64_generate_version_dispatcher_body (void *node_p)
20546 tree resolver_decl;
20547 basic_block empty_bb;
20548 tree default_ver_decl;
20549 struct cgraph_node *versn;
20550 struct cgraph_node *node;
20552 struct cgraph_function_version_info *node_version_info = NULL;
20553 struct cgraph_function_version_info *versn_info = NULL;
20555 node = (cgraph_node *)node_p;
20557 node_version_info = node->function_version ();
20558 gcc_assert (node->dispatcher_function
20559 && node_version_info != NULL);
20561 if (node_version_info->dispatcher_resolver)
20562 return node_version_info->dispatcher_resolver;
20564 /* The first version in the chain corresponds to the default version. */
20565 default_ver_decl = node_version_info->next->this_node->decl;
20567 /* node is going to be an alias, so remove the finalized bit. */
20568 node->definition = false;
20570 resolver_decl = make_resolver_func (default_ver_decl,
20571 node->decl, &empty_bb);
20573 node_version_info->dispatcher_resolver = resolver_decl;
20575 push_cfun (DECL_STRUCT_FUNCTION (resolver_decl));
20577 auto_vec<tree, 2> fn_ver_vec;
20579 for (versn_info = node_version_info->next; versn_info;
20580 versn_info = versn_info->next)
20582 versn = versn_info->this_node;
20583 /* Check for virtual functions here again, as by this time it should
20584 have been determined if this function needs a vtable index or
20585 not. This happens for methods in derived classes that override
20586 virtual methods in base classes but are not explicitly marked as
20587 virtual. */
20588 if (DECL_VINDEX (versn->decl))
20589 sorry ("virtual function multiversioning not supported");
20591 fn_ver_vec.safe_push (versn->decl);
20594 dispatch_function_versions (resolver_decl, &fn_ver_vec, &empty_bb);
20595 cgraph_edge::rebuild_edges ();
20596 pop_cfun ();
20598 /* Fix up symbol names. First we need to obtain the base name, which may
20599 have already been mangled. */
20600 tree base_name = get_suffixed_assembler_name (default_ver_decl, "");
20602 /* We need to redo the version mangling on the non-default versions for the
20603 target_clones case. Redoing the mangling for the target_version case is
20604 redundant but does no harm. We need to skip the default version, because
20605 expand_clones will append ".default" later; fortunately that suffix is the
20606 one we want anyway. */
20607 for (versn_info = node_version_info->next->next; versn_info;
20608 versn_info = versn_info->next)
20610 tree version_decl = versn_info->this_node->decl;
20611 tree name = aarch64_mangle_decl_assembler_name (version_decl,
20612 base_name);
20613 symtab->change_decl_assembler_name (version_decl, name);
20616 /* We also need to use the base name for the ifunc declaration. */
20617 symtab->change_decl_assembler_name (node->decl, base_name);
20619 return resolver_decl;
20622 /* Make a dispatcher declaration for the multi-versioned function DECL.
20623 Calls to DECL function will be replaced with calls to the dispatcher
20624 by the front-end. Returns the decl of the dispatcher function. */
20626 tree
20627 aarch64_get_function_versions_dispatcher (void *decl)
20629 tree fn = (tree) decl;
20630 struct cgraph_node *node = NULL;
20631 struct cgraph_node *default_node = NULL;
20632 struct cgraph_function_version_info *node_v = NULL;
20633 struct cgraph_function_version_info *first_v = NULL;
20635 tree dispatch_decl = NULL;
20637 struct cgraph_function_version_info *default_version_info = NULL;
20639 gcc_assert (fn != NULL && DECL_FUNCTION_VERSIONED (fn));
20641 node = cgraph_node::get (fn);
20642 gcc_assert (node != NULL);
20644 node_v = node->function_version ();
20645 gcc_assert (node_v != NULL);
20647 if (node_v->dispatcher_resolver != NULL)
20648 return node_v->dispatcher_resolver;
20650 /* Find the default version and make it the first node. */
20651 first_v = node_v;
20652 /* Go to the beginning of the chain. */
20653 while (first_v->prev != NULL)
20654 first_v = first_v->prev;
20655 default_version_info = first_v;
20656 while (default_version_info != NULL)
20658 if (get_feature_mask_for_version
20659 (default_version_info->this_node->decl) == 0ULL)
20660 break;
20661 default_version_info = default_version_info->next;
20664 /* If there is no default node, just return NULL. */
20665 if (default_version_info == NULL)
20666 return NULL;
20668 /* Make default info the first node. */
20669 if (first_v != default_version_info)
20671 default_version_info->prev->next = default_version_info->next;
20672 if (default_version_info->next)
20673 default_version_info->next->prev = default_version_info->prev;
20674 first_v->prev = default_version_info;
20675 default_version_info->next = first_v;
20676 default_version_info->prev = NULL;
20679 default_node = default_version_info->this_node;
20681 if (targetm.has_ifunc_p ())
20683 struct cgraph_function_version_info *it_v = NULL;
20684 struct cgraph_node *dispatcher_node = NULL;
20685 struct cgraph_function_version_info *dispatcher_version_info = NULL;
20687 /* Right now, the dispatching is done via ifunc. */
20688 dispatch_decl = make_dispatcher_decl (default_node->decl);
20689 TREE_NOTHROW (dispatch_decl) = TREE_NOTHROW (fn);
20691 dispatcher_node = cgraph_node::get_create (dispatch_decl);
20692 gcc_assert (dispatcher_node != NULL);
20693 dispatcher_node->dispatcher_function = 1;
20694 dispatcher_version_info
20695 = dispatcher_node->insert_new_function_version ();
20696 dispatcher_version_info->next = default_version_info;
20697 dispatcher_node->definition = 1;
20699 /* Set the dispatcher for all the versions. */
20700 it_v = default_version_info;
20701 while (it_v != NULL)
20703 it_v->dispatcher_resolver = dispatch_decl;
20704 it_v = it_v->next;
20707 else
20709 error_at (DECL_SOURCE_LOCATION (default_node->decl),
20710 "multiversioning needs %<ifunc%> which is not supported "
20711 "on this target");
20714 return dispatch_decl;
20717 /* This function returns true if FN1 and FN2 are versions of the same function,
20718 that is, the target_version attributes of the function decls are different.
20719 This assumes that FN1 and FN2 have the same signature. */
20721 bool
20722 aarch64_common_function_versions (tree fn1, tree fn2)
20724 if (TREE_CODE (fn1) != FUNCTION_DECL
20725 || TREE_CODE (fn2) != FUNCTION_DECL)
20726 return false;
20728 return (aarch64_compare_version_priority (fn1, fn2) != 0);
20731 /* Implement TARGET_FUNCTION_ATTRIBUTE_INLINABLE_P. Use an opt-out
20732 rather than an opt-in list. */
20734 static bool
20735 aarch64_function_attribute_inlinable_p (const_tree fndecl)
20737 /* A function that has local SME state cannot be inlined into its caller,
20738 since we only support managing PSTATE.ZA switches at function scope. */
20739 return (!aarch64_fndecl_has_new_state (fndecl, "za")
20740 && !aarch64_fndecl_has_new_state (fndecl, "zt0"));
20743 /* Helper for aarch64_can_inline_p. In the case where CALLER and CALLEE are
20744 tri-bool options (yes, no, don't care) and the default value is
20745 DEF, determine whether to reject inlining. */
20747 static bool
20748 aarch64_tribools_ok_for_inlining_p (int caller, int callee,
20749 int dont_care, int def)
20751 /* If the callee doesn't care, always allow inlining. */
20752 if (callee == dont_care)
20753 return true;
20755 /* If the caller doesn't care, always allow inlining. */
20756 if (caller == dont_care)
20757 return true;
20759 /* Otherwise, allow inlining if either the callee and caller values
20760 agree, or if the callee is using the default value. */
20761 return (callee == caller || callee == def);
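/* For illustration, the callers below pass 2 as the "don't care" value:
   a callee built without an explicit setting (value 2) can be inlined
   anywhere, while a callee with an explicit 1 only matches a caller that
   also has 1, or the case where 1 is the configured default.  */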
20764 /* Bit allocations for ipa_fn_summary::target_info. */
20766 /* Set if the function contains a stmt that relies on the function's
20767 choice of PSTATE.SM setting (0 for non-streaming, 1 for streaming).
20768 Not meaningful for streaming-compatible functions. */
20769 constexpr auto AARCH64_IPA_SM_FIXED = 1U << 0;
20771 /* Set if the function clobbers ZA and ZT0. Not meaningful for functions that
20772 have ZA state. */
20773 constexpr auto AARCH64_IPA_CLOBBERS_ZA = 1U << 1;
20774 constexpr auto AARCH64_IPA_CLOBBERS_ZT0 = 1U << 2;
20776 /* Implement TARGET_NEED_IPA_FN_TARGET_INFO. */
20778 static bool
20779 aarch64_need_ipa_fn_target_info (const_tree, unsigned int &)
20781 /* We could in principle skip this for streaming-compatible functions
20782 that have ZA state, but that's a rare combination. */
20783 return true;
20786 /* Implement TARGET_UPDATE_IPA_FN_TARGET_INFO. */
20788 static bool
20789 aarch64_update_ipa_fn_target_info (unsigned int &info, const gimple *stmt)
20791 if (auto *ga = dyn_cast<const gasm *> (stmt))
20793 /* We don't know what the asm does, so conservatively assume that
20794 it requires the function's current SM mode. */
20795 info |= AARCH64_IPA_SM_FIXED;
20796 for (unsigned int i = 0; i < gimple_asm_nclobbers (ga); ++i)
20798 tree op = gimple_asm_clobber_op (ga, i);
20799 const char *clobber = TREE_STRING_POINTER (TREE_VALUE (op));
20800 if (strcmp (clobber, "za") == 0)
20801 info |= AARCH64_IPA_CLOBBERS_ZA;
20802 if (strcmp (clobber, "zt0") == 0)
20803 info |= AARCH64_IPA_CLOBBERS_ZT0;
20806 if (auto *call = dyn_cast<const gcall *> (stmt))
20808 if (gimple_call_builtin_p (call, BUILT_IN_MD))
20810 /* The attributes on AArch64 builtins are supposed to be accurate.
20811 If the function isn't marked streaming-compatible then it
20812 needs whichever SM mode it selects. */
20813 tree decl = gimple_call_fndecl (call);
20814 if (aarch64_fndecl_pstate_sm (decl) != 0)
20815 info |= AARCH64_IPA_SM_FIXED;
20818 return true;
20821 /* Implement TARGET_CAN_INLINE_P. Decide whether it is valid
20822 to inline CALLEE into CALLER based on target-specific info.
20823 Make sure that the caller and callee have compatible architectural
20824 features. Then go through the other possible target attributes
20825 and see if they can block inlining. Try not to reject always_inline
20826 callees unless they are incompatible architecturally. */
20828 static bool
20829 aarch64_can_inline_p (tree caller, tree callee)
20831 tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
20832 tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
20834 struct cl_target_option *caller_opts
20835 = TREE_TARGET_OPTION (caller_tree ? caller_tree
20836 : target_option_default_node);
20838 struct cl_target_option *callee_opts
20839 = TREE_TARGET_OPTION (callee_tree ? callee_tree
20840 : target_option_default_node);
20842 /* Callee's ISA flags should be a subset of the caller's. */
20843 auto caller_asm_isa = (aarch64_get_asm_isa_flags (caller_opts)
20844 & ~AARCH64_FL_ISA_MODES);
20845 auto callee_asm_isa = (aarch64_get_asm_isa_flags (callee_opts)
20846 & ~AARCH64_FL_ISA_MODES);
20847 if (callee_asm_isa & ~caller_asm_isa)
20848 return false;
20850 auto caller_isa = (aarch64_get_isa_flags (caller_opts)
20851 & ~AARCH64_FL_ISA_MODES);
20852 auto callee_isa = (aarch64_get_isa_flags (callee_opts)
20853 & ~AARCH64_FL_ISA_MODES);
20854 if (callee_isa & ~caller_isa)
20855 return false;
20857 /* Return true if the callee might have target_info property PROPERTY.
20858 The answer must be true unless we have positive proof to the contrary. */
20859 auto callee_has_property = [&](unsigned int property)
20861 if (ipa_fn_summaries)
20862 if (auto *summary = ipa_fn_summaries->get (cgraph_node::get (callee)))
20863 if (!(summary->target_info & property))
20864 return false;
20865 return true;
20868 /* Streaming-compatible code can be inlined into functions with any
20869 PSTATE.SM mode. Otherwise the caller and callee must agree on
20870 PSTATE.SM mode, unless we can prove that the callee is naturally
20871 streaming-compatible. */
20872 auto caller_sm = (aarch64_get_isa_flags (caller_opts) & AARCH64_FL_SM_STATE);
20873 auto callee_sm = (aarch64_get_isa_flags (callee_opts) & AARCH64_FL_SM_STATE);
20874 if (callee_sm
20875 && caller_sm != callee_sm
20876 && callee_has_property (AARCH64_IPA_SM_FIXED))
20877 return false;
20879 /* aarch64_function_attribute_inlinable_p prevents new-ZA and new-ZT0
20880 functions from being inlined into others. We also need to prevent
20881 inlining of shared-ZA functions into functions without ZA state,
20882 since this is an error condition.
20884 The only other problematic case for ZA is inlining a function that
20885 directly clobbers ZA or ZT0 into a function that has ZA or ZT0 state. */
20886 auto caller_za = (aarch64_get_isa_flags (caller_opts) & AARCH64_FL_ZA_ON);
20887 auto callee_za = (aarch64_get_isa_flags (callee_opts) & AARCH64_FL_ZA_ON);
20888 if (!caller_za && callee_za)
20889 return false;
20890 if (!callee_za
20891 && aarch64_fndecl_has_state (caller, "za")
20892 && callee_has_property (AARCH64_IPA_CLOBBERS_ZA))
20893 return false;
20894 if (!callee_za
20895 && aarch64_fndecl_has_state (caller, "zt0")
20896 && callee_has_property (AARCH64_IPA_CLOBBERS_ZT0))
20897 return false;
20899   /* Allow non-strict-aligned functions to be inlined into
20900      strict-aligned ones.  */
20901 if ((TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)
20902 != TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags))
20903 && !(!TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags)
20904 && TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)))
20905 return false;
20907 bool always_inline = lookup_attribute ("always_inline",
20908 DECL_ATTRIBUTES (callee));
20910 /* If the architectural features match up and the callee is always_inline
20911 then the other attributes don't matter. */
20912 if (always_inline)
20913 return true;
20915 if (caller_opts->x_aarch64_cmodel_var
20916 != callee_opts->x_aarch64_cmodel_var)
20917 return false;
20919 if (caller_opts->x_aarch64_tls_dialect
20920 != callee_opts->x_aarch64_tls_dialect)
20921 return false;
20923   /* Honour explicit requests to work around errata.  */
20924 if (!aarch64_tribools_ok_for_inlining_p (
20925 caller_opts->x_aarch64_fix_a53_err835769,
20926 callee_opts->x_aarch64_fix_a53_err835769,
20927 2, TARGET_FIX_ERR_A53_835769_DEFAULT))
20928 return false;
20930 if (!aarch64_tribools_ok_for_inlining_p (
20931 caller_opts->x_aarch64_fix_a53_err843419,
20932 callee_opts->x_aarch64_fix_a53_err843419,
20933 2, TARGET_FIX_ERR_A53_843419))
20934 return false;
20936 /* If the user explicitly specified -momit-leaf-frame-pointer for the
20937      caller and callee and they don't match up, reject inlining.  */
20938 if (!aarch64_tribools_ok_for_inlining_p (
20939 caller_opts->x_flag_omit_leaf_frame_pointer,
20940 callee_opts->x_flag_omit_leaf_frame_pointer,
20941 2, 1))
20942 return false;
20944 /* If the callee has specific tuning overrides, respect them. */
20945 if (callee_opts->x_aarch64_override_tune_string != NULL
20946 && caller_opts->x_aarch64_override_tune_string == NULL)
20947 return false;
20949 /* If the user specified tuning override strings for the
20950 caller and callee and they don't match up, reject inlining.
20951 We just do a string compare here, we don't analyze the meaning
20952 of the string, as it would be too costly for little gain. */
20953 if (callee_opts->x_aarch64_override_tune_string
20954 && caller_opts->x_aarch64_override_tune_string
20955 && (strcmp (callee_opts->x_aarch64_override_tune_string,
20956 caller_opts->x_aarch64_override_tune_string) != 0))
20957 return false;
20959 return true;
20962 /* Return the ID of the TLSDESC ABI, initializing the descriptor if it
20963    hasn't been initialized already.  */
20965 arm_pcs
20966 aarch64_tlsdesc_abi_id ()
20968 predefined_function_abi &tlsdesc_abi = function_abis[ARM_PCS_TLSDESC];
20969 if (!tlsdesc_abi.initialized_p ())
20971 HARD_REG_SET full_reg_clobbers;
20972 CLEAR_HARD_REG_SET (full_reg_clobbers);
20973 SET_HARD_REG_BIT (full_reg_clobbers, R0_REGNUM);
20974 SET_HARD_REG_BIT (full_reg_clobbers, CC_REGNUM);
20975 for (int regno = P0_REGNUM; regno <= P15_REGNUM; ++regno)
20976 SET_HARD_REG_BIT (full_reg_clobbers, regno);
20977 tlsdesc_abi.initialize (ARM_PCS_TLSDESC, full_reg_clobbers);
20979 return ARM_PCS_TLSDESC;
20982 /* Return true if SYMBOL_REF X binds locally. */
20984 static bool
20985 aarch64_symbol_binds_local_p (const_rtx x)
20987 return (SYMBOL_REF_DECL (x)
20988 ? targetm.binds_local_p (SYMBOL_REF_DECL (x))
20989 : SYMBOL_REF_LOCAL_P (x));
20992 /* Return true if SYMBOL_REF X is thread-local.  */
20993 static bool
20994 aarch64_tls_symbol_p (rtx x)
20996 if (! TARGET_HAVE_TLS)
20997 return false;
20999 x = strip_salt (x);
21000 if (!SYMBOL_REF_P (x))
21001 return false;
21003 return SYMBOL_REF_TLS_MODEL (x) != 0;
21006 /* Classify a TLS symbol into one of the TLS kinds. */
21007 enum aarch64_symbol_type
21008 aarch64_classify_tls_symbol (rtx x)
21010 enum tls_model tls_kind = tls_symbolic_operand_type (x);
21012 switch (tls_kind)
21014 case TLS_MODEL_GLOBAL_DYNAMIC:
21015 case TLS_MODEL_LOCAL_DYNAMIC:
21016 return TARGET_TLS_DESC ? SYMBOL_SMALL_TLSDESC : SYMBOL_SMALL_TLSGD;
21018 case TLS_MODEL_INITIAL_EXEC:
21019 switch (aarch64_cmodel)
21021 case AARCH64_CMODEL_TINY:
21022 case AARCH64_CMODEL_TINY_PIC:
21023 return SYMBOL_TINY_TLSIE;
21024 default:
21025 return SYMBOL_SMALL_TLSIE;
21028 case TLS_MODEL_LOCAL_EXEC:
21029 if (aarch64_tls_size == 12)
21030 return SYMBOL_TLSLE12;
21031 else if (aarch64_tls_size == 24)
21032 return SYMBOL_TLSLE24;
21033 else if (aarch64_tls_size == 32)
21034 return SYMBOL_TLSLE32;
21035 else if (aarch64_tls_size == 48)
21036 return SYMBOL_TLSLE48;
21037 else
21038 gcc_unreachable ();
21040 case TLS_MODEL_EMULATED:
21041 case TLS_MODEL_NONE:
21042 return SYMBOL_FORCE_TO_MEM;
21044 default:
21045 gcc_unreachable ();
21049 /* Return the correct method for accessing X + OFFSET, where X is either
21050 a SYMBOL_REF or LABEL_REF. */
21052 enum aarch64_symbol_type
21053 aarch64_classify_symbol (rtx x, HOST_WIDE_INT offset)
21055 x = strip_salt (x);
21057 if (LABEL_REF_P (x))
21059 switch (aarch64_cmodel)
21061 case AARCH64_CMODEL_LARGE:
21062 return SYMBOL_FORCE_TO_MEM;
21064 case AARCH64_CMODEL_TINY_PIC:
21065 case AARCH64_CMODEL_TINY:
21066 return SYMBOL_TINY_ABSOLUTE;
21068 case AARCH64_CMODEL_SMALL_SPIC:
21069 case AARCH64_CMODEL_SMALL_PIC:
21070 case AARCH64_CMODEL_SMALL:
21071 return SYMBOL_SMALL_ABSOLUTE;
21073 default:
21074 gcc_unreachable ();
21078 if (SYMBOL_REF_P (x))
21080 if (aarch64_tls_symbol_p (x))
21081 return aarch64_classify_tls_symbol (x);
21083 switch (aarch64_cmodel)
21085 case AARCH64_CMODEL_TINY_PIC:
21086 case AARCH64_CMODEL_TINY:
21087 /* With -fPIC non-local symbols use the GOT. For orthogonality
21088 always use the GOT for extern weak symbols. */
21089 if (!TARGET_PECOFF
21090 && (flag_pic || SYMBOL_REF_WEAK (x))
21091 && !aarch64_symbol_binds_local_p (x))
21092 return SYMBOL_TINY_GOT;
21094 /* When we retrieve symbol + offset address, we have to make sure
21095 the offset does not cause overflow of the final address. But
21096 	 we have no way of knowing the address of the symbol at compile time
21097 	 so we can't accurately say if the distance between the PC and
21098 	 symbol + offset is outside the addressable range of +/-1MB in the
21099 TINY code model. So we limit the maximum offset to +/-64KB and
21100 assume the offset to the symbol is not larger than +/-(1MB - 64KB).
21101 If offset_within_block_p is true we allow larger offsets. */
21102 if (!(IN_RANGE (offset, -0x10000, 0x10000)
21103 || offset_within_block_p (x, offset)))
21104 return SYMBOL_FORCE_TO_MEM;
21106 return SYMBOL_TINY_ABSOLUTE;
21109 case AARCH64_CMODEL_SMALL_SPIC:
21110 case AARCH64_CMODEL_SMALL_PIC:
21111 case AARCH64_CMODEL_SMALL:
21112 if (!TARGET_PECOFF
21113 && (flag_pic || SYMBOL_REF_WEAK (x))
21114 && !aarch64_symbol_binds_local_p (x))
21115 return aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC
21116 ? SYMBOL_SMALL_GOT_28K : SYMBOL_SMALL_GOT_4G;
21118 /* Same reasoning as the tiny code model, but the offset cap here is
21119 1MB, allowing +/-3.9GB for the offset to the symbol. */
21120 if (!(IN_RANGE (offset, -0x100000, 0x100000)
21121 || offset_within_block_p (x, offset)))
21122 return SYMBOL_FORCE_TO_MEM;
21124 return SYMBOL_SMALL_ABSOLUTE;
21126 case AARCH64_CMODEL_LARGE:
21127 /* This is alright even in PIC code as the constant
21128 pool reference is always PC relative and within
21129 the same translation unit. */
21130 if (!aarch64_pcrelative_literal_loads && CONSTANT_POOL_ADDRESS_P (x))
21131 return SYMBOL_SMALL_ABSOLUTE;
21132 else
21133 return SYMBOL_FORCE_TO_MEM;
21135 default:
21136 gcc_unreachable ();
21140 /* By default push everything into the constant pool. */
21141 return SYMBOL_FORCE_TO_MEM;
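/* For illustration, under -mcmodel=tiny an address such as &foo + 0x8000
   stays SYMBOL_TINY_ABSOLUTE (the offset is within the +/-64KB cap
   above), whereas &foo + 0x200000 is forced to the constant pool unless
   the offset provably stays within foo's own block; the small model
   applies the same scheme with a +/-1MB cap.  */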
21144 bool
21145 aarch64_constant_address_p (rtx x)
21147 return (CONSTANT_P (x) && memory_address_p (DImode, x));
21150 bool
21151 aarch64_legitimate_pic_operand_p (rtx x)
21153 poly_int64 offset;
21154 x = strip_offset_and_salt (x, &offset);
21155 if (SYMBOL_REF_P (x))
21156 return false;
21158 return true;
21161 /* Implement TARGET_LEGITIMATE_CONSTANT_P hook. Return true for constants
21162 that should be rematerialized rather than spilled. */
21164 static bool
21165 aarch64_legitimate_constant_p (machine_mode mode, rtx x)
21167 /* Support CSE and rematerialization of common constants. */
21168 if (CONST_INT_P (x)
21169 || CONST_DOUBLE_P (x))
21170 return true;
21172 /* Only accept variable-length vector constants if they can be
21173 handled directly.
21175 ??? It would be possible (but complex) to handle rematerialization
21176 of other constants via secondary reloads. */
21177 if (!GET_MODE_SIZE (mode).is_constant ())
21178 return aarch64_simd_valid_mov_imm (x);
21180 /* Otherwise, accept any CONST_VECTOR that, if all else fails, can at
21181 least be forced to memory and loaded from there. */
21182 if (CONST_VECTOR_P (x))
21183 return !targetm.cannot_force_const_mem (mode, x);
21185 /* Do not allow vector struct mode constants for Advanced SIMD.
21186 We could support 0 and -1 easily, but they need support in
21187 aarch64-simd.md. */
21188 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
21189 if (vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
21190 return false;
21192 if (GET_CODE (x) == HIGH)
21193 x = XEXP (x, 0);
21195 /* Accept polynomial constants that can be calculated by using the
21196 destination of a move as the sole temporary. Constants that
21197 require a second temporary cannot be rematerialized (they can't be
21198 forced to memory and also aren't legitimate constants). */
21199 poly_int64 offset;
21200 if (poly_int_rtx_p (x, &offset))
21201 return aarch64_offset_temporaries (false, offset) <= 1;
21203 /* If an offset is being added to something else, we need to allow the
21204 base to be moved into the destination register, meaning that there
21205 are no free temporaries for the offset. */
21206 x = strip_offset_and_salt (x, &offset);
21207 if (!offset.is_constant () && aarch64_offset_temporaries (true, offset) > 0)
21208 return false;
21210 /* Do not allow const (plus (anchor_symbol, const_int)). */
21211 if (maybe_ne (offset, 0) && SYMBOL_REF_P (x) && SYMBOL_REF_ANCHOR_P (x))
21212 return false;
21214 /* Treat symbols as constants. Avoid TLS symbols as they are complex,
21215 so spilling them is better than rematerialization. */
21216 if (SYMBOL_REF_P (x) && !SYMBOL_REF_TLS_MODEL (x))
21217 return true;
21219 /* Label references are always constant. */
21220 if (LABEL_REF_P (x))
21221 return true;
21223 return false;
21227 aarch64_load_tp (rtx target)
21229 if (!target
21230 || GET_MODE (target) != Pmode
21231 || !register_operand (target, Pmode))
21232 target = gen_reg_rtx (Pmode);
21234 /* Can return in any reg. */
21235 emit_insn (gen_aarch64_load_tp_hard (target));
21236 return target;
21239 /* On AAPCS systems, this is the "struct __va_list". */
21240 static GTY(()) tree va_list_type;
21242 /* Implement TARGET_BUILD_BUILTIN_VA_LIST.
21243 Return the type to use as __builtin_va_list.
21245 AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:
21247 struct __va_list
21249 void *__stack;
21250 void *__gr_top;
21251 void *__vr_top;
21252 int __gr_offs;
21253 int __vr_offs;
21254 }; */
21256 static tree
21257 aarch64_build_builtin_va_list (void)
21259 tree va_list_name;
21260 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
21262 /* Create the type. */
21263 va_list_type = lang_hooks.types.make_type (RECORD_TYPE);
21264 /* Give it the required name. */
21265 va_list_name = build_decl (BUILTINS_LOCATION,
21266 TYPE_DECL,
21267 get_identifier ("__va_list"),
21268 va_list_type);
21269 DECL_ARTIFICIAL (va_list_name) = 1;
21270 TREE_PUBLIC (va_list_name) = 1;
21271 TYPE_NAME (va_list_type) = va_list_name;
21272 TYPE_STUB_DECL (va_list_type) = va_list_name;
21274 /* Create the fields. */
21275 f_stack = build_decl (BUILTINS_LOCATION,
21276 FIELD_DECL, get_identifier ("__stack"),
21277 ptr_type_node);
21278 f_grtop = build_decl (BUILTINS_LOCATION,
21279 FIELD_DECL, get_identifier ("__gr_top"),
21280 ptr_type_node);
21281 f_vrtop = build_decl (BUILTINS_LOCATION,
21282 FIELD_DECL, get_identifier ("__vr_top"),
21283 ptr_type_node);
21284 f_groff = build_decl (BUILTINS_LOCATION,
21285 FIELD_DECL, get_identifier ("__gr_offs"),
21286 integer_type_node);
21287 f_vroff = build_decl (BUILTINS_LOCATION,
21288 FIELD_DECL, get_identifier ("__vr_offs"),
21289 integer_type_node);
21291 /* Tell tree-stdarg pass about our internal offset fields.
21292    NOTE: va_list_gpr/fpr_counter_field are only used for tree comparison
21293    purposes, to identify whether the code is updating the va_list internal
21294    offset fields in an irregular way.  */
21295 va_list_gpr_counter_field = f_groff;
21296 va_list_fpr_counter_field = f_vroff;
21298 DECL_ARTIFICIAL (f_stack) = 1;
21299 DECL_ARTIFICIAL (f_grtop) = 1;
21300 DECL_ARTIFICIAL (f_vrtop) = 1;
21301 DECL_ARTIFICIAL (f_groff) = 1;
21302 DECL_ARTIFICIAL (f_vroff) = 1;
21304 DECL_FIELD_CONTEXT (f_stack) = va_list_type;
21305 DECL_FIELD_CONTEXT (f_grtop) = va_list_type;
21306 DECL_FIELD_CONTEXT (f_vrtop) = va_list_type;
21307 DECL_FIELD_CONTEXT (f_groff) = va_list_type;
21308 DECL_FIELD_CONTEXT (f_vroff) = va_list_type;
21310 TYPE_FIELDS (va_list_type) = f_stack;
21311 DECL_CHAIN (f_stack) = f_grtop;
21312 DECL_CHAIN (f_grtop) = f_vrtop;
21313 DECL_CHAIN (f_vrtop) = f_groff;
21314 DECL_CHAIN (f_groff) = f_vroff;
21316 /* Compute its layout. */
21317 layout_type (va_list_type);
21319 return va_list_type;
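/* A rough sketch, assuming the usual LP64 AAPCS64 layout (8-byte pointers,
   4-byte ints), of how the record built above ends up laid out; the offsets
   are inferred from the field order rather than taken from a dump:

     offset 0   void *__stack;
     offset 8   void *__gr_top;
     offset 16  void *__vr_top;
     offset 24  int   __gr_offs;
     offset 28  int   __vr_offs;
     sizeof (__va_list) == 32, alignof (__va_list) == 8  */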
21322 /* Implement TARGET_EXPAND_BUILTIN_VA_START. */
21323 static void
21324 aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)
21326 const CUMULATIVE_ARGS *cum;
21327 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
21328 tree stack, grtop, vrtop, groff, vroff;
21329 tree t;
21330 int gr_save_area_size = cfun->va_list_gpr_size;
21331 int vr_save_area_size = cfun->va_list_fpr_size;
21332 int vr_offset;
21334 cum = &crtl->args.info;
21335 if (cfun->va_list_gpr_size)
21336 gr_save_area_size = MIN ((NUM_ARG_REGS - cum->aapcs_ncrn) * UNITS_PER_WORD,
21337 cfun->va_list_gpr_size);
21338 if (cfun->va_list_fpr_size)
21339 vr_save_area_size = MIN ((NUM_FP_ARG_REGS - cum->aapcs_nvrn)
21340 * UNITS_PER_VREG, cfun->va_list_fpr_size);
21342 if (!TARGET_FLOAT)
21344 gcc_assert (cum->aapcs_nvrn == 0);
21345 vr_save_area_size = 0;
21348 f_stack = TYPE_FIELDS (va_list_type_node);
21349 f_grtop = DECL_CHAIN (f_stack);
21350 f_vrtop = DECL_CHAIN (f_grtop);
21351 f_groff = DECL_CHAIN (f_vrtop);
21352 f_vroff = DECL_CHAIN (f_groff);
21354 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), valist, f_stack,
21355 NULL_TREE);
21356 grtop = build3 (COMPONENT_REF, TREE_TYPE (f_grtop), valist, f_grtop,
21357 NULL_TREE);
21358 vrtop = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop), valist, f_vrtop,
21359 NULL_TREE);
21360 groff = build3 (COMPONENT_REF, TREE_TYPE (f_groff), valist, f_groff,
21361 NULL_TREE);
21362 vroff = build3 (COMPONENT_REF, TREE_TYPE (f_vroff), valist, f_vroff,
21363 NULL_TREE);
21365 /* Emit code to initialize STACK, which points to the next varargs stack
21366 argument. CUM->AAPCS_STACK_SIZE gives the number of stack words used
21367 by named arguments. STACK is 8-byte aligned. */
21368 t = make_tree (TREE_TYPE (stack), virtual_incoming_args_rtx);
21369 if (cum->aapcs_stack_size > 0)
21370 t = fold_build_pointer_plus_hwi (t, cum->aapcs_stack_size * UNITS_PER_WORD);
21371 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), stack, t);
21372 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
21374 /* Emit code to initialize GRTOP, the top of the GR save area.
21375 virtual_incoming_args_rtx should have been 16 byte aligned. */
21376 t = make_tree (TREE_TYPE (grtop), virtual_incoming_args_rtx);
21377 t = build2 (MODIFY_EXPR, TREE_TYPE (grtop), grtop, t);
21378 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
21380 /* Emit code to initialize VRTOP, the top of the VR save area.
21381 This address is gr_save_area_bytes below GRTOP, rounded
21382 down to the next 16-byte boundary. */
21383 t = make_tree (TREE_TYPE (vrtop), virtual_incoming_args_rtx);
21384 vr_offset = ROUND_UP (gr_save_area_size,
21385 STACK_BOUNDARY / BITS_PER_UNIT);
21387 if (vr_offset)
21388 t = fold_build_pointer_plus_hwi (t, -vr_offset);
21389 t = build2 (MODIFY_EXPR, TREE_TYPE (vrtop), vrtop, t);
21390 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
21392 /* Emit code to initialize GROFF, the offset from GRTOP of the
21393 next GPR argument. */
21394 t = build2 (MODIFY_EXPR, TREE_TYPE (groff), groff,
21395 build_int_cst (TREE_TYPE (groff), -gr_save_area_size));
21396 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
21398   /* Likewise emit code to initialize VROFF, the offset from VRTOP
21399 of the next VR argument. */
21400 t = build2 (MODIFY_EXPR, TREE_TYPE (vroff), vroff,
21401 build_int_cst (TREE_TYPE (vroff), -vr_save_area_size));
21402 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
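/* A worked example (hand-derived, not from a compiler dump) for
   "int f (int n, ...)" with TARGET_FLOAT, assuming the usual AAPCS64
   parameters: 8 general argument registers saved as 8-byte slots and
   8 FP/SIMD argument registers saved as 16-byte slots.  The single named
   argument uses x0, so aapcs_ncrn == 1 and aapcs_nvrn == 0, and (ignoring
   any shrinking done by the tree-stdarg analysis):

     gr_save_area_size = (8 - 1) * 8  = 56
     vr_save_area_size = (8 - 0) * 16 = 128

     __stack   = incoming argument pointer (no named stack arguments)
     __gr_top  = incoming argument pointer
     __vr_top  = __gr_top - ROUND_UP (56, 16), i.e. __gr_top - 64
     __gr_offs = -56
     __vr_offs = -128  */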
21405 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR. */
21407 static tree
21408 aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,
21409 gimple_seq *post_p ATTRIBUTE_UNUSED)
21411 tree addr;
21412 bool indirect_p;
21413 bool is_ha; /* is HFA or HVA. */
21414 bool dw_align; /* double-word align. */
21415 machine_mode ag_mode = VOIDmode;
21416 int nregs;
21417 machine_mode mode;
21419 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
21420 tree stack, f_top, f_off, off, arg, roundup, on_stack;
21421 HOST_WIDE_INT size, rsize, adjust, align;
21422 tree t, u, cond1, cond2;
21424 indirect_p = pass_va_arg_by_reference (type);
21425 if (indirect_p)
21426 type = build_pointer_type (type);
21428 mode = TYPE_MODE (type);
21430 f_stack = TYPE_FIELDS (va_list_type_node);
21431 f_grtop = DECL_CHAIN (f_stack);
21432 f_vrtop = DECL_CHAIN (f_grtop);
21433 f_groff = DECL_CHAIN (f_vrtop);
21434 f_vroff = DECL_CHAIN (f_groff);
21436 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), unshare_expr (valist),
21437 f_stack, NULL_TREE);
21438 size = int_size_in_bytes (type);
21440 unsigned int abi_break_gcc_9;
21441 unsigned int abi_break_gcc_13;
21442 unsigned int abi_break_gcc_14;
21443 align
21444 = aarch64_function_arg_alignment (mode, type, &abi_break_gcc_9,
21445 &abi_break_gcc_13, &abi_break_gcc_14)
21446 / BITS_PER_UNIT;
21448 dw_align = false;
21449 adjust = 0;
21450 if (aarch64_vfp_is_call_or_return_candidate (mode, type, &ag_mode, &nregs,
21451 &is_ha, false))
21453 /* No frontends can create types with variable-sized modes, so we
21454 shouldn't be asked to pass or return them. */
21455 unsigned int ag_size = GET_MODE_SIZE (ag_mode).to_constant ();
21457 /* TYPE passed in fp/simd registers. */
21458 if (!TARGET_FLOAT)
21459 aarch64_err_no_fpadvsimd (mode);
21461 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop),
21462 unshare_expr (valist), f_vrtop, NULL_TREE);
21463 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_vroff),
21464 unshare_expr (valist), f_vroff, NULL_TREE);
21466 rsize = nregs * UNITS_PER_VREG;
21468 if (is_ha)
21470 if (BYTES_BIG_ENDIAN && ag_size < UNITS_PER_VREG)
21471 adjust = UNITS_PER_VREG - ag_size;
21473 else if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
21474 && size < UNITS_PER_VREG)
21476 adjust = UNITS_PER_VREG - size;
21479 else
21481 /* TYPE passed in general registers. */
21482 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_grtop),
21483 unshare_expr (valist), f_grtop, NULL_TREE);
21484 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_groff),
21485 unshare_expr (valist), f_groff, NULL_TREE);
21486 rsize = ROUND_UP (size, UNITS_PER_WORD);
21487 nregs = rsize / UNITS_PER_WORD;
21489 if (align <= 8
21490 && abi_break_gcc_13
21491 && warn_psabi
21492 && !bitint_or_aggr_of_bitint_p (type))
21493 inform (input_location, "parameter passing for argument of type "
21494 "%qT changed in GCC 13.1", type);
21496 if (warn_psabi
21497 && abi_break_gcc_14
21498 && (abi_break_gcc_14 > 8 * BITS_PER_UNIT) != (align > 8)
21499 && !bitint_or_aggr_of_bitint_p (type))
21500 inform (input_location, "parameter passing for argument of type "
21501 "%qT changed in GCC 14.1", type);
21503 if (align > 8)
21505 if (abi_break_gcc_9
21506 && warn_psabi
21507 && !bitint_or_aggr_of_bitint_p (type))
21508 inform (input_location, "parameter passing for argument of type "
21509 "%qT changed in GCC 9.1", type);
21510 dw_align = true;
21513 if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
21514 && size < UNITS_PER_WORD)
21516 adjust = UNITS_PER_WORD - size;
21520 /* Get a local temporary for the field value. */
21521 off = get_initialized_tmp_var (f_off, pre_p, NULL);
21523 /* Emit code to branch if off >= 0. */
21524 t = build2 (GE_EXPR, boolean_type_node, off,
21525 build_int_cst (TREE_TYPE (off), 0));
21526 cond1 = build3 (COND_EXPR, ptr_type_node, t, NULL_TREE, NULL_TREE);
21528 if (dw_align)
21530 /* Emit: offs = (offs + 15) & -16. */
21531 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
21532 build_int_cst (TREE_TYPE (off), 15));
21533 t = build2 (BIT_AND_EXPR, TREE_TYPE (off), t,
21534 build_int_cst (TREE_TYPE (off), -16));
21535 roundup = build2 (MODIFY_EXPR, TREE_TYPE (off), off, t);
21537 else
21538 roundup = NULL;
21540 /* Update ap.__[g|v]r_offs */
21541 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
21542 build_int_cst (TREE_TYPE (off), rsize));
21543 t = build2 (MODIFY_EXPR, TREE_TYPE (f_off), unshare_expr (f_off), t);
21545 /* String up. */
21546 if (roundup)
21547 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
21549 /* [cond2] if (ap.__[g|v]r_offs > 0) */
21550 u = build2 (GT_EXPR, boolean_type_node, unshare_expr (f_off),
21551 build_int_cst (TREE_TYPE (f_off), 0));
21552 cond2 = build3 (COND_EXPR, ptr_type_node, u, NULL_TREE, NULL_TREE);
21554 /* String up: make sure the assignment happens before the use. */
21555 t = build2 (COMPOUND_EXPR, TREE_TYPE (cond2), t, cond2);
21556 COND_EXPR_ELSE (cond1) = t;
21558 /* Prepare the trees handling the argument that is passed on the stack;
21559      the top-level node will be stored in ON_STACK.  */
21560 arg = get_initialized_tmp_var (stack, pre_p, NULL);
21561 if (align > 8)
21563 /* if (alignof(type) > 8) (arg = arg + 15) & -16; */
21564 t = fold_build_pointer_plus_hwi (arg, 15);
21565 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
21566 build_int_cst (TREE_TYPE (t), -16));
21567 roundup = build2 (MODIFY_EXPR, TREE_TYPE (arg), arg, t);
21569 else
21570 roundup = NULL;
21571 /* Advance ap.__stack */
21572 t = fold_build_pointer_plus_hwi (arg, size + 7);
21573 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
21574 build_int_cst (TREE_TYPE (t), -8));
21575 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), unshare_expr (stack), t);
21576 /* String up roundup and advance. */
21577 if (roundup)
21578 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
21579 /* String up with arg */
21580 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), t, arg);
21581 /* Big-endianness related address adjustment. */
21582 if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
21583 && size < UNITS_PER_WORD)
21585 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (arg), arg,
21586 size_int (UNITS_PER_WORD - size));
21587 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), on_stack, t);
21590 COND_EXPR_THEN (cond1) = unshare_expr (on_stack);
21591 COND_EXPR_THEN (cond2) = unshare_expr (on_stack);
21593 /* Adjustment to OFFSET in the case of BIG_ENDIAN. */
21594 t = off;
21595 if (adjust)
21596 t = build2 (PREINCREMENT_EXPR, TREE_TYPE (off), off,
21597 build_int_cst (TREE_TYPE (off), adjust));
21599 t = fold_convert (sizetype, t);
21600 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (f_top), f_top, t);
21602 if (is_ha)
21604 /* type ha; // treat as "struct {ftype field[n];}"
21605 ... [computing offs]
21606 	 for (i = 0; i < nregs; ++i, offs += 16)
21607 ha.field[i] = *((ftype *)(ap.__vr_top + offs));
21608 return ha; */
21609 int i;
21610 tree tmp_ha, field_t, field_ptr_t;
21612 /* Declare a local variable. */
21613 tmp_ha = create_tmp_var_raw (type, "ha");
21614 gimple_add_tmp_var (tmp_ha);
21616 /* Establish the base type. */
21617 switch (ag_mode)
21619 case E_SFmode:
21620 field_t = float_type_node;
21621 field_ptr_t = float_ptr_type_node;
21622 break;
21623 case E_DFmode:
21624 field_t = double_type_node;
21625 field_ptr_t = double_ptr_type_node;
21626 break;
21627 case E_TFmode:
21628 field_t = long_double_type_node;
21629 field_ptr_t = long_double_ptr_type_node;
21630 break;
21631 case E_SDmode:
21632 field_t = dfloat32_type_node;
21633 field_ptr_t = build_pointer_type (dfloat32_type_node);
21634 break;
21635 case E_DDmode:
21636 field_t = dfloat64_type_node;
21637 field_ptr_t = build_pointer_type (dfloat64_type_node);
21638 break;
21639 case E_TDmode:
21640 field_t = dfloat128_type_node;
21641 field_ptr_t = build_pointer_type (dfloat128_type_node);
21642 break;
21643 case E_HFmode:
21644 field_t = aarch64_fp16_type_node;
21645 field_ptr_t = aarch64_fp16_ptr_type_node;
21646 break;
21647 case E_BFmode:
21648 field_t = bfloat16_type_node;
21649 field_ptr_t = aarch64_bf16_ptr_type_node;
21650 break;
21651 case E_V2SImode:
21652 case E_V4SImode:
21654 tree innertype = make_signed_type (GET_MODE_PRECISION (SImode));
21655 field_t = build_vector_type_for_mode (innertype, ag_mode);
21656 field_ptr_t = build_pointer_type (field_t);
21658 break;
21659 default:
21660 gcc_assert (0);
21663       /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area)  */
21664 TREE_ADDRESSABLE (tmp_ha) = 1;
21665 tmp_ha = build1 (ADDR_EXPR, field_ptr_t, tmp_ha);
21666 addr = t;
21667 t = fold_convert (field_ptr_t, addr);
21668 t = build2 (MODIFY_EXPR, field_t,
21669 build1 (INDIRECT_REF, field_t, tmp_ha),
21670 build1 (INDIRECT_REF, field_t, t));
21672 /* ha.field[i] = *((field_ptr_t)vr_saved_area + i) */
21673 for (i = 1; i < nregs; ++i)
21675 addr = fold_build_pointer_plus_hwi (addr, UNITS_PER_VREG);
21676 u = fold_convert (field_ptr_t, addr);
21677 u = build2 (MODIFY_EXPR, field_t,
21678 build2 (MEM_REF, field_t, tmp_ha,
21679 build_int_cst (field_ptr_t,
21680 (i *
21681 int_size_in_bytes (field_t)))),
21682 build1 (INDIRECT_REF, field_t, u));
21683 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), t, u);
21686 u = fold_convert (TREE_TYPE (f_top), tmp_ha);
21687 t = build2 (COMPOUND_EXPR, TREE_TYPE (f_top), t, u);
21690 COND_EXPR_ELSE (cond2) = t;
21691 addr = fold_convert (build_pointer_type (type), cond1);
21692 addr = build_va_arg_indirect_ref (addr);
21694 if (indirect_p)
21695 addr = build_va_arg_indirect_ref (addr);
21697 return addr;
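/* A rough C-level paraphrase of the tree built above for a non-HFA argument
   passed in general registers (the FP/SIMD path is analogous, using the
   __vr_* fields and 16-byte slots); simplified and untested.  RSIZE is the
   argument size rounded up to a multiple of 8 bytes:

     off = ap.__gr_offs;
     if (off >= 0)
       goto on_stack;                    // register area already exhausted
     if (alignof (type) > 8)
       off = (off + 15) & -16;           // dw_align case
     ap.__gr_offs = off + rsize;
     if (ap.__gr_offs > 0)
       goto on_stack;                    // this argument spills to the stack
     addr = ap.__gr_top + off;           // plus a big-endian adjustment
     goto done;

   on_stack:
     arg = ap.__stack;
     if (alignof (type) > 8)
       arg = (arg + 15) & -16;
     ap.__stack = (arg + size + 7) & -8;
     addr = arg;                         // plus a big-endian adjustment
   done:
     result = *(type *) addr;            // or one more indirection if the
                                         // argument is passed by reference  */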
21700 /* Implement TARGET_SETUP_INCOMING_VARARGS. */
21702 static void
21703 aarch64_setup_incoming_varargs (cumulative_args_t cum_v,
21704 const function_arg_info &arg,
21705 int *pretend_size ATTRIBUTE_UNUSED, int no_rtl)
21707 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
21708 CUMULATIVE_ARGS local_cum;
21709 int gr_saved = cfun->va_list_gpr_size;
21710 int vr_saved = cfun->va_list_fpr_size;
21712 /* The caller has advanced CUM up to, but not beyond, the last named
21713 argument. Advance a local copy of CUM past the last "real" named
21714 argument, to find out how many registers are left over. */
21715 local_cum = *cum;
21716 if (!TYPE_NO_NAMED_ARGS_STDARG_P (TREE_TYPE (current_function_decl)))
21717     aarch64_function_arg_advance (pack_cumulative_args (&local_cum), arg);
21719   /* Find out how many registers we need to save.
21720      Honor the tree-stdarg analysis results.  */
21721 if (cfun->va_list_gpr_size)
21722 gr_saved = MIN (NUM_ARG_REGS - local_cum.aapcs_ncrn,
21723 cfun->va_list_gpr_size / UNITS_PER_WORD);
21724 if (cfun->va_list_fpr_size)
21725 vr_saved = MIN (NUM_FP_ARG_REGS - local_cum.aapcs_nvrn,
21726 cfun->va_list_fpr_size / UNITS_PER_VREG);
21728 if (!TARGET_FLOAT)
21730 gcc_assert (local_cum.aapcs_nvrn == 0);
21731 vr_saved = 0;
21734 if (!no_rtl)
21736 if (gr_saved > 0)
21738 rtx ptr, mem;
21740 /* virtual_incoming_args_rtx should have been 16-byte aligned. */
21741 ptr = plus_constant (Pmode, virtual_incoming_args_rtx,
21742 - gr_saved * UNITS_PER_WORD);
21743 mem = gen_frame_mem (BLKmode, ptr);
21744 set_mem_alias_set (mem, get_varargs_alias_set ());
21746 move_block_from_reg (local_cum.aapcs_ncrn + R0_REGNUM,
21747 mem, gr_saved);
21749 if (vr_saved > 0)
21751 /* We can't use move_block_from_reg, because it will use
21752 the wrong mode, storing D regs only. */
21753 machine_mode mode = TImode;
21754 int off, i, vr_start;
21756 /* Set OFF to the offset from virtual_incoming_args_rtx of
21757 the first vector register. The VR save area lies below
21758 the GR one, and is aligned to 16 bytes. */
21759 off = -ROUND_UP (gr_saved * UNITS_PER_WORD,
21760 STACK_BOUNDARY / BITS_PER_UNIT);
21761 off -= vr_saved * UNITS_PER_VREG;
21763 vr_start = V0_REGNUM + local_cum.aapcs_nvrn;
21764 for (i = 0; i < vr_saved; ++i)
21766 rtx ptr, mem;
21768 ptr = plus_constant (Pmode, virtual_incoming_args_rtx, off);
21769 mem = gen_frame_mem (mode, ptr);
21770 set_mem_alias_set (mem, get_varargs_alias_set ());
21771 aarch64_emit_move (mem, gen_rtx_REG (mode, vr_start + i));
21772 off += UNITS_PER_VREG;
21777 /* We don't save the size into *PRETEND_SIZE because we want to avoid
21778 any complication of having crtl->args.pretend_args_size changed. */
21779 cfun->machine->frame.saved_varargs_size
21780 = (ROUND_UP (gr_saved * UNITS_PER_WORD,
21781 STACK_BOUNDARY / BITS_PER_UNIT)
21782 + vr_saved * UNITS_PER_VREG);
21785 static void
21786 aarch64_conditional_register_usage (void)
21788 int i;
21789 if (!TARGET_FLOAT)
21791 for (i = V0_REGNUM; i <= V31_REGNUM; i++)
21793 fixed_regs[i] = 1;
21794 call_used_regs[i] = 1;
21795 CLEAR_HARD_REG_BIT (operand_reg_set, i);
21798 if (!TARGET_SVE)
21799 for (i = P0_REGNUM; i <= P15_REGNUM; i++)
21801 fixed_regs[i] = 1;
21802 call_used_regs[i] = 1;
21805 /* Only allow these registers to be accessed via special patterns. */
21806 CLEAR_HARD_REG_BIT (operand_reg_set, VG_REGNUM);
21807 CLEAR_HARD_REG_BIT (operand_reg_set, FFR_REGNUM);
21808 CLEAR_HARD_REG_BIT (operand_reg_set, FFRT_REGNUM);
21809 for (int i = FIRST_FAKE_REGNUM; i <= LAST_FAKE_REGNUM; ++i)
21810 CLEAR_HARD_REG_BIT (operand_reg_set, i);
21812 /* When tracking speculation, we need a couple of call-clobbered registers
21813 to track the speculation state. It would be nice to just use
21814 IP0 and IP1, but currently there are numerous places that just
21815 assume these registers are free for other uses (eg pointer
21816 authentication). */
21817 if (aarch64_track_speculation)
21819 fixed_regs[SPECULATION_TRACKER_REGNUM] = 1;
21820 call_used_regs[SPECULATION_TRACKER_REGNUM] = 1;
21821 fixed_regs[SPECULATION_SCRATCH_REGNUM] = 1;
21822 call_used_regs[SPECULATION_SCRATCH_REGNUM] = 1;
21826 /* Implement TARGET_MEMBER_TYPE_FORCES_BLK. */
21828 bool
21829 aarch64_member_type_forces_blk (const_tree field_or_array, machine_mode mode)
21831 /* For records we're passed a FIELD_DECL, for arrays we're passed
21832 an ARRAY_TYPE. In both cases we're interested in the TREE_TYPE. */
21833 const_tree type = TREE_TYPE (field_or_array);
21835 /* Assign BLKmode to anything that contains more than 2 SVE predicates.
21836 For structures, the "multiple" case is indicated by MODE being
21837 VOIDmode. */
21838 unsigned int num_zr, num_pr;
21839 if (aarch64_sve::builtin_type_p (type, &num_zr, &num_pr) && num_pr > 2)
21841 if (TREE_CODE (field_or_array) == ARRAY_TYPE)
21842 return !simple_cst_equal (TYPE_SIZE (field_or_array),
21843 TYPE_SIZE (type));
21844 return mode == VOIDmode;
21847 return default_member_type_forces_blk (field_or_array, mode);
21850 /* Bitmasks that indicate whether earlier versions of GCC would have
21851 taken a different path through the ABI logic. This should result in
21852 a -Wpsabi warning if the earlier path led to a different ABI decision.
21854 WARN_PSABI_EMPTY_CXX17_BASE
21855 Indicates that the type includes an artificial empty C++17 base field
21856 that, prior to GCC 10.1, would prevent the type from being treated as
21857 a HFA or HVA. See PR94383 for details.
21859 WARN_PSABI_NO_UNIQUE_ADDRESS
21860 Indicates that the type includes an empty [[no_unique_address]] field
21861 that, prior to GCC 10.1, would prevent the type from being treated as
21862 a HFA or HVA. */
21863 const unsigned int WARN_PSABI_EMPTY_CXX17_BASE = 1U << 0;
21864 const unsigned int WARN_PSABI_NO_UNIQUE_ADDRESS = 1U << 1;
21865 const unsigned int WARN_PSABI_ZERO_WIDTH_BITFIELD = 1U << 2;
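/* Hedged illustrations of the three cases, based on the descriptions in the
   surrounding comments rather than on testsuite output (the struct names are
   invented):

     struct empty {};
     struct s1 : empty { float x, y; };
       // artificial empty C++17 base: treated as an HFA again since GCC 10.1
       // (PR94383); WARN_PSABI_EMPTY_CXX17_BASE

     struct s2 { [[no_unique_address]] empty e; float x, y; };
       // empty [[no_unique_address]] member: likewise changed in GCC 10.1;
       // WARN_PSABI_NO_UNIQUE_ADDRESS

     struct s3 { float x; int : 0; float y; };
       // zero-width bit-field: WARN_PSABI_ZERO_WIDTH_BITFIELD, argument
       // passing changed in GCC 12.1 unless the C++ front end has already
       // stripped the field  */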
21867 /* Walk down the type tree of TYPE counting consecutive base elements.
21868 If *MODEP is VOIDmode, then set it to the first valid floating point
21869 type. If a non-floating point type is found, or if a floating point
21870 type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
21871 otherwise return the count in the sub-tree.
21873 The WARN_PSABI_FLAGS argument allows the caller to check whether this
21874 function has changed its behavior relative to earlier versions of GCC.
21875 Normally the argument should be nonnull and point to a zero-initialized
21876 variable. The function then records whether the ABI decision might
21877 be affected by a known fix to the ABI logic, setting the associated
21878 WARN_PSABI_* bits if so.
21880 When the argument is instead a null pointer, the function tries to
21881 simulate the behavior of GCC before all such ABI fixes were made.
21882 This is useful to check whether the function returns something
21883 different after the ABI fixes. */
21884 static int
21885 aapcs_vfp_sub_candidate (const_tree type, machine_mode *modep,
21886 unsigned int *warn_psabi_flags)
21888 machine_mode mode;
21889 HOST_WIDE_INT size;
21891 if (aarch64_sve::builtin_type_p (type))
21892 return -1;
21894 switch (TREE_CODE (type))
21896 case REAL_TYPE:
21897 mode = TYPE_MODE (type);
21898 if (mode != DFmode && mode != SFmode
21899 && mode != TFmode && mode != HFmode
21900 && mode != SDmode && mode != DDmode && mode != TDmode)
21901 return -1;
21903 if (*modep == VOIDmode)
21904 *modep = mode;
21906 if (*modep == mode)
21907 return 1;
21909 break;
21911 case COMPLEX_TYPE:
21912 mode = TYPE_MODE (TREE_TYPE (type));
21913 if (mode != DFmode && mode != SFmode
21914 && mode != TFmode && mode != HFmode)
21915 return -1;
21917 if (*modep == VOIDmode)
21918 *modep = mode;
21920 if (*modep == mode)
21921 return 2;
21923 break;
21925 case VECTOR_TYPE:
21926 /* Use V2SImode and V4SImode as representatives of all 64-bit
21927 and 128-bit vector types. */
21928 size = int_size_in_bytes (type);
21929 switch (size)
21931 case 8:
21932 mode = V2SImode;
21933 break;
21934 case 16:
21935 mode = V4SImode;
21936 break;
21937 default:
21938 return -1;
21941 if (*modep == VOIDmode)
21942 *modep = mode;
21944 /* Vector modes are considered to be opaque: two vectors are
21945 equivalent for the purposes of being homogeneous aggregates
21946 if they are the same size. */
21947 if (*modep == mode)
21948 return 1;
21950 break;
21952 case ARRAY_TYPE:
21954 int count;
21955 tree index = TYPE_DOMAIN (type);
21957 /* Can't handle incomplete types nor sizes that are not
21958 fixed. */
21959 if (!COMPLETE_TYPE_P (type)
21960 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
21961 return -1;
21963 count = aapcs_vfp_sub_candidate (TREE_TYPE (type), modep,
21964 warn_psabi_flags);
21965 if (count == -1
21966 || !index
21967 || !TYPE_MAX_VALUE (index)
21968 || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index))
21969 || !TYPE_MIN_VALUE (index)
21970 || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index))
21971 || count < 0)
21972 return -1;
21974 count *= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index))
21975 - tree_to_uhwi (TYPE_MIN_VALUE (index)));
21977 /* There must be no padding. */
21978 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
21979 count * GET_MODE_BITSIZE (*modep)))
21980 return -1;
21982 return count;
21985 case RECORD_TYPE:
21987 int count = 0;
21988 int sub_count;
21989 tree field;
21991 /* Can't handle incomplete types nor sizes that are not
21992 fixed. */
21993 if (!COMPLETE_TYPE_P (type)
21994 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
21995 return -1;
21997 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
21999 if (TREE_CODE (field) != FIELD_DECL)
22000 continue;
22002 if (DECL_FIELD_ABI_IGNORED (field))
22004 /* See whether this is something that earlier versions of
22005 GCC failed to ignore. */
22006 unsigned int flag;
22007 if (lookup_attribute ("no_unique_address",
22008 DECL_ATTRIBUTES (field)))
22009 flag = WARN_PSABI_NO_UNIQUE_ADDRESS;
22010 else if (cxx17_empty_base_field_p (field))
22011 flag = WARN_PSABI_EMPTY_CXX17_BASE;
22012 else
22013 /* No compatibility problem. */
22014 continue;
22016 /* Simulate the old behavior when WARN_PSABI_FLAGS is null. */
22017 if (warn_psabi_flags)
22019 *warn_psabi_flags |= flag;
22020 continue;
22023 /* A zero-width bitfield may affect layout in some
22024 circumstances, but adds no members. The determination
22025 of whether or not a type is an HFA is performed after
22026 layout is complete, so if the type still looks like an
22027 HFA afterwards, it is still classed as one. This is
22028 potentially an ABI break for the hard-float ABI. */
22029 else if (DECL_BIT_FIELD (field)
22030 && integer_zerop (DECL_SIZE (field)))
22032 		/* Prior to GCC 12 these fields were stripped early,
22033 hiding them from the back-end entirely and
22034 resulting in the correct behaviour for argument
22035 passing. Simulate that old behaviour without
22036 generating a warning. */
22037 if (DECL_FIELD_CXX_ZERO_WIDTH_BIT_FIELD (field))
22038 continue;
22039 if (warn_psabi_flags)
22041 *warn_psabi_flags |= WARN_PSABI_ZERO_WIDTH_BITFIELD;
22042 continue;
22046 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep,
22047 warn_psabi_flags);
22048 if (sub_count < 0)
22049 return -1;
22050 count += sub_count;
22053 /* There must be no padding. */
22054 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
22055 count * GET_MODE_BITSIZE (*modep)))
22056 return -1;
22058 return count;
22061 case UNION_TYPE:
22062 case QUAL_UNION_TYPE:
22064 /* These aren't very interesting except in a degenerate case. */
22065 int count = 0;
22066 int sub_count;
22067 tree field;
22069 /* Can't handle incomplete types nor sizes that are not
22070 fixed. */
22071 if (!COMPLETE_TYPE_P (type)
22072 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
22073 return -1;
22075 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
22077 if (TREE_CODE (field) != FIELD_DECL)
22078 continue;
22080 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep,
22081 warn_psabi_flags);
22082 if (sub_count < 0)
22083 return -1;
22084 count = count > sub_count ? count : sub_count;
22087 /* There must be no padding. */
22088 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
22089 count * GET_MODE_BITSIZE (*modep)))
22090 return -1;
22092 return count;
22095 default:
22096 break;
22099 return -1;
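/* A few hand-worked examples of the counting above (derived from the code,
   not verified against a compiler run):

     struct { float x, y, z; }      -> 3, *modep == SFmode   (HFA)
     struct { double d[4]; }        -> 4, *modep == DFmode   (HFA)
     _Complex double                -> 2, *modep == DFmode
     struct { float32x4_t a, b; }   -> 2, *modep == V4SImode (HVA)
     struct { float f; double d; }  -> -1 (mixed base types)
     struct { float f[5]; }         -> 5, but rejected by the caller because
                                       it exceeds HA_MAX_NUM_FLDS  */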
22102 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
22103 type as described in AAPCS64 \S 4.1.2.
22105 See the comment above aarch64_composite_type_p for the notes on MODE. */
22107 static bool
22108 aarch64_short_vector_p (const_tree type,
22109 machine_mode mode)
22111 poly_int64 size = -1;
22113 if (type && VECTOR_TYPE_P (type))
22115 if (aarch64_sve::builtin_type_p (type))
22116 return false;
22117 size = int_size_in_bytes (type);
22119 else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
22120 || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
22122 /* The containing "else if" is too loose: it means that we look at TYPE
22123 if the type is a vector type (good), but that we otherwise ignore TYPE
22124 and look only at the mode. This is wrong because the type describes
22125 the language-level information whereas the mode is purely an internal
22126 GCC concept. We can therefore reach here for types that are not
22127 vectors in the AAPCS64 sense.
22129 We can't "fix" that for the traditional Advanced SIMD vector modes
22130 without breaking backwards compatibility. However, there's no such
22131 baggage for the structure modes, which were introduced in GCC 12. */
22132 if (aarch64_advsimd_struct_mode_p (mode))
22133 return false;
22135 /* For similar reasons, rely only on the type, not the mode, when
22136 processing SVE types. */
22137 if (type && aarch64_some_values_include_pst_objects_p (type))
22138 /* Leave later code to report an error if SVE is disabled. */
22139 gcc_assert (!TARGET_SVE || aarch64_sve_mode_p (mode));
22140 else
22141 size = GET_MODE_SIZE (mode);
22143 if (known_eq (size, 8) || known_eq (size, 16))
22145 /* 64-bit and 128-bit vectors should only acquire an SVE mode if
22146 they are being treated as scalable AAPCS64 types. */
22147 gcc_assert (!aarch64_sve_mode_p (mode)
22148 && !aarch64_advsimd_struct_mode_p (mode));
22149 return true;
22151 return false;
22154 /* Return TRUE if the type, as described by TYPE and MODE, is a composite
22155 type as described in AAPCS64 \S 4.3. This includes aggregate, union and
22156 array types. The C99 floating-point complex types are also considered
22157 as composite types, according to AAPCS64 \S 7.1.1. The complex integer
22158 types, which are GCC extensions and out of the scope of AAPCS64, are
22159 treated as composite types here as well.
22161 Note that MODE itself is not sufficient in determining whether a type
22162 is such a composite type or not. This is because
22163 stor-layout.cc:compute_record_mode may have already changed the MODE
22164 (BLKmode) of a RECORD_TYPE TYPE to some other mode. For example, a
22165 structure with only one field may have its MODE set to the mode of the
22166 field. Also an integer mode whose size matches the size of the
22167 RECORD_TYPE type may be used to substitute the original mode
22168 (i.e. BLKmode) in certain circumstances. In other words, MODE cannot be
22169 solely relied on. */
22171 static bool
22172 aarch64_composite_type_p (const_tree type,
22173 machine_mode mode)
22175 if (aarch64_short_vector_p (type, mode))
22176 return false;
22178 if (type && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE))
22179 return true;
22181 if (type
22182 && TREE_CODE (type) == BITINT_TYPE
22183 && int_size_in_bytes (type) > 16)
22184 return true;
22186 if (mode == BLKmode
22187 || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT
22188 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
22189 return true;
22191 return false;
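/* Rough examples of the classification above (restated from the checks, not
   from a test run):

     struct { int x; }   -> composite (aggregate, even if compute_record_mode
                            gave it an integer mode such as SImode)
     int [4]             -> composite (array)
     _Complex float      -> composite (AAPCS64 \S 7.1.1)
     _Complex int        -> composite (GCC extension, treated the same)
     _BitInt(256)        -> composite (wider than 16 bytes)
     float32x4_t         -> not composite (short vector)
     double              -> not composite  */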
22194 /* Return TRUE if an argument, whose type is described by TYPE and MODE,
22195 shall be passed or returned in simd/fp register(s) (providing these
22196 parameter passing registers are available).
22198 Upon successful return, *COUNT returns the number of needed registers,
22199 *BASE_MODE returns the mode of the individual register and when IS_HA
22200 is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
22201 floating-point aggregate or a homogeneous short-vector aggregate.
22203 SILENT_P is true if the function should refrain from reporting any
22204 diagnostics. This should only be used if the caller is certain that
22205 any ABI decisions would eventually come through this function with
22206 SILENT_P set to false. */
22208 static bool
22209 aarch64_vfp_is_call_or_return_candidate (machine_mode mode,
22210 const_tree type,
22211 machine_mode *base_mode,
22212 int *count,
22213 bool *is_ha,
22214 bool silent_p)
22216 if (is_ha != NULL) *is_ha = false;
22218 machine_mode new_mode = VOIDmode;
22219 bool composite_p = aarch64_composite_type_p (type, mode);
22221 if ((!composite_p
22222 && (GET_MODE_CLASS (mode) == MODE_FLOAT
22223 || GET_MODE_CLASS (mode) == MODE_DECIMAL_FLOAT
22224 || (type && TYPE_MAIN_VARIANT (type) == aarch64_mfp8_type_node)))
22225 || aarch64_short_vector_p (type, mode))
22227 *count = 1;
22228 new_mode = mode;
22230 else if (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT)
22232 if (is_ha != NULL) *is_ha = true;
22233 *count = 2;
22234 new_mode = GET_MODE_INNER (mode);
22236 else if (type && composite_p)
22238 unsigned int warn_psabi_flags = 0;
22239 int ag_count = aapcs_vfp_sub_candidate (type, &new_mode,
22240 &warn_psabi_flags);
22241 if (ag_count > 0 && ag_count <= HA_MAX_NUM_FLDS)
22243 static unsigned last_reported_type_uid;
22244 unsigned uid = TYPE_UID (TYPE_MAIN_VARIANT (type));
22245 int alt;
22246 if (!silent_p
22247 && warn_psabi
22248 && warn_psabi_flags
22249 && uid != last_reported_type_uid
22250 && ((alt = aapcs_vfp_sub_candidate (type, &new_mode, NULL))
22251 != ag_count))
22253 const char *url10
22254 = CHANGES_ROOT_URL "gcc-10/changes.html#empty_base";
22255 const char *url12
22256 = CHANGES_ROOT_URL "gcc-12/changes.html#zero_width_bitfields";
22257 gcc_assert (alt == -1);
22258 last_reported_type_uid = uid;
22259 /* Use TYPE_MAIN_VARIANT to strip any redundant const
22260 qualification. */
22261 if (warn_psabi_flags & WARN_PSABI_NO_UNIQUE_ADDRESS)
22262 inform (input_location, "parameter passing for argument of "
22263 "type %qT with %<[[no_unique_address]]%> members "
22264 "changed %{in GCC 10.1%}",
22265 TYPE_MAIN_VARIANT (type), url10);
22266 else if (warn_psabi_flags & WARN_PSABI_EMPTY_CXX17_BASE)
22267 inform (input_location, "parameter passing for argument of "
22268 "type %qT when C++17 is enabled changed to match "
22269 "C++14 %{in GCC 10.1%}",
22270 TYPE_MAIN_VARIANT (type), url10);
22271 else if (warn_psabi_flags & WARN_PSABI_ZERO_WIDTH_BITFIELD)
22272 inform (input_location, "parameter passing for argument of "
22273 "type %qT changed %{in GCC 12.1%}",
22274 TYPE_MAIN_VARIANT (type), url12);
22277 if (is_ha != NULL) *is_ha = true;
22278 *count = ag_count;
22280 else
22281 return false;
22283 else
22284 return false;
22286 gcc_assert (!aarch64_sve_mode_p (new_mode));
22287 *base_mode = new_mode;
22288 return true;
22291 /* Implement TARGET_STRUCT_VALUE_RTX. */
22293 static rtx
22294 aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED,
22295 int incoming ATTRIBUTE_UNUSED)
22297 return gen_rtx_REG (Pmode, AARCH64_STRUCT_VALUE_REGNUM);
22300 /* Implements target hook vector_mode_supported_p. */
22301 static bool
22302 aarch64_vector_mode_supported_p (machine_mode mode)
22304 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
22305 return vec_flags != 0 && (vec_flags & VEC_STRUCT) == 0;
22308 /* Implements target hook vector_mode_supported_any_target_p. */
22309 static bool
22310 aarch64_vector_mode_supported_any_target_p (machine_mode mode)
22312 unsigned int vec_flags = aarch64_classify_vector_mode (mode, true);
22313 return vec_flags != 0 && (vec_flags & VEC_STRUCT) == 0;
22316 /* Return the full-width SVE vector mode for element mode MODE, if one
22317 exists. */
22318 opt_machine_mode
22319 aarch64_full_sve_mode (scalar_mode mode)
22321 switch (mode)
22323 case E_DFmode:
22324 return VNx2DFmode;
22325 case E_SFmode:
22326 return VNx4SFmode;
22327 case E_HFmode:
22328 return VNx8HFmode;
22329 case E_BFmode:
22330 return VNx8BFmode;
22331 case E_DImode:
22332 return VNx2DImode;
22333 case E_SImode:
22334 return VNx4SImode;
22335 case E_HImode:
22336 return VNx8HImode;
22337 case E_QImode:
22338 return VNx16QImode;
22339 default:
22340 return opt_machine_mode ();
22344 /* Return the 64-bit Advanced SIMD vector mode for element mode MODE,
22345 if it exists. */
22346 opt_machine_mode
22347 aarch64_v64_mode (scalar_mode mode)
22349 switch (mode)
22351 case E_SFmode:
22352 return V2SFmode;
22353 case E_HFmode:
22354 return V4HFmode;
22355 case E_BFmode:
22356 return V4BFmode;
22357 case E_SImode:
22358 return V2SImode;
22359 case E_HImode:
22360 return V4HImode;
22361 case E_QImode:
22362 return V8QImode;
22363 default:
22364 return {};
22368 /* Return the 128-bit Advanced SIMD vector mode for element mode MODE,
22369 if it exists. */
22370 opt_machine_mode
22371 aarch64_v128_mode (scalar_mode mode)
22373 switch (mode)
22375 case E_DFmode:
22376 return V2DFmode;
22377 case E_SFmode:
22378 return V4SFmode;
22379 case E_HFmode:
22380 return V8HFmode;
22381 case E_BFmode:
22382 return V8BFmode;
22383 case E_SImode:
22384 return V4SImode;
22385 case E_HImode:
22386 return V8HImode;
22387 case E_QImode:
22388 return V16QImode;
22389 case E_DImode:
22390 return V2DImode;
22391 default:
22392 return opt_machine_mode ();
22396 /* Return appropriate SIMD container
22397 for MODE within a vector of WIDTH bits. */
22398 static machine_mode
22399 aarch64_simd_container_mode (scalar_mode mode, poly_int64 width)
22401 if (TARGET_SVE
22402 && maybe_ne (width, 128)
22403 && known_eq (width, BITS_PER_SVE_VECTOR))
22404 return aarch64_full_sve_mode (mode).else_mode (word_mode);
22406 gcc_assert (known_eq (width, 64) || known_eq (width, 128));
22407 if (TARGET_BASE_SIMD)
22409 if (known_eq (width, 128))
22410 return aarch64_v128_mode (mode).else_mode (word_mode);
22411 else
22412 return aarch64_v64_mode (mode).else_mode (word_mode);
22414 return word_mode;
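/* Example results, hand-derived from the mode tables above:

     aarch64_simd_container_mode (SFmode, 128) -> V4SFmode
     aarch64_simd_container_mode (HFmode, 64)  -> V4HFmode
     aarch64_simd_container_mode (DImode, 64)  -> word_mode (aarch64_v64_mode
                                                  has no DImode entry)
     with TARGET_SVE and WIDTH == BITS_PER_SVE_VECTOR (and WIDTH != 128):
     aarch64_simd_container_mode (SImode, ...) -> VNx4SImode  */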
22417 /* Compare an SVE mode SVE_M and an Advanced SIMD mode ASIMD_M
22418 and return whether the SVE mode should be preferred over the
22419 Advanced SIMD one in aarch64_autovectorize_vector_modes. */
22420 static bool
22421 aarch64_cmp_autovec_modes (machine_mode sve_m, machine_mode asimd_m)
22423 /* Take into account the aarch64-autovec-preference param if non-zero. */
22424 bool only_asimd_p = aarch64_autovec_preference == AARCH64_AUTOVEC_ASIMD_ONLY;
22425 bool only_sve_p = aarch64_autovec_preference == AARCH64_AUTOVEC_SVE_ONLY;
22427 if (only_asimd_p)
22428 return false;
22429 if (only_sve_p)
22430 return true;
22432 /* The preference in case of a tie in costs. */
22433 bool prefer_asimd = aarch64_autovec_preference == AARCH64_AUTOVEC_PREFER_ASIMD;
22434 bool prefer_sve = aarch64_autovec_preference == AARCH64_AUTOVEC_PREFER_SVE;
22436 poly_int64 nunits_sve = GET_MODE_NUNITS (sve_m);
22437 poly_int64 nunits_asimd = GET_MODE_NUNITS (asimd_m);
22438 /* If the CPU information does not have an SVE width registered use the
22439 generic poly_int comparison that prefers SVE. If a preference is
22440 explicitly requested avoid this path. */
22441 if (aarch64_tune_params.sve_width == SVE_SCALABLE
22442 && !prefer_asimd
22443 && !prefer_sve)
22444 return maybe_gt (nunits_sve, nunits_asimd);
22446 /* Otherwise estimate the runtime width of the modes involved. */
22447 HOST_WIDE_INT est_sve = estimated_poly_value (nunits_sve);
22448 HOST_WIDE_INT est_asimd = estimated_poly_value (nunits_asimd);
22450 /* Preferring SVE means picking it first unless the Advanced SIMD mode
22451 is clearly wider. */
22452 if (prefer_sve)
22453 return est_sve >= est_asimd;
22454 /* Conversely, preferring Advanced SIMD means picking SVE only if SVE
22455 is clearly wider. */
22456 if (prefer_asimd)
22457 return est_sve > est_asimd;
22459 /* In the default case prefer Advanced SIMD over SVE in case of a tie. */
22460 return est_sve > est_asimd;
22463 /* Return 128-bit container as the preferred SIMD mode for MODE. */
22464 static machine_mode
22465 aarch64_preferred_simd_mode (scalar_mode mode)
22467 /* Take into account explicit auto-vectorization ISA preferences through
22468 aarch64_cmp_autovec_modes. */
22469 if (TARGET_SVE && aarch64_cmp_autovec_modes (VNx16QImode, V16QImode))
22470 return aarch64_full_sve_mode (mode).else_mode (word_mode);
22471 if (TARGET_SIMD)
22472 return aarch64_v128_mode (mode).else_mode (word_mode);
22473 return word_mode;
22476 /* Return a list of possible vector sizes for the vectorizer
22477 to iterate over. */
22478 static unsigned int
22479 aarch64_autovectorize_vector_modes (vector_modes *modes, bool)
22481 static const machine_mode sve_modes[] = {
22482 /* Try using full vectors for all element types. */
22483 VNx16QImode,
22485 /* Try using 16-bit containers for 8-bit elements and full vectors
22486 for wider elements. */
22487 VNx8QImode,
22489 /* Try using 32-bit containers for 8-bit and 16-bit elements and
22490 full vectors for wider elements. */
22491 VNx4QImode,
22493 /* Try using 64-bit containers for all element types. */
22494 VNx2QImode
22497 static const machine_mode advsimd_modes[] = {
22498 /* Try using 128-bit vectors for all element types. */
22499 V16QImode,
22501 /* Try using 64-bit vectors for 8-bit elements and 128-bit vectors
22502 for wider elements. */
22503 V8QImode,
22505 /* Try using 64-bit vectors for 16-bit elements and 128-bit vectors
22506 for wider elements.
22508 TODO: We could support a limited form of V4QImode too, so that
22509 we use 32-bit vectors for 8-bit elements. */
22510 V4HImode,
22512 /* Try using 64-bit vectors for 32-bit elements and 128-bit vectors
22513 for 64-bit elements.
22515 TODO: We could similarly support limited forms of V2QImode and V2HImode
22516 for this case. */
22517 V2SImode
22520 /* Try using N-byte SVE modes only after trying N-byte Advanced SIMD mode.
22521 This is because:
22523 - If we can't use N-byte Advanced SIMD vectors then the placement
22524 doesn't matter; we'll just continue as though the Advanced SIMD
22525 entry didn't exist.
22527 - If an SVE main loop with N bytes ends up being cheaper than an
22528 Advanced SIMD main loop with N bytes then by default we'll replace
22529 the Advanced SIMD version with the SVE one.
22531 - If an Advanced SIMD main loop with N bytes ends up being cheaper
22532 than an SVE main loop with N bytes then by default we'll try to
22533 use the SVE loop to vectorize the epilogue instead. */
22535 bool only_asimd_p = aarch64_autovec_preference == AARCH64_AUTOVEC_ASIMD_ONLY;
22536 bool only_sve_p = aarch64_autovec_preference == AARCH64_AUTOVEC_SVE_ONLY;
22538 unsigned int sve_i = (TARGET_SVE && !only_asimd_p) ? 0 : ARRAY_SIZE (sve_modes);
22539 unsigned int advsimd_i = 0;
22541 while (!only_sve_p && advsimd_i < ARRAY_SIZE (advsimd_modes))
22543 if (sve_i < ARRAY_SIZE (sve_modes)
22544 && aarch64_cmp_autovec_modes (sve_modes[sve_i],
22545 advsimd_modes[advsimd_i]))
22546 modes->safe_push (sve_modes[sve_i++]);
22547 else
22548 modes->safe_push (advsimd_modes[advsimd_i++]);
22550 while (sve_i < ARRAY_SIZE (sve_modes))
22551 modes->safe_push (sve_modes[sve_i++]);
22553 unsigned int flags = 0;
22554 if (aarch64_vect_compare_costs)
22555 flags |= VECT_COMPARE_COSTS;
22556 return flags;
22559 /* Implement TARGET_MANGLE_TYPE. */
22561 static const char *
22562 aarch64_mangle_type (const_tree type)
22564 /* The AArch64 ABI documents say that "__va_list" has to be
22565 mangled as if it is in the "std" namespace. */
22566 if (lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type))
22567 return "St9__va_list";
22569 /* Half-precision floating point types. */
22570 if (SCALAR_FLOAT_TYPE_P (type) && TYPE_PRECISION (type) == 16)
22572 if (TYPE_MAIN_VARIANT (type) == float16_type_node)
22573 return NULL;
22574 if (TYPE_MODE (type) == BFmode)
22575 return "u6__bf16";
22576 else
22577 return "Dh";
22580 /* Modal 8 bit floating point types. */
22581 if (TYPE_MAIN_VARIANT (type) == aarch64_mfp8_type_node)
22582 return "u6__mfp8";
22584 /* Mangle AArch64-specific internal types. TYPE_NAME is non-NULL_TREE for
22585 builtin types. */
22586 if (TYPE_NAME (type) != NULL)
22588 const char *res;
22589 if ((res = aarch64_general_mangle_builtin_type (type))
22590 || (res = aarch64_sve::mangle_builtin_type (type)))
22591 return res;
22594 /* Use the default mangling. */
22595 return NULL;
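/* Mangling examples implied by the checks above, written out with the usual
   Itanium C++ ABI prefix for "void f (T)"; not verified against cc1plus
   output:

     void f (__va_list);  ->  _Z1fSt9__va_list  (mangled as std::__va_list)
     void f (__fp16);     ->  _Z1fDh
     void f (_Float16);   ->  default mangling (NULL returned above)
     void f (__bf16);     ->  _Z1fu6__bf16
     void f (__mfp8);     ->  _Z1fu6__mfp8  */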
22598 /* Implement TARGET_INVALID_CONVERSION. */
22600 static const char *
22601 aarch64_invalid_conversion (const_tree fromtype, const_tree totype)
22603 /* Do not allow conversions to/from FP8. But do allow conversions between
22604 volatile and const variants of __mfp8. */
22605 bool fromtype_is_fp8
22606 = (TYPE_MAIN_VARIANT (fromtype) == aarch64_mfp8_type_node);
22607 bool totype_is_fp8 = (TYPE_MAIN_VARIANT (totype) == aarch64_mfp8_type_node);
22609 if (fromtype_is_fp8 && totype_is_fp8)
22610 return NULL;
22612 if (fromtype_is_fp8)
22613 return N_ ("invalid conversion from type %<mfloat8_t%>");
22614 if (totype_is_fp8)
22615 return N_ ("invalid conversion to type %<mfloat8_t%>");
22617 /* Conversion allowed. */
22618 return NULL;
22621 /* Implement TARGET_VERIFY_TYPE_CONTEXT. */
22623 static bool
22624 aarch64_verify_type_context (location_t loc, type_context_kind context,
22625 const_tree type, bool silent_p)
22627 return aarch64_sve::verify_type_context (loc, context, type, silent_p);
22630 /* Find the first rtx_insn before insn that will generate an assembly
22631 instruction. */
22633 static rtx_insn *
22634 aarch64_prev_real_insn (rtx_insn *insn)
22636 if (!insn)
22637 return NULL;
22639   do
22640     {
22641       insn = prev_real_insn (insn);
22642     }
22643 while (insn && recog_memoized (insn) < 0);
22645 return insn;
22648 static bool
22649 is_madd_op (enum attr_type t1)
22651 unsigned int i;
22652 /* A number of these may be AArch32 only. */
22653 enum attr_type mlatypes[] = {
22654 TYPE_MLA, TYPE_MLAS, TYPE_SMLAD, TYPE_SMLADX, TYPE_SMLAL, TYPE_SMLALD,
22655 TYPE_SMLALS, TYPE_SMLALXY, TYPE_SMLAWX, TYPE_SMLAWY, TYPE_SMLAXY,
22656     TYPE_SMMLA, TYPE_UMLAL, TYPE_UMLALS, TYPE_SMLSD, TYPE_SMLSDX, TYPE_SMLSLD
22659 for (i = 0; i < ARRAY_SIZE (mlatypes); i++)
22661 if (t1 == mlatypes[i])
22662 return true;
22665 return false;
22668 /* Check if there is a register dependency between a load and the insn
22669 for which we hold recog_data. */
22671 static bool
22672 dep_between_memop_and_curr (rtx memop)
22674 rtx load_reg;
22675 int opno;
22677 gcc_assert (GET_CODE (memop) == SET);
22679 if (!REG_P (SET_DEST (memop)))
22680 return false;
22682 load_reg = SET_DEST (memop);
22683 for (opno = 1; opno < recog_data.n_operands; opno++)
22685 rtx operand = recog_data.operand[opno];
22686 if (REG_P (operand)
22687 && reg_overlap_mentioned_p (load_reg, operand))
22688 return true;
22691 return false;
22695 /* When working around the Cortex-A53 erratum 835769,
22696 given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
22697 instruction and has a preceding memory instruction such that a NOP
22698 should be inserted between them. */
22700 bool
22701 aarch64_madd_needs_nop (rtx_insn* insn)
22703 enum attr_type attr_type;
22704 rtx_insn *prev;
22705 rtx body;
22707 if (!TARGET_FIX_ERR_A53_835769)
22708 return false;
22710 if (!INSN_P (insn) || recog_memoized (insn) < 0)
22711 return false;
22713 attr_type = get_attr_type (insn);
22714 if (!is_madd_op (attr_type))
22715 return false;
22717 prev = aarch64_prev_real_insn (insn);
22718 /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
22719 Restore recog state to INSN to avoid state corruption. */
22720 extract_constrain_insn_cached (insn);
22722 if (!prev || !contains_mem_rtx_p (PATTERN (prev)))
22723 return false;
22725 body = single_set (prev);
22727 /* If the previous insn is a memory op and there is no dependency between
22728 it and the DImode madd, emit a NOP between them. If body is NULL then we
22729 have a complex memory operation, probably a load/store pair.
22730 Be conservative for now and emit a NOP. */
22731 if (GET_MODE (recog_data.operand[0]) == DImode
22732 && (!body || !dep_between_memop_and_curr (body)))
22733 return true;
22735 return false;
22740 /* Implement FINAL_PRESCAN_INSN. */
22742 void
22743 aarch64_final_prescan_insn (rtx_insn *insn)
22745 if (aarch64_madd_needs_nop (insn))
22746 fprintf (asm_out_file, "\tnop // between mem op and mult-accumulate\n");
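/* For example (schematic, not taken from real compiler output), with
   -mfix-cortex-a53-835769 a sequence such as

       ldr     x2, [x0]
       madd    x3, x4, x5, x6

   is emitted as

       ldr     x2, [x0]
       nop // between mem op and mult-accumulate
       madd    x3, x4, x5, x6

   because the DImode multiply-accumulate directly follows a memory
   operation that it does not depend on.  */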
22750 /* Return true if BASE_OR_STEP is a valid immediate operand for an SVE INDEX
22751 instruction. */
22753 bool
22754 aarch64_sve_index_immediate_p (rtx base_or_step)
22756 return (CONST_INT_P (base_or_step)
22757 && IN_RANGE (INTVAL (base_or_step), -16, 15));
22760 /* Return true if X is a valid immediate for the SVE ADD and SUB instructions
22761 when applied to mode MODE. Negate X first if NEGATE_P is true. */
22763 bool
22764 aarch64_sve_arith_immediate_p (machine_mode mode, rtx x, bool negate_p)
22766 rtx elt = unwrap_const_vec_duplicate (x);
22767 if (!CONST_INT_P (elt))
22768 return false;
22770 HOST_WIDE_INT val = INTVAL (elt);
22771 if (negate_p)
22772 val = -val;
22773 val &= GET_MODE_MASK (GET_MODE_INNER (mode));
22775 if (val & 0xff)
22776 return IN_RANGE (val, 0, 0xff);
22777 return IN_RANGE (val, 0, 0xff00);
22780 /* Return true if X is a valid immediate for the SVE SQADD and SQSUB
22781 instructions when applied to mode MODE. Negate X first if NEGATE_P
22782 is true. */
22784 bool
22785 aarch64_sve_sqadd_sqsub_immediate_p (machine_mode mode, rtx x, bool negate_p)
22787 if (!aarch64_sve_arith_immediate_p (mode, x, negate_p))
22788 return false;
22790 /* After the optional negation, the immediate must be nonnegative.
22791 E.g. a saturating add of -127 must be done via SQSUB Zn.B, Zn.B, #127
22792 instead of SQADD Zn.B, Zn.B, #129. */
22793 rtx elt = unwrap_const_vec_duplicate (x);
22794 return negate_p == (INTVAL (elt) < 0);
22797 /* Return true if X is a valid immediate operand for an SVE logical
22798 instruction such as AND. */
22800 bool
22801 aarch64_sve_bitmask_immediate_p (rtx x)
22803 rtx elt;
22805 return (const_vec_duplicate_p (x, &elt)
22806 && CONST_INT_P (elt)
22807 && aarch64_bitmask_imm (INTVAL (elt),
22808 GET_MODE_INNER (GET_MODE (x))));
22811 /* Return true if X is a valid immediate for the SVE DUP and CPY
22812 instructions. */
22814 bool
22815 aarch64_sve_dup_immediate_p (rtx x)
22817 x = aarch64_bit_representation (unwrap_const_vec_duplicate (x));
22818 if (!CONST_INT_P (x))
22819 return false;
22821 HOST_WIDE_INT val = INTVAL (x);
22822 if (val & 0xff)
22823 return IN_RANGE (val, -0x80, 0x7f);
22824 return IN_RANGE (val, -0x8000, 0x7f00);
22827 /* Return true if X is a valid immediate operand for an SVE CMP instruction.
22828 SIGNED_P says whether the operand is signed rather than unsigned. */
22830 bool
22831 aarch64_sve_cmp_immediate_p (rtx x, bool signed_p)
22833 x = unwrap_const_vec_duplicate (x);
22834 return (CONST_INT_P (x)
22835 && (signed_p
22836 ? IN_RANGE (INTVAL (x), -16, 15)
22837 : IN_RANGE (INTVAL (x), 0, 127)));
22840 /* Return true if X is a valid immediate operand for an SVE FADD or FSUB
22841 instruction. Negate X first if NEGATE_P is true. */
22843 bool
22844 aarch64_sve_float_arith_immediate_p (rtx x, bool negate_p)
22846 rtx elt;
22847 REAL_VALUE_TYPE r;
22849 if (GET_MODE_INNER (GET_MODE (x)) == BFmode
22850 || !const_vec_duplicate_p (x, &elt)
22851 || !CONST_DOUBLE_P (elt))
22852 return false;
22854 r = *CONST_DOUBLE_REAL_VALUE (elt);
22856 if (negate_p)
22857 r = real_value_negate (&r);
22859 if (real_equal (&r, &dconst1))
22860 return true;
22861 if (real_equal (&r, &dconsthalf))
22862 return true;
22863 return false;
22866 /* Return true if X is a valid immediate operand for an SVE FMUL
22867 instruction. */
22869 bool
22870 aarch64_sve_float_mul_immediate_p (rtx x)
22872 rtx elt;
22874 return (GET_MODE_INNER (GET_MODE (x)) != BFmode
22875 && const_vec_duplicate_p (x, &elt)
22876 && CONST_DOUBLE_P (elt)
22877 && (real_equal (CONST_DOUBLE_REAL_VALUE (elt), &dconsthalf)
22878 || real_equal (CONST_DOUBLE_REAL_VALUE (elt), &dconst2)));
22881 /* Return true if replicating VAL32 is a valid 2-byte or 4-byte immediate
22882 for the Advanced SIMD operation described by WHICH and INSN. If INFO
22883 is nonnull, use it to describe valid immediates. */
22884 static bool
22885 aarch64_advsimd_valid_immediate_hs (unsigned int val32,
22886 simd_immediate_info *info,
22887 enum simd_immediate_check which,
22888 simd_immediate_info::insn_type insn)
22890 /* Try a 4-byte immediate with LSL. */
22891 for (unsigned int shift = 0; shift < 32; shift += 8)
22892 if ((val32 & (0xff << shift)) == val32)
22894 if (info)
22895 *info = simd_immediate_info (SImode, val32 >> shift, insn,
22896 simd_immediate_info::LSL, shift);
22897 return true;
22900 /* Try a 2-byte immediate with LSL. */
22901 unsigned int imm16 = val32 & 0xffff;
22902 if (imm16 == (val32 >> 16))
22903 for (unsigned int shift = 0; shift < 16; shift += 8)
22904 if ((imm16 & (0xff << shift)) == imm16)
22906 if (info)
22907 *info = simd_immediate_info (HImode, imm16 >> shift, insn,
22908 simd_immediate_info::LSL, shift);
22909 return true;
22912 /* Try a 4-byte immediate with MSL, except for cases that MVN
22913 can handle. */
22914 if (which == AARCH64_CHECK_MOV)
22915 for (unsigned int shift = 8; shift < 24; shift += 8)
22917 unsigned int low = (1 << shift) - 1;
22918 if (((val32 & (0xff << shift)) | low) == val32)
22920 if (info)
22921 *info = simd_immediate_info (SImode, val32 >> shift, insn,
22922 simd_immediate_info::MSL, shift);
22923 return true;
22927 return false;
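/* Examples of the 2-byte and 4-byte encodings accepted above (hand-derived
   from the loops, not from assembler output):

     val32 == 0x000000ab  ->  SImode, #0xab              (LSL #0)
     val32 == 0x00ab0000  ->  SImode, #0xab, LSL #16
     val32 == 0x00ab00ab  ->  HImode, #0xab              (replicated 16 bits)
     val32 == 0xab00ab00  ->  HImode, #0xab, LSL #8
     val32 == 0x0000abff  ->  SImode, #0xab, MSL #8      (AARCH64_CHECK_MOV only)
     val32 == 0x00abcdef  ->  rejected here  */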
22930 /* Return true if replicating VAL64 with mode MODE is a valid immediate for the
22931 Advanced SIMD operation described by WHICH. If INFO is nonnull,
22932 use it to describe valid immediates. */
22933 static bool
22934 aarch64_advsimd_valid_immediate (unsigned HOST_WIDE_INT val64,
22935 scalar_int_mode mode,
22936 simd_immediate_info *info,
22937 enum simd_immediate_check which)
22939 unsigned int val32 = val64 & 0xffffffff;
22940 unsigned int val8 = val64 & 0xff;
22942 if (mode != DImode)
22944 if ((which == AARCH64_CHECK_MOV || which == AARCH64_CHECK_ORR)
22945 && aarch64_advsimd_valid_immediate_hs (val32, info, which,
22946 simd_immediate_info::MOV))
22947 return true;
22949 if ((which == AARCH64_CHECK_MOV || which == AARCH64_CHECK_AND)
22950 && aarch64_advsimd_valid_immediate_hs (~val32, info, which,
22951 simd_immediate_info::MVN))
22952 return true;
22954 /* Try using a replicated byte. */
22955 if (which == AARCH64_CHECK_MOV && mode == QImode)
22957 if (info)
22958 *info = simd_immediate_info (QImode, val8);
22959 return true;
22963 /* Try using a bit-to-bytemask. */
22964 if (which == AARCH64_CHECK_MOV)
22966 unsigned int i;
22967 for (i = 0; i < 64; i += 8)
22969 unsigned char byte = (val64 >> i) & 0xff;
22970 if (byte != 0 && byte != 0xff)
22971 break;
22973 if (i == 64)
22975 if (info)
22976 *info = simd_immediate_info (DImode, val64);
22977 return true;
22980 return false;
22983 /* Return true if replicating IVAL with MODE gives a valid immediate for an SVE
22984 MOV instruction. If INFO is nonnull, use it to describe valid
22985 immediates. */
22987 static bool
22988 aarch64_sve_valid_immediate (unsigned HOST_WIDE_INT ival, scalar_int_mode mode,
22989 simd_immediate_info *info,
22990 enum simd_immediate_check which)
22992 HOST_WIDE_INT val = trunc_int_for_mode (ival, mode);
22994 if (which == AARCH64_CHECK_MOV)
22996 if (IN_RANGE (val, -0x80, 0x7f))
22998 /* DUP with no shift. */
22999 if (info)
23000 *info = simd_immediate_info (mode, val,
23001 simd_immediate_info::SVE_MOV);
23002 return true;
23004 if ((val & 0xff) == 0 && IN_RANGE (val, -0x8000, 0x7f00))
23006 /* DUP with LSL #8. */
23007 if (info)
23008 *info = simd_immediate_info (mode, val,
23009 simd_immediate_info::SVE_MOV);
23010 return true;
23013 if (aarch64_bitmask_imm (ival, mode))
23015 /* DUPM. */
23016 if (info)
23017 *info = simd_immediate_info (mode, val, simd_immediate_info::SVE_MOV);
23018 return true;
23020 return false;
23023 /* Return true if X is an UNSPEC_PTRUE constant of the form:
23025 (const (unspec [PATTERN ZERO] UNSPEC_PTRUE))
23027 where PATTERN is the svpattern as a CONST_INT and where ZERO
23028 is a zero constant of the required PTRUE mode (which can have
23029 fewer elements than X's mode, if zero bits are significant).
23031 If so, and if INFO is nonnull, describe the immediate in INFO. */
23032 bool
23033 aarch64_sve_ptrue_svpattern_p (rtx x, struct simd_immediate_info *info)
23035 if (GET_CODE (x) != CONST)
23036 return false;
23038 x = XEXP (x, 0);
23039 if (GET_CODE (x) != UNSPEC || XINT (x, 1) != UNSPEC_PTRUE)
23040 return false;
23042 if (info)
23044 aarch64_svpattern pattern
23045 = (aarch64_svpattern) INTVAL (XVECEXP (x, 0, 0));
23046 machine_mode pred_mode = GET_MODE (XVECEXP (x, 0, 1));
23047 scalar_int_mode int_mode = aarch64_sve_element_int_mode (pred_mode);
23048 *info = simd_immediate_info (int_mode, pattern);
23050 return true;
23053 /* Return true if X is a valid SVE predicate. If INFO is nonnull, use
23054 it to describe valid immediates. */
23056 static bool
23057 aarch64_sve_pred_valid_immediate (rtx x, simd_immediate_info *info)
23059 if (aarch64_sve_ptrue_svpattern_p (x, info))
23060 return true;
23062 if (x == CONST0_RTX (GET_MODE (x)))
23064 if (info)
23065 *info = simd_immediate_info (DImode, 0);
23066 return true;
23069 /* Analyze the value as a VNx16BImode. This should be relatively
23070 efficient, since rtx_vector_builder has enough built-in capacity
23071 to store all VLA predicate constants without needing the heap. */
23072 rtx_vector_builder builder;
23073 if (!aarch64_get_sve_pred_bits (builder, x))
23074 return false;
23076 unsigned int elt_size = aarch64_widest_sve_pred_elt_size (builder);
23077 if (int vl = aarch64_partial_ptrue_length (builder, elt_size))
23079 machine_mode mode = aarch64_sve_pred_mode (elt_size).require ();
23080 aarch64_svpattern pattern = aarch64_svpattern_for_vl (mode, vl);
23081 if (pattern != AARCH64_NUM_SVPATTERNS)
23083 if (info)
23085 scalar_int_mode int_mode = aarch64_sve_element_int_mode (mode);
23086 *info = simd_immediate_info (int_mode, pattern);
23088 return true;
23091 return false;
23094 /* We can only represent floating point constants which will fit in
23095 "quarter-precision" values. These values are characterised by
23096 a sign bit, a 4-bit mantissa and a 3-bit exponent, and are given by:
23099 (-1)^s * (n/16) * 2^r
23101 Where:
23102 's' is the sign bit.
23103 'n' is an integer in the range 16 <= n <= 31.
23104 'r' is an integer in the range -3 <= r <= 4.
23106 Return true iff R represents a value encodable into an AArch64 floating point
23107 move instruction as an immediate. Otherwise return false. */
23109 static bool
23110 aarch64_real_float_const_representable_p (REAL_VALUE_TYPE r)
23112 /* This represents our current view of how many bits
23113 make up the mantissa. */
23114 int point_pos = 2 * HOST_BITS_PER_WIDE_INT - 1;
23115 int exponent;
23116 unsigned HOST_WIDE_INT mantissa, mask;
23117 REAL_VALUE_TYPE m;
23118 bool fail = false;
23120 /* We cannot represent infinities, NaNs or +/-zero. We won't
23121 know if we have +zero until we analyse the mantissa, but we
23122 can reject the other invalid values. */
23123 if (REAL_VALUE_ISINF (r) || REAL_VALUE_ISNAN (r)
23124 || REAL_VALUE_MINUS_ZERO (r))
23125 return false;
23127 /* Extract exponent. */
23128 r = real_value_abs (&r);
23129 exponent = REAL_EXP (&r);
23131 /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
23132 highest (sign) bit, with a fixed binary point at bit point_pos.
23133 m1 holds the low part of the mantissa, m2 the high part.
23134 WARNING: If we ever have a representation using more than 2 * H_W_I - 1
23135 bits for the mantissa, this can fail (low bits will be lost). */
23136 real_ldexp (&m, &r, point_pos - exponent);
23137 wide_int w = real_to_integer (&m, &fail, HOST_BITS_PER_WIDE_INT * 2);
23139 /* If the low part of the mantissa has bits set we cannot represent
23140 the value. */
23141 if (fail || w.ulow () != 0)
23142 return false;
23144 /* We have rejected the lower HOST_WIDE_INT, so update our
23145 understanding of how many bits lie in the mantissa and
23146 look only at the high HOST_WIDE_INT. */
23147 mantissa = w.elt (1);
23148 point_pos -= HOST_BITS_PER_WIDE_INT;
23150 /* We can only represent values with a mantissa of the form 1.xxxx. */
23151 mask = ((unsigned HOST_WIDE_INT)1 << (point_pos - 5)) - 1;
23152 if ((mantissa & mask) != 0)
23153 return false;
23155 /* Having filtered unrepresentable values, we may now remove all
23156 but the highest 5 bits. */
23157 mantissa >>= point_pos - 5;
23159 /* We cannot represent the value 0.0, so reject it. This is handled
23160 elsewhere. */
23161 if (mantissa == 0)
23162 return false;
23164 /* Then, as bit 4 is always set, we can mask it off, leaving
23165 the mantissa in the range [0, 15]. */
23166 mantissa &= ~(1 << 4);
23167 gcc_assert (mantissa <= 15);
23169 /* GCC internally does not use IEEE754-like encoding (where normalized
23170 significands are in the range [1, 2). GCC uses [0.5, 1) (see real.cc).
23171 Our mantissa values are shifted 4 places to the left relative to
23172 normalized IEEE754 so we must modify the exponent returned by REAL_EXP
23173 by 5 places to correct for GCC's representation. */
23174 exponent = 5 - exponent;
23176 return (exponent >= 0 && exponent <= 7);
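/* Worked example: 2.5 == (-1)^0 * (20/16) * 2^1, so n == 20 and r == 1 and
   the value is representable (it can be materialised as an FMOV-style
   immediate).  Values such as 0.1 or 1/3 have no encoding of this form and
   are rejected, as are zeros, infinities and NaNs.  */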
23179 /* Return true if OP is a valid SIMD immediate for the operation
23180 described by WHICH. If INFO is nonnull, use it to describe valid
23181 immediates. */
23182 static bool
23183 aarch64_simd_valid_imm (rtx op, simd_immediate_info *info,
23184 enum simd_immediate_check which)
23186 machine_mode mode = GET_MODE (op);
23187 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
23188 if (vec_flags == 0 || vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
23189 return false;
23191 if ((vec_flags & VEC_ADVSIMD) && !TARGET_SIMD)
23192 return false;
23194 if (vec_flags == (VEC_SVE_PRED | VEC_STRUCT))
23195 return op == CONST0_RTX (mode) || op == CONSTM1_RTX (mode);
23197 if (vec_flags & VEC_SVE_PRED)
23198 return aarch64_sve_pred_valid_immediate (op, info);
23200 scalar_mode elt_mode = GET_MODE_INNER (mode);
23201 rtx base, step;
23202 unsigned int n_elts;
23203 if (CONST_VECTOR_P (op)
23204 && CONST_VECTOR_DUPLICATE_P (op))
23205 n_elts = CONST_VECTOR_NPATTERNS (op);
23206 else if (which == AARCH64_CHECK_MOV
23207 && TARGET_SVE
23208 && const_vec_series_p (op, &base, &step))
23210 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_INT);
23211 if (!aarch64_sve_index_immediate_p (base)
23212 || !aarch64_sve_index_immediate_p (step))
23213 return false;
23215 if (info)
23217 /* Get the corresponding container mode. E.g. an INDEX on V2SI
23218 should yield two integer values per 128-bit block, meaning
23219 that we need to treat it in the same way as V2DI and then
23220 ignore the upper 32 bits of each element. */
23221 elt_mode = aarch64_sve_container_int_mode (mode);
23222 *info = simd_immediate_info (elt_mode, base, step);
23224 return true;
23226 else if (CONST_VECTOR_P (op)
23227 && CONST_VECTOR_NUNITS (op).is_constant (&n_elts))
23228 /* N_ELTS set above. */;
23229 else
23230 return false;
23232 /* If all elements in an SVE vector have the same value, we have a free
23233 choice between using the element mode and using the container mode.
23234 Using the element mode means that unused parts of the vector are
23235 duplicates of the used elements, while using the container mode means
23236 that the unused parts are an extension of the used elements. Using the
23237 element mode is better for (say) VNx4HI 0x101, since 0x01010101 is valid
23238 for its container mode VNx4SI while 0x00000101 isn't.
23240 If not all elements in an SVE vector have the same value, we need the
23241 transition from one element to the next to occur at container boundaries.
23242 E.g. a fixed-length VNx4HI containing { 1, 2, 3, 4 } should be treated
23243 in the same way as a VNx4SI containing { 1, 2, 3, 4 }. */
23244 scalar_int_mode elt_int_mode;
23245 if ((vec_flags & VEC_SVE_DATA) && n_elts > 1)
23246 elt_int_mode = aarch64_sve_container_int_mode (mode);
23247 else
23248 elt_int_mode = int_mode_for_mode (elt_mode).require ();
23250 unsigned int elt_size = GET_MODE_SIZE (elt_int_mode);
23251 if (elt_size > 8)
23252 return false;
23254 /* Expand the vector constant out into a byte vector, with the least
23255 significant byte of the register first. */
23256 auto_vec<unsigned char, 16> bytes;
23257 bytes.reserve (n_elts * elt_size);
23258 for (unsigned int i = 0; i < n_elts; i++)
23260 /* The vector is provided in gcc endian-neutral fashion.
23261 For aarch64_be Advanced SIMD, it must be laid out in the vector
23262 register in reverse order. */
23263 bool swap_p = ((vec_flags & VEC_ADVSIMD) != 0 && BYTES_BIG_ENDIAN);
23264 rtx elt = CONST_VECTOR_ELT (op, swap_p ? (n_elts - 1 - i) : i);
23266 if (elt_mode != elt_int_mode)
23267 elt = gen_lowpart (elt_int_mode, elt);
23269 if (!CONST_INT_P (elt))
23270 return false;
23272 unsigned HOST_WIDE_INT elt_val = INTVAL (elt);
23273 for (unsigned int byte = 0; byte < elt_size; byte++)
23275 bytes.quick_push (elt_val & 0xff);
23276 elt_val >>= BITS_PER_UNIT;
23280 /* The immediate must repeat every eight bytes. */
23281 unsigned int nbytes = bytes.length ();
23282 for (unsigned i = 8; i < nbytes; ++i)
23283 if (bytes[i] != bytes[i - 8])
23284 return false;
23286 /* Get the repeating 8-byte value as an integer. No endian correction
23287 is needed here because bytes is already in lsb-first order. */
23288 unsigned HOST_WIDE_INT val64 = 0;
23289 for (unsigned int i = 0; i < 8; i++)
23290 val64 |= ((unsigned HOST_WIDE_INT) bytes[i % nbytes]
23291 << (i * BITS_PER_UNIT));
23293 /* Try encoding the integer immediate as a floating point value if it's an
23294 exact value. */
23295 scalar_float_mode fmode = DFmode;
23296 scalar_int_mode imode = DImode;
23297 unsigned HOST_WIDE_INT ival = val64;
23298 unsigned int val32 = val64 & 0xffffffff;
23299 if (val32 == (val64 >> 32))
23301 fmode = SFmode;
23302 imode = SImode;
23303 ival = val32;
23304 unsigned int val16 = val32 & 0xffff;
23305 if (val16 == (val32 >> 16))
23307 fmode = HFmode;
23308 imode = HImode;
23309 ival = val16;
23310 unsigned int val8 = val16 & 0xff;
23311 if (val8 == (val16 >> 8))
23313 imode = QImode;
23314 ival = val8;
23319 if (which == AARCH64_CHECK_MOV
23320 && imode != QImode
23321 && (imode != HImode || TARGET_FP_F16INST))
23323 long int as_long_ints[2];
23324 as_long_ints[0] = ival & 0xFFFFFFFF;
23325 as_long_ints[1] = (ival >> 32) & 0xFFFFFFFF;
23327 REAL_VALUE_TYPE r;
23328 real_from_target (&r, as_long_ints, fmode);
23329 if (aarch64_real_float_const_representable_p (r))
23331 if (info)
23333 rtx float_val = const_double_from_real_value (r, fmode);
23334 *info = simd_immediate_info (fmode, float_val);
23336 return true;
23340 if (vec_flags & VEC_SVE_DATA)
23341 return aarch64_sve_valid_immediate (ival, imode, info, which);
23343 if (aarch64_advsimd_valid_immediate (val64, imode, info, which))
23344 return true;
23346 if (TARGET_SVE)
23347 return aarch64_sve_valid_immediate (ival, imode, info, which);
23348 return false;
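/* For example, val64 == 0x2525252525252525 halves repeatedly, so the code
   above ends up with imode == QImode and ival == 0x25, and the value is
   tested purely as a replicated byte.  By contrast, val64 ==
   0x3ff0000000000000 (the IEEE double bit pattern of 1.0) does not repeat,
   so fmode stays DFmode and the floating-point path can describe it as an
   immediate 1.0.  */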
23351 /* Return true if OP is a valid SIMD move immediate for SVE or AdvSIMD. */
23352 bool
23353 aarch64_simd_valid_mov_imm (rtx op)
23355 return aarch64_simd_valid_imm (op, NULL, AARCH64_CHECK_MOV);
23358 /* Return true if OP is a valid SIMD orr immediate for SVE or AdvSIMD. */
23359 bool
23360 aarch64_simd_valid_orr_imm (rtx op)
23362 return aarch64_simd_valid_imm (op, NULL, AARCH64_CHECK_ORR);
23365 /* Return true if OP is a valid SIMD and immediate for SVE or AdvSIMD. */
23366 bool
23367 aarch64_simd_valid_and_imm (rtx op)
23369 return aarch64_simd_valid_imm (op, NULL, AARCH64_CHECK_AND);
23372 /* Return true if OP is a valid SIMD xor immediate for SVE. */
23373 bool
23374 aarch64_simd_valid_xor_imm (rtx op)
23376 return aarch64_simd_valid_imm (op, NULL, AARCH64_CHECK_XOR);
23379 /* Check whether X is a VEC_SERIES-like constant that starts at 0 and
23380 has a step in the range of INDEX. Return the index expression if so,
23381 otherwise return null. */
23383 aarch64_check_zero_based_sve_index_immediate (rtx x)
23385 rtx base, step;
23386 if (const_vec_series_p (x, &base, &step)
23387 && base == const0_rtx
23388 && aarch64_sve_index_immediate_p (step))
23389 return step;
23390 return NULL_RTX;
23393 /* Check whether immediate shift constants are within range. */
23394 bool
23395 aarch64_simd_shift_imm_p (rtx x, machine_mode mode, bool left)
23397 x = unwrap_const_vec_duplicate (x);
23398 if (!CONST_INT_P (x))
23399 return false;
23400 int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;
23401 if (left)
23402 return IN_RANGE (INTVAL (x), 0, bit_width - 1);
23403 else
23404 return IN_RANGE (INTVAL (x), 1, bit_width);
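/* For example, with V8HImode (16-bit elements) a left-shift immediate must
   be in [0, 15] while a right-shift immediate must be in [1, 16], matching
   the encodable ranges of the vector shift instructions.  */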
23407 /* Return the bitmask CONST_INT to select the bits required by a zero extract
23408 operation of width WIDTH at bit position POS. */
23411 aarch64_mask_from_zextract_ops (rtx width, rtx pos)
23413 gcc_assert (CONST_INT_P (width));
23414 gcc_assert (CONST_INT_P (pos));
23416 unsigned HOST_WIDE_INT mask
23417 = ((unsigned HOST_WIDE_INT) 1 << UINTVAL (width)) - 1;
23418 return GEN_INT (mask << UINTVAL (pos));
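/* For example, WIDTH == 8 and POS == 16 give ((1 << 8) - 1) << 16, i.e. the
   mask 0xff0000 selecting bits 16..23.  */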
23421 bool
23422 aarch64_mov_operand_p (rtx x, machine_mode mode)
23424 if (GET_CODE (x) == HIGH
23425 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
23426 return true;
23428 if (CONST_INT_P (x))
23429 return true;
23431 if (VECTOR_MODE_P (GET_MODE (x)))
23433 /* Require predicate constants to be VNx16BI before RA, so that we
23434 force everything to have a canonical form. */
23435 if (!lra_in_progress
23436 && !reload_completed
23437 && aarch64_sve_pred_mode_p (GET_MODE (x))
23438 && known_eq (GET_MODE_SIZE (GET_MODE (x)), BYTES_PER_SVE_PRED)
23439 && GET_MODE (x) != VNx16BImode)
23440 return false;
23442 return aarch64_simd_valid_mov_imm (x);
23445 /* Remove UNSPEC_SALT_ADDR before checking symbol reference. */
23446 x = strip_salt (x);
23448 /* GOT accesses are valid moves. */
23449 if (SYMBOL_REF_P (x)
23450 && aarch64_classify_symbolic_expression (x) == SYMBOL_SMALL_GOT_4G)
23451 return true;
23453 if (SYMBOL_REF_P (x) && mode == DImode && CONSTANT_ADDRESS_P (x))
23454 return true;
23456 if (TARGET_SVE
23457 && (aarch64_sve_cnt_immediate_p (x)
23458 || aarch64_sve_rdvl_immediate_p (x)))
23459 return true;
23461 if (aarch64_rdsvl_immediate_p (x))
23462 return true;
23464 return aarch64_classify_symbolic_expression (x)
23465 == SYMBOL_TINY_ABSOLUTE;
23468 /* Return a function-invariant register that contains VALUE. *CACHED_INSN
23469 caches instructions that set up such registers, so that they can be
23470 reused by future calls. */
23472 static rtx
23473 aarch64_get_shareable_reg (rtx_insn **cached_insn, rtx value)
23475 rtx_insn *insn = *cached_insn;
23476 if (insn && INSN_P (insn) && !insn->deleted ())
23478 rtx pat = PATTERN (insn);
23479 if (GET_CODE (pat) == SET)
23481 rtx dest = SET_DEST (pat);
23482 if (REG_P (dest)
23483 && !HARD_REGISTER_P (dest)
23484 && rtx_equal_p (SET_SRC (pat), value))
23485 return dest;
23488 rtx reg = gen_reg_rtx (GET_MODE (value));
23489 *cached_insn = emit_insn_before (gen_rtx_SET (reg, value),
23490 function_beg_insn);
23491 return reg;
23494 /* Create a 0 constant that is based on V4SI to allow CSE to optimally share
23495 the constant creation. */
23498 aarch64_gen_shareable_zero (machine_mode mode)
23500 rtx reg = aarch64_get_shareable_reg (&cfun->machine->advsimd_zero_insn,
23501 CONST0_RTX (V4SImode));
23502 return lowpart_subreg (mode, reg, GET_MODE (reg));
23505 /* INSN is some form of extension or shift that can be split into a
23506 permutation involving a shared zero. Return true if we should
23507 perform such a split.
23509 ??? For now, make sure that the split instruction executes more
23510 frequently than the zero that feeds it. In future it would be good
23511 to split without that restriction and instead recombine shared zeros
23512 if they turn out not to be worthwhile. This would allow splits in
23513 single-block functions and would also cope more naturally with
23514 rematerialization. The downside of not doing this is that we lose the
23515 optimizations for vector epilogues as well. */
23517 bool
23518 aarch64_split_simd_shift_p (rtx_insn *insn)
23520 return (can_create_pseudo_p ()
23521 && optimize_bb_for_speed_p (BLOCK_FOR_INSN (insn))
23522 && (ENTRY_BLOCK_PTR_FOR_FN (cfun)->count
23523 < BLOCK_FOR_INSN (insn)->count));
23526 /* Return a const_int vector of VAL. */
23528 aarch64_simd_gen_const_vector_dup (machine_mode mode, HOST_WIDE_INT val)
23530 rtx c = gen_int_mode (val, GET_MODE_INNER (mode));
23531 return gen_const_vec_duplicate (mode, c);
23534 /* Check OP is a legal scalar immediate for the MOVI instruction. */
23536 bool
23537 aarch64_simd_scalar_immediate_valid_for_move (rtx op, scalar_int_mode mode)
23539 machine_mode vmode;
23541 vmode = aarch64_simd_container_mode (mode, 64);
23542 rtx op_v = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (op));
23543 return aarch64_simd_valid_mov_imm (op_v);
23546 /* Construct and return a PARALLEL RTX vector with elements numbering the
23547 lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
23548 the vector - from the perspective of the architecture. This does not
23549 line up with GCC's perspective on lane numbers, so we end up with
23550 different masks depending on our target endian-ness. The diagram
23551 below may help. We must draw the distinction when building masks
23552 which select one half of the vector. An instruction selecting
23553 architectural low-lanes for a big-endian target, must be described using
23554 a mask selecting GCC high-lanes.
23556 Big-Endian Little-Endian
23558 GCC 0 1 2 3 3 2 1 0
23559 | x | x | x | x | | x | x | x | x |
23560 Architecture 3 2 1 0 3 2 1 0
23562 Low Mask: { 2, 3 } { 0, 1 }
23563 High Mask: { 0, 1 } { 2, 3 }
23565 MODE Is the mode of the vector and NUNITS is the number of units in it. */
23568 aarch64_simd_vect_par_cnst_half (machine_mode mode, int nunits, bool high)
23570 rtvec v = rtvec_alloc (nunits / 2);
23571 int high_base = nunits / 2;
23572 int low_base = 0;
23573 int base;
23574 rtx t1;
23575 int i;
23577 if (BYTES_BIG_ENDIAN)
23578 base = high ? low_base : high_base;
23579 else
23580 base = high ? high_base : low_base;
23582 for (i = 0; i < nunits / 2; i++)
23583 RTVEC_ELT (v, i) = GEN_INT (base + i);
23585 t1 = gen_rtx_PARALLEL (mode, v);
23586 return t1;
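/* For example, for V4SImode with HIGH == true this returns (parallel [2 3])
   on little-endian and (parallel [0 1]) on big-endian, as in the "High Mask"
   row of the diagram above.  */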
23589 /* Check OP for validity as a PARALLEL RTX vector with elements
23590 numbering the lanes of either the high (HIGH == TRUE) or low lanes,
23591 from the perspective of the architecture. See the diagram above
23592 aarch64_simd_vect_par_cnst_half for more details. */
23594 bool
23595 aarch64_simd_check_vect_par_cnst_half (rtx op, machine_mode mode,
23596 bool high)
23598 int nelts;
23599 if (!VECTOR_MODE_P (mode) || !GET_MODE_NUNITS (mode).is_constant (&nelts))
23600 return false;
23602 rtx ideal = aarch64_simd_vect_par_cnst_half (mode, nelts, high);
23603 HOST_WIDE_INT count_op = XVECLEN (op, 0);
23604 HOST_WIDE_INT count_ideal = XVECLEN (ideal, 0);
23605 int i = 0;
23607 if (count_op != count_ideal)
23608 return false;
23610 for (i = 0; i < count_ideal; i++)
23612 rtx elt_op = XVECEXP (op, 0, i);
23613 rtx elt_ideal = XVECEXP (ideal, 0, i);
23615 if (!CONST_INT_P (elt_op)
23616 || INTVAL (elt_ideal) != INTVAL (elt_op))
23617 return false;
23619 return true;
23622 /* Return a PARALLEL containing NELTS elements, with element I equal
23623 to BASE + I * STEP. */
23626 aarch64_gen_stepped_int_parallel (unsigned int nelts, int base, int step)
23628 rtvec vec = rtvec_alloc (nelts);
23629 for (unsigned int i = 0; i < nelts; ++i)
23630 RTVEC_ELT (vec, i) = gen_int_mode (base + i * step, DImode);
23631 return gen_rtx_PARALLEL (VOIDmode, vec);
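/* For example, NELTS == 4, BASE == 1 and STEP == 2 produce
   (parallel [1 3 5 7]), which aarch64_stepped_int_parallel_p below accepts
   for STEP == 2.  */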
23634 /* Return true if OP is a PARALLEL of CONST_INTs that form a linear
23635 series with step STEP. */
23637 bool
23638 aarch64_stepped_int_parallel_p (rtx op, int step)
23640 if (GET_CODE (op) != PARALLEL || !CONST_INT_P (XVECEXP (op, 0, 0)))
23641 return false;
23643 unsigned HOST_WIDE_INT base = UINTVAL (XVECEXP (op, 0, 0));
23644 for (int i = 1; i < XVECLEN (op, 0); ++i)
23645 if (!CONST_INT_P (XVECEXP (op, 0, i))
23646 || UINTVAL (XVECEXP (op, 0, i)) != base + i * step)
23647 return false;
23649 return true;
23652 /* Return true if OPERANDS[0] to OPERANDS[NUM_OPERANDS - 1] form a
23653 sequence of strided registers, with the stride being equal to STRIDE.
23654 The operands are already known to be FPRs. */
23655 bool
23656 aarch64_strided_registers_p (rtx *operands, unsigned int num_operands,
23657 unsigned int stride)
23659 for (unsigned int i = 1; i < num_operands; ++i)
23660 if (REGNO (operands[i]) != REGNO (operands[0]) + i * stride)
23661 return false;
23662 return true;
23665 /* Bounds-check lanes. Ensure OPERAND lies between LOW (inclusive) and
23666 HIGH (exclusive). */
23667 void
23668 aarch64_simd_lane_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high,
23669 const_tree exp)
23671 HOST_WIDE_INT lane;
23672 gcc_assert (CONST_INT_P (operand));
23673 lane = INTVAL (operand);
23675 if (lane < low || lane >= high)
23677 if (exp)
23678 error_at (EXPR_LOCATION (exp), "lane %wd out of range %wd - %wd",
23679 lane, low, high - 1);
23680 else
23681 error ("lane %wd out of range %wd - %wd", lane, low, high - 1);
23685 /* Perform endian correction on lane number N, which indexes a vector
23686 of mode MODE, and return the result as an SImode rtx. */
23689 aarch64_endian_lane_rtx (machine_mode mode, unsigned int n)
23691 return gen_int_mode (ENDIAN_LANE_N (GET_MODE_NUNITS (mode), n), SImode);
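/* For example, assuming the usual definition of ENDIAN_LANE_N (N for
   little-endian, NUNITS - 1 - N for big-endian), lane 0 of a V4SImode
   vector stays 0 on little-endian targets but becomes 3 on big-endian
   ones.  */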
23694 /* Return TRUE if OP is a valid vector addressing mode. */
23696 bool
23697 aarch64_simd_mem_operand_p (rtx op)
23699 return (MEM_P (op)
23700 && (GET_CODE (XEXP (op, 0)) == POST_INC || REG_P (XEXP (op, 0)))
23701 && memory_operand (op, VOIDmode));
23704 /* Return true if OP is a valid MEM operand for an SVE LD1R instruction. */
23706 bool
23707 aarch64_sve_ld1r_operand_p (rtx op)
23709 struct aarch64_address_info addr;
23710 scalar_mode mode;
23712 return (MEM_P (op)
23713 && is_a <scalar_mode> (GET_MODE (op), &mode)
23714 && aarch64_classify_address (&addr, XEXP (op, 0), mode, false)
23715 && addr.type == ADDRESS_REG_IMM
23716 && offset_6bit_unsigned_scaled_p (mode, addr.const_offset));
23719 /* Return true if OP is a valid MEM operand for an SVE LD1R{Q,O} instruction
23720 where the size of the read data is specified by `mode` and the size of the
23721 vector elements is specified by `elem_mode`. */
23722 bool
23723 aarch64_sve_ld1rq_ld1ro_operand_p (rtx op, machine_mode mode,
23724 scalar_mode elem_mode)
23726 struct aarch64_address_info addr;
23727 if (!MEM_P (op)
23728 || !aarch64_classify_address (&addr, XEXP (op, 0), elem_mode, false))
23729 return false;
23731 if (addr.type == ADDRESS_REG_IMM)
23732 return offset_4bit_signed_scaled_p (mode, addr.const_offset);
23734 if (addr.type == ADDRESS_REG_REG)
23735 return (1U << addr.shift) == GET_MODE_SIZE (elem_mode);
23737 return false;
23740 /* Return true if OP is a valid MEM operand for an SVE LD1RQ instruction. */
23741 bool
23742 aarch64_sve_ld1rq_operand_p (rtx op)
23744 return aarch64_sve_ld1rq_ld1ro_operand_p (op, TImode,
23745 GET_MODE_INNER (GET_MODE (op)));
23748 /* Return true if OP is a valid MEM operand for an SVE LD1RO instruction for
23749 accessing a vector where the element size is specified by `elem_mode`. */
23750 bool
23751 aarch64_sve_ld1ro_operand_p (rtx op, scalar_mode elem_mode)
23753 return aarch64_sve_ld1rq_ld1ro_operand_p (op, OImode, elem_mode);
23756 /* Return true if OP is a valid MEM operand for an SVE LDFF1 instruction. */
23757 bool
23758 aarch64_sve_ldff1_operand_p (rtx op)
23760 if (!MEM_P (op))
23761 return false;
23763 struct aarch64_address_info addr;
23764 if (!aarch64_classify_address (&addr, XEXP (op, 0), GET_MODE (op), false))
23765 return false;
23767 if (addr.type == ADDRESS_REG_IMM)
23768 return known_eq (addr.const_offset, 0);
23770 return addr.type == ADDRESS_REG_REG;
23773 /* Return true if OP is a valid MEM operand for an SVE LDNF1 instruction. */
23774 bool
23775 aarch64_sve_ldnf1_operand_p (rtx op)
23777 struct aarch64_address_info addr;
23779 return (MEM_P (op)
23780 && aarch64_classify_address (&addr, XEXP (op, 0),
23781 GET_MODE (op), false)
23782 && addr.type == ADDRESS_REG_IMM);
23785 /* Return true if OP is a valid MEM operand for an SVE LDR instruction.
23786 The conditions for STR are the same. */
23787 bool
23788 aarch64_sve_ldr_operand_p (rtx op)
23790 struct aarch64_address_info addr;
23792 return (MEM_P (op)
23793 && aarch64_classify_address (&addr, XEXP (op, 0), GET_MODE (op),
23794 false, ADDR_QUERY_ANY)
23795 && addr.type == ADDRESS_REG_IMM);
23798 /* Return true if OP is a valid address for an SVE PRF[BHWD] instruction,
23799 addressing memory of mode MODE. */
23800 bool
23801 aarch64_sve_prefetch_operand_p (rtx op, machine_mode mode)
23803 struct aarch64_address_info addr;
23804 if (!aarch64_classify_address (&addr, op, mode, false, ADDR_QUERY_ANY))
23805 return false;
23807 if (addr.type == ADDRESS_REG_IMM)
23808 return offset_6bit_signed_scaled_p (mode, addr.const_offset);
23810 return addr.type == ADDRESS_REG_REG;
23813 /* Return true if OP is a valid MEM operand for an SVE_STRUCT mode.
23814 We need to be able to access the individual pieces, so the range
23815 is different from LD[234] and ST[234]. */
23816 bool
23817 aarch64_sve_struct_memory_operand_p (rtx op)
23819 if (!MEM_P (op))
23820 return false;
23822 machine_mode mode = GET_MODE (op);
23823 struct aarch64_address_info addr;
23824 if (!aarch64_classify_address (&addr, XEXP (op, 0), SVE_BYTE_MODE, false,
23825 ADDR_QUERY_ANY)
23826 || addr.type != ADDRESS_REG_IMM)
23827 return false;
23829 poly_int64 first = addr.const_offset;
23830 poly_int64 last = first + GET_MODE_SIZE (mode) - BYTES_PER_SVE_VECTOR;
23831 return (offset_4bit_signed_scaled_p (SVE_BYTE_MODE, first)
23832 && offset_4bit_signed_scaled_p (SVE_BYTE_MODE, last));
23835 /* Return true if OFFSET is a constant integer and if VNUM is
23836 OFFSET * the number of bytes in an SVE vector. This is the requirement
23837 that exists in SME LDR and STR instructions, where the VL offset must
23838 equal the ZA slice offset. */
23839 bool
23840 aarch64_sme_ldr_vnum_offset_p (rtx offset, rtx vnum)
23842 if (!CONST_INT_P (offset) || !IN_RANGE (INTVAL (offset), 0, 15))
23843 return false;
23845 if (TARGET_STREAMING)
23847 poly_int64 const_vnum;
23848 return (poly_int_rtx_p (vnum, &const_vnum)
23849 && known_eq (const_vnum,
23850 INTVAL (offset) * BYTES_PER_SVE_VECTOR));
23852 else
23854 HOST_WIDE_INT factor;
23855 return (aarch64_sme_vq_unspec_p (vnum, &factor)
23856 && factor == INTVAL (offset) * 16);
23860 /* Emit a register copy from operand to operand, taking care not to
23861 early-clobber source registers in the process.
23863 COUNT is the number of components into which the copy needs to be
23864 decomposed. */
23865 void
23866 aarch64_simd_emit_reg_reg_move (rtx *operands, machine_mode mode,
23867 unsigned int count)
23869 unsigned int i;
23870 int rdest = REGNO (operands[0]);
23871 int rsrc = REGNO (operands[1]);
23873 if (!reg_overlap_mentioned_p (operands[0], operands[1])
23874 || rdest < rsrc)
23875 for (i = 0; i < count; i++)
23876 emit_move_insn (gen_rtx_REG (mode, rdest + i),
23877 gen_rtx_REG (mode, rsrc + i));
23878 else
23879 for (i = 0; i < count; i++)
23880 emit_move_insn (gen_rtx_REG (mode, rdest + count - i - 1),
23881 gen_rtx_REG (mode, rsrc + count - i - 1));
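/* For example, copying a two-register tuple whose source starts at V1 and
   whose destination starts at V2 overlaps with RDEST > RSRC, so the second
   loop copies the high register first (V3 <- V2, then V2 <- V1) and avoids
   clobbering a source register before it has been read.  */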
23884 /* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
23885 one of VSTRUCT modes: OI, CI, or XI. */
23887 aarch64_simd_attr_length_rglist (machine_mode mode)
23889 /* This is only used (and only meaningful) for Advanced SIMD, not SVE. */
23890 return (GET_MODE_SIZE (mode).to_constant () / UNITS_PER_VREG) * 4;
23893 /* Implement target hook TARGET_VECTOR_ALIGNMENT. The AAPCS64 sets the maximum
23894 alignment of a vector to 128 bits. SVE predicates have an alignment of
23895 16 bits. */
23896 static HOST_WIDE_INT
23897 aarch64_simd_vector_alignment (const_tree type)
23899 /* ??? Checking the mode isn't ideal, but VECTOR_BOOLEAN_TYPE_P can
23900 be set for non-predicate vectors of booleans. Modes are the most
23901 direct way we have of identifying real SVE predicate types. */
23902 if (GET_MODE_CLASS (TYPE_MODE (type)) == MODE_VECTOR_BOOL)
23903 return 16;
23904 widest_int min_size
23905 = constant_lower_bound (wi::to_poly_widest (TYPE_SIZE (type)));
23906 return wi::umin (min_size, 128).to_uhwi ();
23909 /* Implement target hook TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT. */
23910 static poly_uint64
23911 aarch64_vectorize_preferred_vector_alignment (const_tree type)
23913 if (aarch64_sve_data_mode_p (TYPE_MODE (type)))
23915 /* If the length of the vector is a fixed power of 2, try to align
23916 to that length, otherwise don't try to align at all. */
23917 HOST_WIDE_INT result;
23918 if (!GET_MODE_BITSIZE (TYPE_MODE (type)).is_constant (&result)
23919 || !pow2p_hwi (result))
23920 result = TYPE_ALIGN (TREE_TYPE (type));
23921 return result;
23923 return TYPE_ALIGN (type);
23926 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE. */
23927 static bool
23928 aarch64_simd_vector_alignment_reachable (const_tree type, bool is_packed)
23930 if (is_packed)
23931 return false;
23933 /* For fixed-length vectors, check that the vectorizer will aim for
23934 full-vector alignment. This isn't true for generic GCC vectors
23935 that are wider than the ABI maximum of 128 bits. */
23936 poly_uint64 preferred_alignment =
23937 aarch64_vectorize_preferred_vector_alignment (type);
23938 if (TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
23939 && maybe_ne (wi::to_widest (TYPE_SIZE (type)),
23940 preferred_alignment))
23941 return false;
23943 /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned. */
23944 return true;
23947 /* Return true if the vector misalignment factor is supported by the
23948 target. */
23949 static bool
23950 aarch64_builtin_support_vector_misalignment (machine_mode mode,
23951 const_tree type, int misalignment,
23952 bool is_packed)
23954 if (TARGET_SIMD && STRICT_ALIGNMENT)
23956 /* Return if movmisalign pattern is not supported for this mode. */
23957 if (optab_handler (movmisalign_optab, mode) == CODE_FOR_nothing)
23958 return false;
23960 /* Misalignment factor is unknown at compile time. */
23961 if (misalignment == -1)
23962 return false;
23964 return default_builtin_support_vector_misalignment (mode, type, misalignment,
23965 is_packed);
23968 /* If VALS is a vector constant that can be loaded into a register
23969 using DUP, generate instructions to do so and return an RTX to
23970 assign to the register. Otherwise return NULL_RTX. */
23971 static rtx
23972 aarch64_simd_dup_constant (rtx vals)
23974 machine_mode mode = GET_MODE (vals);
23975 machine_mode inner_mode = GET_MODE_INNER (mode);
23976 rtx x;
23978 if (!const_vec_duplicate_p (vals, &x))
23979 return NULL_RTX;
23981 /* We can load this constant by using DUP and a constant in a
23982 single ARM register. This will be cheaper than a vector
23983 load. */
23984 x = force_reg (inner_mode, x);
23985 return gen_vec_duplicate (mode, x);
23989 /* Generate code to load VALS, which is a PARALLEL containing only
23990 constants (for vec_init) or CONST_VECTOR, efficiently into a
23991 register. Returns an RTX to copy into the register, or NULL_RTX
23992 for a PARALLEL that cannot be converted into a CONST_VECTOR. */
23993 static rtx
23994 aarch64_simd_make_constant (rtx vals)
23996 machine_mode mode = GET_MODE (vals);
23997 rtx const_dup;
23998 rtx const_vec = NULL_RTX;
23999 int n_const = 0;
24000 int i;
24002 if (CONST_VECTOR_P (vals))
24003 const_vec = vals;
24004 else if (GET_CODE (vals) == PARALLEL)
24006 /* A CONST_VECTOR must contain only CONST_INTs and
24007 CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
24008 Only store valid constants in a CONST_VECTOR. */
24009 int n_elts = XVECLEN (vals, 0);
24010 for (i = 0; i < n_elts; ++i)
24012 rtx x = XVECEXP (vals, 0, i);
24013 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
24014 n_const++;
24016 if (n_const == n_elts)
24017 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
24019 else
24020 gcc_unreachable ();
24022 if (const_vec != NULL_RTX
24023 && aarch64_simd_valid_mov_imm (const_vec))
24024 /* Load using MOVI/MVNI. */
24025 return const_vec;
24026 else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX)
24027 /* Loaded using DUP. */
24028 return const_dup;
24029 else if (const_vec != NULL_RTX)
24030 /* Load from constant pool. We cannot take advantage of single-cycle
24031 LD1 because we need a PC-relative addressing mode. */
24032 return const_vec;
24033 else
24034 /* A PARALLEL containing something not valid inside CONST_VECTOR.
24035 We cannot construct an initializer. */
24036 return NULL_RTX;
24039 /* A subroutine of aarch64_expand_vector_init, with the same interface.
24040 The caller has already tried a divide-and-conquer approach, so do
24041 not consider that case here. */
24043 void
24044 aarch64_expand_vector_init_fallback (rtx target, rtx vals)
24046 machine_mode mode = GET_MODE (target);
24047 scalar_mode inner_mode = GET_MODE_INNER (mode);
24048 /* The number of vector elements. */
24049 int n_elts = XVECLEN (vals, 0);
24050 /* The number of vector elements which are not constant. */
24051 int n_var = 0;
24052 rtx any_const = NULL_RTX;
24053 /* The first element of vals. */
24054 rtx v0 = XVECEXP (vals, 0, 0);
24055 bool all_same = true;
24057 /* This is a special vec_init<M><N> where N is not an element mode but a
24058 vector mode with half the elements of M. We expect to find two entries
24059 of mode N in VALS and we must put their concatenation into TARGET. */
24060 if (XVECLEN (vals, 0) == 2 && VECTOR_MODE_P (GET_MODE (XVECEXP (vals, 0, 0))))
24062 machine_mode narrow_mode = GET_MODE (XVECEXP (vals, 0, 0));
24063 gcc_assert (GET_MODE_INNER (narrow_mode) == inner_mode
24064 && known_eq (GET_MODE_SIZE (mode),
24065 2 * GET_MODE_SIZE (narrow_mode)));
24066 emit_insn (gen_aarch64_vec_concat (narrow_mode, target,
24067 XVECEXP (vals, 0, 0),
24068 XVECEXP (vals, 0, 1)));
24069 return;
24072 /* Count the number of variable elements to initialise. */
24073 for (int i = 0; i < n_elts; ++i)
24075 rtx x = XVECEXP (vals, 0, i);
24076 if (!(CONST_INT_P (x) || CONST_DOUBLE_P (x)))
24077 ++n_var;
24078 else
24079 any_const = x;
24081 all_same &= rtx_equal_p (x, v0);
24084 /* No variable elements, hand off to aarch64_simd_make_constant which knows
24085 how best to handle this. */
24086 if (n_var == 0)
24088 rtx constant = aarch64_simd_make_constant (vals);
24089 if (constant != NULL_RTX)
24091 emit_move_insn (target, constant);
24092 return;
24096 /* Splat a single non-constant element if we can. */
24097 if (all_same)
24099 rtx x = force_reg (inner_mode, v0);
24100 aarch64_emit_move (target, gen_vec_duplicate (mode, x));
24101 return;
24104 enum insn_code icode = optab_handler (vec_set_optab, mode);
24105 gcc_assert (icode != CODE_FOR_nothing);
24107 /* If there are only variable elements, try to optimize
24108 the insertion using dup for the most common element
24109 followed by insertions. */
24111 /* The algorithm will fill matches[*][0] with the earliest matching element,
24112 and matches[X][1] with the count of duplicate elements (if X is the
24113 earliest element which has duplicates). */
24115 if (n_var >= n_elts - 1 && n_elts <= 16)
24117 int matches[16][2] = {0};
24118 for (int i = 0; i < n_elts; i++)
24120 for (int j = 0; j <= i; j++)
24122 if (rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, j)))
24124 matches[i][0] = j;
24125 matches[j][1]++;
24126 break;
24130 int maxelement = 0;
24131 int maxv = 0;
24132 rtx const_elem = NULL_RTX;
24133 int const_elem_pos = 0;
24135 for (int i = 0; i < n_elts; i++)
24137 if (matches[i][1] > maxv)
24139 maxelement = i;
24140 maxv = matches[i][1];
24142 if (CONST_INT_P (XVECEXP (vals, 0, i))
24143 || CONST_DOUBLE_P (XVECEXP (vals, 0, i)))
24145 const_elem_pos = i;
24146 const_elem = XVECEXP (vals, 0, i);
24150 /* Create a duplicate of the most common element, unless all elements
24151 are equally useless to us, in which case just immediately set the
24152 vector register using the first element. */
24154 if (maxv == 1)
24156 /* For vectors of two 64-bit elements, we can do even better. */
24157 if (n_elts == 2
24158 && (inner_mode == E_DImode
24159 || inner_mode == E_DFmode))
24162 rtx x0 = XVECEXP (vals, 0, 0);
24163 rtx x1 = XVECEXP (vals, 0, 1);
24164 /* Combine can pick up this case, but handling it directly
24165 here leaves clearer RTL.
24167 This is load_pair_lanes<mode>, and also gives us a clean-up
24168 for store_pair_lanes<mode>. */
24169 if (memory_operand (x0, inner_mode)
24170 && memory_operand (x1, inner_mode)
24171 && aarch64_mergeable_load_pair_p (mode, x0, x1))
24173 rtx t;
24174 if (inner_mode == DFmode)
24175 t = gen_load_pair_lanesdf (target, x0, x1);
24176 else
24177 t = gen_load_pair_lanesdi (target, x0, x1);
24178 emit_insn (t);
24179 return;
24182 /* The subreg-move sequence below will move into lane zero of the
24183 vector register. For big-endian we want that position to hold
24184 the last element of VALS. */
24185 maxelement = BYTES_BIG_ENDIAN ? n_elts - 1 : 0;
24187 /* If we have a single constant element, use that for duplicating
24188 instead. */
24189 if (const_elem)
24191 maxelement = const_elem_pos;
24192 aarch64_emit_move (target, gen_vec_duplicate (mode, const_elem));
24194 else
24196 rtx x = force_reg (inner_mode, XVECEXP (vals, 0, maxelement));
24197 aarch64_emit_move (target, lowpart_subreg (mode, x, inner_mode));
24200 else
24202 rtx x = force_reg (inner_mode, XVECEXP (vals, 0, maxelement));
24203 aarch64_emit_move (target, gen_vec_duplicate (mode, x));
24206 /* Insert the rest. */
24207 for (int i = 0; i < n_elts; i++)
24209 rtx x = XVECEXP (vals, 0, i);
24210 if (matches[i][0] == maxelement)
24211 continue;
24212 x = force_reg (inner_mode, x);
24213 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
24215 return;
24218 /* Initialise a vector which is part-variable. We want to first try
24219 to build those lanes which are constant in the most efficient way we
24220 can. */
24221 if (n_var != n_elts)
24223 rtx copy = copy_rtx (vals);
24225 /* Load constant part of vector. We really don't care what goes into the
24226 parts we will overwrite, but we're more likely to be able to load the
24227 constant efficiently if it has fewer, larger, repeating parts
24228 (see aarch64_simd_valid_imm). */
24229 for (int i = 0; i < n_elts; i++)
24231 rtx x = XVECEXP (vals, 0, i);
24232 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
24233 continue;
24234 rtx subst = any_const;
24235 for (int bit = n_elts / 2; bit > 0; bit /= 2)
24237 /* Look in the copied vector, as more elements are const. */
24238 rtx test = XVECEXP (copy, 0, i ^ bit);
24239 if (CONST_INT_P (test) || CONST_DOUBLE_P (test))
24241 subst = test;
24242 break;
24245 XVECEXP (copy, 0, i) = subst;
24247 aarch64_expand_vector_init_fallback (target, copy);
24250 /* Insert the variable lanes directly. */
24251 for (int i = 0; i < n_elts; i++)
24253 rtx x = XVECEXP (vals, 0, i);
24254 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
24255 continue;
24256 x = force_reg (inner_mode, x);
24257 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
24261 /* Return even or odd half of VALS depending on EVEN_P. */
24263 static rtx
24264 aarch64_unzip_vector_init (machine_mode mode, rtx vals, bool even_p)
24266 int n = XVECLEN (vals, 0);
24267 machine_mode new_mode
24268 = aarch64_simd_container_mode (GET_MODE_INNER (mode),
24269 GET_MODE_BITSIZE (mode).to_constant () / 2);
24270 rtvec vec = rtvec_alloc (n / 2);
24271 for (int i = 0; i < n / 2; i++)
24272 RTVEC_ELT (vec, i) = (even_p) ? XVECEXP (vals, 0, 2 * i)
24273 : XVECEXP (vals, 0, 2 * i + 1);
24274 return gen_rtx_PARALLEL (new_mode, vec);
24277 /* Return true if SET is a scalar move. */
24279 static bool
24280 scalar_move_insn_p (rtx set)
24282 rtx src = SET_SRC (set);
24283 rtx dest = SET_DEST (set);
24284 return (is_a<scalar_mode> (GET_MODE (dest))
24285 && aarch64_mov_operand (src, GET_MODE (dest)));
24288 /* Similar to seq_cost, but ignore cost for scalar moves. */
24290 static unsigned
24291 seq_cost_ignoring_scalar_moves (const rtx_insn *seq, bool speed)
24293 unsigned cost = 0;
24295 for (; seq; seq = NEXT_INSN (seq))
24296 if (NONDEBUG_INSN_P (seq))
24298 if (rtx set = single_set (seq))
24300 if (!scalar_move_insn_p (set))
24301 cost += set_rtx_cost (set, speed);
24303 else
24305 int this_cost = insn_cost (CONST_CAST_RTX_INSN (seq), speed);
24306 if (this_cost > 0)
24307 cost += this_cost;
24308 else
24309 cost++;
24313 return cost;
24316 /* Expand a vector initialization sequence, such that TARGET is
24317 initialized to contain VALS. */
24319 void
24320 aarch64_expand_vector_init (rtx target, rtx vals)
24322 /* Try decomposing the initializer into even and odd halves and
24323 then ZIP them together. Use the resulting sequence if it is
24324 strictly cheaper than loading VALS directly.
24326 Prefer the fallback sequence in the event of a tie, since it
24327 will tend to use fewer registers. */
24329 machine_mode mode = GET_MODE (target);
24330 int n_elts = XVECLEN (vals, 0);
24332 if (n_elts < 4
24333 || maybe_ne (GET_MODE_BITSIZE (mode), 128))
24335 aarch64_expand_vector_init_fallback (target, vals);
24336 return;
24339 start_sequence ();
24340 rtx halves[2];
24341 unsigned costs[2];
24342 for (int i = 0; i < 2; i++)
24344 start_sequence ();
24345 rtx new_vals = aarch64_unzip_vector_init (mode, vals, i == 0);
24346 rtx tmp_reg = gen_reg_rtx (GET_MODE (new_vals));
24347 aarch64_expand_vector_init (tmp_reg, new_vals);
24348 halves[i] = gen_rtx_SUBREG (mode, tmp_reg, 0);
24349 rtx_insn *rec_seq = get_insns ();
24350 end_sequence ();
24351 costs[i] = seq_cost_ignoring_scalar_moves (rec_seq, !optimize_size);
24352 emit_insn (rec_seq);
24355 rtvec v = gen_rtvec (2, halves[0], halves[1]);
24356 rtx_insn *zip1_insn
24357 = emit_set_insn (target, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
24358 unsigned seq_total_cost
24359 = (!optimize_size) ? std::max (costs[0], costs[1]) : costs[0] + costs[1];
24360 seq_total_cost += insn_cost (zip1_insn, !optimize_size);
24362 rtx_insn *seq = get_insns ();
24363 end_sequence ();
24365 start_sequence ();
24366 aarch64_expand_vector_init_fallback (target, vals);
24367 rtx_insn *fallback_seq = get_insns ();
24368 unsigned fallback_seq_cost
24369 = seq_cost_ignoring_scalar_moves (fallback_seq, !optimize_size);
24370 end_sequence ();
24372 emit_insn (seq_total_cost < fallback_seq_cost ? seq : fallback_seq);
24375 /* Emit RTL corresponding to:
24376 insr TARGET, ELEM. */
24378 static void
24379 emit_insr (rtx target, rtx elem)
24381 machine_mode mode = GET_MODE (target);
24382 scalar_mode elem_mode = GET_MODE_INNER (mode);
24383 elem = force_reg (elem_mode, elem);
24385 insn_code icode = optab_handler (vec_shl_insert_optab, mode);
24386 gcc_assert (icode != CODE_FOR_nothing);
24387 emit_insn (GEN_FCN (icode) (target, target, elem));
24390 /* Subroutine of aarch64_sve_expand_vector_init for handling
24391 trailing constants.
24392 This function works as follows:
24393 (a) Create a new vector consisting of trailing constants.
24394 (b) Initialize TARGET with the constant vector using emit_move_insn.
24395 (c) Insert remaining elements in TARGET using insr.
24396 NELTS is the total number of elements in the original vector, while
24397 NELTS_REQD is the number of elements that are actually
24398 significant.
24400 ??? The heuristic used is to do the above only if the number of constants
24401 is at least half the total number of elements. May need fine-tuning. */
24403 static bool
24404 aarch64_sve_expand_vector_init_handle_trailing_constants
24405 (rtx target, const rtx_vector_builder &builder, int nelts, int nelts_reqd)
24407 machine_mode mode = GET_MODE (target);
24408 scalar_mode elem_mode = GET_MODE_INNER (mode);
24409 int n_trailing_constants = 0;
24411 for (int i = nelts_reqd - 1;
24412 i >= 0 && valid_for_const_vector_p (elem_mode, builder.elt (i));
24413 i--)
24414 n_trailing_constants++;
24416 if (n_trailing_constants >= nelts_reqd / 2)
24418 /* Try to use the natural pattern of BUILDER to extend the trailing
24419 constant elements to a full vector. Replace any variables in the
24420 extra elements with zeros.
24422 ??? It would be better if the builders supported "don't care"
24423 elements, with the builder filling in whichever elements
24424 give the most compact encoding. */
24425 rtx_vector_builder v (mode, nelts, 1);
24426 for (int i = 0; i < nelts; i++)
24428 rtx x = builder.elt (i + nelts_reqd - n_trailing_constants);
24429 if (!valid_for_const_vector_p (elem_mode, x))
24430 x = CONST0_RTX (elem_mode);
24431 v.quick_push (x);
24433 rtx const_vec = v.build ();
24434 emit_move_insn (target, const_vec);
24436 for (int i = nelts_reqd - n_trailing_constants - 1; i >= 0; i--)
24437 emit_insr (target, builder.elt (i));
24439 return true;
24442 return false;
24445 /* Subroutine of aarch64_sve_expand_vector_init.
24446 Works as follows:
24447 (a) Initialize TARGET by broadcasting element NELTS_REQD - 1 of BUILDER.
24448 (b) Skip trailing elements from BUILDER, which are the same as
24449 element NELTS_REQD - 1.
24450 (c) Insert earlier elements in reverse order in TARGET using insr. */
24452 static void
24453 aarch64_sve_expand_vector_init_insert_elems (rtx target,
24454 const rtx_vector_builder &builder,
24455 int nelts_reqd)
24457 machine_mode mode = GET_MODE (target);
24458 scalar_mode elem_mode = GET_MODE_INNER (mode);
24460 struct expand_operand ops[2];
24461 enum insn_code icode = optab_handler (vec_duplicate_optab, mode);
24462 gcc_assert (icode != CODE_FOR_nothing);
24464 create_output_operand (&ops[0], target, mode);
24465 create_input_operand (&ops[1], builder.elt (nelts_reqd - 1), elem_mode);
24466 expand_insn (icode, 2, ops);
24468 int ndups = builder.count_dups (nelts_reqd - 1, -1, -1);
24469 for (int i = nelts_reqd - ndups - 1; i >= 0; i--)
24470 emit_insr (target, builder.elt (i));
24473 /* Subroutine of aarch64_sve_expand_vector_init to handle the case
24474 when all trailing elements of BUILDER are the same.
24475 This works as follows:
24476 (a) Use expand_insn interface to broadcast last vector element in TARGET.
24477 (b) Insert remaining elements in TARGET using insr.
24479 ??? The heuristic used is to do the above if the number of identical trailing
24480 elements is at least 3/4 of the total number of elements, loosely based on
24481 the heuristic from mostly_zeros_p. May need fine-tuning. */
24483 static bool
24484 aarch64_sve_expand_vector_init_handle_trailing_same_elem
24485 (rtx target, const rtx_vector_builder &builder, int nelts_reqd)
24487 int ndups = builder.count_dups (nelts_reqd - 1, -1, -1);
24488 if (ndups >= (3 * nelts_reqd) / 4)
24490 aarch64_sve_expand_vector_init_insert_elems (target, builder,
24491 nelts_reqd - ndups + 1);
24492 return true;
24495 return false;
24498 /* Initialize register TARGET from BUILDER. NELTS is the constant number
24499 of elements in BUILDER.
24501 The function tries to initialize TARGET from BUILDER if it fits one
24502 of the special cases outlined below.
24504 Failing that, the function divides BUILDER into two sub-vectors:
24505 v_even = even elements of BUILDER;
24506 v_odd = odd elements of BUILDER;
24508 and recursively calls itself with v_even and v_odd.
24510 if (recursive call succeeded for v_even or v_odd)
24511 TARGET = zip (v_even, v_odd)
24513 The function returns true if it managed to build TARGET from BUILDER
24514 with one of the special cases, false otherwise.
24516 Example: {a, 1, b, 2, c, 3, d, 4}
24518 The vector gets divided into:
24519 v_even = {a, b, c, d}
24520 v_odd = {1, 2, 3, 4}
24522 aarch64_sve_expand_vector_init(v_odd) hits case 1 and
24523 initializes tmp2 from the constant vector v_odd using emit_move_insn.
24525 aarch64_sve_expand_vector_init(v_even) fails since v_even contains
24526 4 elements, so we construct tmp1 from v_even using insr:
24527 tmp1 = dup(d)
24528 insr tmp1, c
24529 insr tmp1, b
24530 insr tmp1, a
24532 And finally:
24533 TARGET = zip (tmp1, tmp2)
24534 which sets TARGET to {a, 1, b, 2, c, 3, d, 4}. */
24536 static bool
24537 aarch64_sve_expand_vector_init (rtx target, const rtx_vector_builder &builder,
24538 int nelts, int nelts_reqd)
24540 machine_mode mode = GET_MODE (target);
24542 /* Case 1: Vector contains trailing constants. */
24544 if (aarch64_sve_expand_vector_init_handle_trailing_constants
24545 (target, builder, nelts, nelts_reqd))
24546 return true;
24548 /* Case 2: Vector contains leading constants. */
24550 rtx_vector_builder rev_builder (mode, nelts_reqd, 1);
24551 for (int i = 0; i < nelts_reqd; i++)
24552 rev_builder.quick_push (builder.elt (nelts_reqd - i - 1));
24553 rev_builder.finalize ();
24555 if (aarch64_sve_expand_vector_init_handle_trailing_constants
24556 (target, rev_builder, nelts, nelts_reqd))
24558 emit_insn (gen_aarch64_sve_rev (mode, target, target));
24559 return true;
24562 /* Case 3: Vector contains trailing same element. */
24564 if (aarch64_sve_expand_vector_init_handle_trailing_same_elem
24565 (target, builder, nelts_reqd))
24566 return true;
24568 /* Case 4: Vector contains leading same element. */
24570 if (aarch64_sve_expand_vector_init_handle_trailing_same_elem
24571 (target, rev_builder, nelts_reqd) && nelts_reqd == nelts)
24573 emit_insn (gen_aarch64_sve_rev (mode, target, target));
24574 return true;
24577 /* Avoid recursing below 4 elements.
24578 ??? The threshold 4 may need fine-tuning. */
24580 if (nelts_reqd <= 4)
24581 return false;
24583 rtx_vector_builder v_even (mode, nelts, 1);
24584 rtx_vector_builder v_odd (mode, nelts, 1);
24586 for (int i = 0; i < nelts * 2; i += 2)
24588 v_even.quick_push (builder.elt (i));
24589 v_odd.quick_push (builder.elt (i + 1));
24592 v_even.finalize ();
24593 v_odd.finalize ();
24595 rtx tmp1 = gen_reg_rtx (mode);
24596 bool did_even_p = aarch64_sve_expand_vector_init (tmp1, v_even,
24597 nelts, nelts_reqd / 2);
24599 rtx tmp2 = gen_reg_rtx (mode);
24600 bool did_odd_p = aarch64_sve_expand_vector_init (tmp2, v_odd,
24601 nelts, nelts_reqd / 2);
24603 if (!did_even_p && !did_odd_p)
24604 return false;
24606 /* Initialize v_even and v_odd using INSR if it didn't match any of the
24607 special cases and zip v_even, v_odd. */
24609 if (!did_even_p)
24610 aarch64_sve_expand_vector_init_insert_elems (tmp1, v_even, nelts_reqd / 2);
24612 if (!did_odd_p)
24613 aarch64_sve_expand_vector_init_insert_elems (tmp2, v_odd, nelts_reqd / 2);
24615 rtvec v = gen_rtvec (2, tmp1, tmp2);
24616 emit_set_insn (target, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
24617 return true;
24620 /* Initialize register TARGET from the elements in PARALLEL rtx VALS. */
24622 void
24623 aarch64_sve_expand_vector_init (rtx target, rtx vals)
24625 machine_mode mode = GET_MODE (target);
24626 int nelts = XVECLEN (vals, 0);
24628 rtx_vector_builder v (mode, nelts, 1);
24629 for (int i = 0; i < nelts; i++)
24630 v.quick_push (XVECEXP (vals, 0, i));
24631 v.finalize ();
24633 /* If neither sub-vectors of v could be initialized specially,
24634 then use INSR to insert all elements from v into TARGET.
24635 ??? This might not be optimal for vectors with large
24636 initializers like 16-element or above.
24637 For nelts < 4, it probably isn't useful to handle specially. */
24639 if (nelts < 4
24640 || !aarch64_sve_expand_vector_init (target, v, nelts, nelts))
24641 aarch64_sve_expand_vector_init_insert_elems (target, v, nelts);
24644 /* Initialize register TARGET from the two vector subelements in PARALLEL
24645 rtx VALS. */
24647 void
24648 aarch64_sve_expand_vector_init_subvector (rtx target, rtx vals)
24650 machine_mode mode = GET_MODE (target);
24651 int nelts = XVECLEN (vals, 0);
24653 gcc_assert (nelts % 2 == 0);
24655 /* We must be concatenating vectors. */
24656 machine_mode elem_mode = GET_MODE (XVECEXP (vals, 0, 0));
24657 gcc_assert (VECTOR_MODE_P (elem_mode));
24659 auto_vec<rtx> worklist;
24660 machine_mode wider_mode = elem_mode;
24662 for (int i = 0; i < nelts; i++)
24663 worklist.safe_push (force_reg (elem_mode, XVECEXP (vals, 0, i)));
24665 /* Keep widening pairwise to have maximum throughput. */
24666 while (nelts >= 2)
24668 wider_mode
24669 = related_vector_mode (wider_mode, GET_MODE_INNER (wider_mode),
24670 GET_MODE_NUNITS (wider_mode) * 2).require ();
24672 for (int i = 0; i < nelts; i += 2)
24674 rtx arg0 = worklist[i];
24675 rtx arg1 = worklist[i+1];
24676 gcc_assert (GET_MODE (arg0) == GET_MODE (arg1));
24678 rtx tmp = gen_reg_rtx (wider_mode);
24679 emit_insn (gen_aarch64_pack_partial (wider_mode, tmp, arg0, arg1));
24680 worklist[i / 2] = tmp;
24683 nelts /= 2;
24686 gcc_assert (wider_mode == mode);
24687 emit_move_insn (target, worklist[0]);
24689 return;
24692 /* Check whether VALUE is a vector constant in which every element
24693 is either a power of 2 or a negated power of 2. If so, return
24694 a constant vector of log2s, and flip CODE between PLUS and MINUS
24695 if VALUE contains negated powers of 2. Return NULL_RTX otherwise. */
24697 static rtx
24698 aarch64_convert_mult_to_shift (rtx value, rtx_code &code)
24700 if (!CONST_VECTOR_P (value))
24701 return NULL_RTX;
24703 rtx_vector_builder builder;
24704 if (!builder.new_unary_operation (GET_MODE (value), value, false))
24705 return NULL_RTX;
24707 scalar_mode int_mode = GET_MODE_INNER (GET_MODE (value));
24708 /* 1 if the result of the multiplication must be negated,
24709 0 if it mustn't, or -1 if we don't yet care. */
24710 int negate = -1;
24711 unsigned int encoded_nelts = const_vector_encoded_nelts (value);
24712 for (unsigned int i = 0; i < encoded_nelts; ++i)
24714 rtx elt = CONST_VECTOR_ENCODED_ELT (value, i);
24715 if (!CONST_SCALAR_INT_P (elt))
24716 return NULL_RTX;
24717 rtx_mode_t val (elt, int_mode);
24718 wide_int pow2 = wi::neg (val);
24719 if (val != pow2)
24721 /* It matters whether we negate or not. Make that choice,
24722 and make sure that it's consistent with previous elements. */
24723 if (negate == !wi::neg_p (val))
24724 return NULL_RTX;
24725 negate = wi::neg_p (val);
24726 if (!negate)
24727 pow2 = val;
24729 /* POW2 is now the value that we want to be a power of 2. */
24730 int shift = wi::exact_log2 (pow2);
24731 if (shift < 0)
24732 return NULL_RTX;
24733 builder.quick_push (gen_int_mode (shift, int_mode));
24735 if (negate == -1)
24736 /* PLUS and MINUS are equivalent; canonicalize on PLUS. */
24737 code = PLUS;
24738 else if (negate == 1)
24739 code = code == PLUS ? MINUS : PLUS;
24740 return builder.build ();
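/* For example, a CONST_VECTOR of all 4s becomes a vector of shift counts of
   2 with CODE left unchanged, so x * 4 + y can be emitted as (x << 2) + y;
   a CONST_VECTOR of all -8s becomes shift counts of 3 and CODE flips
   between PLUS and MINUS, so x * -8 + y becomes y - (x << 3).  */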
24743 /* Prepare for an integer SVE multiply-add or multiply-subtract pattern;
24744 CODE is PLUS for the former and MINUS for the latter. OPERANDS is the
24745 operands array, in the same order as for fma_optab. Return true if
24746 the function emitted all the necessary instructions, false if the caller
24747 should generate the pattern normally with the new OPERANDS array. */
24749 bool
24750 aarch64_prepare_sve_int_fma (rtx *operands, rtx_code code)
24752 machine_mode mode = GET_MODE (operands[0]);
24753 if (rtx shifts = aarch64_convert_mult_to_shift (operands[2], code))
24755 rtx product = expand_binop (mode, vashl_optab, operands[1], shifts,
24756 NULL_RTX, true, OPTAB_DIRECT);
24757 force_expand_binop (mode, code == PLUS ? add_optab : sub_optab,
24758 operands[3], product, operands[0], true,
24759 OPTAB_DIRECT);
24760 return true;
24762 operands[2] = force_reg (mode, operands[2]);
24763 return false;
24766 /* Likewise, but for a conditional pattern. */
24768 bool
24769 aarch64_prepare_sve_cond_int_fma (rtx *operands, rtx_code code)
24771 machine_mode mode = GET_MODE (operands[0]);
24772 if (rtx shifts = aarch64_convert_mult_to_shift (operands[3], code))
24774 rtx product = expand_binop (mode, vashl_optab, operands[2], shifts,
24775 NULL_RTX, true, OPTAB_DIRECT);
24776 emit_insn (gen_cond (code, mode, operands[0], operands[1],
24777 operands[4], product, operands[5]));
24778 return true;
24780 operands[3] = force_reg (mode, operands[3]);
24781 return false;
24784 static unsigned HOST_WIDE_INT
24785 aarch64_shift_truncation_mask (machine_mode mode)
24787 if (!SHIFT_COUNT_TRUNCATED || aarch64_vector_data_mode_p (mode))
24788 return 0;
24789 return GET_MODE_UNIT_BITSIZE (mode) - 1;
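/* For example, on a scalar mode such as DImode this returns 63 when
   SHIFT_COUNT_TRUNCATED holds, telling the midend that shift counts are
   truncated to the low 6 bits; vector data modes always get 0, so no such
   truncation is assumed for them.  */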
24792 /* Select a format to encode pointers in exception handling data. */
24794 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED, int global)
24796 int type;
24797 switch (aarch64_cmodel)
24799 case AARCH64_CMODEL_TINY:
24800 case AARCH64_CMODEL_TINY_PIC:
24801 case AARCH64_CMODEL_SMALL:
24802 case AARCH64_CMODEL_SMALL_PIC:
24803 case AARCH64_CMODEL_SMALL_SPIC:
24804 /* text+got+data < 4Gb. 4-byte signed relocs are sufficient
24805 for everything. */
24806 type = DW_EH_PE_sdata4;
24807 break;
24808 default:
24809 /* No assumptions here. 8-byte relocs required. */
24810 type = DW_EH_PE_sdata8;
24811 break;
24813 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
24816 /* Output .variant_pcs for aarch64_vector_pcs function symbols. */
24818 static void
24819 aarch64_asm_output_variant_pcs (FILE *stream, const tree decl, const char* name)
24821 if (TREE_CODE (decl) == FUNCTION_DECL)
24823 arm_pcs pcs = (arm_pcs) fndecl_abi (decl).id ();
24824 if (pcs == ARM_PCS_SIMD || pcs == ARM_PCS_SVE)
24826 fprintf (stream, "\t.variant_pcs\t");
24827 assemble_name (stream, name);
24828 fprintf (stream, "\n");
24833 /* The last .arch and .tune assembly strings that we printed. */
24834 static std::string aarch64_last_printed_arch_string;
24835 static std::string aarch64_last_printed_tune_string;
24837 /* Implement ASM_DECLARE_FUNCTION_NAME. Output the ISA features used
24838 by the function fndecl. */
24840 void
24841 aarch64_declare_function_name (FILE *stream, const char* name,
24842 tree fndecl)
24844 tree target_parts = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
24846 struct cl_target_option *targ_options;
24847 if (target_parts)
24848 targ_options = TREE_TARGET_OPTION (target_parts);
24849 else
24850 targ_options = TREE_TARGET_OPTION (target_option_current_node);
24851 gcc_assert (targ_options);
24853 auto isa_flags = aarch64_get_asm_isa_flags (targ_options);
24854 aarch64_arch arch = targ_options->x_selected_arch;
24855 std::string to_print
24856 = aarch64_get_arch_string_for_assembler (arch, isa_flags);
24857 /* Only update the assembler .arch string if it is distinct from the last
24858 such string we printed. */
24859 if (to_print != aarch64_last_printed_arch_string)
24861 asm_fprintf (asm_out_file, "\t.arch %s\n", to_print.c_str ());
24862 aarch64_last_printed_arch_string = to_print;
24865 /* Print the cpu name we're tuning for in the comments; it might be
24866 useful to readers of the generated asm. Do it only when it changes
24867 from function to function and verbose assembly is requested. */
24868 const struct processor *this_tune
24869 = aarch64_get_tune_cpu (targ_options->x_selected_tune);
24871 if (flag_debug_asm && aarch64_last_printed_tune_string != this_tune->name)
24873 asm_fprintf (asm_out_file, "\t" ASM_COMMENT_START ".tune %s\n",
24874 this_tune->name);
24875 aarch64_last_printed_tune_string = this_tune->name;
24878 aarch64_asm_output_variant_pcs (stream, fndecl, name);
24880 /* Don't forget the type directive for ELF. */
24881 ASM_OUTPUT_TYPE_DIRECTIVE (stream, name, "function");
24882 ASM_OUTPUT_FUNCTION_LABEL (stream, name, fndecl);
24884 cfun->machine->label_is_assembled = true;
24887 /* Implement PRINT_PATCHABLE_FUNCTION_ENTRY. */
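/* Illustrative note: when BTI is enabled and the function can be reached
   indirectly, the code below keeps the BTI C landing pad first and places
   the NOP patch area after it, e.g. with -fpatchable-function-entry=2:

   foo:
	bti	c
	nop
	nop

   Otherwise the patch area is emitted at the very start of the function
   (or before the entry label if it has not been assembled yet). */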
24889 void
24890 aarch64_print_patchable_function_entry (FILE *file,
24891 unsigned HOST_WIDE_INT patch_area_size,
24892 bool record_p)
24894 if (!cfun->machine->label_is_assembled)
24896 /* Emit the patching area before the entry label, if any. */
24897 default_print_patchable_function_entry (file, patch_area_size,
24898 record_p);
24899 return;
24902 rtx pa = gen_patchable_area (GEN_INT (patch_area_size),
24903 GEN_INT (record_p));
24904 basic_block bb = ENTRY_BLOCK_PTR_FOR_FN (cfun)->next_bb;
24906 if (!aarch_bti_enabled ()
24907 || cgraph_node::get (cfun->decl)->only_called_directly_p ())
24909 /* Emit the patchable_area at the beginning of the function. */
24910 rtx_insn *insn = emit_insn_before (pa, BB_HEAD (bb));
24911 INSN_ADDRESSES_NEW (insn, -1);
24912 return;
24915 rtx_insn *insn = next_real_nondebug_insn (get_insns ());
24916 if (!insn
24917 || !INSN_P (insn)
24918 || GET_CODE (PATTERN (insn)) != UNSPEC_VOLATILE
24919 || XINT (PATTERN (insn), 1) != UNSPECV_BTI_C)
24921 /* Emit a BTI_C. */
24922 insn = emit_insn_before (gen_bti_c (), BB_HEAD (bb));
24925 /* Emit the patchable_area after BTI_C. */
24926 insn = emit_insn_after (pa, insn);
24927 INSN_ADDRESSES_NEW (insn, -1);
24930 /* Output patchable area. */
24932 void
24933 aarch64_output_patchable_area (unsigned int patch_area_size, bool record_p)
24935 default_print_patchable_function_entry (asm_out_file, patch_area_size,
24936 record_p);
24939 /* Implement ASM_OUTPUT_DEF_FROM_DECLS. Output .variant_pcs for aliases. */
24941 void
24942 aarch64_asm_output_alias (FILE *stream, const tree decl, const tree target)
24944 const char *name = XSTR (XEXP (DECL_RTL (decl), 0), 0);
24945 const char *value = IDENTIFIER_POINTER (target);
24946 aarch64_asm_output_variant_pcs (stream, decl, name);
24947 ASM_OUTPUT_DEF (stream, name, value);
24950 /* Implement ASM_OUTPUT_EXTERNAL. Output .variant_pcs for undefined
24951 function symbol references. */
24953 void
24954 aarch64_asm_output_external (FILE *stream, tree decl, const char* name)
24956 default_elf_asm_output_external (stream, decl, name);
24957 aarch64_asm_output_variant_pcs (stream, decl, name);
24960 /* Triggered after a .cfi_startproc directive is emitted into the assembly file.
24961 Used to output the .cfi_b_key_frame directive when signing the current
24962 function with the B key. */
24964 void
24965 aarch64_post_cfi_startproc (FILE *f, tree ignored ATTRIBUTE_UNUSED)
24967 if (cfun->machine->frame.laid_out && aarch64_return_address_signing_enabled ()
24968 && aarch64_ra_sign_key == AARCH64_KEY_B)
24969 asm_fprintf (f, "\t.cfi_b_key_frame\n");
24972 /* Implement TARGET_ASM_FILE_START. Output the assembly header. */
24974 static void
24975 aarch64_start_file (void)
24977 struct cl_target_option *default_options
24978 = TREE_TARGET_OPTION (target_option_default_node);
24980 aarch64_arch default_arch = default_options->x_selected_arch;
24981 auto default_isa_flags = aarch64_get_asm_isa_flags (default_options);
24982 std::string arch_string
24983 = aarch64_get_arch_string_for_assembler (default_arch, default_isa_flags);
24984 aarch64_last_printed_arch_string = arch_string;
24985 aarch64_last_printed_tune_string = "";
24986 asm_fprintf (asm_out_file, "\t.arch %s\n",
24987 arch_string.c_str ());
24989 default_file_start ();
24992 /* Emit load exclusive. */
24994 static void
24995 aarch64_emit_load_exclusive (machine_mode mode, rtx rval,
24996 rtx mem, rtx model_rtx)
24998 if (mode == TImode)
24999 emit_insn (gen_aarch64_load_exclusive_pair (gen_lowpart (DImode, rval),
25000 gen_highpart (DImode, rval),
25001 mem, model_rtx));
25002 else
25003 emit_insn (gen_aarch64_load_exclusive (mode, rval, mem, model_rtx));
25006 /* Emit store exclusive. */
25008 static void
25009 aarch64_emit_store_exclusive (machine_mode mode, rtx bval,
25010 rtx mem, rtx rval, rtx model_rtx)
25012 if (mode == TImode)
25013 emit_insn (gen_aarch64_store_exclusive_pair
25014 (bval, mem, operand_subword (rval, 0, 0, TImode),
25015 operand_subword (rval, 1, 0, TImode), model_rtx));
25016 else
25017 emit_insn (gen_aarch64_store_exclusive (mode, bval, mem, rval, model_rtx));
25020 /* Mark the previous jump instruction as unlikely. */
25022 static void
25023 aarch64_emit_unlikely_jump (rtx insn)
25025 rtx_insn *jump = emit_jump_insn (insn);
25026 add_reg_br_prob_note (jump, profile_probability::very_unlikely ());
25029 /* We store the names of the various atomic helpers in a 5x5 array.
25030 Return the libcall function given MODE, MODEL and NAMES. */
25033 aarch64_atomic_ool_func(machine_mode mode, rtx model_rtx,
25034 const atomic_ool_names *names)
25036 memmodel model = memmodel_from_int (INTVAL (model_rtx));
25037 int mode_idx, model_idx;
25039 switch (mode)
25041 case E_QImode:
25042 mode_idx = 0;
25043 break;
25044 case E_HImode:
25045 mode_idx = 1;
25046 break;
25047 case E_SImode:
25048 mode_idx = 2;
25049 break;
25050 case E_DImode:
25051 mode_idx = 3;
25052 break;
25053 case E_TImode:
25054 mode_idx = 4;
25055 break;
25056 default:
25057 gcc_unreachable ();
25060 switch (model)
25062 case MEMMODEL_RELAXED:
25063 model_idx = 0;
25064 break;
25065 case MEMMODEL_CONSUME:
25066 case MEMMODEL_ACQUIRE:
25067 model_idx = 1;
25068 break;
25069 case MEMMODEL_RELEASE:
25070 model_idx = 2;
25071 break;
25072 case MEMMODEL_ACQ_REL:
25073 case MEMMODEL_SEQ_CST:
25074 model_idx = 3;
25075 break;
25076 case MEMMODEL_SYNC_ACQUIRE:
25077 case MEMMODEL_SYNC_RELEASE:
25078 case MEMMODEL_SYNC_SEQ_CST:
25079 model_idx = 4;
25080 break;
25081 default:
25082 gcc_unreachable ();
25085 return init_one_libfunc_visibility (names->str[mode_idx][model_idx],
25086 VISIBILITY_HIDDEN);
25089 #define DEF0(B, N) \
25090 { "__aarch64_" #B #N "_relax", \
25091 "__aarch64_" #B #N "_acq", \
25092 "__aarch64_" #B #N "_rel", \
25093 "__aarch64_" #B #N "_acq_rel", \
25094 "__aarch64_" #B #N "_sync" }
25096 #define DEF4(B) DEF0(B, 1), DEF0(B, 2), DEF0(B, 4), DEF0(B, 8), \
25097 { NULL, NULL, NULL, NULL }
25098 #define DEF5(B) DEF0(B, 1), DEF0(B, 2), DEF0(B, 4), DEF0(B, 8), DEF0(B, 16)
25100 static const atomic_ool_names aarch64_ool_cas_names = { { DEF5(cas) } };
25101 const atomic_ool_names aarch64_ool_swp_names = { { DEF4(swp) } };
25102 const atomic_ool_names aarch64_ool_ldadd_names = { { DEF4(ldadd) } };
25103 const atomic_ool_names aarch64_ool_ldset_names = { { DEF4(ldset) } };
25104 const atomic_ool_names aarch64_ool_ldclr_names = { { DEF4(ldclr) } };
25105 const atomic_ool_names aarch64_ool_ldeor_names = { { DEF4(ldeor) } };
25107 #undef DEF0
25108 #undef DEF4
25109 #undef DEF5
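/* For example, DEF0 (cas, 4) above expands to the row
   { "__aarch64_cas4_relax", "__aarch64_cas4_acq", "__aarch64_cas4_rel",
     "__aarch64_cas4_acq_rel", "__aarch64_cas4_sync" },
   and aarch64_atomic_ool_func indexes the table by access size and memory
   model, so e.g. a 4-byte compare-and-swap with MEMMODEL_ACQUIRE resolves
   to "__aarch64_cas4_acq". */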
25111 /* Expand a compare and swap pattern. */
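/* A rough sketch of the result for a strong 32-bit compare-and-swap with
   TARGET_LSE (illustrative only; the exact sequence depends on the memory
   model, operand overlap and mode):

	MOV	wR, wOLDVAL
	CAS[A|L|AL]	wR, wNEWVAL, [xMEM]
	CMP	wR, wOLDVAL
	CSET	wBVAL, eq

   Without LSE the outline-atomics helpers or an exclusive-load/store loop
   are used instead. */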
25113 void
25114 aarch64_expand_compare_and_swap (rtx operands[])
25116 rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x, cc_reg;
25117 machine_mode mode, r_mode;
25119 bval = operands[0];
25120 rval = operands[1];
25121 mem = operands[2];
25122 oldval = operands[3];
25123 newval = operands[4];
25124 is_weak = operands[5];
25125 mod_s = operands[6];
25126 mod_f = operands[7];
25127 mode = GET_MODE (mem);
25129 /* Normally the succ memory model must be stronger than fail, but in the
25130 unlikely event of fail being ACQUIRE and succ being RELEASE we need to
25131 promote succ to ACQ_REL so that we don't lose the acquire semantics. */
25132 if (is_mm_acquire (memmodel_from_int (INTVAL (mod_f)))
25133 && is_mm_release (memmodel_from_int (INTVAL (mod_s))))
25134 mod_s = GEN_INT (MEMMODEL_ACQ_REL);
25136 r_mode = mode;
25137 if (mode == QImode || mode == HImode)
25139 r_mode = SImode;
25140 rval = gen_reg_rtx (r_mode);
25143 if (TARGET_LSE)
25145 /* The CAS insn requires oldval and rval overlap, but we need to
25146 have a copy of oldval saved across the operation to tell if
25147 the operation is successful. */
25148 if (reg_overlap_mentioned_p (rval, oldval))
25149 rval = copy_to_mode_reg (r_mode, oldval);
25150 else
25151 emit_move_insn (rval, gen_lowpart (r_mode, oldval));
25152 if (mode == TImode)
25153 newval = force_reg (mode, newval);
25155 emit_insn (gen_aarch64_compare_and_swap_lse (mode, rval, mem,
25156 newval, mod_s));
25157 cc_reg = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
25159 else if (TARGET_OUTLINE_ATOMICS)
25161 /* Oldval must satisfy compare afterward. */
25162 if (!aarch64_plus_operand (oldval, mode))
25163 oldval = force_reg (mode, oldval);
25164 rtx func = aarch64_atomic_ool_func (mode, mod_s, &aarch64_ool_cas_names);
25165 rval = emit_library_call_value (func, NULL_RTX, LCT_NORMAL, r_mode,
25166 oldval, mode, newval, mode,
25167 XEXP (mem, 0), Pmode);
25168 cc_reg = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
25170 else
25172 /* The oldval predicate varies by mode. Test it and force to reg. */
25173 insn_code code = code_for_aarch64_compare_and_swap (mode);
25174 if (!insn_data[code].operand[2].predicate (oldval, mode))
25175 oldval = force_reg (mode, oldval);
25177 emit_insn (GEN_FCN (code) (rval, mem, oldval, newval,
25178 is_weak, mod_s, mod_f));
25179 cc_reg = gen_rtx_REG (CCmode, CC_REGNUM);
25182 if (r_mode != mode)
25183 rval = gen_lowpart (mode, rval);
25184 emit_move_insn (operands[1], rval);
25186 x = gen_rtx_EQ (SImode, cc_reg, const0_rtx);
25187 emit_insn (gen_rtx_SET (bval, x));
25190 /* Emit a barrier, that is appropriate for memory model MODEL, at the end of a
25191 sequence implementing an atomic operation. */
25193 static void
25194 aarch64_emit_post_barrier (enum memmodel model)
25196 const enum memmodel base_model = memmodel_base (model);
25198 if (is_mm_sync (model)
25199 && (base_model == MEMMODEL_ACQUIRE
25200 || base_model == MEMMODEL_ACQ_REL
25201 || base_model == MEMMODEL_SEQ_CST))
25203 emit_insn (gen_mem_thread_fence (GEN_INT (MEMMODEL_SEQ_CST)));
25207 /* Split a compare and swap pattern. */
25209 void
25210 aarch64_split_compare_and_swap (rtx operands[])
25212 /* Split after prolog/epilog to avoid interactions with shrinkwrapping. */
25213 gcc_assert (epilogue_completed);
25215 rtx rval, mem, oldval, newval, scratch, x, model_rtx;
25216 machine_mode mode;
25217 bool is_weak;
25218 rtx_code_label *label1, *label2;
25219 enum memmodel model;
25221 rval = operands[0];
25222 mem = operands[1];
25223 oldval = operands[2];
25224 newval = operands[3];
25225 model_rtx = operands[5];
25226 scratch = operands[7];
25227 mode = GET_MODE (mem);
25228 model = memmodel_from_int (INTVAL (model_rtx));
25229 is_weak = operands[4] != const0_rtx && mode != TImode;
25231 /* When OLDVAL is zero and we want the strong version we can emit a tighter
25232 loop:
25233 .label1:
25234 LD[A]XR rval, [mem]
25235 CBNZ rval, .label2
25236 ST[L]XR scratch, newval, [mem]
25237 CBNZ scratch, .label1
25238 .label2:
25239 CMP rval, 0. */
25240 bool strong_zero_p = (!is_weak && !aarch64_track_speculation &&
25241 oldval == const0_rtx && mode != TImode);
25243 label1 = NULL;
25244 if (!is_weak)
25246 label1 = gen_label_rtx ();
25247 emit_label (label1);
25249 label2 = gen_label_rtx ();
25251 /* The initial load can be relaxed for a __sync operation since a final
25252 barrier will be emitted to stop code hoisting. */
25253 if (is_mm_sync (model))
25254 aarch64_emit_load_exclusive (mode, rval, mem, GEN_INT (MEMMODEL_RELAXED));
25255 else
25256 aarch64_emit_load_exclusive (mode, rval, mem, model_rtx);
25258 if (strong_zero_p)
25259 x = gen_rtx_NE (VOIDmode, rval, const0_rtx);
25260 else
25262 rtx cc_reg = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
25263 x = gen_rtx_NE (VOIDmode, cc_reg, const0_rtx);
25265 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
25266 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
25267 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
25269 aarch64_emit_store_exclusive (mode, scratch, mem, newval, model_rtx);
25271 if (!is_weak)
25273 x = aarch64_gen_compare_zero_and_branch (NE, scratch, label1);
25274 aarch64_emit_unlikely_jump (x);
25276 else
25277 aarch64_gen_compare_reg (NE, scratch, const0_rtx);
25279 /* 128-bit LDAXP is not atomic unless STLXP succeeds. So for a mismatch,
25280 store the returned value and loop if the STLXP fails. */
25281 if (mode == TImode)
25283 rtx_code_label *label3 = gen_label_rtx ();
25284 emit_jump_insn (gen_rtx_SET (pc_rtx, gen_rtx_LABEL_REF (Pmode, label3)));
25285 emit_barrier ();
25287 emit_label (label2);
25288 aarch64_emit_store_exclusive (mode, scratch, mem, rval, model_rtx);
25290 x = aarch64_gen_compare_zero_and_branch (NE, scratch, label1);
25291 aarch64_emit_unlikely_jump (x);
25293 label2 = label3;
25296 emit_label (label2);
25298 /* If we used a CBNZ in the exchange loop emit an explicit compare with RVAL
25299 to set the condition flags. If this is not used it will be removed by
25300 later passes. */
25301 if (strong_zero_p)
25302 aarch64_gen_compare_reg (NE, rval, const0_rtx);
25304 /* Emit any final barrier needed for a __sync operation. */
25305 if (is_mm_sync (model))
25306 aarch64_emit_post_barrier (model);
25309 /* Split an atomic operation. */
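/* For example, splitting a relaxed 32-bit fetch-and-add produces a loop of
   the form (schematically):

	.Lretry:
	LD[A]XR	wOLD, [xMEM]
	ADD	wNEW, wOLD, wVALUE
	ST[L]XR	wCOND, wNEW, [xMEM]
	CBNZ	wCOND, .Lretry

   with an extra trailing barrier for __sync operations. */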
25311 void
25312 aarch64_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
25313 rtx value, rtx model_rtx, rtx cond)
25315 /* Split after prolog/epilog to avoid interactions with shrinkwrapping. */
25316 gcc_assert (epilogue_completed);
25318 machine_mode mode = GET_MODE (mem);
25319 machine_mode wmode = (mode == DImode ? DImode : SImode);
25320 const enum memmodel model = memmodel_from_int (INTVAL (model_rtx));
25321 const bool is_sync = is_mm_sync (model);
25322 rtx_code_label *label;
25323 rtx x;
25325 /* Split the atomic operation into a sequence. */
25326 label = gen_label_rtx ();
25327 emit_label (label);
25329 if (new_out)
25330 new_out = gen_lowpart (wmode, new_out);
25331 if (old_out)
25332 old_out = gen_lowpart (wmode, old_out);
25333 else
25334 old_out = new_out;
25335 value = simplify_gen_subreg (wmode, value, mode, 0);
25337 /* The initial load can be relaxed for a __sync operation since a final
25338 barrier will be emitted to stop code hoisting. */
25339 if (is_sync)
25340 aarch64_emit_load_exclusive (mode, old_out, mem,
25341 GEN_INT (MEMMODEL_RELAXED));
25342 else
25343 aarch64_emit_load_exclusive (mode, old_out, mem, model_rtx);
25345 switch (code)
25347 case SET:
25348 new_out = value;
25349 break;
25351 case NOT:
25352 x = gen_rtx_AND (wmode, old_out, value);
25353 emit_insn (gen_rtx_SET (new_out, x));
25354 x = gen_rtx_NOT (wmode, new_out);
25355 emit_insn (gen_rtx_SET (new_out, x));
25356 break;
25358 case MINUS:
25359 if (CONST_INT_P (value))
25361 value = GEN_INT (-UINTVAL (value));
25362 code = PLUS;
25364 /* Fall through. */
25366 default:
25367 x = gen_rtx_fmt_ee (code, wmode, old_out, value);
25368 emit_insn (gen_rtx_SET (new_out, x));
25369 break;
25372 aarch64_emit_store_exclusive (mode, cond, mem,
25373 gen_lowpart (mode, new_out), model_rtx);
25375 x = aarch64_gen_compare_zero_and_branch (NE, cond, label);
25376 aarch64_emit_unlikely_jump (x);
25378 /* Emit any final barrier needed for a __sync operation. */
25379 if (is_sync)
25380 aarch64_emit_post_barrier (model);
25383 static void
25384 aarch64_init_libfuncs (void)
25386 /* Half-precision float operations. The compiler handles all operations
25387 with NULL libfuncs by converting to SFmode. */
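/* For example, leaving add_optab NULL for HFmode below means an HFmode
   addition is performed by widening both operands to SFmode and truncating
   the result; the __gnu_h2f_ieee/__gnu_f2h_ieee routines registered below
   are only called when those conversions themselves need a libcall. */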
25389 /* Conversions. */
25390 set_conv_libfunc (trunc_optab, HFmode, SFmode, "__gnu_f2h_ieee");
25391 set_conv_libfunc (sext_optab, SFmode, HFmode, "__gnu_h2f_ieee");
25393 /* Arithmetic. */
25394 set_optab_libfunc (add_optab, HFmode, NULL);
25395 set_optab_libfunc (sdiv_optab, HFmode, NULL);
25396 set_optab_libfunc (smul_optab, HFmode, NULL);
25397 set_optab_libfunc (neg_optab, HFmode, NULL);
25398 set_optab_libfunc (sub_optab, HFmode, NULL);
25400 /* Comparisons. */
25401 set_optab_libfunc (eq_optab, HFmode, NULL);
25402 set_optab_libfunc (ne_optab, HFmode, NULL);
25403 set_optab_libfunc (lt_optab, HFmode, NULL);
25404 set_optab_libfunc (le_optab, HFmode, NULL);
25405 set_optab_libfunc (ge_optab, HFmode, NULL);
25406 set_optab_libfunc (gt_optab, HFmode, NULL);
25407 set_optab_libfunc (unord_optab, HFmode, NULL);
25410 /* Target hook for c_mode_for_suffix. */
25411 static machine_mode
25412 aarch64_c_mode_for_suffix (char suffix)
25414 if (suffix == 'q')
25415 return TFmode;
25417 return VOIDmode;
25420 /* Return true iff X can be represented by a quarter-precision
25421 floating-point immediate operand. Note, we cannot represent 0.0. */
25423 bool
25424 aarch64_float_const_representable_p (rtx x)
25426 x = unwrap_const_vec_duplicate (x);
25427 machine_mode mode = GET_MODE (x);
25428 if (!CONST_DOUBLE_P (x))
25429 return false;
25431 if ((mode == HFmode && !TARGET_FP_F16INST)
25432 || mode == BFmode)
25433 return false;
25435 REAL_VALUE_TYPE r = *CONST_DOUBLE_REAL_VALUE (x);
25437 return aarch64_real_float_const_representable_p (r);
25440 /* Returns the string with the instruction for the SIMD immediate
25441 CONST_VECTOR of the given WIDTH. WHICH selects a move, AND/BIC, ORR or EOR immediate. */
25442 char*
25443 aarch64_output_simd_imm (rtx const_vector, unsigned width,
25444 enum simd_immediate_check which)
25446 bool is_valid;
25447 static char templ[40];
25448 const char *mnemonic;
25449 const char *shift_op;
25450 unsigned int lane_count = 0;
25451 char element_char;
25453 struct simd_immediate_info info;
25455 is_valid = aarch64_simd_valid_imm (const_vector, &info, which);
25456 gcc_assert (is_valid);
25458 element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
25459 lane_count = width / GET_MODE_BITSIZE (info.elt_mode);
25461 if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
25463 gcc_assert (info.insn == simd_immediate_info::MOV
25464 && info.u.mov.shift == 0);
25465 /* For FP zero change it to a CONST_INT 0 and use the integer SIMD
25466 move immediate path. */
25467 if (aarch64_float_const_zero_rtx_p (info.u.mov.value))
25468 info.u.mov.value = GEN_INT (0);
25469 else
25471 const unsigned int buf_size = 20;
25472 char float_buf[buf_size] = {'\0'};
25473 real_to_decimal_for_mode (float_buf,
25474 CONST_DOUBLE_REAL_VALUE (info.u.mov.value),
25475 buf_size, buf_size, 1, info.elt_mode);
25477 if (lane_count == 1)
25478 snprintf (templ, sizeof (templ), "fmov\t%%d0, %s", float_buf);
25479 else
25480 snprintf (templ, sizeof (templ), "fmov\t%%0.%d%c, %s",
25481 lane_count, element_char, float_buf);
25482 return templ;
25486 gcc_assert (CONST_INT_P (info.u.mov.value));
25488 if (which == AARCH64_CHECK_MOV)
25490 if (info.insn == simd_immediate_info::INDEX)
25492 gcc_assert (TARGET_SVE);
25493 snprintf (templ, sizeof (templ), "index\t%%Z0.%c, #"
25494 HOST_WIDE_INT_PRINT_DEC ", #" HOST_WIDE_INT_PRINT_DEC,
25495 element_char, INTVAL (info.u.index.base),
25496 INTVAL (info.u.index.step));
25497 return templ;
25500 if (info.insn == simd_immediate_info::SVE_MOV)
25502 gcc_assert (TARGET_SVE);
25503 snprintf (templ, sizeof (templ), "mov\t%%Z0.%c, #" HOST_WIDE_INT_PRINT_DEC,
25504 element_char, INTVAL (info.u.mov.value));
25505 return templ;
25508 mnemonic = info.insn == simd_immediate_info::MVN ? "mvni" : "movi";
25509 shift_op = (info.u.mov.modifier == simd_immediate_info::MSL
25510 ? "msl" : "lsl");
25511 if (lane_count == 1)
25512 snprintf (templ, sizeof (templ), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX,
25513 mnemonic, UINTVAL (info.u.mov.value));
25514 else if (info.u.mov.shift)
25515 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
25516 HOST_WIDE_INT_PRINT_HEX ", %s %d", mnemonic, lane_count,
25517 element_char, UINTVAL (info.u.mov.value), shift_op,
25518 info.u.mov.shift);
25519 else
25520 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
25521 HOST_WIDE_INT_PRINT_HEX, mnemonic, lane_count,
25522 element_char, UINTVAL (info.u.mov.value));
25524 else
25526 /* AARCH64_CHECK_ORR, AARCH64_CHECK_AND or AARCH64_CHECK_XOR. */
25527 mnemonic = "orr";
25528 if (which == AARCH64_CHECK_AND)
25529 mnemonic = info.insn == simd_immediate_info::MVN ? "bic" : "and";
25530 else if (which == AARCH64_CHECK_XOR)
25531 mnemonic = "eor";
25533 if (info.insn == simd_immediate_info::SVE_MOV)
25535 gcc_assert (TARGET_SVE);
25536 snprintf (templ, sizeof (templ), "%s\t%%Z0.%c, %%Z0.%c, "
25537 HOST_WIDE_INT_PRINT_DEC, mnemonic, element_char,
25538 element_char, INTVAL (info.u.mov.value));
25540 else if (info.u.mov.shift)
25541 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
25542 HOST_WIDE_INT_PRINT_DEC ", %s #%d", mnemonic, lane_count,
25543 element_char, UINTVAL (info.u.mov.value), "lsl",
25544 info.u.mov.shift);
25545 else
25546 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
25547 HOST_WIDE_INT_PRINT_DEC, mnemonic, lane_count,
25548 element_char, UINTVAL (info.u.mov.value));
25550 return templ;
25553 /* Returns the string with the ORR instruction for the SIMD immediate
25554 CONST_VECTOR of WIDTH bits. */
25555 char*
25556 aarch64_output_simd_orr_imm (rtx const_vector, unsigned width)
25558 return aarch64_output_simd_imm (const_vector, width, AARCH64_CHECK_ORR);
25561 /* Returns the string with the AND/BIC instruction for the SIMD immediate
25562 CONST_VECTOR of WIDTH bits. */
25563 char*
25564 aarch64_output_simd_and_imm (rtx const_vector, unsigned width)
25566 return aarch64_output_simd_imm (const_vector, width, AARCH64_CHECK_AND);
25569 /* Returns the string with the EOR instruction for the SIMD immediate
25570 CONST_VECTOR of WIDTH bits. */
25571 char*
25572 aarch64_output_simd_xor_imm (rtx const_vector, unsigned width)
25574 return aarch64_output_simd_imm (const_vector, width, AARCH64_CHECK_XOR);
25577 /* Returns the string with the MOV instruction for the SIMD immediate
25578 CONST_VECTOR of WIDTH bits. */
25579 char*
25580 aarch64_output_simd_mov_imm (rtx const_vector, unsigned width)
25582 return aarch64_output_simd_imm (const_vector, width, AARCH64_CHECK_MOV);
25585 char*
25586 aarch64_output_scalar_simd_mov_immediate (rtx immediate, scalar_int_mode mode)
25589 /* If a floating-point number was passed and we want to use it in an
25590 integer mode, do the conversion to integer. */
25591 if (CONST_DOUBLE_P (immediate) && GET_MODE_CLASS (mode) == MODE_INT)
25593 unsigned HOST_WIDE_INT ival;
25594 if (!aarch64_reinterpret_float_as_int (immediate, &ival))
25595 gcc_unreachable ();
25596 immediate = gen_int_mode (ival, mode);
25599 machine_mode vmode;
25600 /* Use a 64-bit vector mode for everything except DI/DF/DD mode, where we
25601 use a 128-bit vector mode. */
25602 int width = GET_MODE_BITSIZE (mode) == 64 ? 128 : 64;
25604 vmode = aarch64_simd_container_mode (mode, width);
25605 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (immediate));
25606 return aarch64_output_simd_mov_imm (v_op, width);
25609 /* Return the output string to use for moving immediate CONST_VECTOR
25610 into an SVE register. */
25612 char *
25613 aarch64_output_sve_mov_immediate (rtx const_vector)
25615 static char templ[40];
25616 struct simd_immediate_info info;
25617 char element_char;
25618 bool is_valid;
25620 is_valid = aarch64_simd_valid_imm (const_vector, &info, AARCH64_CHECK_MOV);
25621 gcc_assert (is_valid);
25623 element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
25625 machine_mode vec_mode = GET_MODE (const_vector);
25626 if (aarch64_sve_pred_mode_p (vec_mode))
25628 static char buf[sizeof ("ptrue\t%0.N, vlNNNNN")];
25629 if (info.insn == simd_immediate_info::MOV)
25631 gcc_assert (info.u.mov.value == const0_rtx);
25632 snprintf (buf, sizeof (buf), "pfalse\t%%0.b");
25634 else
25636 gcc_assert (info.insn == simd_immediate_info::PTRUE);
25637 unsigned int total_bytes;
25638 if (info.u.pattern == AARCH64_SV_ALL
25639 && BYTES_PER_SVE_VECTOR.is_constant (&total_bytes))
25640 snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, vl%d", element_char,
25641 total_bytes / GET_MODE_SIZE (info.elt_mode));
25642 else
25643 snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, %s", element_char,
25644 svpattern_token (info.u.pattern));
25646 return buf;
25649 if (info.insn == simd_immediate_info::INDEX)
25651 snprintf (templ, sizeof (templ), "index\t%%0.%c, #"
25652 HOST_WIDE_INT_PRINT_DEC ", #" HOST_WIDE_INT_PRINT_DEC,
25653 element_char, INTVAL (info.u.index.base),
25654 INTVAL (info.u.index.step));
25655 return templ;
25658 if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
25660 if (aarch64_float_const_zero_rtx_p (info.u.mov.value))
25661 info.u.mov.value = GEN_INT (0);
25662 else
25664 const int buf_size = 20;
25665 char float_buf[buf_size] = {};
25666 real_to_decimal_for_mode (float_buf,
25667 CONST_DOUBLE_REAL_VALUE (info.u.mov.value),
25668 buf_size, buf_size, 1, info.elt_mode);
25670 snprintf (templ, sizeof (templ), "fmov\t%%0.%c, #%s",
25671 element_char, float_buf);
25672 return templ;
25676 if (info.u.mov.value == const0_rtx && TARGET_NON_STREAMING)
25677 snprintf (templ, sizeof (templ), "movi\t%%d0, #0");
25678 else
25679 snprintf (templ, sizeof (templ), "mov\t%%0.%c, #" HOST_WIDE_INT_PRINT_DEC,
25680 element_char, INTVAL (info.u.mov.value));
25681 return templ;
25684 /* Return the asm template for a PTRUES. CONST_UNSPEC is the
25685 aarch64_sve_ptrue_svpattern_immediate that describes the predicate
25686 pattern. */
25688 char *
25689 aarch64_output_sve_ptrues (rtx const_unspec)
25691 static char templ[40];
25692 struct simd_immediate_info info;
25693 bool is_valid;
25695 is_valid = aarch64_simd_valid_imm (const_unspec, &info, AARCH64_CHECK_MOV);
25696 gcc_assert (is_valid && info.insn == simd_immediate_info::PTRUE);
25698 char element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
25699 snprintf (templ, sizeof (templ), "ptrues\t%%0.%c, %s", element_char,
25700 svpattern_token (info.u.pattern));
25701 return templ;
25704 /* Split operands into moves from op[1] + op[2] into op[0]. */
25706 void
25707 aarch64_split_combinev16qi (rtx operands[3])
25709 machine_mode halfmode = GET_MODE (operands[1]);
25711 gcc_assert (halfmode == V16QImode);
25713 rtx destlo = simplify_gen_subreg (halfmode, operands[0],
25714 GET_MODE (operands[0]), 0);
25715 rtx desthi = simplify_gen_subreg (halfmode, operands[0],
25716 GET_MODE (operands[0]),
25717 GET_MODE_SIZE (halfmode));
25719 bool skiplo = rtx_equal_p (destlo, operands[1]);
25720 bool skiphi = rtx_equal_p (desthi, operands[2]);
25722 if (skiplo && skiphi)
25724 /* No-op move. Can't split to nothing; emit something. */
25725 emit_note (NOTE_INSN_DELETED);
25726 return;
25729 /* Special case of reversed high/low parts. */
25730 if (reg_overlap_mentioned_p (operands[2], destlo)
25731 && reg_overlap_mentioned_p (operands[1], desthi))
25733 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
25734 emit_insn (gen_xorv16qi3 (operands[2], operands[1], operands[2]));
25735 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
25737 else if (!reg_overlap_mentioned_p (operands[2], destlo))
25739 /* Try to avoid unnecessary moves if part of the result
25740 is in the right place already. */
25741 if (!skiplo)
25742 emit_move_insn (destlo, operands[1]);
25743 if (!skiphi)
25744 emit_move_insn (desthi, operands[2]);
25746 else
25748 if (!skiphi)
25749 emit_move_insn (desthi, operands[2]);
25750 if (!skiplo)
25751 emit_move_insn (destlo, operands[1]);
25755 /* vec_perm support. */
25757 struct expand_vec_perm_d
25759 rtx target, op0, op1;
25760 vec_perm_indices perm;
25761 machine_mode vmode;
25762 machine_mode op_mode;
25763 unsigned int vec_flags;
25764 unsigned int op_vec_flags;
25765 bool one_vector_p;
25766 bool zero_op0_p, zero_op1_p;
25767 bool testing_p;
25770 static bool aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d);
25772 /* Generate a variable permutation. */
25774 static void
25775 aarch64_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel)
25777 machine_mode vmode = GET_MODE (target);
25778 bool one_vector_p = rtx_equal_p (op0, op1);
25780 gcc_checking_assert (vmode == V8QImode || vmode == V16QImode);
25781 gcc_checking_assert (GET_MODE (op0) == vmode);
25782 gcc_checking_assert (GET_MODE (op1) == vmode);
25783 gcc_checking_assert (GET_MODE (sel) == vmode);
25784 gcc_checking_assert (TARGET_SIMD);
25786 if (one_vector_p)
25788 if (vmode == V8QImode)
25790 /* Expand the argument to a V16QI mode by duplicating it. */
25791 rtx pair = gen_reg_rtx (V16QImode);
25792 emit_insn (gen_aarch64_combinev8qi (pair, op0, op0));
25793 emit_insn (gen_aarch64_qtbl1v8qi (target, pair, sel));
25795 else
25797 emit_insn (gen_aarch64_qtbl1v16qi (target, op0, sel));
25800 else
25802 rtx pair;
25804 if (vmode == V8QImode)
25806 pair = gen_reg_rtx (V16QImode);
25807 emit_insn (gen_aarch64_combinev8qi (pair, op0, op1));
25808 emit_insn (gen_aarch64_qtbl1v8qi (target, pair, sel));
25810 else
25812 pair = gen_reg_rtx (V2x16QImode);
25813 emit_insn (gen_aarch64_combinev16qi (pair, op0, op1));
25814 emit_insn (gen_aarch64_qtbl2v16qi (target, pair, sel));
25819 /* Expand a vec_perm with the operands given by TARGET, OP0, OP1 and SEL.
25820 NELT is the number of elements in the vector. */
25822 void
25823 aarch64_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel,
25824 unsigned int nelt)
25826 machine_mode vmode = GET_MODE (target);
25827 bool one_vector_p = rtx_equal_p (op0, op1);
25828 rtx mask;
25830 /* The TBL instruction does not use a modulo index, so we must take care
25831 of that ourselves. */
25832 mask = aarch64_simd_gen_const_vector_dup (vmode,
25833 one_vector_p ? nelt - 1 : 2 * nelt - 1);
25834 sel = expand_simple_binop (vmode, AND, sel, mask, NULL, 0, OPTAB_LIB_WIDEN);
25836 /* For big-endian, we also need to reverse the index within the vector
25837 (but not which vector). */
25838 if (BYTES_BIG_ENDIAN)
25840 /* If one_vector_p, mask is a vector of (nelt - 1)'s already. */
25841 if (!one_vector_p)
25842 mask = aarch64_simd_gen_const_vector_dup (vmode, nelt - 1);
25843 sel = expand_simple_binop (vmode, XOR, sel, mask,
25844 NULL, 0, OPTAB_LIB_WIDEN);
25846 aarch64_expand_vec_perm_1 (target, op0, op1, sel);
25849 /* Generate (set TARGET (unspec [OP0 OP1] CODE)). */
25851 static void
25852 emit_unspec2 (rtx target, int code, rtx op0, rtx op1)
25854 emit_insn (gen_rtx_SET (target,
25855 gen_rtx_UNSPEC (GET_MODE (target),
25856 gen_rtvec (2, op0, op1), code)));
25859 /* Expand an SVE vec_perm with the given operands. */
25861 void
25862 aarch64_expand_sve_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
25864 machine_mode data_mode = GET_MODE (target);
25865 machine_mode sel_mode = GET_MODE (sel);
25866 /* Enforced by the pattern condition. */
25867 int nunits = GET_MODE_NUNITS (sel_mode).to_constant ();
25869 /* Note: vec_perm indices are supposed to wrap when they go beyond the
25870 size of the two value vectors, i.e. the upper bits of the indices
25871 are effectively ignored. SVE TBL instead produces 0 for any
25872 out-of-range indices, so we need to modulo all the vec_perm indices
25873 to ensure they are all in range. */
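/* For example, with two 4-element input vectors the general case below
   masks the selector with { 7, ..., 7 }, permutes OP0 with one TBL,
   permutes OP1 with a second TBL on (SEL - 4) so that indices 4-7 select
   from OP1 while the remaining indices go out of range and yield zero,
   and finally ORs the two partial results together. */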
25874 rtx sel_reg = force_reg (sel_mode, sel);
25876 /* Check if the sel only references the first values vector. */
25877 if (CONST_VECTOR_P (sel)
25878 && aarch64_const_vec_all_in_range_p (sel, 0, nunits - 1))
25880 emit_unspec2 (target, UNSPEC_TBL, op0, sel_reg);
25881 return;
25884 /* Check if the two values vectors are the same. */
25885 if (rtx_equal_p (op0, op1))
25887 rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode, nunits - 1);
25888 rtx sel_mod = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
25889 NULL, 0, OPTAB_DIRECT);
25890 emit_unspec2 (target, UNSPEC_TBL, op0, sel_mod);
25891 return;
25894 /* Run TBL for each value vector and combine the results. */
25896 rtx res0 = gen_reg_rtx (data_mode);
25897 rtx res1 = gen_reg_rtx (data_mode);
25898 rtx neg_num_elems = aarch64_simd_gen_const_vector_dup (sel_mode, -nunits);
25899 if (!CONST_VECTOR_P (sel)
25900 || !aarch64_const_vec_all_in_range_p (sel, 0, 2 * nunits - 1))
25902 rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode,
25903 2 * nunits - 1);
25904 sel_reg = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
25905 NULL, 0, OPTAB_DIRECT);
25907 emit_unspec2 (res0, UNSPEC_TBL, op0, sel_reg);
25908 rtx sel_sub = expand_simple_binop (sel_mode, PLUS, sel_reg, neg_num_elems,
25909 NULL, 0, OPTAB_DIRECT);
25910 emit_unspec2 (res1, UNSPEC_TBL, op1, sel_sub);
25911 if (GET_MODE_CLASS (data_mode) == MODE_VECTOR_INT)
25912 emit_insn (gen_rtx_SET (target, gen_rtx_IOR (data_mode, res0, res1)));
25913 else
25914 emit_unspec2 (target, UNSPEC_IORF, res0, res1);
25917 /* Recognize patterns suitable for the TRN instructions. */
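/* For example, with V4SI inputs { a0, a1, a2, a3 } and { b0, b1, b2, b3 },
   the index vector { 0, 4, 2, 6 } selects { a0, b0, a2, b2 } and maps to
   TRN1, while { 1, 5, 3, 7 } maps to TRN2 (little-endian view). */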
25918 static bool
25919 aarch64_evpc_trn (struct expand_vec_perm_d *d)
25921 HOST_WIDE_INT odd;
25922 poly_uint64 nelt = d->perm.length ();
25923 rtx out, in0, in1;
25924 machine_mode vmode = d->vmode;
25926 if (GET_MODE_UNIT_SIZE (vmode) > 8)
25927 return false;
25929 /* Note that these are little-endian tests.
25930 We correct for big-endian later. */
25931 if (!d->perm[0].is_constant (&odd)
25932 || (odd != 0 && odd != 1)
25933 || !d->perm.series_p (0, 2, odd, 2)
25934 || !d->perm.series_p (1, 2, nelt + odd, 2))
25935 return false;
25937 /* Success! */
25938 if (d->testing_p)
25939 return true;
25941 in0 = d->op0;
25942 in1 = d->op1;
25943 /* We don't need a big-endian lane correction for SVE; see the comment
25944 at the head of aarch64-sve.md for details. */
25945 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
25947 std::swap (in0, in1);
25948 odd = !odd;
25950 out = d->target;
25952 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
25953 odd ? UNSPEC_TRN2 : UNSPEC_TRN1));
25954 return true;
25957 /* Try to re-encode the PERM constant so it combines odd and even elements.
25958 This rewrites constants such as {0, 1, 4, 5}/V4SF to {0, 2}/V2DI.
25959 We retry with this new constant with the full suite of patterns. */
25960 static bool
25961 aarch64_evpc_reencode (struct expand_vec_perm_d *d)
25963 expand_vec_perm_d newd;
25965 /* The subregs that we'd create are not supported for big-endian SVE;
25966 see aarch64_modes_compatible_p for details. */
25967 if (BYTES_BIG_ENDIAN && (d->vec_flags & VEC_ANY_SVE))
25968 return false;
25970 /* Get the new mode. Always twice the size of the inner
25971 and half the elements. */
25972 machine_mode new_mode;
25973 if (!aarch64_coalesce_units (d->vmode, 2).exists (&new_mode))
25974 return false;
25976 vec_perm_indices newpermindices;
25977 if (!newpermindices.new_shrunk_vector (d->perm, 2))
25978 return false;
25980 newd.vmode = new_mode;
25981 newd.vec_flags = d->vec_flags;
25982 newd.op_mode = newd.vmode;
25983 newd.op_vec_flags = newd.vec_flags;
25984 newd.target = d->target ? gen_lowpart (new_mode, d->target) : NULL;
25985 newd.op0 = d->op0 ? gen_lowpart (new_mode, d->op0) : NULL;
25986 newd.op1 = d->op1 ? gen_lowpart (new_mode, d->op1) : NULL;
25987 newd.testing_p = d->testing_p;
25988 newd.one_vector_p = d->one_vector_p;
25990 newd.perm.new_vector (newpermindices.encoding (), newd.one_vector_p ? 1 : 2,
25991 newpermindices.nelts_per_input ());
25992 return aarch64_expand_vec_perm_const_1 (&newd);
25995 /* Recognize patterns suitable for the UZP instructions. */
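/* For example, with V4SI inputs { a0, a1, a2, a3 } and { b0, b1, b2, b3 },
   { 0, 2, 4, 6 } selects the even elements { a0, a2, b0, b2 } and maps to
   UZP1, while { 1, 3, 5, 7 } maps to UZP2 (little-endian view). */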
25996 static bool
25997 aarch64_evpc_uzp (struct expand_vec_perm_d *d)
25999 HOST_WIDE_INT odd;
26000 rtx out, in0, in1;
26001 machine_mode vmode = d->vmode;
26003 if (GET_MODE_UNIT_SIZE (vmode) > 8)
26004 return false;
26006 /* Note that these are little-endian tests.
26007 We correct for big-endian later. */
26008 if (!d->perm[0].is_constant (&odd)
26009 || (odd != 0 && odd != 1)
26010 || !d->perm.series_p (0, 1, odd, 2))
26011 return false;
26013 /* Success! */
26014 if (d->testing_p)
26015 return true;
26017 in0 = d->op0;
26018 in1 = d->op1;
26019 /* We don't need a big-endian lane correction for SVE; see the comment
26020 at the head of aarch64-sve.md for details. */
26021 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
26023 std::swap (in0, in1);
26024 odd = !odd;
26026 out = d->target;
26028 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
26029 odd ? UNSPEC_UZP2 : UNSPEC_UZP1));
26030 return true;
26033 /* Recognize patterns suitable for the ZIP instructions. */
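/* For example, with V4SI inputs { a0, a1, a2, a3 } and { b0, b1, b2, b3 },
   { 0, 4, 1, 5 } gives { a0, b0, a1, b1 } and maps to ZIP1, while
   { 2, 6, 3, 7 } gives { a2, b2, a3, b3 } and maps to ZIP2. */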
26034 static bool
26035 aarch64_evpc_zip (struct expand_vec_perm_d *d)
26037 unsigned int high;
26038 poly_uint64 nelt = d->perm.length ();
26039 rtx out, in0, in1;
26040 machine_mode vmode = d->vmode;
26042 if (GET_MODE_UNIT_SIZE (vmode) > 8)
26043 return false;
26045 /* Note that these are little-endian tests.
26046 We correct for big-endian later. */
26047 poly_uint64 first = d->perm[0];
26048 if ((maybe_ne (first, 0U) && maybe_ne (first * 2, nelt))
26049 || !d->perm.series_p (0, 2, first, 1)
26050 || !d->perm.series_p (1, 2, first + nelt, 1))
26051 return false;
26052 high = maybe_ne (first, 0U);
26054 /* Success! */
26055 if (d->testing_p)
26056 return true;
26058 in0 = d->op0;
26059 in1 = d->op1;
26060 /* We don't need a big-endian lane correction for SVE; see the comment
26061 at the head of aarch64-sve.md for details. */
26062 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
26064 std::swap (in0, in1);
26065 high = !high;
26067 out = d->target;
26069 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
26070 high ? UNSPEC_ZIP2 : UNSPEC_ZIP1));
26071 return true;
26074 /* Recognize patterns for the EXT insn. */
26076 static bool
26077 aarch64_evpc_ext (struct expand_vec_perm_d *d)
26079 HOST_WIDE_INT location;
26080 rtx offset;
26082 /* The first element always refers to the first vector.
26083 Check if the extracted indices are increasing by one. */
26084 if ((d->vec_flags & VEC_SVE_PRED)
26085 || !d->perm[0].is_constant (&location)
26086 || !d->perm.series_p (0, 1, location, 1))
26087 return false;
26089 /* Success! */
26090 if (d->testing_p)
26091 return true;
26093 /* The case where (location == 0) is a no-op for both big- and little-endian,
26094 and is removed by the mid-end at optimization levels -O1 and higher.
26096 We don't need a big-endian lane correction for SVE; see the comment
26097 at the head of aarch64-sve.md for details. */
26098 if (BYTES_BIG_ENDIAN && location != 0 && d->vec_flags == VEC_ADVSIMD)
26100 /* After setup, we want the high elements of the first vector (stored
26101 at the LSB end of the register), and the low elements of the second
26102 vector (stored at the MSB end of the register). So swap. */
26103 std::swap (d->op0, d->op1);
26104 /* location != 0 (above), so safe to assume (nelt - location) < nelt.
26105 to_constant () is safe since this is restricted to Advanced SIMD
26106 vectors. */
26107 location = d->perm.length ().to_constant () - location;
26110 offset = GEN_INT (location);
26111 emit_set_insn (d->target,
26112 gen_rtx_UNSPEC (d->vmode,
26113 gen_rtvec (3, d->op0, d->op1, offset),
26114 UNSPEC_EXT));
26115 return true;
26118 /* Recognize patterns for the REV{64,32,16} insns, which reverse elements
26119 within each 64-bit, 32-bit or 16-bit granule. */
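/* For example, on V8HI the index vector { 3, 2, 1, 0, 7, 6, 5, 4 } reverses
   the 16-bit elements within each 64-bit chunk and so maps to REV64. */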
26121 static bool
26122 aarch64_evpc_rev_local (struct expand_vec_perm_d *d)
26124 HOST_WIDE_INT diff;
26125 unsigned int i, size, unspec;
26126 machine_mode pred_mode;
26128 if ((d->vec_flags & VEC_SVE_PRED)
26129 || !d->one_vector_p
26130 || !d->perm[0].is_constant (&diff)
26131 || !diff)
26132 return false;
26134 if (d->vec_flags & VEC_SVE_DATA)
26135 size = (diff + 1) * aarch64_sve_container_bits (d->vmode);
26136 else
26137 size = (diff + 1) * GET_MODE_UNIT_BITSIZE (d->vmode);
26138 if (size == 64)
26140 unspec = UNSPEC_REV64;
26141 pred_mode = VNx2BImode;
26143 else if (size == 32)
26145 unspec = UNSPEC_REV32;
26146 pred_mode = VNx4BImode;
26148 else if (size == 16)
26150 unspec = UNSPEC_REV16;
26151 pred_mode = VNx8BImode;
26153 else
26154 return false;
26156 unsigned int step = diff + 1;
26157 for (i = 0; i < step; ++i)
26158 if (!d->perm.series_p (i, step, diff - i, step))
26159 return false;
26161 /* Success! */
26162 if (d->testing_p)
26163 return true;
26165 if (d->vec_flags & VEC_SVE_DATA)
26167 rtx pred = aarch64_ptrue_reg (pred_mode);
26168 emit_insn (gen_aarch64_sve_revbhw (d->vmode, pred_mode,
26169 d->target, pred, d->op0));
26170 return true;
26172 rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), unspec);
26173 emit_set_insn (d->target, src);
26174 return true;
26177 /* Recognize patterns for the REV insn, which reverses elements within
26178 a full vector. */
26180 static bool
26181 aarch64_evpc_rev_global (struct expand_vec_perm_d *d)
26183 poly_uint64 nelt = d->perm.length ();
26185 if (!d->one_vector_p || d->vec_flags == VEC_ADVSIMD)
26186 return false;
26188 if (!d->perm.series_p (0, 1, nelt - 1, -1))
26189 return false;
26191 /* Success! */
26192 if (d->testing_p)
26193 return true;
26195 rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), UNSPEC_REV);
26196 emit_set_insn (d->target, src);
26197 return true;
26200 static bool
26201 aarch64_evpc_dup (struct expand_vec_perm_d *d)
26203 rtx out = d->target;
26204 rtx in0;
26205 HOST_WIDE_INT elt;
26206 machine_mode vmode = d->vmode;
26207 rtx lane;
26209 if ((d->vec_flags & VEC_SVE_PRED)
26210 || d->perm.encoding ().encoded_nelts () != 1
26211 || !d->perm[0].is_constant (&elt))
26212 return false;
26214 if ((d->vec_flags & VEC_SVE_DATA)
26215 && elt * (aarch64_sve_container_bits (vmode) / 8) >= 64)
26216 return false;
26218 /* Success! */
26219 if (d->testing_p)
26220 return true;
26222 /* The generic preparation in aarch64_expand_vec_perm_const_1
26223 swaps the operand order and the permute indices if it finds
26224 d->perm[0] to be in the second operand. Thus, we can always
26225 use d->op0 and need not do any extra arithmetic to get the
26226 correct lane number. */
26227 in0 = d->op0;
26228 lane = GEN_INT (elt); /* The pattern corrects for big-endian. */
26230 rtx parallel = gen_rtx_PARALLEL (vmode, gen_rtvec (1, lane));
26231 rtx select = gen_rtx_VEC_SELECT (GET_MODE_INNER (vmode), in0, parallel);
26232 emit_set_insn (out, gen_rtx_VEC_DUPLICATE (vmode, select));
26233 return true;
26236 /* Recognize things that can be done using the SVE2p1 Hybrid-VLA
26237 permutations, which apply Advanced-SIMD-style permutations to each
26238 individual 128-bit block. */
26240 static bool
26241 aarch64_evpc_hvla (struct expand_vec_perm_d *d)
26243 machine_mode vmode = d->vmode;
26244 if (!TARGET_SVE2p1
26245 || !TARGET_NON_STREAMING
26246 || BYTES_BIG_ENDIAN
26247 || d->vec_flags != VEC_SVE_DATA
26248 || GET_MODE_UNIT_BITSIZE (vmode) > 64)
26249 return false;
26251 /* Set SUBELTS to the number of elements in an Advanced SIMD vector
26252 and make sure that adding SUBELTS to each block of SUBELTS indices
26253 gives the next block of SUBELTS indices. That is, it must be possible
26254 to interpret the index vector as SUBELTS interleaved linear series in
26255 which each series has step SUBELTS. */
26256 unsigned int subelts = 128U / GET_MODE_UNIT_BITSIZE (vmode);
26257 unsigned int pairs = subelts / 2;
26258 for (unsigned int i = 0; i < subelts; ++i)
26259 if (!d->perm.series_p (i, subelts, d->perm[i], subelts))
26260 return false;
26262 /* Used once we have verified that we can use UNSPEC to do the operation. */
26263 auto use_binary = [&](int unspec) -> bool
26265 if (!d->testing_p)
26267 rtvec vec = gen_rtvec (2, d->op0, d->op1);
26268 emit_set_insn (d->target, gen_rtx_UNSPEC (vmode, vec, unspec));
26270 return true;
26273 /* Now check whether the first SUBELTS elements match a supported
26274 Advanced-SIMD-style operation. */
26275 poly_int64 first = d->perm[0];
26276 poly_int64 nelt = d->perm.length ();
26277 auto try_zip = [&]() -> bool
26279 if (maybe_ne (first, 0) && maybe_ne (first, pairs))
26280 return false;
26281 for (unsigned int i = 0; i < pairs; ++i)
26282 if (maybe_ne (d->perm[i * 2], first + i)
26283 || maybe_ne (d->perm[i * 2 + 1], first + nelt + i))
26284 return false;
26285 return use_binary (maybe_ne (first, 0) ? UNSPEC_ZIPQ2 : UNSPEC_ZIPQ1);
26287 auto try_uzp = [&]() -> bool
26289 if (maybe_ne (first, 0) && maybe_ne (first, 1))
26290 return false;
26291 for (unsigned int i = 0; i < pairs; ++i)
26292 if (maybe_ne (d->perm[i], first + i * 2)
26293 || maybe_ne (d->perm[i + pairs], first + nelt + i * 2))
26294 return false;
26295 return use_binary (maybe_ne (first, 0) ? UNSPEC_UZPQ2 : UNSPEC_UZPQ1);
26297 auto try_extq = [&]() -> bool
26299 HOST_WIDE_INT start;
26300 if (!first.is_constant (&start) || !IN_RANGE (start, 0, subelts - 1))
26301 return false;
26302 for (unsigned int i = 0; i < subelts; ++i)
26304 poly_int64 next = (start + i >= subelts
26305 ? start + i - subelts + nelt
26306 : start + i);
26307 if (maybe_ne (d->perm[i], next))
26308 return false;
26310 if (!d->testing_p)
26312 rtx op2 = gen_int_mode (start, SImode);
26313 emit_insn (gen_aarch64_sve_extq (vmode, d->target,
26314 d->op0, d->op1, op2));
26316 return true;
26318 auto try_dupq = [&]() -> bool
26320 HOST_WIDE_INT start;
26321 if (!first.is_constant (&start) || !IN_RANGE (start, 0, subelts - 1))
26322 return false;
26323 for (unsigned int i = 0; i < subelts; ++i)
26324 if (maybe_ne (d->perm[i], start))
26325 return false;
26326 if (!d->testing_p)
26328 rtx op1 = gen_int_mode (start, SImode);
26329 emit_insn (gen_aarch64_sve_dupq (vmode, d->target, d->op0, op1));
26331 return true;
26334 return try_zip () || try_uzp () || try_extq () || try_dupq ();
26337 static bool
26338 aarch64_evpc_tbl (struct expand_vec_perm_d *d)
26340 rtx rperm[MAX_COMPILE_TIME_VEC_BYTES], sel;
26341 machine_mode vmode = d->vmode;
26343 /* Make sure that the indices are constant. */
26344 unsigned int encoded_nelts = d->perm.encoding ().encoded_nelts ();
26345 for (unsigned int i = 0; i < encoded_nelts; ++i)
26346 if (!d->perm[i].is_constant ())
26347 return false;
26349 if (d->testing_p)
26350 return true;
26352 /* Generic code will try constant permutation twice: once with the
26353 original mode and again with the elements lowered to QImode.
26354 So wait and don't do the selector expansion ourselves. */
26355 if (vmode != V8QImode && vmode != V16QImode)
26356 return false;
26358 /* to_constant is safe since this routine is specific to Advanced SIMD
26359 vectors. */
26360 unsigned int nelt = d->perm.length ().to_constant ();
26362 /* If one register is the constant zero vector then we only need
26363 a single-register TBL and we map any access to the zero vector to -1. We can't
26364 do this earlier since vec_perm_indices clamps elements to within range, so
26365 we can only do it during codegen. */
26366 if (d->zero_op0_p)
26367 d->op0 = d->op1;
26368 else if (d->zero_op1_p)
26369 d->op1 = d->op0;
26371 for (unsigned int i = 0; i < nelt; ++i)
26373 auto val = d->perm[i].to_constant ();
26375 /* If we're selecting from a 0 vector, we can just use an out of range
26376 index instead. */
26377 if ((d->zero_op0_p && val < nelt) || (d->zero_op1_p && val >= nelt))
26378 rperm[i] = constm1_rtx;
26379 else
26381 /* If we are remapping a zero register as the first parameter we need
26382 to adjust the indices of the non-zero register. */
26383 if (d->zero_op0_p)
26384 val = val % nelt;
26386 /* If big-endian and two vectors we end up with a weird mixed-endian
26387 mode on NEON. Reverse the index within each word but not the word
26388 itself. to_constant is safe because we checked is_constant
26389 above. */
26390 rperm[i] = GEN_INT (BYTES_BIG_ENDIAN ? val ^ (nelt - 1) : val);
26394 sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
26395 sel = force_reg (vmode, sel);
26397 aarch64_expand_vec_perm_1 (d->target, d->op0, d->op1, sel);
26398 return true;
26401 /* Try to implement D using an SVE TBL instruction. */
26403 static bool
26404 aarch64_evpc_sve_tbl (struct expand_vec_perm_d *d)
26406 unsigned HOST_WIDE_INT nelt;
26408 /* Permuting two variable-length vectors could overflow the
26409 index range. */
26410 if (!d->one_vector_p && !d->perm.length ().is_constant (&nelt))
26411 return false;
26413 if (d->testing_p)
26414 return true;
26416 machine_mode sel_mode = related_int_vector_mode (d->vmode).require ();
26417 rtx sel = vec_perm_indices_to_rtx (sel_mode, d->perm);
26418 if (d->one_vector_p)
26419 emit_unspec2 (d->target, UNSPEC_TBL, d->op0, force_reg (sel_mode, sel));
26420 else
26421 aarch64_expand_sve_vec_perm (d->target, d->op0, d->op1, sel);
26422 return true;
26425 /* Try to implement D using SVE dup instruction. */
26427 static bool
26428 aarch64_evpc_sve_dup (struct expand_vec_perm_d *d)
26430 if (BYTES_BIG_ENDIAN
26431 || !d->one_vector_p
26432 || d->vec_flags != VEC_SVE_DATA
26433 || d->op_vec_flags != VEC_ADVSIMD
26434 || d->perm.encoding ().nelts_per_pattern () != 1
26435 || !known_eq (d->perm.encoding ().npatterns (),
26436 GET_MODE_NUNITS (d->op_mode))
26437 || !known_eq (GET_MODE_BITSIZE (d->op_mode), 128))
26438 return false;
26440 int npatterns = d->perm.encoding ().npatterns ();
26441 for (int i = 0; i < npatterns; i++)
26442 if (!known_eq (d->perm[i], i))
26443 return false;
26445 if (d->testing_p)
26446 return true;
26448 aarch64_expand_sve_dupq (d->target, GET_MODE (d->target), d->op0);
26449 return true;
26452 /* Try to implement D using SVE SEL instruction. */
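/* For example, a permutation such as { 0, N+1, 2, N+3, ... } (where N is
   the number of elements) takes every lane from either OP0 or OP1 without
   moving it, so it can be implemented as a predicated SEL whose predicate
   is true exactly for the OP0 lanes. */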
26454 static bool
26455 aarch64_evpc_sel (struct expand_vec_perm_d *d)
26457 machine_mode vmode = d->vmode;
26458 int unit_size = GET_MODE_UNIT_SIZE (vmode);
26460 if (d->vec_flags != VEC_SVE_DATA
26461 || unit_size > 8)
26462 return false;
26464 int n_patterns = d->perm.encoding ().npatterns ();
26465 poly_int64 vec_len = d->perm.length ();
26467 for (int i = 0; i < n_patterns; ++i)
26468 if (!known_eq (d->perm[i], i)
26469 && !known_eq (d->perm[i], vec_len + i))
26470 return false;
26472 for (int i = n_patterns; i < n_patterns * 2; i++)
26473 if (!d->perm.series_p (i, n_patterns, i, n_patterns)
26474 && !d->perm.series_p (i, n_patterns, vec_len + i, n_patterns))
26475 return false;
26477 if (d->testing_p)
26478 return true;
26480 machine_mode pred_mode = aarch64_sve_pred_mode (vmode);
26482 /* Build a predicate that is true when op0 elements should be used. */
26483 rtx_vector_builder builder (pred_mode, n_patterns, 2);
26484 for (int i = 0; i < n_patterns * 2; i++)
26486 rtx elem = known_eq (d->perm[i], i) ? CONST1_RTX (BImode)
26487 : CONST0_RTX (BImode);
26488 builder.quick_push (elem);
26491 rtx const_vec = builder.build ();
26492 rtx pred = force_reg (pred_mode, const_vec);
26493 /* TARGET = PRED ? OP0 : OP1. */
26494 emit_insn (gen_vcond_mask (vmode, vmode, d->target, d->op0, d->op1, pred));
26495 return true;
26498 /* Recognize patterns suitable for the INS instructions. */
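/* For example, the V4SI permutation { 0, 1, 6, 3 } copies OP0 except for
   element 2, which is taken from element 2 of OP1, and therefore maps to a
   single INS (vec_copy_lane). */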
26499 static bool
26500 aarch64_evpc_ins (struct expand_vec_perm_d *d)
26502 machine_mode mode = d->vmode;
26503 unsigned HOST_WIDE_INT nelt;
26505 if (d->vec_flags != VEC_ADVSIMD)
26506 return false;
26508 /* to_constant is safe since this routine is specific to Advanced SIMD
26509 vectors. */
26510 nelt = d->perm.length ().to_constant ();
26511 rtx insv = d->op0;
26513 HOST_WIDE_INT idx = -1;
26515 for (unsigned HOST_WIDE_INT i = 0; i < nelt; i++)
26517 HOST_WIDE_INT elt;
26518 if (!d->perm[i].is_constant (&elt))
26519 return false;
26520 if (elt == (HOST_WIDE_INT) i)
26521 continue;
26522 if (idx != -1)
26524 idx = -1;
26525 break;
26527 idx = i;
26530 if (idx == -1)
26532 insv = d->op1;
26533 for (unsigned HOST_WIDE_INT i = 0; i < nelt; i++)
26535 if (d->perm[i].to_constant () == (HOST_WIDE_INT) (i + nelt))
26536 continue;
26537 if (idx != -1)
26538 return false;
26539 idx = i;
26542 if (idx == -1)
26543 return false;
26546 if (d->testing_p)
26547 return true;
26549 gcc_assert (idx != -1);
26551 unsigned extractindex = d->perm[idx].to_constant ();
26552 rtx extractv = d->op0;
26553 if (extractindex >= nelt)
26555 extractv = d->op1;
26556 extractindex -= nelt;
26558 gcc_assert (extractindex < nelt);
26560 insn_code icode = code_for_aarch64_simd_vec_copy_lane (mode);
26561 expand_operand ops[5];
26562 create_output_operand (&ops[0], d->target, mode);
26563 create_input_operand (&ops[1], insv, mode);
26564 create_integer_operand (&ops[2], 1 << idx);
26565 create_input_operand (&ops[3], extractv, mode);
26566 create_integer_operand (&ops[4], extractindex);
26567 expand_insn (icode, 5, ops);
26569 return true;
26572 static bool
26573 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
26575 gcc_assert (d->op_mode != E_VOIDmode);
26577 /* The pattern matching functions above are written to look for a small
26578 number to begin the sequence (0, 1, N/2). If we begin with an index
26579 from the second operand, we can swap the operands. */
26580 poly_int64 nelt = d->perm.length ();
26581 if (known_ge (d->perm[0], nelt))
26583 d->perm.rotate_inputs (1);
26584 std::swap (d->op0, d->op1);
26587 if (((d->vec_flags == VEC_ADVSIMD && TARGET_SIMD)
26588 || d->vec_flags == VEC_SVE_DATA
26589 || d->vec_flags == (VEC_SVE_DATA | VEC_PARTIAL)
26590 || d->vec_flags == VEC_SVE_PRED)
26591 && known_gt (nelt, 1))
26593 if (d->vmode == d->op_mode)
26595 if (aarch64_evpc_rev_local (d))
26596 return true;
26597 else if (aarch64_evpc_rev_global (d))
26598 return true;
26599 else if (aarch64_evpc_ext (d))
26600 return true;
26601 else if (aarch64_evpc_dup (d))
26602 return true;
26603 else if (aarch64_evpc_zip (d))
26604 return true;
26605 else if (aarch64_evpc_uzp (d))
26606 return true;
26607 else if (aarch64_evpc_trn (d))
26608 return true;
26609 else if (aarch64_evpc_sel (d))
26610 return true;
26611 else if (aarch64_evpc_ins (d))
26612 return true;
26613 else if (aarch64_evpc_hvla (d))
26614 return true;
26615 else if (aarch64_evpc_reencode (d))
26616 return true;
26618 if (d->vec_flags == VEC_SVE_DATA)
26619 return aarch64_evpc_sve_tbl (d);
26620 else if (d->vec_flags == VEC_ADVSIMD)
26621 return aarch64_evpc_tbl (d);
26623 else
26625 if (aarch64_evpc_sve_dup (d))
26626 return true;
26629 return false;
26632 /* Implement TARGET_VECTORIZE_VEC_PERM_CONST. */
26634 static bool
26635 aarch64_vectorize_vec_perm_const (machine_mode vmode, machine_mode op_mode,
26636 rtx target, rtx op0, rtx op1,
26637 const vec_perm_indices &sel)
26639 struct expand_vec_perm_d d;
26641 /* Check whether the mask can be applied to a single vector. */
26642 if (sel.ninputs () == 1
26643 || (op0 && rtx_equal_p (op0, op1)))
26644 d.one_vector_p = true;
26645 else if (sel.all_from_input_p (0))
26647 d.one_vector_p = true;
26648 op1 = op0;
26650 else if (sel.all_from_input_p (1))
26652 d.one_vector_p = true;
26653 op0 = op1;
26655 else
26656 d.one_vector_p = false;
26658 d.zero_op0_p = op0 == CONST0_RTX (op_mode);
26659 d.zero_op1_p = op1 == CONST0_RTX (op_mode);
26660 d.perm.new_vector (sel.encoding (), d.one_vector_p ? 1 : 2,
26661 sel.nelts_per_input ());
26662 d.vmode = vmode;
26663 d.vec_flags = aarch64_classify_vector_mode (d.vmode);
26664 d.op_mode = op_mode;
26665 d.op_vec_flags = aarch64_classify_vector_mode (d.op_mode);
26666 d.target = target;
26667 d.op0 = op0 ? force_reg (op_mode, op0) : NULL_RTX;
26668 if (op0 == op1)
26669 d.op1 = d.op0;
26670 else
26671 d.op1 = op1 ? force_reg (op_mode, op1) : NULL_RTX;
26672 d.testing_p = !target;
26674 if (!d.testing_p)
26675 return aarch64_expand_vec_perm_const_1 (&d);
26677 rtx_insn *last = get_last_insn ();
26678 bool ret = aarch64_expand_vec_perm_const_1 (&d);
26679 gcc_assert (last == get_last_insn ());
26681 return ret;
26683 /* Generate a byte permute mask for a register of mode MODE,
26684 which has NUNITS units. */
26687 aarch64_reverse_mask (machine_mode mode, unsigned int nunits)
26689 /* We have to reverse each vector because we don't have
26690 a permuted load that can reverse-load according to ABI rules. */
26691 rtx mask;
26692 rtvec v = rtvec_alloc (16);
26693 unsigned int i, j;
26694 unsigned int usize = GET_MODE_UNIT_SIZE (mode);
26696 gcc_assert (BYTES_BIG_ENDIAN);
26697 gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode));
26699 for (i = 0; i < nunits; i++)
26700 for (j = 0; j < usize; j++)
26701 RTVEC_ELT (v, i * usize + j) = GEN_INT ((i + 1) * usize - 1 - j);
26702 mask = gen_rtx_CONST_VECTOR (V16QImode, v);
26703 return force_reg (V16QImode, mask);
26706 /* Expand an SVE integer comparison using the SVE equivalent of:
26708 (set TARGET (CODE OP0 OP1)). */
26710 void
26711 aarch64_expand_sve_vec_cmp_int (rtx target, rtx_code code, rtx op0, rtx op1)
26713 machine_mode pred_mode = GET_MODE (target);
26714 machine_mode data_mode = GET_MODE (op0);
26715 rtx res = aarch64_sve_emit_int_cmp (target, pred_mode, code, data_mode,
26716 op0, op1);
26717 if (!rtx_equal_p (target, res))
26718 emit_move_insn (target, res);
26721 /* Return the UNSPEC_COND_* code for comparison CODE. */
26723 static unsigned int
26724 aarch64_unspec_cond_code (rtx_code code)
26726 switch (code)
26728 case NE:
26729 return UNSPEC_COND_FCMNE;
26730 case EQ:
26731 return UNSPEC_COND_FCMEQ;
26732 case LT:
26733 return UNSPEC_COND_FCMLT;
26734 case GT:
26735 return UNSPEC_COND_FCMGT;
26736 case LE:
26737 return UNSPEC_COND_FCMLE;
26738 case GE:
26739 return UNSPEC_COND_FCMGE;
26740 case UNORDERED:
26741 return UNSPEC_COND_FCMUO;
26742 default:
26743 gcc_unreachable ();
26747 /* Emit:
26749 (set TARGET (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X>))
26751 where <X> is the operation associated with comparison CODE.
26752 KNOWN_PTRUE_P is true if PRED is known to be a PTRUE. */
26754 static void
26755 aarch64_emit_sve_fp_cond (rtx target, rtx_code code, rtx pred,
26756 bool known_ptrue_p, rtx op0, rtx op1)
26758 rtx flag = gen_int_mode (known_ptrue_p, SImode);
26759 rtx unspec = gen_rtx_UNSPEC (GET_MODE (pred),
26760 gen_rtvec (4, pred, flag, op0, op1),
26761 aarch64_unspec_cond_code (code));
26762 emit_set_insn (target, unspec);
26765 /* Emit the SVE equivalent of:
26767 (set TMP1 (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X1>))
26768 (set TMP2 (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X2>))
26769 (set TARGET (ior:PRED_MODE TMP1 TMP2))
26771 where <Xi> is the operation associated with comparison CODEi.
26772 KNOWN_PTRUE_P is true if PRED is known to be a PTRUE. */
26774 static void
26775 aarch64_emit_sve_or_fp_conds (rtx target, rtx_code code1, rtx_code code2,
26776 rtx pred, bool known_ptrue_p, rtx op0, rtx op1)
26778 machine_mode pred_mode = GET_MODE (pred);
26779 rtx tmp1 = gen_reg_rtx (pred_mode);
26780 aarch64_emit_sve_fp_cond (tmp1, code1, pred, known_ptrue_p, op0, op1);
26781 rtx tmp2 = gen_reg_rtx (pred_mode);
26782 aarch64_emit_sve_fp_cond (tmp2, code2, pred, known_ptrue_p, op0, op1);
26783 aarch64_emit_binop (target, ior_optab, tmp1, tmp2);
26786 /* Emit the SVE equivalent of:
26788 (set TMP (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X>))
26789 (set TARGET (not TMP))
26791 where <X> is the operation associated with comparison CODE.
26792 KNOWN_PTRUE_P is true if PRED is known to be a PTRUE. */
26794 static void
26795 aarch64_emit_sve_invert_fp_cond (rtx target, rtx_code code, rtx pred,
26796 bool known_ptrue_p, rtx op0, rtx op1)
26798 machine_mode pred_mode = GET_MODE (pred);
26799 rtx tmp = gen_reg_rtx (pred_mode);
26800 aarch64_emit_sve_fp_cond (tmp, code, pred, known_ptrue_p, op0, op1);
26801 aarch64_emit_unop (target, one_cmpl_optab, tmp);
26804 /* Expand an SVE floating-point comparison using the SVE equivalent of:
26806 (set TARGET (CODE OP0 OP1))
26808 If CAN_INVERT_P is true, the caller can also handle inverted results;
26809 return true if the result is in fact inverted. */
26811 bool
26812 aarch64_expand_sve_vec_cmp_float (rtx target, rtx_code code,
26813 rtx op0, rtx op1, bool can_invert_p)
26815 machine_mode pred_mode = GET_MODE (target);
26816 machine_mode data_mode = GET_MODE (op0);
26818 rtx ptrue = aarch64_ptrue_reg (pred_mode);
26819 switch (code)
26821 case UNORDERED:
26822 /* UNORDERED has no immediate form. */
26823 op1 = force_reg (data_mode, op1);
26824 /* fall through */
26825 case LT:
26826 case LE:
26827 case GT:
26828 case GE:
26829 case EQ:
26830 case NE:
26832 /* There is native support for the comparison. */
26833 aarch64_emit_sve_fp_cond (target, code, ptrue, true, op0, op1);
26834 return false;
26837 case LTGT:
26838 /* This is a trapping operation (LT or GT). */
26839 aarch64_emit_sve_or_fp_conds (target, LT, GT, ptrue, true, op0, op1);
26840 return false;
26842 case UNEQ:
26843 if (!flag_trapping_math)
26845 /* This would trap for signaling NaNs. */
26846 op1 = force_reg (data_mode, op1);
26847 aarch64_emit_sve_or_fp_conds (target, UNORDERED, EQ,
26848 ptrue, true, op0, op1);
26849 return false;
26851 /* fall through */
26852 case UNLT:
26853 case UNLE:
26854 case UNGT:
26855 case UNGE:
26856 if (flag_trapping_math)
26858 /* Work out which elements are ordered. */
26859 rtx ordered = gen_reg_rtx (pred_mode);
26860 op1 = force_reg (data_mode, op1);
26861 aarch64_emit_sve_invert_fp_cond (ordered, UNORDERED,
26862 ptrue, true, op0, op1);
26864 /* Test the opposite condition for the ordered elements,
26865 then invert the result. */
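	  /* For example, UNLT is implemented as a GE comparison restricted
	     to the ordered elements, with the result then inverted (or
	     reported to the caller as inverted).  */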
26866 if (code == UNEQ)
26867 code = NE;
26868 else
26869 code = reverse_condition_maybe_unordered (code);
26870 if (can_invert_p)
26872 aarch64_emit_sve_fp_cond (target, code,
26873 ordered, false, op0, op1);
26874 return true;
26876 aarch64_emit_sve_invert_fp_cond (target, code,
26877 ordered, false, op0, op1);
26878 return false;
26880 break;
26882 case ORDERED:
26883 /* ORDERED has no immediate form. */
26884 op1 = force_reg (data_mode, op1);
26885 break;
26887 default:
26888 gcc_unreachable ();
26891 /* There is native support for the inverse comparison. */
26892 code = reverse_condition_maybe_unordered (code);
26893 if (can_invert_p)
26895 aarch64_emit_sve_fp_cond (target, code, ptrue, true, op0, op1);
26896 return true;
26898 aarch64_emit_sve_invert_fp_cond (target, code, ptrue, true, op0, op1);
26899 return false;
26902 /* Return true if:
26904 (a) MODE1 and MODE2 use the same layout for bytes that are common
26905 to both modes;
26907 (b) subregs involving the two modes behave as the target-independent
26908 subreg rules require; and
26910 (c) there is at least one register that can hold both modes.
26912 Return false otherwise. */
26914 static bool
26915 aarch64_modes_compatible_p (machine_mode mode1, machine_mode mode2)
26917 unsigned int flags1 = aarch64_classify_vector_mode (mode1);
26918 unsigned int flags2 = aarch64_classify_vector_mode (mode2);
26920 bool sve1_p = (flags1 & VEC_ANY_SVE);
26921 bool sve2_p = (flags2 & VEC_ANY_SVE);
26923 bool partial_sve1_p = sve1_p && (flags1 & VEC_PARTIAL);
26924 bool partial_sve2_p = sve2_p && (flags2 & VEC_PARTIAL);
26926 bool pred1_p = (flags1 & VEC_SVE_PRED);
26927 bool pred2_p = (flags2 & VEC_SVE_PRED);
26929 bool partial_advsimd_struct1_p = (flags1 == (VEC_ADVSIMD | VEC_STRUCT
26930 | VEC_PARTIAL));
26931 bool partial_advsimd_struct2_p = (flags2 == (VEC_ADVSIMD | VEC_STRUCT
26932 | VEC_PARTIAL));
26934 /* Don't allow changes between predicate modes and other modes.
26935 Only predicate registers can hold predicate modes and only
26936 non-predicate registers can hold non-predicate modes, so any
26937 attempt to mix them would require a round trip through memory. */
26938 if (pred1_p != pred2_p)
26939 return false;
26941 /* The contents of partial SVE modes are distributed evenly across
26942 the register, whereas GCC expects them to be clustered together.
26943 We therefore need to be careful about mode changes involving them. */
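  /* For example, a VNx2SImode value keeps each 32-bit element in its own
     64-bit container rather than packing the elements together at one end
     of the register.  */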
26944 if (partial_sve1_p && partial_sve2_p)
26946 /* Reject changes between partial SVE modes that have different
26947 patterns of significant and insignificant bits. */
26948 if ((aarch64_sve_container_bits (mode1)
26949 != aarch64_sve_container_bits (mode2))
26950 || GET_MODE_UNIT_SIZE (mode1) != GET_MODE_UNIT_SIZE (mode2))
26951 return false;
26953 else if (partial_sve1_p)
26955 /* The first lane of MODE1 is where GCC expects it, but anything
26956 bigger than that is not. */
26957 if (maybe_gt (GET_MODE_SIZE (mode2), GET_MODE_UNIT_SIZE (mode1)))
26958 return false;
26960 else if (partial_sve2_p)
26962 /* Similarly in reverse. */
26963 if (maybe_gt (GET_MODE_SIZE (mode1), GET_MODE_UNIT_SIZE (mode2)))
26964 return false;
26967 /* Don't allow changes between partial Advanced SIMD structure modes
26968 and other modes that are bigger than 8 bytes. E.g. V16QI and V2x8QI
26969 are the same size, but the former occupies one Q register while the
26970 latter occupies two D registers. */
26971 if (partial_advsimd_struct1_p != partial_advsimd_struct2_p
26972 && maybe_gt (GET_MODE_SIZE (mode1), 8)
26973 && maybe_gt (GET_MODE_SIZE (mode2), 8))
26974 return false;
26976 if (maybe_ne (BITS_PER_SVE_VECTOR, 128u))
26978 /* Don't allow changes between SVE modes and other modes that might
26979 be bigger than 128 bits. In particular, OImode, CImode and XImode
26980 divide into 128-bit quantities while SVE modes divide into
26981 BITS_PER_SVE_VECTOR quantities. */
26982 if (sve1_p && !sve2_p && maybe_gt (GET_MODE_BITSIZE (mode2), 128))
26983 return false;
26984 if (sve2_p && !sve1_p && maybe_gt (GET_MODE_BITSIZE (mode1), 128))
26985 return false;
26988 if (BYTES_BIG_ENDIAN)
26990 /* Don't allow changes between SVE data modes and non-SVE modes.
26991 See the comment at the head of aarch64-sve.md for details. */
26992 if (sve1_p != sve2_p)
26993 return false;
26995 /* Don't allow changes in element size: lane 0 of the new vector
26996 would not then be lane 0 of the old vector. See the comment
26997 above aarch64_maybe_expand_sve_subreg_move for a more detailed
26998 description.
27000 In the worst case, this forces a register to be spilled in
27001 one mode and reloaded in the other, which handles the
27002 endianness correctly. */
27003 if (sve1_p && GET_MODE_UNIT_SIZE (mode1) != GET_MODE_UNIT_SIZE (mode2))
27004 return false;
27006 return true;
27009 /* Implement TARGET_MODES_TIEABLE_P. In principle we should always defer
27010 to aarch64_modes_compatible_p. However due to issues with register
27011    allocation it is preferable to avoid tying integer scalar and FP
27012 scalar modes. Executing integer operations in general registers is
27013 better than treating them as scalar vector operations. This reduces
27014 latency and avoids redundant int<->FP moves. So tie modes if they
27015 are either the same class, or one of them is a vector mode. */
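/* For example, SImode and SFmode are not tied (different classes, neither
   is a vector), whereas SImode and DImode are, as are SFmode and V4SFmode.  */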
27017 static bool
27018 aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2)
27020 if (aarch64_modes_compatible_p (mode1, mode2))
27022 if (GET_MODE_CLASS (mode1) == GET_MODE_CLASS (mode2))
27023 return true;
27024 if (VECTOR_MODE_P (mode1) || VECTOR_MODE_P (mode2))
27025 return true;
27027 return false;
27030 /* Return a new RTX holding the result of moving POINTER forward by
27031 AMOUNT bytes. */
27033 static rtx
27034 aarch64_move_pointer (rtx pointer, poly_int64 amount)
27036 rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);
27038 return adjust_automodify_address (pointer, GET_MODE (pointer),
27039 next, amount);
27042 /* Expand a cpymem/movmem using the MOPS extension. OPERANDS are taken
27043 from the cpymem/movmem pattern. IS_MEMMOVE is true if this is a memmove
27044 rather than memcpy. Return true iff we succeeded. */
27045 bool
27046 aarch64_expand_cpymem_mops (rtx *operands, bool is_memmove)
27048 if (!TARGET_MOPS)
27049 return false;
27051 /* All three registers are changed by the instruction, so each one
27052 must be a fresh pseudo. */
27053 rtx dst_addr = copy_to_mode_reg (Pmode, XEXP (operands[0], 0));
27054 rtx src_addr = copy_to_mode_reg (Pmode, XEXP (operands[1], 0));
27055 rtx dst_mem = replace_equiv_address (operands[0], dst_addr);
27056 rtx src_mem = replace_equiv_address (operands[1], src_addr);
27057 rtx sz_reg = copy_to_mode_reg (DImode, operands[2]);
27058 if (is_memmove)
27059 emit_insn (gen_aarch64_movmemdi (dst_mem, src_mem, sz_reg));
27060 else
27061 emit_insn (gen_aarch64_cpymemdi (dst_mem, src_mem, sz_reg));
27062 return true;
27065 /* Expand cpymem/movmem, as if from a __builtin_memcpy/memmove.
27066 OPERANDS are taken from the cpymem/movmem pattern. IS_MEMMOVE is true
27067 if this is a memmove rather than memcpy. Return true if we succeed,
27068 otherwise return false, indicating that a libcall should be emitted. */
27069 bool
27070 aarch64_expand_cpymem (rtx *operands, bool is_memmove)
27072 int mode_bytes;
27073 rtx dst = operands[0];
27074 rtx src = operands[1];
27075 unsigned align = UINTVAL (operands[3]);
27076 rtx base;
27077 machine_mode mode = BLKmode, next_mode;
27079 /* Variable-sized or strict-align copies may use the MOPS expansion. */
27080 if (!CONST_INT_P (operands[2]) || (STRICT_ALIGNMENT && align < 16))
27081 return aarch64_expand_cpymem_mops (operands, is_memmove);
27083 unsigned HOST_WIDE_INT size = UINTVAL (operands[2]);
27085 /* Set inline limits for memmove/memcpy. MOPS has a separate threshold. */
27086 unsigned max_copy_size = TARGET_SIMD ? 256 : 128;
27087 unsigned mops_threshold = is_memmove ? aarch64_mops_memmove_size_threshold
27088 : aarch64_mops_memcpy_size_threshold;
27090 /* Reduce the maximum size with -Os. */
27091 if (optimize_function_for_size_p (cfun))
27092 max_copy_size /= 4;
27094 /* Large copies use MOPS when available or a library call. */
27095 if (size > max_copy_size || (TARGET_MOPS && size > mops_threshold))
27096 return aarch64_expand_cpymem_mops (operands, is_memmove);
27098   /* Default to 32-byte LDP/STP on large copies; small copies, or copies
27099      without SIMD support, fall back to 16-byte chunks.
27100 ??? Although it would be possible to use LDP/STP Qn in streaming mode
27101 (so using TARGET_BASE_SIMD instead of TARGET_SIMD), it isn't clear
27102 whether that would improve performance. */
27103 bool use_qregs = size > 24 && TARGET_SIMD;
27105 base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
27106 dst = adjust_automodify_address (dst, VOIDmode, base, 0);
27108 base = copy_to_mode_reg (Pmode, XEXP (src, 0));
27109 src = adjust_automodify_address (src, VOIDmode, base, 0);
27111 auto_vec<std::pair<rtx, rtx>, 16> ops;
27112 int offset = 0;
27114 while (size > 0)
27116       /* Find the largest mode in which to do the copy without over-reading
27117	  or over-writing.  */
27118 opt_scalar_int_mode mode_iter;
27119 FOR_EACH_MODE_IN_CLASS (mode_iter, MODE_INT)
27120 if (GET_MODE_SIZE (mode_iter.require ()) <= MIN (size, 16))
27121 mode = mode_iter.require ();
27123 gcc_assert (mode != BLKmode);
27125 mode_bytes = GET_MODE_SIZE (mode).to_constant ();
27127 /* Prefer Q-register accesses. */
27128 if (mode_bytes == 16 && use_qregs)
27129 mode = V4SImode;
27131 rtx reg = gen_reg_rtx (mode);
27132 rtx load = gen_move_insn (reg, adjust_address (src, mode, offset));
27133 rtx store = gen_move_insn (adjust_address (dst, mode, offset), reg);
27134 ops.safe_push ({ load, store });
27135 size -= mode_bytes;
27136 offset += mode_bytes;
27138 /* Emit trailing copies using overlapping unaligned accesses
27139 (when !STRICT_ALIGNMENT) - this is smaller and faster. */
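      /* For example, an 11-byte copy is done as an 8-byte access at offset 0
	 followed by a 4-byte access at offset 7, the two accesses overlapping
	 by one byte.  */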
27140 if (size > 0 && size < 16 && !STRICT_ALIGNMENT)
27142 next_mode = smallest_mode_for_size
27143 (size * BITS_PER_UNIT, MODE_INT).require ();
27144 int n_bytes = GET_MODE_SIZE (next_mode).to_constant ();
27145 gcc_assert (n_bytes <= mode_bytes);
27146 offset -= n_bytes - size;
27147 size = n_bytes;
27151 /* Memcpy interleaves loads with stores, memmove emits all loads first. */
27152   int nops = ops.length ();
27153 int inc = is_memmove || nops <= 8 ? nops : 6;
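  /* For memmove, or when there are at most 8 operations, all loads are
     emitted before all stores; larger memcpy expansions emit groups of
     6 loads followed by the corresponding 6 stores.  */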
27155 for (int i = 0; i < nops; i += inc)
27157 int m = MIN (nops, i + inc);
27158 /* Emit loads. */
27159 for (int j = i; j < m; j++)
27160 emit_insn (ops[j].first);
27161 /* Emit stores. */
27162 for (int j = i; j < m; j++)
27163 emit_insn (ops[j].second);
27165 return true;
27168 /* Expand a setmem using the MOPS instructions. OPERANDS are the same
27169 as for the setmem pattern. Return true iff we succeed. */
27170 static bool
27171 aarch64_expand_setmem_mops (rtx *operands)
27173 if (!TARGET_MOPS)
27174 return false;
27176 /* The first two registers are changed by the instruction, so both
27177      of them must be fresh pseudos.  */
27178 rtx dst_addr = copy_to_mode_reg (Pmode, XEXP (operands[0], 0));
27179 rtx dst_mem = replace_equiv_address (operands[0], dst_addr);
27180 rtx sz_reg = copy_to_mode_reg (DImode, operands[1]);
27181 rtx val = operands[2];
27182 if (val != CONST0_RTX (QImode))
27183 val = force_reg (QImode, val);
27184 emit_insn (gen_aarch64_setmemdi (dst_mem, val, sz_reg));
27185 return true;
27188 /* Expand setmem, as if from a __builtin_memset. Return true if
27189 we succeed, otherwise return false. */
27191 bool
27192 aarch64_expand_setmem (rtx *operands)
27194 int mode_bytes;
27195 unsigned HOST_WIDE_INT len;
27196 rtx dst = operands[0];
27197 rtx val = operands[2], src;
27198 unsigned align = UINTVAL (operands[3]);
27199 rtx base;
27200 machine_mode mode = BLKmode, next_mode;
27202 /* Variable-sized or strict-align memset may use the MOPS expansion. */
27203 if (!CONST_INT_P (operands[1]) || !TARGET_SIMD
27204 || (STRICT_ALIGNMENT && align < 16))
27205 return aarch64_expand_setmem_mops (operands);
27207 /* Set inline limits for memset. MOPS has a separate threshold. */
27208 unsigned max_set_size = MAX_SET_SIZE (optimize_function_for_speed_p (cfun));
27209 unsigned mops_threshold = aarch64_mops_memset_size_threshold;
27211 len = UINTVAL (operands[1]);
27213 /* Large memset uses MOPS when available or a library call. */
27214 if (len > max_set_size || (TARGET_MOPS && len > mops_threshold))
27215 return aarch64_expand_setmem_mops (operands);
27217 base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
27218 dst = adjust_automodify_address (dst, VOIDmode, base, 0);
27220 /* Prepare the val using a DUP/MOVI v0.16B, val. */
27221 val = expand_vector_broadcast (V16QImode, val);
27222 val = force_reg (V16QImode, val);
27224 int offset = 0;
27225 while (len > 0)
27227 /* Find the largest mode in which to do the copy without
27228	  overwriting.  */
27229 opt_scalar_int_mode mode_iter;
27230 FOR_EACH_MODE_IN_CLASS (mode_iter, MODE_INT)
27231 if (GET_MODE_SIZE (mode_iter.require ()) <= MIN (len, 16))
27232 mode = mode_iter.require ();
27234 gcc_assert (mode != BLKmode);
27236 mode_bytes = GET_MODE_SIZE (mode).to_constant ();
27238 src = val;
27240 /* Prefer Q-register accesses. */
27241 if (mode_bytes == 16)
27242 mode = V16QImode;
27243 else
27244 src = lowpart_subreg (mode, src, GET_MODE (val));
27246 emit_move_insn (adjust_address (dst, mode, offset), src);
27247 len -= mode_bytes;
27248 offset += mode_bytes;
27250 /* Emit trailing writes using overlapping unaligned accesses
27251 (when !STRICT_ALIGNMENT) - this is smaller and faster. */
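      /* For example, an 11-byte memset is done as an 8-byte store at offset 0
	 followed by a 4-byte store at offset 7.  */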
27252 if (len > 0 && len < 16 && !STRICT_ALIGNMENT)
27254 next_mode = smallest_mode_for_size
27255 (len * BITS_PER_UNIT, MODE_INT).require ();
27256 int n_bytes = GET_MODE_SIZE (next_mode).to_constant ();
27257 gcc_assert (n_bytes <= mode_bytes);
27258 offset -= n_bytes - len;
27259 len = n_bytes;
27263 return true;
27267 /* Split a DImode store of a CONST_INT SRC to MEM DST as two
27268 SImode stores. Handle the case when the constant has identical
27269 bottom and top halves. This is beneficial when the two stores can be
27270 merged into an STP and we avoid synthesising potentially expensive
27271 immediates twice. Return true if such a split is possible. */
27273 bool
27274 aarch64_split_dimode_const_store (rtx dst, rtx src)
27276 rtx lo = gen_lowpart (SImode, src);
27277 rtx hi = gen_highpart_mode (SImode, DImode, src);
27279 if (!rtx_equal_p (lo, hi))
27280 return false;
27282 unsigned int orig_cost
27283 = aarch64_internal_mov_immediate (NULL_RTX, src, false, DImode);
27284 unsigned int lo_cost
27285 = aarch64_internal_mov_immediate (NULL_RTX, lo, false, SImode);
27287 /* We want to transform:
27288 MOV x1, 49370
27289 MOVK x1, 0x140, lsl 16
27290 MOVK x1, 0xc0da, lsl 32
27291 MOVK x1, 0x140, lsl 48
27292 STR x1, [x0]
27293 into:
27294 MOV w1, 49370
27295 MOVK w1, 0x140, lsl 16
27296 STP w1, w1, [x0]
27297 So we want to perform this when we save at least one instruction. */
27298 if (orig_cost <= lo_cost)
27299 return false;
27301 rtx mem_lo = adjust_address (dst, SImode, 0);
27302 if (!aarch64_mem_pair_operand (mem_lo, SImode))
27303 return false;
27305 rtx tmp_reg = gen_reg_rtx (SImode);
27306 aarch64_expand_mov_immediate (tmp_reg, lo);
27307 rtx mem_hi = aarch64_move_pointer (mem_lo, GET_MODE_SIZE (SImode));
27308   /* Don't emit an explicit store pair as this may not always be profitable.
27309 Let the sched-fusion logic decide whether to merge them. */
27310 emit_move_insn (mem_lo, tmp_reg);
27311 emit_move_insn (mem_hi, tmp_reg);
27313 return true;
27316 /* Generate RTL for a conditional branch with rtx comparison CODE in
27317 mode CC_MODE. The destination of the unlikely conditional branch
27318 is LABEL_REF. */
27320 void
27321 aarch64_gen_unlikely_cbranch (enum rtx_code code, machine_mode cc_mode,
27322 rtx label_ref)
27324 rtx x;
27325 x = gen_rtx_fmt_ee (code, VOIDmode,
27326 gen_rtx_REG (cc_mode, CC_REGNUM),
27327 const0_rtx);
27329 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
27330 gen_rtx_LABEL_REF (VOIDmode, label_ref),
27331 pc_rtx);
27332 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
27335 /* Generate DImode scratch registers for 128-bit (TImode) addition.
27337    OP1 represents the TImode source operand 1
27338    OP2 represents the TImode source operand 2
27339 LOW_DEST represents the low half (DImode) of TImode operand 0
27340 LOW_IN1 represents the low half (DImode) of TImode operand 1
27341 LOW_IN2 represents the low half (DImode) of TImode operand 2
27342 HIGH_DEST represents the high half (DImode) of TImode operand 0
27343 HIGH_IN1 represents the high half (DImode) of TImode operand 1
27344 HIGH_IN2 represents the high half (DImode) of TImode operand 2. */
27346 void
27347 aarch64_addti_scratch_regs (rtx op1, rtx op2, rtx *low_dest,
27348 rtx *low_in1, rtx *low_in2,
27349 rtx *high_dest, rtx *high_in1,
27350 rtx *high_in2)
27352 *low_dest = gen_reg_rtx (DImode);
27353 *low_in1 = force_lowpart_subreg (DImode, op1, TImode);
27354 *low_in2 = force_lowpart_subreg (DImode, op2, TImode);
27355 *high_dest = gen_reg_rtx (DImode);
27356 *high_in1 = force_highpart_subreg (DImode, op1, TImode);
27357 *high_in2 = force_highpart_subreg (DImode, op2, TImode);
27360 /* Generate DImode scratch registers for 128-bit (TImode) subtraction.
27362    OP1 represents the TImode source operand 1
27363    OP2 represents the TImode source operand 2
27364 LOW_DEST represents the low half (DImode) of TImode operand 0
27365 LOW_IN1 represents the low half (DImode) of TImode operand 1
27366 LOW_IN2 represents the low half (DImode) of TImode operand 2
27367 HIGH_DEST represents the high half (DImode) of TImode operand 0
27368 HIGH_IN1 represents the high half (DImode) of TImode operand 1
27369 HIGH_IN2 represents the high half (DImode) of TImode operand 2. */
27372 void
27373 aarch64_subvti_scratch_regs (rtx op1, rtx op2, rtx *low_dest,
27374 rtx *low_in1, rtx *low_in2,
27375 rtx *high_dest, rtx *high_in1,
27376 rtx *high_in2)
27378 *low_dest = gen_reg_rtx (DImode);
27379 *low_in1 = force_lowpart_subreg (DImode, op1, TImode);
27380 *low_in2 = force_lowpart_subreg (DImode, op2, TImode);
27381 *high_dest = gen_reg_rtx (DImode);
27383 *high_in1 = force_highpart_subreg (DImode, op1, TImode);
27384 *high_in2 = force_highpart_subreg (DImode, op2, TImode);
27387 /* Generate RTL for 128-bit (TImode) subtraction with overflow.
27389 OP0 represents the TImode destination operand 0
27390 LOW_DEST represents the low half (DImode) of TImode operand 0
27391 LOW_IN1 represents the low half (DImode) of TImode operand 1
27392 LOW_IN2 represents the low half (DImode) of TImode operand 2
27393 HIGH_DEST represents the high half (DImode) of TImode operand 0
27394 HIGH_IN1 represents the high half (DImode) of TImode operand 1
27395 HIGH_IN2 represents the high half (DImode) of TImode operand 2
27396 UNSIGNED_P is true if the operation is being performed on unsigned
27397 values. */
27398 void
27399 aarch64_expand_subvti (rtx op0, rtx low_dest, rtx low_in1,
27400 rtx low_in2, rtx high_dest, rtx high_in1,
27401 rtx high_in2, bool unsigned_p)
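  /* If the low half of the second operand is zero, the low-part subtraction
     cannot generate a borrow, so the low half of the result is just LOW_IN1
     and only the high halves need a flag-setting subtraction.  */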
27403 if (low_in2 == const0_rtx)
27405 low_dest = low_in1;
27406 high_in2 = force_reg (DImode, high_in2);
27407 if (unsigned_p)
27408 emit_insn (gen_subdi3_compare1 (high_dest, high_in1, high_in2));
27409 else
27410 emit_insn (gen_subvdi_insn (high_dest, high_in1, high_in2));
27412 else
27414 if (aarch64_plus_immediate (low_in2, DImode))
27415 emit_insn (gen_subdi3_compare1_imm (low_dest, low_in1, low_in2,
27416 GEN_INT (-UINTVAL (low_in2))));
27417 else
27419 low_in2 = force_reg (DImode, low_in2);
27420 emit_insn (gen_subdi3_compare1 (low_dest, low_in1, low_in2));
27422 high_in2 = force_reg (DImode, high_in2);
27424 if (unsigned_p)
27425 emit_insn (gen_usubdi3_carryinC (high_dest, high_in1, high_in2));
27426 else
27427 emit_insn (gen_subdi3_carryinV (high_dest, high_in1, high_in2));
27430 emit_move_insn (gen_lowpart (DImode, op0), low_dest);
27431 emit_move_insn (gen_highpart (DImode, op0), high_dest);
27435 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
27437 static unsigned HOST_WIDE_INT
27438 aarch64_asan_shadow_offset (void)
27440 if (TARGET_ILP32)
27441 return (HOST_WIDE_INT_1 << 29);
27442 else
27443 return (HOST_WIDE_INT_1 << 36);
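/* Implement TARGET_GEN_CCMP_FIRST.  Expand the first comparison
   (CODE TREEOP0 TREEOP1) of a conditional-compare sequence.  Store the
   preparation insns in *PREP_SEQ and the comparison itself in *GEN_SEQ,
   and return a comparison of the CC register with zero, or NULL_RTX if
   the comparison cannot be handled.  */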
27446 static rtx
27447 aarch64_gen_ccmp_first (rtx_insn **prep_seq, rtx_insn **gen_seq,
27448 rtx_code code, tree treeop0, tree treeop1)
27450 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
27451 rtx op0, op1;
27452 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
27453 insn_code icode;
27454 struct expand_operand ops[4];
27456 start_sequence ();
27457 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
27459 op_mode = GET_MODE (op0);
27460 if (op_mode == VOIDmode)
27461 op_mode = GET_MODE (op1);
27463 if (CONST_SCALAR_INT_P (op1))
27464 canonicalize_comparison (op_mode, &code, &op1);
27466 switch (op_mode)
27468 case E_QImode:
27469 case E_HImode:
27470 case E_SImode:
27471 cmp_mode = SImode;
27472 icode = CODE_FOR_cmpsi;
27473 break;
27475 case E_DImode:
27476 cmp_mode = DImode;
27477 icode = CODE_FOR_cmpdi;
27478 break;
27480 case E_SFmode:
27481 cmp_mode = SFmode;
27482 cc_mode = aarch64_select_cc_mode (code, op0, op1);
27483 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpesf : CODE_FOR_fcmpsf;
27484 break;
27486 case E_DFmode:
27487 cmp_mode = DFmode;
27488 cc_mode = aarch64_select_cc_mode (code, op0, op1);
27489 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpedf : CODE_FOR_fcmpdf;
27490 break;
27492 default:
27493 end_sequence ();
27494 return NULL_RTX;
27497 op0 = prepare_operand (icode, op0, 0, op_mode, cmp_mode, unsignedp);
27498 op1 = prepare_operand (icode, op1, 1, op_mode, cmp_mode, unsignedp);
27499 if (!op0 || !op1)
27501 end_sequence ();
27502 return NULL_RTX;
27504 *prep_seq = get_insns ();
27505 end_sequence ();
27507 create_fixed_operand (&ops[0], op0);
27508 create_fixed_operand (&ops[1], op1);
27510 start_sequence ();
27511 if (!maybe_expand_insn (icode, 2, ops))
27513 end_sequence ();
27514 return NULL_RTX;
27516 *gen_seq = get_insns ();
27517 end_sequence ();
27519 return gen_rtx_fmt_ee (code, cc_mode,
27520 gen_rtx_REG (cc_mode, CC_REGNUM), const0_rtx);
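/* Implement TARGET_GEN_CCMP_NEXT.  Expand a conditional comparison that
   performs (CMP_CODE TREEOP0 TREEOP1) and combines it with the previous
   comparison PREV using BIT_CODE (AND or IOR).  Append the preparation
   insns to *PREP_SEQ and the CCMP itself to *GEN_SEQ, and return a
   comparison of the CC register with zero, or NULL_RTX on failure.  */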
27523 static rtx
27524 aarch64_gen_ccmp_next (rtx_insn **prep_seq, rtx_insn **gen_seq, rtx prev,
27525 rtx_code cmp_code, tree treeop0, tree treeop1,
27526 rtx_code bit_code)
27528 rtx op0, op1, target;
27529 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
27530 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
27531 insn_code icode;
27532 struct expand_operand ops[6];
27533 int aarch64_cond;
27535 push_to_sequence (*prep_seq);
27536 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
27538 op_mode = GET_MODE (op0);
27539 if (op_mode == VOIDmode)
27540 op_mode = GET_MODE (op1);
27542 if (CONST_SCALAR_INT_P (op1))
27543 canonicalize_comparison (op_mode, &cmp_code, &op1);
27545 switch (op_mode)
27547 case E_QImode:
27548 case E_HImode:
27549 case E_SImode:
27550 cmp_mode = SImode;
27551 break;
27553 case E_DImode:
27554 cmp_mode = DImode;
27555 break;
27557 case E_SFmode:
27558 cmp_mode = SFmode;
27559 cc_mode = aarch64_select_cc_mode (cmp_code, op0, op1);
27560 break;
27562 case E_DFmode:
27563 cmp_mode = DFmode;
27564 cc_mode = aarch64_select_cc_mode (cmp_code, op0, op1);
27565 break;
27567 default:
27568 end_sequence ();
27569 return NULL_RTX;
27572 icode = code_for_ccmp (cc_mode, cmp_mode);
27574 op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
27575 op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
27576 if (!op0 || !op1)
27578 end_sequence ();
27579 return NULL_RTX;
27581 *prep_seq = get_insns ();
27582 end_sequence ();
27584 target = gen_rtx_REG (cc_mode, CC_REGNUM);
27585 aarch64_cond = aarch64_get_condition_code_1 (cc_mode, cmp_code);
27587 if (bit_code != AND)
27589 /* Treat the ccmp patterns as canonical and use them where possible,
27590 but fall back to ccmp_rev patterns if there's no other option. */
27591 rtx_code prev_code = GET_CODE (prev);
27592 machine_mode prev_mode = GET_MODE (XEXP (prev, 0));
27593 if ((prev_mode == CCFPmode || prev_mode == CCFPEmode)
27594 && !(prev_code == EQ
27595 || prev_code == NE
27596 || prev_code == ORDERED
27597 || prev_code == UNORDERED))
27598 icode = code_for_ccmp_rev (cc_mode, cmp_mode);
27599 else
27601 rtx_code code = reverse_condition (prev_code);
27602 prev = gen_rtx_fmt_ee (code, VOIDmode, XEXP (prev, 0), const0_rtx);
27604 aarch64_cond = AARCH64_INVERSE_CONDITION_CODE (aarch64_cond);
27607 create_fixed_operand (&ops[0], XEXP (prev, 0));
27608 create_fixed_operand (&ops[1], target);
27609 create_fixed_operand (&ops[2], op0);
27610 create_fixed_operand (&ops[3], op1);
27611 create_fixed_operand (&ops[4], prev);
27612 create_fixed_operand (&ops[5], GEN_INT (aarch64_cond));
27614 push_to_sequence (*gen_seq);
27615 if (!maybe_expand_insn (icode, 6, ops))
27617 end_sequence ();
27618 return NULL_RTX;
27621 *gen_seq = get_insns ();
27622 end_sequence ();
27624 return gen_rtx_fmt_ee (cmp_code, VOIDmode, target, const0_rtx);
27627 #undef TARGET_GEN_CCMP_FIRST
27628 #define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first
27630 #undef TARGET_GEN_CCMP_NEXT
27631 #define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next
27633 /* Implement TARGET_SCHED_MACRO_FUSION_P. Return true if target supports
27634 instruction fusion of some sort. */
27636 static bool
27637 aarch64_macro_fusion_p (void)
27639 return aarch64_tune_params.fusible_ops != AARCH64_FUSE_NOTHING;
27643 /* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P. Return true if PREV and CURR
27644 should be kept together during scheduling. */
27646 static bool
27647 aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
27649 rtx set_dest;
27650 rtx prev_set = single_set (prev);
27651 rtx curr_set = single_set (curr);
27652 /* prev and curr are simple SET insns i.e. no flag setting or branching. */
27653 bool simple_sets_p = prev_set && curr_set && !any_condjump_p (curr);
27655 if (!aarch64_macro_fusion_p ())
27656 return false;
27658 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOV_MOVK))
27660 /* We are trying to match:
27661 prev (mov) == (set (reg r0) (const_int imm16))
27662 curr (movk) == (set (zero_extract (reg r0)
27663 (const_int 16)
27664 (const_int 16))
27665 (const_int imm16_1)) */
27667 set_dest = SET_DEST (curr_set);
27669 if (GET_CODE (set_dest) == ZERO_EXTRACT
27670 && CONST_INT_P (SET_SRC (curr_set))
27671 && CONST_INT_P (SET_SRC (prev_set))
27672 && CONST_INT_P (XEXP (set_dest, 2))
27673 && INTVAL (XEXP (set_dest, 2)) == 16
27674 && REG_P (XEXP (set_dest, 0))
27675 && REG_P (SET_DEST (prev_set))
27676 && REGNO (XEXP (set_dest, 0)) == REGNO (SET_DEST (prev_set)))
27678 return true;
27682 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_ADD))
27685 /* We're trying to match:
27686 prev (adrp) == (set (reg r1)
27687 (high (symbol_ref ("SYM"))))
27688 curr (add) == (set (reg r0)
27689 (lo_sum (reg r1)
27690 (symbol_ref ("SYM"))))
27691 Note that r0 need not necessarily be the same as r1, especially
27692 during pre-regalloc scheduling. */
27694 if (satisfies_constraint_Ush (SET_SRC (prev_set))
27695 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
27697 if (GET_CODE (SET_SRC (curr_set)) == LO_SUM
27698 && REG_P (XEXP (SET_SRC (curr_set), 0))
27699 && REGNO (XEXP (SET_SRC (curr_set), 0))
27700 == REGNO (SET_DEST (prev_set))
27701 && rtx_equal_p (XEXP (SET_SRC (prev_set), 0),
27702 XEXP (SET_SRC (curr_set), 1)))
27703 return true;
27707 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOVK_MOVK))
27710 /* We're trying to match:
27711 prev (movk) == (set (zero_extract (reg r0)
27712 (const_int 16)
27713 (const_int 32))
27714 (const_int imm16_1))
27715 curr (movk) == (set (zero_extract (reg r0)
27716 (const_int 16)
27717 (const_int 48))
27718 (const_int imm16_2)) */
27720 if (GET_CODE (SET_DEST (prev_set)) == ZERO_EXTRACT
27721 && GET_CODE (SET_DEST (curr_set)) == ZERO_EXTRACT
27722 && REG_P (XEXP (SET_DEST (prev_set), 0))
27723 && REG_P (XEXP (SET_DEST (curr_set), 0))
27724 && REGNO (XEXP (SET_DEST (prev_set), 0))
27725 == REGNO (XEXP (SET_DEST (curr_set), 0))
27726 && CONST_INT_P (XEXP (SET_DEST (prev_set), 2))
27727 && CONST_INT_P (XEXP (SET_DEST (curr_set), 2))
27728 && INTVAL (XEXP (SET_DEST (prev_set), 2)) == 32
27729 && INTVAL (XEXP (SET_DEST (curr_set), 2)) == 48
27730 && CONST_INT_P (SET_SRC (prev_set))
27731 && CONST_INT_P (SET_SRC (curr_set)))
27732 return true;
27735 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_LDR))
27737 /* We're trying to match:
27738 prev (adrp) == (set (reg r0)
27739 (high (symbol_ref ("SYM"))))
27740 curr (ldr) == (set (reg r1)
27741 (mem (lo_sum (reg r0)
27742 (symbol_ref ("SYM")))))
27744 curr (ldr) == (set (reg r1)
27745 (zero_extend (mem
27746 (lo_sum (reg r0)
27747 (symbol_ref ("SYM")))))) */
27748 if (satisfies_constraint_Ush (SET_SRC (prev_set))
27749 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
27751 rtx curr_src = SET_SRC (curr_set);
27753 if (GET_CODE (curr_src) == ZERO_EXTEND)
27754 curr_src = XEXP (curr_src, 0);
27756 if (MEM_P (curr_src) && GET_CODE (XEXP (curr_src, 0)) == LO_SUM
27757 && REG_P (XEXP (XEXP (curr_src, 0), 0))
27758 && REGNO (XEXP (XEXP (curr_src, 0), 0))
27759 == REGNO (SET_DEST (prev_set))
27760 && rtx_equal_p (XEXP (XEXP (curr_src, 0), 1),
27761 XEXP (SET_SRC (prev_set), 0)))
27762 return true;
27766 /* Fuse compare (CMP/CMN/TST/BICS) and conditional branch. */
27767 if (aarch64_fusion_enabled_p (AARCH64_FUSE_CMP_BRANCH)
27768 && prev_set && curr_set && any_condjump_p (curr)
27769 && GET_CODE (SET_SRC (prev_set)) == COMPARE
27770 && SCALAR_INT_MODE_P (GET_MODE (XEXP (SET_SRC (prev_set), 0)))
27771 && reg_referenced_p (SET_DEST (prev_set), PATTERN (curr)))
27772 return true;
27774 /* Fuse CMP and CSEL/CSET. */
27775 if (prev_set && curr_set
27776 && GET_CODE (SET_SRC (prev_set)) == COMPARE
27777 && SCALAR_INT_MODE_P (GET_MODE (XEXP (SET_SRC (prev_set), 0)))
27778 && reg_referenced_p (SET_DEST (prev_set), PATTERN (curr)))
27780 enum attr_type prev_type = get_attr_type (prev);
27781 if ((prev_type == TYPE_ALUS_SREG || prev_type == TYPE_ALUS_IMM)
27782 && ((aarch64_fusion_enabled_p (AARCH64_FUSE_CMP_CSEL)
27783 && GET_CODE (SET_SRC (curr_set)) == IF_THEN_ELSE
27784 && aarch64_reg_or_zero (XEXP (SET_SRC (curr_set), 1), VOIDmode)
27785 && aarch64_reg_or_zero (XEXP (SET_SRC (curr_set), 2), VOIDmode)
27786 && SCALAR_INT_MODE_P (GET_MODE (XEXP (SET_SRC (curr_set), 1))))
27787 || (aarch64_fusion_enabled_p (AARCH64_FUSE_CMP_CSET)
27788 && GET_RTX_CLASS (GET_CODE (SET_SRC (curr_set)))
27789 == RTX_COMPARE
27790 && REG_P (SET_DEST (curr_set)))))
27791 return true;
27794 /* Fuse flag-setting ALU instructions and conditional branch. */
27795 if (aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_BRANCH)
27796 && any_condjump_p (curr))
27798 unsigned int condreg1, condreg2;
27799 rtx cc_reg_1;
27800 aarch64_fixed_condition_code_regs (&condreg1, &condreg2);
27801 cc_reg_1 = gen_rtx_REG (CCmode, condreg1);
27803 if (reg_referenced_p (cc_reg_1, PATTERN (curr))
27804 && prev
27805 && modified_in_p (cc_reg_1, prev))
27807 enum attr_type prev_type = get_attr_type (prev);
27809	  /* FIXME: this misses some instructions that are considered simple
27810	     arithmetic instructions for ThunderX.  Simple shifts are missed here.  */
27811 if (prev_type == TYPE_ALUS_SREG
27812 || prev_type == TYPE_ALUS_IMM
27813 || prev_type == TYPE_LOGICS_REG
27814 || prev_type == TYPE_LOGICS_IMM)
27815 return true;
27819 /* Fuse ALU instructions and CBZ/CBNZ. */
27820 if (prev_set
27821 && curr_set
27822 && aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_CBZ)
27823 && any_condjump_p (curr))
27825 /* We're trying to match:
27826 prev (alu_insn) == (set (r0) plus ((r0) (r1/imm)))
27827 curr (cbz) == (set (pc) (if_then_else (eq/ne) (r0)
27828 (const_int 0))
27829 (label_ref ("SYM"))
27830 (pc)) */
27831 if (SET_DEST (curr_set) == (pc_rtx)
27832 && GET_CODE (SET_SRC (curr_set)) == IF_THEN_ELSE
27833 && REG_P (XEXP (XEXP (SET_SRC (curr_set), 0), 0))
27834 && REG_P (SET_DEST (prev_set))
27835 && REGNO (SET_DEST (prev_set))
27836 == REGNO (XEXP (XEXP (SET_SRC (curr_set), 0), 0)))
27838	  /* Fuse ALU operations followed by a conditional branch instruction.  */
27839 switch (get_attr_type (prev))
27841 case TYPE_ALU_IMM:
27842 case TYPE_ALU_SREG:
27843 case TYPE_ADC_REG:
27844 case TYPE_ADC_IMM:
27845 case TYPE_ADCS_REG:
27846 case TYPE_ADCS_IMM:
27847 case TYPE_LOGIC_REG:
27848 case TYPE_LOGIC_IMM:
27849 case TYPE_CSEL:
27850 case TYPE_ADR:
27851 case TYPE_MOV_IMM:
27852 case TYPE_SHIFT_REG:
27853 case TYPE_SHIFT_IMM:
27854 case TYPE_BFM:
27855 case TYPE_RBIT:
27856 case TYPE_REV:
27857 case TYPE_EXTEND:
27858 return true;
27860 default:;
27865   /* Fuse A+B+1 and A-B-1.  */
27866 if (simple_sets_p
27867 && aarch64_fusion_enabled_p (AARCH64_FUSE_ADDSUB_2REG_CONST1))
27869 /* We're trying to match:
27870 prev == (set (r0) (plus (r0) (r1)))
27871 curr == (set (r0) (plus (r0) (const_int 1)))
27873 prev == (set (r0) (minus (r0) (r1)))
27874 curr == (set (r0) (plus (r0) (const_int -1))) */
27876 rtx prev_src = SET_SRC (prev_set);
27877 rtx curr_src = SET_SRC (curr_set);
27879 int polarity = 1;
27880 if (GET_CODE (prev_src) == MINUS)
27881 polarity = -1;
27883 if (GET_CODE (curr_src) == PLUS
27884 && (GET_CODE (prev_src) == PLUS || GET_CODE (prev_src) == MINUS)
27885 && CONST_INT_P (XEXP (curr_src, 1))
27886 && INTVAL (XEXP (curr_src, 1)) == polarity
27887 && REG_P (XEXP (curr_src, 0))
27888 && REG_P (SET_DEST (prev_set))
27889 && REGNO (SET_DEST (prev_set)) == REGNO (XEXP (curr_src, 0)))
27890 return true;
27893 return false;
27896 /* Return true iff the instruction fusion described by OP is enabled. */
27898 bool
27899 aarch64_fusion_enabled_p (enum aarch64_fusion_pairs op)
27901 return (aarch64_tune_params.fusible_ops & op) != 0;
27904 /* If MEM is in the form of [base+offset], extract the two parts
27905    of the address and store them in BASE and OFFSET, otherwise return
27906    false after clearing BASE and OFFSET.  */
27908 bool
27909 extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
27911 rtx addr;
27913 gcc_assert (MEM_P (mem));
27915 addr = XEXP (mem, 0);
27917 if (REG_P (addr))
27919 *base = addr;
27920 *offset = const0_rtx;
27921 return true;
27924 if (GET_CODE (addr) == PLUS
27925 && REG_P (XEXP (addr, 0)) && CONST_INT_P (XEXP (addr, 1)))
27927 *base = XEXP (addr, 0);
27928 *offset = XEXP (addr, 1);
27929 return true;
27932 *base = NULL_RTX;
27933 *offset = NULL_RTX;
27935 return false;
27938 /* Types for scheduling fusion. */
27939 enum sched_fusion_type
27941 SCHED_FUSION_NONE = 0,
27942 SCHED_FUSION_LD_SIGN_EXTEND,
27943 SCHED_FUSION_LD_ZERO_EXTEND,
27944 SCHED_FUSION_LD,
27945 SCHED_FUSION_ST,
27946 SCHED_FUSION_NUM
27949 /* If INSN is a load or store with an address in the form of [base+offset],
27950    extract the two parts and store them in BASE and OFFSET.  Return the
27951    scheduling fusion type of this INSN.  */
27953 static enum sched_fusion_type
27954 fusion_load_store (rtx_insn *insn, rtx *base, rtx *offset)
27956 rtx x, dest, src;
27957 enum sched_fusion_type fusion = SCHED_FUSION_LD;
27959 gcc_assert (INSN_P (insn));
27960 x = PATTERN (insn);
27961 if (GET_CODE (x) != SET)
27962 return SCHED_FUSION_NONE;
27964 src = SET_SRC (x);
27965 dest = SET_DEST (x);
27967 machine_mode dest_mode = GET_MODE (dest);
27969 if (!aarch64_mode_valid_for_sched_fusion_p (dest_mode))
27970 return SCHED_FUSION_NONE;
27972 if (GET_CODE (src) == SIGN_EXTEND)
27974 fusion = SCHED_FUSION_LD_SIGN_EXTEND;
27975 src = XEXP (src, 0);
27976 if (!MEM_P (src) || GET_MODE (src) != SImode)
27977 return SCHED_FUSION_NONE;
27979 else if (GET_CODE (src) == ZERO_EXTEND)
27981 fusion = SCHED_FUSION_LD_ZERO_EXTEND;
27982 src = XEXP (src, 0);
27983 if (!MEM_P (src) || GET_MODE (src) != SImode)
27984 return SCHED_FUSION_NONE;
27987 if (MEM_P (src) && REG_P (dest))
27988 extract_base_offset_in_addr (src, base, offset);
27989 else if (MEM_P (dest) && (REG_P (src) || src == const0_rtx))
27991 fusion = SCHED_FUSION_ST;
27992 extract_base_offset_in_addr (dest, base, offset);
27994 else
27995 return SCHED_FUSION_NONE;
27997 if (*base == NULL_RTX || *offset == NULL_RTX)
27998 fusion = SCHED_FUSION_NONE;
28000 return fusion;
28003 /* Implement the TARGET_SCHED_FUSION_PRIORITY hook.
28005    Currently we only support fusing ldr and str instructions, so FUSION_PRI
28006    and PRI are only calculated for these instructions.  For other instructions,
28007    FUSION_PRI and PRI are simply set to MAX_PRI - 1.  In the future, other
28008    types of instruction fusion can be added by returning different priorities.
28010 It's important that irrelevant instructions get the largest FUSION_PRI. */
28012 static void
28013 aarch64_sched_fusion_priority (rtx_insn *insn, int max_pri,
28014 int *fusion_pri, int *pri)
28016 int tmp, off_val;
28017 rtx base, offset;
28018 enum sched_fusion_type fusion;
28020 gcc_assert (INSN_P (insn));
28022 tmp = max_pri - 1;
28023 fusion = fusion_load_store (insn, &base, &offset);
28024 if (fusion == SCHED_FUSION_NONE)
28026 *pri = tmp;
28027 *fusion_pri = tmp;
28028 return;
28031 /* Set FUSION_PRI according to fusion type and base register. */
28032 *fusion_pri = tmp - fusion * FIRST_PSEUDO_REGISTER - REGNO (base);
28034 /* Calculate PRI. */
28035 tmp /= 2;
28037 /* INSN with smaller offset goes first. */
28038 off_val = (int)(INTVAL (offset));
28039 if (off_val >= 0)
28040 tmp -= (off_val & 0xfffff);
28041 else
28042 tmp += ((- off_val) & 0xfffff);
28044 *pri = tmp;
28045 return;
28048 /* Implement the TARGET_SCHED_ADJUST_PRIORITY hook.
28049 Adjust priority of sha1h instructions so they are scheduled before
28050 other SHA1 instructions. */
28052 static int
28053 aarch64_sched_adjust_priority (rtx_insn *insn, int priority)
28055 rtx x = PATTERN (insn);
28057 if (GET_CODE (x) == SET)
28059 x = SET_SRC (x);
28061 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_SHA1H)
28062 return priority + 10;
28065 return priority;
28068 /* If REVERSED is null, return true if memory reference *MEM2 comes
28069 immediately after memory reference *MEM1. Do not change the references
28070 in this case.
28072 Otherwise, check if *MEM1 and *MEM2 are consecutive memory references and,
28073 if they are, try to make them use constant offsets from the same base
28074 register. Return true on success. When returning true, set *REVERSED
28075 to true if *MEM1 comes after *MEM2, false if *MEM1 comes before *MEM2. */
28076 static bool
28077 aarch64_check_consecutive_mems (rtx *mem1, rtx *mem2, bool *reversed)
28079 if (reversed)
28080 *reversed = false;
28082 if (GET_RTX_CLASS (GET_CODE (XEXP (*mem1, 0))) == RTX_AUTOINC
28083 || GET_RTX_CLASS (GET_CODE (XEXP (*mem2, 0))) == RTX_AUTOINC)
28084 return false;
28086 if (!MEM_SIZE_KNOWN_P (*mem1) || !MEM_SIZE_KNOWN_P (*mem2))
28087 return false;
28089 auto size1 = MEM_SIZE (*mem1);
28090 auto size2 = MEM_SIZE (*mem2);
28092 rtx base1, base2, offset1, offset2;
28093 extract_base_offset_in_addr (*mem1, &base1, &offset1);
28094 extract_base_offset_in_addr (*mem2, &base2, &offset2);
28096 /* Make sure at least one memory is in base+offset form. */
28097 if (!(base1 && offset1) && !(base2 && offset2))
28098 return false;
28100 /* If both mems already use the same base register, just check the
28101 offsets. */
28102 if (base1 && base2 && rtx_equal_p (base1, base2))
28104 if (!offset1 || !offset2)
28105 return false;
28107 if (known_eq (UINTVAL (offset1) + size1, UINTVAL (offset2)))
28108 return true;
28110 if (known_eq (UINTVAL (offset2) + size2, UINTVAL (offset1)) && reversed)
28112 *reversed = true;
28113 return true;
28116 return false;
28119 /* Otherwise, check whether the MEM_EXPRs and MEM_OFFSETs together
28120 guarantee that the values are consecutive. */
28121 if (MEM_EXPR (*mem1)
28122 && MEM_EXPR (*mem2)
28123 && MEM_OFFSET_KNOWN_P (*mem1)
28124 && MEM_OFFSET_KNOWN_P (*mem2))
28126 poly_int64 expr_offset1;
28127 poly_int64 expr_offset2;
28128 tree expr_base1 = get_addr_base_and_unit_offset (MEM_EXPR (*mem1),
28129 &expr_offset1);
28130 tree expr_base2 = get_addr_base_and_unit_offset (MEM_EXPR (*mem2),
28131 &expr_offset2);
28132 if (!expr_base1
28133 || !expr_base2
28134 || !DECL_P (expr_base1)
28135 || !operand_equal_p (expr_base1, expr_base2, OEP_ADDRESS_OF))
28136 return false;
28138 expr_offset1 += MEM_OFFSET (*mem1);
28139 expr_offset2 += MEM_OFFSET (*mem2);
28141 if (known_eq (expr_offset1 + size1, expr_offset2))
28143 else if (known_eq (expr_offset2 + size2, expr_offset1) && reversed)
28144 *reversed = true;
28145 else
28146 return false;
28148 if (reversed)
28150 if (base2)
28152 rtx addr1 = plus_constant (Pmode, XEXP (*mem2, 0),
28153 expr_offset1 - expr_offset2);
28154 *mem1 = replace_equiv_address_nv (*mem1, addr1);
28156 else
28158 rtx addr2 = plus_constant (Pmode, XEXP (*mem1, 0),
28159 expr_offset2 - expr_offset1);
28160 *mem2 = replace_equiv_address_nv (*mem2, addr2);
28163 return true;
28166 return false;
28169 /* Test if MODE is suitable for a single transfer register in an ldp or stp
28170 instruction. */
28172 bool
28173 aarch64_ldpstp_operand_mode_p (machine_mode mode)
28175 if (!targetm.hard_regno_mode_ok (V0_REGNUM, mode)
28176 || hard_regno_nregs (V0_REGNUM, mode) > 1)
28177 return false;
28179 const auto size = GET_MODE_SIZE (mode);
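  /* LDP/STP can transfer 4, 8 or 16 bytes per register (the W/S, X/D and Q
     register forms).  */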
28180 return known_eq (size, 4) || known_eq (size, 8) || known_eq (size, 16);
28183 /* Return true if MEM1 and MEM2 can be combined into a single access
28184 of mode MODE, with the combined access having the same address as MEM1. */
28186 bool
28187 aarch64_mergeable_load_pair_p (machine_mode mode, rtx mem1, rtx mem2)
28189 if (STRICT_ALIGNMENT && MEM_ALIGN (mem1) < GET_MODE_ALIGNMENT (mode))
28190 return false;
28191 return aarch64_check_consecutive_mems (&mem1, &mem2, nullptr);
28194 /* Return true if MEM agrees with the ldp-stp policy model.
28195 Otherwise, false. */
28197 bool
28198 aarch64_mem_ok_with_ldpstp_policy_model (rtx mem, bool load, machine_mode mode)
28200 auto policy = (load
28201 ? aarch64_tune_params.ldp_policy_model
28202 : aarch64_tune_params.stp_policy_model);
28204 /* If we have AARCH64_LDP_STP_POLICY_NEVER, reject the load pair. */
28205 if (policy == AARCH64_LDP_STP_POLICY_NEVER)
28206 return false;
28208 /* If we have AARCH64_LDP_STP_POLICY_ALIGNED,
28209 do not emit the load pair unless the alignment is checked to be
28210 at least double the alignment of the type. */
28211 if (policy == AARCH64_LDP_STP_POLICY_ALIGNED
28212 && !optimize_function_for_size_p (cfun)
28213 && MEM_ALIGN (mem) < 2 * GET_MODE_ALIGNMENT (mode))
28214 return false;
28216 return true;
28219 /* Given OPERANDS of consecutive load/store, check if we can merge
28220 them into ldp/stp. LOAD is true if they are load instructions. */
28222 bool
28223 aarch64_operands_ok_for_ldpstp (rtx *operands, bool load)
28225 enum reg_class rclass_1, rclass_2;
28226 rtx mem_1, mem_2, reg_1, reg_2;
28228 if (load)
28230 mem_1 = operands[1];
28231 mem_2 = operands[3];
28232 reg_1 = operands[0];
28233 reg_2 = operands[2];
28234 gcc_assert (REG_P (reg_1) && REG_P (reg_2));
28235 if (REGNO (reg_1) == REGNO (reg_2))
28236 return false;
28237 if (reg_overlap_mentioned_p (reg_1, mem_2))
28238 return false;
28240 else
28242 mem_1 = operands[0];
28243 mem_2 = operands[2];
28244 reg_1 = operands[1];
28245 reg_2 = operands[3];
28248 /* The mems cannot be volatile. */
28249 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2))
28250 return false;
28252 /* Check if the addresses are in the form of [base+offset]. */
28253 bool reversed = false;
28254 if (!aarch64_check_consecutive_mems (&mem_1, &mem_2, &reversed))
28255 return false;
28257 /* The operands must be of the same size. */
28258 gcc_assert (known_eq (GET_MODE_SIZE (GET_MODE (mem_1)),
28259 GET_MODE_SIZE (GET_MODE (mem_2))));
28261 /* The lower memory access must be a mem-pair operand. */
28262 rtx lower_mem = reversed ? mem_2 : mem_1;
28263 machine_mode lower_mem_mode = GET_MODE (lower_mem);
28264 if (!aarch64_mem_pair_operand (lower_mem, lower_mem_mode))
28265 return false;
28267 /* Check if lower_mem is ok with the ldp-stp policy model. */
28268 if (!aarch64_mem_ok_with_ldpstp_policy_model (lower_mem, load,
28269 lower_mem_mode))
28270 return false;
28272 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
28273 rclass_1 = FP_REGS;
28274 else
28275 rclass_1 = GENERAL_REGS;
28277 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
28278 rclass_2 = FP_REGS;
28279 else
28280 rclass_2 = GENERAL_REGS;
28282   /* Check if the registers are of the same class.  */
28283 if (rclass_1 != rclass_2)
28284 return false;
28286 return true;
28289 /* Given OPERANDS of consecutive load/store that can be merged,
28290 swap them if they are not in ascending order. */
28291 void
28292 aarch64_swap_ldrstr_operands (rtx* operands, bool load)
28294 int mem_op = load ? 1 : 0;
28295 bool reversed = false;
28296 if (!aarch64_check_consecutive_mems (operands + mem_op,
28297 operands + mem_op + 2, &reversed))
28298 gcc_unreachable ();
28300 if (reversed)
28302 /* Irrespective of whether this is a load or a store,
28303 we do the same swap. */
28304 std::swap (operands[0], operands[2]);
28305 std::swap (operands[1], operands[3]);
28309 /* Helper function used for generation of load/store pair instructions, called
28310 from peepholes in aarch64-ldpstp.md. OPERANDS is an array of
28311 operands as matched by the peepholes in that file. LOAD_P is true if we're
28312 generating a load pair, otherwise we're generating a store pair. CODE is
28313 either {ZERO,SIGN}_EXTEND for extending loads or UNKNOWN if we're generating a
28314 standard load/store pair. */
28316 void
28317 aarch64_finish_ldpstp_peephole (rtx *operands, bool load_p, enum rtx_code code)
28319 aarch64_swap_ldrstr_operands (operands, load_p);
28321 if (load_p)
28322 emit_insn (aarch64_gen_load_pair (operands[0], operands[2],
28323 operands[1], code));
28324 else
28326 gcc_assert (code == UNKNOWN);
28327 emit_insn (aarch64_gen_store_pair (operands[0], operands[1],
28328 operands[3]));
28332 /* Taking X and Y to be HOST_WIDE_INT pointers, return the result of a
28333 comparison between the two. */
28335 aarch64_host_wide_int_compare (const void *x, const void *y)
28337 return wi::cmps (* ((const HOST_WIDE_INT *) x),
28338 * ((const HOST_WIDE_INT *) y));
28341 /* Taking X and Y to be pairs of RTX, one pointing to a MEM rtx and the
28342 other pointing to a REG rtx containing an offset, compare the offsets
28343 of the two pairs.
28345 Return:
28347 1 iff offset (X) > offset (Y)
28348 0 iff offset (X) == offset (Y)
28349 -1 iff offset (X) < offset (Y) */
28351 aarch64_ldrstr_offset_compare (const void *x, const void *y)
28353 const rtx * operands_1 = (const rtx *) x;
28354 const rtx * operands_2 = (const rtx *) y;
28355 rtx mem_1, mem_2, base, offset_1, offset_2;
28357 if (MEM_P (operands_1[0]))
28358 mem_1 = operands_1[0];
28359 else
28360 mem_1 = operands_1[1];
28362 if (MEM_P (operands_2[0]))
28363 mem_2 = operands_2[0];
28364 else
28365 mem_2 = operands_2[1];
28367 /* Extract the offsets. */
28368 extract_base_offset_in_addr (mem_1, &base, &offset_1);
28369 extract_base_offset_in_addr (mem_2, &base, &offset_2);
28371 gcc_assert (offset_1 != NULL_RTX && offset_2 != NULL_RTX);
28373 return wi::cmps (INTVAL (offset_1), INTVAL (offset_2));
28376 /* Given OPERANDS of consecutive load/store, check if we can merge
28377 them into ldp/stp by adjusting the offset. LOAD is true if they
28378 are load instructions. MODE is the mode of memory operands.
28380 Given below consecutive stores:
28382 str w1, [xb, 0x100]
28383 str w1, [xb, 0x104]
28384 str w1, [xb, 0x108]
28385 str w1, [xb, 0x10c]
28387 Though the offsets are out of the range supported by stp, we can
28388 still pair them after adjusting the offset, like:
28390 add scratch, xb, 0x100
28391 stp w1, w1, [scratch]
28392 stp w1, w1, [scratch, 0x8]
28394 The peephole patterns detecting this opportunity should guarantee
28395    the scratch register is available.  */
28397 bool
28398 aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load,
28399 machine_mode mode)
28401 const int num_insns = 4;
28402 enum reg_class rclass;
28403 HOST_WIDE_INT offvals[num_insns], msize;
28404 rtx mem[num_insns], reg[num_insns], base[num_insns], offset[num_insns];
28406 if (load)
28408 for (int i = 0; i < num_insns; i++)
28410 reg[i] = operands[2 * i];
28411 mem[i] = operands[2 * i + 1];
28413 gcc_assert (REG_P (reg[i]));
28416 /* Do not attempt to merge the loads if the loads clobber each other. */
28417 for (int i = 0; i < 8; i += 2)
28418 for (int j = i + 2; j < 8; j += 2)
28419 if (reg_overlap_mentioned_p (operands[i], operands[j]))
28420 return false;
28422 else
28423 for (int i = 0; i < num_insns; i++)
28425 mem[i] = operands[2 * i];
28426 reg[i] = operands[2 * i + 1];
28429 /* Skip if memory operand is by itself valid for ldp/stp. */
28430 if (!MEM_P (mem[0]) || aarch64_mem_pair_operand (mem[0], mode))
28431 return false;
28433 for (int i = 0; i < num_insns; i++)
28435 /* The mems cannot be volatile. */
28436 if (MEM_VOLATILE_P (mem[i]))
28437 return false;
28439 /* Check if the addresses are in the form of [base+offset]. */
28440 extract_base_offset_in_addr (mem[i], base + i, offset + i);
28441 if (base[i] == NULL_RTX || offset[i] == NULL_RTX)
28442 return false;
28445   /* Check if the registers are of the same class.  */
28446 rclass = REG_P (reg[0]) && FP_REGNUM_P (REGNO (reg[0]))
28447 ? FP_REGS : GENERAL_REGS;
28449 for (int i = 1; i < num_insns; i++)
28450 if (REG_P (reg[i]) && FP_REGNUM_P (REGNO (reg[i])))
28452 if (rclass != FP_REGS)
28453 return false;
28455 else
28457 if (rclass != GENERAL_REGS)
28458 return false;
28461 /* Only the last register in the order in which they occur
28462 may be clobbered by the load. */
28463 if (rclass == GENERAL_REGS && load)
28464 for (int i = 0; i < num_insns - 1; i++)
28465 if (reg_mentioned_p (reg[i], mem[i]))
28466 return false;
28468   /* Check if the bases are the same.  */
28469 for (int i = 0; i < num_insns - 1; i++)
28470 if (!rtx_equal_p (base[i], base[i + 1]))
28471 return false;
28473 for (int i = 0; i < num_insns; i++)
28474 offvals[i] = INTVAL (offset[i]);
28476 msize = GET_MODE_SIZE (mode).to_constant ();
28478 /* Check if the offsets can be put in the right order to do a ldp/stp. */
28479 qsort (offvals, num_insns, sizeof (HOST_WIDE_INT),
28480 aarch64_host_wide_int_compare);
28482 if (!(offvals[1] == offvals[0] + msize
28483 && offvals[3] == offvals[2] + msize))
28484 return false;
28486 /* Check that offsets are within range of each other. The ldp/stp
28487 instructions have 7 bit immediate offsets, so use 0x80. */
28488 if (offvals[2] - offvals[0] >= msize * 0x80)
28489 return false;
28491 /* The offsets must be aligned with respect to each other. */
28492 if (offvals[0] % msize != offvals[2] % msize)
28493 return false;
28495 /* Check if mem[0] is ok with the ldp-stp policy model. */
28496 if (!aarch64_mem_ok_with_ldpstp_policy_model (mem[0], load, mode))
28497 return false;
28499 return true;
28502 /* Given OPERANDS of consecutive load/store, this function pairs them
28503 into LDP/STP after adjusting the offset. It depends on the fact
28504 that the operands can be sorted so the offsets are correct for STP.
28505    MODE is the mode of the memory operands.  CODE is the rtl operator
28506    which should be applied to all memory operands; it is SIGN_EXTEND,
28507    ZERO_EXTEND or UNKNOWN.  */
28509 bool
28510 aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
28511 machine_mode mode, RTX_CODE code)
28513 rtx base, offset_1, offset_2;
28514 rtx mem_1, mem_2;
28515 rtx temp_operands[8];
28516 HOST_WIDE_INT off_val_1, off_val_2, base_off, new_off_1, new_off_2,
28517 stp_off_upper_limit, stp_off_lower_limit, msize;
28519 /* We make changes on a copy as we may still bail out. */
28520 for (int i = 0; i < 8; i ++)
28521 temp_operands[i] = operands[i];
28523 /* Sort the operands. Note for cases as below:
28524 [base + 0x310] = A
28525 [base + 0x320] = B
28526 [base + 0x330] = C
28527 [base + 0x320] = D
28528      We need stable sorting, otherwise wrong data may be stored to offset 0x320.
28529      Also note the dead store in the above case should be optimized away, but
28530      no guarantees here.  */
28531   gcc_stablesort (temp_operands, 4, 2 * sizeof (rtx *),
28532 aarch64_ldrstr_offset_compare);
28534 /* Copy the memory operands so that if we have to bail for some
28535 reason the original addresses are unchanged. */
28536 if (load)
28538 mem_1 = copy_rtx (temp_operands[1]);
28539 mem_2 = copy_rtx (temp_operands[5]);
28541 else
28543 mem_1 = copy_rtx (temp_operands[0]);
28544 mem_2 = copy_rtx (temp_operands[4]);
28545 gcc_assert (code == UNKNOWN);
28548 extract_base_offset_in_addr (mem_1, &base, &offset_1);
28549 extract_base_offset_in_addr (mem_2, &base, &offset_2);
28550 gcc_assert (base != NULL_RTX && offset_1 != NULL_RTX
28551 && offset_2 != NULL_RTX);
28553 /* Adjust offset so it can fit in LDP/STP instruction. */
28554 msize = GET_MODE_SIZE (mode).to_constant ();
28555 stp_off_upper_limit = msize * (0x40 - 1);
28556 stp_off_lower_limit = - msize * 0x40;
28558 off_val_1 = INTVAL (offset_1);
28559 off_val_2 = INTVAL (offset_2);
28561 /* The base offset is optimally halfway between the two STP/LDP offsets. */
28562 if (msize <= 4)
28563 base_off = (off_val_1 + off_val_2) / 2;
28564 else
28565 /* However, due to issues with negative LDP/STP offset generation for
28566 larger modes (DF, DD, DI and vector modes), we must not use negative
28567 addresses smaller than what 9 signed unadjusted bits can store. This
28568 provides the most range in this case. */
28569 base_off = off_val_1;
28571 /* Adjust the base so that it is aligned with the addresses but still
28572 optimal. */
28573 if (base_off % msize != off_val_1 % msize)
28574 /* Fix the offset, bearing in mind we want to make it bigger not
28575 smaller. */
28576 base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
28577 else if (msize <= 4)
28578 /* The negative range of LDP/STP is one larger than the positive range. */
28579 base_off += msize;
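/* Illustrative example (made-up offsets) for SImode, msize == 4, with
   off_val_1 == 0x100 and off_val_2 == 0x108: the midpoint gives
   base_off == 0x104, which is already aligned with off_val_1, so the
   msize <= 4 path above biases it up to 0x108 to exploit the extra
   negative range. The resulting LDP/STP offsets are then -8 and 0,
   comfortably within [-0x100, 0xfc]. */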
28581 /* Check if base offset is too big or too small. We can attempt to resolve
28582 this issue by setting it to the maximum value and seeing if the offsets
28583 still fit. */
28584 if (base_off >= 0x1000)
28586 base_off = 0x1000 - 1;
28587 /* We must still make sure that the base offset is aligned with respect
28588 to the address. But it may not be made any bigger. */
28589 base_off -= (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
28592 /* Likewise for the case where the base is too small. */
28593 if (base_off <= -0x1000)
28595 base_off = -0x1000 + 1;
28596 base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
28599 /* Offset of the first STP/LDP. */
28600 new_off_1 = off_val_1 - base_off;
28602 /* Offset of the second STP/LDP. */
28603 new_off_2 = off_val_2 - base_off;
28605 /* The offsets must be within the range of the LDP/STP instructions. */
28606 if (new_off_1 > stp_off_upper_limit || new_off_1 < stp_off_lower_limit
28607 || new_off_2 > stp_off_upper_limit || new_off_2 < stp_off_lower_limit)
28608 return false;
28610 replace_equiv_address_nv (mem_1, plus_constant (Pmode, operands[8],
28611 new_off_1), true);
28612 replace_equiv_address_nv (mem_2, plus_constant (Pmode, operands[8],
28613 new_off_2), true);
28615 if (!aarch64_mem_pair_operand (mem_1, mode)
28616 || !aarch64_mem_pair_operand (mem_2, mode))
28617 return false;
28619 if (load)
28621 operands[0] = temp_operands[0];
28622 operands[1] = mem_1;
28623 operands[2] = temp_operands[2];
28624 operands[4] = temp_operands[4];
28625 operands[5] = mem_2;
28626 operands[6] = temp_operands[6];
28628 else
28630 operands[0] = mem_1;
28631 operands[1] = temp_operands[1];
28632 operands[3] = temp_operands[3];
28633 operands[4] = mem_2;
28634 operands[5] = temp_operands[5];
28635 operands[7] = temp_operands[7];
28638 /* Emit adjusting instruction. */
28639 emit_insn (gen_rtx_SET (operands[8], plus_constant (DImode, base, base_off)));
28640 /* Emit ldp/stp instructions. */
28641 if (load)
28643 emit_insn (aarch64_gen_load_pair (operands[0], operands[2],
28644 operands[1], code));
28645 emit_insn (aarch64_gen_load_pair (operands[4], operands[6],
28646 operands[5], code));
28648 else
28650 emit_insn (aarch64_gen_store_pair (operands[0], operands[1],
28651 operands[3]));
28652 emit_insn (aarch64_gen_store_pair (operands[4], operands[5],
28653 operands[7]));
28655 return true;
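/* Continuing the illustrative SImode example above, the overall effect is
   roughly (register numbers are arbitrary):

     ldr w1, [x0, 0x100]          add x9, x0, 0x108
     ldr w2, [x0, 0x104]   ==>    ldp w1, w2, [x9, -8]
     ldr w3, [x0, 0x108]          ldp w3, w4, [x9]
     ldr w4, [x0, 0x10c]

   where x9 stands for the scratch register passed in operands[8]. */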
28658 /* Implement TARGET_VECTORIZE_CONDITIONAL_OPERATION_IS_EXPENSIVE. Assume that
28659 predicated operations when available are beneficial. */
28661 static bool
28662 aarch64_conditional_operation_is_expensive (unsigned)
28664 return false;
28667 /* Implement TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE. Assume for now that
28668 it isn't worth branching around empty masked ops (including masked
28669 stores). */
28671 static bool
28672 aarch64_empty_mask_is_expensive (unsigned)
28674 return false;
28677 /* Return 1 if pseudo register should be created and used to hold
28678 GOT address for PIC code. */
28680 bool
28681 aarch64_use_pseudo_pic_reg (void)
28683 return aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC;
28686 /* Implement TARGET_UNSPEC_MAY_TRAP_P. */
28688 static int
28689 aarch64_unspec_may_trap_p (const_rtx x, unsigned flags)
28691 switch (XINT (x, 1))
28693 case UNSPEC_GOTSMALLPIC:
28694 case UNSPEC_GOTSMALLPIC28K:
28695 case UNSPEC_GOTTINYPIC:
28696 return 0;
28697 default:
28698 break;
28701 return default_unspec_may_trap_p (x, flags);
28705 /* If X is a positive CONST_DOUBLE with a value that is a power of 2
28706 return the log2 of that value. Otherwise return -1. */
28708 int
28709 aarch64_fpconst_pow_of_2 (rtx x)
28711 const REAL_VALUE_TYPE *r;
28713 if (!CONST_DOUBLE_P (x))
28714 return -1;
28716 r = CONST_DOUBLE_REAL_VALUE (x);
28718 if (REAL_VALUE_NEGATIVE (*r)
28719 || REAL_VALUE_ISNAN (*r)
28720 || REAL_VALUE_ISINF (*r)
28721 || !real_isinteger (r, DFmode))
28722 return -1;
28724 return exact_log2 (real_to_integer (r));
28727 /* If X is a positive CONST_DOUBLE with a value that is the reciprocal of a
28728 power of 2 (i.e. 1/2^n), return the number of float bits, e.g. for x == 1/2^n
28729 return n. Otherwise return -1. */
28731 int
28732 aarch64_fpconst_pow2_recip (rtx x)
28734 REAL_VALUE_TYPE r0;
28736 if (!CONST_DOUBLE_P (x))
28737 return -1;
28739 r0 = *CONST_DOUBLE_REAL_VALUE (x);
28740 if (exact_real_inverse (DFmode, &r0)
28741 && !REAL_VALUE_NEGATIVE (r0))
28743 int ret = exact_log2 (real_to_integer (&r0));
28744 if (ret >= 1 && ret <= 32)
28745 return ret;
28747 return -1;
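/* Some illustrative values for the two helpers above: 8.0 yields 3 and 1.0
   yields 0 from aarch64_fpconst_pow_of_2, while 6.0, 0.5 and -4.0 all yield
   -1 (not an integral power of 2, not an integer, negative). For
   aarch64_fpconst_pow2_recip, 0.25 yields 2 and 0.125 yields 3, whereas 1.0
   yields -1 (the result must be in [1, 32]) and 0.1 yields -1 because it has
   no exact reciprocal. */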
28750 /* If X is a vector of equal CONST_DOUBLE values and that value is
28751 Y, return the aarch64_fpconst_pow_of_2 of Y. Otherwise return -1. */
28753 int
28754 aarch64_vec_fpconst_pow_of_2 (rtx x)
28756 int nelts;
28757 if (!CONST_VECTOR_P (x)
28758 || !CONST_VECTOR_NUNITS (x).is_constant (&nelts))
28759 return -1;
28761 if (GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_FLOAT)
28762 return -1;
28764 int firstval = aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, 0));
28765 if (firstval <= 0)
28766 return -1;
28768 for (int i = 1; i < nelts; i++)
28769 if (aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, i)) != firstval)
28770 return -1;
28772 return firstval;
28775 /* Implement TARGET_PROMOTED_TYPE to promote 16-bit floating point types
28776 to float.
28778 __fp16 always promotes through this hook.
28779 _Float16 may promote if TARGET_FLT_EVAL_METHOD is 16, but we do that
28780 through the generic excess precision logic rather than here. */
28782 static tree
28783 aarch64_promoted_type (const_tree t)
28785 if (SCALAR_FLOAT_TYPE_P (t)
28786 && TYPE_MAIN_VARIANT (t) == aarch64_fp16_type_node)
28787 return float_type_node;
28789 return NULL_TREE;
28792 /* Implement the TARGET_OPTAB_SUPPORTED_P hook. */
28794 static bool
28795 aarch64_optab_supported_p (int op, machine_mode mode1, machine_mode,
28796 optimization_type opt_type)
28798 switch (op)
28800 case rsqrt_optab:
28801 return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p (mode1);
28803 default:
28804 return true;
28808 /* Implement the TARGET_DWARF_POLY_INDETERMINATE_VALUE hook. */
28810 static unsigned int
28811 aarch64_dwarf_poly_indeterminate_value (unsigned int i, unsigned int *factor,
28812 int *offset)
28814 /* Polynomial invariant 1 == (VG / 2) - 1. */
28815 gcc_assert (i == 1);
28816 *factor = 2;
28817 *offset = 1;
28818 return AARCH64_DWARF_VG;
28821 /* Implement TARGET_LIBGCC_FLOATING_POINT_MODE_SUPPORTED_P - return TRUE
28822 if MODE is [BH]Fmode, and punt to the generic implementation otherwise. */
28824 static bool
28825 aarch64_libgcc_floating_mode_supported_p (scalar_float_mode mode)
28827 return ((mode == HFmode || mode == BFmode)
28828 ? true
28829 : default_libgcc_floating_mode_supported_p (mode));
28832 /* Implement TARGET_SCALAR_MODE_SUPPORTED_P - return TRUE
28833 if MODE is [BH]Fmode, and punt to the generic implementation otherwise. */
28835 static bool
28836 aarch64_scalar_mode_supported_p (scalar_mode mode)
28838 if (DECIMAL_FLOAT_MODE_P (mode))
28839 return default_decimal_float_supported_p ();
28841 return ((mode == HFmode || mode == BFmode)
28842 ? true
28843 : default_scalar_mode_supported_p (mode));
28846 /* Set the value of FLT_EVAL_METHOD.
28847 ISO/IEC TS 18661-3 defines two values that we'd like to make use of:
28849 0: evaluate all operations and constants, whose semantic type has at
28850 most the range and precision of type float, to the range and
28851 precision of float; evaluate all other operations and constants to
28852 the range and precision of the semantic type;
28854 N, where _FloatN is a supported interchange floating type:
28855 evaluate all operations and constants, whose semantic type has at
28856 most the range and precision of _FloatN type, to the range and
28857 precision of the _FloatN type; evaluate all other operations and
28858 constants to the range and precision of the semantic type;
28860 If we have the ARMv8.2-A extensions then we support _Float16 in native
28861 precision, so we should set this to 16. Otherwise, we support the type,
28862 but want to evaluate expressions in float precision, so set this to
28863 0. */
28865 static enum flt_eval_method
28866 aarch64_excess_precision (enum excess_precision_type type)
28868 switch (type)
28870 case EXCESS_PRECISION_TYPE_FAST:
28871 case EXCESS_PRECISION_TYPE_STANDARD:
28872 /* We can calculate either in 16-bit range and precision or
28873 32-bit range and precision. Make that decision based on whether
28874 we have native support for the ARMv8.2-A 16-bit floating-point
28875 instructions or not. */
28876 return (TARGET_FP_F16INST
28877 ? FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16
28878 : FLT_EVAL_METHOD_PROMOTE_TO_FLOAT);
28879 case EXCESS_PRECISION_TYPE_IMPLICIT:
28880 case EXCESS_PRECISION_TYPE_FLOAT16:
28881 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16;
28882 default:
28883 gcc_unreachable ();
28885 return FLT_EVAL_METHOD_UNPREDICTABLE;
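/* For example, from the user's point of view, given

     _Float16 a, b, c;
     ...
     c = a * b + c;

   a target with the ARMv8.2-A FP16 instructions evaluates the product and
   sum directly in _Float16 (FLT_EVAL_METHOD == 16), whereas without them the
   operands are promoted to float, the arithmetic is done in single precision
   and the result is converted back (FLT_EVAL_METHOD == 0). */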
28888 /* Implement TARGET_C_BITINT_TYPE_INFO.
28889 Return true if _BitInt(N) is supported and fill its details into *INFO. */
28890 bool
28891 aarch64_bitint_type_info (int n, struct bitint_info *info)
28893 if (TARGET_BIG_END)
28894 return false;
28896 if (n <= 8)
28897 info->limb_mode = QImode;
28898 else if (n <= 16)
28899 info->limb_mode = HImode;
28900 else if (n <= 32)
28901 info->limb_mode = SImode;
28902 else if (n <= 64)
28903 info->limb_mode = DImode;
28904 else if (n <= 128)
28905 info->limb_mode = TImode;
28906 else
28907 /* The AAPCS for AArch64 defines _BitInt(N > 128) as an array with
28908 type {signed,unsigned} __int128[M] where M*128 >= N. However, to be
28909 able to use libgcc's implementation to support large _BitInt's we need
28910 to use a LIMB_MODE that is no larger than 'long long'. This is why we
28911 use DImode for our internal LIMB_MODE and we define the ABI_LIMB_MODE to
28912 be TImode to ensure we are ABI compliant. */
28913 info->limb_mode = DImode;
28915 if (n > 128)
28916 info->abi_limb_mode = TImode;
28917 else
28918 info->abi_limb_mode = info->limb_mode;
28919 info->big_endian = TARGET_BIG_END;
28920 info->extended = false;
28921 return true;
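/* For example, _BitInt(200) on little-endian AArch64 gets limb_mode ==
   DImode and abi_limb_mode == TImode: libgcc operates on 64-bit limbs
   internally, while the AAPCS64 layout is that of unsigned __int128[2]
   (the smallest M with M * 128 >= 200 is 2). */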
28924 /* Implement TARGET_C_MODE_FOR_FLOATING_TYPE. Return TFmode for
28925 TI_LONG_DOUBLE_TYPE which is for long double type, go with the default
28926 one for the others. */
28928 static machine_mode
28929 aarch64_c_mode_for_floating_type (enum tree_index ti)
28931 if (ti == TI_LONG_DOUBLE_TYPE)
28932 return TFmode;
28933 return default_mode_for_floating_type (ti);
28936 /* Implement TARGET_SCHED_CAN_SPECULATE_INSN. Return true if INSN can be
28937 scheduled for speculative execution. Reject the long-running division
28938 and square-root instructions. */
28940 static bool
28941 aarch64_sched_can_speculate_insn (rtx_insn *insn)
28943 switch (get_attr_type (insn))
28945 case TYPE_SDIV:
28946 case TYPE_UDIV:
28947 case TYPE_FDIVS:
28948 case TYPE_FDIVD:
28949 case TYPE_FSQRTS:
28950 case TYPE_FSQRTD:
28951 case TYPE_NEON_FP_SQRT_S:
28952 case TYPE_NEON_FP_SQRT_D:
28953 case TYPE_NEON_FP_SQRT_S_Q:
28954 case TYPE_NEON_FP_SQRT_D_Q:
28955 case TYPE_NEON_FP_DIV_S:
28956 case TYPE_NEON_FP_DIV_D:
28957 case TYPE_NEON_FP_DIV_S_Q:
28958 case TYPE_NEON_FP_DIV_D_Q:
28959 return false;
28960 default:
28961 return true;
28965 /* Implement TARGET_COMPUTE_PRESSURE_CLASSES. */
28967 static int
28968 aarch64_compute_pressure_classes (reg_class *classes)
28970 int i = 0;
28971 classes[i++] = GENERAL_REGS;
28972 classes[i++] = FP_REGS;
28973 /* PR_REGS isn't a useful pressure class because many predicate pseudo
28974 registers need to go in PR_LO_REGS at some point during their
28975 lifetime. Splitting it into two halves has the effect of making
28976 all predicates count against PR_LO_REGS, so that we try whenever
28977 possible to restrict the number of live predicates to 8. This
28978 greatly reduces the amount of spilling in certain loops. */
28979 classes[i++] = PR_LO_REGS;
28980 classes[i++] = PR_HI_REGS;
28981 return i;
28984 /* Implement TARGET_CAN_CHANGE_MODE_CLASS. */
28986 static bool
28987 aarch64_can_change_mode_class (machine_mode from,
28988 machine_mode to, reg_class_t)
28990 return aarch64_modes_compatible_p (from, to);
28993 /* Implement TARGET_EARLY_REMAT_MODES. */
28995 static void
28996 aarch64_select_early_remat_modes (sbitmap modes)
28998 /* SVE values are not normally live across a call, so it should be
28999 worth doing early rematerialization even in VL-specific mode. */
29000 for (int i = 0; i < NUM_MACHINE_MODES; ++i)
29001 if (aarch64_sve_mode_p ((machine_mode) i))
29002 bitmap_set_bit (modes, i);
29005 /* Override the default target speculation_safe_value. */
29006 static rtx
29007 aarch64_speculation_safe_value (machine_mode mode,
29008 rtx result, rtx val, rtx failval)
29010 /* Maybe we should warn if falling back to hard barriers. They are
29011 likely to be noticeably more expensive than the alternative below. */
29012 if (!aarch64_track_speculation)
29013 return default_speculation_safe_value (mode, result, val, failval);
29015 if (!REG_P (val))
29016 val = copy_to_mode_reg (mode, val);
29018 if (!aarch64_reg_or_zero (failval, mode))
29019 failval = copy_to_mode_reg (mode, failval);
29021 emit_insn (gen_despeculate_copy (mode, result, val, failval));
29022 return result;
29025 /* Implement TARGET_ESTIMATED_POLY_VALUE.
29026 Look into the tuning structure for an estimate.
29027 KIND specifies the type of requested estimate: min, max or likely.
29028 For cores with a known SVE width all three estimates are the same.
29029 For generic SVE tuning we want to distinguish the maximum estimate from
29030 the minimum and likely ones.
29031 The likely estimate is the same as the minimum in that case to give a
29032 conservative behavior of auto-vectorizing with SVE when it is a win
29033 even for 128-bit SVE.
29034 When SVE width information is available VAL.coeffs[1] is multiplied by
29035 the number of VQ chunks over the initial Advanced SIMD 128 bits. */
29037 static HOST_WIDE_INT
29038 aarch64_estimated_poly_value (poly_int64 val,
29039 poly_value_estimate_kind kind
29040 = POLY_VALUE_LIKELY)
29042 unsigned int width_source = aarch64_tune_params.sve_width;
29044 /* If there is no core-specific information then the minimum and likely
29045 values are based on 128-bit vectors and the maximum is based on
29046 the architectural maximum of 2048 bits. */
29047 if (width_source == SVE_SCALABLE)
29048 switch (kind)
29050 case POLY_VALUE_MIN:
29051 case POLY_VALUE_LIKELY:
29052 return val.coeffs[0];
29053 case POLY_VALUE_MAX:
29054 return val.coeffs[0] + val.coeffs[1] * 15;
29057 /* Allow sve_width to be a bitmask of different VL, treating the lowest
29058 as likely. This could be made more general if future -mtune options
29059 need it to be. */
29060 if (kind == POLY_VALUE_MAX)
29061 width_source = 1 << floor_log2 (width_source);
29062 else
29063 width_source = least_bit_hwi (width_source);
29065 /* If the core provides width information, use that. */
29066 HOST_WIDE_INT over_128 = width_source - 128;
29067 return val.coeffs[0] + val.coeffs[1] * over_128 / 128;
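/* Worked example with illustrative numbers: for val == 16 + 16x (the byte
   size of an SVE vector, where x is the number of 128-bit quadwords beyond
   the first), generic tuning (SVE_SCALABLE) estimates 16 for the minimum and
   likely values and 16 + 16 * 15 == 256 for the maximum, matching the
   architectural limit of 2048 bits. With a core-specific sve_width of 256,
   over_128 is 128 and all three estimates become 16 + 16 * 128 / 128 == 32
   bytes. */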
29071 /* Return true for types that could be supported as SIMD return or
29072 argument types. */
29074 static bool
29075 supported_simd_type (tree t)
29077 if (SCALAR_FLOAT_TYPE_P (t) || INTEGRAL_TYPE_P (t) || POINTER_TYPE_P (t))
29079 HOST_WIDE_INT s = tree_to_shwi (TYPE_SIZE_UNIT (t));
29080 return s == 1 || s == 2 || s == 4 || s == 8;
29082 return false;
29085 /* Determine the lane size for the clone argument/return type. This follows
29086 the LS(P) rule in the VFABIA64. */
29088 static unsigned
29089 lane_size (cgraph_simd_clone_arg_type clone_arg_type, tree type)
29091 gcc_assert (clone_arg_type != SIMD_CLONE_ARG_TYPE_MASK);
29093 /* For non map-to-vector types that are pointers we use the type they
29094 point to. */
29095 if (POINTER_TYPE_P (type))
29096 switch (clone_arg_type)
29098 default:
29099 break;
29100 case SIMD_CLONE_ARG_TYPE_UNIFORM:
29101 case SIMD_CLONE_ARG_TYPE_LINEAR_CONSTANT_STEP:
29102 case SIMD_CLONE_ARG_TYPE_LINEAR_VARIABLE_STEP:
29103 type = TREE_TYPE (type);
29104 break;
29107 /* For types (or, for pointers of non map-to-vector types, the types they
29108 point to) that are integers or floating point, we use their size if it is
29109 1, 2, 4 or 8 bytes. */
29110 if (INTEGRAL_TYPE_P (type)
29111 || SCALAR_FLOAT_TYPE_P (type))
29112 switch (TYPE_PRECISION (type) / BITS_PER_UNIT)
29114 default:
29115 break;
29116 case 1:
29117 case 2:
29118 case 4:
29119 case 8:
29120 return TYPE_PRECISION (type);
29122 /* For any other type we use the size of uintptr_t. For map-to-vector types
29123 that are pointers, using the size of uintptr_t is the same as using the
29124 size of their type, since all pointers are the same size as uintptr_t. */
29125 return POINTER_SIZE;
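/* A few illustrative applications of the rule: a uniform parameter of type
   int16_t * uses the pointee type and gets a lane size of 16 bits; a
   map-to-vector parameter of type double gets 64 bits; a map-to-vector
   parameter of type float * is not dereferenced and so, like a pointer to a
   struct, falls back to POINTER_SIZE (64 bits for LP64). */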
29129 /* Implement TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN. */
29131 static int
29132 aarch64_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node *node,
29133 struct cgraph_simd_clone *clonei,
29134 tree base_type ATTRIBUTE_UNUSED,
29135 int num, bool explicit_p)
29137 tree t, ret_type;
29138 unsigned int nds_elt_bits, wds_elt_bits;
29139 unsigned HOST_WIDE_INT const_simdlen;
29141 if (!TARGET_SIMD)
29142 return 0;
29144 /* For now, SVE simdclones won't produce an illegal simdlen, so only check
29145 constant simdlens here. */
29146 if (maybe_ne (clonei->simdlen, 0U)
29147 && clonei->simdlen.is_constant (&const_simdlen)
29148 && (const_simdlen < 2
29149 || const_simdlen > 1024
29150 || (const_simdlen & (const_simdlen - 1)) != 0))
29152 if (explicit_p)
29153 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
29154 "unsupported simdlen %wd", const_simdlen);
29155 return 0;
29158 ret_type = TREE_TYPE (TREE_TYPE (node->decl));
29159 /* According to AArch64's Vector ABI the type that determines the simdlen is
29160 the narrowest of types, so we ignore base_type for AArch64. */
29161 if (TREE_CODE (ret_type) != VOID_TYPE
29162 && !supported_simd_type (ret_type))
29164 if (!explicit_p)
29166 else if (COMPLEX_FLOAT_TYPE_P (ret_type))
29167 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
29168 "GCC does not currently support return type %qT "
29169 "for simd", ret_type);
29170 else
29171 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
29172 "unsupported return type %qT for simd",
29173 ret_type);
29174 return 0;
29177 auto_vec<std::pair <tree, unsigned int>> vec_elts (clonei->nargs + 1);
29179 /* We are looking for the NDS type here according to the VFABIA64. */
29180 if (TREE_CODE (ret_type) != VOID_TYPE)
29182 nds_elt_bits = lane_size (SIMD_CLONE_ARG_TYPE_VECTOR, ret_type);
29183 wds_elt_bits = nds_elt_bits;
29184 vec_elts.safe_push (std::make_pair (ret_type, nds_elt_bits));
29186 else
29188 nds_elt_bits = POINTER_SIZE;
29189 wds_elt_bits = 0;
29192 int i;
29193 tree type_arg_types = TYPE_ARG_TYPES (TREE_TYPE (node->decl));
29194 bool decl_arg_p = (node->definition || type_arg_types == NULL_TREE);
29195 for (t = (decl_arg_p ? DECL_ARGUMENTS (node->decl) : type_arg_types), i = 0;
29196 t && t != void_list_node; t = TREE_CHAIN (t), i++)
29198 tree type = decl_arg_p ? TREE_TYPE (t) : TREE_VALUE (t);
29199 if (clonei->args[i].arg_type != SIMD_CLONE_ARG_TYPE_UNIFORM
29200 && !supported_simd_type (type))
29202 if (!explicit_p)
29204 else if (COMPLEX_FLOAT_TYPE_P (type))
29205 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
29206 "GCC does not currently support argument type %qT "
29207 "for simd", type);
29208 else
29209 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
29210 "unsupported argument type %qT for simd",
29211 type);
29212 return 0;
29214 unsigned lane_bits = lane_size (clonei->args[i].arg_type, type);
29215 if (clonei->args[i].arg_type == SIMD_CLONE_ARG_TYPE_VECTOR)
29216 vec_elts.safe_push (std::make_pair (type, lane_bits));
29217 if (nds_elt_bits > lane_bits)
29218 nds_elt_bits = lane_bits;
29219 if (wds_elt_bits < lane_bits)
29220 wds_elt_bits = lane_bits;
29223 /* If we could not determine the WDS type from available parameters/return,
29224 then fall back to using uintptr_t. */
29225 if (wds_elt_bits == 0)
29226 wds_elt_bits = POINTER_SIZE;
29228 clonei->mask_mode = VOIDmode;
29229 poly_uint64 simdlen;
29230 typedef struct
29232 poly_uint64 len;
29233 char mangle;
29234 } aarch64_clone_info;
29235 auto_vec<aarch64_clone_info, 3> clones;
29237 /* Keep track of the possible simdlens the clones of this function can have,
29238 and check them later to see if we support them. */
29239 if (known_eq (clonei->simdlen, 0U))
29241 simdlen = exact_div (poly_uint64 (64), nds_elt_bits);
29242 if (maybe_ne (simdlen, 1U))
29243 clones.safe_push ({simdlen, 'n'});
29244 clones.safe_push ({simdlen * 2, 'n'});
29245 /* Only create an SVE simd clone if we aren't dealing with an unprototyped
29246 function.
29247 We have also disabled support for creating SVE simdclones for functions
29248 with function bodies and any simdclones when -msve-vector-bits is used.
29249 TODO: add support for these. */
29250 if (prototype_p (TREE_TYPE (node->decl))
29251 && !node->definition
29252 && !aarch64_sve_vg.is_constant ())
29253 clones.safe_push ({exact_div (BITS_PER_SVE_VECTOR, wds_elt_bits), 's'});
29255 else
29256 clones.safe_push ({clonei->simdlen, 'n'});
29258 clonei->vecsize_int = 0;
29259 clonei->vecsize_float = 0;
29261 /* We currently do not support generating simdclones where vector arguments
29262 do not fit into a single vector register, i.e. vector types that are more
29263 than 128 bits wide. This is because of how we currently represent such
29264 types in ACLE, where we use a struct to allow us to pass them as arguments
29265 and return values.
29266 Hence we have to check whether the simdlens available for this
29267 simdclone would cause a vector type to be larger than 128-bits, and reject
29268 such a clone. */
29269 unsigned j = 0;
29270 while (j < clones.length ())
29272 bool remove_simdlen = false;
29273 for (auto elt : vec_elts)
29274 if (clones[j].mangle == 'n'
29275 && known_gt (clones[j].len * elt.second, 128U))
29277 /* Don't issue a warning for every simdclone when there is no
29278 specific simdlen clause. */
29279 if (explicit_p && maybe_ne (clonei->simdlen, 0U))
29280 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
29281 "GCC does not currently support simdlen %wd for "
29282 "type %qT",
29283 constant_lower_bound (clones[j].len), elt.first);
29284 remove_simdlen = true;
29285 break;
29287 if (remove_simdlen)
29288 clones.ordered_remove (j);
29289 else
29290 j++;
29293 int count = clones.length ();
29294 if (count == 0)
29296 if (explicit_p && known_eq (clonei->simdlen, 0U))
29298 /* Warn the user if we can't generate any simdclone. */
29299 simdlen = exact_div (poly_uint64 (64), nds_elt_bits);
29300 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
29301 "GCC does not currently support a simdclone with simdlens"
29302 " %wd and %wd for these types.",
29303 constant_lower_bound (simdlen),
29304 constant_lower_bound (simdlen*2));
29306 return 0;
29309 gcc_assert (num < count);
29310 clonei->simdlen = clones[num].len;
29311 clonei->vecsize_mangle = clones[num].mangle;
29312 /* SVE simdclones always have a Mask, so set inbranch to 1. */
29313 if (clonei->vecsize_mangle == 's')
29314 clonei->inbranch = 1;
29315 return count;
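/* Rough example of the selection logic: for a prototyped declaration
   double dot (double, double) marked with #pragma omp declare simd and no
   simdlen clause, the NDS and WDS are both 64 bits, so the candidates are an
   Advanced SIMD clone with simdlen 2 (the simdlen-1 candidate is dropped)
   and, when the SVE vector length is not fixed with -msve-vector-bits and
   the clone is for a declaration rather than a definition, a
   vector-length-agnostic SVE clone whose simdlen is VL / 64. All vector
   arguments fit in 128 bits, so both candidates survive the filter above. */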
29318 /* Helper function to adjust an SVE vector type of an SVE simd clone. Returns
29319 an SVE vector type based on the element type of the vector TYPE, with SIMDLEN
29320 number of elements. If IS_MASK, returns an SVE mask type appropriate for use
29321 with the SVE type it would otherwise return. */
29323 static tree
29324 simd_clone_adjust_sve_vector_type (tree type, bool is_mask, poly_uint64 simdlen)
29326 unsigned int num_zr = 0;
29327 unsigned int num_pr = 0;
29328 machine_mode vector_mode;
29329 type = TREE_TYPE (type);
29330 scalar_mode scalar_m = SCALAR_TYPE_MODE (type);
29331 vector_mode = aarch64_sve_data_mode (scalar_m, simdlen).require ();
29332 type = build_vector_type_for_mode (type, vector_mode);
29333 if (is_mask)
29335 type = truth_type_for (type);
29336 num_pr = 1;
29338 else
29339 num_zr = 1;
29341 /* We create new types here with the SVE type attribute instead of using ACLE
29342 types as we need to support unpacked vectors which aren't available as
29343 ACLE SVE types. */
29345 /* ??? This creates anonymous "SVE type" attributes for all types,
29346 even those that correspond to <arm_sve.h> types. This affects type
29347 compatibility in C/C++, but not in gimple. (Gimple type equivalence
29348 is instead decided by TARGET_COMPATIBLE_VECTOR_TYPES_P.)
29350 Thus a C/C++ definition of the implementation function will have a
29351 different function type from the declaration that this code creates.
29352 However, it doesn't seem worth trying to fix that until we have a
29353 way of handling implementations that operate on unpacked types. */
29354 type = build_distinct_type_copy (type);
29355 aarch64_sve::add_sve_type_attribute (type, num_zr, num_pr, NULL, NULL);
29356 return type;
29359 /* Implement TARGET_SIMD_CLONE_ADJUST. */
29360 static void
29361 aarch64_simd_clone_adjust (struct cgraph_node *node)
29363 tree t = TREE_TYPE (node->decl);
29365 if (node->simdclone->vecsize_mangle == 's')
29367 /* This is additive and has no effect if SVE, or a superset thereof, is
29368 already enabled. */
29369 tree target = build_string (strlen ("+sve") + 1, "+sve");
29370 if (!aarch64_option_valid_attribute_p (node->decl, NULL_TREE, target, 0))
29371 gcc_unreachable ();
29372 push_function_decl (node->decl);
29374 else
29376 /* Add aarch64_vector_pcs target attribute to SIMD clones so they
29377 use the correct ABI. */
29378 TYPE_ATTRIBUTES (t) = make_attribute ("aarch64_vector_pcs", "default",
29379 TYPE_ATTRIBUTES (t));
29382 cgraph_simd_clone *sc = node->simdclone;
29384 for (unsigned i = 0; i < sc->nargs; ++i)
29386 bool is_mask = false;
29387 tree type;
29388 switch (sc->args[i].arg_type)
29390 case SIMD_CLONE_ARG_TYPE_MASK:
29391 is_mask = true;
29392 gcc_fallthrough ();
29393 case SIMD_CLONE_ARG_TYPE_VECTOR:
29394 case SIMD_CLONE_ARG_TYPE_LINEAR_VAL_CONSTANT_STEP:
29395 case SIMD_CLONE_ARG_TYPE_LINEAR_VAL_VARIABLE_STEP:
29396 type = sc->args[i].vector_type;
29397 gcc_assert (VECTOR_TYPE_P (type));
29398 if (node->simdclone->vecsize_mangle == 's')
29399 type = simd_clone_adjust_sve_vector_type (type, is_mask,
29400 sc->simdlen);
29401 sc->args[i].vector_type = type;
29402 break;
29403 default:
29404 continue;
29407 if (node->simdclone->vecsize_mangle == 's')
29409 tree ret_type = TREE_TYPE (t);
29410 if (VECTOR_TYPE_P (ret_type))
29411 TREE_TYPE (t)
29412 = simd_clone_adjust_sve_vector_type (ret_type, false,
29413 node->simdclone->simdlen);
29414 pop_function_decl ();
29418 /* Implement TARGET_SIMD_CLONE_USABLE. */
29420 static int
29421 aarch64_simd_clone_usable (struct cgraph_node *node, machine_mode vector_mode)
29423 switch (node->simdclone->vecsize_mangle)
29425 case 'n':
29426 if (!TARGET_SIMD || aarch64_sve_mode_p (vector_mode))
29427 return -1;
29428 return 0;
29429 case 's':
29430 if (!TARGET_SVE
29431 || !aarch64_sve_mode_p (vector_mode))
29432 return -1;
29433 return 0;
29434 default:
29435 gcc_unreachable ();
29439 /* Implement TARGET_COMP_TYPE_ATTRIBUTES */
29441 static int
29442 aarch64_comp_type_attributes (const_tree type1, const_tree type2)
29444 auto check_attr = [&](const char *ns, const char *name) {
29445 tree attr1 = lookup_attribute (ns, name, TYPE_ATTRIBUTES (type1));
29446 tree attr2 = lookup_attribute (ns, name, TYPE_ATTRIBUTES (type2));
29447 if (!attr1 && !attr2)
29448 return true;
29450 return attr1 && attr2 && attribute_value_equal (attr1, attr2);
29453 if (!check_attr ("gnu", "aarch64_vector_pcs"))
29454 return 0;
29455 if (!check_attr ("gnu", "indirect_return"))
29456 return 0;
29457 if (!check_attr ("gnu", "Advanced SIMD type"))
29458 return 0;
29459 if (!check_attr ("gnu", "SVE type"))
29460 return 0;
29461 if (!check_attr ("gnu", "SVE sizeless type"))
29462 return 0;
29463 if (!check_attr ("arm", "streaming"))
29464 return 0;
29465 if (!check_attr ("arm", "streaming_compatible"))
29466 return 0;
29467 if (aarch64_lookup_shared_state_flags (TYPE_ATTRIBUTES (type1), "za")
29468 != aarch64_lookup_shared_state_flags (TYPE_ATTRIBUTES (type2), "za"))
29469 return 0;
29470 if (aarch64_lookup_shared_state_flags (TYPE_ATTRIBUTES (type1), "zt0")
29471 != aarch64_lookup_shared_state_flags (TYPE_ATTRIBUTES (type2), "zt0"))
29472 return 0;
29473 return 1;
29476 /* Implement TARGET_MERGE_DECL_ATTRIBUTES. */
29478 static tree
29479 aarch64_merge_decl_attributes (tree olddecl, tree newdecl)
29481 tree old_attrs = DECL_ATTRIBUTES (olddecl);
29482 tree old_new = lookup_attribute ("arm", "new", old_attrs);
29484 tree new_attrs = DECL_ATTRIBUTES (newdecl);
29485 tree new_new = lookup_attribute ("arm", "new", new_attrs);
29487 if (DECL_INITIAL (olddecl) && new_new)
29489 error ("cannot apply attribute %qs to %q+D after the function"
29490 " has been defined", "new", newdecl);
29491 inform (DECL_SOURCE_LOCATION (olddecl), "%q+D defined here",
29492 newdecl);
29494 else
29496 if (old_new && new_new)
29498 old_attrs = remove_attribute ("arm", "new", old_attrs);
29499 TREE_VALUE (new_new) = chainon (TREE_VALUE (new_new),
29500 TREE_VALUE (old_new));
29502 if (new_new)
29503 aarch64_check_arm_new_against_type (TREE_VALUE (new_new), newdecl);
29506 return merge_attributes (old_attrs, new_attrs);
29509 /* Implement TARGET_GET_MULTILIB_ABI_NAME */
29511 static const char *
29512 aarch64_get_multilib_abi_name (void)
29514 if (TARGET_BIG_END)
29515 return TARGET_ILP32 ? "aarch64_be_ilp32" : "aarch64_be";
29516 return TARGET_ILP32 ? "aarch64_ilp32" : "aarch64";
29519 /* Implement TARGET_STACK_PROTECT_GUARD. In case of a
29520 global variable based guard use the default else
29521 return a null tree. */
29522 static tree
29523 aarch64_stack_protect_guard (void)
29525 if (aarch64_stack_protector_guard == SSP_GLOBAL)
29526 return default_stack_protect_guard ();
29528 return NULL_TREE;
29531 /* Implement TARGET_INVALID_UNARY_OP. */
29533 static const char *
29534 aarch64_invalid_unary_op (int op, const_tree type)
29536 /* Reject all single-operand operations on __mfp8 except for &. */
29537 if (TYPE_MAIN_VARIANT (type) == aarch64_mfp8_type_node && op != ADDR_EXPR)
29538 return N_ ("operation not permitted on type %<mfloat8_t%>");
29540 /* Operation allowed. */
29541 return NULL;
29544 /* Implement TARGET_INVALID_BINARY_OP. */
29546 static const char *
29547 aarch64_invalid_binary_op (int op ATTRIBUTE_UNUSED, const_tree type1,
29548 const_tree type2)
29550 if (VECTOR_TYPE_P (type1)
29551 && VECTOR_TYPE_P (type2)
29552 && !TYPE_INDIVISIBLE_P (type1)
29553 && !TYPE_INDIVISIBLE_P (type2)
29554 && (aarch64_sve::builtin_type_p (type1)
29555 != aarch64_sve::builtin_type_p (type2)))
29556 return N_("cannot combine GNU and SVE vectors in a binary operation");
29558 /* Reject all 2-operand operations on __mfp8. */
29559 if (TYPE_MAIN_VARIANT (type1) == aarch64_mfp8_type_node
29560 || TYPE_MAIN_VARIANT (type2) == aarch64_mfp8_type_node)
29561 return N_ ("operation not permitted on type %<mfloat8_t%>");
29563 /* Operation allowed. */
29564 return NULL;
29567 /* Implement TARGET_MEMTAG_CAN_TAG_ADDRESSES. Here we tell the rest of the
29568 compiler that we automatically ignore the top byte of our pointers, which
29569 allows using -fsanitize=hwaddress. */
29570 bool
29571 aarch64_can_tag_addresses ()
29573 return !TARGET_ILP32;
29576 /* Implement TARGET_ASM_FILE_END for AArch64. This adds the AArch64 GNU NOTE
29577 section at the end if needed. */
29578 #define GNU_PROPERTY_AARCH64_FEATURE_1_AND 0xc0000000
29579 #define GNU_PROPERTY_AARCH64_FEATURE_1_BTI (1U << 0)
29580 #define GNU_PROPERTY_AARCH64_FEATURE_1_PAC (1U << 1)
29581 #define GNU_PROPERTY_AARCH64_FEATURE_1_GCS (1U << 2)
29582 void
29583 aarch64_file_end_indicate_exec_stack ()
29585 file_end_indicate_exec_stack ();
29587 unsigned feature_1_and = 0;
29588 if (aarch_bti_enabled ())
29589 feature_1_and |= GNU_PROPERTY_AARCH64_FEATURE_1_BTI;
29591 if (aarch_ra_sign_scope != AARCH_FUNCTION_NONE)
29592 feature_1_and |= GNU_PROPERTY_AARCH64_FEATURE_1_PAC;
29594 if (aarch64_gcs_enabled ())
29595 feature_1_and |= GNU_PROPERTY_AARCH64_FEATURE_1_GCS;
29597 if (feature_1_and)
29599 /* Generate .note.gnu.property section. */
29600 switch_to_section (get_section (".note.gnu.property",
29601 SECTION_NOTYPE, NULL));
29603 /* PT_NOTE header: namesz, descsz, type.
29604 namesz = 4 ("GNU\0")
29605 descsz = 16 (Size of the program property array)
29606 [(12 + padding) * Number of array elements]
29607 type = 5 (NT_GNU_PROPERTY_TYPE_0). */
29608 assemble_align (POINTER_SIZE);
29609 assemble_integer (GEN_INT (4), 4, 32, 1);
29610 assemble_integer (GEN_INT (ROUND_UP (12, POINTER_BYTES)), 4, 32, 1);
29611 assemble_integer (GEN_INT (5), 4, 32, 1);
29613 /* PT_NOTE name. */
29614 assemble_string ("GNU", 4);
29616 /* PT_NOTE contents for NT_GNU_PROPERTY_TYPE_0:
29617 type = GNU_PROPERTY_AARCH64_FEATURE_1_AND
29618 datasz = 4
29619 data = feature_1_and. */
29620 assemble_integer (GEN_INT (GNU_PROPERTY_AARCH64_FEATURE_1_AND), 4, 32, 1);
29621 assemble_integer (GEN_INT (4), 4, 32, 1);
29622 assemble_integer (GEN_INT (feature_1_and), 4, 32, 1);
29624 /* Pad the size of the note to the required alignment. */
29625 assemble_align (POINTER_SIZE);
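/* With BTI and PAC-RET enabled on an LP64 target, the emitted section is
   roughly the following (directive spellings and constant formatting vary
   with the configured assembler):

       .section .note.gnu.property,"a"
       .align  3
       .word   4              // namesz: "GNU\0"
       .word   16             // descsz: ROUND_UP (12, 8)
       .word   5              // NT_GNU_PROPERTY_TYPE_0
       .string "GNU"
       .word   0xc0000000     // GNU_PROPERTY_AARCH64_FEATURE_1_AND
       .word   4              // datasz
       .word   3              // BTI | PAC
       .align  3              // pad to an 8-byte boundary

   which the linker combines into a single GNU property note. */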
29628 #undef GNU_PROPERTY_AARCH64_FEATURE_1_GCS
29629 #undef GNU_PROPERTY_AARCH64_FEATURE_1_PAC
29630 #undef GNU_PROPERTY_AARCH64_FEATURE_1_BTI
29631 #undef GNU_PROPERTY_AARCH64_FEATURE_1_AND
29633 /* Helper function for straight line speculation.
29634 Return what barrier should be emitted for straight line speculation
29635 mitigation.
29636 When not mitigating against straight line speculation this function returns
29637 an empty string.
29638 When mitigating against straight line speculation, use:
29639 * SB when the v8.5-A SB extension is enabled.
29640 * DSB+ISB otherwise. */
29641 const char *
29642 aarch64_sls_barrier (int mitigation_required)
29644 return mitigation_required
29645 ? (TARGET_SB ? "sb" : "dsb\tsy\n\tisb")
29646 : "";
29649 static GTY (()) tree aarch64_sls_shared_thunks[30];
29650 static GTY (()) bool aarch64_sls_shared_thunks_needed = false;
29651 const char *indirect_symbol_names[30] = {
29652 "__call_indirect_x0",
29653 "__call_indirect_x1",
29654 "__call_indirect_x2",
29655 "__call_indirect_x3",
29656 "__call_indirect_x4",
29657 "__call_indirect_x5",
29658 "__call_indirect_x6",
29659 "__call_indirect_x7",
29660 "__call_indirect_x8",
29661 "__call_indirect_x9",
29662 "__call_indirect_x10",
29663 "__call_indirect_x11",
29664 "__call_indirect_x12",
29665 "__call_indirect_x13",
29666 "__call_indirect_x14",
29667 "__call_indirect_x15",
29668 "", /* "__call_indirect_x16", */
29669 "", /* "__call_indirect_x17", */
29670 "__call_indirect_x18",
29671 "__call_indirect_x19",
29672 "__call_indirect_x20",
29673 "__call_indirect_x21",
29674 "__call_indirect_x22",
29675 "__call_indirect_x23",
29676 "__call_indirect_x24",
29677 "__call_indirect_x25",
29678 "__call_indirect_x26",
29679 "__call_indirect_x27",
29680 "__call_indirect_x28",
29681 "__call_indirect_x29",
29684 /* Function to create a BLR thunk. This thunk is used to mitigate straight
29685 line speculation. Instead of a simple BLR that can be speculated past,
29686 we emit a BL to this thunk, and this thunk contains a BR to the relevant
29687 register. These thunks have the relevant speculation barriers put after
29688 their indirect branch so that speculation is blocked.
29690 We use such a thunk so the speculation barriers are kept off the
29691 architecturally executed path in order to reduce the performance overhead.
29693 When optimizing for size we use stubs shared by the linked object.
29694 When optimizing for performance we emit stubs for each function in the hope
29695 that the branch predictor can better train on jumps specific for a given
29696 function. */
29697 rtx
29698 aarch64_sls_create_blr_label (int regnum)
29700 gcc_assert (STUB_REGNUM_P (regnum));
29701 if (optimize_function_for_size_p (cfun))
29703 /* For the thunks shared between different functions in this compilation
29704 unit we use a named symbol -- this is just for users to more easily
29705 understand the generated assembly. */
29706 aarch64_sls_shared_thunks_needed = true;
29707 const char *thunk_name = indirect_symbol_names[regnum];
29708 if (aarch64_sls_shared_thunks[regnum] == NULL)
29710 /* Build a decl representing this function stub and record it for
29711 later. We build a decl here so we can use the GCC machinery for
29712 handling sections automatically (through `get_named_section` and
29713 `make_decl_one_only`). That saves us a lot of trouble handling
29714 the specifics of different output file formats. */
29715 tree decl = build_decl (BUILTINS_LOCATION, FUNCTION_DECL,
29716 get_identifier (thunk_name),
29717 build_function_type_list (void_type_node,
29718 NULL_TREE));
29719 DECL_RESULT (decl) = build_decl (BUILTINS_LOCATION, RESULT_DECL,
29720 NULL_TREE, void_type_node);
29721 TREE_PUBLIC (decl) = 1;
29722 TREE_STATIC (decl) = 1;
29723 DECL_IGNORED_P (decl) = 1;
29724 DECL_ARTIFICIAL (decl) = 1;
29725 make_decl_one_only (decl, DECL_ASSEMBLER_NAME (decl));
29726 resolve_unique_section (decl, 0, false);
29727 aarch64_sls_shared_thunks[regnum] = decl;
29730 return gen_rtx_SYMBOL_REF (Pmode, thunk_name);
29733 if (cfun->machine->call_via[regnum] == NULL)
29734 cfun->machine->call_via[regnum]
29735 = gen_rtx_LABEL_REF (Pmode, gen_label_rtx ());
29736 return cfun->machine->call_via[regnum];
29739 /* Helper function for aarch64_sls_emit_blr_function_thunks and
29740 aarch64_sls_emit_shared_blr_thunks below. */
29741 static void
29742 aarch64_sls_emit_function_stub (FILE *out_file, int regnum)
29744 /* Save in x16 and branch to that function so this transformation does
29745 not prevent jumping to `BTI c` instructions. */
29746 asm_fprintf (out_file, "\tmov\tx16, x%d\n", regnum);
29747 asm_fprintf (out_file, "\tbr\tx16\n");
29750 /* Emit all BLR stubs for this particular function.
29751 Here we emit all the BLR stubs needed for the current function. Since we
29752 emit these stubs in a consecutive block we know there will be no speculation
29753 gadgets between each stub, and hence we only emit a speculation barrier at
29754 the end of the stub sequences.
29756 This is called in the TARGET_ASM_FUNCTION_EPILOGUE hook. */
29757 void
29758 aarch64_sls_emit_blr_function_thunks (FILE *out_file)
29760 if (! aarch64_harden_sls_blr_p ())
29761 return;
29763 bool any_functions_emitted = false;
29764 /* We must save and restore the current function section since this assembly
29765 is emitted at the end of the function. This means it can be emitted *just
29766 after* the cold section of a function. That cold part would be emitted in
29767 a different section. That switch would trigger a `.cfi_endproc` directive
29768 to be emitted in the original section and a `.cfi_startproc` directive to
29769 be emitted in the new section. Switching to the original section without
29770 restoring would mean that the `.cfi_endproc` emitted as a function ends
29771 would happen in a different section -- leaving an unmatched
29772 `.cfi_startproc` in the cold text section and an unmatched `.cfi_endproc`
29773 in the standard text section. */
29774 section *save_text_section = in_section;
29775 switch_to_section (function_section (current_function_decl));
29776 for (int regnum = 0; regnum < 30; ++regnum)
29778 rtx specu_label = cfun->machine->call_via[regnum];
29779 if (specu_label == NULL)
29780 continue;
29782 targetm.asm_out.print_operand (out_file, specu_label, 0);
29783 asm_fprintf (out_file, ":\n");
29784 aarch64_sls_emit_function_stub (out_file, regnum);
29785 any_functions_emitted = true;
29787 if (any_functions_emitted)
29788 /* Can use the SB if needs be here, since this stub will only be used
29789 by the current function, and hence for the current target. */
29790 asm_fprintf (out_file, "\t%s\n", aarch64_sls_barrier (true));
29791 switch_to_section (save_text_section);
29794 /* Emit shared BLR stubs for the current compilation unit.
29795 Over the course of compiling this unit we may have converted some BLR
29796 instructions to a BL to a shared stub function. This is where we emit those
29797 stub functions.
29798 This function is for the stubs shared between different functions in this
29799 compilation unit. We share when optimizing for size instead of speed.
29801 This function is called through the TARGET_ASM_FILE_END hook. */
29802 void
29803 aarch64_sls_emit_shared_blr_thunks (FILE *out_file)
29805 if (! aarch64_sls_shared_thunks_needed)
29806 return;
29808 for (int regnum = 0; regnum < 30; ++regnum)
29810 tree decl = aarch64_sls_shared_thunks[regnum];
29811 if (!decl)
29812 continue;
29814 const char *name = indirect_symbol_names[regnum];
29815 switch_to_section (get_named_section (decl, NULL, 0));
29816 ASM_OUTPUT_ALIGN (out_file, 2);
29817 targetm.asm_out.globalize_label (out_file, name);
29818 /* Only emits if the compiler is configured for an assembler that can
29819 handle visibility directives. */
29820 targetm.asm_out.assemble_visibility (decl, VISIBILITY_HIDDEN);
29821 ASM_OUTPUT_TYPE_DIRECTIVE (out_file, name, "function");
29822 ASM_OUTPUT_LABEL (out_file, name);
29823 aarch64_sls_emit_function_stub (out_file, regnum);
29824 /* Use the most conservative target to ensure it can always be used by any
29825 function in the translation unit. */
29826 asm_fprintf (out_file, "\tdsb\tsy\n\tisb\n");
29827 ASM_DECLARE_FUNCTION_SIZE (out_file, name, decl);
29831 /* Implement TARGET_ASM_FILE_END. */
29832 void
29833 aarch64_asm_file_end ()
29835 aarch64_sls_emit_shared_blr_thunks (asm_out_file);
29836 /* Since this function will be called for the ASM_FILE_END hook, we ensure
29837 that what would be called otherwise (e.g. `file_end_indicate_exec_stack`
29838 for FreeBSD) still gets called. */
29839 #ifdef TARGET_ASM_FILE_END
29840 TARGET_ASM_FILE_END ();
29841 #endif
29844 const char *
29845 aarch64_indirect_call_asm (rtx addr)
29847 gcc_assert (REG_P (addr));
29848 if (aarch64_harden_sls_blr_p ())
29850 rtx stub_label = aarch64_sls_create_blr_label (REGNO (addr));
29851 output_asm_insn ("bl\t%0", &stub_label);
29853 else
29854 output_asm_insn ("blr\t%0", &addr);
29855 return "";
29858 /* Emit the assembly instruction to load the thread pointer into DEST.
29859 Select between different tpidr_elN registers depending on -mtp= setting. */
29861 const char *
29862 aarch64_output_load_tp (rtx dest)
29864 const char *tpidrs[] = {"tpidr_el0", "tpidr_el1", "tpidr_el2",
29865 "tpidr_el3", "tpidrro_el0"};
29866 char buffer[64];
29867 snprintf (buffer, sizeof (buffer), "mrs\t%%0, %s",
29868 tpidrs[aarch64_tpidr_register]);
29869 output_asm_insn (buffer, &dest);
29870 return "";
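/* For example, with the default thread-pointer setting this emits
   "mrs x0, tpidr_el0" when DEST is x0; selecting the read-only register
   with -mtp= emits "mrs x0, tpidrro_el0" instead. */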
29873 /* Set up the value of REG_ALLOC_ORDER from scratch.
29875 It was previously good practice to put call-clobbered registers ahead
29876 of call-preserved registers, but that isn't necessary these days.
29877 IRA's model of register save/restore costs is much more sophisticated
29878 than the model that a simple ordering could provide. We leave
29879 HONOR_REG_ALLOC_ORDER undefined so that we can get the full benefit
29880 of IRA's model.
29882 However, it is still useful to list registers that are members of
29883 multiple classes after registers that are members of fewer classes.
29884 For example, we have:
29886 - FP_LO8_REGS: v0-v7
29887 - FP_LO_REGS: v0-v15
29888 - FP_REGS: v0-v31
29890 If, as a tie-breaker, we allocate FP_REGS in the order v0-v31,
29891 we run the risk of starving other (lower-priority) pseudos that
29892 require FP_LO8_REGS or FP_LO_REGS. Allocating FP_LO_REGS in the
29893 order v0-v15 could similarly starve pseudos that require FP_LO8_REGS.
29894 Allocating downwards rather than upwards avoids this problem, at least
29895 in code that has reasonable register pressure.
29897 The situation for predicate registers is similar. */
29899 void
29900 aarch64_adjust_reg_alloc_order ()
29902 for (int i = 0; i < FIRST_PSEUDO_REGISTER; ++i)
29903 if (IN_RANGE (i, V0_REGNUM, V31_REGNUM))
29904 reg_alloc_order[i] = V31_REGNUM - (i - V0_REGNUM);
29905 else if (IN_RANGE (i, P0_REGNUM, P15_REGNUM))
29906 reg_alloc_order[i] = P15_REGNUM - (i - P0_REGNUM);
29907 else
29908 reg_alloc_order[i] = i;
29911 /* Return true if the PARALLEL PAR can be used in a VEC_SELECT expression
29912 of vector mode MODE to select half the elements of that vector.
29913 Allow any combination of indices except duplicates (or out of range of
29914 the mode units). */
29916 bool
29917 aarch64_parallel_select_half_p (machine_mode mode, rtx par)
29919 int nunits = XVECLEN (par, 0);
29920 if (!known_eq (GET_MODE_NUNITS (mode), nunits * 2))
29921 return false;
29922 int mode_nunits = nunits * 2;
29923 /* Put all the elements of PAR into a hash_set and use its
29924 uniqueness guarantees to check that we don't try to insert the same
29925 element twice. */
29926 hash_set<rtx> parset;
29927 for (int i = 0; i < nunits; ++i)
29929 rtx elt = XVECEXP (par, 0, i);
29930 if (!CONST_INT_P (elt)
29931 || !IN_RANGE (INTVAL (elt), 0, mode_nunits - 1)
29932 || parset.add (elt))
29933 return false;
29935 return true;
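/* For instance, with MODE == V4SImode the PARALLEL must contain exactly two
   distinct CONST_INTs in [0, 3]: (0 2) and (1 3) are accepted, while (1 1)
   (duplicate) and (0 4) (out of range) are rejected. */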
29938 /* Return true if PAR1 and PAR2, two PARALLEL rtxes of CONST_INT values,
29939 contain any common elements. */
29941 bool
29942 aarch64_pars_overlap_p (rtx par1, rtx par2)
29944 int len1 = XVECLEN (par1, 0);
29945 int len2 = XVECLEN (par2, 0);
29946 hash_set<rtx> parset;
29947 for (int i = 0; i < len1; ++i)
29948 parset.add (XVECEXP (par1, 0, i));
29949 for (int i = 0; i < len2; ++i)
29950 if (parset.contains (XVECEXP (par2, 0, i)))
29951 return true;
29952 return false;
29955 /* Implement OPTIMIZE_MODE_SWITCHING. */
29957 bool
29958 aarch64_optimize_mode_switching (aarch64_mode_entity entity)
29960 bool have_sme_state = (aarch64_cfun_incoming_pstate_za () != 0
29961 || (aarch64_cfun_has_new_state ("za")
29962 && df_regs_ever_live_p (ZA_REGNUM))
29963 || (aarch64_cfun_has_new_state ("zt0")
29964 && df_regs_ever_live_p (ZT0_REGNUM)));
29966 if (have_sme_state && nonlocal_goto_handler_labels)
29968 static bool reported;
29969 if (!reported)
29971 sorry ("non-local gotos in functions with SME state");
29972 reported = true;
29976 switch (entity)
29978 case aarch64_mode_entity::HAVE_ZA_SAVE_BUFFER:
29979 case aarch64_mode_entity::LOCAL_SME_STATE:
29980 return have_sme_state && !nonlocal_goto_handler_labels;
29982 gcc_unreachable ();
29985 /* Implement TARGET_MODE_EMIT for ZA_SAVE_BUFFER. */
29987 static void
29988 aarch64_mode_emit_za_save_buffer (aarch64_tristate_mode mode,
29989 aarch64_tristate_mode prev_mode)
29991 if (mode == aarch64_tristate_mode::YES)
29993 gcc_assert (prev_mode == aarch64_tristate_mode::NO);
29994 aarch64_init_tpidr2_block ();
29996 else
29997 gcc_unreachable ();
30000 /* Implement TARGET_MODE_EMIT for LOCAL_SME_STATE. */
30002 static void
30003 aarch64_mode_emit_local_sme_state (aarch64_local_sme_state mode,
30004 aarch64_local_sme_state prev_mode)
30006 /* Back-propagation should ensure that we're always starting from
30007 a known mode. */
30008 gcc_assert (prev_mode != aarch64_local_sme_state::ANY);
30010 if (prev_mode == aarch64_local_sme_state::INACTIVE_CALLER)
30012 /* Commit any uncommitted lazy save. This leaves ZA either active
30013 and zero (lazy save case) or off (normal case).
30015 The sequence is:
30017 mrs <temp>, tpidr2_el0
30018 cbz <temp>, no_save
30019 bl __arm_tpidr2_save
30020 msr tpidr2_el0, xzr
30021 zero { za } // Only if ZA is live
30022 zero { zt0 } // Only if ZT0 is live
30023 no_save: */
30024 auto tmp_reg = gen_reg_rtx (DImode);
30025 emit_insn (gen_aarch64_read_tpidr2 (tmp_reg));
30026 auto label = gen_label_rtx ();
30027 rtx branch = aarch64_gen_compare_zero_and_branch (EQ, tmp_reg, label);
30028 auto jump = emit_jump_insn (branch);
30029 JUMP_LABEL (jump) = label;
30030 emit_insn (gen_aarch64_tpidr2_save ());
30031 emit_insn (gen_aarch64_clear_tpidr2 ());
30032 if (mode == aarch64_local_sme_state::ACTIVE_LIVE
30033 || mode == aarch64_local_sme_state::ACTIVE_DEAD)
30035 if (aarch64_cfun_has_state ("za"))
30036 emit_insn (gen_aarch64_initial_zero_za ());
30037 if (aarch64_cfun_has_state ("zt0"))
30038 emit_insn (gen_aarch64_sme_zero_zt0 ());
30040 emit_label (label);
30043 if (mode == aarch64_local_sme_state::ACTIVE_LIVE
30044 || mode == aarch64_local_sme_state::ACTIVE_DEAD)
30046 if (prev_mode == aarch64_local_sme_state::INACTIVE_LOCAL)
30048 /* Make ZA active after being inactive.
30050 First handle the case in which the lazy save we set up was
30051 committed by a callee. If the function's source-level ZA state
30052 is live then we must conditionally restore it from the lazy
30053 save buffer. Otherwise we can just force PSTATE.ZA to 1. */
30054 if (mode == aarch64_local_sme_state::ACTIVE_LIVE)
30055 emit_insn (gen_aarch64_restore_za (aarch64_get_tpidr2_ptr ()));
30056 else
30057 emit_insn (gen_aarch64_smstart_za ());
30059 /* Now handle the case in which the lazy save was not committed.
30060 In that case, ZA still contains the current function's ZA state,
30061 and we just need to cancel the lazy save. */
30062 emit_insn (gen_aarch64_clear_tpidr2 ());
30064 /* Restore the ZT0 state, if we have some. */
30065 if (aarch64_cfun_has_state ("zt0"))
30066 aarch64_restore_zt0 (true);
30068 return;
30071 if (prev_mode == aarch64_local_sme_state::SAVED_LOCAL)
30073 /* Retrieve the current function's ZA state from the lazy save
30074 buffer. */
30075 aarch64_restore_za (aarch64_get_tpidr2_ptr ());
30077 /* Restore the ZT0 state, if we have some. */
30078 if (aarch64_cfun_has_state ("zt0"))
30079 aarch64_restore_zt0 (true);
30080 return;
30083 if (prev_mode == aarch64_local_sme_state::INACTIVE_CALLER
30084 || prev_mode == aarch64_local_sme_state::OFF)
30086 /* INACTIVE_CALLER means that we are enabling ZA for the first
30087 time in this function. The code above means that ZA is either
30088 active and zero (if we committed a lazy save) or off. Handle
30089 the latter case by forcing ZA on.
30091 OFF means that PSTATE.ZA is guaranteed to be 0. We just need
30092 to force it to 1.
30094 Both cases leave ZA zeroed. */
30095 emit_insn (gen_aarch64_smstart_za ());
30097 /* Restore the ZT0 state, if we have some. */
30098 if (prev_mode == aarch64_local_sme_state::OFF
30099 && aarch64_cfun_has_state ("zt0"))
30100 aarch64_restore_zt0 (true);
30101 return;
30104 if (prev_mode == aarch64_local_sme_state::ACTIVE_DEAD
30105 || prev_mode == aarch64_local_sme_state::ACTIVE_LIVE)
30106 /* A simple change in liveness, such as in a CFG structure where
30107 ZA is only conditionally defined. No code is needed. */
30108 return;
30110 gcc_unreachable ();
30113 if (mode == aarch64_local_sme_state::INACTIVE_LOCAL)
30115 if (prev_mode == aarch64_local_sme_state::ACTIVE_LIVE
30116 || prev_mode == aarch64_local_sme_state::ACTIVE_DEAD
30117 || prev_mode == aarch64_local_sme_state::INACTIVE_CALLER)
30119 /* Save the ZT0 state, if we have some. */
30120 if (aarch64_cfun_has_state ("zt0"))
30121 aarch64_save_zt0 ();
30123 /* A transition from ACTIVE_LIVE to INACTIVE_LOCAL is the usual
30124 case of setting up a lazy save buffer before a call.
30125 A transition from INACTIVE_CALLER is similar, except that
30126 the contents of ZA are known to be zero.
30128 A transition from ACTIVE_DEAD means that ZA is live at the
30129 point of the transition, but is dead on at least one incoming
30130 edge. (That is, ZA is only conditionally initialized.)
30131 For efficiency, we want to set up a lazy save even for
30132 dead contents, since forcing ZA off would make later code
30133 restore ZA from the lazy save buffer. */
30134 emit_insn (gen_aarch64_write_tpidr2 (aarch64_get_tpidr2_ptr ()));
30135 return;
30138 if (prev_mode == aarch64_local_sme_state::SAVED_LOCAL
30139 || prev_mode == aarch64_local_sme_state::OFF)
30140 /* We're simply discarding the information about which inactive
30141 state applies. */
30142 return;
30144 gcc_unreachable ();
30147 if (mode == aarch64_local_sme_state::INACTIVE_CALLER
30148 || mode == aarch64_local_sme_state::OFF)
30150 /* Save the ZT0 state, if we have some. */
30151 if ((prev_mode == aarch64_local_sme_state::ACTIVE_LIVE
30152 || prev_mode == aarch64_local_sme_state::ACTIVE_DEAD)
30153 && mode == aarch64_local_sme_state::OFF
30154 && aarch64_cfun_has_state ("zt0"))
30155 aarch64_save_zt0 ();
30157 /* The transition to INACTIVE_CALLER is used before returning from
30158 new("za") functions. Any state in ZA belongs to the current
30159 function rather than a caller, but that state is no longer
30160 needed. Clear any pending lazy save and turn ZA off.
30162 The transition to OFF is used before calling a private-ZA function.
30163 We committed any incoming lazy save above, so at this point any
30164 contents in ZA belong to the current function. */
30165 if (prev_mode == aarch64_local_sme_state::INACTIVE_LOCAL)
30166 emit_insn (gen_aarch64_clear_tpidr2 ());
30168 if (prev_mode != aarch64_local_sme_state::OFF
30169 && prev_mode != aarch64_local_sme_state::SAVED_LOCAL)
30170 emit_insn (gen_aarch64_smstop_za ());
30172 return;
30175 if (mode == aarch64_local_sme_state::SAVED_LOCAL)
30177 /* This is a transition to an exception handler. */
30178 gcc_assert (prev_mode == aarch64_local_sme_state::OFF
30179 || prev_mode == aarch64_local_sme_state::INACTIVE_LOCAL);
30180 return;
30183 gcc_unreachable ();
30186 /* Implement TARGET_MODE_EMIT. */
30188 static void
30189 aarch64_mode_emit (int entity, int mode, int prev_mode, HARD_REG_SET live)
30191 if (mode == prev_mode)
30192 return;
30194 start_sequence ();
30195 switch (aarch64_mode_entity (entity))
30197 case aarch64_mode_entity::HAVE_ZA_SAVE_BUFFER:
30198 aarch64_mode_emit_za_save_buffer (aarch64_tristate_mode (mode),
30199 aarch64_tristate_mode (prev_mode));
30200 break;
30202 case aarch64_mode_entity::LOCAL_SME_STATE:
30203 aarch64_mode_emit_local_sme_state (aarch64_local_sme_state (mode),
30204 aarch64_local_sme_state (prev_mode));
30205 break;
30207 rtx_insn *seq = get_insns ();
30208 end_sequence ();
30210 /* Get the set of clobbered registers that are currently live. */
30211 HARD_REG_SET clobbers = {};
30212 for (rtx_insn *insn = seq; insn; insn = NEXT_INSN (insn))
30214 if (!NONDEBUG_INSN_P (insn))
30215 continue;
30216 vec_rtx_properties properties;
30217 properties.add_insn (insn, false);
30218 for (rtx_obj_reference ref : properties.refs ())
30219 if (ref.is_write () && HARD_REGISTER_NUM_P (ref.regno))
30220 SET_HARD_REG_BIT (clobbers, ref.regno);
30222 clobbers &= live;
30224 /* Emit instructions to save clobbered registers to pseudos. Queue
30225 instructions to restore the registers afterwards.
30227 This should only be needed in rare situations. */
30228 auto_vec<rtx, 33> after;
30229 for (unsigned int regno = R0_REGNUM; regno < R30_REGNUM; ++regno)
30230 if (TEST_HARD_REG_BIT (clobbers, regno))
30232 rtx hard_reg = gen_rtx_REG (DImode, regno);
30233 rtx pseudo_reg = gen_reg_rtx (DImode);
30234 emit_move_insn (pseudo_reg, hard_reg);
30235 after.quick_push (gen_move_insn (hard_reg, pseudo_reg));
30237 if (TEST_HARD_REG_BIT (clobbers, CC_REGNUM))
30239 rtx pseudo_reg = gen_reg_rtx (DImode);
30240 emit_insn (gen_aarch64_save_nzcv (pseudo_reg));
30241 after.quick_push (gen_aarch64_restore_nzcv (pseudo_reg));
30244 /* Emit the transition instructions themselves. */
30245 emit_insn (seq);
30247 /* Restore the clobbered registers. */
30248 for (auto *insn : after)
30249 emit_insn (insn);
30252 /* Return true if INSN references the SME state represented by hard register
30253 REGNO. */
30255 static bool
30256 aarch64_insn_references_sme_state_p (rtx_insn *insn, unsigned int regno)
30258 df_ref ref;
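/* A definition only counts as a reference if it is not purely a forced
   clobber; a bare clobber of the register does not use the state.  */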
30259 FOR_EACH_INSN_DEF (ref, insn)
30260 if (!DF_REF_FLAGS_IS_SET (ref, DF_REF_MUST_CLOBBER)
30261 && DF_REF_REGNO (ref) == regno)
30262 return true;
30263 FOR_EACH_INSN_USE (ref, insn)
30264 if (DF_REF_REGNO (ref) == regno)
30265 return true;
30266 return false;
30269 /* Implement TARGET_MODE_NEEDED for LOCAL_SME_STATE. */
30271 static aarch64_local_sme_state
30272 aarch64_mode_needed_local_sme_state (rtx_insn *insn, HARD_REG_SET live)
30274 if (!CALL_P (insn)
30275 && find_reg_note (insn, REG_EH_REGION, NULL_RTX))
30277 static bool reported;
30278 if (!reported)
30280 sorry ("catching non-call exceptions in functions with SME state");
30281 reported = true;
30283 /* Aim for graceful error recovery by picking the value that is
30284 least likely to generate an ICE. */
30285 return aarch64_local_sme_state::INACTIVE_LOCAL;
30288 /* A non-local goto is equivalent to a return. We disallow non-local
30289 receivers in functions with SME state, so we know that the target
30290 expects ZA to be dormant or off. */
30291 if (JUMP_P (insn)
30292 && find_reg_note (insn, REG_NON_LOCAL_GOTO, NULL_RTX))
30293 return aarch64_local_sme_state::INACTIVE_CALLER;
30295 /* start_private_za_call and end_private_za_call bracket a sequence
30296 that calls a private-ZA function. Force ZA to be turned off if the
30297 function doesn't have any live ZA state, otherwise require ZA to be
30298 inactive. */
30299 auto icode = recog_memoized (insn);
30300 if (icode == CODE_FOR_aarch64_start_private_za_call
30301 || icode == CODE_FOR_aarch64_end_private_za_call)
30302 return (TEST_HARD_REG_BIT (live, ZA_REGNUM)
30303 ? aarch64_local_sme_state::INACTIVE_LOCAL
30304 : aarch64_local_sme_state::OFF);
30306 /* Force ZA to contain the current function's ZA state if INSN wants
30307 to access it. Do the same for accesses to ZT0, since ZA and ZT0
30308 are both controlled by PSTATE.ZA. */
30309 if (aarch64_insn_references_sme_state_p (insn, ZA_REGNUM)
30310 || aarch64_insn_references_sme_state_p (insn, ZT0_REGNUM))
30311 return (TEST_HARD_REG_BIT (live, ZA_REGNUM)
30312 ? aarch64_local_sme_state::ACTIVE_LIVE
30313 : aarch64_local_sme_state::ACTIVE_DEAD);
30315 return aarch64_local_sme_state::ANY;
30318 /* Implement TARGET_MODE_NEEDED for ZA_SAVE_BUFFER. */
30320 static aarch64_tristate_mode
30321 aarch64_mode_needed_za_save_buffer (rtx_insn *insn, HARD_REG_SET live)
30323 /* We need to set up a lazy save buffer no later than the first
30324 transition to INACTIVE_LOCAL (which involves setting up a lazy save). */
30325 if (aarch64_mode_needed_local_sme_state (insn, live)
30326 == aarch64_local_sme_state::INACTIVE_LOCAL)
30327 return aarch64_tristate_mode::YES;
30329 /* Also make sure that the lazy save buffer is set up before the first
30330 insn that throws internally. The exception handler will sometimes
30331 load from it. */
30332 if (find_reg_note (insn, REG_EH_REGION, NULL_RTX))
30333 return aarch64_tristate_mode::YES;
30335 return aarch64_tristate_mode::MAYBE;
30338 /* Implement TARGET_MODE_NEEDED. */
30340 static int
30341 aarch64_mode_needed (int entity, rtx_insn *insn, HARD_REG_SET live)
30343 switch (aarch64_mode_entity (entity))
30345 case aarch64_mode_entity::HAVE_ZA_SAVE_BUFFER:
30346 return int (aarch64_mode_needed_za_save_buffer (insn, live));
30348 case aarch64_mode_entity::LOCAL_SME_STATE:
30349 return int (aarch64_mode_needed_local_sme_state (insn, live));
30351 gcc_unreachable ();
30354 /* Implement TARGET_MODE_AFTER for LOCAL_SME_STATE. */
30356 static aarch64_local_sme_state
30357 aarch64_mode_after_local_sme_state (aarch64_local_sme_state mode,
30358 HARD_REG_SET live)
30360 /* Note places where ZA dies, so that we can try to avoid saving and
30361 restoring state that isn't needed. */
30362 if (mode == aarch64_local_sme_state::ACTIVE_LIVE
30363 && !TEST_HARD_REG_BIT (live, ZA_REGNUM))
30364 return aarch64_local_sme_state::ACTIVE_DEAD;
30366 /* Note where ZA is born, e.g. when moving past an __arm_out("za")
30367 function. */
30368 if (mode == aarch64_local_sme_state::ACTIVE_DEAD
30369 && TEST_HARD_REG_BIT (live, ZA_REGNUM))
30370 return aarch64_local_sme_state::ACTIVE_LIVE;
30372 return mode;
30375 /* Implement TARGET_MODE_AFTER. */
30377 static int
30378 aarch64_mode_after (int entity, int mode, rtx_insn *, HARD_REG_SET live)
30380 switch (aarch64_mode_entity (entity))
30382 case aarch64_mode_entity::HAVE_ZA_SAVE_BUFFER:
30383 return mode;
30385 case aarch64_mode_entity::LOCAL_SME_STATE:
30386 return int (aarch64_mode_after_local_sme_state
30387 (aarch64_local_sme_state (mode), live));
30389 gcc_unreachable ();
30392 /* Implement TARGET_MODE_CONFLUENCE for LOCAL_SME_STATE. */
30394 static aarch64_local_sme_state
30395 aarch64_local_sme_confluence (aarch64_local_sme_state mode1,
30396 aarch64_local_sme_state mode2)
30398 /* Perform a symmetrical check for two values. */
30399 auto is_pair = [&](aarch64_local_sme_state val1,
30400 aarch64_local_sme_state val2)
30402 return ((mode1 == val1 && mode2 == val2)
30403 || (mode1 == val2 && mode2 == val1));
30406 /* INACTIVE_CALLER means ZA is off or it has dormant contents belonging
30407 to a caller. OFF is one of the options. */
30408 if (is_pair (aarch64_local_sme_state::INACTIVE_CALLER,
30409 aarch64_local_sme_state::OFF))
30410 return aarch64_local_sme_state::INACTIVE_CALLER;
30412 /* Similarly for dormant contents belonging to the current function. */
30413 if (is_pair (aarch64_local_sme_state::INACTIVE_LOCAL,
30414 aarch64_local_sme_state::OFF))
30415 return aarch64_local_sme_state::INACTIVE_LOCAL;
30417 /* Treat a conditionally-initialized value as a fully-initialized value. */
30418 if (is_pair (aarch64_local_sme_state::ACTIVE_LIVE,
30419 aarch64_local_sme_state::ACTIVE_DEAD))
30420 return aarch64_local_sme_state::ACTIVE_LIVE;
30422 return aarch64_local_sme_state::ANY;
30425 /* Implement TARGET_MODE_CONFLUENCE. */
30427 static int
30428 aarch64_mode_confluence (int entity, int mode1, int mode2)
30430 gcc_assert (mode1 != mode2);
30431 switch (aarch64_mode_entity (entity))
30433 case aarch64_mode_entity::HAVE_ZA_SAVE_BUFFER:
30434 return int (aarch64_tristate_mode::MAYBE);
30436 case aarch64_mode_entity::LOCAL_SME_STATE:
30437 return int (aarch64_local_sme_confluence
30438 (aarch64_local_sme_state (mode1),
30439 aarch64_local_sme_state (mode2)));
30441 gcc_unreachable ();
30444 /* Implement TARGET_MODE_BACKPROP for an entity that either stays
30445 NO throughout, or makes one transition from NO to YES. */
30447 static aarch64_tristate_mode
30448 aarch64_one_shot_backprop (aarch64_tristate_mode mode1,
30449 aarch64_tristate_mode mode2)
30451 /* Keep bringing the transition forward until it starts from NO. */
30452 if (mode1 == aarch64_tristate_mode::MAYBE
30453 && mode2 == aarch64_tristate_mode::YES)
30454 return mode2;
30456 return aarch64_tristate_mode::MAYBE;
30459 /* Implement TARGET_MODE_BACKPROP for LOCAL_SME_STATE. */
30461 static aarch64_local_sme_state
30462 aarch64_local_sme_backprop (aarch64_local_sme_state mode1,
30463 aarch64_local_sme_state mode2)
30465 /* We always need to know what the current state is when transitioning
30466 to a new state. Force any location with indeterminate starting state
30467 to be active. */
30468 if (mode1 == aarch64_local_sme_state::ANY)
30469 switch (mode2)
30471 case aarch64_local_sme_state::INACTIVE_CALLER:
30472 case aarch64_local_sme_state::OFF:
30473 case aarch64_local_sme_state::ACTIVE_DEAD:
30474 /* The current function's ZA state is not live. */
30475 return aarch64_local_sme_state::ACTIVE_DEAD;
30477 case aarch64_local_sme_state::INACTIVE_LOCAL:
30478 case aarch64_local_sme_state::ACTIVE_LIVE:
30479 /* The current function's ZA state is live. */
30480 return aarch64_local_sme_state::ACTIVE_LIVE;
30482 case aarch64_local_sme_state::SAVED_LOCAL:
30483 /* This is a transition to an exception handler. Since we don't
30484 support non-call exceptions for SME functions, the source of
30485 the transition must be known. We'll assert later if that's
30486 not the case. */
30487 return aarch64_local_sme_state::ANY;
30489 case aarch64_local_sme_state::ANY:
30490 return aarch64_local_sme_state::ANY;
30493 return aarch64_local_sme_state::ANY;
30496 /* Implement TARGET_MODE_BACKPROP. */
30498 static int
30499 aarch64_mode_backprop (int entity, int mode1, int mode2)
30501 switch (aarch64_mode_entity (entity))
30503 case aarch64_mode_entity::HAVE_ZA_SAVE_BUFFER:
30504 return int (aarch64_one_shot_backprop (aarch64_tristate_mode (mode1),
30505 aarch64_tristate_mode (mode2)));
30507 case aarch64_mode_entity::LOCAL_SME_STATE:
30508 return int (aarch64_local_sme_backprop
30509 (aarch64_local_sme_state (mode1),
30510 aarch64_local_sme_state (mode2)));
30512 gcc_unreachable ();
30515 /* Implement TARGET_MODE_ENTRY. */
30517 static int
30518 aarch64_mode_entry (int entity)
30520 switch (aarch64_mode_entity (entity))
30522 case aarch64_mode_entity::HAVE_ZA_SAVE_BUFFER:
30523 return int (aarch64_tristate_mode::NO);
30525 case aarch64_mode_entity::LOCAL_SME_STATE:
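/* Shared "za" state is live on entry.  Otherwise, if PSTATE.ZA is
   nevertheless 1 on entry, ZA is active but contains nothing useful.
   Otherwise, any incoming contents are dormant or ZA is off.  */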
30526 return int (aarch64_cfun_shared_flags ("za") != 0
30527 ? aarch64_local_sme_state::ACTIVE_LIVE
30528 : aarch64_cfun_incoming_pstate_za () != 0
30529 ? aarch64_local_sme_state::ACTIVE_DEAD
30530 : aarch64_local_sme_state::INACTIVE_CALLER);
30532 gcc_unreachable ();
30535 /* Implement TARGET_MODE_EXIT. */
30537 static int
30538 aarch64_mode_exit (int entity)
30540 switch (aarch64_mode_entity (entity))
30542 case aarch64_mode_entity::HAVE_ZA_SAVE_BUFFER:
30543 return int (aarch64_tristate_mode::MAYBE);
30545 case aarch64_mode_entity::LOCAL_SME_STATE:
30546 return int (aarch64_cfun_shared_flags ("za") != 0
30547 ? aarch64_local_sme_state::ACTIVE_LIVE
30548 : aarch64_cfun_incoming_pstate_za () != 0
30549 ? aarch64_local_sme_state::ACTIVE_DEAD
30550 : aarch64_local_sme_state::INACTIVE_CALLER);
30552 gcc_unreachable ();
30555 /* Implement TARGET_MODE_EH_HANDLER. */
30557 static int
30558 aarch64_mode_eh_handler (int entity)
30560 switch (aarch64_mode_entity (entity))
30562 case aarch64_mode_entity::HAVE_ZA_SAVE_BUFFER:
30563 /* Require a lazy save buffer to be allocated before the first
30564 insn that can throw. */
30565 return int (aarch64_tristate_mode::YES);
30567 case aarch64_mode_entity::LOCAL_SME_STATE:
30568 return int (aarch64_local_sme_state::SAVED_LOCAL);
30570 gcc_unreachable ();
30573 /* Implement TARGET_MODE_PRIORITY. */
30575 static int
30576 aarch64_mode_priority (int, int n)
30578 return n;
30581 /* Implement TARGET_MD_ASM_ADJUST. */
30583 static rtx_insn *
30584 aarch64_md_asm_adjust (vec<rtx> &outputs, vec<rtx> &inputs,
30585 vec<machine_mode> &input_modes,
30586 vec<const char *> &constraints,
30587 vec<rtx> &uses, vec<rtx> &clobbers,
30588 HARD_REG_SET &clobbered_regs, location_t loc)
30590 rtx_insn *seq = arm_md_asm_adjust (outputs, inputs, input_modes, constraints,
30591 uses, clobbers, clobbered_regs, loc);
30593 /* "za" in the clobber list of a function with ZA state is defined to
30594 mean that the asm can read from and write to ZA. We can model the
30595 read using a USE, but unfortunately, it's not possible to model the
30596 write directly. Use a separate insn to model the effect.
30598 We must ensure that ZA is active on entry, which is enforced by using
30599 SME_STATE_REGNUM. The asm must ensure that ZA is active on return.
30601 The same thing applies to ZT0. */
30602 if (TARGET_ZA)
30603 for (unsigned int i = clobbers.length (); i-- > 0; )
30605 rtx x = clobbers[i];
30606 if (REG_P (x)
30607 && (REGNO (x) == ZA_REGNUM || REGNO (x) == ZT0_REGNUM))
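/* The incrementing ID presumably serves to keep the update
   instructions for different asms distinct from one another.  */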
30609 auto id = cfun->machine->next_asm_update_za_id++;
30611 start_sequence ();
30612 if (seq)
30613 emit_insn (seq);
30614 rtx id_rtx = gen_int_mode (id, SImode);
30615 emit_insn (REGNO (x) == ZA_REGNUM
30616 ? gen_aarch64_asm_update_za (id_rtx)
30617 : gen_aarch64_asm_update_zt0 (id_rtx));
30618 seq = get_insns ();
30619 end_sequence ();
30621 auto mode = REGNO (x) == ZA_REGNUM ? VNx16QImode : V8DImode;
30622 uses.safe_push (gen_rtx_REG (mode, REGNO (x)));
30623 uses.safe_push (gen_rtx_REG (DImode, SME_STATE_REGNUM));
30625 clobbers.ordered_remove (i);
30626 CLEAR_HARD_REG_BIT (clobbered_regs, REGNO (x));
30629 return seq;
30632 /* BB is the target of an exception or nonlocal goto edge, which means
30633 that PSTATE.SM is known to be 0 on entry. Put it into the state that
30634 the current function requires. */
30636 static bool
30637 aarch64_switch_pstate_sm_for_landing_pad (basic_block bb)
30639 if (TARGET_NON_STREAMING)
30640 return false;
30642 start_sequence ();
30643 rtx_insn *guard_label = nullptr;
30644 if (TARGET_STREAMING_COMPATIBLE)
30645 guard_label = aarch64_guard_switch_pstate_sm (IP0_REGNUM,
30646 AARCH64_ISA_MODE_SM_OFF);
30647 aarch64_sme_mode_switch_regs args_switch;
30648 args_switch.add_call_preserved_regs (df_get_live_in (bb));
30649 args_switch.emit_prologue ();
30650 aarch64_switch_pstate_sm (AARCH64_ISA_MODE_SM_OFF, AARCH64_ISA_MODE_SM_ON);
30651 args_switch.emit_epilogue ();
30652 if (guard_label)
30653 emit_label (guard_label);
30654 auto seq = get_insns ();
30655 end_sequence ();
30657 emit_insn_after (seq, bb_note (bb));
30658 return true;
30661 /* JUMP is a nonlocal goto. Its target requires PSTATE.SM to be 0 on entry,
30662 so arrange to make it so. */
30664 static bool
30665 aarch64_switch_pstate_sm_for_jump (rtx_insn *jump)
30667 if (TARGET_NON_STREAMING)
30668 return false;
30670 start_sequence ();
30671 rtx_insn *guard_label = nullptr;
30672 if (TARGET_STREAMING_COMPATIBLE)
30673 guard_label = aarch64_guard_switch_pstate_sm (IP0_REGNUM,
30674 AARCH64_ISA_MODE_SM_OFF);
30675 aarch64_switch_pstate_sm (AARCH64_ISA_MODE_SM_ON, AARCH64_ISA_MODE_SM_OFF);
30676 if (guard_label)
30677 emit_label (guard_label);
30678 auto seq = get_insns ();
30679 end_sequence ();
30681 emit_insn_before (seq, jump);
30682 return true;
30685 /* If CALL involves a change in PSTATE.SM, emit the instructions needed
30686 to switch to the new mode and the instructions needed to restore the
30687 original mode. Return true if something changed. */
30688 static bool
30689 aarch64_switch_pstate_sm_for_call (rtx_call_insn *call)
30691 /* Mode switches for sibling calls are handled via the epilogue. */
30692 if (SIBLING_CALL_P (call))
30693 return false;
30695 auto callee_isa_mode = aarch64_insn_callee_isa_mode (call);
30696 if (!aarch64_call_switches_pstate_sm (callee_isa_mode))
30697 return false;
30699 /* Switch mode before the call, preserving any argument registers
30700 across the switch. */
30701 start_sequence ();
30702 rtx_insn *args_guard_label = nullptr;
30703 if (TARGET_STREAMING_COMPATIBLE)
30704 args_guard_label = aarch64_guard_switch_pstate_sm (IP0_REGNUM,
30705 callee_isa_mode);
30706 aarch64_sme_mode_switch_regs args_switch;
30707 args_switch.add_call_args (call);
30708 args_switch.emit_prologue ();
30709 aarch64_switch_pstate_sm (AARCH64_ISA_MODE, callee_isa_mode);
30710 args_switch.emit_epilogue ();
30711 if (args_guard_label)
30712 emit_label (args_guard_label);
30713 auto args_seq = get_insns ();
30714 end_sequence ();
30715 emit_insn_before (args_seq, call);
30717 if (find_reg_note (call, REG_NORETURN, NULL_RTX))
30718 return true;
30720 /* Switch mode after the call, preserving any return registers across
30721 the switch. */
30722 start_sequence ();
30723 rtx_insn *return_guard_label = nullptr;
30724 if (TARGET_STREAMING_COMPATIBLE)
30725 return_guard_label = aarch64_guard_switch_pstate_sm (IP0_REGNUM,
30726 callee_isa_mode);
30727 aarch64_sme_mode_switch_regs return_switch;
30728 return_switch.add_call_result (call);
30729 return_switch.emit_prologue ();
30730 aarch64_switch_pstate_sm (callee_isa_mode, AARCH64_ISA_MODE);
30731 return_switch.emit_epilogue ();
30732 if (return_guard_label)
30733 emit_label (return_guard_label);
30734 auto result_seq = get_insns ();
30735 end_sequence ();
30736 emit_insn_after (result_seq, call);
30737 return true;
30740 namespace {
30742 const pass_data pass_data_switch_pstate_sm =
30744 RTL_PASS, // type
30745 "smstarts", // name
30746 OPTGROUP_NONE, // optinfo_flags
30747 TV_NONE, // tv_id
30748 0, // properties_required
30749 0, // properties_provided
30750 0, // properties_destroyed
30751 0, // todo_flags_start
30752 TODO_df_finish, // todo_flags_finish
30755 class pass_switch_pstate_sm : public rtl_opt_pass
30757 public:
30758 pass_switch_pstate_sm (gcc::context *ctxt)
30759 : rtl_opt_pass (pass_data_switch_pstate_sm, ctxt)
30762 // opt_pass methods:
30763 bool gate (function *) override final;
30764 unsigned int execute (function *) override final;
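/* Run the pass only if the function's own streaming mode is something
   other than plain non-streaming, or if it contains calls that switch
   PSTATE.SM.  */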
30767 bool
30768 pass_switch_pstate_sm::gate (function *fn)
30770 return (aarch64_fndecl_pstate_sm (fn->decl) != AARCH64_ISA_MODE_SM_OFF
30771 || cfun->machine->call_switches_pstate_sm);
30774 /* Emit any instructions needed to switch PSTATE.SM. */
30775 unsigned int
30776 pass_switch_pstate_sm::execute (function *fn)
30778 basic_block bb;
30780 auto_sbitmap blocks (last_basic_block_for_fn (cfun));
30781 bitmap_clear (blocks);
30782 FOR_EACH_BB_FN (bb, fn)
30784 if (has_abnormal_call_or_eh_pred_edge_p (bb)
30785 && aarch64_switch_pstate_sm_for_landing_pad (bb))
30786 bitmap_set_bit (blocks, bb->index);
30788 if (cfun->machine->call_switches_pstate_sm)
30790 rtx_insn *insn;
30791 FOR_BB_INSNS (bb, insn)
30792 if (auto *call = dyn_cast<rtx_call_insn *> (insn))
30793 if (aarch64_switch_pstate_sm_for_call (call))
30794 bitmap_set_bit (blocks, bb->index);
30797 auto end = BB_END (bb);
30798 if (JUMP_P (end)
30799 && find_reg_note (end, REG_NON_LOCAL_GOTO, NULL_RTX)
30800 && aarch64_switch_pstate_sm_for_jump (end))
30801 bitmap_set_bit (blocks, bb->index);
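/* The sequences emitted above can introduce new labels and control flow,
   so rebuild the basic blocks that received them.  */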
30803 find_many_sub_basic_blocks (blocks);
30804 clear_aux_for_blocks ();
30805 return 0;
30810 rtl_opt_pass *
30811 make_pass_switch_pstate_sm (gcc::context *ctxt)
30813 return new pass_switch_pstate_sm (ctxt);
30816 /* Parse an implementation-defined system register name of
30817 the form S[0-3]_[0-7]_C[0-15]_C[0-15]_[0-7].
30818 Return true if the name matches the pattern above, false
30819 otherwise. */
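/* For example, "s3_0_c15_c2_0" matches this pattern, while
   "s3_0_c15_c16_0" does not, since each C-term must be at most 15.  */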
30820 bool
30821 aarch64_is_implem_def_reg (const char *regname)
30823 unsigned pos = 0;
30824 unsigned name_len = strlen (regname);
30825 if (name_len < 12 || name_len > 14)
30826 return false;
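/* Check one "c<n>" term: a literal 'c' followed by a decimal number with
   no redundant leading zero, terminated by '_', whose value is at most 15.  */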
30828 auto cterm_valid_p = [&]()
30830 bool leading_zero_p = false;
30831 unsigned i = 0;
30832 char n[3] = {0};
30834 if (regname[pos] != 'c')
30835 return false;
30836 pos++;
30837 while (regname[pos] != '_')
30839 if (leading_zero_p)
30840 return false;
30841 if (i == 0 && regname[pos] == '0')
30842 leading_zero_p = true;
30843 if (i > 2)
30844 return false;
30845 if (!ISDIGIT (regname[pos]))
30846 return false;
30847 n[i++] = regname[pos++];
30849 if (atoi (n) > 15)
30850 return false;
30851 return true;
30854 if (regname[pos] != 's')
30855 return false;
30856 pos++;
30857 if (regname[pos] < '0' || regname[pos] > '3')
30858 return false;
30859 pos++;
30860 if (regname[pos++] != '_')
30861 return false;
30862 if (regname[pos] < '0' || regname[pos] > '7')
30863 return false;
30864 pos++;
30865 if (regname[pos++] != '_')
30866 return false;
30867 if (!cterm_valid_p ())
30868 return false;
30869 if (regname[pos++] != '_')
30870 return false;
30871 if (!cterm_valid_p ())
30872 return false;
30873 if (regname[pos++] != '_')
30874 return false;
30875 if (regname[pos] < '0' || regname[pos] > '7')
30876 return false;
30877 return true;
30880 /* Return true if REGNAME matches either a known permitted system
30881 register name, or a generic sysreg specification. For use in
30882 back-end predicate `aarch64_sysreg_string'. */
30883 bool
30884 aarch64_valid_sysreg_name_p (const char *regname)
30886 const sysreg_t *sysreg = aarch64_lookup_sysreg_map (regname);
30887 if (sysreg == NULL)
30888 return aarch64_is_implem_def_reg (regname);
30889 if (sysreg->arch_reqs)
30890 return bool (aarch64_isa_flags & sysreg->arch_reqs);
30891 return true;
30894 /* Return the generic sysreg specification for a valid system register
30895 name, otherwise NULL. WRITE_P is true iff the register is being
30896 written to. IS128OP indicates the requested system register should
30897 be checked for a 128-bit implementation. */
30898 const char *
30899 aarch64_retrieve_sysreg (const char *regname, bool write_p, bool is128op)
30901 const sysreg_t *sysreg = aarch64_lookup_sysreg_map (regname);
30902 if (sysreg == NULL)
30904 if (aarch64_is_implem_def_reg (regname))
30905 return regname;
30906 else
30907 return NULL;
30909 if (is128op && !(sysreg->properties & F_REG_128))
30910 return NULL;
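/* Reject writes to read-only registers and reads of write-only registers.  */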
30911 if ((write_p && (sysreg->properties & F_REG_READ))
30912 || (!write_p && (sysreg->properties & F_REG_WRITE)))
30913 return NULL;
30914 if ((~aarch64_isa_flags & sysreg->arch_reqs) != 0)
30915 return NULL;
30916 return sysreg->encoding;
30919 /* Report that LOCATION has a call to FNDECL in which argument ARGNO
30920 was not an integer constant expression. ARGNO counts from zero. */
30921 void
30922 aarch64::report_non_ice (location_t location, tree fndecl, unsigned int argno)
30924 error_at (location, "argument %d of %qE must be an integer constant"
30925 " expression", argno + 1, fndecl);
30928 /* Report that LOCATION has a call to FNDECL in which argument ARGNO has
30929 the value ACTUAL, whereas the function requires a value in the range
30930 [MIN, MAX]. ARGNO counts from zero. */
30931 void
30932 aarch64::report_out_of_range (location_t location, tree fndecl,
30933 unsigned int argno, HOST_WIDE_INT actual,
30934 HOST_WIDE_INT min, HOST_WIDE_INT max)
30936 if (min == max)
30937 error_at (location, "passing %wd to argument %d of %qE, which expects"
30938 " the value %wd", actual, argno + 1, fndecl, min);
30939 else
30940 error_at (location, "passing %wd to argument %d of %qE, which expects"
30941 " a value in the range [%wd, %wd]", actual, argno + 1, fndecl,
30942 min, max);
30945 /* Report that LOCATION has a call to FNDECL in which argument ARGNO has
30946 the value ACTUAL, whereas the function requires either VALUE0 or
30947 VALUE1. ARGNO counts from zero. */
30948 void
30949 aarch64::report_neither_nor (location_t location, tree fndecl,
30950 unsigned int argno, HOST_WIDE_INT actual,
30951 HOST_WIDE_INT value0, HOST_WIDE_INT value1)
30953 error_at (location, "passing %wd to argument %d of %qE, which expects"
30954 " either %wd or %wd", actual, argno + 1, fndecl, value0, value1);
30957 /* Report that LOCATION has a call to FNDECL in which argument ARGNO has
30958 the value ACTUAL, whereas the function requires one of VALUE0..3.
30959 ARGNO counts from zero. */
30960 void
30961 aarch64::report_not_one_of (location_t location, tree fndecl,
30962 unsigned int argno, HOST_WIDE_INT actual,
30963 HOST_WIDE_INT value0, HOST_WIDE_INT value1,
30964 HOST_WIDE_INT value2,
30965 HOST_WIDE_INT value3)
30967 error_at (location, "passing %wd to argument %d of %qE, which expects"
30968 " %wd, %wd, %wd or %wd", actual, argno + 1, fndecl, value0, value1,
30969 value2, value3);
30972 /* Report that LOCATION has a call to FNDECL in which argument ARGNO has
30973 the value ACTUAL, whereas the function requires a valid value of
30974 enum type ENUMTYPE. ARGNO counts from zero. */
30975 void
30976 aarch64::report_not_enum (location_t location, tree fndecl, unsigned int argno,
30977 HOST_WIDE_INT actual, tree enumtype)
30979 error_at (location, "passing %wd to argument %d of %qE, which expects"
30980 " a valid %qT value", actual, argno + 1, fndecl, enumtype);
30983 /* Generate assembly to calculate CRC
30984 using the carry-less multiplication instruction.
30985 OPERANDS[1] is input CRC,
30986 OPERANDS[2] is data (message),
30987 OPERANDS[3] is the polynomial without the leading 1. */
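/* In outline: fold DATA into the (suitably shifted) CRC, carry-less
   multiply the result by the precomputed quotient, shift the product
   right by CRC_SIZE, carry-less multiply by the polynomial, and keep the
   low CRC_MODE bits, xoring in the untouched CRC bits when DATA is
   narrower than the CRC.  */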
30989 void
30990 aarch64_expand_crc_using_pmull (scalar_mode crc_mode,
30991 scalar_mode data_mode,
30992 rtx *operands)
30994 /* Check and keep arguments. */
30995 gcc_assert (!CONST_INT_P (operands[0]));
30996 gcc_assert (CONST_INT_P (operands[3]));
30997 rtx crc = operands[1];
30998 rtx data = operands[2];
30999 rtx polynomial = operands[3];
31001 unsigned HOST_WIDE_INT crc_size = GET_MODE_BITSIZE (crc_mode);
31002 unsigned HOST_WIDE_INT data_size = GET_MODE_BITSIZE (data_mode);
31003 gcc_assert (crc_size <= 32);
31004 gcc_assert (data_size <= crc_size);
31006 /* Calculate the quotient. */
31007 unsigned HOST_WIDE_INT
31008 q = gf2n_poly_long_div_quotient (UINTVAL (polynomial), crc_size);
31009 /* CRC calculation's main part. */
31010 if (crc_size > data_size)
31011 crc = expand_shift (RSHIFT_EXPR, DImode, crc, crc_size - data_size,
31012 NULL_RTX, 1);
31014 rtx t0 = force_reg (DImode, gen_int_mode (q, DImode));
31015 polynomial = simplify_gen_unary (ZERO_EXTEND, DImode, polynomial,
31016 GET_MODE (polynomial));
31017 rtx t1 = force_reg (DImode, polynomial);
31019 rtx a0 = expand_binop (DImode, xor_optab, crc, data, NULL_RTX, 1,
31020 OPTAB_WIDEN);
31022 rtx pmull_res = gen_reg_rtx (TImode);
31023 emit_insn (gen_aarch64_crypto_pmulldi (pmull_res, a0, t0));
31024 a0 = gen_lowpart (DImode, pmull_res);
31026 a0 = expand_shift (RSHIFT_EXPR, DImode, a0, crc_size, NULL_RTX, 1);
31028 emit_insn (gen_aarch64_crypto_pmulldi (pmull_res, a0, t1));
31029 a0 = gen_lowpart (DImode, pmull_res);
31031 if (crc_size > data_size)
31033 rtx crc_part = expand_shift (LSHIFT_EXPR, DImode, operands[1], data_size,
31034 NULL_RTX, 0);
31035 a0 = expand_binop (DImode, xor_optab, a0, crc_part, NULL_RTX, 1,
31036 OPTAB_DIRECT);
31039 aarch64_emit_move (operands[0], gen_lowpart (crc_mode, a0));
31042 /* Generate assembly to calculate reversed CRC
31043 using the carry-less multiplication instruction.
31044 OPERANDS[1] is input CRC,
31045 OPERANDS[2] is data,
31046 OPERANDS[3] is the polynomial without the leading 1. */
31048 void
31049 aarch64_expand_reversed_crc_using_pmull (scalar_mode crc_mode,
31050 scalar_mode data_mode,
31051 rtx *operands)
31053 /* Check and keep arguments. */
31054 gcc_assert (!CONST_INT_P (operands[0]));
31055 gcc_assert (CONST_INT_P (operands[3]));
31056 rtx crc = operands[1];
31057 rtx data = operands[2];
31058 rtx polynomial = operands[3];
31060 unsigned HOST_WIDE_INT crc_size = GET_MODE_BITSIZE (crc_mode);
31061 unsigned HOST_WIDE_INT data_size = GET_MODE_BITSIZE (data_mode);
31062 gcc_assert (crc_size <= 32);
31063 gcc_assert (data_size <= crc_size);
31065 /* Calculate the quotient. */
31066 unsigned HOST_WIDE_INT
31067 q = gf2n_poly_long_div_quotient (UINTVAL (polynomial), crc_size);
31068 /* Reflect the calculated quotient. */
31069 q = reflect_hwi (q, crc_size + 1);
31070 rtx t0 = force_reg (DImode, gen_int_mode (q, DImode));
31072 /* Reflect the polynomial. */
31073 unsigned HOST_WIDE_INT ref_polynomial = reflect_hwi (UINTVAL (polynomial),
31074 crc_size);
31075 /* An unshifted multiplier would require the final result to be extracted
31076 using a shift right by DATA_SIZE - 1 bits. Shift the multiplier left
31077 so that the shift right can be by CRC_SIZE bits instead. */
31078 ref_polynomial <<= crc_size - data_size + 1;
31079 rtx t1 = force_reg (DImode, gen_int_mode (ref_polynomial, DImode));
31081 /* CRC calculation's main part. */
31082 rtx a0 = expand_binop (DImode, xor_optab, crc, data, NULL_RTX, 1,
31083 OPTAB_WIDEN);
31085 /* Perform carry-less multiplication and get low part. */
31086 rtx pmull_res = gen_reg_rtx (TImode);
31087 emit_insn (gen_aarch64_crypto_pmulldi (pmull_res, a0, t0));
31088 a0 = gen_lowpart (DImode, pmull_res);
31090 a0 = expand_binop (DImode, and_optab, a0,
31091 gen_int_mode (GET_MODE_MASK (data_mode), DImode),
31092 NULL_RTX, 1, OPTAB_WIDEN);
31094 /* Perform carry-less multiplication. */
31095 emit_insn (gen_aarch64_crypto_pmulldi (pmull_res, a0, t1));
31097 /* Perform a shift right by CRC_SIZE as an extraction of lane 1. */
31098 machine_mode crc_vmode = aarch64_v128_mode (crc_mode).require ();
31099 a0 = (crc_size > data_size ? gen_reg_rtx (crc_mode) : operands[0]);
31100 emit_insn (gen_aarch64_get_lane (crc_vmode, a0,
31101 gen_lowpart (crc_vmode, pmull_res),
31102 aarch64_endian_lane_rtx (crc_vmode, 1)));
31104 if (crc_size > data_size)
31106 rtx crc_part = expand_shift (RSHIFT_EXPR, crc_mode, crc, data_size,
31107 NULL_RTX, 1);
31108 a0 = expand_binop (crc_mode, xor_optab, a0, crc_part, operands[0], 1,
31109 OPTAB_WIDEN);
31110 aarch64_emit_move (operands[0], a0);
31114 /* Target-specific selftests. */
31116 #if CHECKING_P
31118 namespace selftest {
31120 /* Selftest for the RTL loader.
31121 Verify that the RTL loader copes with a dump from
31122 print_rtx_function. This is essentially just a test that class
31123 function_reader can handle a real dump, but it also verifies
31124 that lookup_reg_by_dump_name correctly handles hard regs.
31125 The presence of hard reg names in the dump means that the test is
31126 target-specific, hence it is in this file. */
31128 static void
31129 aarch64_test_loading_full_dump ()
31131 rtl_dump_test t (SELFTEST_LOCATION, locate_file ("aarch64/times-two.rtl"));
31133 ASSERT_STREQ ("times_two", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));
31135 rtx_insn *insn_1 = get_insn_by_uid (1);
31136 ASSERT_EQ (NOTE, GET_CODE (insn_1));
31138 rtx_insn *insn_15 = get_insn_by_uid (15);
31139 ASSERT_EQ (INSN, GET_CODE (insn_15));
31140 ASSERT_EQ (USE, GET_CODE (PATTERN (insn_15)));
31142 /* Verify crtl->return_rtx. */
31143 ASSERT_EQ (REG, GET_CODE (crtl->return_rtx));
31144 ASSERT_EQ (0, REGNO (crtl->return_rtx));
31145 ASSERT_EQ (SImode, GET_MODE (crtl->return_rtx));
31148 /* Test the fractional_cost class. */
31150 static void
31151 aarch64_test_fractional_cost ()
31153 using cf = fractional_cost;
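/* cf (a, b) represents the fractional cost a/b; cf (a) is the integral
   cost a.  */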
31155 ASSERT_EQ (cf (0, 20), 0);
31157 ASSERT_EQ (cf (4, 2), 2);
31158 ASSERT_EQ (3, cf (9, 3));
31160 ASSERT_NE (cf (5, 2), 2);
31161 ASSERT_NE (3, cf (8, 3));
31163 ASSERT_EQ (cf (7, 11) + cf (15, 11), 2);
31164 ASSERT_EQ (cf (2, 3) + cf (3, 5), cf (19, 15));
31165 ASSERT_EQ (cf (2, 3) + cf (1, 6) + cf (1, 6), 1);
31167 ASSERT_EQ (cf (14, 15) - cf (4, 15), cf (2, 3));
31168 ASSERT_EQ (cf (1, 4) - cf (1, 2), 0);
31169 ASSERT_EQ (cf (3, 5) - cf (1, 10), cf (1, 2));
31170 ASSERT_EQ (cf (11, 3) - 3, cf (2, 3));
31171 ASSERT_EQ (3 - cf (7, 3), cf (2, 3));
31172 ASSERT_EQ (3 - cf (10, 3), 0);
31174 ASSERT_EQ (cf (2, 3) * 5, cf (10, 3));
31175 ASSERT_EQ (14 * cf (11, 21), cf (22, 3));
31177 ASSERT_TRUE (cf (4, 15) <= cf (5, 15));
31178 ASSERT_TRUE (cf (5, 15) <= cf (5, 15));
31179 ASSERT_FALSE (cf (6, 15) <= cf (5, 15));
31180 ASSERT_TRUE (cf (1, 3) <= cf (2, 5));
31181 ASSERT_TRUE (cf (1, 12) <= cf (1, 6));
31182 ASSERT_TRUE (cf (5, 3) <= cf (5, 3));
31183 ASSERT_TRUE (cf (239, 240) <= 1);
31184 ASSERT_TRUE (cf (240, 240) <= 1);
31185 ASSERT_FALSE (cf (241, 240) <= 1);
31186 ASSERT_FALSE (2 <= cf (207, 104));
31187 ASSERT_TRUE (2 <= cf (208, 104));
31188 ASSERT_TRUE (2 <= cf (209, 104));
31190 ASSERT_TRUE (cf (4, 15) < cf (5, 15));
31191 ASSERT_FALSE (cf (5, 15) < cf (5, 15));
31192 ASSERT_FALSE (cf (6, 15) < cf (5, 15));
31193 ASSERT_TRUE (cf (1, 3) < cf (2, 5));
31194 ASSERT_TRUE (cf (1, 12) < cf (1, 6));
31195 ASSERT_FALSE (cf (5, 3) < cf (5, 3));
31196 ASSERT_TRUE (cf (239, 240) < 1);
31197 ASSERT_FALSE (cf (240, 240) < 1);
31198 ASSERT_FALSE (cf (241, 240) < 1);
31199 ASSERT_FALSE (2 < cf (207, 104));
31200 ASSERT_FALSE (2 < cf (208, 104));
31201 ASSERT_TRUE (2 < cf (209, 104));
31203 ASSERT_FALSE (cf (4, 15) >= cf (5, 15));
31204 ASSERT_TRUE (cf (5, 15) >= cf (5, 15));
31205 ASSERT_TRUE (cf (6, 15) >= cf (5, 15));
31206 ASSERT_FALSE (cf (1, 3) >= cf (2, 5));
31207 ASSERT_FALSE (cf (1, 12) >= cf (1, 6));
31208 ASSERT_TRUE (cf (5, 3) >= cf (5, 3));
31209 ASSERT_FALSE (cf (239, 240) >= 1);
31210 ASSERT_TRUE (cf (240, 240) >= 1);
31211 ASSERT_TRUE (cf (241, 240) >= 1);
31212 ASSERT_TRUE (2 >= cf (207, 104));
31213 ASSERT_TRUE (2 >= cf (208, 104));
31214 ASSERT_FALSE (2 >= cf (209, 104));
31216 ASSERT_FALSE (cf (4, 15) > cf (5, 15));
31217 ASSERT_FALSE (cf (5, 15) > cf (5, 15));
31218 ASSERT_TRUE (cf (6, 15) > cf (5, 15));
31219 ASSERT_FALSE (cf (1, 3) > cf (2, 5));
31220 ASSERT_FALSE (cf (1, 12) > cf (1, 6));
31221 ASSERT_FALSE (cf (5, 3) > cf (5, 3));
31222 ASSERT_FALSE (cf (239, 240) > 1);
31223 ASSERT_FALSE (cf (240, 240) > 1);
31224 ASSERT_TRUE (cf (241, 240) > 1);
31225 ASSERT_TRUE (2 > cf (207, 104));
31226 ASSERT_FALSE (2 > cf (208, 104));
31227 ASSERT_FALSE (2 > cf (209, 104));
31229 ASSERT_EQ (cf (1, 2).ceil (), 1);
31230 ASSERT_EQ (cf (11, 7).ceil (), 2);
31231 ASSERT_EQ (cf (20, 1).ceil (), 20);
31232 ASSERT_EQ ((cf (0xfffffffd) + 1).ceil (), 0xfffffffe);
31233 ASSERT_EQ ((cf (0xfffffffd) + 2).ceil (), 0xffffffff);
31234 ASSERT_EQ ((cf (0xfffffffd) + 3).ceil (), 0xffffffff);
31235 ASSERT_EQ ((cf (0x7fffffff) * 2).ceil (), 0xfffffffe);
31236 ASSERT_EQ ((cf (0x80000000) * 2).ceil (), 0xffffffff);
31238 ASSERT_EQ (cf (1, 2).as_double (), 0.5);
31241 /* Check whether our system register data, as imported from
31242 `aarch64-sys-regs.def', has any duplicate entries. */
31243 static void
31244 aarch64_test_sysreg_encoding_clashes (void)
31246 using dup_instances_t = hash_map<nofree_string_hash,
31247 std::vector<const sysreg_t*>>;
31249 dup_instances_t duplicate_instances;
31251 /* Every time an encoding is established to come up more than once
31252 we add it to a "clash-analysis queue", which is then used to extract
31253 necessary information from our hash map when establishing whether
31254 repeated encodings are valid. */
31256 /* 1) Collect recurrence information. */
31257 for (unsigned i = 0; i < ARRAY_SIZE (aarch64_sysregs); i++)
31259 const sysreg_t *reg = aarch64_sysregs + i;
31261 std::vector<const sysreg_t*> *tmp
31262 = &duplicate_instances.get_or_insert (reg->encoding);
31264 tmp->push_back (reg);
31267 /* 2) Carry out analysis on collected data. */
31268 for (auto instance : duplicate_instances)
31270 unsigned nrep = instance.second.size ();
31271 if (nrep > 1)
31272 for (unsigned i = 0; i < nrep; i++)
31273 for (unsigned j = i + 1; j < nrep; j++)
31275 const sysreg_t *a = instance.second[i];
31276 const sysreg_t *b = instance.second[j];
31277 ASSERT_TRUE ((a->properties != b->properties)
31278 || (a->arch_reqs != b->arch_reqs));
31283 /* Run all target-specific selftests. */
31285 static void
31286 aarch64_run_selftests (void)
31288 aarch64_test_loading_full_dump ();
31289 aarch64_test_fractional_cost ();
31290 aarch64_test_sysreg_encoding_clashes ();
31293 } // namespace selftest
31295 #endif /* #if CHECKING_P */
31297 #undef TARGET_STACK_PROTECT_GUARD
31298 #define TARGET_STACK_PROTECT_GUARD aarch64_stack_protect_guard
31300 #undef TARGET_ADDRESS_COST
31301 #define TARGET_ADDRESS_COST aarch64_address_cost
31303 /* This hook determines whether unnamed bitfields affect the alignment
31304 of the containing structure. The hook returns true if the structure
31305 should inherit the alignment requirements of an unnamed bitfield's
31306 type. */
31307 #undef TARGET_ALIGN_ANON_BITFIELD
31308 #define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true
31310 #undef TARGET_ASM_ALIGNED_DI_OP
31311 #define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"
31313 #undef TARGET_ASM_ALIGNED_HI_OP
31314 #define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"
31316 #undef TARGET_ASM_ALIGNED_SI_OP
31317 #define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"
31319 #if TARGET_PECOFF
31320 #undef TARGET_ASM_UNALIGNED_HI_OP
31321 #define TARGET_ASM_UNALIGNED_HI_OP TARGET_ASM_ALIGNED_HI_OP
31322 #undef TARGET_ASM_UNALIGNED_SI_OP
31323 #define TARGET_ASM_UNALIGNED_SI_OP TARGET_ASM_ALIGNED_SI_OP
31324 #undef TARGET_ASM_UNALIGNED_DI_OP
31325 #define TARGET_ASM_UNALIGNED_DI_OP TARGET_ASM_ALIGNED_DI_OP
31326 #endif
31328 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
31329 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
31330 hook_bool_const_tree_hwi_hwi_const_tree_true
31332 #undef TARGET_ASM_FILE_START
31333 #define TARGET_ASM_FILE_START aarch64_start_file
31335 #undef TARGET_ASM_OUTPUT_MI_THUNK
31336 #define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk
31338 #undef TARGET_ASM_SELECT_RTX_SECTION
31339 #define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section
31341 #undef TARGET_ASM_TRAMPOLINE_TEMPLATE
31342 #define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template
31344 #undef TARGET_ASM_PRINT_PATCHABLE_FUNCTION_ENTRY
31345 #define TARGET_ASM_PRINT_PATCHABLE_FUNCTION_ENTRY aarch64_print_patchable_function_entry
31347 #undef TARGET_BUILD_BUILTIN_VA_LIST
31348 #define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list
31350 #undef TARGET_CALLEE_COPIES
31351 #define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_arg_info_false
31353 #undef TARGET_FRAME_POINTER_REQUIRED
31354 #define TARGET_FRAME_POINTER_REQUIRED aarch64_frame_pointer_required
31356 #undef TARGET_CAN_ELIMINATE
31357 #define TARGET_CAN_ELIMINATE aarch64_can_eliminate
31359 #undef TARGET_FUNCTION_ATTRIBUTE_INLINABLE_P
31360 #define TARGET_FUNCTION_ATTRIBUTE_INLINABLE_P \
31361 aarch64_function_attribute_inlinable_p
31363 #undef TARGET_NEED_IPA_FN_TARGET_INFO
31364 #define TARGET_NEED_IPA_FN_TARGET_INFO aarch64_need_ipa_fn_target_info
31366 #undef TARGET_UPDATE_IPA_FN_TARGET_INFO
31367 #define TARGET_UPDATE_IPA_FN_TARGET_INFO aarch64_update_ipa_fn_target_info
31369 #undef TARGET_CAN_INLINE_P
31370 #define TARGET_CAN_INLINE_P aarch64_can_inline_p
31372 #undef TARGET_CANNOT_FORCE_CONST_MEM
31373 #define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem
31375 #undef TARGET_CASE_VALUES_THRESHOLD
31376 #define TARGET_CASE_VALUES_THRESHOLD aarch64_case_values_threshold
31378 #undef TARGET_CONDITIONAL_REGISTER_USAGE
31379 #define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
31381 #undef TARGET_MEMBER_TYPE_FORCES_BLK
31382 #define TARGET_MEMBER_TYPE_FORCES_BLK aarch64_member_type_forces_blk
31384 /* Only the least significant bit is used for initialization guard
31385 variables. */
31386 #undef TARGET_CXX_GUARD_MASK_BIT
31387 #define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true
31389 #undef TARGET_C_MODE_FOR_SUFFIX
31390 #define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix
31392 #ifdef TARGET_BIG_ENDIAN_DEFAULT
31393 #undef TARGET_DEFAULT_TARGET_FLAGS
31394 #define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
31395 #endif
31397 #undef TARGET_CLASS_MAX_NREGS
31398 #define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs
31400 #undef TARGET_BUILTIN_DECL
31401 #define TARGET_BUILTIN_DECL aarch64_builtin_decl
31403 #undef TARGET_BUILTIN_RECIPROCAL
31404 #define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal
31406 #undef TARGET_C_EXCESS_PRECISION
31407 #define TARGET_C_EXCESS_PRECISION aarch64_excess_precision
31409 #undef TARGET_C_BITINT_TYPE_INFO
31410 #define TARGET_C_BITINT_TYPE_INFO aarch64_bitint_type_info
31412 #undef TARGET_C_MODE_FOR_FLOATING_TYPE
31413 #define TARGET_C_MODE_FOR_FLOATING_TYPE aarch64_c_mode_for_floating_type
31415 #undef TARGET_EXPAND_BUILTIN
31416 #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
31418 #undef TARGET_EXPAND_BUILTIN_VA_START
31419 #define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start
31421 #undef TARGET_FOLD_BUILTIN
31422 #define TARGET_FOLD_BUILTIN aarch64_fold_builtin
31424 #undef TARGET_FUNCTION_ARG
31425 #define TARGET_FUNCTION_ARG aarch64_function_arg
31427 #undef TARGET_FUNCTION_ARG_ADVANCE
31428 #define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance
31430 #undef TARGET_FUNCTION_ARG_BOUNDARY
31431 #define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary
31433 #undef TARGET_FUNCTION_ARG_PADDING
31434 #define TARGET_FUNCTION_ARG_PADDING aarch64_function_arg_padding
31436 #undef TARGET_GET_RAW_RESULT_MODE
31437 #define TARGET_GET_RAW_RESULT_MODE aarch64_get_reg_raw_mode
31438 #undef TARGET_GET_RAW_ARG_MODE
31439 #define TARGET_GET_RAW_ARG_MODE aarch64_get_reg_raw_mode
31441 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
31442 #define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall
31444 #undef TARGET_FUNCTION_VALUE
31445 #define TARGET_FUNCTION_VALUE aarch64_function_value
31447 #undef TARGET_FUNCTION_VALUE_REGNO_P
31448 #define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p
31450 #undef TARGET_START_CALL_ARGS
31451 #define TARGET_START_CALL_ARGS aarch64_start_call_args
31453 #undef TARGET_END_CALL_ARGS
31454 #define TARGET_END_CALL_ARGS aarch64_end_call_args
31456 #undef TARGET_GIMPLE_FOLD_BUILTIN
31457 #define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin
31459 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
31460 #define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr
31462 #undef TARGET_INIT_BUILTINS
31463 #define TARGET_INIT_BUILTINS aarch64_init_builtins
31465 #undef TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
31466 #define TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS \
31467 aarch64_ira_change_pseudo_allocno_class
31469 #undef TARGET_LEGITIMATE_ADDRESS_P
31470 #define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p
31472 #undef TARGET_LEGITIMATE_CONSTANT_P
31473 #define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p
31475 #undef TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT
31476 #define TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT \
31477 aarch64_legitimize_address_displacement
31479 #undef TARGET_LIBGCC_CMP_RETURN_MODE
31480 #define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode
31482 #undef TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P
31483 #define TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P \
31484 aarch64_libgcc_floating_mode_supported_p
31486 #undef TARGET_MANGLE_TYPE
31487 #define TARGET_MANGLE_TYPE aarch64_mangle_type
31489 #undef TARGET_INVALID_CONVERSION
31490 #define TARGET_INVALID_CONVERSION aarch64_invalid_conversion
31492 #undef TARGET_INVALID_UNARY_OP
31493 #define TARGET_INVALID_UNARY_OP aarch64_invalid_unary_op
31495 #undef TARGET_INVALID_BINARY_OP
31496 #define TARGET_INVALID_BINARY_OP aarch64_invalid_binary_op
31498 #undef TARGET_VERIFY_TYPE_CONTEXT
31499 #define TARGET_VERIFY_TYPE_CONTEXT aarch64_verify_type_context
31501 #undef TARGET_MEMORY_MOVE_COST
31502 #define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
31504 #undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
31505 #define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul
31507 #undef TARGET_MUST_PASS_IN_STACK
31508 #define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size
31510 /* This target hook should return true if accesses to volatile bitfields
31511 should use the narrowest mode possible. It should return false if these
31512 accesses should use the bitfield container type. */
31513 #undef TARGET_NARROW_VOLATILE_BITFIELD
31514 #define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false
31516 #undef TARGET_OPTION_OVERRIDE
31517 #define TARGET_OPTION_OVERRIDE aarch64_override_options
31519 #undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
31520 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
31521 aarch64_override_options_after_change
31523 #undef TARGET_OFFLOAD_OPTIONS
31524 #define TARGET_OFFLOAD_OPTIONS aarch64_offload_options
31526 #undef TARGET_OPTION_RESTORE
31527 #define TARGET_OPTION_RESTORE aarch64_option_restore
31529 #undef TARGET_OPTION_PRINT
31530 #define TARGET_OPTION_PRINT aarch64_option_print
31532 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
31533 #define TARGET_OPTION_VALID_ATTRIBUTE_P aarch64_option_valid_attribute_p
31535 #undef TARGET_OPTION_VALID_VERSION_ATTRIBUTE_P
31536 #define TARGET_OPTION_VALID_VERSION_ATTRIBUTE_P \
31537 aarch64_option_valid_version_attribute_p
31539 #undef TARGET_SET_CURRENT_FUNCTION
31540 #define TARGET_SET_CURRENT_FUNCTION aarch64_set_current_function
31542 #undef TARGET_PASS_BY_REFERENCE
31543 #define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference
31545 #undef TARGET_PREFERRED_RELOAD_CLASS
31546 #define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class
31548 #undef TARGET_SCHED_REASSOCIATION_WIDTH
31549 #define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width
31551 #undef TARGET_DWARF_FRAME_REG_MODE
31552 #define TARGET_DWARF_FRAME_REG_MODE aarch64_dwarf_frame_reg_mode
31554 #undef TARGET_OUTPUT_CFI_DIRECTIVE
31555 #define TARGET_OUTPUT_CFI_DIRECTIVE aarch64_output_cfi_directive
31557 #undef TARGET_DW_CFI_OPRND1_DESC
31558 #define TARGET_DW_CFI_OPRND1_DESC aarch64_dw_cfi_oprnd1_desc
31560 #undef TARGET_PROMOTED_TYPE
31561 #define TARGET_PROMOTED_TYPE aarch64_promoted_type
31563 #undef TARGET_SECONDARY_RELOAD
31564 #define TARGET_SECONDARY_RELOAD aarch64_secondary_reload
31566 #undef TARGET_SECONDARY_MEMORY_NEEDED
31567 #define TARGET_SECONDARY_MEMORY_NEEDED aarch64_secondary_memory_needed
31569 #undef TARGET_SHIFT_TRUNCATION_MASK
31570 #define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask
31572 #undef TARGET_SETUP_INCOMING_VARARGS
31573 #define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs
31575 #undef TARGET_STRUCT_VALUE_RTX
31576 #define TARGET_STRUCT_VALUE_RTX aarch64_struct_value_rtx
31578 #undef TARGET_REGISTER_MOVE_COST
31579 #define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost
31581 #undef TARGET_RETURN_IN_MEMORY
31582 #define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory
31584 #undef TARGET_RETURN_IN_MSB
31585 #define TARGET_RETURN_IN_MSB aarch64_return_in_msb
31587 #undef TARGET_RTX_COSTS
31588 #define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper
31590 #undef TARGET_INSN_COST
31591 #define TARGET_INSN_COST aarch64_insn_cost
31593 #undef TARGET_SCALAR_MODE_SUPPORTED_P
31594 #define TARGET_SCALAR_MODE_SUPPORTED_P aarch64_scalar_mode_supported_p
31596 #undef TARGET_SCHED_ISSUE_RATE
31597 #define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate
31599 #undef TARGET_SCHED_VARIABLE_ISSUE
31600 #define TARGET_SCHED_VARIABLE_ISSUE aarch64_sched_variable_issue
31602 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
31603 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
31604 aarch64_sched_first_cycle_multipass_dfa_lookahead
31606 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD
31607 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD \
31608 aarch64_first_cycle_multipass_dfa_lookahead_guard
31610 #undef TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS
31611 #define TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS \
31612 aarch64_get_separate_components
31614 #undef TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB
31615 #define TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB \
31616 aarch64_components_for_bb
31618 #undef TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS
31619 #define TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS \
31620 aarch64_disqualify_components
31622 #undef TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS
31623 #define TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS \
31624 aarch64_emit_prologue_components
31626 #undef TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS
31627 #define TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS \
31628 aarch64_emit_epilogue_components
31630 #undef TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS
31631 #define TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS \
31632 aarch64_set_handled_components
31634 #undef TARGET_TRAMPOLINE_INIT
31635 #define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init
31637 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
31638 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p
31640 #undef TARGET_VECTOR_MODE_SUPPORTED_P
31641 #define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p
31643 #undef TARGET_VECTOR_MODE_SUPPORTED_ANY_TARGET_P
31644 #define TARGET_VECTOR_MODE_SUPPORTED_ANY_TARGET_P aarch64_vector_mode_supported_any_target_p
31646 #undef TARGET_COMPATIBLE_VECTOR_TYPES_P
31647 #define TARGET_COMPATIBLE_VECTOR_TYPES_P aarch64_compatible_vector_types_p
31649 #undef TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT
31650 #define TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT \
31651 aarch64_builtin_support_vector_misalignment
31653 #undef TARGET_ARRAY_MODE
31654 #define TARGET_ARRAY_MODE aarch64_array_mode
31656 #undef TARGET_ARRAY_MODE_SUPPORTED_P
31657 #define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p
31659 #undef TARGET_VECTORIZE_CREATE_COSTS
31660 #define TARGET_VECTORIZE_CREATE_COSTS aarch64_vectorize_create_costs
31662 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
31663 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
31664 aarch64_builtin_vectorization_cost
31666 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
31667 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode
31669 #undef TARGET_VECTORIZE_BUILTINS
31670 #define TARGET_VECTORIZE_BUILTINS
31672 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_MODES
31673 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_MODES \
31674 aarch64_autovectorize_vector_modes
31676 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
31677 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
31678 aarch64_atomic_assign_expand_fenv
31680 /* Section anchor support. */
31682 #undef TARGET_MIN_ANCHOR_OFFSET
31683 #define TARGET_MIN_ANCHOR_OFFSET -256
31685 /* Limit the maximum anchor offset to 4k-1, since that's the limit for a
31686 byte offset; we can do much more for larger data types, but have no way
31687 to determine the size of the access. We assume accesses are aligned. */
31688 #undef TARGET_MAX_ANCHOR_OFFSET
31689 #define TARGET_MAX_ANCHOR_OFFSET 4095
31691 #undef TARGET_VECTORIZE_PREFERRED_DIV_AS_SHIFTS_OVER_MULT
31692 #define TARGET_VECTORIZE_PREFERRED_DIV_AS_SHIFTS_OVER_MULT \
31693 aarch64_vectorize_preferred_div_as_shifts_over_mult
31695 #undef TARGET_VECTOR_ALIGNMENT
31696 #define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment
31698 #undef TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT
31699 #define TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT \
31700 aarch64_vectorize_preferred_vector_alignment
31701 #undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
31702 #define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
31703 aarch64_simd_vector_alignment_reachable
31705 /* vec_perm support. */
31707 #undef TARGET_VECTORIZE_VEC_PERM_CONST
31708 #define TARGET_VECTORIZE_VEC_PERM_CONST \
31709 aarch64_vectorize_vec_perm_const
31711 #undef TARGET_VECTORIZE_RELATED_MODE
31712 #define TARGET_VECTORIZE_RELATED_MODE aarch64_vectorize_related_mode
31713 #undef TARGET_VECTORIZE_GET_MASK_MODE
31714 #define TARGET_VECTORIZE_GET_MASK_MODE aarch64_get_mask_mode
31715 #undef TARGET_VECTORIZE_CONDITIONAL_OPERATION_IS_EXPENSIVE
31716 #define TARGET_VECTORIZE_CONDITIONAL_OPERATION_IS_EXPENSIVE \
31717 aarch64_conditional_operation_is_expensive
31718 #undef TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE
31719 #define TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE \
31720 aarch64_empty_mask_is_expensive
31721 #undef TARGET_PREFERRED_ELSE_VALUE
31722 #define TARGET_PREFERRED_ELSE_VALUE \
31723 aarch64_preferred_else_value
31725 #undef TARGET_INIT_LIBFUNCS
31726 #define TARGET_INIT_LIBFUNCS aarch64_init_libfuncs
31728 #undef TARGET_FIXED_CONDITION_CODE_REGS
31729 #define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs
31731 #undef TARGET_FLAGS_REGNUM
31732 #define TARGET_FLAGS_REGNUM CC_REGNUM
31734 #undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
31735 #define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
31737 #undef TARGET_ASAN_SHADOW_OFFSET
31738 #define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset
31740 #undef TARGET_LEGITIMIZE_ADDRESS
31741 #define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address
31743 #undef TARGET_SCHED_CAN_SPECULATE_INSN
31744 #define TARGET_SCHED_CAN_SPECULATE_INSN aarch64_sched_can_speculate_insn
31746 #undef TARGET_CAN_USE_DOLOOP_P
31747 #define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost
31749 #undef TARGET_SCHED_ADJUST_PRIORITY
31750 #define TARGET_SCHED_ADJUST_PRIORITY aarch64_sched_adjust_priority
31752 #undef TARGET_SCHED_MACRO_FUSION_P
31753 #define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p
31755 #undef TARGET_SCHED_MACRO_FUSION_PAIR_P
31756 #define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p
31758 #undef TARGET_SCHED_FUSION_PRIORITY
31759 #define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority
31761 #undef TARGET_UNSPEC_MAY_TRAP_P
31762 #define TARGET_UNSPEC_MAY_TRAP_P aarch64_unspec_may_trap_p
31764 #undef TARGET_USE_PSEUDO_PIC_REG
31765 #define TARGET_USE_PSEUDO_PIC_REG aarch64_use_pseudo_pic_reg
31767 #undef TARGET_PRINT_OPERAND
31768 #define TARGET_PRINT_OPERAND aarch64_print_operand
31770 #undef TARGET_PRINT_OPERAND_ADDRESS
31771 #define TARGET_PRINT_OPERAND_ADDRESS aarch64_print_operand_address
31773 #undef TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA
31774 #define TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA aarch64_output_addr_const_extra
31776 #undef TARGET_OPTAB_SUPPORTED_P
31777 #define TARGET_OPTAB_SUPPORTED_P aarch64_optab_supported_p
31779 #undef TARGET_OMIT_STRUCT_RETURN_REG
31780 #define TARGET_OMIT_STRUCT_RETURN_REG true
31782 #undef TARGET_DWARF_POLY_INDETERMINATE_VALUE
31783 #define TARGET_DWARF_POLY_INDETERMINATE_VALUE \
31784 aarch64_dwarf_poly_indeterminate_value
31786 /* The architecture reserves bits 0 and 1 so use bit 2 for descriptors. */
31787 #undef TARGET_CUSTOM_FUNCTION_DESCRIPTORS
31788 #define TARGET_CUSTOM_FUNCTION_DESCRIPTORS 4
31790 #undef TARGET_HARD_REGNO_NREGS
31791 #define TARGET_HARD_REGNO_NREGS aarch64_hard_regno_nregs
31792 #undef TARGET_HARD_REGNO_MODE_OK
31793 #define TARGET_HARD_REGNO_MODE_OK aarch64_hard_regno_mode_ok
31795 #undef TARGET_MODES_TIEABLE_P
31796 #define TARGET_MODES_TIEABLE_P aarch64_modes_tieable_p
31798 #undef TARGET_HARD_REGNO_CALL_PART_CLOBBERED
31799 #define TARGET_HARD_REGNO_CALL_PART_CLOBBERED \
31800 aarch64_hard_regno_call_part_clobbered
31802 #undef TARGET_INSN_CALLEE_ABI
31803 #define TARGET_INSN_CALLEE_ABI aarch64_insn_callee_abi
31805 #undef TARGET_CONSTANT_ALIGNMENT
31806 #define TARGET_CONSTANT_ALIGNMENT aarch64_constant_alignment
31808 #undef TARGET_STACK_CLASH_PROTECTION_ALLOCA_PROBE_RANGE
31809 #define TARGET_STACK_CLASH_PROTECTION_ALLOCA_PROBE_RANGE \
31810 aarch64_stack_clash_protection_alloca_probe_range
31812 #undef TARGET_COMPUTE_PRESSURE_CLASSES
31813 #define TARGET_COMPUTE_PRESSURE_CLASSES aarch64_compute_pressure_classes
31815 #undef TARGET_CAN_CHANGE_MODE_CLASS
31816 #define TARGET_CAN_CHANGE_MODE_CLASS aarch64_can_change_mode_class
31818 #undef TARGET_SELECT_EARLY_REMAT_MODES
31819 #define TARGET_SELECT_EARLY_REMAT_MODES aarch64_select_early_remat_modes
31821 #undef TARGET_SPECULATION_SAFE_VALUE
31822 #define TARGET_SPECULATION_SAFE_VALUE aarch64_speculation_safe_value
31824 #undef TARGET_ESTIMATED_POLY_VALUE
31825 #define TARGET_ESTIMATED_POLY_VALUE aarch64_estimated_poly_value
31827 #undef TARGET_ATTRIBUTE_TABLE
31828 #define TARGET_ATTRIBUTE_TABLE aarch64_attribute_table
31830 #undef TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN
31831 #define TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN \
31832 aarch64_simd_clone_compute_vecsize_and_simdlen
31834 #undef TARGET_SIMD_CLONE_ADJUST
31835 #define TARGET_SIMD_CLONE_ADJUST aarch64_simd_clone_adjust
31837 #undef TARGET_SIMD_CLONE_USABLE
31838 #define TARGET_SIMD_CLONE_USABLE aarch64_simd_clone_usable
31840 #undef TARGET_COMP_TYPE_ATTRIBUTES
31841 #define TARGET_COMP_TYPE_ATTRIBUTES aarch64_comp_type_attributes
31843 #undef TARGET_MERGE_DECL_ATTRIBUTES
31844 #define TARGET_MERGE_DECL_ATTRIBUTES aarch64_merge_decl_attributes
31846 #undef TARGET_GET_MULTILIB_ABI_NAME
31847 #define TARGET_GET_MULTILIB_ABI_NAME aarch64_get_multilib_abi_name
31849 #undef TARGET_FNTYPE_ABI
31850 #define TARGET_FNTYPE_ABI aarch64_fntype_abi
31852 #undef TARGET_MEMTAG_CAN_TAG_ADDRESSES
31853 #define TARGET_MEMTAG_CAN_TAG_ADDRESSES aarch64_can_tag_addresses
31855 #if CHECKING_P
31856 #undef TARGET_RUN_TARGET_SELFTESTS
31857 #define TARGET_RUN_TARGET_SELFTESTS selftest::aarch64_run_selftests
31858 #endif /* #if CHECKING_P */
31860 #undef TARGET_ASM_POST_CFI_STARTPROC
31861 #define TARGET_ASM_POST_CFI_STARTPROC aarch64_post_cfi_startproc
31863 #undef TARGET_STRICT_ARGUMENT_NAMING
31864 #define TARGET_STRICT_ARGUMENT_NAMING hook_bool_CUMULATIVE_ARGS_true
31866 #undef TARGET_MODE_EMIT
31867 #define TARGET_MODE_EMIT aarch64_mode_emit
31869 #undef TARGET_MODE_NEEDED
31870 #define TARGET_MODE_NEEDED aarch64_mode_needed
31872 #undef TARGET_MODE_AFTER
31873 #define TARGET_MODE_AFTER aarch64_mode_after
31875 #undef TARGET_MODE_CONFLUENCE
31876 #define TARGET_MODE_CONFLUENCE aarch64_mode_confluence
31878 #undef TARGET_MODE_BACKPROP
31879 #define TARGET_MODE_BACKPROP aarch64_mode_backprop
31881 #undef TARGET_MODE_ENTRY
31882 #define TARGET_MODE_ENTRY aarch64_mode_entry
31884 #undef TARGET_MODE_EXIT
31885 #define TARGET_MODE_EXIT aarch64_mode_exit
31887 #undef TARGET_MODE_EH_HANDLER
31888 #define TARGET_MODE_EH_HANDLER aarch64_mode_eh_handler
31890 #undef TARGET_MODE_PRIORITY
31891 #define TARGET_MODE_PRIORITY aarch64_mode_priority
31893 #undef TARGET_MD_ASM_ADJUST
31894 #define TARGET_MD_ASM_ADJUST aarch64_md_asm_adjust
31896 #undef TARGET_ASM_FILE_END
31897 #define TARGET_ASM_FILE_END aarch64_asm_file_end
31899 #undef TARGET_ASM_FUNCTION_EPILOGUE
31900 #define TARGET_ASM_FUNCTION_EPILOGUE aarch64_sls_emit_blr_function_thunks
31902 #undef TARGET_HAVE_SHADOW_CALL_STACK
31903 #define TARGET_HAVE_SHADOW_CALL_STACK true
31905 #undef TARGET_CONST_ANCHOR
31906 #define TARGET_CONST_ANCHOR 0x1000000
31908 #undef TARGET_EXTRA_LIVE_ON_ENTRY
31909 #define TARGET_EXTRA_LIVE_ON_ENTRY aarch64_extra_live_on_entry
31911 #undef TARGET_USE_LATE_PROLOGUE_EPILOGUE
31912 #define TARGET_USE_LATE_PROLOGUE_EPILOGUE aarch64_use_late_prologue_epilogue
31914 #undef TARGET_EMIT_EPILOGUE_FOR_SIBCALL
31915 #define TARGET_EMIT_EPILOGUE_FOR_SIBCALL aarch64_expand_epilogue
31917 #undef TARGET_OPTION_FUNCTION_VERSIONS
31918 #define TARGET_OPTION_FUNCTION_VERSIONS aarch64_common_function_versions
31920 #undef TARGET_COMPARE_VERSION_PRIORITY
31921 #define TARGET_COMPARE_VERSION_PRIORITY aarch64_compare_version_priority
31923 #undef TARGET_GENERATE_VERSION_DISPATCHER_BODY
31924 #define TARGET_GENERATE_VERSION_DISPATCHER_BODY \
31925 aarch64_generate_version_dispatcher_body
31927 #undef TARGET_GET_FUNCTION_VERSIONS_DISPATCHER
31928 #define TARGET_GET_FUNCTION_VERSIONS_DISPATCHER \
31929 aarch64_get_function_versions_dispatcher
31931 #undef TARGET_MANGLE_DECL_ASSEMBLER_NAME
31932 #define TARGET_MANGLE_DECL_ASSEMBLER_NAME aarch64_mangle_decl_assembler_name
31934 #undef TARGET_DOCUMENTATION_NAME
31935 #define TARGET_DOCUMENTATION_NAME "AArch64"
31937 struct gcc_target targetm = TARGET_INITIALIZER;
31939 #include "gt-aarch64.h"