/* Machine description for AArch64 architecture.
   Copyright (C) 2009-2024 Free Software Foundation, Inc.
   Contributed by ARM Ltd.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify it
   under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   GCC is distributed in the hope that it will be useful, but
   WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with GCC; see the file COPYING3.  If not see
   <http://www.gnu.org/licenses/>.  */
#define IN_TARGET_CODE 1

#define INCLUDE_STRING
#define INCLUDE_ALGORITHM
#define INCLUDE_MEMORY
#define INCLUDE_VECTOR
#include "coretypes.h"
#include "stringpool.h"
#include "diagnostic.h"
#include "insn-attr.h"
#include "fold-const.h"
#include "stor-layout.h"
#include "langhooks.h"
#include "dwarf2out.h"
#include "gimple-iterator.h"
#include "tree-vectorizer.h"
#include "aarch64-cost-tables.h"
#include "tm-constrs.h"
#include "sched-int.h"
#include "target-globals.h"
#include "common/common-target.h"
#include "selftest-rtl.h"
#include "rtx-vector-builder.h"
#include "function-abi.h"
#include "gimple-pretty-print.h"
#include "tree-ssa-loop-niter.h"
#include "fractional-cost.h"
#include "aarch64-feature-deps.h"
#include "config/arm/aarch-common.h"
#include "config/arm/aarch-common-protos.h"
#include "common/config/aarch64/cpuinfo.h"
#include "tree-pass.h"
#include "symbol-summary.h"
#include "ipa-fnsummary.h"
#include "hash-map.h"

/* This file should be included last.  */
#include "target-def.h"

/* Defined for convenience.  */
#define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)

/* Maximum bytes set for an inline memset expansion.  With -Os use 3 STP
   and 1 MOVI/DUP (same size as a call).  */
#define MAX_SET_SIZE(speed) (speed ? 256 : 96)
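
/* Illustrative note: with this limit, a speed-optimized memset expansion may
   cover up to 256 bytes inline, while -Os caps it at 96 bytes, which matches
   the 3 STP Q-register stores plus one MOVI/DUP mentioned above.  */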
/* Flags that describe how a function shares certain architectural state
   with its callers.

   - AARCH64_STATE_SHARED indicates that the function does share the state
     with callers.

   - AARCH64_STATE_IN indicates that the function reads (or might read) the
     incoming state.  The converse is that the function ignores the incoming
     state.

   - AARCH64_STATE_OUT indicates that the function returns new state.
     The converse is that the state on return is the same as it was on entry.

   A function that partially modifies the state treats it as both IN
   and OUT (because the value on return depends to some extent on the
   value on entry).  */
constexpr auto AARCH64_STATE_SHARED = 1U << 0;
constexpr auto AARCH64_STATE_IN = 1U << 1;
constexpr auto AARCH64_STATE_OUT = 1U << 2;
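
/* For example (following aarch64_attribute_shared_state_flags below), an
   arm::in attribute maps to SHARED | IN, arm::out to SHARED | OUT,
   arm::inout to SHARED | IN | OUT and arm::preserves to SHARED alone.  */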
/* Information about a legitimate vector immediate operand.  */
struct simd_immediate_info
{
  enum insn_type { MOV, MVN, INDEX, PTRUE };
  enum modifier_type { LSL, MSL };

  simd_immediate_info () {}
  simd_immediate_info (scalar_float_mode, rtx);
  simd_immediate_info (scalar_int_mode, unsigned HOST_WIDE_INT,
		       insn_type = MOV, modifier_type = LSL,
		       unsigned int = 0);
  simd_immediate_info (scalar_mode, rtx, rtx);
  simd_immediate_info (scalar_int_mode, aarch64_svpattern);

  /* The mode of the elements.  */
  scalar_mode elt_mode;

  /* The instruction to use to move the immediate into a vector.  */
  insn_type insn;

  union
  {
    /* For MOV and MVN.  */
    struct
    {
      /* The value of each element.  */
      rtx value;

      /* The kind of shift modifier to use, and the number of bits to shift.
	 This is (LSL, 0) if no shift is needed.  */
      modifier_type modifier;
      unsigned int shift;
    } mov;

    /* For INDEX.  */
    struct
    {
      /* The value of the first element and the step to be added for each
	 subsequent element.  */
      rtx base, step;
    } index;

    /* For PTRUE.  */
    aarch64_svpattern pattern;
  } u;
};
/* Construct a floating-point immediate in which each element has mode
   ELT_MODE_IN and value VALUE_IN.  */
inline simd_immediate_info
::simd_immediate_info (scalar_float_mode elt_mode_in, rtx value_in)
  : elt_mode (elt_mode_in), insn (MOV)
{
  u.mov.value = value_in;
  u.mov.modifier = LSL;
  u.mov.shift = 0;
}
/* Construct an integer immediate in which each element has mode ELT_MODE_IN
   and value VALUE_IN.  The other parameters are as for the structure
   fields.  */
inline simd_immediate_info
::simd_immediate_info (scalar_int_mode elt_mode_in,
		       unsigned HOST_WIDE_INT value_in,
		       insn_type insn_in, modifier_type modifier_in,
		       unsigned int shift_in)
  : elt_mode (elt_mode_in), insn (insn_in)
{
  u.mov.value = gen_int_mode (value_in, elt_mode_in);
  u.mov.modifier = modifier_in;
  u.mov.shift = shift_in;
}
/* Construct an integer immediate in which each element has mode ELT_MODE_IN
   and where element I is equal to BASE_IN + I * STEP_IN.  */
inline simd_immediate_info
::simd_immediate_info (scalar_mode elt_mode_in, rtx base_in, rtx step_in)
  : elt_mode (elt_mode_in), insn (INDEX)
{
  u.index.base = base_in;
  u.index.step = step_in;
}
/* Construct a predicate that controls elements of mode ELT_MODE_IN
   and has PTRUE pattern PATTERN_IN.  */
inline simd_immediate_info
::simd_immediate_info (scalar_int_mode elt_mode_in,
		       aarch64_svpattern pattern_in)
  : elt_mode (elt_mode_in), insn (PTRUE)
{
  u.pattern = pattern_in;
}
/* Describes types that map to Pure Scalable Types (PSTs) in the AAPCS64.  */
class pure_scalable_type_info
{
public:
  /* Represents the result of analyzing a type.  All values are nonzero,
     in the possibly forlorn hope that accidental conversions to bool
     trigger a warning.  */
  enum analysis_result
  {
    /* The type does not have an ABI identity; i.e. it doesn't contain
       at least one object whose type is a Fundamental Data Type.  */
    NO_ABI_IDENTITY = 1,

    /* The type is definitely a Pure Scalable Type.  */
    IS_PST,

    /* The type is definitely not a Pure Scalable Type.  */
    ISNT_PST,

    /* It doesn't matter for PCS purposes whether the type is a Pure
       Scalable Type or not, since the type will be handled the same
       way regardless.

       Specifically, this means that if the type is a Pure Scalable Type,
       there aren't enough argument registers to hold it, and so it will
       need to be passed or returned in memory.  If the type isn't a
       Pure Scalable Type, it's too big to be passed or returned in core
       or SIMD&FP registers, and so again will need to go in memory.  */
    DOESNT_MATTER
  };

  /* Aggregates of 17 bytes or more are normally passed and returned
     in memory, so aggregates of that size can safely be analyzed as
     DOESNT_MATTER.  We need to be able to collect enough pieces to
     represent a PST that is smaller than that.  Since predicates are
     2 bytes in size for -msve-vector-bits=128, that means we need to be
     able to store at least 8 pieces.

     We also need to be able to store enough pieces to represent
     a single vector in each vector argument register and a single
     predicate in each predicate argument register.  This means that
     we need at least 12 pieces.  */
  static const unsigned int MAX_PIECES = NUM_FP_ARG_REGS + NUM_PR_ARG_REGS;
  static_assert (MAX_PIECES >= 8, "Need to store at least 8 predicates");

  /* Describes one piece of a PST.  Each piece is one of:

     - a single Scalable Vector Type (SVT)
     - a single Scalable Predicate Type (SPT)
     - a PST containing 2, 3 or 4 SVTs, with no padding

     It either represents a single built-in type or a PST formed from
     multiple homogeneous built-in types.  */
  struct piece
  {
    rtx get_rtx (unsigned int, unsigned int) const;

    /* The number of vector and predicate registers that the piece
       occupies.  One of the two is always zero.  */
    unsigned int num_zr;
    unsigned int num_pr;

    /* The mode of the registers described above.  */
    machine_mode mode;

    /* If this piece is formed from multiple homogeneous built-in types,
       this is the mode of the built-in types, otherwise it is MODE.  */
    machine_mode orig_mode;

    /* The offset in bytes of the piece from the start of the type.  */
    poly_uint64 offset;
  };

  /* Divides types analyzed as IS_PST into individual pieces.  The pieces
     are in memory order.  */
  auto_vec<piece, MAX_PIECES> pieces;

  unsigned int num_zr () const;
  unsigned int num_pr () const;

  rtx get_rtx (machine_mode mode, unsigned int, unsigned int) const;

  analysis_result analyze (const_tree);
  bool analyze_registers (const_tree);

private:
  analysis_result analyze_array (const_tree);
  analysis_result analyze_record (const_tree);
  void add_piece (const piece &);
};
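
/* Illustrative example (per the AAPCS64 rules referred to above): a C
   structure containing one svfloat32_t member and one svbool_t member is a
   Pure Scalable Type; its analysis yields two pieces, one occupying a single
   vector register and one occupying a single predicate register.  */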
/* The current code model.  */
enum aarch64_code_model aarch64_cmodel;

enum aarch64_tp_reg aarch64_tpidr_register;

/* The number of 64-bit elements in an SVE vector.  */
poly_uint16 aarch64_sve_vg;

#undef TARGET_HAVE_TLS
#define TARGET_HAVE_TLS 1
static bool aarch64_composite_type_p (const_tree, machine_mode);
static bool aarch64_return_in_memory_1 (const_tree);
static bool aarch64_vfp_is_call_or_return_candidate (machine_mode,
						     const_tree,
						     machine_mode *, int *,
						     bool *);
static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED;
static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
static void aarch64_override_options_after_change (void);
static bool aarch64_vector_mode_supported_p (machine_mode);
static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool);
static bool aarch64_builtin_support_vector_misalignment (machine_mode mode,
							 const_tree type,
							 int misalignment,
							 bool is_packed);
static machine_mode aarch64_simd_container_mode (scalar_mode, poly_int64);
static bool aarch64_print_address_internal (FILE*, machine_mode, rtx,
					    aarch64_addr_query_type);
/* The processor for which instructions should be scheduled.  */
enum aarch64_processor aarch64_tune = cortexa53;

/* Global flag for PC relative loads.  */
bool aarch64_pcrelative_literal_loads;

/* Global flag for whether frame pointer is enabled.  */
bool aarch64_use_frame_pointer;

/* Support for command line parsing of boolean flags in the tuning
   structures.  */
struct aarch64_flag_desc
{
  const char *name;
  unsigned int flag;
};
#define AARCH64_FUSION_PAIR(name, internal_name) \
  { name, AARCH64_FUSE_##internal_name },
static const struct aarch64_flag_desc aarch64_fusible_pairs[] =
{
  { "none", AARCH64_FUSE_NOTHING },
#include "aarch64-fusion-pairs.def"
  { "all", AARCH64_FUSE_ALL },
  { NULL, AARCH64_FUSE_NOTHING }
};

#define AARCH64_EXTRA_TUNING_OPTION(name, internal_name) \
  { name, AARCH64_EXTRA_TUNE_##internal_name },
static const struct aarch64_flag_desc aarch64_tuning_flags[] =
{
  { "none", AARCH64_EXTRA_TUNE_NONE },
#include "aarch64-tuning-flags.def"
  { "all", AARCH64_EXTRA_TUNE_ALL },
  { NULL, AARCH64_EXTRA_TUNE_NONE }
};
/* Tuning parameters.  */
#include "tuning_models/generic.h"
#include "tuning_models/generic_armv8_a.h"
#include "tuning_models/generic_armv9_a.h"
#include "tuning_models/cortexa35.h"
#include "tuning_models/cortexa53.h"
#include "tuning_models/cortexa57.h"
#include "tuning_models/cortexa72.h"
#include "tuning_models/cortexa73.h"
#include "tuning_models/cortexx925.h"
#include "tuning_models/exynosm1.h"
#include "tuning_models/thunderxt88.h"
#include "tuning_models/thunderx.h"
#include "tuning_models/tsv110.h"
#include "tuning_models/xgene1.h"
#include "tuning_models/emag.h"
#include "tuning_models/qdf24xx.h"
#include "tuning_models/saphira.h"
#include "tuning_models/thunderx2t99.h"
#include "tuning_models/thunderx3t110.h"
#include "tuning_models/neoversen1.h"
#include "tuning_models/ampere1.h"
#include "tuning_models/ampere1a.h"
#include "tuning_models/ampere1b.h"
#include "tuning_models/neoversev1.h"
#include "tuning_models/neoverse512tvb.h"
#include "tuning_models/neoversen2.h"
#include "tuning_models/neoversen3.h"
#include "tuning_models/neoversev2.h"
#include "tuning_models/neoversev3.h"
#include "tuning_models/neoversev3ae.h"
#include "tuning_models/a64fx.h"
/* Support for fine-grained override of the tuning structures.  */
struct aarch64_tuning_override_function
{
  const char *name;
  void (*parse_override)(const char*, struct tune_params*);
};

static void aarch64_parse_fuse_string (const char*, struct tune_params*);
static void aarch64_parse_tune_string (const char*, struct tune_params*);
static void aarch64_parse_sve_width_string (const char*, struct tune_params*);

static const struct aarch64_tuning_override_function
aarch64_tuning_override_functions[] =
{
  { "fuse", aarch64_parse_fuse_string },
  { "tune", aarch64_parse_tune_string },
  { "sve_width", aarch64_parse_sve_width_string },
  { NULL, NULL }
};
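
/* These parsers implement the fine-grained -moverride option; for example,
   -moverride=sve_width=256 hands "256" to aarch64_parse_sve_width_string and
   -moverride=fuse=... hands its value to aarch64_parse_fuse_string.  (This is
   an illustrative note; see the parsers for the exact accepted syntax.)  */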
/* A processor implementing AArch64.  */
struct processor
{
  const char *name;
  aarch64_processor ident;
  aarch64_processor sched_core;
  aarch64_arch arch;
  aarch64_feature_flags flags;
  const tune_params *tune;
};

/* Architectures implementing AArch64.  */
static CONSTEXPR const processor all_architectures[] =
{
#define AARCH64_ARCH(NAME, CORE, ARCH_IDENT, D, E) \
  {NAME, CORE, CORE, AARCH64_ARCH_##ARCH_IDENT, \
   feature_deps::ARCH_IDENT ().enable, NULL},
#include "aarch64-arches.def"
  {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, NULL}
};

/* Processor cores implementing AArch64.  */
static const struct processor all_cores[] =
{
#define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, E, COSTS, G, H, I) \
  {NAME, IDENT, SCHED, AARCH64_ARCH_##ARCH, \
   feature_deps::cpu_##IDENT, &COSTS##_tunings},
#include "aarch64-cores.def"
  {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, NULL}
};
/* Internal representation of system registers.  */
typedef struct
{
  const char *name;
  /* Stringified sysreg encoding values, represented as
     s<sn>_<op1>_c<cn>_c<cm>_<op2>.  */
  const char *encoding;
  /* Flags affecting sysreg usage, such as read/write-only.  */
  unsigned properties;
  /* Architectural features implied by sysreg.  */
  aarch64_feature_flags arch_reqs;
} sysreg_t;
/* An aarch64_feature_set initializer for a single feature,
   AARCH64_FEATURE_<FEAT>.  */
#define AARCH64_FEATURE(FEAT) AARCH64_FL_##FEAT

/* Used by AARCH64_FEATURES.  */
#define AARCH64_OR_FEATURES_1(X, F1) \
  AARCH64_FEATURE (F1)
#define AARCH64_OR_FEATURES_2(X, F1, F2) \
  (AARCH64_FEATURE (F1) | AARCH64_OR_FEATURES_1 (X, F2))
#define AARCH64_OR_FEATURES_3(X, F1, ...) \
  (AARCH64_FEATURE (F1) | AARCH64_OR_FEATURES_2 (X, __VA_ARGS__))

/* An aarch64_feature_set initializer for the N features listed in "...".  */
#define AARCH64_FEATURES(N, ...) \
  AARCH64_OR_FEATURES_##N (0, __VA_ARGS__)

#define AARCH64_NO_FEATURES 0
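
/* As an illustration of the macros above, AARCH64_FEATURES (2, SVE, SME)
   expands via AARCH64_OR_FEATURES_2 to the bitwise OR of AARCH64_FL_SVE and
   AARCH64_FL_SME (assuming both feature flags are defined).  */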
/* Flags associated with the properties of system registers.  It mainly serves
   to mark particular registers as read or write only.  */
#define F_DEPRECATED (1 << 1)
#define F_REG_READ (1 << 2)
#define F_REG_WRITE (1 << 3)
#define F_ARCHEXT (1 << 4)
/* Flag indicating register name is alias for another system register.  */
#define F_REG_ALIAS (1 << 5)
/* Flag indicating registers which may be implemented with 128 bits.  */
#define F_REG_128 (1 << 6)
/* Database of system registers, their encodings and architectural
   feature requirements.  */
const sysreg_t aarch64_sysregs[] =
{
#define CPENC(SN, OP1, CN, CM, OP2) "s"#SN"_"#OP1"_c"#CN"_c"#CM"_"#OP2
#define SYSREG(NAME, ENC, FLAGS, ARCH) \
  { NAME, ENC, FLAGS, ARCH },
#include "aarch64-sys-regs.def"
};

#undef AARCH64_NO_FEATURES

using sysreg_map_t = hash_map<nofree_string_hash, const sysreg_t *>;

static sysreg_map_t *sysreg_map = nullptr;
/* Map system register names to their hardware metadata: encoding,
   feature flags and architectural feature requirements, all of which
   are encoded in a sysreg_t struct.  */
static void
aarch64_register_sysreg (const char *name, const sysreg_t *metadata)
{
  bool dup = sysreg_map->put (name, metadata);
  gcc_checking_assert (!dup);
}
/* Lazily initialize hash table for system register validation,
   checking the validity of supplied register name and returning
   register's associated metadata.  */
static void
aarch64_init_sysregs (void)
{
  gcc_assert (!sysreg_map);
  sysreg_map = new sysreg_map_t;

  for (unsigned i = 0; i < ARRAY_SIZE (aarch64_sysregs); i++)
    {
      const sysreg_t *reg = aarch64_sysregs + i;
      aarch64_register_sysreg (reg->name, reg);
    }
}
/* No direct access to the sysreg hash-map should be made.  Doing so
   risks trying to access an uninitialized hash-map and dereferencing the
   returned double pointer without due care risks dereferencing a
   null pointer.  */
const sysreg_t *
aarch64_lookup_sysreg_map (const char *regname)
{
  if (!sysreg_map)
    aarch64_init_sysregs ();

  const sysreg_t **sysreg_entry = sysreg_map->get (regname);
  if (sysreg_entry != NULL)
    return *sysreg_entry;
  return NULL;
}
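
/* For example, aarch64_lookup_sysreg_map ("tpidr_el0") returns the sysreg_t
   entry for TPIDR_EL0 provided that register is listed in
   aarch64-sys-regs.def, and NULL for an unrecognized name.  */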
/* The current tuning set.  */
struct tune_params aarch64_tune_params = generic_tunings;
/* If NAME is the name of an arm:: attribute that describes shared state,
   return its associated AARCH64_STATE_* flags, otherwise return 0.  */
static unsigned int
aarch64_attribute_shared_state_flags (const char *name)
{
  if (strcmp (name, "in") == 0)
    return AARCH64_STATE_SHARED | AARCH64_STATE_IN;
  if (strcmp (name, "inout") == 0)
    return AARCH64_STATE_SHARED | AARCH64_STATE_IN | AARCH64_STATE_OUT;
  if (strcmp (name, "out") == 0)
    return AARCH64_STATE_SHARED | AARCH64_STATE_OUT;
  if (strcmp (name, "preserves") == 0)
    return AARCH64_STATE_SHARED;
  return 0;
}
/* See whether attribute list ATTRS has any sharing information
   for state STATE_NAME.  Return the associated state flags if so,
   otherwise return 0.  */
static unsigned int
aarch64_lookup_shared_state_flags (tree attrs, const char *state_name)
{
  for (tree attr = attrs; attr; attr = TREE_CHAIN (attr))
    {
      if (!is_attribute_namespace_p ("arm", attr))
	continue;

      auto attr_name = IDENTIFIER_POINTER (get_attribute_name (attr));
      auto flags = aarch64_attribute_shared_state_flags (attr_name);
      if (!flags)
	continue;

      for (tree arg = TREE_VALUE (attr); arg; arg = TREE_CHAIN (arg))
	{
	  tree value = TREE_VALUE (arg);
	  if (TREE_CODE (value) == STRING_CST
	      && strcmp (TREE_STRING_POINTER (value), state_name) == 0)
	    return flags;
	}
    }
  return 0;
}
/* Return true if DECL creates a new scope for state STATE_STRING.  */
static bool
aarch64_fndecl_has_new_state (const_tree decl, const char *state_name)
{
  if (tree attr = lookup_attribute ("arm", "new", DECL_ATTRIBUTES (decl)))
    for (tree arg = TREE_VALUE (attr); arg; arg = TREE_CHAIN (arg))
      {
	tree value = TREE_VALUE (arg);
	if (TREE_CODE (value) == STRING_CST
	    && strcmp (TREE_STRING_POINTER (value), state_name) == 0)
	  return true;
      }
  return false;
}
/* Return true if attribute argument VALUE is a recognized state string,
   otherwise report an error.  NAME is the name of the attribute to which
   VALUE is being passed.  */
static bool
aarch64_check_state_string (tree name, tree value)
{
  if (TREE_CODE (value) != STRING_CST)
    {
      error ("the arguments to %qE must be constant strings", name);
      return false;
    }

  const char *state_name = TREE_STRING_POINTER (value);
  if (strcmp (state_name, "za") != 0
      && strcmp (state_name, "zt0") != 0)
    {
      error ("unrecognized state string %qs", state_name);
      return false;
    }

  return true;
}
/* qsort callback to compare two STRING_CSTs.  */
static int
cmp_string_csts (const void *a, const void *b)
{
  return strcmp (TREE_STRING_POINTER (*(const_tree const *) a),
		 TREE_STRING_POINTER (*(const_tree const *) b));
}
/* Canonicalize a list of state strings.  ARGS contains the arguments to
   a new attribute while OLD_ATTR, if nonnull, contains a previous attribute
   of the same type.  If CAN_MERGE_IN_PLACE, it is safe to adjust OLD_ATTR's
   arguments and drop the new attribute.  Otherwise, the new attribute must
   be kept and ARGS must include the information in OLD_ATTR.

   In both cases, the new arguments must be a sorted list of state strings
   with duplicates removed.

   Return true if the new attribute should be kept, false if it should be
   dropped.  */
static bool
aarch64_merge_string_arguments (tree args, tree old_attr,
				bool can_merge_in_place)
{
  /* Get a sorted list of all state strings (including duplicates).  */
  auto add_args = [](vec<tree> &strings, const_tree args)
    {
      for (const_tree arg = args; arg; arg = TREE_CHAIN (arg))
	if (TREE_CODE (TREE_VALUE (arg)) == STRING_CST)
	  strings.safe_push (TREE_VALUE (arg));
    };
  auto_vec<tree, 16> strings;
  add_args (strings, args);
  if (old_attr)
    add_args (strings, TREE_VALUE (old_attr));
  strings.qsort (cmp_string_csts);

  /* The list can be empty if there was no previous attribute and if all
     the new arguments are erroneous.  Drop the attribute in that case.  */
  if (strings.is_empty ())
    return false;

  /* Destructively modify one of the argument lists, removing duplicates
     on the fly.  */
  bool use_old_attr = old_attr && can_merge_in_place;
  tree *end = use_old_attr ? &TREE_VALUE (old_attr) : &args;
  tree prev = NULL_TREE;
  for (tree arg : strings)
    {
      if (prev && simple_cst_equal (arg, prev))
	continue;
      prev = arg;
      if (!*end)
	*end = tree_cons (NULL_TREE, arg, NULL_TREE);
      else
	TREE_VALUE (*end) = arg;
      end = &TREE_CHAIN (*end);
    }

  return !use_old_attr;
}
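
/* Illustrative example: merging a new ("za") argument list against an
   existing ("zt0", "za") attribute produces the sorted, duplicate-free
   list ("za", "zt0").  */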
/* Check whether an 'aarch64_vector_pcs' attribute is valid.  */

static tree
handle_aarch64_vector_pcs_attribute (tree *node, tree name, tree,
				     int, bool *no_add_attrs)
{
  /* Since we set fn_type_req to true, the caller should have checked
     this for us.  */
  gcc_assert (FUNC_OR_METHOD_TYPE_P (*node));
  switch ((arm_pcs) fntype_abi (*node).id ())
    {
    case ARM_PCS_AAPCS64:
    case ARM_PCS_SIMD:
      return NULL_TREE;

    case ARM_PCS_SVE:
      error ("the %qE attribute cannot be applied to an SVE function type",
	     name);
      *no_add_attrs = true;
      return NULL_TREE;

    case ARM_PCS_TLSDESC:
    case ARM_PCS_UNKNOWN:
      break;
    }
  return NULL_TREE;
}
/* Return true if arm::new(ARGS) is compatible with the type of decl DECL,
   otherwise report an error.  */
static bool
aarch64_check_arm_new_against_type (tree args, tree decl)
{
  tree type_attrs = TYPE_ATTRIBUTES (TREE_TYPE (decl));
  for (tree arg = args; arg; arg = TREE_CHAIN (arg))
    {
      tree value = TREE_VALUE (arg);
      if (TREE_CODE (value) == STRING_CST)
	{
	  const char *state_name = TREE_STRING_POINTER (value);
	  if (aarch64_lookup_shared_state_flags (type_attrs, state_name))
	    {
	      error_at (DECL_SOURCE_LOCATION (decl),
			"cannot create a new %qs scope since %qs is shared"
			" with callers", state_name, state_name);
	      return false;
	    }
	}
    }
  return true;
}
/* Callback for arm::new attributes.  */
static tree
handle_arm_new (tree *node, tree name, tree args, int, bool *no_add_attrs)
{
  tree decl = *node;
  if (TREE_CODE (decl) != FUNCTION_DECL)
    {
      error ("%qE attribute applies only to function definitions", name);
      *no_add_attrs = true;
      return NULL_TREE;
    }
  if (TREE_TYPE (decl) == error_mark_node)
    {
      *no_add_attrs = true;
      return NULL_TREE;
    }

  for (tree arg = args; arg; arg = TREE_CHAIN (arg))
    aarch64_check_state_string (name, TREE_VALUE (arg));

  if (!aarch64_check_arm_new_against_type (args, decl))
    {
      *no_add_attrs = true;
      return NULL_TREE;
    }

  /* If there is an old attribute, we should try to update it in-place,
     so that there is only one (definitive) arm::new attribute on the decl.  */
  tree old_attr = lookup_attribute ("arm", "new", DECL_ATTRIBUTES (decl));
  if (!aarch64_merge_string_arguments (args, old_attr, true))
    *no_add_attrs = true;

  return NULL_TREE;
}
/* Callback for arm::{in,out,inout,preserves} attributes.  */
static tree
handle_arm_shared (tree *node, tree name, tree args,
		   int, bool *no_add_attrs)
{
  tree type = *node;
  tree old_attrs = TYPE_ATTRIBUTES (type);
  auto flags = aarch64_attribute_shared_state_flags (IDENTIFIER_POINTER (name));
  for (tree arg = args; arg; arg = TREE_CHAIN (arg))
    {
      tree value = TREE_VALUE (arg);
      if (aarch64_check_state_string (name, value))
	{
	  const char *state_name = TREE_STRING_POINTER (value);
	  auto old_flags = aarch64_lookup_shared_state_flags (old_attrs,
							      state_name);
	  if (old_flags && old_flags != flags)
	    {
	      error ("inconsistent attributes for state %qs", state_name);
	      *no_add_attrs = true;
	      return NULL_TREE;
	    }
	}
    }

  /* We can't update an old attribute in-place, since types are shared.
     Instead make sure that this new attribute contains all the
     information, so that the old attribute becomes redundant.  */
  tree old_attr = lookup_attribute ("arm", IDENTIFIER_POINTER (name),
				    old_attrs);
  if (!aarch64_merge_string_arguments (args, old_attr, false))
    *no_add_attrs = true;

  return NULL_TREE;
}
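
/* For reference, these handlers back ACLE spellings such as
   __arm_inout("za") on a function type and __arm_new("za") on a function
   definition, which the front ends lower to the arm::inout and arm::new
   attributes handled above.  (Illustrative note only.)  */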
/* Mutually-exclusive function type attributes for controlling PSTATE.SM.  */
static const struct attribute_spec::exclusions attr_streaming_exclusions[] =
{
  /* Attribute name	exclusion applies to:
			function, type, variable */
  { "streaming", false, true, false },
  { "streaming_compatible", false, true, false },
  { NULL, false, false, false }
};
/* Table of machine attributes.  */
static const attribute_spec aarch64_gnu_attributes[] =
{
  /* { name, min_len, max_len, decl_req, type_req, fn_type_req,
       affects_type_identity, handler, exclude } */
  { "aarch64_vector_pcs", 0, 0, false, true, true, true,
    handle_aarch64_vector_pcs_attribute, NULL },
  { "arm_sve_vector_bits", 1, 1, false, true, false, true,
    aarch64_sve::handle_arm_sve_vector_bits_attribute,
    NULL },
  { "Advanced SIMD type", 1, 1, false, true, false, true, NULL, NULL },
  { "SVE type", 3, 3, false, true, false, true, NULL, NULL },
  { "SVE sizeless type", 0, 0, false, true, false, true, NULL, NULL },
#if TARGET_DLLIMPORT_DECL_ATTRIBUTES
  { "dllimport", 0, 0, false, false, false, false, handle_dll_attribute, NULL },
  { "dllexport", 0, 0, false, false, false, false, handle_dll_attribute, NULL },
#endif
#ifdef SUBTARGET_ATTRIBUTE_TABLE
  SUBTARGET_ATTRIBUTE_TABLE
#endif
};

static const scoped_attribute_specs aarch64_gnu_attribute_table =
{
  "gnu", { aarch64_gnu_attributes }
};
static const attribute_spec aarch64_arm_attributes[] =
{
  { "streaming", 0, 0, false, true, true, true,
    NULL, attr_streaming_exclusions },
  { "streaming_compatible", 0, 0, false, true, true, true,
    NULL, attr_streaming_exclusions },
  { "locally_streaming", 0, 0, true, false, false, false, NULL, NULL },
  { "new", 1, -1, true, false, false, false,
    handle_arm_new, NULL },
  { "preserves", 1, -1, false, true, true, true,
    handle_arm_shared, NULL },
  { "in", 1, -1, false, true, true, true,
    handle_arm_shared, NULL },
  { "out", 1, -1, false, true, true, true,
    handle_arm_shared, NULL },
  { "inout", 1, -1, false, true, true, true,
    handle_arm_shared, NULL }
};

static const scoped_attribute_specs aarch64_arm_attribute_table =
{
  "arm", { aarch64_arm_attributes }
};

static const scoped_attribute_specs *const aarch64_attribute_table[] =
{
  &aarch64_gnu_attribute_table,
  &aarch64_arm_attribute_table
};
typedef enum aarch64_cond_code
{
  AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
  AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
  AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV
} aarch64_cc;

#define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
918 static const char * const aarch64_condition_codes
[] =
920 "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
921 "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
924 /* The preferred condition codes for SVE conditions. */
925 static const char *const aarch64_sve_condition_codes
[] =
927 "none", "any", "nlast", "last", "first", "nfrst", "vs", "vc",
928 "pmore", "plast", "tcont", "tstop", "gt", "le", "al", "nv"
/* Return the assembly token for svpattern value VALUE.  */
static const char *
svpattern_token (enum aarch64_svpattern pattern)
{
  switch (pattern)
    {
#define CASE(UPPER, LOWER, VALUE) case AARCH64_SV_##UPPER: return #LOWER;
    AARCH64_FOR_SVPATTERN (CASE)
#undef CASE
    case AARCH64_NUM_SVPATTERNS:
      break;
    }
  gcc_unreachable ();
}
/* Return the location of a piece that is known to be passed or returned
   in registers.  FIRST_ZR is the first unused vector argument register
   and FIRST_PR is the first unused predicate argument register.  */
rtx
pure_scalable_type_info::piece::get_rtx (unsigned int first_zr,
					 unsigned int first_pr) const
{
  gcc_assert (VECTOR_MODE_P (mode)
	      && first_zr + num_zr <= V0_REGNUM + NUM_FP_ARG_REGS
	      && first_pr + num_pr <= P0_REGNUM + NUM_PR_ARG_REGS);

  if (num_zr > 0 && num_pr == 0)
    return gen_rtx_REG (mode, first_zr);

  if (num_zr == 0 && num_pr <= 2)
    return gen_rtx_REG (mode, first_pr);

  gcc_unreachable ();
}
/* Return the total number of vector registers required by the PST.  */
unsigned int
pure_scalable_type_info::num_zr () const
{
  unsigned int res = 0;
  for (unsigned int i = 0; i < pieces.length (); ++i)
    res += pieces[i].num_zr;
  return res;
}

/* Return the total number of predicate registers required by the PST.  */
unsigned int
pure_scalable_type_info::num_pr () const
{
  unsigned int res = 0;
  for (unsigned int i = 0; i < pieces.length (); ++i)
    res += pieces[i].num_pr;
  return res;
}
/* Return the location of a PST that is known to be passed or returned
   in registers.  FIRST_ZR is the first unused vector argument register
   and FIRST_PR is the first unused predicate argument register.  */
rtx
pure_scalable_type_info::get_rtx (machine_mode mode,
				  unsigned int first_zr,
				  unsigned int first_pr) const
{
  /* Try to return a single REG if possible.  This leads to better
     code generation; it isn't required for correctness.  */
  if (mode == pieces[0].mode)
    {
      gcc_assert (pieces.length () == 1);
      return pieces[0].get_rtx (first_zr, first_pr);
    }

  /* Build up a PARALLEL that contains the individual pieces.  */
  rtvec rtxes = rtvec_alloc (pieces.length ());
  for (unsigned int i = 0; i < pieces.length (); ++i)
    {
      rtx reg = pieces[i].get_rtx (first_zr, first_pr);
      rtx offset = gen_int_mode (pieces[i].offset, Pmode);
      RTVEC_ELT (rtxes, i) = gen_rtx_EXPR_LIST (VOIDmode, reg, offset);
      first_zr += pieces[i].num_zr;
      first_pr += pieces[i].num_pr;
    }
  return gen_rtx_PARALLEL (mode, rtxes);
}
/* Analyze whether TYPE is a Pure Scalable Type according to the rules
   of the AAPCS64.  */
pure_scalable_type_info::analysis_result
pure_scalable_type_info::analyze (const_tree type)
{
  /* Prevent accidental reuse.  */
  gcc_assert (pieces.is_empty ());

  /* No code will be generated for erroneous types, so we won't establish
     an ABI mapping.  */
  if (type == error_mark_node)
    return NO_ABI_IDENTITY;

  /* Zero-sized types disappear in the language->ABI mapping.  */
  if (TYPE_SIZE (type) && integer_zerop (TYPE_SIZE (type)))
    return NO_ABI_IDENTITY;

  /* Check for SVTs, SPTs, and built-in tuple types that map to PSTs.  */
  piece p = {};
  if (aarch64_sve::builtin_type_p (type, &p.num_zr, &p.num_pr))
    {
      machine_mode mode = TYPE_MODE_RAW (type);
      gcc_assert (VECTOR_MODE_P (mode)
		  && (!TARGET_SVE || aarch64_sve_mode_p (mode)));

      p.mode = p.orig_mode = mode;
      add_piece (p);
      return IS_PST;
    }

  /* Check for user-defined PSTs.  */
  if (TREE_CODE (type) == ARRAY_TYPE)
    return analyze_array (type);
  if (TREE_CODE (type) == RECORD_TYPE)
    return analyze_record (type);

  return ISNT_PST;
}
/* Analyze a type that is known not to be passed or returned in memory.
   Return true if it has an ABI identity and is a Pure Scalable Type.  */
bool
pure_scalable_type_info::analyze_registers (const_tree type)
{
  analysis_result result = analyze (type);
  gcc_assert (result != DOESNT_MATTER);
  return result == IS_PST;
}
/* Subroutine of analyze for handling ARRAY_TYPEs.  */
pure_scalable_type_info::analysis_result
pure_scalable_type_info::analyze_array (const_tree type)
{
  /* Analyze the element type.  */
  pure_scalable_type_info element_info;
  analysis_result result = element_info.analyze (TREE_TYPE (type));
  if (result != IS_PST)
    return result;

  /* An array of unknown, flexible or variable length will be passed and
     returned by reference whatever we do.  */
  tree nelts_minus_one = array_type_nelts (type);
  if (!tree_fits_uhwi_p (nelts_minus_one))
    return DOESNT_MATTER;

  /* Likewise if the array is constant-sized but too big to be interesting.
     The double checks against MAX_PIECES are to protect against overflow.  */
  unsigned HOST_WIDE_INT count = tree_to_uhwi (nelts_minus_one);
  if (count > MAX_PIECES)
    return DOESNT_MATTER;
  count += 1;
  if (count * element_info.pieces.length () > MAX_PIECES)
    return DOESNT_MATTER;

  /* The above checks should have weeded out elements of unknown size.  */
  poly_uint64 element_bytes;
  if (!poly_int_tree_p (TYPE_SIZE_UNIT (TREE_TYPE (type)), &element_bytes))
    gcc_unreachable ();

  /* Build up the list of individual vectors and predicates.  */
  gcc_assert (!element_info.pieces.is_empty ());
  for (unsigned int i = 0; i < count; ++i)
    for (unsigned int j = 0; j < element_info.pieces.length (); ++j)
      {
	piece p = element_info.pieces[j];
	p.offset += i * element_bytes;
	add_piece (p);
      }
  return IS_PST;
}
/* Subroutine of analyze for handling RECORD_TYPEs.  */
pure_scalable_type_info::analysis_result
pure_scalable_type_info::analyze_record (const_tree type)
{
  for (tree field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
    {
      if (TREE_CODE (field) != FIELD_DECL)
	continue;

      /* Zero-sized fields disappear in the language->ABI mapping.  */
      if (DECL_SIZE (field) && integer_zerop (DECL_SIZE (field)))
	continue;

      /* All fields with an ABI identity must be PSTs for the record as
	 a whole to be a PST.  If any individual field is too big to be
	 interesting then the record is too.  */
      pure_scalable_type_info field_info;
      analysis_result subresult = field_info.analyze (TREE_TYPE (field));
      if (subresult == NO_ABI_IDENTITY)
	continue;
      if (subresult != IS_PST)
	return subresult;

      /* Since all previous fields are PSTs, we ought to be able to track
	 the field offset using poly_ints.  */
      tree bitpos = bit_position (field);
      gcc_assert (poly_int_tree_p (bitpos));

      /* For the same reason, it shouldn't be possible to create a PST field
	 whose offset isn't byte-aligned.  */
      poly_widest_int wide_bytepos = exact_div (wi::to_poly_widest (bitpos),
						BITS_PER_UNIT);

      /* Punt if the record is too big to be interesting.  */
      poly_uint64 bytepos;
      if (!wide_bytepos.to_uhwi (&bytepos)
	  || pieces.length () + field_info.pieces.length () > MAX_PIECES)
	return DOESNT_MATTER;

      /* Add the individual vectors and predicates in the field to the
	 record's list.  */
      gcc_assert (!field_info.pieces.is_empty ());
      for (unsigned int i = 0; i < field_info.pieces.length (); ++i)
	{
	  piece p = field_info.pieces[i];
	  p.offset += bytepos;
	  add_piece (p);
	}
    }

  /* Empty structures disappear in the language->ABI mapping.  */
  return pieces.is_empty () ? NO_ABI_IDENTITY : IS_PST;
}
/* Add P to the list of pieces in the type.  */
void
pure_scalable_type_info::add_piece (const piece &p)
{
  /* Try to fold the new piece into the previous one to form a
     single-mode PST.  For example, if we see three consecutive vectors
     of the same mode, we can represent them using the corresponding
     3-tuple mode.

     This is purely an optimization.  */
  if (!pieces.is_empty ())
    {
      piece &prev = pieces.last ();
      gcc_assert (VECTOR_MODE_P (p.mode) && VECTOR_MODE_P (prev.mode));
      unsigned int nelems1, nelems2;
      if (prev.orig_mode == p.orig_mode
	  && GET_MODE_CLASS (p.orig_mode) != MODE_VECTOR_BOOL
	  && known_eq (prev.offset + GET_MODE_SIZE (prev.mode), p.offset)
	  && constant_multiple_p (GET_MODE_NUNITS (prev.mode),
				  GET_MODE_NUNITS (p.orig_mode), &nelems1)
	  && constant_multiple_p (GET_MODE_NUNITS (p.mode),
				  GET_MODE_NUNITS (p.orig_mode), &nelems2)
	  && targetm.array_mode (p.orig_mode,
				 nelems1 + nelems2).exists (&prev.mode))
	{
	  prev.num_zr += p.num_zr;
	  prev.num_pr += p.num_pr;
	  return;
	}
    }
  pieces.quick_push (p);
}
/* Return true if at least one possible value of type TYPE includes at
   least one object of Pure Scalable Type, in the sense of the AAPCS64.

   This is a relatively expensive test for some types, so it should
   generally be made as late as possible.  */
static bool
aarch64_some_values_include_pst_objects_p (const_tree type)
{
  if (TYPE_SIZE (type) && integer_zerop (TYPE_SIZE (type)))
    return false;

  if (aarch64_sve::builtin_type_p (type))
    return true;

  if (TREE_CODE (type) == ARRAY_TYPE || TREE_CODE (type) == COMPLEX_TYPE)
    return aarch64_some_values_include_pst_objects_p (TREE_TYPE (type));

  if (RECORD_OR_UNION_TYPE_P (type))
    for (tree field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
      if (TREE_CODE (field) == FIELD_DECL
	  && aarch64_some_values_include_pst_objects_p (TREE_TYPE (field)))
	return true;

  return false;
}
/* Return the descriptor of the SIMD ABI.  */
static const predefined_function_abi &
aarch64_simd_abi (void)
{
  predefined_function_abi &simd_abi = function_abis[ARM_PCS_SIMD];
  if (!simd_abi.initialized_p ())
    {
      HARD_REG_SET full_reg_clobbers
	= default_function_abi.full_reg_clobbers ();
      for (int regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
	if (FP_SIMD_SAVED_REGNUM_P (regno))
	  CLEAR_HARD_REG_BIT (full_reg_clobbers, regno);
      simd_abi.initialize (ARM_PCS_SIMD, full_reg_clobbers);
    }
  return simd_abi;
}
/* Return the descriptor of the SVE PCS.  */
static const predefined_function_abi &
aarch64_sve_abi (void)
{
  predefined_function_abi &sve_abi = function_abis[ARM_PCS_SVE];
  if (!sve_abi.initialized_p ())
    {
      HARD_REG_SET full_reg_clobbers
	= default_function_abi.full_reg_clobbers ();
      for (int regno = V8_REGNUM; regno <= V23_REGNUM; ++regno)
	CLEAR_HARD_REG_BIT (full_reg_clobbers, regno);
      for (int regno = P4_REGNUM; regno <= P15_REGNUM; ++regno)
	CLEAR_HARD_REG_BIT (full_reg_clobbers, regno);
      sve_abi.initialize (ARM_PCS_SVE, full_reg_clobbers);
    }
  return sve_abi;
}
/* If X is an UNSPEC_SALT_ADDR expression, return the address that it
   wraps, otherwise return X itself.  */
static rtx
strip_salt (rtx x)
{
  rtx search = x;
  if (GET_CODE (search) == CONST)
    search = XEXP (search, 0);
  if (GET_CODE (search) == UNSPEC && XINT (search, 1) == UNSPEC_SALT_ADDR)
    x = XVECEXP (search, 0, 0);
  return x;
}

/* Like strip_offset, but also strip any UNSPEC_SALT_ADDR from the
   resulting address.  */
static rtx
strip_offset_and_salt (rtx addr, poly_int64 *offset)
{
  return strip_salt (strip_offset (addr, offset));
}
/* Generate code to enable conditional branches in functions over 1 MiB.  */
const char *
aarch64_gen_far_branch (rtx * operands, int pos_label, const char * dest,
			const char * branch_format)
{
  rtx_code_label * tmp_label = gen_label_rtx ();
  char label_buf[256];
  char buffer[256];
  ASM_GENERATE_INTERNAL_LABEL (label_buf, dest,
			       CODE_LABEL_NUMBER (tmp_label));
  const char *label_ptr = targetm.strip_name_encoding (label_buf);
  rtx dest_label = operands[pos_label];
  operands[pos_label] = tmp_label;

  snprintf (buffer, sizeof (buffer), "%s%s", branch_format, label_ptr);
  output_asm_insn (buffer, operands);

  snprintf (buffer, sizeof (buffer), "b\t%%l%d\n%s:", pos_label, label_ptr);
  operands[pos_label] = dest_label;
  output_asm_insn (buffer, operands);
  return "";
}
void
aarch64_err_no_fpadvsimd (machine_mode mode)
{
  if (TARGET_GENERAL_REGS_ONLY)
    if (FLOAT_MODE_P (mode))
      error ("%qs is incompatible with the use of floating-point types",
	     "-mgeneral-regs-only");
    else
      error ("%qs is incompatible with the use of vector types",
	     "-mgeneral-regs-only");
  else
    if (FLOAT_MODE_P (mode))
      error ("%qs feature modifier is incompatible with the use of"
	     " floating-point types", "+nofp");
    else
      error ("%qs feature modifier is incompatible with the use of"
	     " vector types", "+nofp");
}
/* Report when we try to do something that requires SVE when SVE is disabled.
   This is an error of last resort and isn't very high-quality.  It usually
   involves attempts to measure the vector length in some way.  */
static void
aarch64_report_sve_required (void)
{
  static bool reported_p = false;

  /* Avoid reporting a slew of messages for a single oversight.  */
  if (reported_p)
    return;

  error ("this operation requires the SVE ISA extension");
  inform (input_location, "you can enable SVE using the command-line"
	  " option %<-march%>, or by using the %<target%>"
	  " attribute or pragma");
  reported_p = true;
}
/* Return true if REGNO is P0-P15 or one of the special FFR-related
   registers.  */
static inline bool
pr_or_ffr_regnum_p (unsigned int regno)
{
  return PR_REGNUM_P (regno) || regno == FFR_REGNUM || regno == FFRT_REGNUM;
}
/* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS.
   The register allocator chooses POINTER_AND_FP_REGS if FP_REGS and
   GENERAL_REGS have the same cost - even if POINTER_AND_FP_REGS has a much
   higher cost.  POINTER_AND_FP_REGS is also used if the cost of both FP_REGS
   and GENERAL_REGS is lower than the memory cost (in this case the best class
   is the lowest cost one).  Using POINTER_AND_FP_REGS irrespectively of its
   cost results in bad allocations with many redundant int<->FP moves which
   are expensive on various cores.
   To avoid this we don't allow POINTER_AND_FP_REGS as the allocno class, but
   force a decision between FP_REGS and GENERAL_REGS.  We use the allocno class
   if it isn't POINTER_AND_FP_REGS.  Similarly, use the best class if it isn't
   POINTER_AND_FP_REGS.  Otherwise set the allocno class depending on the mode.
   The result of this is that it is no longer inefficient to have a higher
   memory move cost than the register move cost.  */

static reg_class_t
aarch64_ira_change_pseudo_allocno_class (int regno, reg_class_t allocno_class,
					 reg_class_t best_class)
{
  machine_mode mode;

  if (!reg_class_subset_p (GENERAL_REGS, allocno_class)
      || !reg_class_subset_p (FP_REGS, allocno_class))
    return allocno_class;

  if (!reg_class_subset_p (GENERAL_REGS, best_class)
      || !reg_class_subset_p (FP_REGS, best_class))
    return best_class;

  mode = PSEUDO_REGNO_MODE (regno);
  return FLOAT_MODE_P (mode) || VECTOR_MODE_P (mode) ? FP_REGS : GENERAL_REGS;
}
static unsigned int
aarch64_min_divisions_for_recip_mul (machine_mode mode)
{
  if (GET_MODE_UNIT_SIZE (mode) == 4)
    return aarch64_tune_params.min_div_recip_mul_sf;
  return aarch64_tune_params.min_div_recip_mul_df;
}
/* Return the reassociation width of treeop OPC with mode MODE.  */
static int
aarch64_reassociation_width (unsigned opc, machine_mode mode)
{
  if (VECTOR_MODE_P (mode))
    return aarch64_tune_params.vec_reassoc_width;
  if (INTEGRAL_MODE_P (mode))
    return aarch64_tune_params.int_reassoc_width;
  /* Reassociation reduces the number of FMAs which may result in worse
     performance.  Use a per-CPU setting for FMA reassociation which allows
     narrow CPUs with few FP pipes to switch it off (value of 1), and wider
     CPUs with many FP pipes to enable reassociation.
     Since the reassociation pass doesn't understand FMA at all, assume
     that any FP addition might turn into FMA.  */
  if (FLOAT_MODE_P (mode))
    return opc == PLUS_EXPR ? aarch64_tune_params.fma_reassoc_width
			    : aarch64_tune_params.fp_reassoc_width;
  return 1;
}
/* Provide a mapping from gcc register numbers to dwarf register numbers.  */
unsigned
aarch64_debugger_regno (unsigned regno)
{
  if (GP_REGNUM_P (regno))
    return AARCH64_DWARF_R0 + regno - R0_REGNUM;
  else if (regno == SP_REGNUM)
    return AARCH64_DWARF_SP;
  else if (FP_REGNUM_P (regno))
    return AARCH64_DWARF_V0 + regno - V0_REGNUM;
  else if (PR_REGNUM_P (regno))
    return AARCH64_DWARF_P0 + regno - P0_REGNUM;
  else if (regno == VG_REGNUM)
    return AARCH64_DWARF_VG;

  /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
     equivalent DWARF register.  */
  return DWARF_FRAME_REGISTERS;
}
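
/* For example, general register Rn maps to AARCH64_DWARF_R0 + n and vector
   register Vn to AARCH64_DWARF_V0 + n, while registers with no DWARF
   equivalent map to DWARF_FRAME_REGISTERS as described above.  */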
/* Implement TARGET_DWARF_FRAME_REG_MODE.  */
static machine_mode
aarch64_dwarf_frame_reg_mode (int regno)
{
  /* Predicate registers are call-clobbered in the EH ABI (which is
     ARM_PCS_AAPCS64), so they should not be described by CFI.
     Their size changes as VL changes, so any values computed by
     __builtin_init_dwarf_reg_size_table might not be valid for
     all frames.  */
  if (PR_REGNUM_P (regno))
    return VOIDmode;
  return default_dwarf_frame_reg_mode (regno);
}
/* Implement TARGET_OUTPUT_CFI_DIRECTIVE.  */
static bool
aarch64_output_cfi_directive (FILE *f, dw_cfi_ref cfi)
{
  bool found = false;
  if (cfi->dw_cfi_opc == DW_CFA_AARCH64_negate_ra_state)
    {
      fprintf (f, "\t.cfi_negate_ra_state\n");
      found = true;
    }
  return found;
}
/* Implement TARGET_DW_CFI_OPRND1_DESC.  */
static bool
aarch64_dw_cfi_oprnd1_desc (dwarf_call_frame_info cfi_opc,
			    dw_cfi_oprnd_type &oprnd_type)
{
  if (cfi_opc == DW_CFA_AARCH64_negate_ra_state)
    {
      oprnd_type = dw_cfi_oprnd_unused;
      return true;
    }
  return false;
}
/* If X is a CONST_DOUBLE, return its bit representation as a constant
   integer, otherwise return X unmodified.  */
static rtx
aarch64_bit_representation (rtx x)
{
  if (CONST_DOUBLE_P (x))
    x = gen_lowpart (int_mode_for_mode (GET_MODE (x)).require (), x);
  return x;
}
/* Return an estimate for the number of quadwords in an SVE vector.  This is
   equivalent to the number of Advanced SIMD vectors in an SVE vector.  */
static unsigned int
aarch64_estimated_sve_vq ()
{
  return estimated_poly_value (BITS_PER_SVE_VECTOR) / 128;
}
/* Return true if MODE is an SVE predicate mode.  */
static bool
aarch64_sve_pred_mode_p (machine_mode mode)
{
  return (TARGET_SVE
	  && (mode == VNx16BImode
	      || mode == VNx8BImode
	      || mode == VNx4BImode
	      || mode == VNx2BImode));
}
/* Three mutually-exclusive flags describing a vector or predicate type.  */
const unsigned int VEC_ADVSIMD  = 1;
const unsigned int VEC_SVE_DATA = 2;
const unsigned int VEC_SVE_PRED = 4;
/* Indicates a structure of 2, 3 or 4 vectors or predicates.  */
const unsigned int VEC_STRUCT   = 8;
/* Can be used in combination with VEC_SVE_DATA to indicate that the
   vector has fewer significant bytes than a full SVE vector.  */
const unsigned int VEC_PARTIAL  = 16;
/* Useful combinations of the above.  */
const unsigned int VEC_ANY_SVE  = VEC_SVE_DATA | VEC_SVE_PRED;
const unsigned int VEC_ANY_DATA = VEC_ADVSIMD | VEC_SVE_DATA;
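
/* For example (see aarch64_classify_vector_mode below), a single SVE data
   vector is classified as VEC_SVE_DATA, a 64-bit or 128-bit Advanced SIMD
   vector as VEC_ADVSIMD, and an SVE tuple of 2-4 vectors as
   VEC_SVE_DATA | VEC_STRUCT.  */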
/* Return a set of flags describing the vector properties of mode MODE.
   If ANY_TARGET_P is false (the default), ignore modes that are not supported
   by the current target.  Otherwise categorize the modes that can be used
   with the set of all targets supported by the port.  */
static unsigned int
aarch64_classify_vector_mode (machine_mode mode, bool any_target_p = false)
{
  if (aarch64_sve_pred_mode_p (mode))
    return VEC_SVE_PRED;

  /* Make the decision based on the mode's enum value rather than its
     properties, so that we keep the correct classification regardless
     of -msve-vector-bits.  */
  switch (mode)
    {
    /* Partial SVE QI vectors.  */
    /* Partial SVE HI vectors.  */
    /* Partial SVE SI vector.  */
    /* Partial SVE HF vectors.  */
    /* Partial SVE BF vectors.  */
    /* Partial SVE SF vector.  */
      return (TARGET_SVE || any_target_p) ? VEC_SVE_DATA | VEC_PARTIAL : 0;

      return (TARGET_SVE || any_target_p) ? VEC_SVE_DATA : 0;

    /* x2 SVE vectors.  */
    /* x3 SVE vectors.  */
    /* x4 SVE vectors.  */
      return (TARGET_SVE || any_target_p) ? VEC_SVE_DATA | VEC_STRUCT : 0;

      return (TARGET_FLOAT || any_target_p) ? VEC_ADVSIMD | VEC_STRUCT : 0;

    /* Structures of 64-bit Advanced SIMD vectors.  */
      return (TARGET_FLOAT || any_target_p)
	     ? VEC_ADVSIMD | VEC_STRUCT | VEC_PARTIAL : 0;

    /* Structures of 128-bit Advanced SIMD vectors.  */
      return (TARGET_FLOAT || any_target_p) ? VEC_ADVSIMD | VEC_STRUCT : 0;

    /* 64-bit Advanced SIMD vectors.  */
    /* 128-bit Advanced SIMD vectors.  */
      return (TARGET_FLOAT || any_target_p) ? VEC_ADVSIMD : 0;

      return TARGET_SVE ? VEC_SVE_PRED | VEC_STRUCT : 0;

    default:
      return 0;
    }
}
/* Return true if MODE is any of the Advanced SIMD structure modes.  */
bool
aarch64_advsimd_struct_mode_p (machine_mode mode)
{
  unsigned int vec_flags = aarch64_classify_vector_mode (mode);
  return (vec_flags & VEC_ADVSIMD) && (vec_flags & VEC_STRUCT);
}
/* Return true if MODE is an Advanced SIMD D-register structure mode.  */
static bool
aarch64_advsimd_partial_struct_mode_p (machine_mode mode)
{
  return (aarch64_classify_vector_mode (mode)
	  == (VEC_ADVSIMD | VEC_STRUCT | VEC_PARTIAL));
}
/* Return true if MODE is an Advanced SIMD Q-register structure mode.  */
static bool
aarch64_advsimd_full_struct_mode_p (machine_mode mode)
{
  return (aarch64_classify_vector_mode (mode) == (VEC_ADVSIMD | VEC_STRUCT));
}
/* Return true if MODE is any of the data vector modes, including
   structure modes.  */
static bool
aarch64_vector_data_mode_p (machine_mode mode)
{
  return aarch64_classify_vector_mode (mode) & VEC_ANY_DATA;
}
/* Return true if MODE is any form of SVE mode, including predicates,
   vectors and structures.  */
bool
aarch64_sve_mode_p (machine_mode mode)
{
  return aarch64_classify_vector_mode (mode) & VEC_ANY_SVE;
}
/* Return true if MODE is an SVE data vector mode; either a single vector
   or a structure of vectors.  */
static bool
aarch64_sve_data_mode_p (machine_mode mode)
{
  return aarch64_classify_vector_mode (mode) & VEC_SVE_DATA;
}
/* Return the number of defined bytes in one constituent vector of
   SVE mode MODE, which has vector flags VEC_FLAGS.  */
static poly_uint64
aarch64_vl_bytes (machine_mode mode, unsigned int vec_flags)
{
  if (vec_flags & VEC_PARTIAL)
    /* A single partial vector.  */
    return GET_MODE_SIZE (mode);

  if (vec_flags & VEC_SVE_DATA)
    /* A single vector or a tuple.  */
    return BYTES_PER_SVE_VECTOR;

  /* A single predicate.  */
  gcc_assert (vec_flags & VEC_SVE_PRED);
  return BYTES_PER_SVE_PRED;
}
/* If MODE holds an array of vectors, return the number of vectors
   in the array, otherwise return 1.  */
static unsigned int
aarch64_ldn_stn_vectors (machine_mode mode)
{
  unsigned int vec_flags = aarch64_classify_vector_mode (mode);
  if (vec_flags == (VEC_ADVSIMD | VEC_PARTIAL | VEC_STRUCT))
    return exact_div (GET_MODE_SIZE (mode), 8).to_constant ();
  if (vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
    return exact_div (GET_MODE_SIZE (mode), 16).to_constant ();
  if (vec_flags == (VEC_SVE_DATA | VEC_STRUCT))
    return exact_div (GET_MODE_SIZE (mode),
		      BYTES_PER_SVE_VECTOR).to_constant ();
  return 1;
}
/* Given an Advanced SIMD vector mode MODE and a tuple size NELEMS, return the
   corresponding vector structure mode.  */
static opt_machine_mode
aarch64_advsimd_vector_array_mode (machine_mode mode,
				   unsigned HOST_WIDE_INT nelems)
{
  unsigned int flags = VEC_ADVSIMD | VEC_STRUCT;
  if (known_eq (GET_MODE_SIZE (mode), 8))
    flags |= VEC_PARTIAL;

  machine_mode struct_mode;
  FOR_EACH_MODE_IN_CLASS (struct_mode, GET_MODE_CLASS (mode))
    if (aarch64_classify_vector_mode (struct_mode) == flags
	&& GET_MODE_INNER (struct_mode) == GET_MODE_INNER (mode)
	&& known_eq (GET_MODE_NUNITS (struct_mode),
		     GET_MODE_NUNITS (mode) * nelems))
      return struct_mode;
  return opt_machine_mode ();
}
/* Return the SVE vector mode that has NUNITS elements of mode INNER_MODE.  */
opt_machine_mode
aarch64_sve_data_mode (scalar_mode inner_mode, poly_uint64 nunits)
{
  enum mode_class mclass = (is_a <scalar_float_mode> (inner_mode)
			    ? MODE_VECTOR_FLOAT : MODE_VECTOR_INT);
  machine_mode mode;
  FOR_EACH_MODE_IN_CLASS (mode, mclass)
    if (inner_mode == GET_MODE_INNER (mode)
	&& known_eq (nunits, GET_MODE_NUNITS (mode))
	&& aarch64_sve_data_mode_p (mode))
      return mode;
  return opt_machine_mode ();
}
/* Implement target hook TARGET_ARRAY_MODE.  */
static opt_machine_mode
aarch64_array_mode (machine_mode mode, unsigned HOST_WIDE_INT nelems)
{
  if (TARGET_SVE && GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL)
    {
      /* Use VNx32BI for pairs of predicates, but explicitly reject giving
	 a mode to other array sizes.  Using integer modes requires a round
	 trip through memory and generates terrible code.  */
      if (mode == VNx16BImode && nelems == 2)
	return VNx32BImode;
      return opt_machine_mode ();
    }

  auto flags = aarch64_classify_vector_mode (mode);
  if (flags == VEC_SVE_DATA && IN_RANGE (nelems, 2, 4))
    return aarch64_sve_data_mode (GET_MODE_INNER (mode),
				  GET_MODE_NUNITS (mode) * nelems);

  if (flags == VEC_ADVSIMD && IN_RANGE (nelems, 2, 4))
    return aarch64_advsimd_vector_array_mode (mode, nelems);

  return opt_machine_mode ();
}
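
/* Illustrative examples of the hook above: an array of two VNx4SImode
   vectors maps (via aarch64_sve_data_mode) to the SVE tuple mode with twice
   the number of units, and a pair of VNx16BImode predicates maps to
   VNx32BImode.  */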
/* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P.  */
static bool
aarch64_array_mode_supported_p (machine_mode mode,
				unsigned HOST_WIDE_INT nelems)
{
  if (TARGET_BASE_SIMD
      && (AARCH64_VALID_SIMD_QREG_MODE (mode)
	  || AARCH64_VALID_SIMD_DREG_MODE (mode))
      && (nelems >= 2 && nelems <= 4))
    return true;

  return false;
}
/* MODE is some form of SVE vector mode.  For data modes, return the number
   of vector register bits that each element of MODE occupies, such as 64
   for both VNx2DImode and VNx2SImode (where each 32-bit value is stored
   in a 64-bit container).  For predicate modes, return the number of
   data bits controlled by each significant predicate bit.  */
static unsigned int
aarch64_sve_container_bits (machine_mode mode)
{
  unsigned int vec_flags = aarch64_classify_vector_mode (mode);
  poly_uint64 vector_bits = (vec_flags & (VEC_PARTIAL | VEC_SVE_PRED)
			     ? BITS_PER_SVE_VECTOR
			     : GET_MODE_BITSIZE (mode));
  return vector_element_size (vector_bits, GET_MODE_NUNITS (mode));
}
/* Return the SVE predicate mode to use for elements that have
   ELEM_NBYTES bytes, if such a mode exists.  */
opt_machine_mode
aarch64_sve_pred_mode (unsigned int elem_nbytes)
{
  if (TARGET_SVE)
    {
      if (elem_nbytes == 1)
	return VNx16BImode;
      if (elem_nbytes == 2)
	return VNx8BImode;
      if (elem_nbytes == 4)
	return VNx4BImode;
      if (elem_nbytes == 8)
	return VNx2BImode;
    }
  return opt_machine_mode ();
}
/* Return the SVE predicate mode that should be used to control
   operations on SVE mode MODE.  */
machine_mode
aarch64_sve_pred_mode (machine_mode mode)
{
  unsigned int bits = aarch64_sve_container_bits (mode);
  return aarch64_sve_pred_mode (bits / BITS_PER_UNIT).require ();
}
/* Implement TARGET_VECTORIZE_GET_MASK_MODE.  */
static opt_machine_mode
aarch64_get_mask_mode (machine_mode mode)
{
  unsigned int vec_flags = aarch64_classify_vector_mode (mode);
  if (vec_flags & VEC_SVE_DATA)
    return aarch64_sve_pred_mode (mode);

  return default_get_mask_mode (mode);
}
/* Return the integer element mode associated with SVE mode MODE.  */
static scalar_int_mode
aarch64_sve_element_int_mode (machine_mode mode)
{
  poly_uint64 vector_bits = (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL
			     ? BITS_PER_SVE_VECTOR
			     : GET_MODE_BITSIZE (mode));
  unsigned int elt_bits = vector_element_size (vector_bits,
					       GET_MODE_NUNITS (mode));
  return int_mode_for_size (elt_bits, 0).require ();
}
/* Return an integer element mode that contains exactly
   aarch64_sve_container_bits (MODE) bits.  This is wider than
   aarch64_sve_element_int_mode if MODE is a partial vector,
   otherwise it's the same.  */
static scalar_int_mode
aarch64_sve_container_int_mode (machine_mode mode)
{
  return int_mode_for_size (aarch64_sve_container_bits (mode), 0).require ();
}
/* Return the integer vector mode associated with SVE mode MODE.
   Unlike related_int_vector_mode, this can handle the case in which
   MODE is a predicate (and thus has a different total size).  */
static machine_mode
aarch64_sve_int_mode (machine_mode mode)
{
  scalar_int_mode int_mode = aarch64_sve_element_int_mode (mode);
  return aarch64_sve_data_mode (int_mode, GET_MODE_NUNITS (mode)).require ();
}
/* Look for a vector mode with the same classification as VEC_MODE,
   but with each group of FACTOR elements coalesced into a single element.
   In other words, look for a mode in which the elements are FACTOR times
   larger and in which the number of elements is FACTOR times smaller.

   Return the mode found, if one exists.  */
static opt_machine_mode
aarch64_coalesce_units (machine_mode vec_mode, unsigned int factor)
{
  auto elt_bits = vector_element_size (GET_MODE_BITSIZE (vec_mode),
				       GET_MODE_NUNITS (vec_mode));
  auto vec_flags = aarch64_classify_vector_mode (vec_mode);
  if (vec_flags & VEC_SVE_PRED)
    {
      if (known_eq (GET_MODE_SIZE (vec_mode), BYTES_PER_SVE_PRED))
	return aarch64_sve_pred_mode (elt_bits * factor);
      return opt_machine_mode ();
    }

  scalar_mode new_elt_mode;
  if (!int_mode_for_size (elt_bits * factor, false).exists (&new_elt_mode))
    return opt_machine_mode ();

  if (vec_flags == VEC_ADVSIMD)
    {
      auto mode = aarch64_simd_container_mode (new_elt_mode,
					       GET_MODE_BITSIZE (vec_mode));
      if (mode != word_mode)
	return mode;
    }
  else if (vec_flags & VEC_SVE_DATA)
    {
      poly_uint64 new_nunits;
      if (multiple_p (GET_MODE_NUNITS (vec_mode), factor, &new_nunits))
	return aarch64_sve_data_mode (new_elt_mode, new_nunits);
    }

  return opt_machine_mode ();
}
/* Implement TARGET_VECTORIZE_RELATED_MODE.  */
static opt_machine_mode
aarch64_vectorize_related_mode (machine_mode vector_mode,
				scalar_mode element_mode,
				poly_uint64 nunits)
{
  unsigned int vec_flags = aarch64_classify_vector_mode (vector_mode);

  /* If we're operating on SVE vectors, try to return an SVE mode.  */
  poly_uint64 sve_nunits;
  if ((vec_flags & VEC_SVE_DATA)
      && multiple_p (BYTES_PER_SVE_VECTOR,
		     GET_MODE_SIZE (element_mode), &sve_nunits))
    {
      machine_mode sve_mode;
      if (maybe_ne (nunits, 0U))
	{
	  /* Try to find a full or partial SVE mode with exactly
	     NUNITS units.  */
	  if (multiple_p (sve_nunits, nunits)
	      && aarch64_sve_data_mode (element_mode,
					nunits).exists (&sve_mode))
	    return sve_mode;
	}
      else
	{
	  /* Take the preferred number of units from the number of bytes
	     that fit in VECTOR_MODE.  We always start by "autodetecting"
	     a full vector mode with preferred_simd_mode, so vectors
	     chosen here will also be full vector modes.  Then
	     autovectorize_vector_modes tries smaller starting modes
	     and thus smaller preferred numbers of units.  */
	  sve_nunits = ordered_min (sve_nunits, GET_MODE_SIZE (vector_mode));
	  if (aarch64_sve_data_mode (element_mode,
				     sve_nunits).exists (&sve_mode))
	    return sve_mode;
	}
    }

  /* Prefer to use 1 128-bit vector instead of 2 64-bit vectors.  */
  if (TARGET_SIMD
      && (vec_flags & VEC_ADVSIMD)
      && known_eq (nunits, 0U)
      && known_eq (GET_MODE_BITSIZE (vector_mode), 64U)
      && maybe_ge (GET_MODE_BITSIZE (element_mode)
		   * GET_MODE_NUNITS (vector_mode), 128U))
    {
      machine_mode res = aarch64_simd_container_mode (element_mode, 128);
      if (VECTOR_MODE_P (res))
	return res;
    }

  return default_vectorize_related_mode (vector_mode, element_mode, nunits);
}
/* Implement TARGET_VECTORIZE_PREFERRED_DIV_AS_SHIFTS_OVER_MULT.  */

static bool
aarch64_vectorize_preferred_div_as_shifts_over_mult (const_tree type)
{
  machine_mode mode = TYPE_MODE (type);
  unsigned int vec_flags = aarch64_classify_vector_mode (mode);
  bool sve_p = (vec_flags & VEC_ANY_SVE);
  bool simd_p = (vec_flags & VEC_ADVSIMD);

  return (sve_p && TARGET_SVE2) || (simd_p && TARGET_SIMD);
}

/* Implement TARGET_PREFERRED_ELSE_VALUE.  For binary operations,
   prefer to use the first arithmetic operand as the else value if
   the else value doesn't matter, since that exactly matches the SVE
   destructive merging form.  For ternary operations we could either
   pick the first operand and use FMAD-like instructions or the last
   operand and use FMLA-like instructions; the latter seems more
   natural.  */

static tree
aarch64_preferred_else_value (unsigned, tree, unsigned int nops, tree *ops)
{
  return nops == 3 ? ops[2] : ops[0];
}
/* Implement TARGET_HARD_REGNO_NREGS.  */

static unsigned int
aarch64_hard_regno_nregs (unsigned regno, machine_mode mode)
{
  /* ??? Logically we should only need to provide a value when
     HARD_REGNO_MODE_OK says that the combination is valid,
     but at the moment we need to handle all modes.  Just ignore
     any runtime parts for registers that can't store them.  */
  HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
  switch (aarch64_regno_regclass (regno))
    {
    case FP_REGS:
    case FP_LO_REGS:
    case FP_LO8_REGS:
      {
	unsigned int vec_flags = aarch64_classify_vector_mode (mode);
	if (vec_flags & VEC_SVE_DATA)
	  return exact_div (GET_MODE_SIZE (mode),
			    aarch64_vl_bytes (mode, vec_flags)).to_constant ();
	if (vec_flags == (VEC_ADVSIMD | VEC_STRUCT | VEC_PARTIAL))
	  return GET_MODE_SIZE (mode).to_constant () / 8;
	return CEIL (lowest_size, UNITS_PER_VREG);
      }

    case PR_REGS:
    case PR_LO_REGS:
    case PR_HI_REGS:
      return mode == VNx32BImode ? 2 : 1;

    case MOVEABLE_SYSREGS:
    case FFR_REGS:
    case PR_AND_FFR_REGS:
    case FAKE_REGS:
      return 1;

    default:
      return CEIL (lowest_size, UNITS_PER_WORD);
    }
  gcc_unreachable ();
}
/* Implement TARGET_HARD_REGNO_MODE_OK.  */

static bool
aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode)
{
  if (mode == V8DImode)
    return IN_RANGE (regno, R0_REGNUM, R23_REGNUM)
	   && multiple_p (regno - R0_REGNUM, 2);

  if (GET_MODE_CLASS (mode) == MODE_CC)
    return regno == CC_REGNUM;

  if (regno == VG_REGNUM)
    /* This must have the same size as _Unwind_Word.  */
    return mode == DImode;

  if (regno == FPM_REGNUM)
    return mode == QImode || mode == HImode || mode == SImode || mode == DImode;

  unsigned int vec_flags = aarch64_classify_vector_mode (mode);
  if (vec_flags == VEC_SVE_PRED)
    return pr_or_ffr_regnum_p (regno);

  if (vec_flags == (VEC_SVE_PRED | VEC_STRUCT))
    return PR_REGNUM_P (regno);

  if (pr_or_ffr_regnum_p (regno))
    return false;

  /* These registers are abstract; their modes don't matter.  */
  if (FAKE_REGNUM_P (regno))
    return true;

  if (regno == SP_REGNUM)
    /* The purpose of comparing with ptr_mode is to support the
       global register variable associated with the stack pointer
       register via the syntax of asm ("wsp") in ILP32.  */
    return mode == Pmode || mode == ptr_mode;

  if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM)
    return mode == Pmode;

  if (GP_REGNUM_P (regno))
    {
      if (vec_flags & (VEC_ANY_SVE | VEC_STRUCT))
	return false;
      if (known_le (GET_MODE_SIZE (mode), 8))
	return true;
      if (known_le (GET_MODE_SIZE (mode), 16))
	return (regno & 1) == 0;
    }
  else if (FP_REGNUM_P (regno))
    {
      if (vec_flags & VEC_STRUCT)
	return end_hard_regno (mode, regno) - 1 <= V31_REGNUM;
      else
	return !VECTOR_MODE_P (mode) || vec_flags != 0;
    }

  return false;
}
/* Return true if a function with type FNTYPE returns its value in
   SVE vector or predicate registers.  */

static bool
aarch64_returns_value_in_sve_regs_p (const_tree fntype)
{
  tree return_type = TREE_TYPE (fntype);

  pure_scalable_type_info pst_info;
  switch (pst_info.analyze (return_type))
    {
    case pure_scalable_type_info::IS_PST:
      return (pst_info.num_zr () <= NUM_FP_ARG_REGS
	      && pst_info.num_pr () <= NUM_PR_ARG_REGS);

    case pure_scalable_type_info::DOESNT_MATTER:
      gcc_assert (aarch64_return_in_memory_1 (return_type));
      return false;

    case pure_scalable_type_info::NO_ABI_IDENTITY:
    case pure_scalable_type_info::ISNT_PST:
      return false;
    }
  gcc_unreachable ();
}
/* Return true if a function with type FNTYPE takes arguments in
   SVE vector or predicate registers.  */

static bool
aarch64_takes_arguments_in_sve_regs_p (const_tree fntype)
{
  CUMULATIVE_ARGS args_so_far_v;
  aarch64_init_cumulative_args (&args_so_far_v, NULL_TREE, NULL_RTX,
				NULL_TREE, 0, true);
  cumulative_args_t args_so_far = pack_cumulative_args (&args_so_far_v);

  for (tree chain = TYPE_ARG_TYPES (fntype);
       chain && chain != void_list_node;
       chain = TREE_CHAIN (chain))
    {
      tree arg_type = TREE_VALUE (chain);
      if (arg_type == error_mark_node)
	return false;

      function_arg_info arg (arg_type, /*named=*/true);
      apply_pass_by_reference_rules (&args_so_far_v, arg);
      pure_scalable_type_info pst_info;
      if (pst_info.analyze_registers (arg.type))
	{
	  unsigned int end_zr = args_so_far_v.aapcs_nvrn + pst_info.num_zr ();
	  unsigned int end_pr = args_so_far_v.aapcs_nprn + pst_info.num_pr ();
	  gcc_assert (end_zr <= NUM_FP_ARG_REGS && end_pr <= NUM_PR_ARG_REGS);
	  return true;
	}

      targetm.calls.function_arg_advance (args_so_far, arg);
    }
  return false;
}
/* Implement TARGET_FNTYPE_ABI.  */

static const predefined_function_abi &
aarch64_fntype_abi (const_tree fntype)
{
  if (lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (fntype)))
    return aarch64_simd_abi ();

  if (aarch64_returns_value_in_sve_regs_p (fntype)
      || aarch64_takes_arguments_in_sve_regs_p (fntype))
    return aarch64_sve_abi ();

  return default_function_abi;
}

/* Return the state of PSTATE.SM on entry to functions of type FNTYPE.  */

static aarch64_isa_mode
aarch64_fntype_pstate_sm (const_tree fntype)
{
  if (lookup_attribute ("arm", "streaming", TYPE_ATTRIBUTES (fntype)))
    return AARCH64_ISA_MODE_SM_ON;

  if (lookup_attribute ("arm", "streaming_compatible",
			TYPE_ATTRIBUTES (fntype)))
    return 0;

  return AARCH64_ISA_MODE_SM_OFF;
}

/* Return state flags that describe whether and how functions of type
   FNTYPE share state STATE_NAME with their callers.  */

static unsigned int
aarch64_fntype_shared_flags (const_tree fntype, const char *state_name)
{
  return aarch64_lookup_shared_state_flags (TYPE_ATTRIBUTES (fntype),
					    state_name);
}

/* Return the state of PSTATE.ZA on entry to functions of type FNTYPE.  */

static aarch64_isa_mode
aarch64_fntype_pstate_za (const_tree fntype)
{
  if (aarch64_fntype_shared_flags (fntype, "za")
      || aarch64_fntype_shared_flags (fntype, "zt0"))
    return AARCH64_ISA_MODE_ZA_ON;

  return 0;
}

/* Return the ISA mode on entry to functions of type FNTYPE.  */

static aarch64_isa_mode
aarch64_fntype_isa_mode (const_tree fntype)
{
  return (aarch64_fntype_pstate_sm (fntype)
	  | aarch64_fntype_pstate_za (fntype));
}
/* Return true if FNDECL uses streaming mode internally, as an
   implementation choice.  */

static bool
aarch64_fndecl_is_locally_streaming (const_tree fndecl)
{
  return lookup_attribute ("arm", "locally_streaming",
			   DECL_ATTRIBUTES (fndecl));
}

/* Return the state of PSTATE.SM when compiling the body of
   function FNDECL.  This might be different from the state of
   PSTATE.SM on entry.  */

static aarch64_isa_mode
aarch64_fndecl_pstate_sm (const_tree fndecl)
{
  if (aarch64_fndecl_is_locally_streaming (fndecl))
    return AARCH64_ISA_MODE_SM_ON;

  return aarch64_fntype_pstate_sm (TREE_TYPE (fndecl));
}

/* Return true if function FNDECL has state STATE_NAME, either by creating
   new state itself or by sharing state with callers.  */

static bool
aarch64_fndecl_has_state (tree fndecl, const char *state_name)
{
  return (aarch64_fndecl_has_new_state (fndecl, state_name)
	  || aarch64_fntype_shared_flags (TREE_TYPE (fndecl),
					  state_name) != 0);
}

/* Return the state of PSTATE.ZA when compiling the body of function FNDECL.
   This might be different from the state of PSTATE.ZA on entry.  */

static aarch64_isa_mode
aarch64_fndecl_pstate_za (const_tree fndecl)
{
  if (aarch64_fndecl_has_new_state (fndecl, "za")
      || aarch64_fndecl_has_new_state (fndecl, "zt0"))
    return AARCH64_ISA_MODE_ZA_ON;

  return aarch64_fntype_pstate_za (TREE_TYPE (fndecl));
}

/* Return the ISA mode that should be used to compile the body of
   function FNDECL.  */

static aarch64_isa_mode
aarch64_fndecl_isa_mode (const_tree fndecl)
{
  return (aarch64_fndecl_pstate_sm (fndecl)
	  | aarch64_fndecl_pstate_za (fndecl));
}
/* Return the state of PSTATE.SM on entry to the current function.
   This might be different from the state of PSTATE.SM in the function
   body.  */

static aarch64_isa_mode
aarch64_cfun_incoming_pstate_sm ()
{
  return aarch64_fntype_pstate_sm (TREE_TYPE (cfun->decl));
}

/* Return the state of PSTATE.ZA on entry to the current function.
   This might be different from the state of PSTATE.ZA in the function
   body.  */

static aarch64_isa_mode
aarch64_cfun_incoming_pstate_za ()
{
  return aarch64_fntype_pstate_za (TREE_TYPE (cfun->decl));
}

/* Return state flags that describe whether and how the current function shares
   state STATE_NAME with callers.  */

static unsigned int
aarch64_cfun_shared_flags (const char *state_name)
{
  return aarch64_fntype_shared_flags (TREE_TYPE (cfun->decl), state_name);
}

/* Return true if the current function creates new state of type STATE_NAME
   (as opposed to sharing the state with its callers or ignoring the state
   altogether).  */

static bool
aarch64_cfun_has_new_state (const char *state_name)
{
  return aarch64_fndecl_has_new_state (cfun->decl, state_name);
}

/* Return true if PSTATE.SM is 1 in the body of the current function,
   but is not guaranteed to be 1 on entry.  */

static bool
aarch64_cfun_enables_pstate_sm ()
{
  return (aarch64_fndecl_is_locally_streaming (cfun->decl)
	  && aarch64_cfun_incoming_pstate_sm () != AARCH64_ISA_MODE_SM_ON);
}

/* Return true if the current function has state STATE_NAME, either by
   creating new state itself or by sharing state with callers.  */

static bool
aarch64_cfun_has_state (const char *state_name)
{
  return aarch64_fndecl_has_state (cfun->decl, state_name);
}

/* Return true if a call from the current function to a function with
   ISA mode CALLEE_MODE would involve a change to PSTATE.SM around
   the BL instruction.  */

static bool
aarch64_call_switches_pstate_sm (aarch64_isa_mode callee_mode)
{
  return (bool) (callee_mode & ~AARCH64_ISA_MODE & AARCH64_ISA_MODE_SM_STATE);
}
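
/* For example, a caller whose body runs with PSTATE.SM known to be 0
   that calls an arm::streaming callee sees AARCH64_ISA_MODE_SM_ON in
   CALLEE_MODE but not in AARCH64_ISA_MODE, so the test above is true
   and the call must be wrapped in an SMSTART SM/SMSTOP SM pair.  */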
/* Implement TARGET_COMPATIBLE_VECTOR_TYPES_P.  */

static bool
aarch64_compatible_vector_types_p (const_tree type1, const_tree type2)
{
  return (aarch64_sve::builtin_type_p (type1)
	  == aarch64_sve::builtin_type_p (type2));
}

/* Return true if we should emit CFI for register REGNO.  */

static bool
aarch64_emit_cfi_for_reg_p (unsigned int regno)
{
  return (GP_REGNUM_P (regno)
	  || !default_function_abi.clobbers_full_reg_p (regno));
}

/* Return the mode we should use to save and restore register REGNO.  */

static machine_mode
aarch64_reg_save_mode (unsigned int regno)
{
  if (GP_REGNUM_P (regno) || regno == VG_REGNUM)
    return DImode;

  if (FP_REGNUM_P (regno))
    switch (crtl->abi->id ())
      {
      case ARM_PCS_AAPCS64:
	/* Only the low 64 bits are saved by the base PCS.  */
	return DFmode;

      case ARM_PCS_SIMD:
	/* The vector PCS saves the low 128 bits (which is the full
	   register on non-SVE targets).  */
	return V16QImode;

      case ARM_PCS_SVE:
	/* Use vectors of DImode for registers that need frame
	   information, so that the first 64 bytes of the save slot
	   are always the equivalent of what storing D<n> would give.  */
	if (aarch64_emit_cfi_for_reg_p (regno))
	  return VNx2DImode;

	/* Use vectors of bytes otherwise, so that the layout is
	   endian-agnostic, and so that we can use LDR and STR for
	   big-endian targets.  */
	return VNx16QImode;

      case ARM_PCS_TLSDESC:
      case ARM_PCS_UNKNOWN:
	break;
      }

  if (PR_REGNUM_P (regno))
    /* Save the full predicate register.  */
    return VNx16BImode;

  gcc_unreachable ();
}
/* Given the ISA mode on entry to a callee and the ABI of the callee,
   return the CONST_INT that should be placed in an UNSPEC_CALLEE_ABI rtx.  */

rtx
aarch64_gen_callee_cookie (aarch64_isa_mode isa_mode, arm_pcs pcs_variant)
{
  return gen_int_mode ((unsigned int) isa_mode
		       | (unsigned int) pcs_variant << AARCH64_NUM_ISA_MODES,
		       DImode);
}

/* COOKIE is a CONST_INT from an UNSPEC_CALLEE_ABI rtx.  Return the
   callee's ABI.  */

static const predefined_function_abi &
aarch64_callee_abi (rtx cookie)
{
  return function_abis[UINTVAL (cookie) >> AARCH64_NUM_ISA_MODES];
}

/* COOKIE is a CONST_INT from an UNSPEC_CALLEE_ABI rtx.  Return the
   required ISA mode on entry to the callee, which is also the ISA
   mode on return from the callee.  */

static aarch64_isa_mode
aarch64_callee_isa_mode (rtx cookie)
{
  return UINTVAL (cookie) & ((1 << AARCH64_NUM_ISA_MODES) - 1);
}

/* INSN is a call instruction.  Return the CONST_INT stored in its
   UNSPEC_CALLEE_ABI rtx.  */

static rtx
aarch64_insn_callee_cookie (const rtx_insn *insn)
{
  rtx pat = PATTERN (insn);
  gcc_assert (GET_CODE (pat) == PARALLEL);
  rtx unspec = XVECEXP (pat, 0, 1);
  gcc_assert (GET_CODE (unspec) == UNSPEC
	      && XINT (unspec, 1) == UNSPEC_CALLEE_ABI);
  return XVECEXP (unspec, 0, 0);
}

/* Implement TARGET_INSN_CALLEE_ABI.  */

const predefined_function_abi &
aarch64_insn_callee_abi (const rtx_insn *insn)
{
  return aarch64_callee_abi (aarch64_insn_callee_cookie (insn));
}

/* INSN is a call instruction.  Return the required ISA mode on entry to
   the callee, which is also the ISA mode on return from the callee.  */

static aarch64_isa_mode
aarch64_insn_callee_isa_mode (const rtx_insn *insn)
{
  return aarch64_callee_isa_mode (aarch64_insn_callee_cookie (insn));
}
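
/* A worked example of the cookie encoding above: for a callee that
   follows ARM_PCS_SVE and must be entered with streaming mode enabled,
   aarch64_gen_callee_cookie returns
     (ARM_PCS_SVE << AARCH64_NUM_ISA_MODES) | AARCH64_ISA_MODE_SM_ON,
   from which aarch64_callee_abi recovers the ABI (upper bits) and
   aarch64_callee_isa_mode recovers the ISA mode (low bits).  */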
/* Implement TARGET_HARD_REGNO_CALL_PART_CLOBBERED.  The callee only saves
   the lower 64 bits of a 128-bit register.  Tell the compiler the callee
   clobbers the top 64 bits when restoring the bottom 64 bits.  */

static bool
aarch64_hard_regno_call_part_clobbered (unsigned int abi_id,
					unsigned int regno,
					machine_mode mode)
{
  if (FP_REGNUM_P (regno) && abi_id != ARM_PCS_SVE)
    {
      poly_int64 per_register_size = GET_MODE_SIZE (mode);
      unsigned int nregs = hard_regno_nregs (regno, mode);
      if (nregs > 1)
	per_register_size = exact_div (per_register_size, nregs);
      if (abi_id == ARM_PCS_SIMD || abi_id == ARM_PCS_TLSDESC)
	return maybe_gt (per_register_size, 16);
      return maybe_gt (per_register_size, 8);
    }
  return false;
}
/* Implement REGMODE_NATURAL_SIZE.  */
poly_uint64
aarch64_regmode_natural_size (machine_mode mode)
{
  /* The natural size for SVE data modes is one SVE data vector,
     and similarly for predicates.  We can't independently modify
     anything smaller than that.  */
  /* ??? For now, only do this for variable-width SVE registers.
     Doing it for constant-sized registers breaks lower-subreg.cc.  */
  /* ??? And once that's fixed, we should probably have similar
     code for Advanced SIMD.  */
  if (!aarch64_sve_vg.is_constant ())
    {
      unsigned int vec_flags = aarch64_classify_vector_mode (mode);
      if (vec_flags & VEC_SVE_PRED)
	return BYTES_PER_SVE_PRED;
      if (vec_flags & VEC_SVE_DATA)
	return BYTES_PER_SVE_VECTOR;
    }
  return UNITS_PER_WORD;
}
/* Implement HARD_REGNO_CALLER_SAVE_MODE.  */
machine_mode
aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned,
				     machine_mode mode)
{
  /* The predicate mode determines which bits are significant and
     which are "don't care".  Decreasing the number of lanes would
     lose data while increasing the number of lanes would make bits
     unnecessarily significant.  */
  if (PR_REGNUM_P (regno))
    return mode;
  if (known_lt (GET_MODE_SIZE (mode), 4)
      && REG_CAN_CHANGE_MODE_P (regno, mode, SImode)
      && REG_CAN_CHANGE_MODE_P (regno, SImode, mode))
    return SImode;
  return mode;
}

/* Return true if I's bits are consecutive ones from the MSB.  */
bool
aarch64_high_bits_all_ones_p (HOST_WIDE_INT i)
{
  return exact_log2 (-i) != HOST_WIDE_INT_M1;
}
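
/* For instance, 0xffff000000000000 (== -(HOST_WIDE_INT_1 << 48)) and -1
   satisfy aarch64_high_bits_all_ones_p, whereas 0x00ff000000000000 does
   not, because its negation is not a power of two.  */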
/* Implement TARGET_CONSTANT_ALIGNMENT.  Make strings word-aligned so
   that strcpy from constants will be faster.  */

static HOST_WIDE_INT
aarch64_constant_alignment (const_tree exp, HOST_WIDE_INT align)
{
  if (TREE_CODE (exp) == STRING_CST && !optimize_size)
    return MAX (align, BITS_PER_WORD);
  return align;
}

/* Return true if calls to DECL should be treated as
   long-calls (ie called via a register).  */
static bool
aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED)
{
  return false;
}

/* Return true if calls to symbol-ref SYM should be treated as
   long-calls (ie called via a register).  */
bool
aarch64_is_long_call_p (rtx sym)
{
  return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym));
}

/* Return true if calls to symbol-ref SYM should not go through
   plt stubs.  */

bool
aarch64_is_noplt_call_p (rtx sym)
{
  const_tree decl = SYMBOL_REF_DECL (sym);

  if (flag_pic
      && decl
      && (!flag_plt
	  || lookup_attribute ("noplt", DECL_ATTRIBUTES (decl)))
      && !targetm.binds_local_p (decl))
    return true;

  return false;
}
/* Emit an insn that's a simple single-set.  Both the operands must be
   known to be valid.  */
inline static rtx_insn *
emit_set_insn (rtx x, rtx y)
{
  return emit_insn (gen_rtx_SET (x, y));
}

/* X and Y are two things to compare using CODE.  Emit the compare insn and
   return the rtx for register 0 in the proper mode.  */
rtx
aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
{
  machine_mode cmp_mode = GET_MODE (x);
  machine_mode cc_mode;
  rtx cc_reg;

  if (cmp_mode == TImode)
    {
      gcc_assert (code == NE);

      cc_mode = CC_NZmode;
      cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);

      rtx x_lo = operand_subword (x, 0, 0, TImode);
      rtx y_lo = operand_subword (y, 0, 0, TImode);
      emit_set_insn (cc_reg, gen_rtx_COMPARE (cc_mode, x_lo, y_lo));

      rtx x_hi = operand_subword (x, 1, 0, TImode);
      rtx y_hi = operand_subword (y, 1, 0, TImode);
      emit_insn (gen_ccmpccdi (cc_reg, cc_reg, x_hi, y_hi,
			       gen_rtx_EQ (cc_mode, cc_reg, const0_rtx),
			       GEN_INT (AARCH64_EQ)));
    }
  else
    {
      cc_mode = SELECT_CC_MODE (code, x, y);
      cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);
      emit_set_insn (cc_reg, gen_rtx_COMPARE (cc_mode, x, y));
    }
  return cc_reg;
}
/* Similarly, but maybe zero-extend Y if Y_MODE < SImode.  */

rtx
aarch64_gen_compare_reg_maybe_ze (RTX_CODE code, rtx x, rtx y,
				  machine_mode y_mode)
{
  if (y_mode == E_QImode || y_mode == E_HImode)
    {
      if (CONST_INT_P (y))
	{
	  y = GEN_INT (INTVAL (y) & GET_MODE_MASK (y_mode));
	  y_mode = SImode;
	}
      else
	{
	  rtx t, cc_reg;
	  machine_mode cc_mode;

	  t = gen_rtx_ZERO_EXTEND (SImode, y);
	  t = gen_rtx_COMPARE (CC_SWPmode, t, x);
	  cc_mode = CC_SWPmode;
	  cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);
	  emit_set_insn (cc_reg, t);
	  return cc_reg;
	}
    }

  if (!aarch64_plus_operand (y, y_mode))
    y = force_reg (y_mode, y);

  return aarch64_gen_compare_reg (code, x, y);
}
/* Generate conditional branch to LABEL, comparing X to 0 using CODE.
   Return the jump instruction.  */

rtx
aarch64_gen_compare_zero_and_branch (rtx_code code, rtx x,
				     rtx_code_label *label)
{
  if (aarch64_track_speculation)
    {
      /* Emit an explicit compare instruction, so that we can correctly
	 track the condition codes.  */
      rtx cc_reg = aarch64_gen_compare_reg (code, x, const0_rtx);
      x = gen_rtx_fmt_ee (code, GET_MODE (cc_reg), cc_reg, const0_rtx);
    }
  else
    x = gen_rtx_fmt_ee (code, VOIDmode, x, const0_rtx);

  x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
			    gen_rtx_LABEL_REF (Pmode, label), pc_rtx);
  return gen_rtx_SET (pc_rtx, x);
}

/* Return an rtx that branches to LABEL based on the value of bit BITNUM of X.
   If CODE is NE, it branches to LABEL when the bit is set; if CODE is EQ,
   it branches to LABEL when the bit is clear.  */

rtx
aarch64_gen_test_and_branch (rtx_code code, rtx x, int bitnum,
			     rtx_code_label *label)
{
  auto mode = GET_MODE (x);
  if (aarch64_track_speculation)
    {
      auto mask = gen_int_mode (HOST_WIDE_INT_1U << bitnum, mode);
      emit_insn (gen_aarch64_and3nr_compare0 (mode, x, mask));
      rtx cc_reg = gen_rtx_REG (CC_NZVmode, CC_REGNUM);
      rtx x = gen_rtx_fmt_ee (code, CC_NZVmode, cc_reg, const0_rtx);
      return gen_condjump (x, cc_reg, label);
    }
  return gen_aarch64_tb (code, mode, mode,
			 x, gen_int_mode (bitnum, mode), label);
}
/* Consider the operation:

     OPERANDS[0] = CODE (OPERANDS[1], OPERANDS[2]) + OPERANDS[3]

   where:

   - CODE is [SU]MAX or [SU]MIN
   - OPERANDS[2] and OPERANDS[3] are constant integers
   - OPERANDS[3] is a positive or negative shifted 12-bit immediate
   - all operands have mode MODE

   Decide whether it is possible to implement the operation using:

     SUBS <tmp>, OPERANDS[1], -OPERANDS[3]
   or
     ADDS <tmp>, OPERANDS[1], OPERANDS[3]

   followed by:

     <insn> OPERANDS[0], <tmp>, [wx]zr, <cond>

   where <insn> is one of CSEL, CSINV or CSINC.  Return true if so.
   If GENERATE_P is true, also update OPERANDS as follows:

     OPERANDS[4] = -OPERANDS[3]
     OPERANDS[5] = the rtl condition representing <cond>
     OPERANDS[6] = <tmp>
     OPERANDS[7] = 0 for CSEL, -1 for CSINV or 1 for CSINC.  */
bool
aarch64_maxmin_plus_const (rtx_code code, rtx *operands, bool generate_p)
{
  signop sgn = (code == UMAX || code == UMIN ? UNSIGNED : SIGNED);
  rtx dst = operands[0];
  rtx maxmin_op = operands[2];
  rtx add_op = operands[3];
  machine_mode mode = GET_MODE (dst);

  /* max (x, y) - z == (x >= y + 1 ? x : y) - z
		    == (x >= y ? x : y) - z
		    == (x > y ? x : y) - z
		    == (x > y - 1 ? x : y) - z

     min (x, y) - z == (x <= y - 1 ? x : y) - z
		    == (x <= y ? x : y) - z
		    == (x < y ? x : y) - z
		    == (x < y + 1 ? x : y) - z

     Check whether z is in { y - 1, y, y + 1 } and pick the form(s) for
     which x is compared with z.  Set DIFF to y - z.  Thus the supported
     combinations are as follows, with DIFF being the value after the ":":

     max (x, y) - z == x >= y + 1 ? x - (y + 1) : -1   [z == y + 1]
		    == x >= y ? x - y : 0              [z == y]
		    == x > y ? x - y : 0               [z == y]
		    == x > y - 1 ? x - (y - 1) : 1     [z == y - 1]

     min (x, y) - z == x <= y - 1 ? x - (y - 1) : 1    [z == y - 1]
		    == x <= y ? x - y : 0              [z == y]
		    == x < y ? x - y : 0               [z == y]
		    == x < y + 1 ? x - (y + 1) : -1    [z == y + 1].  */
  auto maxmin_val = rtx_mode_t (maxmin_op, mode);
  auto add_val = rtx_mode_t (add_op, mode);
  auto sub_val = wi::neg (add_val);
  auto diff = wi::sub (maxmin_val, sub_val);
  if (!(diff == 0
	|| (diff == 1 && wi::gt_p (maxmin_val, sub_val, sgn))
	|| (diff == -1 && wi::lt_p (maxmin_val, sub_val, sgn))))
    return false;

  if (!generate_p)
    return true;

  rtx_code cmp;
  switch (code)
    {
    case SMAX:
      cmp = diff == 1 ? GT : GE;
      break;
    case UMAX:
      cmp = diff == 1 ? GTU : GEU;
      break;
    case SMIN:
      cmp = diff == -1 ? LT : LE;
      break;
    case UMIN:
      cmp = diff == -1 ? LTU : LEU;
      break;
    default:
      gcc_unreachable ();
    }
  rtx cc = gen_rtx_REG (CCmode, CC_REGNUM);

  operands[4] = immed_wide_int_const (sub_val, mode);
  operands[5] = gen_rtx_fmt_ee (cmp, VOIDmode, cc, const0_rtx);
  if (can_create_pseudo_p ())
    operands[6] = gen_reg_rtx (mode);
  else
    operands[6] = dst;
  operands[7] = immed_wide_int_const (diff, mode);

  return true;
}
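
/* A worked example of the transformation above: for
     OPERANDS[0] = SMAX (OPERANDS[1], 7) + (-8)
   we have sub_val == 8 and diff == -1, which is accepted because
   7 < 8 (signed).  The expansion is

     subs    <tmp>, <op1>, #8
     csinv   <op0>, <tmp>, xzr, ge

   i.e. <op1> - 8 when <op1> >= 8, and -1 (== 7 - 8) otherwise,
   with OPERANDS[7] == -1 selecting CSINV.  */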
/* Build the SYMBOL_REF for __tls_get_addr.  */

static GTY(()) rtx tls_get_addr_libfunc;

rtx
aarch64_tls_get_addr (void)
{
  if (!tls_get_addr_libfunc)
    tls_get_addr_libfunc = init_one_libfunc ("__tls_get_addr");
  return tls_get_addr_libfunc;
}

/* Return the TLS model to use for ADDR.  */

static enum tls_model
tls_symbolic_operand_type (rtx addr)
{
  enum tls_model tls_kind = TLS_MODEL_NONE;
  poly_int64 offset;
  addr = strip_offset_and_salt (addr, &offset);
  if (SYMBOL_REF_P (addr))
    tls_kind = SYMBOL_REF_TLS_MODEL (addr);

  return tls_kind;
}
2896 /* We'll allow lo_sum's in addresses in our legitimate addresses
2897 so that combine would take care of combining addresses where
2898 necessary, but for generation purposes, we'll generate the address
2901 tmp = hi (symbol_ref); adrp x1, foo
2902 dest = lo_sum (tmp, symbol_ref); add dest, x1, :lo_12:foo
2906 adrp x1, :got:foo adrp tmp, :tlsgd:foo
2907 ldr x1, [:got_lo12:foo] add dest, tmp, :tlsgd_lo12:foo
2911 Load TLS symbol, depending on TLS mechanism and TLS access model.
2913 Global Dynamic - Traditional TLS:
2914 adrp tmp, :tlsgd:imm
2915 add dest, tmp, #:tlsgd_lo12:imm
2918 Global Dynamic - TLS Descriptors:
2919 adrp dest, :tlsdesc:imm
2920 ldr tmp, [dest, #:tlsdesc_lo12:imm]
2921 add dest, dest, #:tlsdesc_lo12:imm
2928 adrp tmp, :gottprel:imm
2929 ldr dest, [tmp, #:gottprel_lo12:imm]
2934 add t0, tp, #:tprel_hi12:imm, lsl #12
2935 add t0, t0, #:tprel_lo12_nc:imm
2939 aarch64_load_symref_appropriately (rtx dest
, rtx imm
,
2940 enum aarch64_symbol_type type
)
2943 rtx tmp
= legitimize_pe_coff_symbol (imm
, true);
2946 emit_insn (gen_rtx_SET (dest
, tmp
));
2953 case SYMBOL_SMALL_ABSOLUTE
:
2955 /* In ILP32, the mode of dest can be either SImode or DImode. */
2957 machine_mode mode
= GET_MODE (dest
);
2959 gcc_assert (mode
== Pmode
|| mode
== ptr_mode
);
2961 if (can_create_pseudo_p ())
2962 tmp_reg
= gen_reg_rtx (mode
);
2964 emit_move_insn (tmp_reg
, gen_rtx_HIGH (mode
, copy_rtx (imm
)));
2965 emit_insn (gen_add_losym (dest
, tmp_reg
, imm
));
2969 case SYMBOL_TINY_ABSOLUTE
:
2970 emit_insn (gen_rtx_SET (dest
, imm
));
2973 case SYMBOL_SMALL_GOT_28K
:
2975 machine_mode mode
= GET_MODE (dest
);
2976 rtx gp_rtx
= pic_offset_table_rtx
;
2980 /* NOTE: pic_offset_table_rtx can be NULL_RTX, because we can reach
2981 here before rtl expand. Tree IVOPT will generate rtl pattern to
2982 decide rtx costs, in which case pic_offset_table_rtx is not
2983 initialized. For that case no need to generate the first adrp
2984 instruction as the final cost for global variable access is
2988 /* -fpic for -mcmodel=small allow 32K GOT table size (but we are
2989 using the page base as GOT base, the first page may be wasted,
2990 in the worst scenario, there is only 28K space for GOT).
2992 The generate instruction sequence for accessing global variable
2995 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym]
2997 Only one instruction needed. But we must initialize
2998 pic_offset_table_rtx properly. We generate initialize insn for
2999 every global access, and allow CSE to remove all redundant.
3001 The final instruction sequences will look like the following
3002 for multiply global variables access.
3004 adrp pic_offset_table_rtx, _GLOBAL_OFFSET_TABLE_
3006 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym1]
3007 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym2]
3008 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym3]
3011 rtx s
= gen_rtx_SYMBOL_REF (Pmode
, "_GLOBAL_OFFSET_TABLE_");
3012 crtl
->uses_pic_offset_table
= 1;
3013 emit_move_insn (gp_rtx
, gen_rtx_HIGH (Pmode
, s
));
3015 if (mode
!= GET_MODE (gp_rtx
))
3016 gp_rtx
= gen_lowpart (mode
, gp_rtx
);
3020 if (mode
== ptr_mode
)
3023 insn
= gen_ldr_got_small_28k_di (dest
, gp_rtx
, imm
);
3025 insn
= gen_ldr_got_small_28k_si (dest
, gp_rtx
, imm
);
3027 mem
= XVECEXP (SET_SRC (insn
), 0, 0);
3031 gcc_assert (mode
== Pmode
);
3033 insn
= gen_ldr_got_small_28k_sidi (dest
, gp_rtx
, imm
);
3034 mem
= XVECEXP (XEXP (SET_SRC (insn
), 0), 0, 0);
3037 /* The operand is expected to be MEM. Whenever the related insn
3038 pattern changed, above code which calculate mem should be
3040 gcc_assert (MEM_P (mem
));
3041 MEM_READONLY_P (mem
) = 1;
3042 MEM_NOTRAP_P (mem
) = 1;
3047 case SYMBOL_SMALL_GOT_4G
:
3048 emit_insn (gen_rtx_SET (dest
, imm
));
3051 case SYMBOL_SMALL_TLSGD
:
3054 /* The return type of __tls_get_addr is the C pointer type
3056 rtx result
= gen_rtx_REG (ptr_mode
, R0_REGNUM
);
3059 if (GET_MODE (dest
) != ptr_mode
)
3060 tmp_reg
= can_create_pseudo_p () ? gen_reg_rtx (ptr_mode
) : result
;
3063 if (ptr_mode
== SImode
)
3064 aarch64_emit_call_insn (gen_tlsgd_small_si (result
, imm
));
3066 aarch64_emit_call_insn (gen_tlsgd_small_di (result
, imm
));
3067 insns
= get_insns ();
3070 RTL_CONST_CALL_P (insns
) = 1;
3071 emit_libcall_block (insns
, tmp_reg
, result
, imm
);
3072 /* Convert back to the mode of the dest adding a zero_extend
3073 from SImode (ptr_mode) to DImode (Pmode). */
3074 if (dest
!= tmp_reg
)
3075 convert_move (dest
, tmp_reg
, true);
3079 case SYMBOL_SMALL_TLSDESC
:
3081 machine_mode mode
= GET_MODE (dest
);
3082 rtx x0
= gen_rtx_REG (mode
, R0_REGNUM
);
3085 gcc_assert (mode
== Pmode
|| mode
== ptr_mode
);
3087 /* In ILP32, the got entry is always of SImode size. Unlike
3088 small GOT, the dest is fixed at reg 0. */
3090 emit_insn (gen_tlsdesc_small_si (imm
));
3092 emit_insn (gen_tlsdesc_small_di (imm
));
3093 tp
= aarch64_load_tp (NULL
);
3096 tp
= gen_lowpart (mode
, tp
);
3098 emit_insn (gen_rtx_SET (dest
, gen_rtx_PLUS (mode
, tp
, x0
)));
3100 set_unique_reg_note (get_last_insn (), REG_EQUIV
, imm
);
3104 case SYMBOL_SMALL_TLSIE
:
3106 /* In ILP32, the mode of dest can be either SImode or DImode,
3107 while the got entry is always of SImode size. The mode of
3108 dest depends on how dest is used: if dest is assigned to a
3109 pointer (e.g. in the memory), it has SImode; it may have
3110 DImode if dest is dereferenced to access the memeory.
3111 This is why we have to handle three different tlsie_small
3112 patterns here (two patterns for ILP32). */
3113 machine_mode mode
= GET_MODE (dest
);
3114 rtx tmp_reg
= gen_reg_rtx (mode
);
3115 rtx tp
= aarch64_load_tp (NULL
);
3117 if (mode
== ptr_mode
)
3120 emit_insn (gen_tlsie_small_di (tmp_reg
, imm
));
3123 emit_insn (gen_tlsie_small_si (tmp_reg
, imm
));
3124 tp
= gen_lowpart (mode
, tp
);
3129 gcc_assert (mode
== Pmode
);
3130 emit_insn (gen_tlsie_small_sidi (tmp_reg
, imm
));
3133 emit_insn (gen_rtx_SET (dest
, gen_rtx_PLUS (mode
, tp
, tmp_reg
)));
3135 set_unique_reg_note (get_last_insn (), REG_EQUIV
, imm
);
3139 case SYMBOL_TLSLE12
:
3140 case SYMBOL_TLSLE24
:
3141 case SYMBOL_TLSLE32
:
3142 case SYMBOL_TLSLE48
:
3144 machine_mode mode
= GET_MODE (dest
);
3145 rtx tp
= aarch64_load_tp (NULL
);
3148 tp
= gen_lowpart (mode
, tp
);
3152 case SYMBOL_TLSLE12
:
3153 emit_insn ((mode
== DImode
? gen_tlsle12_di
: gen_tlsle12_si
)
3156 case SYMBOL_TLSLE24
:
3157 emit_insn ((mode
== DImode
? gen_tlsle24_di
: gen_tlsle24_si
)
3160 case SYMBOL_TLSLE32
:
3161 emit_insn ((mode
== DImode
? gen_tlsle32_di
: gen_tlsle32_si
)
3163 emit_insn ((mode
== DImode
? gen_adddi3
: gen_addsi3
)
3166 case SYMBOL_TLSLE48
:
3167 emit_insn ((mode
== DImode
? gen_tlsle48_di
: gen_tlsle48_si
)
3169 emit_insn ((mode
== DImode
? gen_adddi3
: gen_addsi3
)
3177 set_unique_reg_note (get_last_insn (), REG_EQUIV
, imm
);
3181 case SYMBOL_TINY_GOT
:
3184 machine_mode mode
= GET_MODE (dest
);
3186 if (mode
== ptr_mode
)
3187 insn
= gen_ldr_got_tiny (mode
, dest
, imm
);
3190 gcc_assert (mode
== Pmode
);
3191 insn
= gen_ldr_got_tiny_sidi (dest
, imm
);
3198 case SYMBOL_TINY_TLSIE
:
3200 machine_mode mode
= GET_MODE (dest
);
3201 rtx tp
= aarch64_load_tp (NULL
);
3203 if (mode
== ptr_mode
)
3206 emit_insn (gen_tlsie_tiny_di (dest
, imm
, tp
));
3209 tp
= gen_lowpart (mode
, tp
);
3210 emit_insn (gen_tlsie_tiny_si (dest
, imm
, tp
));
3215 gcc_assert (mode
== Pmode
);
3216 emit_insn (gen_tlsie_tiny_sidi (dest
, imm
, tp
));
3220 set_unique_reg_note (get_last_insn (), REG_EQUIV
, imm
);
/* Emit a move from SRC to DEST.  Assume that the move expanders can
   handle all moves if !can_create_pseudo_p ().  The distinction is
   important because, unlike emit_move_insn, the move expanders know
   how to force Pmode objects into the constant pool even when the
   constant pool address is not itself legitimate.  */
rtx_insn *
aarch64_emit_move (rtx dest, rtx src)
{
  return (can_create_pseudo_p ()
	  ? emit_move_insn (dest, src)
	  : emit_move_insn_1 (dest, src));
}

/* Apply UNOPTAB to OP and store the result in DEST.  */

static void
aarch64_emit_unop (rtx dest, optab unoptab, rtx op)
{
  rtx tmp = expand_unop (GET_MODE (dest), unoptab, op, dest, 0);
  if (dest != tmp)
    emit_move_insn (dest, tmp);
}

/* Apply BINOPTAB to OP0 and OP1 and store the result in DEST.  */

static void
aarch64_emit_binop (rtx dest, optab binoptab, rtx op0, rtx op1)
{
  rtx tmp = expand_binop (GET_MODE (dest), binoptab, op0, op1, dest, 0,
			  OPTAB_DIRECT);
  if (dest != tmp)
    emit_move_insn (dest, tmp);
}

/* Split a move from SRC to DST into two moves of mode SINGLE_MODE.  */

void
aarch64_split_double_move (rtx dst, rtx src, machine_mode single_mode)
{
  machine_mode mode = GET_MODE (dst);

  rtx dst0 = simplify_gen_subreg (single_mode, dst, mode, 0);
  rtx dst1 = simplify_gen_subreg (single_mode, dst, mode,
				  GET_MODE_SIZE (single_mode));
  rtx src0 = simplify_gen_subreg (single_mode, src, mode, 0);
  rtx src1 = simplify_gen_subreg (single_mode, src, mode,
				  GET_MODE_SIZE (single_mode));

  /* At most one pairing may overlap.  */
  if (reg_overlap_mentioned_p (dst0, src1))
    {
      aarch64_emit_move (dst1, src1);
      aarch64_emit_move (dst0, src0);
    }
  else
    {
      aarch64_emit_move (dst0, src0);
      aarch64_emit_move (dst1, src1);
    }
}
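
/* As an illustration of the overlap handling above: splitting a TImode
   copy from the register pair {x1, x2} to {x2, x3} would clobber x2
   (the source of the high half) if the low halves were moved first,
   so the overlap check emits the high-half move before the low-half
   move in that case.  */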
/* Split a 128-bit move operation into two 64-bit move operations,
   taking care to handle partial overlap of register to register
   copies.  Special cases are needed when moving between GP regs and
   FP regs.  SRC can be a register, constant or memory; DST a register
   or memory.  If either operand is memory it must not have any side
   effects.  */
void
aarch64_split_128bit_move (rtx dst, rtx src)
{
  machine_mode mode = GET_MODE (dst);

  gcc_assert (mode == TImode || mode == TFmode || mode == TDmode);
  gcc_assert (!(side_effects_p (src) || side_effects_p (dst)));
  gcc_assert (mode == GET_MODE (src) || GET_MODE (src) == VOIDmode);

  if (REG_P (dst) && REG_P (src))
    {
      int src_regno = REGNO (src);
      int dst_regno = REGNO (dst);

      /* Handle FP <-> GP regs.  */
      if (FP_REGNUM_P (dst_regno) && GP_REGNUM_P (src_regno))
	{
	  rtx src_lo = gen_lowpart (word_mode, src);
	  rtx src_hi = gen_highpart (word_mode, src);

	  emit_insn (gen_aarch64_movlow_di (mode, dst, src_lo));
	  emit_insn (gen_aarch64_movhigh_di (mode, dst, src_hi));
	  return;
	}
      else if (GP_REGNUM_P (dst_regno) && FP_REGNUM_P (src_regno))
	{
	  rtx dst_lo = gen_lowpart (word_mode, dst);
	  rtx dst_hi = gen_highpart (word_mode, dst);

	  emit_insn (gen_aarch64_movdi_low (mode, dst_lo, src));
	  emit_insn (gen_aarch64_movdi_high (mode, dst_hi, src));
	  return;
	}
    }

  aarch64_split_double_move (dst, src, word_mode);
}

/* Return true if we should split a move from 128-bit value SRC
   to 128-bit register DEST.  */

bool
aarch64_split_128bit_move_p (rtx dst, rtx src)
{
  if (FP_REGNUM_P (REGNO (dst)))
    return REG_P (src) && !FP_REGNUM_P (REGNO (src));
  /* All moves to GPRs need to be split.  */
  return true;
}

/* Split a complex SIMD move.  */

void
aarch64_split_simd_move (rtx dst, rtx src)
{
  machine_mode src_mode = GET_MODE (src);
  machine_mode dst_mode = GET_MODE (dst);

  gcc_assert (VECTOR_MODE_P (dst_mode));

  if (REG_P (dst) && REG_P (src))
    {
      gcc_assert (VECTOR_MODE_P (src_mode));
      emit_insn (gen_aarch64_split_simd_mov (src_mode, dst, src));
    }
}
/* Return a register that contains SVE value X reinterpreted as SVE mode MODE.
   The semantics are those of svreinterpret rather than those of subregs;
   see the comment at the head of aarch64-sve.md for details about the
   difference.  */

rtx
aarch64_sve_reinterpret (machine_mode mode, rtx x)
{
  if (GET_MODE (x) == mode)
    return x;

  /* can_change_mode_class must only return true if subregs and svreinterprets
     have the same semantics.  */
  if (targetm.can_change_mode_class (GET_MODE (x), mode, FP_REGS))
    return force_lowpart_subreg (mode, x, GET_MODE (x));

  rtx res = gen_reg_rtx (mode);
  x = force_reg (GET_MODE (x), x);
  emit_insn (gen_aarch64_sve_reinterpret (mode, res, x));
  return res;
}

bool
aarch64_zero_extend_const_eq (machine_mode xmode, rtx x,
			      machine_mode ymode, rtx y)
{
  rtx r = simplify_const_unary_operation (ZERO_EXTEND, xmode, y, ymode);
  gcc_assert (r != NULL);
  return rtx_equal_p (x, r);
}

/* Return TARGET if it is nonnull and a register of mode MODE.
   Otherwise, return a fresh register of mode MODE if we can,
   or TARGET reinterpreted as MODE if we can't.  */

static rtx
aarch64_target_reg (rtx target, machine_mode mode)
{
  if (target && REG_P (target) && GET_MODE (target) == mode)
    return target;
  if (!can_create_pseudo_p ())
    {
      gcc_assert (target);
      return gen_lowpart (mode, target);
    }
  return gen_reg_rtx (mode);
}

/* Return a register that contains the constant in BUILDER, given that
   the constant is a legitimate move operand.  Use TARGET as the register
   if it is nonnull and convenient.  */

static rtx
aarch64_emit_set_immediate (rtx target, rtx_vector_builder &builder)
{
  rtx src = builder.build ();
  target = aarch64_target_reg (target, GET_MODE (src));
  emit_insn (gen_rtx_SET (target, src));
  return target;
}
static rtx
aarch64_force_temporary (machine_mode mode, rtx x, rtx value)
{
  if (can_create_pseudo_p ())
    return force_reg (mode, value);
  else
    {
      gcc_assert (x);
      aarch64_emit_move (x, value);
      return x;
    }
}

/* Return true if predicate value X is a constant in which every element
   is a CONST_INT.  When returning true, describe X in BUILDER as a VNx16BI
   value, i.e. as a predicate in which all bits are significant.  */

static bool
aarch64_get_sve_pred_bits (rtx_vector_builder &builder, rtx x)
{
  if (!CONST_VECTOR_P (x))
    return false;

  unsigned int factor = vector_element_size (GET_MODE_NUNITS (VNx16BImode),
					     GET_MODE_NUNITS (GET_MODE (x)));
  unsigned int npatterns = CONST_VECTOR_NPATTERNS (x) * factor;
  unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (x);
  builder.new_vector (VNx16BImode, npatterns, nelts_per_pattern);

  unsigned int nelts = const_vector_encoded_nelts (x);
  for (unsigned int i = 0; i < nelts; ++i)
    {
      rtx elt = CONST_VECTOR_ENCODED_ELT (x, i);
      if (!CONST_INT_P (elt))
	return false;

      builder.quick_push (elt);
      for (unsigned int j = 1; j < factor; ++j)
	builder.quick_push (const0_rtx);
    }
  builder.finalize ();
  return true;
}
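
/* For example, a VNx8BImode constant (one significant bit per .H
   element) has FACTOR == 2 above, so each original element is pushed
   followed by one zero, giving the equivalent VNx16BImode value in
   which every bit is significant.  */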
/* BUILDER contains a predicate constant of mode VNx16BI.  Return the
   widest predicate element size it can have (that is, the largest size
   for which each element would still be 0 or 1).  */

unsigned int
aarch64_widest_sve_pred_elt_size (rtx_vector_builder &builder)
{
  /* Start with the most optimistic assumption: that we only need
     one bit per pattern.  This is what we will use if only the first
     bit in each pattern is ever set.  */
  unsigned int mask = GET_MODE_SIZE (DImode);
  mask |= builder.npatterns ();

  /* Look for set bits.  */
  unsigned int nelts = builder.encoded_nelts ();
  for (unsigned int i = 1; i < nelts; ++i)
    if (INTVAL (builder.elt (i)) != 0)
      {
	if (i & 1)
	  return 1;
	mask |= i;
      }
  return mask & -mask;
}

/* If VNx16BImode rtx X is a canonical PTRUE for a predicate mode,
   return that predicate mode, otherwise return opt_machine_mode ().  */

opt_machine_mode
aarch64_ptrue_all_mode (rtx x)
{
  gcc_assert (GET_MODE (x) == VNx16BImode);
  if (!CONST_VECTOR_P (x)
      || !CONST_VECTOR_DUPLICATE_P (x)
      || !CONST_INT_P (CONST_VECTOR_ENCODED_ELT (x, 0))
      || INTVAL (CONST_VECTOR_ENCODED_ELT (x, 0)) == 0)
    return opt_machine_mode ();

  unsigned int nelts = const_vector_encoded_nelts (x);
  for (unsigned int i = 1; i < nelts; ++i)
    if (CONST_VECTOR_ENCODED_ELT (x, i) != const0_rtx)
      return opt_machine_mode ();

  return aarch64_sve_pred_mode (nelts);
}
/* BUILDER is a predicate constant of mode VNx16BI.  Consider the value
   that the constant would have with predicate element size ELT_SIZE
   (ignoring the upper bits in each element) and return:

   * -1 if all bits are set
   * N if the predicate has N leading set bits followed by all clear bits
   * 0 if the predicate does not have any of these forms.  */

int
aarch64_partial_ptrue_length (rtx_vector_builder &builder,
			      unsigned int elt_size)
{
  /* If nelts_per_pattern is 3, we have set bits followed by clear bits
     followed by set bits.  */
  if (builder.nelts_per_pattern () == 3)
    return 0;

  /* Skip over leading set bits.  */
  unsigned int nelts = builder.encoded_nelts ();
  unsigned int i = 0;
  for (; i < nelts; i += elt_size)
    if (INTVAL (builder.elt (i)) == 0)
      break;
  unsigned int vl = i / elt_size;

  /* Check for the all-true case.  */
  if (i == nelts)
    return -1;

  /* If nelts_per_pattern is 1, then either VL is zero, or we have a
     repeating pattern of set bits followed by clear bits.  */
  if (builder.nelts_per_pattern () != 2)
    return 0;

  /* We have a "foreground" value and a duplicated "background" value.
     If the background might repeat and the last set bit belongs to it,
     we might have set bits followed by clear bits followed by set bits.  */
  if (i > builder.npatterns () && maybe_ne (nelts, builder.full_nelts ()))
    return 0;

  /* Make sure that the rest are all clear.  */
  for (; i < nelts; i += elt_size)
    if (INTVAL (builder.elt (i)) != 0)
      return 0;

  return vl;
}
/* See if there is an svpattern that encodes an SVE predicate of mode
   PRED_MODE in which the first VL bits are set and the rest are clear.
   Return the pattern if so, otherwise return AARCH64_NUM_SVPATTERNS.
   A VL of -1 indicates an all-true vector.  */

aarch64_svpattern
aarch64_svpattern_for_vl (machine_mode pred_mode, int vl)
{
  if (vl < 0)
    return AARCH64_SV_ALL;

  if (maybe_gt (vl, GET_MODE_NUNITS (pred_mode)))
    return AARCH64_NUM_SVPATTERNS;

  if (vl >= 1 && vl <= 8)
    return aarch64_svpattern (AARCH64_SV_VL1 + (vl - 1));

  if (vl >= 16 && vl <= 256 && pow2p_hwi (vl))
    return aarch64_svpattern (AARCH64_SV_VL16 + (exact_log2 (vl) - 4));

  int max_vl;
  if (GET_MODE_NUNITS (pred_mode).is_constant (&max_vl))
    {
      if (vl == (max_vl / 3) * 3)
	return AARCH64_SV_MUL3;
      /* These would only trigger for non-power-of-2 lengths.  */
      if (vl == (max_vl & -4))
	return AARCH64_SV_MUL4;
      if (vl == (1 << floor_log2 (max_vl)))
	return AARCH64_SV_POW2;
      if (vl == max_vl)
	return AARCH64_SV_ALL;
    }
  return AARCH64_NUM_SVPATTERNS;
}
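
/* For example, with 256-bit vectors and VNx16BImode (one predicate bit
   per byte element) GET_MODE_NUNITS is 32, so a VL of 30 matches
   (32 / 3) * 3 and maps to AARCH64_SV_MUL3, while a VL of 5 maps
   directly to AARCH64_SV_VL5 and a VL of -1 to AARCH64_SV_ALL.  */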
/* Return a VNx16BImode constant in which every sequence of ELT_SIZE
   bits has the lowest bit set and the upper bits clear.  This is the
   VNx16BImode equivalent of a PTRUE for controlling elements of
   ELT_SIZE bytes.  However, because the constant is VNx16BImode,
   all bits are significant, even the upper zeros.  */

rtx
aarch64_ptrue_all (unsigned int elt_size)
{
  rtx_vector_builder builder (VNx16BImode, elt_size, 1);
  builder.quick_push (const1_rtx);
  for (unsigned int i = 1; i < elt_size; ++i)
    builder.quick_push (const0_rtx);
  return builder.build ();
}
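
/* For example, aarch64_ptrue_all (2) builds the repeating constant
   {1, 0, 1, 0, ...}: the VNx16BImode image of a PTRUE that controls
   halfword (.H) elements, with the upper bit of each 2-bit group
   explicitly zero.  */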
/* Return an all-true predicate register of mode MODE.  */

rtx
aarch64_ptrue_reg (machine_mode mode)
{
  gcc_assert (aarch64_sve_pred_mode_p (mode));
  rtx reg = force_reg (VNx16BImode, CONSTM1_RTX (VNx16BImode));
  return gen_lowpart (mode, reg);
}

/* Return an all-false predicate register of mode MODE.  */

rtx
aarch64_pfalse_reg (machine_mode mode)
{
  gcc_assert (aarch64_sve_pred_mode_p (mode));
  rtx reg = force_reg (VNx16BImode, CONST0_RTX (VNx16BImode));
  return gen_lowpart (mode, reg);
}
/* PRED1[0] is a PTEST predicate and PRED1[1] is an aarch64_sve_ptrue_flag
   for it.  PRED2[0] is the predicate for the instruction whose result
   is tested by the PTEST and PRED2[1] is again an aarch64_sve_ptrue_flag
   for it.  Return true if we can prove that the two predicates are
   equivalent for PTEST purposes; that is, if we can replace PRED2[0]
   with PRED1[0] without changing behavior.  */

bool
aarch64_sve_same_pred_for_ptest_p (rtx *pred1, rtx *pred2)
{
  machine_mode mode = GET_MODE (pred1[0]);
  gcc_assert (aarch64_sve_pred_mode_p (mode)
	      && mode == GET_MODE (pred2[0])
	      && aarch64_sve_ptrue_flag (pred1[1], SImode)
	      && aarch64_sve_ptrue_flag (pred2[1], SImode));

  bool ptrue1_p = (pred1[0] == CONSTM1_RTX (mode)
		   || INTVAL (pred1[1]) == SVE_KNOWN_PTRUE);
  bool ptrue2_p = (pred2[0] == CONSTM1_RTX (mode)
		   || INTVAL (pred2[1]) == SVE_KNOWN_PTRUE);
  return (ptrue1_p && ptrue2_p) || rtx_equal_p (pred1[0], pred2[0]);
}
/* Emit a comparison CMP between OP0 and OP1, both of which have mode
   DATA_MODE, and return the result in a predicate of mode PRED_MODE.
   Use TARGET as the target register if nonnull and convenient.  */

static rtx
aarch64_sve_emit_int_cmp (rtx target, machine_mode pred_mode, rtx_code cmp,
			  machine_mode data_mode, rtx op1, rtx op2)
{
  insn_code icode = code_for_aarch64_pred_cmp (cmp, data_mode);
  expand_operand ops[5];
  create_output_operand (&ops[0], target, pred_mode);
  create_input_operand (&ops[1], CONSTM1_RTX (pred_mode), pred_mode);
  create_integer_operand (&ops[2], SVE_KNOWN_PTRUE);
  create_input_operand (&ops[3], op1, data_mode);
  create_input_operand (&ops[4], op2, data_mode);
  expand_insn (icode, 5, ops);
  return ops[0].value;
}

/* Use a comparison to convert integer vector SRC into MODE, which is
   the corresponding SVE predicate mode.  Use TARGET for the result
   if it's nonnull and convenient.  */

rtx
aarch64_convert_sve_data_to_pred (rtx target, machine_mode mode, rtx src)
{
  machine_mode src_mode = GET_MODE (src);
  return aarch64_sve_emit_int_cmp (target, mode, NE, src_mode,
				   src, CONST0_RTX (src_mode));
}
/* Return the assembly token for svprfop value PRFOP.  */

static const char *
svprfop_token (enum aarch64_svprfop prfop)
{
  switch (prfop)
    {
#define CASE(UPPER, LOWER, VALUE) case AARCH64_SV_##UPPER: return #LOWER;
    AARCH64_FOR_SVPRFOP (CASE)
#undef CASE
    case AARCH64_NUM_SVPRFOPS:
      break;
    }
  gcc_unreachable ();
}

/* Return the assembly string for an SVE prefetch operation with
   mnemonic MNEMONIC, given that PRFOP_RTX is the prefetch operation
   and that SUFFIX is the format for the remaining operands.  */

char *
aarch64_output_sve_prefetch (const char *mnemonic, rtx prfop_rtx,
			     const char *suffix)
{
  static char buffer[128];
  aarch64_svprfop prfop = (aarch64_svprfop) INTVAL (prfop_rtx);
  unsigned int written = snprintf (buffer, sizeof (buffer), "%s\t%s, %s",
				   mnemonic, svprfop_token (prfop), suffix);
  gcc_assert (written < sizeof (buffer));
  return buffer;
}

/* Check whether we can calculate the number of elements in PATTERN
   at compile time, given that there are NELTS_PER_VQ elements per
   128-bit block.  Return the value if so, otherwise return -1.  */

HOST_WIDE_INT
aarch64_fold_sve_cnt_pat (aarch64_svpattern pattern, unsigned int nelts_per_vq)
{
  unsigned int vl, const_vg;
  if (pattern >= AARCH64_SV_VL1 && pattern <= AARCH64_SV_VL8)
    vl = 1 + (pattern - AARCH64_SV_VL1);
  else if (pattern >= AARCH64_SV_VL16 && pattern <= AARCH64_SV_VL256)
    vl = 16 << (pattern - AARCH64_SV_VL16);
  else if (aarch64_sve_vg.is_constant (&const_vg))
    {
      /* There are two vector granules per quadword.  */
      unsigned int nelts = (const_vg / 2) * nelts_per_vq;
      switch (pattern)
	{
	case AARCH64_SV_POW2: return 1 << floor_log2 (nelts);
	case AARCH64_SV_MUL4: return nelts & -4;
	case AARCH64_SV_MUL3: return (nelts / 3) * 3;
	case AARCH64_SV_ALL: return nelts;
	default: gcc_unreachable ();
	}
    }
  else
    return -1;

  /* There are two vector granules per quadword.  */
  poly_uint64 nelts_all = exact_div (aarch64_sve_vg, 2) * nelts_per_vq;
  if (known_le (vl, nelts_all))
    return vl;

  /* Requesting more elements than are available results in a PFALSE.  */
  if (known_gt (vl, nelts_all))
    return 0;

  return -1;
}
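
/* For example, with -msve-vector-bits=256 (so aarch64_sve_vg == 4),
   AARCH64_SV_ALL with NELTS_PER_VQ == 4 folds to (4 / 2) * 4 == 8,
   and AARCH64_SV_VL16 with the same element size folds to 0 because
   16 elements exceed the 8 that are available.  */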
/* Return true if a single CNT[BHWD] instruction can multiply FACTOR
   by the number of 128-bit quadwords in an SVE vector.  */

static bool
aarch64_sve_cnt_factor_p (HOST_WIDE_INT factor)
{
  /* The coefficient must be [1, 16] * {2, 4, 8, 16}.  */
  return (IN_RANGE (factor, 2, 16 * 16)
	  && (factor & 1) == 0
	  && factor <= 16 * (factor & -factor));
}

/* Return true if we can move VALUE into a register using a single
   CNT[BHWD] instruction.  */

bool
aarch64_sve_cnt_immediate_p (poly_int64 value)
{
  HOST_WIDE_INT factor = value.coeffs[0];
  return value.coeffs[1] == factor && aarch64_sve_cnt_factor_p (factor);
}

/* Likewise for rtx X.  */

bool
aarch64_sve_cnt_immediate_p (rtx x)
{
  poly_int64 value;
  return poly_int_rtx_p (x, &value) && aarch64_sve_cnt_immediate_p (value);
}
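
/* For example, a factor of 4 (one CNTW), 64 (CNTB with mul #4) or 256
   (CNTB with mul #16) is accepted above, whereas an odd factor such as
   3 has no single CNT[BHWD] encoding and is rejected.  */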
/* Return the asm string for an instruction with a CNT-like vector size
   operand (a vector pattern followed by a multiplier in the range [1, 16]).
   PREFIX is the mnemonic without the size suffix and OPERANDS is the
   first part of the operands template (the part that comes before the
   vector size itself).  PATTERN is the pattern to use.  FACTOR is the
   number of quadwords.  NELTS_PER_VQ, if nonzero, is the number of elements
   in each quadword.  If it is zero, we can use any element size.  */

static char *
aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
				  aarch64_svpattern pattern,
				  unsigned int factor,
				  unsigned int nelts_per_vq)
{
  static char buffer[sizeof ("sqincd\t%x0, %w0, vl256, mul #16")];

  if (nelts_per_vq == 0)
    /* There is some overlap in the ranges of the four CNT instructions.
       Here we always use the smallest possible element size, so that the
       multiplier is 1 whereever possible.  */
    nelts_per_vq = factor & -factor;
  int shift = std::min (exact_log2 (nelts_per_vq), 4);
  gcc_assert (IN_RANGE (shift, 1, 4));
  char suffix = "dwhb"[shift - 1];

  factor >>= shift;
  unsigned int written;
  if (pattern == AARCH64_SV_ALL && factor == 1)
    written = snprintf (buffer, sizeof (buffer), "%s%c\t%s",
			prefix, suffix, operands);
  else if (factor == 1)
    written = snprintf (buffer, sizeof (buffer), "%s%c\t%s, %s",
			prefix, suffix, operands, svpattern_token (pattern));
  else
    written = snprintf (buffer, sizeof (buffer), "%s%c\t%s, %s, mul #%d",
			prefix, suffix, operands, svpattern_token (pattern),
			factor);
  gcc_assert (written < sizeof (buffer));
  return buffer;
}

/* Return the asm string for an instruction with a CNT-like vector size
   operand (a vector pattern followed by a multiplier in the range [1, 16]).
   PREFIX is the mnemonic without the size suffix and OPERANDS is the
   first part of the operands template (the part that comes before the
   vector size itself).  X is the value of the vector size operand,
   as a polynomial integer rtx; we need to convert this into an "all"
   pattern with a multiplier.  */

char *
aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
				  rtx x)
{
  poly_int64 value = rtx_to_poly_int64 (x);
  gcc_assert (aarch64_sve_cnt_immediate_p (value));
  return aarch64_output_sve_cnt_immediate (prefix, operands, AARCH64_SV_ALL,
					   value.coeffs[1], 0);
}

/* Return the asm string for an instruction with a CNT-like vector size
   operand (a vector pattern followed by a multiplier in the range [1, 16]).
   PREFIX is the mnemonic without the size suffix and OPERANDS is the
   first part of the operands template (the part that comes before the
   vector size itself).  CNT_PAT[0..2] are the operands of the
   UNSPEC_SVE_CNT_PAT; see aarch64_sve_cnt_pat for details.  */

char *
aarch64_output_sve_cnt_pat_immediate (const char *prefix,
				      const char *operands, rtx *cnt_pat)
{
  aarch64_svpattern pattern = (aarch64_svpattern) INTVAL (cnt_pat[0]);
  unsigned int nelts_per_vq = INTVAL (cnt_pat[1]);
  unsigned int factor = INTVAL (cnt_pat[2]) * nelts_per_vq;
  return aarch64_output_sve_cnt_immediate (prefix, operands, pattern,
					   factor, nelts_per_vq);
}
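
/* Sample outputs of aarch64_output_sve_cnt_immediate, for PREFIX "cnt"
   and OPERANDS "%x0":
     pattern ALL, FACTOR 2,  NELTS_PER_VQ 0 -> "cntd\t%x0"
     pattern ALL, FACTOR 32, NELTS_PER_VQ 0 -> "cntb\t%x0, all, mul #2"
     pattern VL4, FACTOR 8,  NELTS_PER_VQ 8 -> "cnth\t%x0, vl4".  */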
/* Return true if we can add X using a single SVE INC or DEC instruction.  */

bool
aarch64_sve_scalar_inc_dec_immediate_p (rtx x)
{
  poly_int64 value;
  return (poly_int_rtx_p (x, &value)
	  && (aarch64_sve_cnt_immediate_p (value)
	      || aarch64_sve_cnt_immediate_p (-value)));
}

/* Return the asm string for adding SVE INC/DEC immediate OFFSET to
   register operand 1.  */

char *
aarch64_output_sve_scalar_inc_dec (rtx offset)
{
  poly_int64 offset_value = rtx_to_poly_int64 (offset);
  gcc_assert (offset_value.coeffs[0] == offset_value.coeffs[1]);
  if (offset_value.coeffs[1] > 0)
    return aarch64_output_sve_cnt_immediate ("inc", "%x0", AARCH64_SV_ALL,
					     offset_value.coeffs[1], 0);
  else
    return aarch64_output_sve_cnt_immediate ("dec", "%x0", AARCH64_SV_ALL,
					     -offset_value.coeffs[1], 0);
}
/* Return true if a single RDVL instruction can multiply FACTOR by the
   number of 128-bit quadwords in an SVE vector.  This is also the
   range of ADDVL.  */

static bool
aarch64_sve_rdvl_addvl_factor_p (HOST_WIDE_INT factor)
{
  return (multiple_p (factor, 16)
	  && IN_RANGE (factor, -32 * 16, 31 * 16));
}

/* Return true if ADDPL can be used to add FACTOR multiplied by the number
   of quadwords in an SVE vector.  */

static bool
aarch64_sve_addpl_factor_p (HOST_WIDE_INT factor)
{
  return (multiple_p (factor, 2)
	  && IN_RANGE (factor, -32 * 2, 31 * 2));
}

/* Return true if we can move VALUE into a register using a single
   RDVL instruction.  */

static bool
aarch64_sve_rdvl_immediate_p (poly_int64 value)
{
  HOST_WIDE_INT factor = value.coeffs[0];
  return value.coeffs[1] == factor && aarch64_sve_rdvl_addvl_factor_p (factor);
}

/* Likewise for rtx X.  */

bool
aarch64_sve_rdvl_immediate_p (rtx x)
{
  poly_int64 value;
  return poly_int_rtx_p (x, &value) && aarch64_sve_rdvl_immediate_p (value);
}

/* Return the asm string for moving RDVL immediate OFFSET into register
   operand 0.  */

char *
aarch64_output_sve_rdvl (rtx offset)
{
  static char buffer[sizeof ("rdvl\t%x0, #-") + 3 * sizeof (int)];
  poly_int64 offset_value = rtx_to_poly_int64 (offset);
  gcc_assert (aarch64_sve_rdvl_immediate_p (offset_value));

  int factor = offset_value.coeffs[1];
  snprintf (buffer, sizeof (buffer), "rdvl\t%%x0, #%d", factor / 16);
  return buffer;
}
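
/* For example, an offset whose poly_int value is (32, 32) -- twice the
   length of an SVE vector in bytes -- has coeffs[1] == 32 and is
   printed as "rdvl\t%x0, #2" by the routine above.  */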
/* Return true if we can add VALUE to a register using a single ADDVL
   or ADDPL instruction.  */

static bool
aarch64_sve_addvl_addpl_immediate_p (poly_int64 value)
{
  HOST_WIDE_INT factor = value.coeffs[0];
  if (factor == 0 || value.coeffs[1] != factor)
    return false;
  return (aarch64_sve_rdvl_addvl_factor_p (factor)
	  || aarch64_sve_addpl_factor_p (factor));
}

/* Likewise for rtx X.  */

bool
aarch64_sve_addvl_addpl_immediate_p (rtx x)
{
  poly_int64 value;
  return (poly_int_rtx_p (x, &value)
	  && aarch64_sve_addvl_addpl_immediate_p (value));
}

/* Return the asm string for adding ADDVL or ADDPL immediate OFFSET
   to operand 1 and storing the result in operand 0.  */

char *
aarch64_output_sve_addvl_addpl (rtx offset)
{
  static char buffer[sizeof ("addpl\t%x0, %x1, #-") + 3 * sizeof (int)];
  poly_int64 offset_value = rtx_to_poly_int64 (offset);
  gcc_assert (aarch64_sve_addvl_addpl_immediate_p (offset_value));

  int factor = offset_value.coeffs[1];
  if ((factor & 15) == 0)
    snprintf (buffer, sizeof (buffer), "addvl\t%%x0, %%x1, #%d", factor / 16);
  else
    snprintf (buffer, sizeof (buffer), "addpl\t%%x0, %%x1, #%d", factor / 2);
  return buffer;
}
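
/* For example, an offset of (-16, -16) -- minus one vector length in
   bytes -- is printed as "addvl\t%x0, %x1, #-1" above, while (6, 6)
   -- three predicate lengths -- uses "addpl\t%x0, %x1, #3".  */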
3991 /* Return true if X is a valid immediate for an SVE vector INC or DEC
3992    instruction.  If it is, store the number of elements in each vector
3993    quadword in *NELTS_PER_VQ_OUT (if nonnull) and store the multiplication
3994    factor in *FACTOR_OUT (if nonnull).  */
3997 aarch64_sve_vector_inc_dec_immediate_p (rtx x, int *factor_out,
3998                                         unsigned int *nelts_per_vq_out)
4003   if (!const_vec_duplicate_p (x, &elt)
4004       || !poly_int_rtx_p (elt, &value))
4007   unsigned int nelts_per_vq = 128 / GET_MODE_UNIT_BITSIZE (GET_MODE (x));
4008   if (nelts_per_vq != 8 && nelts_per_vq != 4 && nelts_per_vq != 2)
4009     /* There's no vector INCB.  */
4012   HOST_WIDE_INT factor = value.coeffs[0];
4013   if (value.coeffs[1] != factor)
4016   /* The coefficient must be [1, 16] * NELTS_PER_VQ.  */
4017   if ((factor % nelts_per_vq) != 0
4018       || !IN_RANGE (abs (factor), nelts_per_vq, 16 * nelts_per_vq))
4022     *factor_out = factor;
4023   if (nelts_per_vq_out)
4024     *nelts_per_vq_out = nelts_per_vq;
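/* For example, a duplicated element whose value is 16 times the number of
   quadwords, in a vector of 16-bit elements, gives NELTS_PER_VQ == 8 and
   FACTOR == 16; that combination corresponds to an "inch ..., all, mul #2"
   style increment (2 * CNTH == 16 per quadword).  */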
4028 /* Return true if X is a valid immediate for an SVE vector INC or DEC
4032 aarch64_sve_vector_inc_dec_immediate_p (rtx x)
4034   return aarch64_sve_vector_inc_dec_immediate_p (x, NULL, NULL);
4037 /* Return the asm template for an SVE vector INC or DEC instruction.
4038    OPERANDS gives the operands before the vector count and X is the
4039    value of the vector count operand itself.  */
4042 aarch64_output_sve_vector_inc_dec (const char *operands, rtx x)
4045   unsigned int nelts_per_vq;
4046   if (!aarch64_sve_vector_inc_dec_immediate_p (x, &factor, &nelts_per_vq))
4049     return aarch64_output_sve_cnt_immediate ("dec", operands, AARCH64_SV_ALL,
4050                                              -factor, nelts_per_vq);
4052     return aarch64_output_sve_cnt_immediate ("inc", operands, AARCH64_SV_ALL,
4053                                              factor, nelts_per_vq);
4056 /* Return a constant that represents FACTOR multiplied by the
4057    number of 128-bit quadwords in an SME vector.  ISA_MODE is the
4058    ISA mode in which the calculation is being performed.  */
4061 aarch64_sme_vq_immediate (machine_mode mode, HOST_WIDE_INT factor,
4062                           aarch64_isa_mode isa_mode)
4064   gcc_assert (aarch64_sve_rdvl_addvl_factor_p (factor));
4065   if (isa_mode & AARCH64_ISA_MODE_SM_ON)
4066     /* We're in streaming mode, so we can use normal poly-int values.  */
4067     return gen_int_mode ({ factor, factor }, mode);
4069   rtvec vec = gen_rtvec (1, gen_int_mode (factor, SImode));
4070   rtx unspec = gen_rtx_UNSPEC (mode, vec, UNSPEC_SME_VQ);
4071   return gen_rtx_CONST (mode, unspec);
4074 /* Return true if X is a constant that represents some number X
4075    multiplied by the number of quadwords in an SME vector.  Store this X
4076    in *FACTOR if so.  */
4079 aarch64_sme_vq_unspec_p (const_rtx x, HOST_WIDE_INT *factor)
4081   if (!TARGET_SME || GET_CODE (x) != CONST)
4085   if (GET_CODE (x) != UNSPEC
4086       || XINT (x, 1) != UNSPEC_SME_VQ
4087       || XVECLEN (x, 0) != 1)
4090   x = XVECEXP (x, 0, 0);
4091   if (!CONST_INT_P (x))
4094   *factor = INTVAL (x);
4098 /* Return true if X is a constant that represents some number Y
4099    multiplied by the number of quadwords in an SME vector, and if
4100    that Y is in the range of RDSVL.  */
4103 aarch64_rdsvl_immediate_p (const_rtx x)
4105   HOST_WIDE_INT factor;
4106   return (aarch64_sme_vq_unspec_p (x, &factor)
4107           && aarch64_sve_rdvl_addvl_factor_p (factor));
4110 /* Return the asm string for an RDSVL instruction that calculates X,
4111    which is a constant that satisfies aarch64_rdsvl_immediate_p.  */
4114 aarch64_output_rdsvl (const_rtx x)
4116   gcc_assert (aarch64_rdsvl_immediate_p (x));
4117   static char buffer[sizeof ("rdsvl\t%x0, #-") + 3 * sizeof (int)];
4118   x = XVECEXP (XEXP (x, 0), 0, 0);
4119   snprintf (buffer, sizeof (buffer), "rdsvl\t%%x0, #%d",
4120             (int) INTVAL (x) / 16);
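/* For example, a wrapped factor of 32 is printed as "rdsvl\t%x0, #2":
   RDSVL scales its immediate by the streaming vector length in bytes,
   which is 16 for every quadword of SVL.  */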
4124 /* Return true if X is a constant that can be added using ADDSVL or ADDSPL.  */
4127 aarch64_addsvl_addspl_immediate_p (const_rtx x)
4129   HOST_WIDE_INT factor;
4130   return (aarch64_sme_vq_unspec_p (x, &factor)
4131           && (aarch64_sve_rdvl_addvl_factor_p (factor)
4132               || aarch64_sve_addpl_factor_p (factor)));
4135 /* X is a constant that satisfies aarch64_addsvl_addspl_immediate_p.
4136    Return the asm string for the associated instruction.  */
4139 aarch64_output_addsvl_addspl (rtx x)
4141   static char buffer[sizeof ("addspl\t%x0, %x1, #-") + 3 * sizeof (int)];
4142   HOST_WIDE_INT factor;
4143   if (!aarch64_sme_vq_unspec_p (x, &factor))
4145   if (aarch64_sve_rdvl_addvl_factor_p (factor))
4146     snprintf (buffer, sizeof (buffer), "addsvl\t%%x0, %%x1, #%d",
4148   else if (aarch64_sve_addpl_factor_p (factor))
4149     snprintf (buffer, sizeof (buffer), "addspl\t%%x0, %%x1, #%d",
4156 /* Multipliers for repeating bitmasks of width 32, 16, 8, 4, and 2.  */
4158 static const unsigned HOST_WIDE_INT bitmask_imm_mul[] =
4160   0x0000000100000001ull,
4161   0x0001000100010001ull,
4162   0x0101010101010101ull,
4163   0x1111111111111111ull,
4164   0x5555555555555555ull,
4169 /* Return true if 64-bit VAL is a valid bitmask immediate.  */
4171 aarch64_bitmask_imm (unsigned HOST_WIDE_INT val)
4173   unsigned HOST_WIDE_INT tmp, mask, first_one, next_one;
4176   /* Check for a single sequence of one bits and return quickly if so.
4177      The special cases of all ones and all zeroes return false.  */
4178   tmp = val + (val & -val);
4180   if (tmp == (tmp & -tmp))
4181     return (val + 1) > 1;
4183   /* Invert if the immediate doesn't start with a zero bit - this means we
4184      only need to search for sequences of one bits.  */
4188   /* Find the first set bit and set tmp to val with the first sequence of one
4189      bits removed.  Return success if there is a single sequence of ones.  */
4190   first_one = val & -val;
4191   tmp = val & (val + first_one);
4196   /* Find the next set bit and compute the difference in bit position.  */
4197   next_one = tmp & -tmp;
4198   bits = clz_hwi (first_one) - clz_hwi (next_one);
4201   /* Check the bit position difference is a power of 2, and that the first
4202      sequence of one bits fits within 'bits' bits.  */
4203   if ((mask >> bits) != 0 || bits != (bits & -bits))
4206   /* Check the sequence of one bits is repeated 64/bits times.  */
4207   return val == mask * bitmask_imm_mul[__builtin_clz (bits) - 26];
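/* Worked example: VAL == 0x00ff00ff00ff00ff is accepted.  The lowest run
   of ones is 8 bits wide and the runs repeat every 16 bits, so the final
   test reduces to 0xff * bitmask_imm_mul[1] (0x0001000100010001) == VAL.
   By contrast, 0x00ff00ff00000000 fails, because the run is not repeated
   across all four 16-bit groups.  */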
4211 /* Return true if VAL is a valid bitmask immediate for MODE.  */
4213 aarch64_bitmask_imm (unsigned HOST_WIDE_INT val, machine_mode mode)
4216     return aarch64_bitmask_imm (val);
4219     return aarch64_bitmask_imm ((val & 0xffffffff) | (val << 32));
4221   /* Replicate small immediates to fit 64 bits.  */
4222   int size = GET_MODE_UNIT_PRECISION (mode);
4223   val &= (HOST_WIDE_INT_1U << size) - 1;
4224   val *= bitmask_imm_mul[__builtin_clz (size) - 26];
4226   return aarch64_bitmask_imm (val);
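/* For example, HImode VAL == 0x003c is replicated to 0x003c003c003c003c
   (a run of four ones repeating every 16 bits), which the 64-bit check
   above then accepts.  */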
4230 /* Return true if the immediate VAL can be a bitfield immediate
4231    by changing the given MASK bits in VAL to zeroes, ones or bits
4232    from the other half of VAL.  Return the new immediate in VAL2.  */
4234 aarch64_check_bitmask (unsigned HOST_WIDE_INT val,
4235                        unsigned HOST_WIDE_INT &val2,
4236                        unsigned HOST_WIDE_INT mask)
4239   if (val2 != val && aarch64_bitmask_imm (val2))
4242   if (val2 != val && aarch64_bitmask_imm (val2))
4245   val2 = val | (((val >> 32) | (val << 32)) & mask);
4246   if (val2 != val && aarch64_bitmask_imm (val2))
4248   val2 = val | (((val >> 16) | (val << 48)) & mask);
4249   if (val2 != val && aarch64_bitmask_imm (val2))
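/* For example, VAL == 0x0000ffff0000fffe with MASK == 0xffff: copying the
   masked bits from the other 32-bit half gives VAL2 == 0x0000ffff0000ffff,
   a valid bitmask immediate (16 ones repeating every 32 bits), so the
   check succeeds.  */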
4255 /* Return true if VAL is a valid MOVZ immediate.  */
4257 aarch64_is_movz (unsigned HOST_WIDE_INT val)
4259   return (val >> (ctz_hwi (val) & 48)) < 65536;
4263 /* Return true if immediate VAL can be created by a 64-bit MOVI/MOVN/MOVZ.  */
4265 aarch64_is_mov_xn_imm (unsigned HOST_WIDE_INT val)
4267   return aarch64_is_movz (val) || aarch64_is_movz (~val)
4268          || aarch64_bitmask_imm (val);
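/* For example, 0x0012000000000000 keeps all of its set bits within the
   16-bit field at bit 48: ctz_hwi gives 49, which is masked down to a
   shift of 48, and 0x12 < 65536, so the value is a MOVZ immediate
   (MOVZ Xd, #0x12, LSL #48).  */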
4272 /* Return true if VAL is an immediate that can be created by a single
4275 aarch64_move_imm (unsigned HOST_WIDE_INT val, machine_mode mode)
4277   gcc_assert (mode == SImode || mode == DImode);
4282   unsigned HOST_WIDE_INT mask =
4283     (val >> 32) == 0 || mode == SImode ? 0xffffffff : HOST_WIDE_INT_M1U;
4285   if (aarch64_is_movz (val & mask) || aarch64_is_movz (~val & mask))
4288   val = (val & mask) | ((val << 32) & ~mask);
4289   return aarch64_bitmask_imm (val);
4294 aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
4298   unsigned HOST_WIDE_INT val, val2, val3, mask;
4299   int one_match, zero_match;
4302   gcc_assert (mode == SImode || mode == DImode);
4306   if (aarch64_move_imm (val, mode))
4309         emit_insn (gen_rtx_SET (dest, imm));
4313   if ((val >> 32) == 0 || mode == SImode)
4317           emit_insn (gen_rtx_SET (dest, GEN_INT (val & 0xffff)));
4319             emit_insn (gen_insv_immsi (dest, GEN_INT (16),
4320                                        GEN_INT ((val >> 16) & 0xffff)));
4322             emit_insn (gen_insv_immdi (dest, GEN_INT (16),
4323                                        GEN_INT ((val >> 16) & 0xffff)));
4328   /* Remaining cases are all for DImode.  */
4331   zero_match = ((val & mask) == 0) + ((val & (mask << 16)) == 0) +
4332     ((val & (mask << 32)) == 0) + ((val & (mask << 48)) == 0);
4333   one_match = ((~val & mask) == 0) + ((~val & (mask << 16)) == 0) +
4334     ((~val & (mask << 32)) == 0) + ((~val & (mask << 48)) == 0);
4336   /* Try a bitmask immediate and a movk to generate the immediate
4337      in 2 instructions.  */
4339   if (zero_match < 2 && one_match < 2)
4341       for (i = 0; i < 64; i += 16)
4343         if (aarch64_check_bitmask (val, val2, mask << i))
4346           val2 = val & ~(mask << i);
4347           if ((val2 >> 32) == 0 && aarch64_move_imm (val2, DImode))
4355           emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
4356           emit_insn (gen_insv_immdi (dest, GEN_INT (i),
4357                                      GEN_INT ((val >> i) & 0xffff)));
4362       /* Try 2 bitmask immediates which are xor'd together.  */
4363       for (i = 0; i < 64; i += 16)
4365           val2 = (val >> i) & mask;
4368           if (aarch64_bitmask_imm (val2) && aarch64_bitmask_imm (val ^ val2))
4376           emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
4377           emit_insn (gen_xordi3 (dest, dest, GEN_INT (val ^ val2)));
4383   /* Try a bitmask plus 2 movk to generate the immediate in 3 instructions.  */
4384   if (zero_match + one_match == 0)
4386       for (i = 0; i < 48; i += 16)
4387         for (int j = i + 16; j < 64; j += 16)
4388           if (aarch64_check_bitmask (val, val2, (mask << i) | (mask << j)))
4392               emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
4393               emit_insn (gen_insv_immdi (dest, GEN_INT (i),
4394                                          GEN_INT ((val >> i) & 0xffff)));
4395               emit_insn (gen_insv_immdi (dest, GEN_INT (j),
4396                                          GEN_INT ((val >> j) & 0xffff)));
4401   /* Try shifting and inserting the bottom 32-bits into the top bits.  */
4402   val2 = val & 0xffffffff;
4404   val3 = val2 | (val3 << 32);
4405   for (i = 17; i < 48; i++)
4406     if ((val2 | (val2 << i)) == val)
4410           emit_insn (gen_rtx_SET (dest, GEN_INT (val2 & 0xffff)));
4411           emit_insn (gen_insv_immdi (dest, GEN_INT (16),
4412                                      GEN_INT (val2 >> 16)));
4413           emit_insn (gen_ior_ashldi3 (dest, dest, GEN_INT (i), dest));
4417     else if ((val3 & ~(val3 << i)) == val)
4421           emit_insn (gen_rtx_SET (dest, GEN_INT (val3 | 0xffff0000)));
4422           emit_insn (gen_insv_immdi (dest, GEN_INT (16),
4423                                      GEN_INT (val2 >> 16)));
4424           emit_insn (gen_and_one_cmpl_ashldi3 (dest, dest, GEN_INT (i),
4431   /* Generate 2-4 instructions, skipping 16 bits of all zeroes or ones which
4432      are emitted by the initial mov.  If one_match > zero_match, skip set bits,
4433      otherwise skip zero bits.  */
4437   val2 = one_match > zero_match ? ~val : val;
4438   i = (val2 & mask) != 0 ? 0 : (val2 & (mask << 16)) != 0 ? 16 : 32;
4441     emit_insn (gen_rtx_SET (dest, GEN_INT (one_match > zero_match
4442                                            ? (val | ~(mask << i))
4443                                            : (val & (mask << i)))));
4444   for (i += 16; i < 64; i += 16)
4446       if ((val2 & (mask << i)) == 0)
4449       emit_insn (gen_insv_immdi (dest, GEN_INT (i),
4450                                  GEN_INT ((val >> i) & 0xffff)));
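/* For example, 0x5555555555555554 is not itself a bitmask immediate, but
   aarch64_check_bitmask finds VAL2 == 0x5555555555555555 by copying bits
   from the rotated value, so the constant is built in two instructions:
   a bitmask move of VAL2 followed by MOVK #0x5554 into bits [15:0].  */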
4457 /* Return whether imm is a 128-bit immediate which is simple enough to
4460 aarch64_mov128_immediate (rtx imm)
4462   if (CONST_INT_P (imm))
4465   gcc_assert (CONST_WIDE_INT_NUNITS (imm) == 2);
4467   rtx lo = GEN_INT (CONST_WIDE_INT_ELT (imm, 0));
4468   rtx hi = GEN_INT (CONST_WIDE_INT_ELT (imm, 1));
4470   return aarch64_internal_mov_immediate (NULL_RTX, lo, false, DImode)
4471          + aarch64_internal_mov_immediate (NULL_RTX, hi, false, DImode) <= 4;
4475 /* Return true if val can be encoded as a 12-bit unsigned immediate with
4476    a left shift of 0 or 12 bits.  */
4478 aarch64_uimm12_shift (unsigned HOST_WIDE_INT val)
4480   return val < 4096 || (val & 0xfff000) == val;
4483 /* Returns the nearest value to VAL that will fit as a 12-bit unsigned immediate
4484    that can be created with a left shift of 0 or 12.  */
4485 static HOST_WIDE_INT
4486 aarch64_clamp_to_uimm12_shift (unsigned HOST_WIDE_INT val)
4488   /* Check to see if the value fits in 24 bits, as that is the maximum we can
4489      handle correctly.  */
4490   gcc_assert (val < 0x1000000);
4495   return val & 0xfff000;
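/* For example, 0x123456 is not directly encodable and is clamped to
   0x123000, which can be encoded as #0x123, LSL #12.  */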
4501      X = (X & AND_VAL) | IOR_VAL;
4503    can be implemented using:
4505      MOVK X, #(IOR_VAL >> shift), LSL #shift
4507    Return the shift if so, otherwise return -1.  */
4509 aarch64_movk_shift (const wide_int_ref &and_val,
4510                     const wide_int_ref &ior_val)
4512   unsigned int precision = and_val.get_precision ();
4513   unsigned HOST_WIDE_INT mask = 0xffff;
4514   for (unsigned int shift = 0; shift < precision; shift += 16)
4516       if (and_val == ~mask && (ior_val & mask) == ior_val)
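/* For example, AND_VAL == 0xffffffff0000ffff (i.e. ~(0xffff << 16)) and
   IOR_VAL == 0x12340000 give a shift of 16: the update is then a single
   MOVK X, #0x1234, LSL #16.  */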
4523 /* Create mask of ones, covering the lowest to highest bits set in VAL_IN.
4524    Assumed precondition: VAL_IN is not zero.  */
4526 unsigned HOST_WIDE_INT
4527 aarch64_and_split_imm1 (HOST_WIDE_INT val_in)
4529   int lowest_bit_set = ctz_hwi (val_in);
4530   int highest_bit_set = floor_log2 (val_in);
4531   gcc_assert (val_in != 0);
4533   return ((HOST_WIDE_INT_UC (2) << highest_bit_set) -
4534           (HOST_WIDE_INT_1U << lowest_bit_set));
4537 /* Create constant where bits outside of lowest bit set to highest bit set
4540 unsigned HOST_WIDE_INT
4541 aarch64_and_split_imm2 (HOST_WIDE_INT val_in)
4543   return val_in | ~aarch64_and_split_imm1 (val_in);
4546 /* Return true if VAL_IN is a valid 'and' bitmask immediate.  */
4549 aarch64_and_bitmask_imm (unsigned HOST_WIDE_INT val_in, machine_mode mode)
4551   scalar_int_mode int_mode;
4552   if (!is_a <scalar_int_mode> (mode, &int_mode))
4555   if (aarch64_bitmask_imm (val_in, int_mode))
4558   if (aarch64_move_imm (val_in, int_mode))
4561   unsigned HOST_WIDE_INT imm2 = aarch64_and_split_imm2 (val_in);
4563   return aarch64_bitmask_imm (imm2, int_mode);
4566 /* Return the number of temporary registers that aarch64_add_offset_1
4567    would need to add OFFSET to a register.  */
4570 aarch64_add_offset_1_temporaries (HOST_WIDE_INT offset)
4572   return absu_hwi (offset) < 0x1000000 ? 0 : 1;
4575 /* A subroutine of aarch64_add_offset.  Set DEST to SRC + OFFSET for
4576    a non-polynomial OFFSET.  MODE is the mode of the addition.
4577    FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
4578    be set and CFA adjustments added to the generated instructions.
4580    TEMP1, if nonnull, is a register of mode MODE that can be used as a
4581    temporary if register allocation is already complete.  This temporary
4582    register may overlap DEST but must not overlap SRC.  If TEMP1 is known
4583    to hold abs (OFFSET), EMIT_MOVE_IMM can be set to false to avoid emitting
4584    the immediate again.
4586    Since this function may be used to adjust the stack pointer, we must
4587    ensure that it cannot cause transient stack deallocation (for example
4588    by first incrementing SP and then decrementing when adjusting by a
4589    large immediate).  */
4592 aarch64_add_offset_1 (scalar_int_mode mode, rtx dest,
4593                       rtx src, HOST_WIDE_INT offset, rtx temp1,
4594                       bool frame_related_p, bool emit_move_imm)
4596   gcc_assert (emit_move_imm || temp1 != NULL_RTX);
4597   gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
4599   unsigned HOST_WIDE_INT moffset = absu_hwi (offset);
4604       if (!rtx_equal_p (dest, src))
4606           insn = emit_insn (gen_rtx_SET (dest, src));
4607           RTX_FRAME_RELATED_P (insn) = frame_related_p;
4612   /* Single instruction adjustment.  */
4613   if (aarch64_uimm12_shift (moffset))
4615       insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (offset)));
4616       RTX_FRAME_RELATED_P (insn) = frame_related_p;
4620   /* Emit 2 additions/subtractions if the adjustment is less than 24 bits
4623      a) the offset cannot be loaded by a 16-bit move or
4624      b) there is no spare register into which we can move it.  */
4625   if (moffset < 0x1000000
4626       && ((!temp1 && !can_create_pseudo_p ())
4627           || !aarch64_move_imm (moffset, mode)))
4629       HOST_WIDE_INT low_off = moffset & 0xfff;
4631       low_off = offset < 0 ? -low_off : low_off;
4632       insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (low_off)));
4633       RTX_FRAME_RELATED_P (insn) = frame_related_p;
4634       insn = emit_insn (gen_add2_insn (dest, GEN_INT (offset - low_off)));
4635       RTX_FRAME_RELATED_P (insn) = frame_related_p;
4639   /* Emit a move immediate if required and an addition/subtraction.  */
4642       gcc_assert (temp1 != NULL_RTX || can_create_pseudo_p ());
4643       temp1 = aarch64_force_temporary (mode, temp1,
4644                                        gen_int_mode (moffset, mode));
4646   insn = emit_insn (offset < 0
4647                     ? gen_sub3_insn (dest, src, temp1)
4648                     : gen_add3_insn (dest, src, temp1));
4649   if (frame_related_p)
4651       RTX_FRAME_RELATED_P (insn) = frame_related_p;
4652       rtx adj = plus_constant (mode, src, offset);
4653       add_reg_note (insn, REG_CFA_ADJUST_CFA, gen_rtx_SET (dest, adj));
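/* For example, OFFSET == 0x123456 cannot be loaded by a single move
   immediate, so it is added in two steps that each use a valid 12-bit
   immediate: ADD DEST, SRC, #0x456 followed by ADD DEST, DEST, #0x123,
   LSL #12.  Both steps move in the same direction, so the adjustment
   never transiently deallocates stack.  */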
4657 /* Return the number of temporary registers that aarch64_add_offset
4658    would need to move OFFSET into a register or add OFFSET to a register;
4659    ADD_P is true if we want the latter rather than the former.  */
4662 aarch64_offset_temporaries (bool add_p, poly_int64 offset)
4664   /* This follows the same structure as aarch64_add_offset.  */
4665   if (add_p && aarch64_sve_addvl_addpl_immediate_p (offset))
4668   unsigned int count = 0;
4669   HOST_WIDE_INT factor = offset.coeffs[1];
4670   HOST_WIDE_INT constant = offset.coeffs[0] - factor;
4671   poly_int64 poly_offset (factor, factor);
4672   if (add_p && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
4673     /* Need one register for the ADDVL/ADDPL result.  */
4675   else if (factor != 0)
4677       factor /= (HOST_WIDE_INT) least_bit_hwi (factor);
4678       if (!IN_RANGE (factor, -32, 31))
4679         /* Need one register for the CNT or RDVL result and one for the
4680            multiplication factor.  If necessary, the second temporary
4681            can be reused for the constant part of the offset.  */
4683       /* Need one register for the CNT or RDVL result (which might then
4687   return count + aarch64_add_offset_1_temporaries (constant);
4690 /* If X can be represented as a poly_int64, return the number
4691    of temporaries that are required to add it to a register.
4692    Return -1 otherwise.  */
4695 aarch64_add_offset_temporaries (rtx x)
4698   if (!poly_int_rtx_p (x, &offset))
4700   return aarch64_offset_temporaries (true, offset);
4703 /* Set DEST to SRC + OFFSET. MODE is the mode of the addition.
4704 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
4705 be set and CFA adjustments added to the generated instructions.
4707 TEMP1, if nonnull, is a register of mode MODE that can be used as a
4708 temporary if register allocation is already complete. This temporary
4709 register may overlap DEST if !FRAME_RELATED_P but must not overlap SRC.
4710 If TEMP1 is known to hold abs (OFFSET), EMIT_MOVE_IMM can be set to
4711 false to avoid emitting the immediate again.
4713 TEMP2, if nonnull, is a second temporary register that doesn't
4714 overlap either DEST or REG.
4716 FORCE_ISA_MODE is AARCH64_ISA_MODE_SM_ON if any variable component of OFFSET
4717 is measured relative to the SME vector length instead of the current
4718 prevailing vector length. It is 0 otherwise.
4720 Since this function may be used to adjust the stack pointer, we must
4721 ensure that it cannot cause transient stack deallocation (for example
4722 by first incrementing SP and then decrementing when adjusting by a
4723 large immediate). */
4726 aarch64_add_offset (scalar_int_mode mode
, rtx dest
, rtx src
,
4727 poly_int64 offset
, rtx temp1
, rtx temp2
,
4728 aarch64_isa_mode force_isa_mode
,
4729 bool frame_related_p
, bool emit_move_imm
= true)
4731 gcc_assert (emit_move_imm
|| temp1
!= NULL_RTX
);
4732 gcc_assert (temp1
== NULL_RTX
|| !reg_overlap_mentioned_p (temp1
, src
));
4733 gcc_assert (temp1
== NULL_RTX
4735 || !reg_overlap_mentioned_p (temp1
, dest
));
4736 gcc_assert (temp2
== NULL_RTX
|| !reg_overlap_mentioned_p (dest
, temp2
));
4738 /* Try using ADDVL or ADDPL to add the whole value. */
4739 if (src
!= const0_rtx
&& aarch64_sve_addvl_addpl_immediate_p (offset
))
4741 gcc_assert (offset
.coeffs
[0] == offset
.coeffs
[1]);
4743 if (force_isa_mode
== 0)
4744 offset_rtx
= gen_int_mode (offset
, mode
);
4746 offset_rtx
= aarch64_sme_vq_immediate (mode
, offset
.coeffs
[0], 0);
4747 rtx_insn
*insn
= emit_insn (gen_add3_insn (dest
, src
, offset_rtx
));
4748 RTX_FRAME_RELATED_P (insn
) = frame_related_p
;
4749 if (frame_related_p
&& (force_isa_mode
& AARCH64_ISA_MODE_SM_ON
))
4750 add_reg_note (insn
, REG_CFA_ADJUST_CFA
,
4751 gen_rtx_SET (dest
, plus_constant (Pmode
, src
,
4756 /* Coefficient 1 is multiplied by the number of 128-bit blocks in an
4757 SVE vector register, over and above the minimum size of 128 bits.
4758 This is equivalent to half the value returned by CNTD with a
4759 vector shape of ALL. */
4760 HOST_WIDE_INT factor
= offset
.coeffs
[1];
4761 HOST_WIDE_INT constant
= offset
.coeffs
[0] - factor
;
4763 /* Try using ADDVL or ADDPL to add the VG-based part. */
4764 poly_int64
poly_offset (factor
, factor
);
4765 if (src
!= const0_rtx
4766 && aarch64_sve_addvl_addpl_immediate_p (poly_offset
))
4769 if (force_isa_mode
== 0)
4770 offset_rtx
= gen_int_mode (poly_offset
, mode
);
4772 offset_rtx
= aarch64_sme_vq_immediate (mode
, factor
, 0);
4773 if (frame_related_p
)
4775 rtx_insn
*insn
= emit_insn (gen_add3_insn (dest
, src
, offset_rtx
));
4776 RTX_FRAME_RELATED_P (insn
) = true;
4777 if (force_isa_mode
& AARCH64_ISA_MODE_SM_ON
)
4778 add_reg_note (insn
, REG_CFA_ADJUST_CFA
,
4779 gen_rtx_SET (dest
, plus_constant (Pmode
, src
,
4785 rtx addr
= gen_rtx_PLUS (mode
, src
, offset_rtx
);
4786 src
= aarch64_force_temporary (mode
, temp1
, addr
);
4791 /* Otherwise use a CNT-based sequence. */
4792 else if (factor
!= 0)
4794 /* Calculate CNTB * FACTOR / 16 as CNTB * REL_FACTOR * 2**SHIFT,
4795 with negative shifts indicating a shift right. */
4796 HOST_WIDE_INT low_bit
= least_bit_hwi (factor
);
4797 HOST_WIDE_INT rel_factor
= factor
/ low_bit
;
4798 int shift
= exact_log2 (low_bit
) - 4;
4799 gcc_assert (shift
>= -4 && (rel_factor
& 1) != 0);
4801 /* Set CODE, VAL and SHIFT so that [+-] VAL * 2**SHIFT is
4802 equal to CNTB * FACTOR / 16, with CODE being the [+-].
4804 We can avoid a multiplication if REL_FACTOR is in the range
4805 of RDVL, although there are then various optimizations that
4806 we can try on top. */
4807 rtx_code code
= PLUS
;
4809 if (IN_RANGE (rel_factor
, -32, 31))
4811 if (force_isa_mode
& AARCH64_ISA_MODE_SM_ON
)
4813 /* Try to use an unshifted RDSVL, otherwise fall back on
4814 a shifted RDSVL #1. */
4815 if (aarch64_sve_rdvl_addvl_factor_p (factor
))
4818 factor
= rel_factor
* 16;
4819 val
= aarch64_sme_vq_immediate (mode
, factor
, 0);
4821 /* Try to use an unshifted CNT[BHWD] or RDVL. */
4822 else if (aarch64_sve_cnt_factor_p (factor
)
4823 || aarch64_sve_rdvl_addvl_factor_p (factor
))
4825 val
= gen_int_mode (poly_int64 (factor
, factor
), mode
);
4828 /* Try to subtract an unshifted CNT[BHWD]. */
4829 else if (aarch64_sve_cnt_factor_p (-factor
))
4832 val
= gen_int_mode (poly_int64 (-factor
, -factor
), mode
);
4835 /* If subtraction is free, prefer to load a positive constant.
4836 In the best case this will fit a shifted CNTB. */
4837 else if (src
!= const0_rtx
&& rel_factor
< 0)
4840 val
= gen_int_mode (-rel_factor
* BYTES_PER_SVE_VECTOR
, mode
);
4842 /* Otherwise use a shifted RDVL or CNT[BHWD]. */
4844 val
= gen_int_mode (rel_factor
* BYTES_PER_SVE_VECTOR
, mode
);
4848 /* If we can calculate CNTB << SHIFT directly, prefer to do that,
4849 since it should increase the chances of being able to use
4850 a shift and add sequence for the multiplication.
4851 If CNTB << SHIFT is out of range, stick with the current
4853 if (force_isa_mode
== 0
4854 && IN_RANGE (low_bit
, 2, 16 * 16))
4856 val
= gen_int_mode (poly_int64 (low_bit
, low_bit
), mode
);
4859 else if ((force_isa_mode
& AARCH64_ISA_MODE_SM_ON
)
4860 && aarch64_sve_rdvl_addvl_factor_p (low_bit
))
4862 val
= aarch64_sme_vq_immediate (mode
, low_bit
, 0);
4866 val
= gen_int_mode (BYTES_PER_SVE_VECTOR
, mode
);
4868 val
= aarch64_force_temporary (mode
, temp1
, val
);
4870 /* Prefer to multiply by a positive factor and subtract rather
4871 than multiply by a negative factor and add, since positive
4872 values are usually easier to move. */
4873 if (rel_factor
< 0 && src
!= const0_rtx
)
4875 rel_factor
= -rel_factor
;
4879 if (can_create_pseudo_p ())
4881 rtx coeff1
= gen_int_mode (rel_factor
, mode
);
4882 val
= expand_mult (mode
, val
, coeff1
, NULL_RTX
, true, true);
4886 rtx coeff1
= gen_int_mode (rel_factor
, mode
);
4887 coeff1
= aarch64_force_temporary (mode
, temp2
, coeff1
);
4888 val
= gen_rtx_MULT (mode
, val
, coeff1
);
4892 /* Multiply by 2 ** SHIFT. */
4895 val
= aarch64_force_temporary (mode
, temp1
, val
);
4896 val
= gen_rtx_ASHIFT (mode
, val
, GEN_INT (shift
));
4900 val
= aarch64_force_temporary (mode
, temp1
, val
);
4901 val
= gen_rtx_ASHIFTRT (mode
, val
, GEN_INT (-shift
));
4904 /* Add the result to SRC or subtract the result from SRC. */
4905 if (src
!= const0_rtx
)
4907 val
= aarch64_force_temporary (mode
, temp1
, val
);
4908 val
= gen_rtx_fmt_ee (code
, mode
, src
, val
);
4910 else if (code
== MINUS
)
4912 val
= aarch64_force_temporary (mode
, temp1
, val
);
4913 val
= gen_rtx_NEG (mode
, val
);
4916 if (constant
== 0 || frame_related_p
)
4918 rtx_insn
*insn
= emit_insn (gen_rtx_SET (dest
, val
));
4919 if (frame_related_p
)
4921 RTX_FRAME_RELATED_P (insn
) = true;
4922 add_reg_note (insn
, REG_CFA_ADJUST_CFA
,
4923 gen_rtx_SET (dest
, plus_constant (Pmode
, src
,
4932 src
= aarch64_force_temporary (mode
, temp1
, val
);
4937 emit_move_imm
= true;
4940 aarch64_add_offset_1 (mode
, dest
, src
, constant
, temp1
,
4941 frame_related_p
, emit_move_imm
);
4944 /* Like aarch64_add_offset, but the offset is given as an rtx rather
4945 than a poly_int64. */
4948 aarch64_split_add_offset (scalar_int_mode mode
, rtx dest
, rtx src
,
4949 rtx offset_rtx
, rtx temp1
, rtx temp2
)
4951 aarch64_add_offset (mode
, dest
, src
, rtx_to_poly_int64 (offset_rtx
),
4952 temp1
, temp2
, 0, false);
4955 /* Add DELTA to the stack pointer, marking the instructions frame-related.
4956 TEMP1 is available as a temporary if nonnull. FORCE_ISA_MODE is as
4957 for aarch64_add_offset. EMIT_MOVE_IMM is false if TEMP1 already
4958 contains abs (DELTA). */
4961 aarch64_add_sp (rtx temp1
, rtx temp2
, poly_int64 delta
,
4962 aarch64_isa_mode force_isa_mode
, bool emit_move_imm
)
4964 aarch64_add_offset (Pmode
, stack_pointer_rtx
, stack_pointer_rtx
, delta
,
4965 temp1
, temp2
, force_isa_mode
, true, emit_move_imm
);
4968 /* Subtract DELTA from the stack pointer, marking the instructions
4969 frame-related if FRAME_RELATED_P. FORCE_ISA_MODE is as for
4970 aarch64_add_offset. TEMP1 is available as a temporary if nonnull. */
4973 aarch64_sub_sp (rtx temp1
, rtx temp2
, poly_int64 delta
,
4974 aarch64_isa_mode force_isa_mode
,
4975 bool frame_related_p
, bool emit_move_imm
= true)
4977 aarch64_add_offset (Pmode
, stack_pointer_rtx
, stack_pointer_rtx
, -delta
,
4978 temp1
, temp2
, force_isa_mode
, frame_related_p
,
4982 /* A streaming-compatible function needs to switch temporarily to the known
4983 PSTATE.SM mode described by LOCAL_MODE. The low bit of OLD_SVCR contains
4984 the runtime state of PSTATE.SM in the streaming-compatible code, before
4985 the start of the switch to LOCAL_MODE.
4987 Emit instructions to branch around the mode switch if PSTATE.SM already
4988 matches LOCAL_MODE. Return the label that the branch jumps to. */
4991 aarch64_guard_switch_pstate_sm (rtx old_svcr
, aarch64_isa_mode local_mode
)
4993 local_mode
&= AARCH64_ISA_MODE_SM_STATE
;
4994 gcc_assert (local_mode
!= 0);
4995 auto already_ok_cond
= (local_mode
& AARCH64_ISA_MODE_SM_ON
? NE
: EQ
);
4996 auto *label
= gen_label_rtx ();
4997 auto branch
= aarch64_gen_test_and_branch (already_ok_cond
, old_svcr
, 0,
4999 auto *jump
= emit_jump_insn (branch
);
5000 JUMP_LABEL (jump
) = label
;
5004 /* Emit code to switch from the PSTATE.SM state in OLD_MODE to the PSTATE.SM
5005 state in NEW_MODE. This is known to involve either an SMSTART SM or
5009 aarch64_switch_pstate_sm (aarch64_isa_mode old_mode
, aarch64_isa_mode new_mode
)
5011 old_mode
&= AARCH64_ISA_MODE_SM_STATE
;
5012 new_mode
&= AARCH64_ISA_MODE_SM_STATE
;
5013 gcc_assert (old_mode
!= new_mode
);
5015 if ((new_mode
& AARCH64_ISA_MODE_SM_ON
)
5016 || (!new_mode
&& (old_mode
& AARCH64_ISA_MODE_SM_OFF
)))
5017 emit_insn (gen_aarch64_smstart_sm ());
5019 emit_insn (gen_aarch64_smstop_sm ());
5022 /* As a side-effect, SMSTART SM and SMSTOP SM clobber the contents of all
5023 FP and predicate registers. This class emits code to preserve any
5024 necessary registers around the mode switch.
5026 The class uses four approaches to saving and restoring contents, enumerated
5029 - GPR: save and restore the contents of FP registers using GPRs.
5030 This is used if the FP register contains no more than 64 significant
5031 bits. The registers used are FIRST_GPR onwards.
5033 - MEM_128: save and restore 128-bit SIMD registers using memory.
5035 - MEM_SVE_PRED: save and restore full SVE predicate registers using memory.
5037 - MEM_SVE_DATA: save and restore full SVE vector registers using memory.
5039 The save slots within each memory group are consecutive, with the
5040 MEM_SVE_PRED slots occupying a region below the MEM_SVE_DATA slots.
5042 There will only be two mode switches for each use of SME, so they should
5043 not be particularly performance-sensitive. It's also rare for SIMD, SVE
5044 or predicate registers to be live across mode switches. We therefore
5045 don't preallocate the save slots but instead allocate them locally on
5046 demand. This makes the code emitted by the class self-contained. */
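/* For instance, a 64-bit return value that is live in V0 across the switch
   can be kept in X10 and moved back afterwards (the GPR group), a full
   128-bit value in V1 gets a MEM_128 stack slot, and a live predicate in
   P0 gets a MEM_SVE_PRED slot below the MEM_SVE_DATA area.  */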
5048 class aarch64_sme_mode_switch_regs
5051 static const unsigned int FIRST_GPR
= R10_REGNUM
;
5053 void add_reg (machine_mode
, unsigned int);
5054 void add_call_args (rtx_call_insn
*);
5055 void add_call_result (rtx_call_insn
*);
5056 void add_call_preserved_reg (unsigned int);
5057 void add_call_preserved_regs (bitmap
);
5059 void emit_prologue ();
5060 void emit_epilogue ();
5062 /* The number of GPRs needed to save FP registers, starting from
5064 unsigned int num_gprs () { return m_group_count
[GPR
]; }
5067 enum sequence
{ PROLOGUE
, EPILOGUE
};
5068 enum group_type
{ GPR
, MEM_128
, MEM_SVE_PRED
, MEM_SVE_DATA
, NUM_GROUPS
};
5070 /* Information about the save location for one FP, SIMD, SVE data, or
5071 SVE predicate register. */
5072 struct save_location
{
5073 /* The register to be saved. */
5076 /* Which group the save location belongs to. */
5079 /* A zero-based index of the register within the group. */
5083 unsigned int sve_data_headroom ();
5084 rtx
get_slot_mem (machine_mode
, poly_int64
);
5085 void emit_stack_adjust (sequence
, poly_int64
);
5086 void emit_mem_move (sequence
, const save_location
&, poly_int64
);
5088 void emit_gpr_moves (sequence
);
5089 void emit_mem_128_moves (sequence
);
5090 void emit_sve_sp_adjust (sequence
);
5091 void emit_sve_pred_moves (sequence
);
5092 void emit_sve_data_moves (sequence
);
5094 /* All save locations, in no particular order. */
5095 auto_vec
<save_location
, 12> m_save_locations
;
5097 /* The number of registers in each group. */
5098 unsigned int m_group_count
[NUM_GROUPS
] = {};
5101 /* Record that (reg:MODE REGNO) needs to be preserved around the mode
5105 aarch64_sme_mode_switch_regs::add_reg (machine_mode mode
, unsigned int regno
)
5107 if (!FP_REGNUM_P (regno
) && !PR_REGNUM_P (regno
))
5110 unsigned int end_regno
= end_hard_regno (mode
, regno
);
5111 unsigned int vec_flags
= aarch64_classify_vector_mode (mode
);
5112 gcc_assert ((vec_flags
& VEC_STRUCT
) || end_regno
== regno
+ 1);
5113 for (; regno
< end_regno
; regno
++)
5115 /* Force the mode of SVE saves and restores even for single registers.
5116 This is necessary because big-endian targets only allow LDR Z and
5117 STR Z to be used with byte modes. */
5118 machine_mode submode
= mode
;
5119 if (vec_flags
& VEC_SVE_PRED
)
5120 submode
= VNx16BImode
;
5121 else if (vec_flags
& VEC_SVE_DATA
)
5122 submode
= SVE_BYTE_MODE
;
5123 else if (vec_flags
& VEC_STRUCT
)
5125 if (vec_flags
& VEC_PARTIAL
)
5128 submode
= V16QImode
;
5131 loc
.reg
= gen_rtx_REG (submode
, regno
);
5132 if (vec_flags
& VEC_SVE_PRED
)
5134 gcc_assert (PR_REGNUM_P (regno
));
5135 loc
.group
= MEM_SVE_PRED
;
5139 gcc_assert (FP_REGNUM_P (regno
));
5140 if (known_le (GET_MODE_SIZE (submode
), 8))
5142 else if (known_eq (GET_MODE_SIZE (submode
), 16))
5143 loc
.group
= MEM_128
;
5145 loc
.group
= MEM_SVE_DATA
;
5147 loc
.index
= m_group_count
[loc
.group
]++;
5148 m_save_locations
.quick_push (loc
);
5152 /* Record that the arguments to CALL_INSN need to be preserved around
5156 aarch64_sme_mode_switch_regs::add_call_args (rtx_call_insn
*call_insn
)
5158 for (rtx node
= CALL_INSN_FUNCTION_USAGE (call_insn
);
5159 node
; node
= XEXP (node
, 1))
5161 rtx item
= XEXP (node
, 0);
5162 if (GET_CODE (item
) != USE
)
5164 item
= XEXP (item
, 0);
5167 add_reg (GET_MODE (item
), REGNO (item
));
5171 /* Record that the return value from CALL_INSN (if any) needs to be
5172 preserved around the mode switch. */
5175 aarch64_sme_mode_switch_regs::add_call_result (rtx_call_insn
*call_insn
)
5177 rtx pat
= PATTERN (call_insn
);
5178 gcc_assert (GET_CODE (pat
) == PARALLEL
);
5179 pat
= XVECEXP (pat
, 0, 0);
5180 if (GET_CODE (pat
) == CALL
)
5182 rtx dest
= SET_DEST (pat
);
5183 if (GET_CODE (dest
) == PARALLEL
)
5184 for (int i
= 0; i
< XVECLEN (dest
, 0); ++i
)
5186 rtx x
= XVECEXP (dest
, 0, i
);
5187 gcc_assert (GET_CODE (x
) == EXPR_LIST
);
5188 rtx reg
= XEXP (x
, 0);
5189 add_reg (GET_MODE (reg
), REGNO (reg
));
5192 add_reg (GET_MODE (dest
), REGNO (dest
));
5195 /* REGNO is a register that is call-preserved under the current function's ABI.
5196 Record that it must be preserved around the mode switch. */
5199 aarch64_sme_mode_switch_regs::add_call_preserved_reg (unsigned int regno
)
5201 if (FP_REGNUM_P (regno
))
5202 switch (crtl
->abi
->id ())
5205 add_reg (VNx16QImode
, regno
);
5208 add_reg (V16QImode
, regno
);
5210 case ARM_PCS_AAPCS64
:
5211 add_reg (DImode
, regno
);
5216 else if (PR_REGNUM_P (regno
))
5217 add_reg (VNx16BImode
, regno
);
5220 /* The hard registers in REGS are call-preserved under the current function's
5221 ABI. Record that they must be preserved around the mode switch. */
5224 aarch64_sme_mode_switch_regs::add_call_preserved_regs (bitmap regs
)
5228 EXECUTE_IF_SET_IN_BITMAP (regs
, 0, regno
, bi
)
5229 if (HARD_REGISTER_NUM_P (regno
))
5230 add_call_preserved_reg (regno
);
5235 /* Emit code to save registers before the mode switch. */
5238 aarch64_sme_mode_switch_regs::emit_prologue ()
5240 emit_sve_sp_adjust (PROLOGUE
);
5241 emit_sve_pred_moves (PROLOGUE
);
5242 emit_sve_data_moves (PROLOGUE
);
5243 emit_mem_128_moves (PROLOGUE
);
5244 emit_gpr_moves (PROLOGUE
);
5247 /* Emit code to restore registers after the mode switch. */
5250 aarch64_sme_mode_switch_regs::emit_epilogue ()
5252 emit_gpr_moves (EPILOGUE
);
5253 emit_mem_128_moves (EPILOGUE
);
5254 emit_sve_pred_moves (EPILOGUE
);
5255 emit_sve_data_moves (EPILOGUE
);
5256 emit_sve_sp_adjust (EPILOGUE
);
5259 /* The SVE predicate registers are stored below the SVE data registers,
5260 with the predicate save area being padded to a data-register-sized
5261 boundary. Return the size of this padded area as a whole number
5262 of data register slots. */
5265 aarch64_sme_mode_switch_regs::sve_data_headroom ()
5267 return CEIL (m_group_count
[MEM_SVE_PRED
], 8);
5270 /* Return a memory reference of mode MODE to OFFSET bytes from the
5274 aarch64_sme_mode_switch_regs::get_slot_mem (machine_mode mode
,
5277 rtx addr
= plus_constant (Pmode
, stack_pointer_rtx
, offset
);
5278 return gen_rtx_MEM (mode
, addr
);
5281 /* Allocate or deallocate SIZE bytes of stack space: SEQ decides which. */
5284 aarch64_sme_mode_switch_regs::emit_stack_adjust (sequence seq
,
5287 if (seq
== PROLOGUE
)
5289 emit_insn (gen_rtx_SET (stack_pointer_rtx
,
5290 plus_constant (Pmode
, stack_pointer_rtx
, size
)));
5293 /* Save or restore the register in LOC, whose slot is OFFSET bytes from
5294 the stack pointer. SEQ chooses between saving and restoring. */
5297 aarch64_sme_mode_switch_regs::emit_mem_move (sequence seq
,
5298 const save_location
&loc
,
5301 rtx mem
= get_slot_mem (GET_MODE (loc
.reg
), offset
);
5302 if (seq
== PROLOGUE
)
5303 emit_move_insn (mem
, loc
.reg
);
5305 emit_move_insn (loc
.reg
, mem
);
5308 /* Emit instructions to save or restore the GPR group. SEQ chooses between
5309 saving and restoring. */
5312 aarch64_sme_mode_switch_regs::emit_gpr_moves (sequence seq
)
5314 for (auto &loc
: m_save_locations
)
5315 if (loc
.group
== GPR
)
5317 gcc_assert (loc
.index
< 8);
5318 rtx gpr
= gen_rtx_REG (GET_MODE (loc
.reg
), FIRST_GPR
+ loc
.index
);
5319 if (seq
== PROLOGUE
)
5320 emit_move_insn (gpr
, loc
.reg
);
5322 emit_move_insn (loc
.reg
, gpr
);
5326 /* Emit instructions to save or restore the MEM_128 group. SEQ chooses
5327 between saving and restoring. */
5330 aarch64_sme_mode_switch_regs::emit_mem_128_moves (sequence seq
)
5332 HOST_WIDE_INT count
= m_group_count
[MEM_128
];
5336 auto sp
= stack_pointer_rtx
;
5337 auto sp_adjust
= (seq
== PROLOGUE
? -count
: count
) * 16;
5339 /* Pick a common mode that supports LDR & STR with pre/post-modification
5340 and LDP & STP with pre/post-modification. */
5343 /* An instruction pattern that should be emitted at the end. */
5344 rtx last_pat
= NULL_RTX
;
5346 /* A previous MEM_128 location that hasn't been handled yet. */
5347 save_location
*prev_loc
= nullptr;
5349 /* Look for LDP/STPs and record any leftover LDR/STR in PREV_LOC. */
5350 for (auto &loc
: m_save_locations
)
5351 if (loc
.group
== MEM_128
)
5358 gcc_assert (loc
.index
== prev_loc
->index
+ 1);
5360 /* The offset of the base of the save area from the current
5362 HOST_WIDE_INT bias
= 0;
5363 if (prev_loc
->index
== 0 && seq
== PROLOGUE
)
5366 /* Get the two sets in the LDP/STP. */
5368 gen_rtx_REG (mode
, REGNO (prev_loc
->reg
)),
5369 get_slot_mem (mode
, prev_loc
->index
* 16 + bias
),
5370 gen_rtx_REG (mode
, REGNO (loc
.reg
)),
5371 get_slot_mem (mode
, loc
.index
* 16 + bias
)
5373 unsigned int lhs
= (seq
== PROLOGUE
);
5374 rtx set1
= gen_rtx_SET (ops
[lhs
], ops
[1 - lhs
]);
5375 rtx set2
= gen_rtx_SET (ops
[lhs
+ 2], ops
[3 - lhs
]);
5377 /* Combine the sets with any stack allocation/deallocation. */
5379 if (prev_loc
->index
== 0)
5381 rtx plus_sp
= plus_constant (Pmode
, sp
, sp_adjust
);
5382 rtvec vec
= gen_rtvec (3, gen_rtx_SET (sp
, plus_sp
), set1
, set2
);
5383 pat
= gen_rtx_PARALLEL (VOIDmode
, vec
);
5385 else if (seq
== PROLOGUE
)
5386 pat
= aarch64_gen_store_pair (ops
[1], ops
[0], ops
[2]);
5388 pat
= aarch64_gen_load_pair (ops
[0], ops
[2], ops
[1]);
5390 /* Queue a deallocation to the end, otherwise emit the
5392 if (seq
== EPILOGUE
&& prev_loc
->index
== 0)
5399 /* Handle any leftover LDR/STR. */
5402 rtx reg
= gen_rtx_REG (mode
, REGNO (prev_loc
->reg
));
5404 if (prev_loc
->index
!= 0)
5405 addr
= plus_constant (Pmode
, sp
, prev_loc
->index
* 16);
5406 else if (seq
== PROLOGUE
)
5408 rtx allocate
= plus_constant (Pmode
, sp
, -count
* 16);
5409 addr
= gen_rtx_PRE_MODIFY (Pmode
, sp
, allocate
);
5413 rtx deallocate
= plus_constant (Pmode
, sp
, count
* 16);
5414 addr
= gen_rtx_POST_MODIFY (Pmode
, sp
, deallocate
);
5416 rtx mem
= gen_rtx_MEM (mode
, addr
);
5417 if (seq
== PROLOGUE
)
5418 emit_move_insn (mem
, reg
);
5420 emit_move_insn (reg
, mem
);
5424 emit_insn (last_pat
);
5427 /* Allocate or deallocate the stack space needed by the SVE groups.
5428 SEQ chooses between allocating and deallocating. */
5431 aarch64_sme_mode_switch_regs::emit_sve_sp_adjust (sequence seq
)
5433 if (unsigned int count
= m_group_count
[MEM_SVE_DATA
] + sve_data_headroom ())
5434 emit_stack_adjust (seq
, count
* BYTES_PER_SVE_VECTOR
);
5437 /* Save or restore the MEM_SVE_DATA group. SEQ chooses between saving
5441 aarch64_sme_mode_switch_regs::emit_sve_data_moves (sequence seq
)
5443 for (auto &loc
: m_save_locations
)
5444 if (loc
.group
== MEM_SVE_DATA
)
5446 auto index
= loc
.index
+ sve_data_headroom ();
5447 emit_mem_move (seq
, loc
, index
* BYTES_PER_SVE_VECTOR
);
5451 /* Save or restore the MEM_SVE_PRED group. SEQ chooses between saving
5455 aarch64_sme_mode_switch_regs::emit_sve_pred_moves (sequence seq
)
5457 for (auto &loc
: m_save_locations
)
5458 if (loc
.group
== MEM_SVE_PRED
)
5459 emit_mem_move (seq
, loc
, loc
.index
* BYTES_PER_SVE_PRED
);
5462 /* Set DEST to (vec_series BASE STEP). */
5465 aarch64_expand_vec_series (rtx dest
, rtx base
, rtx step
)
5467 machine_mode mode
= GET_MODE (dest
);
5468 scalar_mode inner
= GET_MODE_INNER (mode
);
5470 /* Each operand can be a register or an immediate in the range [-16, 15]. */
5471 if (!aarch64_sve_index_immediate_p (base
))
5472 base
= force_reg (inner
, base
);
5473 if (!aarch64_sve_index_immediate_p (step
))
5474 step
= force_reg (inner
, step
);
5476 emit_set_insn (dest
, gen_rtx_VEC_SERIES (mode
, base
, step
));
5479 /* Duplicate 128-bit Advanced SIMD vector SRC so that it fills an SVE
5480 register of mode MODE. Use TARGET for the result if it's nonnull
5483 The two vector modes must have the same element mode. The behavior
5484 is to duplicate architectural lane N of SRC into architectural lanes
5485 N + I * STEP of the result. On big-endian targets, architectural
5486 lane 0 of an Advanced SIMD vector is the last element of the vector
5487 in memory layout, so for big-endian targets this operation has the
5488 effect of reversing SRC before duplicating it. Callers need to
5489 account for this. */
5492 aarch64_expand_sve_dupq (rtx target
, machine_mode mode
, rtx src
)
5494 machine_mode src_mode
= GET_MODE (src
);
5495 gcc_assert (GET_MODE_INNER (mode
) == GET_MODE_INNER (src_mode
));
5496 insn_code icode
= (BYTES_BIG_ENDIAN
5497 ? code_for_aarch64_vec_duplicate_vq_be (mode
)
5498 : code_for_aarch64_vec_duplicate_vq_le (mode
));
5501 expand_operand ops
[3];
5502 create_output_operand (&ops
[i
++], target
, mode
);
5503 create_output_operand (&ops
[i
++], src
, src_mode
);
5504 if (BYTES_BIG_ENDIAN
)
5506 /* Create a PARALLEL describing the reversal of SRC. */
5507 unsigned int nelts_per_vq
= 128 / GET_MODE_UNIT_BITSIZE (mode
);
5508 rtx sel
= aarch64_gen_stepped_int_parallel (nelts_per_vq
,
5509 nelts_per_vq
- 1, -1);
5510 create_fixed_operand (&ops
[i
++], sel
);
5512 expand_insn (icode
, i
, ops
);
5513 return ops
[0].value
;
5516 /* Try to force 128-bit vector value SRC into memory and use LD1RQ to fetch
5517 the memory image into DEST. Return true on success. */
5520 aarch64_expand_sve_ld1rq (rtx dest
, rtx src
)
5522 src
= force_const_mem (GET_MODE (src
), src
);
5526 /* Make sure that the address is legitimate. */
5527 if (!aarch64_sve_ld1rq_operand_p (src
))
5529 rtx addr
= force_reg (Pmode
, XEXP (src
, 0));
5530 src
= replace_equiv_address (src
, addr
);
5533 machine_mode mode
= GET_MODE (dest
);
5534 machine_mode pred_mode
= aarch64_sve_pred_mode (mode
);
5535 rtx ptrue
= aarch64_ptrue_reg (pred_mode
);
5536 emit_insn (gen_aarch64_sve_ld1rq (mode
, dest
, src
, ptrue
));
5540 /* SRC is an SVE CONST_VECTOR that contains N "foreground" values followed
5541 by N "background" values. Try to move it into TARGET using:
5543 PTRUE PRED.<T>, VL<N>
5544 MOV TRUE.<T>, #<foreground>
5545 MOV FALSE.<T>, #<background>
5546 SEL TARGET.<T>, PRED.<T>, TRUE.<T>, FALSE.<T>
5548 The PTRUE is always a single instruction but the MOVs might need a
5549 longer sequence. If the background value is zero (as it often is),
5550 the sequence can sometimes collapse to a PTRUE followed by a
5551 zero-predicated move.
5553 Return the target on success, otherwise return null. */
5556 aarch64_expand_sve_const_vector_sel (rtx target
, rtx src
)
5558 gcc_assert (CONST_VECTOR_NELTS_PER_PATTERN (src
) == 2);
5560 /* Make sure that the PTRUE is valid. */
5561 machine_mode mode
= GET_MODE (src
);
5562 machine_mode pred_mode
= aarch64_sve_pred_mode (mode
);
5563 unsigned int npatterns
= CONST_VECTOR_NPATTERNS (src
);
5564 if (aarch64_svpattern_for_vl (pred_mode
, npatterns
)
5565 == AARCH64_NUM_SVPATTERNS
)
5568 rtx_vector_builder
pred_builder (pred_mode
, npatterns
, 2);
5569 rtx_vector_builder
true_builder (mode
, npatterns
, 1);
5570 rtx_vector_builder
false_builder (mode
, npatterns
, 1);
5571 for (unsigned int i
= 0; i
< npatterns
; ++i
)
5573 true_builder
.quick_push (CONST_VECTOR_ENCODED_ELT (src
, i
));
5574 pred_builder
.quick_push (CONST1_RTX (BImode
));
5576 for (unsigned int i
= 0; i
< npatterns
; ++i
)
5578 false_builder
.quick_push (CONST_VECTOR_ENCODED_ELT (src
, i
+ npatterns
));
5579 pred_builder
.quick_push (CONST0_RTX (BImode
));
5581 expand_operand ops
[4];
5582 create_output_operand (&ops
[0], target
, mode
);
5583 create_input_operand (&ops
[1], true_builder
.build (), mode
);
5584 create_input_operand (&ops
[2], false_builder
.build (), mode
);
5585 create_input_operand (&ops
[3], pred_builder
.build (), pred_mode
);
5586 expand_insn (code_for_vcond_mask (mode
, mode
), 4, ops
);
5590 /* Return a register containing CONST_VECTOR SRC, given that SRC has an
5591 SVE data mode and isn't a legitimate constant. Use TARGET for the
5592 result if convenient.
5594 The returned register can have whatever mode seems most natural
5595 given the contents of SRC. */
5598 aarch64_expand_sve_const_vector (rtx target
, rtx src
)
5600 machine_mode mode
= GET_MODE (src
);
5601 unsigned int npatterns
= CONST_VECTOR_NPATTERNS (src
);
5602 unsigned int nelts_per_pattern
= CONST_VECTOR_NELTS_PER_PATTERN (src
);
5603 scalar_mode elt_mode
= GET_MODE_INNER (mode
);
5604 unsigned int elt_bits
= GET_MODE_BITSIZE (elt_mode
);
5605 unsigned int container_bits
= aarch64_sve_container_bits (mode
);
5606 unsigned int encoded_bits
= npatterns
* nelts_per_pattern
* container_bits
;
5608 if (nelts_per_pattern
== 1
5609 && encoded_bits
<= 128
5610 && container_bits
!= elt_bits
)
5612 /* We have a partial vector mode and a constant whose full-vector
5613 equivalent would occupy a repeating 128-bit sequence. Build that
5614 full-vector equivalent instead, so that we have the option of
5615 using LD1RQ and Advanced SIMD operations. */
5616 unsigned int repeat
= container_bits
/ elt_bits
;
5617 machine_mode full_mode
= aarch64_full_sve_mode (elt_mode
).require ();
5618 rtx_vector_builder
builder (full_mode
, npatterns
* repeat
, 1);
5619 for (unsigned int i
= 0; i
< npatterns
; ++i
)
5620 for (unsigned int j
= 0; j
< repeat
; ++j
)
5621 builder
.quick_push (CONST_VECTOR_ENCODED_ELT (src
, i
));
5622 target
= aarch64_target_reg (target
, full_mode
);
5623 return aarch64_expand_sve_const_vector (target
, builder
.build ());
5626 if (nelts_per_pattern
== 1 && encoded_bits
== 128)
5628 /* The constant is a duplicated quadword but can't be narrowed
5629 beyond a quadword. Get the memory image of the first quadword
5630 as a 128-bit vector and try using LD1RQ to load it from memory.
5632 The effect for both endiannesses is to load memory lane N into
5633 architectural lanes N + I * STEP of the result. On big-endian
5634 targets, the layout of the 128-bit vector in an Advanced SIMD
5635 register would be different from its layout in an SVE register,
5636 but this 128-bit vector is a memory value only. */
5637 machine_mode vq_mode
= aarch64_vq_mode (elt_mode
).require ();
5638 rtx vq_value
= simplify_gen_subreg (vq_mode
, src
, mode
, 0);
5639 if (vq_value
&& aarch64_expand_sve_ld1rq (target
, vq_value
))
5643 if (nelts_per_pattern
== 1 && encoded_bits
< 128)
5645 /* The vector is a repeating sequence of 64 bits or fewer.
5646 See if we can load them using an Advanced SIMD move and then
5647 duplicate it to fill a vector. This is better than using a GPR
5648 move because it keeps everything in the same register file. */
5649 machine_mode vq_mode
= aarch64_vq_mode (elt_mode
).require ();
5650 rtx_vector_builder
builder (vq_mode
, npatterns
, 1);
5651 for (unsigned int i
= 0; i
< npatterns
; ++i
)
5653 /* We want memory lane N to go into architectural lane N,
5654 so reverse for big-endian targets. The DUP .Q pattern
5655 has a compensating reverse built-in. */
5656 unsigned int srci
= BYTES_BIG_ENDIAN
? npatterns
- i
- 1 : i
;
5657 builder
.quick_push (CONST_VECTOR_ENCODED_ELT (src
, srci
));
5659 rtx vq_src
= builder
.build ();
5660 if (aarch64_simd_valid_immediate (vq_src
, NULL
))
5662 vq_src
= force_reg (vq_mode
, vq_src
);
5663 return aarch64_expand_sve_dupq (target
, mode
, vq_src
);
5666 /* Get an integer representation of the repeating part of Advanced
5667 SIMD vector VQ_SRC. This preserves the endianness of VQ_SRC,
5668 which for big-endian targets is lane-swapped wrt a normal
5669 Advanced SIMD vector. This means that for both endiannesses,
5670 memory lane N of SVE vector SRC corresponds to architectural
5671 lane N of a register holding VQ_SRC. This in turn means that
5672 memory lane 0 of SVE vector SRC is in the lsb of VQ_SRC (viewed
5673 as a single 128-bit value) and thus that memory lane 0 of SRC is
5674 in the lsb of the integer. Duplicating the integer therefore
5675 ensures that memory lane N of SRC goes into architectural lane
5676 N + I * INDEX of the SVE register. */
5677 scalar_mode int_mode
= int_mode_for_size (encoded_bits
, 0).require ();
5678 rtx elt_value
= simplify_gen_subreg (int_mode
, vq_src
, vq_mode
, 0);
5681 /* Pretend that we had a vector of INT_MODE to start with. */
5682 elt_mode
= int_mode
;
5683 mode
= aarch64_full_sve_mode (int_mode
).require ();
5685 /* If the integer can be moved into a general register by a
5686 single instruction, do that and duplicate the result. */
5687 if (CONST_INT_P (elt_value
)
5688 && aarch64_move_imm (INTVAL (elt_value
),
5689 encoded_bits
<= 32 ? SImode
: DImode
))
5691 elt_value
= force_reg (elt_mode
, elt_value
);
5692 return expand_vector_broadcast (mode
, elt_value
);
5695 else if (npatterns
== 1)
5696 /* We're duplicating a single value, but can't do better than
5697 force it to memory and load from there. This handles things
5698 like symbolic constants. */
5699 elt_value
= CONST_VECTOR_ENCODED_ELT (src
, 0);
5703 /* Load the element from memory if we can, otherwise move it into
5704 a register and use a DUP. */
5705 rtx op
= force_const_mem (elt_mode
, elt_value
);
5707 op
= force_reg (elt_mode
, elt_value
);
5708 return expand_vector_broadcast (mode
, op
);
5712 /* Try using INDEX. */
5714 if (const_vec_series_p (src
, &base
, &step
))
5716 aarch64_expand_vec_series (target
, base
, step
);
5720 /* From here on, it's better to force the whole constant to memory
5722 if (GET_MODE_NUNITS (mode
).is_constant ())
5725 if (nelts_per_pattern
== 2)
5726 if (rtx res
= aarch64_expand_sve_const_vector_sel (target
, src
))
5729 /* Expand each pattern individually. */
5730 gcc_assert (npatterns
> 1);
5731 rtx_vector_builder builder
;
5732 auto_vec
<rtx
, 16> vectors (npatterns
);
5733 for (unsigned int i
= 0; i
< npatterns
; ++i
)
5735 builder
.new_vector (mode
, 1, nelts_per_pattern
);
5736 for (unsigned int j
= 0; j
< nelts_per_pattern
; ++j
)
5737 builder
.quick_push (CONST_VECTOR_ELT (src
, i
+ j
* npatterns
));
5738 vectors
.quick_push (force_reg (mode
, builder
.build ()));
5741 /* Use permutes to interleave the separate vectors. */
5742 while (npatterns
> 1)
5745 for (unsigned int i
= 0; i
< npatterns
; ++i
)
5747 rtx tmp
= (npatterns
== 1 ? target
: gen_reg_rtx (mode
));
5748 rtvec v
= gen_rtvec (2, vectors
[i
], vectors
[i
+ npatterns
]);
5749 emit_set_insn (tmp
, gen_rtx_UNSPEC (mode
, v
, UNSPEC_ZIP1
));
5753 gcc_assert (vectors
[0] == target
);
5757 /* Use WHILE to set a predicate register of mode MODE in which the first
5758 VL bits are set and the rest are clear. Use TARGET for the register
5759 if it's nonnull and convenient. */
5762 aarch64_sve_move_pred_via_while (rtx target
, machine_mode mode
,
5765 rtx limit
= force_reg (DImode
, gen_int_mode (vl
, DImode
));
5766 target
= aarch64_target_reg (target
, mode
);
5767 emit_insn (gen_while (UNSPEC_WHILELO
, DImode
, mode
,
5768 target
, const0_rtx
, limit
));
5773 aarch64_expand_sve_const_pred_1 (rtx
, rtx_vector_builder
&, bool);
5775 /* BUILDER is a constant predicate in which the index of every set bit
5776 is a multiple of ELT_SIZE (which is <= 8). Try to load the constant
5777 by inverting every element at a multiple of ELT_SIZE and EORing the
5778 result with an ELT_SIZE PTRUE.
5780 Return a register that contains the constant on success, otherwise
5781 return null. Use TARGET as the register if it is nonnull and
5785 aarch64_expand_sve_const_pred_eor (rtx target
, rtx_vector_builder
&builder
,
5786 unsigned int elt_size
)
5788 /* Invert every element at a multiple of ELT_SIZE, keeping the
5790 rtx_vector_builder
inv_builder (VNx16BImode
, builder
.npatterns (),
5791 builder
.nelts_per_pattern ());
5792 for (unsigned int i
= 0; i
< builder
.encoded_nelts (); ++i
)
5793 if ((i
& (elt_size
- 1)) == 0 && INTVAL (builder
.elt (i
)) == 0)
5794 inv_builder
.quick_push (const1_rtx
);
5796 inv_builder
.quick_push (const0_rtx
);
5797 inv_builder
.finalize ();
5799 /* See if we can load the constant cheaply. */
5800 rtx inv
= aarch64_expand_sve_const_pred_1 (NULL_RTX
, inv_builder
, false);
5804 /* EOR the result with an ELT_SIZE PTRUE. */
5805 rtx mask
= aarch64_ptrue_all (elt_size
);
5806 mask
= force_reg (VNx16BImode
, mask
);
5807 inv
= gen_lowpart (VNx16BImode
, inv
);
5808 target
= aarch64_target_reg (target
, VNx16BImode
);
5809 emit_insn (gen_aarch64_pred_z (XOR
, VNx16BImode
, target
, mask
, inv
, mask
));
/* BUILDER is a constant predicate in which the index of every set bit
   is a multiple of ELT_SIZE (which is <= 8).  Try to load the constant
   using a TRN1 of size PERMUTE_SIZE, which is >= ELT_SIZE.  Return the
   register on success, otherwise return null.  Use TARGET as the register
   if nonnull and convenient.  */

static rtx
aarch64_expand_sve_const_pred_trn (rtx target, rtx_vector_builder &builder,
                                   unsigned int elt_size,
                                   unsigned int permute_size)
{
  /* We're going to split the constant into two new constants A and B,
     with element I of BUILDER going into A if (I & PERMUTE_SIZE) == 0
     and into B otherwise.  E.g. for PERMUTE_SIZE == 4 && ELT_SIZE == 1:

     A: { 0, 1, 2, 3, _, _, _, _, 8, 9, 10, 11, _, _, _, _ }
     B: { 4, 5, 6, 7, _, _, _, _, 12, 13, 14, 15, _, _, _, _ }

     where _ indicates elements that will be discarded by the permute.

     First calculate the ELT_SIZEs for A and B.  */
  unsigned int a_elt_size = GET_MODE_SIZE (DImode);
  unsigned int b_elt_size = GET_MODE_SIZE (DImode);
  for (unsigned int i = 0; i < builder.encoded_nelts (); i += elt_size)
    if (INTVAL (builder.elt (i)) != 0)
      {
        if (i & permute_size)
          b_elt_size |= i - permute_size;
        else
          a_elt_size |= i;
      }
  a_elt_size &= -a_elt_size;
  b_elt_size &= -b_elt_size;

  /* Now construct the vectors themselves.  */
  rtx_vector_builder a_builder (VNx16BImode, builder.npatterns (),
                                builder.nelts_per_pattern ());
  rtx_vector_builder b_builder (VNx16BImode, builder.npatterns (),
                                builder.nelts_per_pattern ());
  unsigned int nelts = builder.encoded_nelts ();
  for (unsigned int i = 0; i < nelts; ++i)
    if (i & (elt_size - 1))
      {
        a_builder.quick_push (const0_rtx);
        b_builder.quick_push (const0_rtx);
      }
    else if ((i & permute_size) == 0)
      {
        /* The A and B elements are significant.  */
        a_builder.quick_push (builder.elt (i));
        b_builder.quick_push (builder.elt (i + permute_size));
      }
    else
      {
        /* The A and B elements are going to be discarded, so pick whatever
           is likely to give a nice constant.  We are targeting element
           sizes A_ELT_SIZE and B_ELT_SIZE for A and B respectively,
           with the aim of each being a sequence of ones followed by
           a sequence of zeros.  So:

           * if X_ELT_SIZE <= PERMUTE_SIZE, the best approach is to
             duplicate the last X_ELT_SIZE element, to extend the
             current sequence of ones or zeros.

           * if X_ELT_SIZE > PERMUTE_SIZE, the best approach is to add a
             zero, so that the constant really does have X_ELT_SIZE and
             not a smaller size.  */
        if (a_elt_size > permute_size)
          a_builder.quick_push (const0_rtx);
        else
          a_builder.quick_push (a_builder.elt (i - a_elt_size));
        if (b_elt_size > permute_size)
          b_builder.quick_push (const0_rtx);
        else
          b_builder.quick_push (b_builder.elt (i - b_elt_size));
      }
  a_builder.finalize ();
  b_builder.finalize ();

  /* Try loading A into a register.  */
  rtx_insn *last = get_last_insn ();
  rtx a = aarch64_expand_sve_const_pred_1 (NULL_RTX, a_builder, false);
  if (!a)
    return NULL_RTX;

  /* Try loading B into a register.  */
  rtx b = a;
  if (a_builder != b_builder)
    {
      b = aarch64_expand_sve_const_pred_1 (NULL_RTX, b_builder, false);
      if (!b)
        {
          delete_insns_since (last);
          return NULL_RTX;
        }
    }

  /* Emit the TRN1 itself.  We emit a TRN that operates on VNx16BI
     operands but permutes them as though they had mode MODE.  */
  machine_mode mode = aarch64_sve_pred_mode (permute_size).require ();
  target = aarch64_target_reg (target, GET_MODE (a));
  rtx type_reg = CONST0_RTX (mode);
  emit_insn (gen_aarch64_sve_trn1_conv (mode, target, a, b, type_reg));
  return target;
}
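/* Illustrative sketch, not part of the GCC sources: the split above can
   be modelled on plain arrays.  With ELT_SIZE == 1 and PERMUTE_SIZE == 4,
   the significant elements of a 16-element constant ORIG are distributed as

     for (unsigned int i = 0; i < 16; i += 8)
       for (unsigned int j = 0; j < 4; ++j)
         {
           a[i + j] = orig[i + j];       // chunks 0-3, 8-11, ...
           b[i + j] = orig[i + j + 4];   // chunks 4-7, 12-15, ...
         }

   and the closing TRN1 (at .S granularity, i.e. 4-byte data chunks)
   interleaves the even-numbered chunks of A and B:

     result = { a[0..3], b[0..3], a[8..11], b[8..11], ... }

   which restores the original order.  The don't-care chunks are chosen so
   that A and B individually have a good chance of being loadable by the
   simpler PTRUE/EOR methods.  */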
/* Subroutine of aarch64_expand_sve_const_pred.  Try to load the VNx16BI
   constant in BUILDER into an SVE predicate register.  Return the register
   on success, otherwise return null.  Use TARGET for the register if
   nonnull and convenient.

   ALLOW_RECURSE_P is true if we can use methods that would call this
   function recursively.  */

static rtx
aarch64_expand_sve_const_pred_1 (rtx target, rtx_vector_builder &builder,
                                 bool allow_recurse_p)
{
  if (builder.encoded_nelts () == 1)
    /* A PFALSE or a PTRUE .B ALL.  */
    return aarch64_emit_set_immediate (target, builder);

  unsigned int elt_size = aarch64_widest_sve_pred_elt_size (builder);
  if (int vl = aarch64_partial_ptrue_length (builder, elt_size))
    {
      /* If we can load the constant using PTRUE, use it as-is.  */
      machine_mode mode = aarch64_sve_pred_mode (elt_size).require ();
      if (aarch64_svpattern_for_vl (mode, vl) != AARCH64_NUM_SVPATTERNS)
        return aarch64_emit_set_immediate (target, builder);

      /* Otherwise use WHILE to set the first VL bits.  */
      return aarch64_sve_move_pred_via_while (target, mode, vl);
    }

  if (!allow_recurse_p)
    return NULL_RTX;

  /* Try inverting the vector in element size ELT_SIZE and then EORing
     the result with an ELT_SIZE PTRUE.  */
  if (INTVAL (builder.elt (0)) == 0)
    if (rtx res = aarch64_expand_sve_const_pred_eor (target, builder,
                                                     elt_size))
      return res;

  /* Try using TRN1 to permute two simpler constants.  */
  for (unsigned int i = elt_size; i <= 8; i *= 2)
    if (rtx res = aarch64_expand_sve_const_pred_trn (target, builder,
                                                     elt_size, i))
      return res;

  return NULL_RTX;
}
/* Return an SVE predicate register that contains the VNx16BImode
   constant in BUILDER, without going through the move expanders.

   The returned register can have whatever mode seems most natural
   given the contents of BUILDER.  Use TARGET for the result if
   convenient.  */

static rtx
aarch64_expand_sve_const_pred (rtx target, rtx_vector_builder &builder)
{
  /* Try loading the constant using pure predicate operations.  */
  if (rtx res = aarch64_expand_sve_const_pred_1 (target, builder, true))
    return res;

  /* Try forcing the constant to memory.  */
  if (builder.full_nelts ().is_constant ())
    if (rtx mem = force_const_mem (VNx16BImode, builder.build ()))
      {
        target = aarch64_target_reg (target, VNx16BImode);
        emit_move_insn (target, mem);
        return target;
      }

  /* The last resort is to load the constant as an integer and then
     compare it against zero.  Use -1 for set bits in order to increase
     the chances of using SVE DUPM or an Advanced SIMD byte mask.  */
  rtx_vector_builder int_builder (VNx16QImode, builder.npatterns (),
                                  builder.nelts_per_pattern ());
  for (unsigned int i = 0; i < builder.encoded_nelts (); ++i)
    int_builder.quick_push (INTVAL (builder.elt (i))
                            ? constm1_rtx : const0_rtx);
  return aarch64_convert_sve_data_to_pred (target, VNx16BImode,
                                           int_builder.build ());
}
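/* Illustrative sketch, not part of the GCC sources: the last-resort path
   above materialises the predicate from a byte vector of 0 and -1 values
   and a compare under an all-true predicate, roughly:

       mov      z0.s, #0xffff          // bytes { -1, -1, 0, 0, ... } via DUPM
       ptrue    p1.b
       cmpne    p0.b, p1/z, z0.b, #0   // set predicate bits where byte != 0

   The constant shown is only an example; using -1 rather than 1 for the
   set bytes gives the vector constant a better chance of matching a DUPM
   bitmask immediate or an Advanced SIMD byte mask.  */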
6001 /* Set DEST to immediate IMM. */
6004 aarch64_expand_mov_immediate (rtx dest
, rtx imm
)
6006 machine_mode mode
= GET_MODE (dest
);
6008 /* Check on what type of symbol it is. */
6009 scalar_int_mode int_mode
;
6010 if ((SYMBOL_REF_P (imm
)
6011 || LABEL_REF_P (imm
)
6012 || GET_CODE (imm
) == CONST
6013 || GET_CODE (imm
) == CONST_POLY_INT
)
6014 && is_a
<scalar_int_mode
> (mode
, &int_mode
))
6018 HOST_WIDE_INT const_offset
;
6019 enum aarch64_symbol_type sty
;
6021 /* If we have (const (plus symbol offset)), separate out the offset
6022 before we start classifying the symbol. */
6023 rtx base
= strip_offset (imm
, &offset
);
6025 /* We must always add an offset involving VL separately, rather than
6026 folding it into the relocation. */
6027 if (!offset
.is_constant (&const_offset
))
6031 aarch64_report_sve_required ();
6034 if (base
== const0_rtx
6035 && (aarch64_sve_cnt_immediate_p (offset
)
6036 || aarch64_sve_rdvl_immediate_p (offset
)))
6037 emit_insn (gen_rtx_SET (dest
, imm
));
6040 /* Do arithmetic on 32-bit values if the result is smaller
6042 if (partial_subreg_p (int_mode
, SImode
))
6044 /* It is invalid to do symbol calculations in modes
6045 narrower than SImode. */
6046 gcc_assert (base
== const0_rtx
);
6047 dest
= gen_lowpart (SImode
, dest
);
6050 if (base
!= const0_rtx
)
6052 base
= aarch64_force_temporary (int_mode
, dest
, base
);
6053 aarch64_add_offset (int_mode
, dest
, base
, offset
,
6054 NULL_RTX
, NULL_RTX
, 0, false);
6057 aarch64_add_offset (int_mode
, dest
, base
, offset
,
6058 dest
, NULL_RTX
, 0, false);
6063 if (aarch64_rdsvl_immediate_p (base
))
6065 /* We could handle non-constant offsets if they are ever
6067 gcc_assert (const_offset
== 0);
6068 emit_insn (gen_rtx_SET (dest
, imm
));
6072 sty
= aarch64_classify_symbol (base
, const_offset
);
6075 case SYMBOL_FORCE_TO_MEM
:
6076 if (int_mode
!= ptr_mode
)
6077 imm
= convert_memory_address (ptr_mode
, imm
);
6079 if (const_offset
!= 0
6080 && targetm
.cannot_force_const_mem (ptr_mode
, imm
))
6082 gcc_assert (can_create_pseudo_p ());
6083 base
= aarch64_force_temporary (int_mode
, dest
, base
);
6084 aarch64_add_offset (int_mode
, dest
, base
, const_offset
,
6085 NULL_RTX
, NULL_RTX
, 0, false);
6089 mem
= force_const_mem (ptr_mode
, imm
);
6092 /* If we aren't generating PC relative literals, then
6093 we need to expand the literal pool access carefully.
6094 This is something that needs to be done in a number
6095 of places, so could well live as a separate function. */
6096 if (!aarch64_pcrelative_literal_loads
)
6098 gcc_assert (can_create_pseudo_p ());
6099 base
= gen_reg_rtx (ptr_mode
);
6100 aarch64_expand_mov_immediate (base
, XEXP (mem
, 0));
6101 if (ptr_mode
!= Pmode
)
6102 base
= convert_memory_address (Pmode
, base
);
6103 mem
= gen_rtx_MEM (ptr_mode
, base
);
6106 if (int_mode
!= ptr_mode
)
6107 mem
= gen_rtx_ZERO_EXTEND (int_mode
, mem
);
6109 emit_insn (gen_rtx_SET (dest
, mem
));
6113 case SYMBOL_SMALL_TLSGD
:
6114 case SYMBOL_SMALL_TLSDESC
:
6115 case SYMBOL_SMALL_TLSIE
:
6116 case SYMBOL_SMALL_GOT_28K
:
6117 case SYMBOL_SMALL_GOT_4G
:
6118 case SYMBOL_TINY_GOT
:
6119 case SYMBOL_TINY_TLSIE
:
6120 if (const_offset
!= 0)
6122 gcc_assert(can_create_pseudo_p ());
6123 base
= aarch64_force_temporary (int_mode
, dest
, base
);
6124 aarch64_add_offset (int_mode
, dest
, base
, const_offset
,
6125 NULL_RTX
, NULL_RTX
, 0, false);
6130 case SYMBOL_SMALL_ABSOLUTE
:
6131 case SYMBOL_TINY_ABSOLUTE
:
6132 case SYMBOL_TLSLE12
:
6133 case SYMBOL_TLSLE24
:
6134 case SYMBOL_TLSLE32
:
6135 case SYMBOL_TLSLE48
:
6136 aarch64_load_symref_appropriately (dest
, imm
, sty
);
6144 if (!CONST_INT_P (imm
))
6146 if (aarch64_sve_pred_mode_p (mode
))
6148 /* Only the low bit of each .H, .S and .D element is defined,
6149 so we can set the upper bits to whatever we like. If the
6150 predicate is all-true in MODE, prefer to set all the undefined
6151 bits as well, so that we can share a single .B predicate for
6153 if (imm
== CONSTM1_RTX (mode
))
6154 imm
= CONSTM1_RTX (VNx16BImode
);
6156 /* All methods for constructing predicate modes wider than VNx16BI
6157 will set the upper bits of each element to zero. Expose this
6158 by moving such constants as a VNx16BI, so that all bits are
6159 significant and so that constants for different modes can be
6160 shared. The wider constant will still be available as a
6162 rtx_vector_builder builder
;
6163 if (aarch64_get_sve_pred_bits (builder
, imm
))
6165 rtx res
= aarch64_expand_sve_const_pred (dest
, builder
);
6167 emit_move_insn (dest
, gen_lowpart (mode
, res
));
6172 if (GET_CODE (imm
) == HIGH
6173 || aarch64_simd_valid_immediate (imm
, NULL
))
6175 emit_insn (gen_rtx_SET (dest
, imm
));
6179 if (CONST_VECTOR_P (imm
) && aarch64_sve_data_mode_p (mode
))
6180 if (rtx res
= aarch64_expand_sve_const_vector (dest
, imm
))
6183 emit_insn (gen_aarch64_sve_reinterpret (mode
, dest
, res
));
6187 rtx mem
= force_const_mem (mode
, imm
);
6189 emit_move_insn (dest
, mem
);
6193 aarch64_internal_mov_immediate (dest
, imm
, true, mode
);
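/* Illustrative sketch, not part of the GCC sources: for a typical
   small-code-model symbolic constant such as &foo + 12, the function above
   classifies the symbol and aarch64_load_symref_appropriately splits the
   move into an ADRP/ADD pair, roughly:

       adrp     x0, foo+12             // page of the address
       add      x0, x0, :lo12:foo+12   // low 12 bits of the address

   whereas a plain CONST_INT falls through to
   aarch64_internal_mov_immediate and becomes MOVZ/MOVN/MOVK sequences.
   Register numbers here are purely for illustration.  */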
/* Return the MEM rtx that provides the canary value that should be used
   for stack-smashing protection.  MODE is the mode of the memory.
   For SSP_GLOBAL, DECL_RTL is the MEM rtx for the canary variable
   (__stack_chk_guard), otherwise it has no useful value.  SALT_TYPE
   indicates whether the caller is performing a SET or a TEST operation.  */

rtx
aarch64_stack_protect_canary_mem (machine_mode mode, rtx decl_rtl,
                                  aarch64_salt_type salt_type)
{
  rtx addr;
  if (aarch64_stack_protector_guard == SSP_GLOBAL)
    {
      gcc_assert (MEM_P (decl_rtl));
      addr = XEXP (decl_rtl, 0);
      poly_int64 offset;
      rtx base = strip_offset_and_salt (addr, &offset);
      if (!SYMBOL_REF_P (base))
        return decl_rtl;

      rtvec v = gen_rtvec (2, base, GEN_INT (salt_type));
      addr = gen_rtx_UNSPEC (Pmode, v, UNSPEC_SALT_ADDR);
      addr = gen_rtx_CONST (Pmode, addr);
      addr = plus_constant (Pmode, addr, offset);
    }
  else
    {
      /* Calculate the address from the system register.  */
      rtx salt = GEN_INT (salt_type);
      addr = gen_reg_rtx (mode);
      if (mode == DImode)
        emit_insn (gen_reg_stack_protect_address_di (addr, salt));
      else
        {
          emit_insn (gen_reg_stack_protect_address_si (addr, salt));
          addr = convert_memory_address (Pmode, addr);
        }
      addr = plus_constant (Pmode, addr, aarch64_stack_protector_guard_offset);
    }
  return gen_rtx_MEM (mode, force_reg (Pmode, addr));
}
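/* Illustrative sketch, not part of the GCC sources: with the system-register
   guard (for example -mstack-protector-guard=sysreg
   -mstack-protector-guard-reg=sp_el0 -mstack-protector-guard-offset=16,
   values chosen only for illustration), the MEM returned above typically
   expands to something like:

       mrs      x1, sp_el0             // reg_stack_protect_address_*
       ldr      x2, [x1, #16]          // canary at GUARD_REG + OFFSET

   In the SSP_GLOBAL case the canary is simply loaded from
   __stack_chk_guard; the UNSPEC_SALT_ADDR wrapper keeps the SET and TEST
   addresses distinct so that the canary load cannot be reused between
   the two operations.  */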
/* Emit an SVE predicated move from SRC to DEST.  PRED is a predicate
   that is known to contain PTRUE.  */

void
aarch64_emit_sve_pred_move (rtx dest, rtx pred, rtx src)
{
  expand_operand ops[3];
  machine_mode mode = GET_MODE (dest);
  create_output_operand (&ops[0], dest, mode);
  create_input_operand (&ops[1], pred, GET_MODE (pred));
  create_input_operand (&ops[2], src, mode);
  temporary_volatile_ok v (true);
  expand_insn (code_for_aarch64_pred_mov (mode), 3, ops);
}
/* Expand a pre-RA SVE data move from SRC to DEST in which at least one
   operand is in memory.  In this case we need to use the predicated LD1
   and ST1 instead of LDR and STR, both for correctness on big-endian
   targets and because LD1 and ST1 support a wider range of addressing modes.
   PRED_MODE is the mode of the predicate.

   See the comment at the head of aarch64-sve.md for details about the
   big-endian handling.  */

void
aarch64_expand_sve_mem_move (rtx dest, rtx src, machine_mode pred_mode)
{
  machine_mode mode = GET_MODE (dest);
  rtx ptrue = aarch64_ptrue_reg (pred_mode);
  if (!register_operand (src, mode)
      && !register_operand (dest, mode))
    {
      rtx tmp = gen_reg_rtx (mode);
      if (MEM_P (src))
        aarch64_emit_sve_pred_move (tmp, ptrue, src);
      else
        emit_move_insn (tmp, src);
      src = tmp;
    }
  aarch64_emit_sve_pred_move (dest, ptrue, src);
}
6280 /* Called only on big-endian targets. See whether an SVE vector move
6281 from SRC to DEST is effectively a REV[BHW] instruction, because at
6282 least one operand is a subreg of an SVE vector that has wider or
6283 narrower elements. Return true and emit the instruction if so.
6287 (set (reg:VNx8HI R1) (subreg:VNx8HI (reg:VNx16QI R2) 0))
6289 represents a VIEW_CONVERT between the following vectors, viewed
6292 R2: { [0].high, [0].low, [1].high, [1].low, ... }
6293 R1: { [0], [1], [2], [3], ... }
6295 The high part of lane X in R2 should therefore correspond to lane X*2
6296 of R1, but the register representations are:
6299 R2: ...... [1].high [1].low [0].high [0].low
6300 R1: ...... [3] [2] [1] [0]
6302 where the low part of lane X in R2 corresponds to lane X*2 in R1.
6303 We therefore need a reverse operation to swap the high and low values
6306 This is purely an optimization. Without it we would spill the
6307 subreg operand to the stack in one mode and reload it in the
6308 other mode, which has the same effect as the REV. */
6311 aarch64_maybe_expand_sve_subreg_move (rtx dest
, rtx src
)
6313 gcc_assert (BYTES_BIG_ENDIAN
);
6315 /* Do not try to optimize subregs that LRA has created for matched
6316 reloads. These subregs only exist as a temporary measure to make
6317 the RTL well-formed, but they are exempt from the usual
6318 TARGET_CAN_CHANGE_MODE_CLASS rules.
6320 For example, if we have:
6322 (set (reg:VNx8HI R1) (foo:VNx8HI (reg:VNx4SI R2)))
6324 and the constraints require R1 and R2 to be in the same register,
6325 LRA may need to create RTL such as:
6327 (set (subreg:VNx4SI (reg:VNx8HI TMP) 0) (reg:VNx4SI R2))
6328 (set (reg:VNx8HI TMP) (foo:VNx8HI (subreg:VNx4SI (reg:VNx8HI TMP) 0)))
6329 (set (reg:VNx8HI R1) (reg:VNx8HI TMP))
6331 which forces both the input and output of the original instruction
6332 to use the same hard register. But for this to work, the normal
6333 rules have to be suppressed on the subreg input, otherwise LRA
6334 would need to reload that input too, meaning that the process
6335 would never terminate. To compensate for this, the normal rules
6336 are also suppressed for the subreg output of the first move.
6337 Ignoring the special case and handling the first move normally
6338 would therefore generate wrong code: we would reverse the elements
6339 for the first subreg but not reverse them back for the second subreg. */
6340 if (SUBREG_P (dest
) && !LRA_SUBREG_P (dest
))
6341 dest
= SUBREG_REG (dest
);
6342 if (SUBREG_P (src
) && !LRA_SUBREG_P (src
))
6343 src
= SUBREG_REG (src
);
6345 /* The optimization handles two single SVE REGs with different element
6349 || aarch64_classify_vector_mode (GET_MODE (dest
)) != VEC_SVE_DATA
6350 || aarch64_classify_vector_mode (GET_MODE (src
)) != VEC_SVE_DATA
6351 || (GET_MODE_UNIT_SIZE (GET_MODE (dest
))
6352 == GET_MODE_UNIT_SIZE (GET_MODE (src
))))
6355 /* Generate *aarch64_sve_mov<mode>_subreg_be. */
6356 rtx ptrue
= aarch64_ptrue_reg (VNx16BImode
);
6357 rtx unspec
= gen_rtx_UNSPEC (GET_MODE (dest
), gen_rtvec (2, ptrue
, src
),
6359 emit_insn (gen_rtx_SET (dest
, unspec
));
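/* Illustrative sketch, not part of the GCC sources: on big-endian targets
   the VIEW_CONVERT handled above amounts to a byte reversal within each
   wider element.  For a VNx8HI <-> VNx16QI subreg the split form ends up
   as a single predicated REVB on .H containers, e.g.

       ptrue    p0.b
       revb     z0.h, p0/m, z1.h       // swap the two bytes of each .H lane

   which replaces the spill-in-one-mode/reload-in-the-other sequence the
   comment above describes.  Register numbers are only for illustration.  */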
/* Return a copy of X with mode MODE, without changing its other
   attributes.  Unlike gen_lowpart, this doesn't care whether the
   mode change is valid.  */

static rtx
aarch64_replace_reg_mode (rtx x, machine_mode mode)
{
  if (GET_MODE (x) == mode)
    return x;

  x = shallow_copy_rtx (x);
  set_mode_and_regno (x, mode, REGNO (x));
  return x;
}

/* Return the SVE REV[BHW] unspec for reversing quantities of mode MODE
   stored in wider integer containers.  */

static unsigned int
aarch64_sve_rev_unspec (machine_mode mode)
{
  switch (GET_MODE_UNIT_SIZE (mode))
    {
    case 1: return UNSPEC_REVB;
    case 2: return UNSPEC_REVH;
    case 4: return UNSPEC_REVW;
    default: gcc_unreachable ();
    }
}
/* Split a *aarch64_sve_mov<mode>_subreg_be pattern with the given
   operands.  */

void
aarch64_split_sve_subreg_move (rtx dest, rtx ptrue, rtx src)
{
  /* Decide which REV operation we need.  The mode with wider elements
     determines the mode of the operands and the mode with the narrower
     elements determines the reverse width.  */
  machine_mode mode_with_wider_elts = aarch64_sve_int_mode (GET_MODE (dest));
  machine_mode mode_with_narrower_elts = aarch64_sve_int_mode (GET_MODE (src));
  if (GET_MODE_UNIT_SIZE (mode_with_wider_elts)
      < GET_MODE_UNIT_SIZE (mode_with_narrower_elts))
    std::swap (mode_with_wider_elts, mode_with_narrower_elts);

  unsigned int unspec = aarch64_sve_rev_unspec (mode_with_narrower_elts);
  machine_mode pred_mode = aarch64_sve_pred_mode (mode_with_wider_elts);

  /* Get the operands in the appropriate modes and emit the instruction.  */
  ptrue = gen_lowpart (pred_mode, ptrue);
  dest = aarch64_replace_reg_mode (dest, mode_with_wider_elts);
  src = aarch64_replace_reg_mode (src, mode_with_wider_elts);
  emit_insn (gen_aarch64_pred (unspec, mode_with_wider_elts,
                               dest, ptrue, src));
}
static bool
aarch64_function_ok_for_sibcall (tree, tree exp)
{
  if (crtl->abi->id () != expr_callee_abi (exp).id ())
    return false;

  tree fntype = TREE_TYPE (TREE_TYPE (CALL_EXPR_FN (exp)));
  if (aarch64_fntype_pstate_sm (fntype) & ~aarch64_cfun_incoming_pstate_sm ())
    return false;
  for (auto state : { "za", "zt0" })
    if (bool (aarch64_cfun_shared_flags (state))
        != bool (aarch64_fntype_shared_flags (fntype, state)))
      return false;
  return true;
}
6435 /* Subroutine of aarch64_pass_by_reference for arguments that are not
6436 passed in SVE registers. */
6439 aarch64_pass_by_reference_1 (CUMULATIVE_ARGS
*pcum
,
6440 const function_arg_info
&arg
)
6443 machine_mode dummymode
;
6446 /* GET_MODE_SIZE (BLKmode) is useless since it is 0. */
6447 if (arg
.mode
== BLKmode
&& arg
.type
)
6448 size
= int_size_in_bytes (arg
.type
);
6450 /* No frontends can create types with variable-sized modes, so we
6451 shouldn't be asked to pass or return them. */
6452 size
= GET_MODE_SIZE (arg
.mode
).to_constant ();
6454 /* Aggregates are passed by reference based on their size. */
6455 if (arg
.aggregate_type_p ())
6456 size
= int_size_in_bytes (arg
.type
);
6458 /* Variable sized arguments are always returned by reference. */
6462 /* Can this be a candidate to be passed in fp/simd register(s)? */
6463 if (aarch64_vfp_is_call_or_return_candidate (arg
.mode
, arg
.type
,
6464 &dummymode
, &nregs
, NULL
,
6465 !pcum
|| pcum
->silent_p
))
6468 /* Arguments which are variable sized or larger than 2 registers are
6469 passed by reference unless they are a homogenous floating point
6471 return size
> 2 * UNITS_PER_WORD
;
6474 /* Implement TARGET_PASS_BY_REFERENCE. */
6477 aarch64_pass_by_reference (cumulative_args_t pcum_v
,
6478 const function_arg_info
&arg
)
6480 CUMULATIVE_ARGS
*pcum
= get_cumulative_args (pcum_v
);
6483 return aarch64_pass_by_reference_1 (pcum
, arg
);
6485 pure_scalable_type_info pst_info
;
6486 switch (pst_info
.analyze (arg
.type
))
6488 case pure_scalable_type_info::IS_PST
:
6489 if (pcum
&& !pcum
->silent_p
&& !TARGET_SVE
)
6490 /* We can't gracefully recover at this point, so make this a
6492 fatal_error (input_location
, "arguments of type %qT require"
6493 " the SVE ISA extension", arg
.type
);
6495 /* Variadic SVE types are passed by reference. Normal non-variadic
6496 arguments are too if we've run out of registers. */
6498 || pcum
->aapcs_nvrn
+ pst_info
.num_zr () > NUM_FP_ARG_REGS
6499 || pcum
->aapcs_nprn
+ pst_info
.num_pr () > NUM_PR_ARG_REGS
);
6501 case pure_scalable_type_info::DOESNT_MATTER
:
6502 gcc_assert (aarch64_pass_by_reference_1 (pcum
, arg
));
6505 case pure_scalable_type_info::NO_ABI_IDENTITY
:
6506 case pure_scalable_type_info::ISNT_PST
:
6507 return aarch64_pass_by_reference_1 (pcum
, arg
);
/* Return TRUE if VALTYPE is padded to its least significant bits.  */

static bool
aarch64_return_in_msb (const_tree valtype)
{
  machine_mode dummy_mode;
  int dummy_int;

  /* Never happens in little-endian mode.  */
  if (!BYTES_BIG_ENDIAN)
    return false;

  /* Only composite types smaller than or equal to 16 bytes can
     be potentially returned in registers.  */
  if (!aarch64_composite_type_p (valtype, TYPE_MODE (valtype))
      || int_size_in_bytes (valtype) <= 0
      || int_size_in_bytes (valtype) > 16)
    return false;

  /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
     or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
     is always passed/returned in the least significant bits of fp/simd
     register(s).  */
  if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype), valtype,
                                               &dummy_mode, &dummy_int, NULL,
                                               false))
    return false;

  /* Likewise pure scalable types for SVE vector and predicate registers.  */
  pure_scalable_type_info pst_info;
  if (pst_info.analyze_registers (valtype))
    return false;

  return true;
}
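/* Illustrative sketch, not part of the GCC sources: on a big-endian target
   a small composite such as

     struct s { char c[3]; };

   is returned padded towards its least significant bits, so the three
   bytes occupy bits [63:40] of x0 rather than bits [23:0].  The function
   above is what tells the middle end whether that shift is needed; on
   little-endian it always answers false.  */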
6547 /* Implement TARGET_FUNCTION_VALUE.
6548 Define how to find the value returned by a function. */
6551 aarch64_function_value (const_tree type
, const_tree func
,
6552 bool outgoing ATTRIBUTE_UNUSED
)
6557 mode
= TYPE_MODE (type
);
6558 if (INTEGRAL_TYPE_P (type
))
6559 mode
= promote_function_mode (type
, mode
, &unsignedp
, func
, 1);
6561 pure_scalable_type_info pst_info
;
6562 if (type
&& pst_info
.analyze_registers (type
))
6563 return pst_info
.get_rtx (mode
, V0_REGNUM
, P0_REGNUM
);
6565 /* Generic vectors that map to full SVE modes with -msve-vector-bits=N
6566 are returned in memory, not by value. */
6567 unsigned int vec_flags
= aarch64_classify_vector_mode (mode
);
6568 bool sve_p
= (vec_flags
& VEC_ANY_SVE
);
6570 if (aarch64_return_in_msb (type
))
6572 HOST_WIDE_INT size
= int_size_in_bytes (type
);
6574 if (size
% UNITS_PER_WORD
!= 0)
6576 size
+= UNITS_PER_WORD
- size
% UNITS_PER_WORD
;
6577 mode
= int_mode_for_size (size
* BITS_PER_UNIT
, 0).require ();
6582 machine_mode ag_mode
;
6583 if (aarch64_vfp_is_call_or_return_candidate (mode
, type
, &ag_mode
, &count
,
6586 gcc_assert (!sve_p
);
6587 if (!aarch64_composite_type_p (type
, mode
))
6589 gcc_assert (count
== 1 && mode
== ag_mode
);
6590 return gen_rtx_REG (mode
, V0_REGNUM
);
6592 else if (aarch64_advsimd_full_struct_mode_p (mode
)
6593 && known_eq (GET_MODE_SIZE (ag_mode
), 16))
6594 return gen_rtx_REG (mode
, V0_REGNUM
);
6595 else if (aarch64_advsimd_partial_struct_mode_p (mode
)
6596 && known_eq (GET_MODE_SIZE (ag_mode
), 8))
6597 return gen_rtx_REG (mode
, V0_REGNUM
);
6603 par
= gen_rtx_PARALLEL (mode
, rtvec_alloc (count
));
6604 for (i
= 0; i
< count
; i
++)
6606 rtx tmp
= gen_rtx_REG (ag_mode
, V0_REGNUM
+ i
);
6607 rtx offset
= gen_int_mode (i
* GET_MODE_SIZE (ag_mode
), Pmode
);
6608 tmp
= gen_rtx_EXPR_LIST (VOIDmode
, tmp
, offset
);
6609 XVECEXP (par
, 0, i
) = tmp
;
6618 /* Vector types can acquire a partial SVE mode using things like
6619 __attribute__((vector_size(N))), and this is potentially useful.
6620 However, the choice of mode doesn't affect the type's ABI
6621 identity, so we should treat the types as though they had
6622 the associated integer mode, just like they did before SVE
6625 We know that the vector must be 128 bits or smaller,
6626 otherwise we'd have returned it in memory instead. */
6628 && (aarch64_some_values_include_pst_objects_p (type
)
6629 || (vec_flags
& VEC_PARTIAL
)));
6631 scalar_int_mode int_mode
= int_mode_for_mode (mode
).require ();
6632 rtx reg
= gen_rtx_REG (int_mode
, R0_REGNUM
);
6633 rtx pair
= gen_rtx_EXPR_LIST (VOIDmode
, reg
, const0_rtx
);
6634 return gen_rtx_PARALLEL (mode
, gen_rtvec (1, pair
));
6636 return gen_rtx_REG (mode
, R0_REGNUM
);
/* Implements TARGET_FUNCTION_VALUE_REGNO_P.
   Return true if REGNO is the number of a hard register in which the values
   of called function may come back.  */

static bool
aarch64_function_value_regno_p (const unsigned int regno)
{
  /* Maximum of 16 bytes can be returned in the general registers.  Examples
     of 16-byte return values are: 128-bit integers and 16-byte small
     structures (excluding homogeneous floating-point aggregates).  */
  if (regno == R0_REGNUM || regno == R1_REGNUM)
    return true;

  /* Up to four fp/simd registers can return a function value, e.g. a
     homogeneous floating-point aggregate having four members.  */
  if (regno >= V0_REGNUM && regno < V0_REGNUM + HA_MAX_NUM_FLDS)
    return TARGET_FLOAT;

  if (regno >= P0_REGNUM && regno < P0_REGNUM + HA_MAX_NUM_FLDS)
    return TARGET_SVE;

  return false;
}
6664 /* Subroutine for aarch64_return_in_memory for types that are not returned
6665 in SVE registers. */
6668 aarch64_return_in_memory_1 (const_tree type
)
6671 machine_mode ag_mode
;
6674 if (!AGGREGATE_TYPE_P (type
)
6675 && TREE_CODE (type
) != BITINT_TYPE
6676 && TREE_CODE (type
) != COMPLEX_TYPE
6677 && TREE_CODE (type
) != VECTOR_TYPE
)
6678 /* Simple scalar types always returned in registers. */
6681 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type
), type
,
6682 &ag_mode
, &count
, NULL
, false))
6685 /* Types larger than 2 registers returned in memory. */
6686 size
= int_size_in_bytes (type
);
6687 return (size
< 0 || size
> 2 * UNITS_PER_WORD
);
6690 /* Implement TARGET_RETURN_IN_MEMORY.
6692 If the type T of the result of a function is such that
6694 would require that arg be passed as a value in a register (or set of
6695 registers) according to the parameter passing rules, then the result
6696 is returned in the same registers as would be used for such an
6700 aarch64_return_in_memory (const_tree type
, const_tree fndecl ATTRIBUTE_UNUSED
)
6702 pure_scalable_type_info pst_info
;
6703 switch (pst_info
.analyze (type
))
6705 case pure_scalable_type_info::IS_PST
:
6706 return (pst_info
.num_zr () > NUM_FP_ARG_REGS
6707 || pst_info
.num_pr () > NUM_PR_ARG_REGS
);
6709 case pure_scalable_type_info::DOESNT_MATTER
:
6710 gcc_assert (aarch64_return_in_memory_1 (type
));
6713 case pure_scalable_type_info::NO_ABI_IDENTITY
:
6714 case pure_scalable_type_info::ISNT_PST
:
6715 return aarch64_return_in_memory_1 (type
);
static bool
aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v, machine_mode mode,
                               const_tree type, int *nregs)
{
  CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
  return aarch64_vfp_is_call_or_return_candidate (mode, type,
                                                  &pcum->aapcs_vfp_rmode,
                                                  nregs, NULL,
                                                  pcum->silent_p);
}
6730 /* Given MODE and TYPE of a function argument, return the alignment in
6731 bits. The idea is to suppress any stronger alignment requested by
6732 the user and opt for the natural alignment (specified in AAPCS64 \S
6733 4.1). ABI_BREAK_GCC_9 is set to the old alignment if the alignment
6734 was incorrectly calculated in versions of GCC prior to GCC 9.
6735 ABI_BREAK_GCC_13 is set to the old alignment if it was incorrectly
6736 calculated in versions between GCC 9 and GCC 13. If the alignment
6737 might have changed between GCC 13 and GCC 14, ABI_BREAK_GCC_14
6738 is the old GCC 13 alignment, otherwise it is zero.
6740 This is a helper function for local use only. */
6743 aarch64_function_arg_alignment (machine_mode mode
, const_tree type
,
6744 unsigned int *abi_break_gcc_9
,
6745 unsigned int *abi_break_gcc_13
,
6746 unsigned int *abi_break_gcc_14
)
6748 *abi_break_gcc_9
= 0;
6749 *abi_break_gcc_13
= 0;
6750 *abi_break_gcc_14
= 0;
6752 return GET_MODE_ALIGNMENT (mode
);
6754 if (integer_zerop (TYPE_SIZE (type
)))
6757 gcc_assert (TYPE_MODE (type
) == mode
);
6759 if (!AGGREGATE_TYPE_P (type
))
6761 /* The ABI alignment is the natural alignment of the type, without
6762 any attributes applied. Normally this is the alignment of the
6763 TYPE_MAIN_VARIANT, but not always; see PR108910 for a counterexample.
6764 For now we just handle the known exceptions explicitly. */
6765 type
= TYPE_MAIN_VARIANT (type
);
6766 if (POINTER_TYPE_P (type
))
6768 gcc_assert (known_eq (POINTER_SIZE
, GET_MODE_BITSIZE (mode
)));
6769 return POINTER_SIZE
;
6771 if (TREE_CODE (type
) == ENUMERAL_TYPE
&& TREE_TYPE (type
))
6773 *abi_break_gcc_14
= TYPE_ALIGN (type
);
6774 type
= TYPE_MAIN_VARIANT (TREE_TYPE (type
));
6776 gcc_assert (!TYPE_USER_ALIGN (type
));
6777 return TYPE_ALIGN (type
);
6780 if (TREE_CODE (type
) == ARRAY_TYPE
)
6781 return TYPE_ALIGN (TREE_TYPE (type
));
6783 unsigned int alignment
= 0;
6784 unsigned int bitfield_alignment_with_packed
= 0;
6785 unsigned int bitfield_alignment
= 0;
6786 for (tree field
= TYPE_FIELDS (type
); field
; field
= DECL_CHAIN (field
))
6787 if (TREE_CODE (field
) == FIELD_DECL
)
6789 /* Note that we explicitly consider zero-sized fields here,
6790 even though they don't map to AAPCS64 machine types.
6793 struct __attribute__((aligned(8))) empty {};
6796 [[no_unique_address]] empty e;
6800 "s" contains only one Fundamental Data Type (the int field)
6801 but gains 8-byte alignment and size thanks to "e". */
6802 alignment
= std::max (alignment
, DECL_ALIGN (field
));
6803 if (DECL_BIT_FIELD_TYPE (field
))
6805 /* Take the bit-field type's alignment into account only
6806 if the user didn't reduce this field's alignment with
6807 the packed attribute. */
6808 if (!DECL_PACKED (field
))
6810 = std::max (bitfield_alignment
,
6811 TYPE_ALIGN (DECL_BIT_FIELD_TYPE (field
)));
6813 /* Compute the alignment even if the bit-field is
6814 packed, so that we can emit a warning in case the
6815 alignment changed between GCC versions. */
6816 bitfield_alignment_with_packed
6817 = std::max (bitfield_alignment_with_packed
,
6818 TYPE_ALIGN (DECL_BIT_FIELD_TYPE (field
)));
6822 /* Emit a warning if the alignment is different when taking the
6823 'packed' attribute into account. */
6824 if (bitfield_alignment
!= bitfield_alignment_with_packed
6825 && bitfield_alignment_with_packed
> alignment
)
6826 *abi_break_gcc_13
= bitfield_alignment_with_packed
;
6828 if (bitfield_alignment
> alignment
)
6830 *abi_break_gcc_9
= alignment
;
6831 return bitfield_alignment
;
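/* Illustrative sketch, not part of the GCC sources: the three outputs of
   aarch64_function_arg_alignment typically differ only for bit-field
   members (and, for ABI_BREAK_GCC_14, enum types).  Taking

     struct s { unsigned __int128 x : 1; char c; };

   as an example, compilers before GCC 9.1 ignored the 16-byte underlying
   type of X and computed a smaller alignment (recorded via
   ABI_BREAK_GCC_9), while GCC 9.1 and later report 128 bits.  With
   __attribute__((packed)) applied to X, GCC 9 to 12 still used the
   underlying type whereas GCC 13.1 honours the packed attribute; the
   older, larger value is recorded via ABI_BREAK_GCC_13 so that callers
   can emit the -Wpsabi notes about the GCC 13.1 change.  */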
/* Return true if TYPE describes a _BitInt(N) or an aggregate that uses the
   _BitInt(N) type.  These include ARRAY_TYPE's with an element that is a
   _BitInt(N) or an aggregate that uses it, and a RECORD_TYPE or a UNION_TYPE
   with a field member that is a _BitInt(N) or an aggregate that uses it.
   Return false otherwise.  */

static bool
bitint_or_aggr_of_bitint_p (tree type)
{
  if (!type)
    return false;

  if (TREE_CODE (type) == BITINT_TYPE)
    return true;

  /* If ARRAY_TYPE, check its element type.  */
  if (TREE_CODE (type) == ARRAY_TYPE)
    return bitint_or_aggr_of_bitint_p (TREE_TYPE (type));

  /* If RECORD_TYPE or UNION_TYPE, check the fields' types.  */
  if (RECORD_OR_UNION_TYPE_P (type))
    for (tree field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
      {
        if (TREE_CODE (field) != FIELD_DECL)
          continue;
        if (bitint_or_aggr_of_bitint_p (TREE_TYPE (field)))
          return true;
      }
  return false;
}
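/* Illustrative sketch, not part of the GCC sources: the recursion above
   matches both direct and nested uses of _BitInt, e.g.

     struct s { _BitInt(129) x; };
     struct t { struct s a[2]; };   // also matches, via the array element

   Since _BitInt was only added in GCC 14, argument layout uses this
   predicate to suppress the GCC 9.1/13.1 -Wpsabi notes for types that
   could not have existed with earlier compilers.  */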
6868 /* Layout a function argument according to the AAPCS64 rules. The rule
6869 numbers refer to the rule numbers in the AAPCS64. ORIG_MODE is the
6870 mode that was originally given to us by the target hook, whereas the
6871 mode in ARG might be the result of replacing partial SVE modes with
6872 the equivalent integer mode. */
6875 aarch64_layout_arg (cumulative_args_t pcum_v
, const function_arg_info
&arg
)
6877 CUMULATIVE_ARGS
*pcum
= get_cumulative_args (pcum_v
);
6878 tree type
= arg
.type
;
6879 machine_mode mode
= arg
.mode
;
6880 int ncrn
, nvrn
, nregs
;
6881 bool allocate_ncrn
, allocate_nvrn
;
6883 unsigned int abi_break_gcc_9
;
6884 unsigned int abi_break_gcc_13
;
6885 unsigned int abi_break_gcc_14
;
6887 /* We need to do this once per argument. */
6888 if (pcum
->aapcs_arg_processed
)
6891 bool warn_pcs_change
6894 && (currently_expanding_function_start
6895 || currently_expanding_gimple_stmt
));
6897 /* HFAs and HVAs can have an alignment greater than 16 bytes. For example:
6899 typedef struct foo {
6900 __Int8x16_t foo[2] __attribute__((aligned(32)));
6903 is still a HVA despite its larger-than-normal alignment.
6904 However, such over-aligned HFAs and HVAs are guaranteed to have
6907 If we exclude HFAs and HVAs from the discussion below, then there
6908 are several things to note:
6910 - Both the C and AAPCS64 interpretations of a type's alignment should
6911 give a value that is no greater than the type's size.
6913 - Types bigger than 16 bytes are passed indirectly.
6915 - If an argument of type T is passed indirectly, TYPE and MODE describe
6916 a pointer to T rather than T iself.
6918 It follows that the AAPCS64 alignment of TYPE must be no greater
6921 Versions prior to GCC 9.1 ignored a bitfield's underlying type
6922 and so could calculate an alignment that was too small. If this
6923 happened for TYPE then ABI_BREAK_GCC_9 is this older, too-small alignment.
6925 Although GCC 9.1 fixed that bug, it introduced a different one:
6926 it would consider the alignment of a bitfield's underlying type even
6927 if the field was packed (which should have the effect of overriding
6928 the alignment of the underlying type). This was fixed in GCC 13.1.
6930 As a result of this bug, GCC 9 to GCC 12 could calculate an alignment
6931 that was too big. If this happened for TYPE, ABI_BREAK_GCC_13 is
6932 this older, too-big alignment.
6934 Also, the fact that GCC 9 to GCC 12 considered irrelevant
6935 alignments meant they could calculate type alignments that were
6936 bigger than the type's size, contrary to the assumption above.
6937 The handling of register arguments was nevertheless (and justifiably)
6938 written to follow the assumption that the alignment can never be
6939 greater than the size. The same was not true for stack arguments;
6940 their alignment was instead handled by MIN bounds in
6941 aarch64_function_arg_boundary.
6943 The net effect is that, if GCC 9 to GCC 12 incorrectly calculated
6944 an alignment of more than 16 bytes for TYPE then:
6946 - If the argument was passed in registers, these GCC versions
6947 would treat the alignment as though it was *less than* 16 bytes.
6949 - If the argument was passed on the stack, these GCC versions
6950 would treat the alignment as though it was *equal to* 16 bytes.
6952 Both behaviors were wrong, but in different cases. */
6954 pcum
->aapcs_arg_processed
= true;
6956 pure_scalable_type_info pst_info
;
6957 if (type
&& pst_info
.analyze_registers (type
))
6959 /* aarch64_function_arg_alignment has never had an effect on
6962 /* The PCS says that it is invalid to pass an SVE value to an
6963 unprototyped function. There is no ABI-defined location we
6964 can return in this case, so we have no real choice but to raise
6965 an error immediately, even though this is only a query function. */
6966 if (arg
.named
&& pcum
->pcs_variant
!= ARM_PCS_SVE
)
6968 gcc_assert (!pcum
->silent_p
);
6969 error ("SVE type %qT cannot be passed to an unprototyped function",
6971 /* Avoid repeating the message, and avoid tripping the assert
6973 pcum
->pcs_variant
= ARM_PCS_SVE
;
6976 /* We would have converted the argument into pass-by-reference
6977 form if it didn't fit in registers. */
6978 pcum
->aapcs_nextnvrn
= pcum
->aapcs_nvrn
+ pst_info
.num_zr ();
6979 pcum
->aapcs_nextnprn
= pcum
->aapcs_nprn
+ pst_info
.num_pr ();
6980 gcc_assert (arg
.named
6981 && pcum
->pcs_variant
== ARM_PCS_SVE
6982 && pcum
->aapcs_nextnvrn
<= NUM_FP_ARG_REGS
6983 && pcum
->aapcs_nextnprn
<= NUM_PR_ARG_REGS
);
6984 pcum
->aapcs_reg
= pst_info
.get_rtx (mode
, V0_REGNUM
+ pcum
->aapcs_nvrn
,
6985 P0_REGNUM
+ pcum
->aapcs_nprn
);
6989 /* Generic vectors that map to full SVE modes with -msve-vector-bits=N
6990 are passed by reference, not by value. */
6991 unsigned int vec_flags
= aarch64_classify_vector_mode (mode
);
6992 bool sve_p
= (vec_flags
& VEC_ANY_SVE
);
6994 /* Vector types can acquire a partial SVE mode using things like
6995 __attribute__((vector_size(N))), and this is potentially useful.
6996 However, the choice of mode doesn't affect the type's ABI
6997 identity, so we should treat the types as though they had
6998 the associated integer mode, just like they did before SVE
7001 We know that the vector must be 128 bits or smaller,
7002 otherwise we'd have passed it in memory instead. */
7004 && (aarch64_some_values_include_pst_objects_p (type
)
7005 || (vec_flags
& VEC_PARTIAL
)));
7007 /* Size in bytes, rounded to the nearest multiple of 8 bytes. */
7009 size
= int_size_in_bytes (type
);
7011 /* No frontends can create types with variable-sized modes, so we
7012 shouldn't be asked to pass or return them. */
7013 size
= GET_MODE_SIZE (mode
).to_constant ();
7014 size
= ROUND_UP (size
, UNITS_PER_WORD
);
7016 allocate_ncrn
= (type
) ? !(FLOAT_TYPE_P (type
)) : !FLOAT_MODE_P (mode
);
7017 allocate_nvrn
= aarch64_vfp_is_call_candidate (pcum_v
,
7021 gcc_assert (!sve_p
|| !allocate_nvrn
);
7023 unsigned int alignment
7024 = aarch64_function_arg_alignment (mode
, type
, &abi_break_gcc_9
,
7025 &abi_break_gcc_13
, &abi_break_gcc_14
);
7027 gcc_assert ((allocate_nvrn
|| alignment
<= 16 * BITS_PER_UNIT
)
7028 && (!alignment
|| abi_break_gcc_9
< alignment
)
7029 && (!abi_break_gcc_13
|| alignment
< abi_break_gcc_13
));
7031 /* _BitInt(N) was only added in GCC 14. */
7032 bool warn_pcs_change_le_gcc14
7033 = warn_pcs_change
&& !bitint_or_aggr_of_bitint_p (type
);
7035 /* allocate_ncrn may be false-positive, but allocate_nvrn is quite reliable.
7036 The following code thus handles passing by SIMD/FP registers first. */
7038 nvrn
= pcum
->aapcs_nvrn
;
7040 /* C1 - C5 for floating point, homogenous floating point aggregates (HFA)
7041 and homogenous short-vector aggregates (HVA). */
7044 /* aarch64_function_arg_alignment has never had an effect on
7046 if (!pcum
->silent_p
&& !TARGET_FLOAT
)
7047 aarch64_err_no_fpadvsimd (mode
);
7049 if (nvrn
+ nregs
<= NUM_FP_ARG_REGS
)
7051 pcum
->aapcs_nextnvrn
= nvrn
+ nregs
;
7052 if (!aarch64_composite_type_p (type
, mode
))
7054 gcc_assert (nregs
== 1);
7055 pcum
->aapcs_reg
= gen_rtx_REG (mode
, V0_REGNUM
+ nvrn
);
7057 else if (aarch64_advsimd_full_struct_mode_p (mode
)
7058 && known_eq (GET_MODE_SIZE (pcum
->aapcs_vfp_rmode
), 16))
7059 pcum
->aapcs_reg
= gen_rtx_REG (mode
, V0_REGNUM
+ nvrn
);
7060 else if (aarch64_advsimd_partial_struct_mode_p (mode
)
7061 && known_eq (GET_MODE_SIZE (pcum
->aapcs_vfp_rmode
), 8))
7062 pcum
->aapcs_reg
= gen_rtx_REG (mode
, V0_REGNUM
+ nvrn
);
7067 par
= gen_rtx_PARALLEL (mode
, rtvec_alloc (nregs
));
7068 for (i
= 0; i
< nregs
; i
++)
7070 rtx tmp
= gen_rtx_REG (pcum
->aapcs_vfp_rmode
,
7071 V0_REGNUM
+ nvrn
+ i
);
7072 rtx offset
= gen_int_mode
7073 (i
* GET_MODE_SIZE (pcum
->aapcs_vfp_rmode
), Pmode
);
7074 tmp
= gen_rtx_EXPR_LIST (VOIDmode
, tmp
, offset
);
7075 XVECEXP (par
, 0, i
) = tmp
;
7077 pcum
->aapcs_reg
= par
;
7083 /* C.3 NSRN is set to 8. */
7084 pcum
->aapcs_nextnvrn
= NUM_FP_ARG_REGS
;
7089 ncrn
= pcum
->aapcs_ncrn
;
7090 nregs
= size
/ UNITS_PER_WORD
;
7092 /* C6 - C9. though the sign and zero extension semantics are
7093 handled elsewhere. This is the case where the argument fits
7094 entirely general registers. */
7095 if (allocate_ncrn
&& (ncrn
+ nregs
<= NUM_ARG_REGS
))
7097 gcc_assert (nregs
== 0 || nregs
== 1 || nregs
== 2);
7099 /* C.8 if the argument has an alignment of 16 then the NGRN is
7100 rounded up to the next even number. */
7104 /* Emit a warning if the alignment changed when taking the
7105 'packed' attribute into account. */
7106 if (warn_pcs_change_le_gcc14
7108 && ((abi_break_gcc_13
== 16 * BITS_PER_UNIT
)
7109 != (alignment
== 16 * BITS_PER_UNIT
)))
7110 inform (input_location
, "parameter passing for argument of type "
7111 "%qT changed in GCC 13.1", type
);
7113 if (warn_pcs_change_le_gcc14
7115 && ((abi_break_gcc_14
== 16 * BITS_PER_UNIT
)
7116 != (alignment
== 16 * BITS_PER_UNIT
)))
7117 inform (input_location
, "parameter passing for argument of type "
7118 "%qT changed in GCC 14.1", type
);
7120 /* The == 16 * BITS_PER_UNIT instead of >= 16 * BITS_PER_UNIT
7121 comparison is there because for > 16 * BITS_PER_UNIT
7122 alignment nregs should be > 2 and therefore it should be
7123 passed by reference rather than value. */
7124 if (alignment
== 16 * BITS_PER_UNIT
)
7126 if (warn_pcs_change_le_gcc14
7128 inform (input_location
, "parameter passing for argument of type "
7129 "%qT changed in GCC 9.1", type
);
7131 gcc_assert (ncrn
+ nregs
<= NUM_ARG_REGS
);
7135 /* If an argument with an SVE mode needs to be shifted up to the
7136 high part of the register, treat it as though it had an integer mode.
7137 Using the normal (parallel [...]) would suppress the shifting. */
7140 && maybe_ne (GET_MODE_SIZE (mode
), nregs
* UNITS_PER_WORD
)
7141 && aarch64_pad_reg_upward (mode
, type
, false))
7143 mode
= int_mode_for_mode (mode
).require ();
7147 /* NREGS can be 0 when e.g. an empty structure is to be passed.
7148 A reg is still generated for it, but the caller should be smart
7149 enough not to use it. */
7151 || (nregs
== 1 && !sve_p
)
7152 || GET_MODE_CLASS (mode
) == MODE_INT
)
7153 pcum
->aapcs_reg
= gen_rtx_REG (mode
, R0_REGNUM
+ ncrn
);
7159 par
= gen_rtx_PARALLEL (mode
, rtvec_alloc (nregs
));
7160 for (i
= 0; i
< nregs
; i
++)
7162 scalar_int_mode reg_mode
= word_mode
;
7164 reg_mode
= int_mode_for_mode (mode
).require ();
7165 rtx tmp
= gen_rtx_REG (reg_mode
, R0_REGNUM
+ ncrn
+ i
);
7166 tmp
= gen_rtx_EXPR_LIST (VOIDmode
, tmp
,
7167 GEN_INT (i
* UNITS_PER_WORD
));
7168 XVECEXP (par
, 0, i
) = tmp
;
7170 pcum
->aapcs_reg
= par
;
7173 pcum
->aapcs_nextncrn
= ncrn
+ nregs
;
7178 pcum
->aapcs_nextncrn
= NUM_ARG_REGS
;
7180 /* The argument is passed on stack; record the needed number of words for
7181 this argument and align the total size if necessary. */
7183 pcum
->aapcs_stack_words
= size
/ UNITS_PER_WORD
;
7185 if (warn_pcs_change_le_gcc14
7187 && ((abi_break_gcc_13
>= 16 * BITS_PER_UNIT
)
7188 != (alignment
>= 16 * BITS_PER_UNIT
)))
7189 inform (input_location
, "parameter passing for argument of type "
7190 "%qT changed in GCC 13.1", type
);
7192 if (warn_pcs_change_le_gcc14
7194 && ((abi_break_gcc_14
>= 16 * BITS_PER_UNIT
)
7195 != (alignment
>= 16 * BITS_PER_UNIT
)))
7196 inform (input_location
, "parameter passing for argument of type "
7197 "%qT changed in GCC 14.1", type
);
7199 if (alignment
== 16 * BITS_PER_UNIT
)
7201 int new_size
= ROUND_UP (pcum
->aapcs_stack_size
, 16 / UNITS_PER_WORD
);
7202 if (pcum
->aapcs_stack_size
!= new_size
)
7204 if (warn_pcs_change_le_gcc14
7206 inform (input_location
, "parameter passing for argument of type "
7207 "%qT changed in GCC 9.1", type
);
7208 pcum
->aapcs_stack_size
= new_size
;
7214 /* Add the current argument register to the set of those that need
7215 to be saved and restored around a change to PSTATE.SM. */
7218 aarch64_record_sme_mode_switch_args (CUMULATIVE_ARGS
*pcum
)
7220 subrtx_var_iterator::array_type array
;
7221 FOR_EACH_SUBRTX_VAR (iter
, array
, pcum
->aapcs_reg
, NONCONST
)
7224 if (REG_P (x
) && (FP_REGNUM_P (REGNO (x
)) || PR_REGNUM_P (REGNO (x
))))
7226 unsigned int i
= pcum
->num_sme_mode_switch_args
++;
7227 gcc_assert (i
< ARRAY_SIZE (pcum
->sme_mode_switch_args
));
7228 pcum
->sme_mode_switch_args
[i
] = x
;
7233 /* Return a parallel that contains all the registers that need to be
7234 saved around a change to PSTATE.SM. Return const0_rtx if there is
7235 no such mode switch, or if no registers need to be saved. */
7238 aarch64_finish_sme_mode_switch_args (CUMULATIVE_ARGS
*pcum
)
7240 if (!pcum
->num_sme_mode_switch_args
)
7243 auto argvec
= gen_rtvec_v (pcum
->num_sme_mode_switch_args
,
7244 pcum
->sme_mode_switch_args
);
7245 return gen_rtx_PARALLEL (VOIDmode
, argvec
);
7248 /* Implement TARGET_FUNCTION_ARG. */
7251 aarch64_function_arg (cumulative_args_t pcum_v
, const function_arg_info
&arg
)
7253 CUMULATIVE_ARGS
*pcum
= get_cumulative_args (pcum_v
);
7254 gcc_assert (pcum
->pcs_variant
== ARM_PCS_AAPCS64
7255 || pcum
->pcs_variant
== ARM_PCS_SIMD
7256 || pcum
->pcs_variant
== ARM_PCS_SVE
);
7258 if (arg
.end_marker_p ())
7260 rtx abi_cookie
= aarch64_gen_callee_cookie (pcum
->isa_mode
,
7262 rtx sme_mode_switch_args
= aarch64_finish_sme_mode_switch_args (pcum
);
7263 rtx shared_za_flags
= gen_int_mode (pcum
->shared_za_flags
, SImode
);
7264 rtx shared_zt0_flags
= gen_int_mode (pcum
->shared_zt0_flags
, SImode
);
7265 return gen_rtx_PARALLEL (VOIDmode
, gen_rtvec (4, abi_cookie
,
7266 sme_mode_switch_args
,
7271 aarch64_layout_arg (pcum_v
, arg
);
7272 return pcum
->aapcs_reg
;
7276 aarch64_init_cumulative_args (CUMULATIVE_ARGS
*pcum
,
7278 rtx libname ATTRIBUTE_UNUSED
,
7280 unsigned n_named ATTRIBUTE_UNUSED
,
7283 pcum
->aapcs_ncrn
= 0;
7284 pcum
->aapcs_nvrn
= 0;
7285 pcum
->aapcs_nprn
= 0;
7286 pcum
->aapcs_nextncrn
= 0;
7287 pcum
->aapcs_nextnvrn
= 0;
7288 pcum
->aapcs_nextnprn
= 0;
7291 pcum
->pcs_variant
= (arm_pcs
) fntype_abi (fntype
).id ();
7292 pcum
->isa_mode
= aarch64_fntype_isa_mode (fntype
);
7296 pcum
->pcs_variant
= ARM_PCS_AAPCS64
;
7297 pcum
->isa_mode
= AARCH64_DEFAULT_ISA_MODE
;
7299 pcum
->aapcs_reg
= NULL_RTX
;
7300 pcum
->aapcs_arg_processed
= false;
7301 pcum
->aapcs_stack_words
= 0;
7302 pcum
->aapcs_stack_size
= 0;
7303 pcum
->silent_p
= silent_p
;
7304 pcum
->shared_za_flags
7305 = (fntype
? aarch64_fntype_shared_flags (fntype
, "za") : 0U);
7306 pcum
->shared_zt0_flags
7307 = (fntype
? aarch64_fntype_shared_flags (fntype
, "zt0") : 0U);
7308 pcum
->num_sme_mode_switch_args
= 0;
7312 && fntype
&& fntype
!= error_mark_node
)
7314 const_tree type
= TREE_TYPE (fntype
);
7315 machine_mode mode ATTRIBUTE_UNUSED
; /* To pass pointer as argument. */
7316 int nregs ATTRIBUTE_UNUSED
; /* Likewise. */
7317 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type
), type
,
7318 &mode
, &nregs
, NULL
, false))
7319 aarch64_err_no_fpadvsimd (TYPE_MODE (type
));
7324 && pcum
->pcs_variant
== ARM_PCS_SVE
)
7326 /* We can't gracefully recover at this point, so make this a
7329 fatal_error (input_location
, "%qE requires the SVE ISA extension",
7332 fatal_error (input_location
, "calls to functions of type %qT require"
7333 " the SVE ISA extension", fntype
);
7338 aarch64_function_arg_advance (cumulative_args_t pcum_v
,
7339 const function_arg_info
&arg
)
7341 CUMULATIVE_ARGS
*pcum
= get_cumulative_args (pcum_v
);
7342 if (pcum
->pcs_variant
== ARM_PCS_AAPCS64
7343 || pcum
->pcs_variant
== ARM_PCS_SIMD
7344 || pcum
->pcs_variant
== ARM_PCS_SVE
)
7346 aarch64_layout_arg (pcum_v
, arg
);
7347 gcc_assert ((pcum
->aapcs_reg
!= NULL_RTX
)
7348 != (pcum
->aapcs_stack_words
!= 0));
7350 && aarch64_call_switches_pstate_sm (pcum
->isa_mode
))
7351 aarch64_record_sme_mode_switch_args (pcum
);
7353 pcum
->aapcs_arg_processed
= false;
7354 pcum
->aapcs_ncrn
= pcum
->aapcs_nextncrn
;
7355 pcum
->aapcs_nvrn
= pcum
->aapcs_nextnvrn
;
7356 pcum
->aapcs_nprn
= pcum
->aapcs_nextnprn
;
7357 pcum
->aapcs_stack_size
+= pcum
->aapcs_stack_words
;
7358 pcum
->aapcs_stack_words
= 0;
7359 pcum
->aapcs_reg
= NULL_RTX
;
bool
aarch64_function_arg_regno_p (unsigned regno)
{
  return ((GP_REGNUM_P (regno) && regno < R0_REGNUM + NUM_ARG_REGS)
          || (FP_REGNUM_P (regno) && regno < V0_REGNUM + NUM_FP_ARG_REGS)
          || (PR_REGNUM_P (regno) && regno < P0_REGNUM + NUM_PR_ARG_REGS));
}
/* Implement FUNCTION_ARG_BOUNDARY.  Every parameter gets at least
   PARM_BOUNDARY bits of alignment, but will be given anything up
   to STACK_BOUNDARY bits if the type requires it.  This makes sure
   that both before and after the layout of each argument, the Next
   Stacked Argument Address (NSAA) will have a minimum alignment of
   8 bytes.  */

static unsigned int
aarch64_function_arg_boundary (machine_mode mode, const_tree type)
{
  unsigned int abi_break_gcc_9;
  unsigned int abi_break_gcc_13;
  unsigned int abi_break_gcc_14;
  unsigned int alignment = aarch64_function_arg_alignment (mode, type,
                                                           &abi_break_gcc_9,
                                                           &abi_break_gcc_13,
                                                           &abi_break_gcc_14);
  /* We rely on aarch64_layout_arg and aarch64_gimplify_va_arg_expr
     to emit warnings about ABI incompatibility.  */
  alignment = MIN (MAX (alignment, PARM_BOUNDARY), STACK_BOUNDARY);
  return alignment;
}
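/* Illustrative sketch, not part of the GCC sources: with the AArch64
   defaults PARM_BOUNDARY == 64 and STACK_BOUNDARY == 128, the clamp above
   maps the raw alignment into [64, 128] bits:

     raw alignment (bits)   8   16   32   64   128   256
     resulting boundary     64  64   64   64   128   128

   so an over-aligned type never forces more than a 16-byte stack slot and
   an under-aligned one never gets less than an 8-byte slot.  */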
/* Implement TARGET_GET_RAW_RESULT_MODE and TARGET_GET_RAW_ARG_MODE.  */

static fixed_size_mode
aarch64_get_reg_raw_mode (int regno)
{
  /* Don't use any non GP registers for __builtin_apply and
     __builtin_return if general registers only mode is requested.  */
  if (TARGET_GENERAL_REGS_ONLY && !GP_REGNUM_P (regno))
    return as_a <fixed_size_mode> (VOIDmode);
  if (TARGET_SVE && FP_REGNUM_P (regno))
    /* Don't use the SVE part of the register for __builtin_apply and
       __builtin_return.  The SVE registers aren't used by the normal PCS,
       so using them there would be a waste of time.  The PCS extensions
       for SVE types are fundamentally incompatible with the
       __builtin_return/__builtin_apply interface.  */
    return as_a <fixed_size_mode> (V16QImode);
  if (PR_REGNUM_P (regno))
    /* For SVE PR regs, indicate that they should be ignored for
       __builtin_apply/__builtin_return.  */
    return as_a <fixed_size_mode> (VOIDmode);
  return default_get_reg_raw_mode (regno);
}
/* Implement TARGET_FUNCTION_ARG_PADDING.

   Small aggregate types are placed in the lowest memory address.

   The related parameter passing rules are B.4, C.3, C.5 and C.14.  */

static pad_direction
aarch64_function_arg_padding (machine_mode mode, const_tree type)
{
  /* On little-endian targets, the least significant byte of every stack
     argument is passed at the lowest byte address of the stack slot.  */
  if (!BYTES_BIG_ENDIAN)
    return PAD_UPWARD;

  /* Otherwise, integral, floating-point and pointer types are padded downward:
     the least significant byte of a stack argument is passed at the highest
     byte address of the stack slot.  */
  if (type
      ? (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type)
         || POINTER_TYPE_P (type))
      : (SCALAR_INT_MODE_P (mode) || SCALAR_FLOAT_MODE_P (mode)))
    return PAD_DOWNWARD;

  /* Everything else padded upward, i.e. data in first byte of stack slot.  */
  return PAD_UPWARD;
}
/* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).

   It specifies padding for the last (may also be the only)
   element of a block move between registers and memory.  If
   assuming the block is in the memory, padding upward means that
   the last element is padded after its highest significant byte,
   while in downward padding, the last element is padded at its
   least significant byte side.

   Small aggregates and small complex types are always padded
   upwards.

   We don't need to worry about homogeneous floating-point or
   short-vector aggregates; their move is not affected by the
   padding direction determined here.  Regardless of endianness,
   each element of such an aggregate is put in the least
   significant bits of a fp/simd register.

   Return !BYTES_BIG_ENDIAN if the least significant byte of the
   register has useful data, and return the opposite if the most
   significant byte does.  */

bool
aarch64_pad_reg_upward (machine_mode mode, const_tree type,
                        bool first ATTRIBUTE_UNUSED)
{
  /* Aside from pure scalable types, small composite types are always
     padded upwards.  */
  if (BYTES_BIG_ENDIAN && aarch64_composite_type_p (type, mode))
    {
      HOST_WIDE_INT size;
      if (type)
        size = int_size_in_bytes (type);
      else
        /* No frontends can create types with variable-sized modes, so we
           shouldn't be asked to pass or return them.  */
        size = GET_MODE_SIZE (mode).to_constant ();
      if (size < 2 * UNITS_PER_WORD)
        {
          pure_scalable_type_info pst_info;
          if (pst_info.analyze_registers (type))
            return false;
          return true;
        }
    }

  /* Otherwise, use the default padding.  */
  return !BYTES_BIG_ENDIAN;
}
static scalar_int_mode
aarch64_libgcc_cmp_return_mode (void)
{
  return SImode;
}

#define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)

/* We use the 12-bit shifted immediate arithmetic instructions so values
   must be multiple of (1 << 12), i.e. 4096.  */
#define ARITH_FACTOR 4096

#if (PROBE_INTERVAL % ARITH_FACTOR) != 0
#error Cannot use simple address calculation for stack probing
#endif
7511 /* Emit code to probe a range of stack addresses from FIRST to FIRST+POLY_SIZE,
7512 inclusive. These are offsets from the current stack pointer. */
7515 aarch64_emit_probe_stack_range (HOST_WIDE_INT first
, poly_int64 poly_size
)
7518 if (!poly_size
.is_constant (&size
))
7520 sorry ("stack probes for SVE frames");
7524 rtx reg1
= gen_rtx_REG (Pmode
, PROBE_STACK_FIRST_REGNUM
);
7526 /* See the same assertion on PROBE_INTERVAL above. */
7527 gcc_assert ((first
% ARITH_FACTOR
) == 0);
7529 /* See if we have a constant small number of probes to generate. If so,
7530 that's the easy case. */
7531 if (size
<= PROBE_INTERVAL
)
7533 const HOST_WIDE_INT base
= ROUND_UP (size
, ARITH_FACTOR
);
7535 emit_set_insn (reg1
,
7536 plus_constant (Pmode
,
7537 stack_pointer_rtx
, -(first
+ base
)));
7538 emit_stack_probe (plus_constant (Pmode
, reg1
, base
- size
));
7541 /* The run-time loop is made up of 8 insns in the generic case while the
7542 compile-time loop is made up of 4+2*(n-2) insns for n # of intervals. */
7543 else if (size
<= 4 * PROBE_INTERVAL
)
7545 HOST_WIDE_INT i
, rem
;
7547 emit_set_insn (reg1
,
7548 plus_constant (Pmode
,
7550 -(first
+ PROBE_INTERVAL
)));
7551 emit_stack_probe (reg1
);
7553 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 2 until
7554 it exceeds SIZE. If only two probes are needed, this will not
7555 generate any code. Then probe at FIRST + SIZE. */
7556 for (i
= 2 * PROBE_INTERVAL
; i
< size
; i
+= PROBE_INTERVAL
)
7558 emit_set_insn (reg1
,
7559 plus_constant (Pmode
, reg1
, -PROBE_INTERVAL
));
7560 emit_stack_probe (reg1
);
7563 rem
= size
- (i
- PROBE_INTERVAL
);
7566 const HOST_WIDE_INT base
= ROUND_UP (rem
, ARITH_FACTOR
);
7568 emit_set_insn (reg1
, plus_constant (Pmode
, reg1
, -base
));
7569 emit_stack_probe (plus_constant (Pmode
, reg1
, base
- rem
));
7572 emit_stack_probe (plus_constant (Pmode
, reg1
, -rem
));
7575 /* Otherwise, do the same as above, but in a loop. Note that we must be
7576 extra careful with variables wrapping around because we might be at
7577 the very top (or the very bottom) of the address space and we have
7578 to be able to handle this case properly; in particular, we use an
7579 equality test for the loop condition. */
7582 rtx reg2
= gen_rtx_REG (Pmode
, PROBE_STACK_SECOND_REGNUM
);
7584 /* Step 1: round SIZE to the previous multiple of the interval. */
7586 HOST_WIDE_INT rounded_size
= size
& -PROBE_INTERVAL
;
7589 /* Step 2: compute initial and final value of the loop counter. */
7591 /* TEST_ADDR = SP + FIRST. */
7592 emit_set_insn (reg1
,
7593 plus_constant (Pmode
, stack_pointer_rtx
, -first
));
7595 /* LAST_ADDR = SP + FIRST + ROUNDED_SIZE. */
7596 HOST_WIDE_INT adjustment
= - (first
+ rounded_size
);
7597 if (! aarch64_uimm12_shift (adjustment
))
7599 aarch64_internal_mov_immediate (reg2
, GEN_INT (adjustment
),
7601 emit_set_insn (reg2
, gen_rtx_PLUS (Pmode
, stack_pointer_rtx
, reg2
));
7604 emit_set_insn (reg2
,
7605 plus_constant (Pmode
, stack_pointer_rtx
, adjustment
));
7611 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
7614 while (TEST_ADDR != LAST_ADDR)
7616 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
7617 until it is equal to ROUNDED_SIZE. */
7619 emit_insn (gen_probe_stack_range (reg1
, reg1
, reg2
));
7622 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
7623 that SIZE is equal to ROUNDED_SIZE. */
7625 if (size
!= rounded_size
)
7627 HOST_WIDE_INT rem
= size
- rounded_size
;
7631 const HOST_WIDE_INT base
= ROUND_UP (rem
, ARITH_FACTOR
);
7633 emit_set_insn (reg2
, plus_constant (Pmode
, reg2
, -base
));
7634 emit_stack_probe (plus_constant (Pmode
, reg2
, base
- rem
));
7637 emit_stack_probe (plus_constant (Pmode
, reg2
, -rem
));
7641 /* Make sure nothing is scheduled before we are done. */
7642 emit_insn (gen_blockage ());
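/* Illustrative sketch, not part of the GCC sources: with the default
   PROBE_INTERVAL of 4 KiB, FIRST == 0 and SIZE == 10000 take the
   "size <= 4 * PROBE_INTERVAL" branch above and, assuming the scratch
   register is x9, emit roughly:

       sub      x9, sp, #4096
       str      xzr, [x9]              // probe at sp - 4096
       sub      x9, x9, #4096
       str      xzr, [x9]              // probe at sp - 8192
       sub      x9, x9, #4096
       str      xzr, [x9, #2288]       // residual probe at sp - 10000

   Larger constant sizes fall through to the probe_stack_range loop whose
   body is printed by aarch64_output_probe_stack_range below.  */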
/* Probe a range of stack addresses from REG1 to REG2 inclusive.  These are
   absolute addresses.  */

const char *
aarch64_output_probe_stack_range (rtx reg1, rtx reg2)
{
  static int labelno = 0;
  char loop_lab[32];
  rtx xops[2];

  ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);

  /* Loop.  */
  ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);

  HOST_WIDE_INT stack_clash_probe_interval
    = 1 << param_stack_clash_protection_guard_size;

  /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL.  */
  xops[0] = reg1;
  HOST_WIDE_INT interval;
  if (flag_stack_clash_protection)
    interval = stack_clash_probe_interval;
  else
    interval = PROBE_INTERVAL;

  gcc_assert (aarch64_uimm12_shift (interval));
  xops[1] = GEN_INT (interval);

  output_asm_insn ("sub\t%0, %0, %1", xops);

  /* If doing stack clash protection then we probe up by the ABI specified
     amount.  We do this because we're dropping full pages at a time in the
     loop.  But if we're doing non-stack clash probing, probe at SP 0.  */
  if (flag_stack_clash_protection)
    xops[1] = GEN_INT (STACK_CLASH_CALLER_GUARD);
  else
    xops[1] = CONST0_RTX (GET_MODE (xops[1]));

  /* Probe at TEST_ADDR.  If we're inside the loop it is always safe to probe
     by this amount for each iteration.  */
  output_asm_insn ("str\txzr, [%0, %1]", xops);

  /* Test if TEST_ADDR == LAST_ADDR.  */
  xops[1] = reg2;
  output_asm_insn ("cmp\t%0, %1", xops);

  /* Branch.  */
  fputs ("\tb.ne\t", asm_out_file);
  assemble_name_raw (asm_out_file, loop_lab);
  fputc ('\n', asm_out_file);

  return "";
}
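/* Illustrative sketch, not part of the GCC sources: for the
   non-stack-clash case the loop printed above looks roughly like

     .LPSRL0:
       sub      x9, x9, #4096
       str      xzr, [x9, 0]
       cmp      x9, x10
       b.ne     .LPSRL0

   i.e. TEST_ADDR walks down one interval at a time, probing at offset 0,
   until it reaches LAST_ADDR; with stack clash protection the store
   instead probes STACK_CLASH_CALLER_GUARD bytes above the new TEST_ADDR.
   Register numbers and the label suffix are only for illustration.  */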
/* Emit the probe loop for doing stack clash probes and stack adjustments for
   SVE.  This emits probes from BASE to BASE - ADJUSTMENT based on a guard size
   of GUARD_SIZE.  When a probe is emitted it is done at most
   MIN_PROBE_THRESHOLD bytes from the current BASE at an interval of
   at most MIN_PROBE_THRESHOLD.  By the end of this function
   BASE = BASE - ADJUSTMENT.  */

const char *
aarch64_output_probe_sve_stack_clash (rtx base, rtx adjustment,
				      rtx min_probe_threshold, rtx guard_size)
{
  /* This function is not allowed to use any instruction generation function
     like gen_ and friends.  If you do you'll likely ICE during CFG validation,
     so instead emit the code you want using output_asm_insn.  */
  gcc_assert (flag_stack_clash_protection);
  gcc_assert (CONST_INT_P (min_probe_threshold) && CONST_INT_P (guard_size));
  gcc_assert (INTVAL (guard_size) > INTVAL (min_probe_threshold));

  /* The minimum required allocation before the residual requires probing.  */
  HOST_WIDE_INT residual_probe_guard = INTVAL (min_probe_threshold);

  /* Clamp the value down to the nearest value that can be used with a cmp.  */
  residual_probe_guard = aarch64_clamp_to_uimm12_shift (residual_probe_guard);
  rtx probe_offset_value_rtx = gen_int_mode (residual_probe_guard, Pmode);

  gcc_assert (INTVAL (min_probe_threshold) >= residual_probe_guard);
  gcc_assert (aarch64_uimm12_shift (residual_probe_guard));

  static int labelno = 0;
  char loop_start_lab[32];
  char loop_end_lab[32];
  rtx xops[2];

  ASM_GENERATE_INTERNAL_LABEL (loop_start_lab, "SVLPSPL", labelno);
  ASM_GENERATE_INTERNAL_LABEL (loop_end_lab, "SVLPEND", labelno++);

  /* Emit loop start label.  */
  ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_start_lab);

  /* ADJUSTMENT < RESIDUAL_PROBE_GUARD.  */
  xops[0] = adjustment;
  xops[1] = probe_offset_value_rtx;
  output_asm_insn ("cmp\t%0, %1", xops);

  /* Branch to end if not enough adjustment to probe.  */
  fputs ("\tb.lt\t", asm_out_file);
  assemble_name_raw (asm_out_file, loop_end_lab);
  fputc ('\n', asm_out_file);

  /* BASE = BASE - RESIDUAL_PROBE_GUARD.  */
  xops[0] = base;
  xops[1] = probe_offset_value_rtx;
  output_asm_insn ("sub\t%0, %0, %1", xops);

  /* Probe at BASE.  */
  xops[1] = const0_rtx;
  output_asm_insn ("str\txzr, [%0, %1]", xops);

  /* ADJUSTMENT = ADJUSTMENT - RESIDUAL_PROBE_GUARD.  */
  xops[0] = adjustment;
  xops[1] = probe_offset_value_rtx;
  output_asm_insn ("sub\t%0, %0, %1", xops);

  /* Branch to start if still more bytes to allocate.  */
  fputs ("\tb\t", asm_out_file);
  assemble_name_raw (asm_out_file, loop_start_lab);
  fputc ('\n', asm_out_file);

  /* No probe leave.  */
  ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_end_lab);

  /* BASE = BASE - ADJUSTMENT.  */
  xops[0] = base;
  xops[1] = adjustment;
  output_asm_insn ("sub\t%0, %0, %1", xops);

  return "";
}
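/* Roughly, the sequence emitted above behaves as follows (illustrative
   only; here x11 stands for BASE, x12 for ADJUSTMENT, and the residual
   probe guard is assumed to be 1KiB):

	.SVLPSPL0:
	cmp	x12, #1024
	b.lt	.SVLPEND0
	sub	x11, x11, #1024
	str	xzr, [x11, #0]
	sub	x12, x12, #1024
	b	.SVLPSPL0
	.SVLPEND0:
	sub	x11, x11, x12

   so BASE ends up at BASE - ADJUSTMENT and every full
   RESIDUAL_PROBE_GUARD-sized chunk of the adjustment is probed.  */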
/* Determine whether a frame chain needs to be generated.  */

static bool
aarch64_needs_frame_chain (void)
{
  if (frame_pointer_needed)
    return true;

  /* A leaf function cannot have calls or write LR.  */
  bool is_leaf = crtl->is_leaf && !df_regs_ever_live_p (LR_REGNUM);

  /* Don't use a frame chain in leaf functions if leaf frame pointers
     are disabled.  */
  if (flag_omit_leaf_frame_pointer && is_leaf)
    return false;

  return aarch64_use_frame_pointer;
}
/* Return true if the current function should save registers above
   the locals area, rather than below it.  */

static bool
aarch64_save_regs_above_locals_p ()
{
  /* When using stack smash protection, make sure that the canary slot
     comes between the locals and the saved registers.  Otherwise,
     it would be possible for a carefully sized smash attack to change
     the saved registers (particularly LR and FP) without reaching the
     canary.  */
  return crtl->stack_protect_guard;
}
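/* Illustrative sketch of the effect: when the canary is in use, the frame
   is arranged so that (from lower to higher addresses) the locals and the
   canary slot sit below the saved registers, e.g. roughly

	[outgoing args] [locals ... canary] [saved FP/LR and friends]

   so a linear buffer overrun in the locals corrupts the canary before it
   can reach the saved FP/LR.  */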
/* Return true if the current function needs to record the incoming
   value of PSTATE.SM.  */

static bool
aarch64_need_old_pstate_sm ()
{
  /* Exit early if the incoming value of PSTATE.SM is known at
     compile time.  */
  if (aarch64_cfun_incoming_pstate_sm () != 0)
    return false;

  if (aarch64_cfun_enables_pstate_sm ())
    return true;

  /* Non-local goto receivers are entered with PSTATE.SM equal to 0,
     but the function needs to return with PSTATE.SM unchanged.  */
  if (nonlocal_goto_handler_labels)
    return true;

  /* Likewise for exception handlers.  */
  eh_landing_pad lp;
  for (unsigned int i = 1; vec_safe_iterate (cfun->eh->lp_array, i, &lp); ++i)
    if (lp && lp->post_landing_pad)
      return true;

  /* Non-local gotos need to set PSTATE.SM to zero.  It's possible to call
     streaming-compatible functions without SME being available, so PSTATE.SM
     should only be changed if it is currently set to one.  */
  if (crtl->has_nonlocal_goto)
    return true;

  if (cfun->machine->call_switches_pstate_sm)
    for (auto insn = get_insns (); insn; insn = NEXT_INSN (insn))
      if (auto *call = dyn_cast<rtx_call_insn *> (insn))
	if (!SIBLING_CALL_P (call))
	  {
	    /* Return true if there is a call to a non-streaming-compatible
	       function.  */
	    auto callee_isa_mode = aarch64_insn_callee_isa_mode (call);
	    if (aarch64_call_switches_pstate_sm (callee_isa_mode))
	      return true;
	  }

  return false;
}
7854 /* Mark the registers that need to be saved by the callee and calculate
7855 the size of the callee-saved registers area and frame record (both FP
7856 and LR may be omitted). */
7858 aarch64_layout_frame (void)
7860 unsigned regno
, last_fp_reg
= INVALID_REGNUM
;
7861 machine_mode vector_save_mode
= aarch64_reg_save_mode (V8_REGNUM
);
7862 poly_int64 vector_save_size
= GET_MODE_SIZE (vector_save_mode
);
7863 bool frame_related_fp_reg_p
= false;
7864 aarch64_frame
&frame
= cfun
->machine
->frame
;
7865 poly_int64 top_of_locals
= -1;
7866 bool enables_pstate_sm
= aarch64_cfun_enables_pstate_sm ();
7868 vec_safe_truncate (frame
.saved_gprs
, 0);
7869 vec_safe_truncate (frame
.saved_fprs
, 0);
7870 vec_safe_truncate (frame
.saved_prs
, 0);
7872 frame
.emit_frame_chain
= aarch64_needs_frame_chain ();
7874 /* Adjust the outgoing arguments size if required. Keep it in sync with what
7875 the mid-end is doing. */
7876 crtl
->outgoing_args_size
= STACK_DYNAMIC_OFFSET (cfun
);
7878 #define SLOT_NOT_REQUIRED (-2)
7879 #define SLOT_REQUIRED (-1)
7881 frame
.wb_push_candidate1
= INVALID_REGNUM
;
7882 frame
.wb_push_candidate2
= INVALID_REGNUM
;
7883 frame
.spare_pred_reg
= INVALID_REGNUM
;
7885 /* First mark all the registers that really need to be saved... */
7886 for (regno
= 0; regno
<= LAST_SAVED_REGNUM
; regno
++)
7887 frame
.reg_offset
[regno
] = SLOT_NOT_REQUIRED
;
7888 frame
.old_svcr_offset
= SLOT_NOT_REQUIRED
;
7890 /* ... that includes the eh data registers (if needed)... */
7891 if (crtl
->calls_eh_return
)
7892 for (regno
= 0; EH_RETURN_DATA_REGNO (regno
) != INVALID_REGNUM
; regno
++)
7893 frame
.reg_offset
[EH_RETURN_DATA_REGNO (regno
)] = SLOT_REQUIRED
;
7895 /* ... and any callee saved register that dataflow says is live. */
7896 for (regno
= R0_REGNUM
; regno
<= R30_REGNUM
; regno
++)
7897 if (df_regs_ever_live_p (regno
)
7898 && !fixed_regs
[regno
]
7899 && (regno
== R30_REGNUM
7900 || !crtl
->abi
->clobbers_full_reg_p (regno
)))
7901 frame
.reg_offset
[regno
] = SLOT_REQUIRED
;
7903 for (regno
= V0_REGNUM
; regno
<= V31_REGNUM
; regno
++)
7904 if ((enables_pstate_sm
|| df_regs_ever_live_p (regno
))
7905 && !fixed_regs
[regno
]
7906 && !crtl
->abi
->clobbers_full_reg_p (regno
))
7908 frame
.reg_offset
[regno
] = SLOT_REQUIRED
;
7909 last_fp_reg
= regno
;
7910 if (aarch64_emit_cfi_for_reg_p (regno
))
7911 frame_related_fp_reg_p
= true;
7914 /* Big-endian SVE frames need a spare predicate register in order
7915 to save Z8-Z15. Decide which register they should use. Prefer
7916 an unused argument register if possible, so that we don't force P4
7917 to be saved unnecessarily. */
7918 if (frame_related_fp_reg_p
7919 && crtl
->abi
->id () == ARM_PCS_SVE
7920 && BYTES_BIG_ENDIAN
)
7922 bitmap live1
= df_get_live_out (ENTRY_BLOCK_PTR_FOR_FN (cfun
));
7923 bitmap live2
= df_get_live_in (EXIT_BLOCK_PTR_FOR_FN (cfun
));
7924 for (regno
= P0_REGNUM
; regno
<= P7_REGNUM
; regno
++)
7925 if (!bitmap_bit_p (live1
, regno
) && !bitmap_bit_p (live2
, regno
))
7927 gcc_assert (regno
<= P7_REGNUM
);
7928 frame
.spare_pred_reg
= regno
;
7929 df_set_regs_ever_live (regno
, true);
7932 for (regno
= P0_REGNUM
; regno
<= P15_REGNUM
; regno
++)
7933 if ((enables_pstate_sm
|| df_regs_ever_live_p (regno
))
7934 && !fixed_regs
[regno
]
7935 && !crtl
->abi
->clobbers_full_reg_p (regno
))
7936 frame
.reg_offset
[regno
] = SLOT_REQUIRED
;
7938 bool regs_at_top_p
= aarch64_save_regs_above_locals_p ();
7940 poly_int64 offset
= crtl
->outgoing_args_size
;
7941 gcc_assert (multiple_p (offset
, STACK_BOUNDARY
/ BITS_PER_UNIT
));
7944 offset
+= get_frame_size ();
7945 offset
= aligned_upper_bound (offset
, STACK_BOUNDARY
/ BITS_PER_UNIT
);
7946 top_of_locals
= offset
;
7948 frame
.bytes_below_saved_regs
= offset
;
7949 frame
.sve_save_and_probe
= INVALID_REGNUM
;
7951 /* Now assign stack slots for the registers. Start with the predicate
7952 registers, since predicate LDR and STR have a relatively small
7953 offset range. These saves happen below the hard frame pointer. */
7954 for (regno
= P0_REGNUM
; regno
<= P15_REGNUM
; regno
++)
7955 if (known_eq (frame
.reg_offset
[regno
], SLOT_REQUIRED
))
7957 vec_safe_push (frame
.saved_prs
, regno
);
7958 if (frame
.sve_save_and_probe
== INVALID_REGNUM
)
7959 frame
.sve_save_and_probe
= regno
;
7960 frame
.reg_offset
[regno
] = offset
;
7961 offset
+= BYTES_PER_SVE_PRED
;
7964 poly_int64 saved_prs_size
= offset
- frame
.bytes_below_saved_regs
;
7965 if (maybe_ne (saved_prs_size
, 0))
7967 /* If we have any vector registers to save above the predicate registers,
7968 the offset of the vector register save slots need to be a multiple
7969 of the vector size. This lets us use the immediate forms of LDR/STR
7970 (or LD1/ST1 for big-endian).
7972 A vector register is 8 times the size of a predicate register,
7973 and we need to save a maximum of 12 predicate registers, so the
7974 first vector register will be at either #1, MUL VL or #2, MUL VL.
7976 If we don't have any vector registers to save, and we know how
7977 big the predicate save area is, we can just round it up to the
7978 next 16-byte boundary. */
7979 if (last_fp_reg
== INVALID_REGNUM
&& offset
.is_constant ())
7980 offset
= aligned_upper_bound (offset
, STACK_BOUNDARY
/ BITS_PER_UNIT
);
7983 if (known_le (saved_prs_size
, vector_save_size
))
7984 offset
= frame
.bytes_below_saved_regs
+ vector_save_size
;
7985 else if (known_le (saved_prs_size
, vector_save_size
* 2))
7986 offset
= frame
.bytes_below_saved_regs
+ vector_save_size
* 2;
7992 /* If we need to save any SVE vector registers, add them next. */
7993 if (last_fp_reg
!= INVALID_REGNUM
&& crtl
->abi
->id () == ARM_PCS_SVE
)
7994 for (regno
= V0_REGNUM
; regno
<= V31_REGNUM
; regno
++)
7995 if (known_eq (frame
.reg_offset
[regno
], SLOT_REQUIRED
))
7997 vec_safe_push (frame
.saved_fprs
, regno
);
7998 if (frame
.sve_save_and_probe
== INVALID_REGNUM
)
7999 frame
.sve_save_and_probe
= regno
;
8000 frame
.reg_offset
[regno
] = offset
;
8001 offset
+= vector_save_size
;
8004 /* OFFSET is now the offset of the hard frame pointer from the bottom
8005 of the callee save area. */
8006 auto below_hard_fp_saved_regs_size
= offset
- frame
.bytes_below_saved_regs
;
8007 bool saves_below_hard_fp_p
= maybe_ne (below_hard_fp_saved_regs_size
, 0);
8008 gcc_assert (!saves_below_hard_fp_p
8009 || (frame
.sve_save_and_probe
!= INVALID_REGNUM
8010 && known_eq (frame
.reg_offset
[frame
.sve_save_and_probe
],
8011 frame
.bytes_below_saved_regs
)));
8013 frame
.bytes_below_hard_fp
= offset
;
8014 frame
.hard_fp_save_and_probe
= INVALID_REGNUM
;
8016 auto allocate_gpr_slot
= [&](unsigned int regno
)
8018 vec_safe_push (frame
.saved_gprs
, regno
);
8019 frame
.reg_offset
[regno
] = offset
;
8020 offset
+= UNITS_PER_WORD
;
8023 if (frame
.emit_frame_chain
)
8025 /* FP and LR are placed in the linkage record. */
8026 allocate_gpr_slot (R29_REGNUM
);
8027 allocate_gpr_slot (R30_REGNUM
);
8029 else if ((flag_stack_clash_protection
|| !frame
.is_scs_enabled
)
8030 && known_eq (frame
.reg_offset
[R30_REGNUM
], SLOT_REQUIRED
))
8031 /* Put the LR save slot first, since it makes a good choice of probe
8032 for stack clash purposes. The idea is that the link register usually
8033 has to be saved before a call anyway, and so we lose little by
8034 stopping it from being individually shrink-wrapped. */
8035 allocate_gpr_slot (R30_REGNUM
);
8037 for (regno
= R0_REGNUM
; regno
<= R30_REGNUM
; regno
++)
8038 if (known_eq (frame
.reg_offset
[regno
], SLOT_REQUIRED
))
8039 allocate_gpr_slot (regno
);
8041 if (aarch64_need_old_pstate_sm ())
8043 frame
.old_svcr_offset
= offset
;
8044 offset
+= UNITS_PER_WORD
;
8047 /* If the current function changes the SVE vector length, ensure that the
8048 old value of the DWARF VG register is saved and available in the CFI,
8049 so that outer frames with VL-sized offsets can be processed correctly. */
8050 if (cfun
->machine
->call_switches_pstate_sm
8051 || aarch64_cfun_enables_pstate_sm ())
8053 frame
.reg_offset
[VG_REGNUM
] = offset
;
8054 offset
+= UNITS_PER_WORD
;
8057 poly_int64 max_int_offset
= offset
;
8058 offset
= aligned_upper_bound (offset
, STACK_BOUNDARY
/ BITS_PER_UNIT
);
8059 bool has_align_gap
= maybe_ne (offset
, max_int_offset
);
8061 for (regno
= V0_REGNUM
; regno
<= V31_REGNUM
; regno
++)
8062 if (known_eq (frame
.reg_offset
[regno
], SLOT_REQUIRED
))
8064 vec_safe_push (frame
.saved_fprs
, regno
);
8065 /* If there is an alignment gap between integer and fp callee-saves,
8066 allocate the last fp register to it if possible. */
8067 if (regno
== last_fp_reg
8069 && known_eq (vector_save_size
, 8)
8070 && multiple_p (offset
, 16))
8072 frame
.reg_offset
[regno
] = max_int_offset
;
8076 frame
.reg_offset
[regno
] = offset
;
8077 offset
+= vector_save_size
;
8080 offset
= aligned_upper_bound (offset
, STACK_BOUNDARY
/ BITS_PER_UNIT
);
8081 auto saved_regs_size
= offset
- frame
.bytes_below_saved_regs
;
8083 array_slice
<unsigned int> push_regs
= (!vec_safe_is_empty (frame
.saved_gprs
)
8085 : frame
.saved_fprs
);
8086 if (!push_regs
.empty ()
8087 && known_eq (frame
.reg_offset
[push_regs
[0]], frame
.bytes_below_hard_fp
))
8089 frame
.hard_fp_save_and_probe
= push_regs
[0];
8090 frame
.wb_push_candidate1
= push_regs
[0];
8091 if (push_regs
.size () > 1)
8092 frame
.wb_push_candidate2
= push_regs
[1];
8095 /* With stack-clash, a register must be saved in non-leaf functions.
8096 The saving of the bottommost register counts as an implicit probe,
8097 which allows us to maintain the invariant described in the comment
8098 at expand_prologue. */
8099 gcc_assert (crtl
->is_leaf
|| maybe_ne (saved_regs_size
, 0));
8103 offset
+= get_frame_size ();
8104 offset
= aligned_upper_bound (offset
, STACK_BOUNDARY
/ BITS_PER_UNIT
);
8105 top_of_locals
= offset
;
8107 offset
+= frame
.saved_varargs_size
;
8108 gcc_assert (multiple_p (offset
, STACK_BOUNDARY
/ BITS_PER_UNIT
));
8109 frame
.frame_size
= offset
;
8111 frame
.bytes_above_hard_fp
= frame
.frame_size
- frame
.bytes_below_hard_fp
;
8112 gcc_assert (known_ge (top_of_locals
, 0));
8113 frame
.bytes_above_locals
= frame
.frame_size
- top_of_locals
;
8115 frame
.initial_adjust
= 0;
8116 frame
.final_adjust
= 0;
8117 frame
.callee_adjust
= 0;
8118 frame
.sve_callee_adjust
= 0;
8120 frame
.wb_pop_candidate1
= frame
.wb_push_candidate1
;
8121 frame
.wb_pop_candidate2
= frame
.wb_push_candidate2
;
8123 /* Shadow call stack only deals with functions where the LR is pushed
8124 onto the stack and without specifying the "no_sanitize" attribute
8125 with the argument "shadow-call-stack". */
8126 frame
.is_scs_enabled
8127 = (!crtl
->calls_eh_return
8128 && sanitize_flags_p (SANITIZE_SHADOW_CALL_STACK
)
8129 && known_ge (frame
.reg_offset
[LR_REGNUM
], 0));
8131 /* When shadow call stack is enabled, the scs_pop in the epilogue will
8132 restore x30, and we don't need to pop x30 again in the traditional
8133 way. Pop candidates record the registers that need to be popped
8135 if (frame
.is_scs_enabled
)
8137 if (frame
.wb_pop_candidate2
== R30_REGNUM
)
8138 frame
.wb_pop_candidate2
= INVALID_REGNUM
;
8139 else if (frame
.wb_pop_candidate1
== R30_REGNUM
)
8140 frame
.wb_pop_candidate1
= INVALID_REGNUM
;
8143 /* If candidate2 is INVALID_REGNUM, we need to adjust max_push_offset to
8144 256 to ensure that the offset meets the requirements of emit_move_insn.
8145 Similarly, if candidate1 is INVALID_REGNUM, we need to set
8146 max_push_offset to 0, because no registers are popped at this time,
8147 so callee_adjust cannot be adjusted. */
8148 HOST_WIDE_INT max_push_offset
= 0;
8149 if (frame
.wb_pop_candidate1
!= INVALID_REGNUM
)
8151 if (frame
.wb_pop_candidate2
!= INVALID_REGNUM
)
8152 max_push_offset
= 512;
8154 max_push_offset
= 256;
8157 HOST_WIDE_INT const_size
, const_below_saved_regs
, const_above_fp
;
8158 HOST_WIDE_INT const_saved_regs_size
;
8159 if (known_eq (saved_regs_size
, 0))
8160 frame
.initial_adjust
= frame
.frame_size
;
8161 else if (frame
.frame_size
.is_constant (&const_size
)
8162 && const_size
< max_push_offset
8163 && known_eq (frame
.bytes_above_hard_fp
, const_size
))
8165 /* Simple, small frame with no data below the saved registers.
8167 stp reg1, reg2, [sp, -frame_size]!
8168 stp reg3, reg4, [sp, 16] */
8169 frame
.callee_adjust
= const_size
;
8171 else if (frame
.bytes_below_saved_regs
.is_constant (&const_below_saved_regs
)
8172 && saved_regs_size
.is_constant (&const_saved_regs_size
)
8173 && const_below_saved_regs
+ const_saved_regs_size
< 512
8174 /* We could handle this case even with data below the saved
8175 registers, provided that that data left us with valid offsets
8176 for all predicate and vector save slots. It's such a rare
8177 case that it hardly seems worth the effort though. */
8178 && (!saves_below_hard_fp_p
|| const_below_saved_regs
== 0)
8179 && !(cfun
->calls_alloca
8180 && frame
.bytes_above_hard_fp
.is_constant (&const_above_fp
)
8181 && const_above_fp
< max_push_offset
))
8183 /* Frame with small area below the saved registers:
8185 sub sp, sp, frame_size
8186 stp reg1, reg2, [sp, bytes_below_saved_regs]
8187 stp reg3, reg4, [sp, bytes_below_saved_regs + 16] */
8188 frame
.initial_adjust
= frame
.frame_size
;
8190 else if (saves_below_hard_fp_p
8191 && known_eq (saved_regs_size
, below_hard_fp_saved_regs_size
))
8193 /* Frame in which all saves are SVE saves:
8195 sub sp, sp, frame_size - bytes_below_saved_regs
8196 save SVE registers relative to SP
8197 sub sp, sp, bytes_below_saved_regs */
8198 frame
.initial_adjust
= frame
.frame_size
- frame
.bytes_below_saved_regs
;
8199 frame
.final_adjust
= frame
.bytes_below_saved_regs
;
8201 else if (frame
.wb_push_candidate1
!= INVALID_REGNUM
8202 && frame
.bytes_above_hard_fp
.is_constant (&const_above_fp
)
8203 && const_above_fp
< max_push_offset
)
8205 /* Frame with large area below the saved registers, or with SVE saves,
8206 but with a small area above:
8208 stp reg1, reg2, [sp, -hard_fp_offset]!
8209 stp reg3, reg4, [sp, 16]
8210 [sub sp, sp, below_hard_fp_saved_regs_size]
8211 [save SVE registers relative to SP]
8212 sub sp, sp, bytes_below_saved_regs */
8213 frame
.callee_adjust
= const_above_fp
;
8214 frame
.sve_callee_adjust
= below_hard_fp_saved_regs_size
;
8215 frame
.final_adjust
= frame
.bytes_below_saved_regs
;
8221 sub sp, sp, hard_fp_offset
8222 stp x29, x30, [sp, 0]
8224 stp reg3, reg4, [sp, 16]
8225 [sub sp, sp, below_hard_fp_saved_regs_size]
8226 [save SVE registers relative to SP]
8227 sub sp, sp, bytes_below_saved_regs */
8228 frame
.initial_adjust
= frame
.bytes_above_hard_fp
;
8229 frame
.sve_callee_adjust
= below_hard_fp_saved_regs_size
;
8230 frame
.final_adjust
= frame
.bytes_below_saved_regs
;
8233 /* The frame is allocated in pieces, with each non-final piece
8234 including a register save at offset 0 that acts as a probe for
8235 the following piece. In addition, the save of the bottommost register
8236 acts as a probe for callees and allocas. Roll back any probes that
8239 A probe isn't needed if it is associated with the final allocation
8240 (including callees and allocas) that happens before the epilogue is
8243 && !cfun
->calls_alloca
8244 && known_eq (frame
.final_adjust
, 0))
8246 if (maybe_ne (frame
.sve_callee_adjust
, 0))
8247 frame
.sve_save_and_probe
= INVALID_REGNUM
;
8249 frame
.hard_fp_save_and_probe
= INVALID_REGNUM
;
8252 /* Make sure the individual adjustments add up to the full frame size. */
8253 gcc_assert (known_eq (frame
.initial_adjust
8254 + frame
.callee_adjust
8255 + frame
.sve_callee_adjust
8256 + frame
.final_adjust
, frame
.frame_size
));
8258 if (frame
.callee_adjust
== 0)
8260 /* We've decided not to do a "real" push and pop. However,
8261 setting up the frame chain is treated as being essentially
8262 a multi-instruction push. */
8263 frame
.wb_pop_candidate1
= frame
.wb_pop_candidate2
= INVALID_REGNUM
;
8264 if (!frame
.emit_frame_chain
)
8265 frame
.wb_push_candidate1
= frame
.wb_push_candidate2
= INVALID_REGNUM
;
8268 frame
.laid_out
= true;
/* Return true if the register REGNO is saved on entry to
   the current function.  */

static bool
aarch64_register_saved_on_entry (int regno)
{
  return known_ge (cfun->machine->frame.reg_offset[regno], 0);
}
/* Push the register number REGNO of mode MODE to the stack with write-back
   adjusting the stack by ADJUSTMENT.  */

static void
aarch64_pushwb_single_reg (machine_mode mode, unsigned regno,
			   HOST_WIDE_INT adjustment)
{
  rtx base_rtx = stack_pointer_rtx;
  rtx insn, reg, mem;

  reg = gen_rtx_REG (mode, regno);
  mem = gen_rtx_PRE_MODIFY (Pmode, base_rtx,
			    plus_constant (Pmode, base_rtx, -adjustment));
  mem = gen_frame_mem (mode, mem);

  insn = emit_move_insn (mem, reg);
  RTX_FRAME_RELATED_P (insn) = 1;
}
/* Generate and return an instruction to store the pair of registers
   REG and REG2 of mode MODE to location BASE with write-back adjusting
   the stack location BASE by ADJUSTMENT.  */

static rtx
aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
			  HOST_WIDE_INT adjustment)
{
  rtx new_base = plus_constant (Pmode, base, -adjustment);
  rtx mem = gen_frame_mem (mode, new_base);
  rtx mem2 = adjust_address_nv (mem, mode, GET_MODE_SIZE (mode));

  return gen_rtx_PARALLEL (VOIDmode,
			   gen_rtvec (3,
				      gen_rtx_SET (base, new_base),
				      gen_rtx_SET (mem, reg),
				      gen_rtx_SET (mem2, reg2)));
}
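/* For example, for DImode registers, ADJUSTMENT == 32 and BASE == sp, the
   PARALLEL built above corresponds to the single instruction

	stp	x19, x20, [sp, -32]!

   (x19/x20 standing in for REG/REG2): one SET updates the base register
   and the other two SETs store the two registers at the pre-decremented
   address and at +8 from it.  */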
/* Push registers numbered REGNO1 and REGNO2 to the stack, adjusting the
   stack pointer by ADJUSTMENT.  */

static void
aarch64_push_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment)
{
  rtx_insn *insn;
  machine_mode mode = aarch64_reg_save_mode (regno1);

  if (regno2 == INVALID_REGNUM)
    return aarch64_pushwb_single_reg (mode, regno1, adjustment);

  rtx reg1 = gen_rtx_REG (mode, regno1);
  rtx reg2 = gen_rtx_REG (mode, regno2);

  insn = emit_insn (aarch64_gen_storewb_pair (mode, stack_pointer_rtx, reg1,
					      reg2, adjustment));
  RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1;
  RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
  RTX_FRAME_RELATED_P (insn) = 1;
}
/* Load the pair of register REG, REG2 of mode MODE from stack location BASE,
   adjusting it by ADJUSTMENT afterwards.  */

static rtx
aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
			 HOST_WIDE_INT adjustment)
{
  rtx mem = gen_frame_mem (mode, base);
  rtx mem2 = adjust_address_nv (mem, mode, GET_MODE_SIZE (mode));
  rtx new_base = plus_constant (Pmode, base, adjustment);

  return gen_rtx_PARALLEL (VOIDmode,
			   gen_rtvec (3,
				      gen_rtx_SET (base, new_base),
				      gen_rtx_SET (reg, mem),
				      gen_rtx_SET (reg2, mem2)));
}
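/* Symmetrically to the store case, for DImode and ADJUSTMENT == 32 the
   PARALLEL above matches a post-indexed load pair such as

	ldp	x19, x20, [sp], 32

   (register names purely illustrative): the two loads use the old value
   of BASE and the base register is then bumped by ADJUSTMENT.  */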
/* Pop the two registers numbered REGNO1, REGNO2 from the stack, adjusting it
   afterwards by ADJUSTMENT and writing the appropriate REG_CFA_RESTORE notes
   into CFI_OPS.  */

static void
aarch64_pop_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment,
		  rtx *cfi_ops)
{
  machine_mode mode = aarch64_reg_save_mode (regno1);
  rtx reg1 = gen_rtx_REG (mode, regno1);

  *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg1, *cfi_ops);

  if (regno2 == INVALID_REGNUM)
    {
      rtx mem = plus_constant (Pmode, stack_pointer_rtx, adjustment);
      mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem);
      emit_move_insn (reg1, gen_frame_mem (mode, mem));
    }
  else
    {
      rtx reg2 = gen_rtx_REG (mode, regno2);
      *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
      emit_insn (aarch64_gen_loadwb_pair (mode, stack_pointer_rtx, reg1,
					  reg2, adjustment));
    }
}
/* Given an ldp/stp register operand mode MODE, return a suitable mode to use
   for a mem rtx representing the entire pair.  */

static machine_mode
aarch64_pair_mode_for_mode (machine_mode mode)
{
  if (known_eq (GET_MODE_SIZE (mode), 4))
    return V2x4QImode;
  else if (known_eq (GET_MODE_SIZE (mode), 8))
    return V2x8QImode;
  else if (known_eq (GET_MODE_SIZE (mode), 16))
    return V2x16QImode;
  else
    gcc_unreachable ();
}

/* Given a base mem MEM with mode and address suitable for a single ldp/stp
   operand, return an rtx like MEM which instead represents the entire pair.  */

static rtx
aarch64_pair_mem_from_base (rtx mem)
{
  auto pair_mode = aarch64_pair_mode_for_mode (GET_MODE (mem));
  mem = adjust_bitfield_address_nv (mem, pair_mode, 0);
  gcc_assert (aarch64_mem_pair_lanes_operand (mem, pair_mode));
  return mem;
}
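/* For instance, an 8-byte DImode operand is widened here to a 16-byte pair
   mode covering both halves of the LDP/STP, so that a single mem rtx
   describes the full 16 bytes the instruction accesses (the exact pair mode
   is whatever aarch64_pair_mode_for_mode chose above).  */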
/* Generate and return a store pair instruction to store REG1 and REG2
   into memory starting at BASE_MEM.  All three rtxes should have modes of the
   same size.  */

rtx
aarch64_gen_store_pair (rtx base_mem, rtx reg1, rtx reg2)
{
  rtx pair_mem = aarch64_pair_mem_from_base (base_mem);

  return gen_rtx_SET (pair_mem,
		      gen_rtx_UNSPEC (GET_MODE (pair_mem),
				      gen_rtvec (2, reg1, reg2),
				      UNSPEC_STP));
}
/* Generate and return a load pair instruction to load a pair of
   registers starting at BASE_MEM into REG1 and REG2.  If CODE is
   UNKNOWN, all three rtxes should have modes of the same size.
   Otherwise, CODE is {SIGN,ZERO}_EXTEND, base_mem should be in SImode,
   and REG{1,2} should be in DImode.  */

rtx
aarch64_gen_load_pair (rtx reg1, rtx reg2, rtx base_mem, enum rtx_code code)
{
  rtx pair_mem = aarch64_pair_mem_from_base (base_mem);

  const bool any_extend_p = (code == ZERO_EXTEND || code == SIGN_EXTEND);
  if (any_extend_p)
    gcc_checking_assert (GET_MODE (base_mem) == SImode
			 && GET_MODE (reg1) == DImode
			 && GET_MODE (reg2) == DImode);
  else
    gcc_assert (code == UNKNOWN);

  rtx unspecs[2] = {
    gen_rtx_UNSPEC (any_extend_p ? SImode : GET_MODE (reg1),
		    gen_rtvec (1, pair_mem),
		    UNSPEC_LDP_FST),
    gen_rtx_UNSPEC (any_extend_p ? SImode : GET_MODE (reg2),
		    gen_rtvec (1, copy_rtx (pair_mem)),
		    UNSPEC_LDP_SND)
  };

  if (any_extend_p)
    for (int i = 0; i < 2; i++)
      unspecs[i] = gen_rtx_fmt_e (code, DImode, unspecs[i]);

  return gen_rtx_PARALLEL (VOIDmode,
			   gen_rtvec (2,
				      gen_rtx_SET (reg1, unspecs[0]),
				      gen_rtx_SET (reg2, unspecs[1])));
}
/* Return TRUE if return address signing should be enabled for the current
   function, otherwise return FALSE.  */

bool
aarch64_return_address_signing_enabled (void)
{
  /* This function should only be called after frame laid out.  */
  gcc_assert (cfun->machine->frame.laid_out);

  /* If signing scope is AARCH_FUNCTION_NON_LEAF, we only sign a leaf function
     if its LR is pushed onto stack.  */
  return (aarch_ra_sign_scope == AARCH_FUNCTION_ALL
	  || (aarch_ra_sign_scope == AARCH_FUNCTION_NON_LEAF
	      && known_ge (cfun->machine->frame.reg_offset[LR_REGNUM], 0)));
}
/* Only used by the arm backend.  */
void aarch_bti_arch_check (void)
{}

/* Return TRUE if Branch Target Identification Mechanism is enabled.  */
bool
aarch_bti_enabled (void)
{
  return (aarch_enable_bti == 1);
}

/* Check if INSN is a BTI J insn.  */
bool
aarch_bti_j_insn_p (rtx_insn *insn)
{
  if (!insn || !INSN_P (insn))
    return false;

  rtx pat = PATTERN (insn);
  return GET_CODE (pat) == UNSPEC_VOLATILE && XINT (pat, 1) == UNSPECV_BTI_J;
}

/* Check if X (or any sub-rtx of X) is a PACIASP/PACIBSP instruction.  */
bool
aarch_pac_insn_p (rtx x)
{
  if (!INSN_P (x))
    return false;

  subrtx_var_iterator::array_type array;
  FOR_EACH_SUBRTX_VAR (iter, array, PATTERN (x), ALL)
    {
      rtx sub = *iter;
      if (sub && GET_CODE (sub) == UNSPEC)
	{
	  int unspec_val = XINT (sub, 1);
	  switch (unspec_val)
	    {
	    case UNSPEC_PACIASP:
	    case UNSPEC_PACIBSP:
	      return true;

	    default:
	      return false;
	    }
	  iter.skip_subrtxes ();
	}
    }
  return false;
}

rtx
aarch_gen_bti_c (void)
{
  return gen_bti_c ();
}

rtx
aarch_gen_bti_j (void)
{
  return gen_bti_j ();
}
/* The caller is going to use ST1D or LD1D to save or restore an SVE
   register in mode MODE at BASE_RTX + OFFSET, where OFFSET is in
   the range [1, 16] * GET_MODE_SIZE (MODE).  Prepare for this by:

     (1) updating BASE_RTX + OFFSET so that it is a legitimate ST1D
	 or LD1D address

     (2) setting PRED to a valid predicate register for the ST1D or LD1D,
	 if the variable isn't already nonnull

   (1) is needed when OFFSET is in the range [8, 16] * GET_MODE_SIZE (MODE).
   Handle this case using a temporary base register that is suitable for
   all offsets in that range.  Use ANCHOR_REG as this base register if it
   is nonnull, otherwise create a new register and store it in ANCHOR_REG.  */

static inline void
aarch64_adjust_sve_callee_save_base (machine_mode mode, rtx &base_rtx,
				     rtx &anchor_reg, poly_int64 &offset,
				     rtx &ptrue)
{
  if (maybe_ge (offset, 8 * GET_MODE_SIZE (mode)))
    {
      /* This is the maximum valid offset of the anchor from the base.
	 Lower values would be valid too.  */
      poly_int64 anchor_offset = 16 * GET_MODE_SIZE (mode);
      if (!anchor_reg)
	{
	  anchor_reg = gen_rtx_REG (Pmode, STACK_CLASH_SVE_CFA_REGNUM);
	  emit_insn (gen_add3_insn (anchor_reg, base_rtx,
				    gen_int_mode (anchor_offset, Pmode)));
	}
      base_rtx = anchor_reg;
      offset -= anchor_offset;
    }
  if (!ptrue)
    {
      int pred_reg = cfun->machine->frame.spare_pred_reg;
      emit_move_insn (gen_rtx_REG (VNx16BImode, pred_reg),
		      CONSTM1_RTX (VNx16BImode));
      ptrue = gen_rtx_REG (VNx2BImode, pred_reg);
    }
}
/* Add a REG_CFA_EXPRESSION note to INSN to say that register REG
   is saved at BASE + OFFSET.  */

static void
aarch64_add_cfa_expression (rtx_insn *insn, rtx reg,
			    rtx base, poly_int64 offset)
{
  rtx mem = gen_frame_mem (GET_MODE (reg),
			   plus_constant (Pmode, base, offset));
  add_reg_note (insn, REG_CFA_EXPRESSION, gen_rtx_SET (mem, reg));
}
8599 /* Emit code to save the callee-saved registers in REGS. Skip any
8600 write-back candidates if SKIP_WB is true, otherwise consider only
8601 write-back candidates.
8603 The stack pointer is currently BYTES_BELOW_SP bytes above the bottom
8604 of the static frame. HARD_FP_VALID_P is true if the hard frame pointer
8608 aarch64_save_callee_saves (poly_int64 bytes_below_sp
,
8609 array_slice
<unsigned int> regs
, bool skip_wb
,
8610 bool hard_fp_valid_p
)
8612 aarch64_frame
&frame
= cfun
->machine
->frame
;
8614 rtx anchor_reg
= NULL_RTX
, ptrue
= NULL_RTX
;
8616 auto skip_save_p
= [&](unsigned int regno
)
8618 if (cfun
->machine
->reg_is_wrapped_separately
[regno
])
8621 if (skip_wb
== (regno
== frame
.wb_push_candidate1
8622 || regno
== frame
.wb_push_candidate2
))
8628 for (unsigned int i
= 0; i
< regs
.size (); ++i
)
8630 unsigned int regno
= regs
[i
];
8632 bool frame_related_p
= aarch64_emit_cfi_for_reg_p (regno
);
8634 if (skip_save_p (regno
))
8637 machine_mode mode
= aarch64_reg_save_mode (regno
);
8638 rtx reg
= gen_rtx_REG (mode
, regno
);
8640 offset
= frame
.reg_offset
[regno
] - bytes_below_sp
;
8641 if (regno
== VG_REGNUM
)
8643 move_src
= gen_rtx_REG (DImode
, IP0_REGNUM
);
8644 emit_move_insn (move_src
, gen_int_mode (aarch64_sve_vg
, DImode
));
8646 rtx base_rtx
= stack_pointer_rtx
;
8647 poly_int64 sp_offset
= offset
;
8649 HOST_WIDE_INT const_offset
;
8650 if (mode
== VNx2DImode
&& BYTES_BIG_ENDIAN
)
8651 aarch64_adjust_sve_callee_save_base (mode
, base_rtx
, anchor_reg
,
8653 else if (GP_REGNUM_P (REGNO (reg
))
8654 && (!offset
.is_constant (&const_offset
) || const_offset
>= 512))
8656 poly_int64 fp_offset
= frame
.bytes_below_hard_fp
- bytes_below_sp
;
8657 if (hard_fp_valid_p
)
8658 base_rtx
= hard_frame_pointer_rtx
;
8663 anchor_reg
= gen_rtx_REG (Pmode
, STACK_CLASH_SVE_CFA_REGNUM
);
8664 emit_insn (gen_add3_insn (anchor_reg
, base_rtx
,
8665 gen_int_mode (fp_offset
, Pmode
)));
8667 base_rtx
= anchor_reg
;
8669 offset
-= fp_offset
;
8671 rtx mem
= gen_frame_mem (mode
, plus_constant (Pmode
, base_rtx
, offset
));
8672 rtx cfi_mem
= gen_frame_mem (mode
, plus_constant (Pmode
,
8675 rtx cfi_set
= gen_rtx_SET (cfi_mem
, reg
);
8676 bool need_cfi_note_p
= (base_rtx
!= stack_pointer_rtx
);
8678 unsigned int regno2
;
8679 if (!aarch64_sve_mode_p (mode
)
8681 && i
+ 1 < regs
.size ()
8682 && (regno2
= regs
[i
+ 1], !skip_save_p (regno2
))
8683 && known_eq (GET_MODE_SIZE (mode
),
8684 frame
.reg_offset
[regno2
] - frame
.reg_offset
[regno
]))
8686 rtx reg2
= gen_rtx_REG (mode
, regno2
);
8688 offset
+= GET_MODE_SIZE (mode
);
8689 insn
= emit_insn (aarch64_gen_store_pair (mem
, reg
, reg2
));
8692 = gen_frame_mem (mode
,
8693 plus_constant (Pmode
,
8695 sp_offset
+ GET_MODE_SIZE (mode
)));
8696 rtx cfi_set2
= gen_rtx_SET (cfi_mem2
, reg2
);
8698 /* The first part of a frame-related parallel insn is always
8699 assumed to be relevant to the frame calculations;
8700 subsequent parts, are only frame-related if
8701 explicitly marked. */
8702 if (aarch64_emit_cfi_for_reg_p (regno2
))
8703 RTX_FRAME_RELATED_P (cfi_set2
) = 1;
8705 /* Add a REG_FRAME_RELATED_EXPR note since the unspec
8706 representation of stp cannot be understood directly by
8708 rtx par
= gen_rtx_PARALLEL (VOIDmode
,
8709 gen_rtvec (2, cfi_set
, cfi_set2
));
8710 add_reg_note (insn
, REG_FRAME_RELATED_EXPR
, par
);
8717 if (mode
== VNx2DImode
&& BYTES_BIG_ENDIAN
)
8719 insn
= emit_insn (gen_aarch64_pred_mov (mode
, mem
,
8721 need_cfi_note_p
= true;
8723 else if (aarch64_sve_mode_p (mode
))
8724 insn
= emit_insn (gen_rtx_SET (mem
, move_src
));
8726 insn
= emit_move_insn (mem
, move_src
);
8728 if (frame_related_p
&& (need_cfi_note_p
|| move_src
!= reg
))
8729 add_reg_note (insn
, REG_FRAME_RELATED_EXPR
, cfi_set
);
8732 RTX_FRAME_RELATED_P (insn
) = frame_related_p
;
8734 /* Emit a fake instruction to indicate that the VG save slot has
8735 been initialized. */
8736 if (regno
== VG_REGNUM
)
8737 emit_insn (gen_aarch64_old_vg_saved (move_src
, mem
));
8741 /* Emit code to restore the callee registers in REGS, ignoring pop candidates
8742 and any other registers that are handled separately. Write the appropriate
8743 REG_CFA_RESTORE notes into CFI_OPS.
8745 The stack pointer is currently BYTES_BELOW_SP bytes above the bottom
8746 of the static frame. */
8749 aarch64_restore_callee_saves (poly_int64 bytes_below_sp
,
8750 array_slice
<unsigned int> regs
, rtx
*cfi_ops
)
8752 aarch64_frame
&frame
= cfun
->machine
->frame
;
8754 rtx anchor_reg
= NULL_RTX
, ptrue
= NULL_RTX
;
8756 auto skip_restore_p
= [&](unsigned int regno
)
8758 if (cfun
->machine
->reg_is_wrapped_separately
[regno
])
8761 if (regno
== frame
.wb_pop_candidate1
8762 || regno
== frame
.wb_pop_candidate2
)
8765 /* The shadow call stack code restores LR separately. */
8766 if (frame
.is_scs_enabled
&& regno
== LR_REGNUM
)
8772 for (unsigned int i
= 0; i
< regs
.size (); ++i
)
8774 unsigned int regno
= regs
[i
];
8775 bool frame_related_p
= aarch64_emit_cfi_for_reg_p (regno
);
8776 if (skip_restore_p (regno
))
8779 machine_mode mode
= aarch64_reg_save_mode (regno
);
8780 rtx reg
= gen_rtx_REG (mode
, regno
);
8781 offset
= frame
.reg_offset
[regno
] - bytes_below_sp
;
8782 rtx base_rtx
= stack_pointer_rtx
;
8783 if (mode
== VNx2DImode
&& BYTES_BIG_ENDIAN
)
8784 aarch64_adjust_sve_callee_save_base (mode
, base_rtx
, anchor_reg
,
8786 rtx mem
= gen_frame_mem (mode
, plus_constant (Pmode
, base_rtx
, offset
));
8788 unsigned int regno2
;
8789 if (!aarch64_sve_mode_p (mode
)
8790 && i
+ 1 < regs
.size ()
8791 && (regno2
= regs
[i
+ 1], !skip_restore_p (regno2
))
8792 && known_eq (GET_MODE_SIZE (mode
),
8793 frame
.reg_offset
[regno2
] - frame
.reg_offset
[regno
]))
8795 rtx reg2
= gen_rtx_REG (mode
, regno2
);
8797 offset
+= GET_MODE_SIZE (mode
);
8798 emit_insn (aarch64_gen_load_pair (reg
, reg2
, mem
));
8800 *cfi_ops
= alloc_reg_note (REG_CFA_RESTORE
, reg2
, *cfi_ops
);
8804 else if (mode
== VNx2DImode
&& BYTES_BIG_ENDIAN
)
8805 emit_insn (gen_aarch64_pred_mov (mode
, reg
, ptrue
, mem
));
8806 else if (aarch64_sve_mode_p (mode
))
8807 emit_insn (gen_rtx_SET (reg
, mem
));
8809 emit_move_insn (reg
, mem
);
8810 if (frame_related_p
)
8811 *cfi_ops
= alloc_reg_note (REG_CFA_RESTORE
, reg
, *cfi_ops
);
/* Return true if OFFSET is a signed 4-bit value multiplied by the size
   of mode MODE.  */

static inline bool
offset_4bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
{
  HOST_WIDE_INT multiple;
  return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
	  && IN_RANGE (multiple, -8, 7));
}

/* Return true if OFFSET is a signed 6-bit value multiplied by the size
   of mode MODE.  */

static inline bool
offset_6bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
{
  HOST_WIDE_INT multiple;
  return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
	  && IN_RANGE (multiple, -32, 31));
}

/* Return true if OFFSET is an unsigned 6-bit value multiplied by the size
   of mode MODE.  */

static inline bool
offset_6bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
{
  HOST_WIDE_INT multiple;
  return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
	  && IN_RANGE (multiple, 0, 63));
}

/* Return true if OFFSET is a signed 7-bit value multiplied by the size
   of mode MODE.  */

bool
aarch64_offset_7bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
{
  HOST_WIDE_INT multiple;
  return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
	  && IN_RANGE (multiple, -64, 63));
}

/* Return true if OFFSET is a signed 9-bit value.  */

bool
aarch64_offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
				       poly_int64 offset)
{
  HOST_WIDE_INT const_offset;
  return (offset.is_constant (&const_offset)
	  && IN_RANGE (const_offset, -256, 255));
}

/* Return true if OFFSET is a signed 9-bit value multiplied by the size
   of mode MODE.  */

static inline bool
offset_9bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
{
  HOST_WIDE_INT multiple;
  return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
	  && IN_RANGE (multiple, -256, 255));
}

/* Return true if OFFSET is an unsigned 12-bit value multiplied by the size
   of mode MODE.  */

static inline bool
offset_12bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
{
  HOST_WIDE_INT multiple;
  return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
	  && IN_RANGE (multiple, 0, 4095));
}
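/* Worked example: for aarch64_offset_7bit_signed_scaled_p with an 8-byte
   mode, the accepted byte offsets are the multiples of 8 in
   [-64 * 8, 63 * 8] == [-512, 504], matching the LDP/STP immediate range;
   for offset_12bit_unsigned_scaled_p with a 4-byte mode they are the
   multiples of 4 in [0, 4095 * 4] == [0, 16380], matching the unsigned
   scaled LDR/STR offset range.  */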
8892 /* Implement TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS. */
8895 aarch64_get_separate_components (void)
8897 aarch64_frame
&frame
= cfun
->machine
->frame
;
8898 sbitmap components
= sbitmap_alloc (LAST_SAVED_REGNUM
+ 1);
8899 bitmap_clear (components
);
8901 /* The registers we need saved to the frame. */
8902 bool enables_pstate_sm
= aarch64_cfun_enables_pstate_sm ();
8903 for (unsigned regno
= 0; regno
<= LAST_SAVED_REGNUM
; regno
++)
8904 if (aarch64_register_saved_on_entry (regno
))
8906 /* Disallow shrink wrapping for registers that will be clobbered
8907 by an SMSTART SM in the prologue. */
8908 if (enables_pstate_sm
8909 && (FP_REGNUM_P (regno
) || PR_REGNUM_P (regno
)))
8912 /* Punt on saves and restores that use ST1D and LD1D. We could
8913 try to be smarter, but it would involve making sure that the
8914 spare predicate register itself is safe to use at the save
8915 and restore points. Also, when a frame pointer is being used,
8916 the slots are often out of reach of ST1D and LD1D anyway. */
8917 machine_mode mode
= aarch64_reg_save_mode (regno
);
8918 if (mode
== VNx2DImode
&& BYTES_BIG_ENDIAN
)
8921 poly_int64 offset
= frame
.reg_offset
[regno
];
8923 /* Get the offset relative to the register we'll use. */
8924 if (frame_pointer_needed
)
8925 offset
-= frame
.bytes_below_hard_fp
;
8927 /* Check that we can access the stack slot of the register with one
8928 direct load with no adjustments needed. */
8929 if (aarch64_sve_mode_p (mode
)
8930 ? offset_9bit_signed_scaled_p (mode
, offset
)
8931 : offset_12bit_unsigned_scaled_p (mode
, offset
))
8932 bitmap_set_bit (components
, regno
);
8935 /* Don't mess with the hard frame pointer. */
8936 if (frame_pointer_needed
)
8937 bitmap_clear_bit (components
, HARD_FRAME_POINTER_REGNUM
);
8939 /* If the spare predicate register used by big-endian SVE code
8940 is call-preserved, it must be saved in the main prologue
8941 before any saves that use it. */
8942 if (frame
.spare_pred_reg
!= INVALID_REGNUM
)
8943 bitmap_clear_bit (components
, frame
.spare_pred_reg
);
8945 unsigned reg1
= frame
.wb_push_candidate1
;
8946 unsigned reg2
= frame
.wb_push_candidate2
;
8947 /* If registers have been chosen to be stored/restored with
8948 writeback don't interfere with them to avoid having to output explicit
8949 stack adjustment instructions. */
8950 if (reg2
!= INVALID_REGNUM
)
8951 bitmap_clear_bit (components
, reg2
);
8952 if (reg1
!= INVALID_REGNUM
)
8953 bitmap_clear_bit (components
, reg1
);
8955 bitmap_clear_bit (components
, LR_REGNUM
);
8956 bitmap_clear_bit (components
, SP_REGNUM
);
8957 if (flag_stack_clash_protection
)
8959 if (frame
.sve_save_and_probe
!= INVALID_REGNUM
)
8960 bitmap_clear_bit (components
, frame
.sve_save_and_probe
);
8961 if (frame
.hard_fp_save_and_probe
!= INVALID_REGNUM
)
8962 bitmap_clear_bit (components
, frame
.hard_fp_save_and_probe
);
8965 /* The VG save sequence needs a temporary GPR. Punt for now on trying
8967 bitmap_clear_bit (components
, VG_REGNUM
);
8972 /* Implement TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB. */
8975 aarch64_components_for_bb (basic_block bb
)
8977 bitmap in
= DF_LIVE_IN (bb
);
8978 bitmap gen
= &DF_LIVE_BB_INFO (bb
)->gen
;
8979 bitmap kill
= &DF_LIVE_BB_INFO (bb
)->kill
;
8981 sbitmap components
= sbitmap_alloc (LAST_SAVED_REGNUM
+ 1);
8982 bitmap_clear (components
);
8984 /* Clobbered registers don't generate values in any meaningful sense,
8985 since nothing after the clobber can rely on their value. And we can't
8986 say that partially-clobbered registers are unconditionally killed,
8987 because whether they're killed or not depends on the mode of the
8988 value they're holding. Thus partially call-clobbered registers
8989 appear in neither the kill set nor the gen set.
8991 Check manually for any calls that clobber more of a register than the
8992 current function can. */
8993 function_abi_aggregator callee_abis
;
8995 FOR_BB_INSNS (bb
, insn
)
8997 callee_abis
.note_callee_abi (insn_callee_abi (insn
));
8998 HARD_REG_SET extra_caller_saves
= callee_abis
.caller_save_regs (*crtl
->abi
);
9000 /* GPRs are used in a bb if they are in the IN, GEN, or KILL sets. */
9001 for (unsigned regno
= 0; regno
<= LAST_SAVED_REGNUM
; regno
++)
9002 if (!fixed_regs
[regno
]
9003 && !crtl
->abi
->clobbers_full_reg_p (regno
)
9004 && (TEST_HARD_REG_BIT (extra_caller_saves
, regno
)
9005 || bitmap_bit_p (in
, regno
)
9006 || bitmap_bit_p (gen
, regno
)
9007 || bitmap_bit_p (kill
, regno
)))
9009 bitmap_set_bit (components
, regno
);
9011 /* If there is a callee-save at an adjacent offset, add it too
9012 to increase the use of LDP/STP. */
9013 poly_int64 offset
= cfun
->machine
->frame
.reg_offset
[regno
];
9014 unsigned regno2
= multiple_p (offset
, 16) ? regno
+ 1 : regno
- 1;
9016 if (regno2
<= LAST_SAVED_REGNUM
)
9018 poly_int64 offset2
= cfun
->machine
->frame
.reg_offset
[regno2
];
9020 ? known_eq (offset
+ 8, offset2
)
9021 : multiple_p (offset2
, 16) && known_eq (offset2
+ 8, offset
))
9022 bitmap_set_bit (components
, regno2
);
/* Implement TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS.
   Nothing to do for aarch64.  */

static void
aarch64_disqualify_components (sbitmap, edge, sbitmap, bool)
{
}

/* Return the next set bit in BMP from START onwards.  Return the total number
   of bits in BMP if no set bit is found at or after START.  */

static unsigned int
aarch64_get_next_set_bit (sbitmap bmp, unsigned int start)
{
  unsigned int nbits = SBITMAP_SIZE (bmp);
  if (start == nbits)
    return start;

  gcc_assert (start < nbits);
  for (unsigned int i = start; i < nbits; i++)
    if (bitmap_bit_p (bmp, i))
      return i;

  return nbits;
}
9055 /* Do the work for aarch64_emit_prologue_components and
9056 aarch64_emit_epilogue_components. COMPONENTS is the bitmap of registers
9057 to save/restore, PROLOGUE_P indicates whether to emit the prologue sequence
9058 for these components or the epilogue sequence. That is, it determines
9059 whether we should emit stores or loads and what kind of CFA notes to attach
9060 to the insns. Otherwise the logic for the two sequences is very
9064 aarch64_process_components (sbitmap components
, bool prologue_p
)
9066 aarch64_frame
&frame
= cfun
->machine
->frame
;
9067 rtx ptr_reg
= gen_rtx_REG (Pmode
, frame_pointer_needed
9068 ? HARD_FRAME_POINTER_REGNUM
9069 : STACK_POINTER_REGNUM
);
9071 unsigned last_regno
= SBITMAP_SIZE (components
);
9072 unsigned regno
= aarch64_get_next_set_bit (components
, R0_REGNUM
);
9073 rtx_insn
*insn
= NULL
;
9075 while (regno
!= last_regno
)
9077 bool frame_related_p
= aarch64_emit_cfi_for_reg_p (regno
);
9078 machine_mode mode
= aarch64_reg_save_mode (regno
);
9080 rtx reg
= gen_rtx_REG (mode
, regno
);
9081 poly_int64 offset
= frame
.reg_offset
[regno
];
9082 if (frame_pointer_needed
)
9083 offset
-= frame
.bytes_below_hard_fp
;
9085 rtx addr
= plus_constant (Pmode
, ptr_reg
, offset
);
9086 rtx mem
= gen_frame_mem (mode
, addr
);
9088 rtx set
= prologue_p
? gen_rtx_SET (mem
, reg
) : gen_rtx_SET (reg
, mem
);
9089 unsigned regno2
= aarch64_get_next_set_bit (components
, regno
+ 1);
9090 /* No more registers to handle after REGNO.
9091 Emit a single save/restore and exit. */
9092 if (regno2
== last_regno
)
9094 insn
= emit_insn (set
);
9095 if (frame_related_p
)
9097 RTX_FRAME_RELATED_P (insn
) = 1;
9099 add_reg_note (insn
, REG_CFA_OFFSET
, copy_rtx (set
));
9101 add_reg_note (insn
, REG_CFA_RESTORE
, reg
);
9106 poly_int64 offset2
= frame
.reg_offset
[regno2
];
9107 /* The next register is not of the same class or its offset is not
9108 mergeable with the current one into a pair. */
9109 if (aarch64_sve_mode_p (mode
)
9110 || !satisfies_constraint_Ump (mem
)
9111 || GP_REGNUM_P (regno
) != GP_REGNUM_P (regno2
)
9112 || (crtl
->abi
->id () == ARM_PCS_SIMD
&& FP_REGNUM_P (regno
))
9113 || maybe_ne ((offset2
- frame
.reg_offset
[regno
]),
9114 GET_MODE_SIZE (mode
)))
9116 insn
= emit_insn (set
);
9117 if (frame_related_p
)
9119 RTX_FRAME_RELATED_P (insn
) = 1;
9121 add_reg_note (insn
, REG_CFA_OFFSET
, copy_rtx (set
));
9123 add_reg_note (insn
, REG_CFA_RESTORE
, reg
);
9130 bool frame_related2_p
= aarch64_emit_cfi_for_reg_p (regno2
);
9132 /* REGNO2 can be saved/restored in a pair with REGNO. */
9133 rtx reg2
= gen_rtx_REG (mode
, regno2
);
9134 if (frame_pointer_needed
)
9135 offset2
-= frame
.bytes_below_hard_fp
;
9136 rtx addr2
= plus_constant (Pmode
, ptr_reg
, offset2
);
9137 rtx mem2
= gen_frame_mem (mode
, addr2
);
9138 rtx set2
= prologue_p
? gen_rtx_SET (mem2
, reg2
)
9139 : gen_rtx_SET (reg2
, mem2
);
9142 insn
= emit_insn (aarch64_gen_store_pair (mem
, reg
, reg2
));
9144 insn
= emit_insn (aarch64_gen_load_pair (reg
, reg2
, mem
));
9146 if (frame_related_p
|| frame_related2_p
)
9148 RTX_FRAME_RELATED_P (insn
) = 1;
9151 if (frame_related_p
)
9152 add_reg_note (insn
, REG_CFA_OFFSET
, set
);
9153 if (frame_related2_p
)
9154 add_reg_note (insn
, REG_CFA_OFFSET
, set2
);
9158 if (frame_related_p
)
9159 add_reg_note (insn
, REG_CFA_RESTORE
, reg
);
9160 if (frame_related2_p
)
9161 add_reg_note (insn
, REG_CFA_RESTORE
, reg2
);
9165 regno
= aarch64_get_next_set_bit (components
, regno2
+ 1);
/* Implement TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS.  */

static void
aarch64_emit_prologue_components (sbitmap components)
{
  aarch64_process_components (components, true);
}

/* Implement TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS.  */

static void
aarch64_emit_epilogue_components (sbitmap components)
{
  aarch64_process_components (components, false);
}

/* Implement TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS.  */

static void
aarch64_set_handled_components (sbitmap components)
{
  for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
    if (bitmap_bit_p (components, regno))
      cfun->machine->reg_is_wrapped_separately[regno] = true;
}
/* On AArch64 we have an ABI defined safe buffer.  This constant is used to
   determine the probe offset for alloca.  */

static HOST_WIDE_INT
aarch64_stack_clash_protection_alloca_probe_range (void)
{
  return STACK_CLASH_CALLER_GUARD;
}

/* Emit a stack tie that acts as a scheduling barrier for all previous and
   subsequent memory accesses and that requires the stack pointer and REG
   to have their current values.  REG can be stack_pointer_rtx if no
   other register's value needs to be fixed.  */

static void
aarch64_emit_stack_tie (rtx reg)
{
  emit_insn (gen_stack_tie (reg, gen_int_mode (REGNO (reg), DImode)));
}
9215 /* Allocate POLY_SIZE bytes of stack space using TEMP1 and TEMP2 as scratch
9216 registers. If POLY_SIZE is not large enough to require a probe this function
9217 will only adjust the stack. When allocating the stack space
9218 FRAME_RELATED_P is then used to indicate if the allocation is frame related.
9219 FINAL_ADJUSTMENT_P indicates whether we are allocating the area below
9220 the saved registers. If we are then we ensure that any allocation
9221 larger than the ABI defined buffer needs a probe so that the
9222 invariant of having a 1KB buffer is maintained.
9224 We emit barriers after each stack adjustment to prevent optimizations from
9225 breaking the invariant that we never drop the stack more than a page. This
9226 invariant is needed to make it easier to correctly handle asynchronous
9227 events, e.g. if we were to allow the stack to be dropped by more than a page
9228 and then have multiple probes up and we take a signal somewhere in between
9229 then the signal handler doesn't know the state of the stack and can make no
9230 assumptions about which pages have been probed.
9232 FORCE_ISA_MODE is AARCH64_ISA_MODE_SM_ON if any variable component of
9233 POLY_SIZE is measured relative to the SME vector length instead of the
9234 current prevailing vector length. It is 0 otherwise. */
9237 aarch64_allocate_and_probe_stack_space (rtx temp1
, rtx temp2
,
9238 poly_int64 poly_size
,
9239 aarch64_isa_mode force_isa_mode
,
9240 bool frame_related_p
,
9241 bool final_adjustment_p
)
9243 aarch64_frame
&frame
= cfun
->machine
->frame
;
9244 HOST_WIDE_INT guard_size
9245 = 1 << param_stack_clash_protection_guard_size
;
9246 HOST_WIDE_INT guard_used_by_caller
= STACK_CLASH_CALLER_GUARD
;
9247 HOST_WIDE_INT byte_sp_alignment
= STACK_BOUNDARY
/ BITS_PER_UNIT
;
9248 gcc_assert (multiple_p (poly_size
, byte_sp_alignment
));
9249 HOST_WIDE_INT min_probe_threshold
9250 = (final_adjustment_p
9251 ? guard_used_by_caller
+ byte_sp_alignment
9252 : guard_size
- guard_used_by_caller
);
9253 poly_int64 frame_size
= frame
.frame_size
;
9255 /* We should always have a positive probe threshold. */
9256 gcc_assert (min_probe_threshold
> 0);
9258 if (flag_stack_clash_protection
&& !final_adjustment_p
)
9260 poly_int64 initial_adjust
= frame
.initial_adjust
;
9261 poly_int64 sve_callee_adjust
= frame
.sve_callee_adjust
;
9262 poly_int64 final_adjust
= frame
.final_adjust
;
9264 if (known_eq (frame_size
, 0))
9266 dump_stack_clash_frame_info (NO_PROBE_NO_FRAME
, false);
9268 else if (known_lt (initial_adjust
+ sve_callee_adjust
,
9269 guard_size
- guard_used_by_caller
)
9270 && known_lt (final_adjust
, guard_used_by_caller
))
9272 dump_stack_clash_frame_info (NO_PROBE_SMALL_FRAME
, true);
9276 /* If SIZE is not large enough to require probing, just adjust the stack and
9278 if (known_lt (poly_size
, min_probe_threshold
)
9279 || !flag_stack_clash_protection
)
9281 aarch64_sub_sp (temp1
, temp2
, poly_size
, force_isa_mode
,
9287 /* Handle the SVE non-constant case first. */
9288 if (!poly_size
.is_constant (&size
))
9292 fprintf (dump_file
, "Stack clash SVE prologue: ");
9293 print_dec (poly_size
, dump_file
);
9294 fprintf (dump_file
, " bytes, dynamic probing will be required.\n");
9297 /* First calculate the amount of bytes we're actually spilling. */
9298 aarch64_add_offset (Pmode
, temp1
, CONST0_RTX (Pmode
),
9299 poly_size
, temp1
, temp2
, force_isa_mode
,
9302 rtx_insn
*insn
= get_last_insn ();
9304 if (frame_related_p
)
9306 /* This is done to provide unwinding information for the stack
9307 adjustments we're about to do, however to prevent the optimizers
9308 from removing the R11 move and leaving the CFA note (which would be
9309 very wrong) we tie the old and new stack pointer together.
9310 The tie will expand to nothing but the optimizers will not touch
9312 rtx stack_ptr_copy
= gen_rtx_REG (Pmode
, STACK_CLASH_SVE_CFA_REGNUM
);
9313 emit_move_insn (stack_ptr_copy
, stack_pointer_rtx
);
9314 aarch64_emit_stack_tie (stack_ptr_copy
);
9316 /* We want the CFA independent of the stack pointer for the
9317 duration of the loop. */
9318 add_reg_note (insn
, REG_CFA_DEF_CFA
, stack_ptr_copy
);
9319 RTX_FRAME_RELATED_P (insn
) = 1;
9322 rtx probe_const
= gen_int_mode (min_probe_threshold
, Pmode
);
9323 rtx guard_const
= gen_int_mode (guard_size
, Pmode
);
9325 insn
= emit_insn (gen_probe_sve_stack_clash (Pmode
, stack_pointer_rtx
,
9326 stack_pointer_rtx
, temp1
,
9327 probe_const
, guard_const
));
9329 /* Now reset the CFA register if needed. */
9330 if (frame_related_p
)
9332 add_reg_note (insn
, REG_CFA_DEF_CFA
,
9333 gen_rtx_PLUS (Pmode
, stack_pointer_rtx
,
9334 gen_int_mode (poly_size
, Pmode
)));
9335 RTX_FRAME_RELATED_P (insn
) = 1;
9343 "Stack clash AArch64 prologue: " HOST_WIDE_INT_PRINT_DEC
9344 " bytes, probing will be required.\n", size
);
9346 /* Round size to the nearest multiple of guard_size, and calculate the
9347 residual as the difference between the original size and the rounded
9349 HOST_WIDE_INT rounded_size
= ROUND_DOWN (size
					  , guard_size);
  HOST_WIDE_INT residual = size - rounded_size;

  /* We can handle a small number of allocations/probes inline.  Otherwise
     punt to a loop.  */
  if (rounded_size <= STACK_CLASH_MAX_UNROLL_PAGES * guard_size)
    {
      for (HOST_WIDE_INT i = 0; i < rounded_size; i += guard_size)
	{
	  aarch64_sub_sp (NULL, temp2, guard_size, force_isa_mode, true);
	  emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
					   guard_used_by_caller));
	  emit_insn (gen_blockage ());
	}
      dump_stack_clash_frame_info (PROBE_INLINE, size != rounded_size);
    }
  else
    {
      /* Compute the ending address.  */
      aarch64_add_offset (Pmode, temp1, stack_pointer_rtx, -rounded_size,
			  temp1, NULL, force_isa_mode, false, true);
      rtx_insn *insn = get_last_insn ();

      /* For the initial allocation, we don't have a frame pointer
	 set up, so we always need CFI notes.  If we're doing the
	 final allocation, then we may have a frame pointer, in which
	 case it is the CFA, otherwise we need CFI notes.

	 We can determine which allocation we are doing by looking at
	 the value of FRAME_RELATED_P since the final allocations are not
	 frame related.  */
      if (frame_related_p)
	{
	  /* We want the CFA independent of the stack pointer for the
	     duration of the loop.  */
	  add_reg_note (insn, REG_CFA_DEF_CFA,
			plus_constant (Pmode, temp1, rounded_size));
	  RTX_FRAME_RELATED_P (insn) = 1;
	}

      /* This allocates and probes the stack.  Note that this re-uses some of
	 the existing Ada stack protection code.  However we are guaranteed not
	 to enter the non loop or residual branches of that code.

	 The non-loop part won't be entered because if our allocation amount
	 doesn't require a loop, the case above would handle it.

	 The residual amount won't be entered because TEMP1 is a multiple of
	 the allocation size.  The residual will always be 0.  As such, the only
	 part we are actually using from that code is the loop setup.  The
	 actual probing is done in aarch64_output_probe_stack_range.  */
      insn = emit_insn (gen_probe_stack_range (stack_pointer_rtx,
					       stack_pointer_rtx, temp1));

      /* Now reset the CFA register if needed.  */
      if (frame_related_p)
	{
	  add_reg_note (insn, REG_CFA_DEF_CFA,
			plus_constant (Pmode, stack_pointer_rtx, rounded_size));
	  RTX_FRAME_RELATED_P (insn) = 1;
	}

      emit_insn (gen_blockage ());
      dump_stack_clash_frame_info (PROBE_LOOP, size != rounded_size);
    }

  /* Handle any residuals.  Residuals of at least MIN_PROBE_THRESHOLD have to
     be probed.  This maintains the requirement that each page is probed at
     least once.  For initial probing we probe only if the allocation is
     more than GUARD_SIZE - buffer, and below the saved registers we probe
     if the amount is larger than buffer.  GUARD_SIZE - buffer + buffer ==
     GUARD_SIZE.  This ensures that, for any allocation large enough to
     trigger a probe here, we emit at least one, and for allocations that are
     too small for this code to emit anything, the page would already have
     been probed by the saving of FP/LR either by this function or any
     callees.  If we don't have any callees then we won't have more stack
     adjustments and so are still safe.  */
  if (residual)
    {
      gcc_assert (guard_used_by_caller + byte_sp_alignment <= size);

      /* If we're doing final adjustments, and we've done any full page
	 allocations then any residual needs to be probed.  */
      if (final_adjustment_p && rounded_size != 0)
	min_probe_threshold = 0;

      aarch64_sub_sp (temp1, temp2, residual, force_isa_mode, frame_related_p);
      if (residual >= min_probe_threshold)
	{
	  if (dump_file)
	    fprintf (dump_file,
		     "Stack clash AArch64 prologue residuals: "
		     HOST_WIDE_INT_PRINT_DEC
		     " bytes, probing will be required.\n", residual);

	  emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
					   guard_used_by_caller));
	  emit_insn (gen_blockage ());
	}
    }
}
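/* For illustration only, a standalone sketch (not part of the build) of how
   an allocation SIZE is split into the probed loop and the residual handled
   above.  The guard size and allocation below are hypothetical example
   values; the real ones come from this function's parameters.  */
#if 0
#include <stdio.h>

int
main (void)
{
  long long guard_size = 64 * 1024;	/* Assumed probing interval.  */
  long long size = 200 * 1024 + 512;	/* Hypothetical allocation.  */
  long long rounded_size = size & -guard_size;
  long long residual = size - rounded_size;

  /* One probe per full guard-sized page, plus possibly one more for the
     residual, mirroring the structure of the code above.  */
  printf ("pages probed in loop: %lld\n", rounded_size / guard_size);
  printf ("residual bytes: %lld\n", residual);
  return 0;
}
#endif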
/* Implement TARGET_EXTRA_LIVE_ON_ENTRY.  */

static void
aarch64_extra_live_on_entry (bitmap regs)
{
  if (TARGET_ZA)
    {
      bitmap_set_bit (regs, LOWERING_REGNUM);
      bitmap_set_bit (regs, SME_STATE_REGNUM);
      bitmap_set_bit (regs, TPIDR2_SETUP_REGNUM);
      bitmap_set_bit (regs, ZA_FREE_REGNUM);
      bitmap_set_bit (regs, ZA_SAVED_REGNUM);

      /* The only time ZA can't have live contents on entry is when
	 the function explicitly treats it as a pure output.  */
      auto za_flags = aarch64_cfun_shared_flags ("za");
      if (za_flags != (AARCH64_STATE_SHARED | AARCH64_STATE_OUT))
	bitmap_set_bit (regs, ZA_REGNUM);

      /* Since ZT0 is call-clobbered, it is only live on input if
	 it is explicitly shared, and is not a pure output.  */
      auto zt0_flags = aarch64_cfun_shared_flags ("zt0");
      if (zt0_flags
	  && zt0_flags != (AARCH64_STATE_SHARED | AARCH64_STATE_OUT))
	bitmap_set_bit (regs, ZT0_REGNUM);
    }
}
/* Return 1 if the register is used by the epilogue.  We need to say the
   return register is used, but only after epilogue generation is complete.
   Note that in the case of sibcalls, the values "used by the epilogue" are
   considered live at the start of the called function.  */

int
aarch64_epilogue_uses (int regno)
{
  if (epilogue_completed)
    {
      if (regno == LR_REGNUM)
	return 1;
    }
  if (regno == LOWERING_REGNUM && TARGET_ZA)
    return 1;
  if (regno == SME_STATE_REGNUM && TARGET_ZA)
    return 1;
  if (regno == TPIDR2_SETUP_REGNUM && TARGET_ZA)
    return 1;
  /* If the function shares SME state with its caller, ensure that that
     data is not in the lazy save buffer on exit.  */
  if (regno == ZA_SAVED_REGNUM && aarch64_cfun_incoming_pstate_za () != 0)
    return 1;
  if (regno == ZA_REGNUM && aarch64_cfun_shared_flags ("za") != 0)
    return 1;
  if (regno == ZT0_REGNUM && aarch64_cfun_shared_flags ("zt0") != 0)
    return 1;
  return 0;
}

/* Implement TARGET_USE_LATE_PROLOGUE_EPILOGUE.  */

static bool
aarch64_use_late_prologue_epilogue ()
{
  return aarch64_cfun_enables_pstate_sm ();
}
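/* For illustration only, a standalone sketch (not part of the build) of the
   "pure output" test used by aarch64_extra_live_on_entry above: the incoming
   contents of ZA only become irrelevant when the state is shared and marked
   as output-only.  The local flag values mirror the AARCH64_STATE_* bits but
   are redefined here so the sketch is self-contained.  */
#if 0
#include <stdbool.h>
#include <stdio.h>

enum { STATE_SHARED = 1u << 0, STATE_IN = 1u << 1, STATE_OUT = 1u << 2 };

static bool
za_live_on_entry (unsigned flags)
{
  /* Everything except a pure "out" sharing agreement keeps ZA live.  */
  return flags != (STATE_SHARED | STATE_OUT);
}

int
main (void)
{
  printf ("shared out-only: %d\n", za_live_on_entry (STATE_SHARED | STATE_OUT));
  printf ("shared in/out:   %d\n",
	  za_live_on_entry (STATE_SHARED | STATE_IN | STATE_OUT));
  return 0;
}
#endif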
/* The current function's frame has a save slot for the incoming state
   of SVCR.  Return a legitimate memory for the slot, based on the hard
   frame pointer.  */

static rtx
aarch64_old_svcr_mem ()
{
  gcc_assert (frame_pointer_needed
	      && known_ge (cfun->machine->frame.old_svcr_offset, 0));
  rtx base = hard_frame_pointer_rtx;
  poly_int64 offset = (0
		       /* hard fp -> bottom of frame.  */
		       - cfun->machine->frame.bytes_below_hard_fp
		       /* bottom of frame -> save slot.  */
		       + cfun->machine->frame.old_svcr_offset);
  return gen_frame_mem (DImode, plus_constant (Pmode, base, offset));
}

/* The current function's frame has a save slot for the incoming state
   of SVCR.  Load the slot into register REGNO and return the register.  */

static rtx
aarch64_read_old_svcr (unsigned int regno)
{
  rtx svcr = gen_rtx_REG (DImode, regno);
  emit_move_insn (svcr, aarch64_old_svcr_mem ());
  return svcr;
}

/* Like the rtx version of aarch64_guard_switch_pstate_sm, but first
   load the incoming value of SVCR from its save slot into temporary
   register REGNO.  */

static rtx_insn *
aarch64_guard_switch_pstate_sm (unsigned int regno,
				aarch64_isa_mode local_mode)
{
  rtx old_svcr = aarch64_read_old_svcr (regno);
  return aarch64_guard_switch_pstate_sm (old_svcr, local_mode);
}
/* AArch64 stack frames generated by this compiler look like:

	+-------------------------------+
	|                               |
	|  incoming stack arguments     |
	|                               |
	+-------------------------------+
	|                               | <-- incoming stack pointer (aligned)
	|  callee-allocated save area   |
	|  for register varargs         |
	|                               |
	+-------------------------------+
	|  local variables (1)          | <-- frame_pointer_rtx
	|                               |
	+-------------------------------+
	|  padding (1)                  |
	+-------------------------------+
	|  callee-saved registers       |
	+-------------------------------+
	|  LR'                          |
	+-------------------------------+
	|  FP'                          |
	+-------------------------------+ <-- hard_frame_pointer_rtx (aligned)
	|  SVE vector registers         |
	+-------------------------------+
	|  SVE predicate registers      |
	+-------------------------------+
	|  local variables (2)          |
	+-------------------------------+
	|  padding (2)                  |
	+-------------------------------+
	|  dynamic allocation           |
	+-------------------------------+
	|  padding                      |
	+-------------------------------+
	|  outgoing stack arguments     | <-- arg_pointer
	|                               |
	+-------------------------------+
	|                               | <-- stack_pointer_rtx (aligned)

   The regions marked (1) and (2) are mutually exclusive.  (2) is used
   when aarch64_save_regs_above_locals_p is true.

   Dynamic stack allocations via alloca() decrease stack_pointer_rtx
   but leave frame_pointer_rtx and hard_frame_pointer_rtx unchanged.

   By default for stack-clash we assume the guard is at least 64KB, but this
   value is configurable to either 4KB or 64KB.  We also force the guard size
   to be the same as the probing interval and both values are kept in sync.

   With those assumptions the callee can allocate up to 63KB (or 3KB depending
   on the guard size) of stack space without probing.

   When probing is needed, we emit a probe at the start of the prologue
   and every PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE bytes thereafter.

   We can also use register saves as probes.  These are stored in
   sve_save_and_probe and hard_fp_save_and_probe.

   For outgoing arguments we probe if the size is larger than 1KB, such that
   the ABI specified buffer is maintained for the next callee.

   The following registers are reserved during frame layout and should not be
   used for any other purpose:

   - r11: Used by stack clash protection when SVE is enabled, and also
	  as an anchor register when saving and restoring registers
   - r12(EP0) and r13(EP1): Used as temporaries for stack adjustment.
   - r14 and r15: Used for speculation tracking.
   - r16(IP0), r17(IP1): Used by indirect tailcalls.
   - r30(LR), r29(FP): Used by standard frame layout.

   These registers must be avoided in frame layout related code unless the
   explicit intention is to interact with one of the features listed above.  */
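/* For illustration only, a standalone sketch (not part of the build) of the
   adjustment bookkeeping used by the prologue below: the total frame size is
   consumed by the four stack adjustments in order, and the bytes still below
   the stack pointer after each step must end up equal to the final
   adjustment, mirroring the assertion in aarch64_expand_prologue.  The
   numbers are hypothetical.  */
#if 0
#include <assert.h>

int
main (void)
{
  long long frame_size = 4096;
  long long initial_adjust = 2048;
  long long callee_adjust = 96;		/* Pushed with the frame record.  */
  long long sve_callee_adjust = 0;
  long long final_adjust = frame_size - initial_adjust - callee_adjust
			   - sve_callee_adjust;

  long long bytes_below_sp = frame_size - initial_adjust - callee_adjust;
  bytes_below_sp -= sve_callee_adjust;
  assert (bytes_below_sp == final_adjust);
  return 0;
}
#endif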
/* Generate the prologue instructions for entry into a function.
   Establish the stack frame by decreasing the stack pointer with a
   properly calculated size and, if necessary, create a frame record
   filled with the values of LR and previous frame pointer.  The
   current FP is also set up if it is in use.  */
void
aarch64_expand_prologue (void)
{
  aarch64_frame &frame = cfun->machine->frame;
  poly_int64 frame_size = frame.frame_size;
  poly_int64 initial_adjust = frame.initial_adjust;
  HOST_WIDE_INT callee_adjust = frame.callee_adjust;
  poly_int64 final_adjust = frame.final_adjust;
  poly_int64 sve_callee_adjust = frame.sve_callee_adjust;
  unsigned reg1 = frame.wb_push_candidate1;
  unsigned reg2 = frame.wb_push_candidate2;
  bool emit_frame_chain = frame.emit_frame_chain;
  rtx_insn *insn;
  aarch64_isa_mode force_isa_mode = 0;
  if (aarch64_cfun_enables_pstate_sm ())
    force_isa_mode = AARCH64_ISA_MODE_SM_ON;

  if (flag_stack_clash_protection
      && known_eq (callee_adjust, 0)
      && known_lt (frame.reg_offset[VG_REGNUM], 0))
    {
      /* Fold the SVE allocation into the initial allocation.
	 We don't do this in aarch64_layout_arg to avoid pessimizing
	 the epilogue code.  */
      initial_adjust += sve_callee_adjust;
      sve_callee_adjust = 0;
    }

  /* Sign return address for functions.  */
  if (aarch64_return_address_signing_enabled ())
    {
      switch (aarch64_ra_sign_key)
	{
	case AARCH64_KEY_A:
	  insn = emit_insn (gen_paciasp ());
	  break;
	case AARCH64_KEY_B:
	  insn = emit_insn (gen_pacibsp ());
	  break;
	default:
	  gcc_unreachable ();
	}
      add_reg_note (insn, REG_CFA_NEGATE_RA_STATE, const0_rtx);
      RTX_FRAME_RELATED_P (insn) = 1;
    }

  /* Push return address to shadow call stack.  */
  if (frame.is_scs_enabled)
    emit_insn (gen_scs_push ());

  if (flag_stack_usage_info)
    current_function_static_stack_size = constant_lower_bound (frame_size);

  if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
    {
      if (crtl->is_leaf && !cfun->calls_alloca)
	{
	  if (maybe_gt (frame_size, PROBE_INTERVAL)
	      && maybe_gt (frame_size, get_stack_check_protect ()))
	    aarch64_emit_probe_stack_range (get_stack_check_protect (),
					    (frame_size
					     - get_stack_check_protect ()));
	}
      else if (maybe_gt (frame_size, 0))
	aarch64_emit_probe_stack_range (get_stack_check_protect (), frame_size);
    }

  rtx tmp0_rtx = gen_rtx_REG (Pmode, EP0_REGNUM);
  rtx tmp1_rtx = gen_rtx_REG (Pmode, EP1_REGNUM);

  /* In theory we should never have both an initial adjustment
     and a callee save adjustment.  Verify that is the case since the
     code below does not handle it for -fstack-clash-protection.  */
  gcc_assert (known_eq (initial_adjust, 0) || callee_adjust == 0);

  /* Will only probe if the initial adjustment is larger than the guard
     less the amount of the guard reserved for use by the caller's
     outgoing args.  */
  aarch64_allocate_and_probe_stack_space (tmp0_rtx, tmp1_rtx, initial_adjust,
					  force_isa_mode, true, false);

  if (callee_adjust != 0)
    aarch64_push_regs (reg1, reg2, callee_adjust);

  /* The offset of the current SP from the bottom of the static frame.  */
  poly_int64 bytes_below_sp = frame_size - initial_adjust - callee_adjust;

  if (emit_frame_chain)
    {
      /* The offset of the frame chain record (if any) from the current SP.  */
      poly_int64 chain_offset = (initial_adjust + callee_adjust
				 - frame.bytes_above_hard_fp);
      gcc_assert (known_ge (chain_offset, 0));

      gcc_assert (reg1 == R29_REGNUM && reg2 == R30_REGNUM);
      if (callee_adjust == 0)
	aarch64_save_callee_saves (bytes_below_sp, frame.saved_gprs,
				   false, false);
      else
	gcc_assert (known_eq (chain_offset, 0));
      aarch64_add_offset (Pmode, hard_frame_pointer_rtx,
			  stack_pointer_rtx, chain_offset,
			  tmp1_rtx, tmp0_rtx, force_isa_mode,
			  frame_pointer_needed);
      if (frame_pointer_needed && !frame_size.is_constant ())
	{
	  /* Variable-sized frames need to describe the save slot
	     address using DW_CFA_expression rather than DW_CFA_offset.
	     This means that, without taking further action, the
	     locations of the registers that we've already saved would
	     remain based on the stack pointer even after we redefine
	     the CFA based on the frame pointer.  We therefore need new
	     DW_CFA_expressions to re-express the save slots with addresses
	     based on the frame pointer.  */
	  rtx_insn *insn = get_last_insn ();
	  gcc_assert (RTX_FRAME_RELATED_P (insn));

	  /* Add an explicit CFA definition if this was previously
	     implicit.  */
	  if (!find_reg_note (insn, REG_CFA_ADJUST_CFA, NULL_RTX))
	    {
	      rtx src = plus_constant (Pmode, stack_pointer_rtx, chain_offset);
	      add_reg_note (insn, REG_CFA_ADJUST_CFA,
			    gen_rtx_SET (hard_frame_pointer_rtx, src));
	    }

	  /* Change the save slot expressions for the registers that
	     we've already saved.  */
	  aarch64_add_cfa_expression (insn, regno_reg_rtx[reg2],
				      hard_frame_pointer_rtx, UNITS_PER_WORD);
	  aarch64_add_cfa_expression (insn, regno_reg_rtx[reg1],
				      hard_frame_pointer_rtx, 0);
	}
      aarch64_emit_stack_tie (hard_frame_pointer_rtx);
    }

  aarch64_save_callee_saves (bytes_below_sp, frame.saved_gprs, true,
			     emit_frame_chain);
  if (maybe_ge (frame.reg_offset[VG_REGNUM], 0))
    {
      unsigned int saved_regs[] = { VG_REGNUM };
      aarch64_save_callee_saves (bytes_below_sp, saved_regs, true,
				 emit_frame_chain);
    }
  if (maybe_ne (sve_callee_adjust, 0))
    {
      gcc_assert (!flag_stack_clash_protection
		  || known_eq (initial_adjust, 0)
		  /* The VG save isn't shrink-wrapped and so serves as
		     a probe of the initial allocation.  */
		  || known_eq (frame.reg_offset[VG_REGNUM], bytes_below_sp));
      aarch64_allocate_and_probe_stack_space (tmp1_rtx, tmp0_rtx,
					      sve_callee_adjust,
					      force_isa_mode,
					      !frame_pointer_needed, false);
      bytes_below_sp -= sve_callee_adjust;
    }
  aarch64_save_callee_saves (bytes_below_sp, frame.saved_prs, true,
			     emit_frame_chain);
  aarch64_save_callee_saves (bytes_below_sp, frame.saved_fprs, true,
			     emit_frame_chain);

  /* We may need to probe the final adjustment if it is larger than the guard
     that is assumed by the callee.  */
  gcc_assert (known_eq (bytes_below_sp, final_adjust));
  aarch64_allocate_and_probe_stack_space (tmp1_rtx, tmp0_rtx, final_adjust,
					  force_isa_mode,
					  !frame_pointer_needed, true);
  if (emit_frame_chain && maybe_ne (final_adjust, 0))
    aarch64_emit_stack_tie (hard_frame_pointer_rtx);

  /* Save the incoming value of PSTATE.SM, if required.  Code further
     down does this for locally-streaming functions.  */
  if (known_ge (frame.old_svcr_offset, 0)
      && !aarch64_cfun_enables_pstate_sm ())
    {
      rtx mem = aarch64_old_svcr_mem ();
      MEM_VOLATILE_P (mem) = 1;
      if (TARGET_SME)
	{
	  rtx reg = gen_rtx_REG (DImode, IP0_REGNUM);
	  emit_insn (gen_aarch64_read_svcr (reg));
	  emit_move_insn (mem, reg);
	}
      else
	{
	  rtx old_r0 = NULL_RTX, old_r1 = NULL_RTX;
	  auto &args = crtl->args.info;
	  if (args.aapcs_ncrn > 0)
	    {
	      old_r0 = gen_rtx_REG (DImode, PROBE_STACK_FIRST_REGNUM);
	      emit_move_insn (old_r0, gen_rtx_REG (DImode, R0_REGNUM));
	    }
	  if (args.aapcs_ncrn > 1)
	    {
	      old_r1 = gen_rtx_REG (DImode, PROBE_STACK_SECOND_REGNUM);
	      emit_move_insn (old_r1, gen_rtx_REG (DImode, R1_REGNUM));
	    }
	  emit_insn (gen_aarch64_get_sme_state ());
	  emit_move_insn (mem, gen_rtx_REG (DImode, R0_REGNUM));
	  if (old_r0)
	    emit_move_insn (gen_rtx_REG (DImode, R0_REGNUM), old_r0);
	  if (old_r1)
	    emit_move_insn (gen_rtx_REG (DImode, R1_REGNUM), old_r1);
	}
    }

  /* Enable PSTATE.SM, if required.  */
  if (aarch64_cfun_enables_pstate_sm ())
    {
      rtx_insn *guard_label = nullptr;
      if (known_ge (cfun->machine->frame.old_svcr_offset, 0))
	{
	  /* The current function is streaming-compatible.  Save the
	     original state of PSTATE.SM.  */
	  rtx svcr = gen_rtx_REG (DImode, IP0_REGNUM);
	  emit_insn (gen_aarch64_read_svcr (svcr));
	  emit_move_insn (aarch64_old_svcr_mem (), svcr);
	  guard_label = aarch64_guard_switch_pstate_sm (svcr,
							AARCH64_ISA_MODE);
	}
      aarch64_sme_mode_switch_regs args_switch;
      auto &args = crtl->args.info;
      for (unsigned int i = 0; i < args.num_sme_mode_switch_args; ++i)
	{
	  rtx x = args.sme_mode_switch_args[i];
	  args_switch.add_reg (GET_MODE (x), REGNO (x));
	}
      args_switch.emit_prologue ();
      emit_insn (gen_aarch64_smstart_sm ());
      args_switch.emit_epilogue ();
      if (guard_label)
	emit_label (guard_label);
    }
}
/* Return TRUE if we can use a simple_return insn.

   This function checks whether the callee-saved stack is empty, which
   means no restore actions are needed.  The pro_and_epilogue pass will use
   this to check whether shrink-wrapping is feasible.  */

bool
aarch64_use_return_insn_p (void)
{
  if (!reload_completed)
    return false;

  if (crtl->profile)
    return false;

  return known_eq (cfun->machine->frame.frame_size, 0);
}
/* Generate the epilogue instructions for returning from a function.
   This is almost exactly the reverse of the prologue sequence, except
   that we need to insert barriers to avoid scheduling loads that read
   from a deallocated stack, and we optimize the unwind records by
   emitting them all together if possible.  */

void
aarch64_expand_epilogue (rtx_call_insn *sibcall)
{
  aarch64_frame &frame = cfun->machine->frame;
  poly_int64 initial_adjust = frame.initial_adjust;
  HOST_WIDE_INT callee_adjust = frame.callee_adjust;
  poly_int64 final_adjust = frame.final_adjust;
  poly_int64 sve_callee_adjust = frame.sve_callee_adjust;
  poly_int64 bytes_below_hard_fp = frame.bytes_below_hard_fp;
  unsigned reg1 = frame.wb_pop_candidate1;
  unsigned reg2 = frame.wb_pop_candidate2;
  rtx cfi_ops = NULL;
  rtx_insn *insn;
  /* A stack clash protection prologue may not have left EP0_REGNUM or
     EP1_REGNUM in a usable state.  The same is true for allocations
     with an SVE component, since we then need both temporary registers
     for each allocation.  For stack clash we are in a usable state if
     the adjustment is less than GUARD_SIZE - GUARD_USED_BY_CALLER.  */
  HOST_WIDE_INT guard_size
    = 1 << param_stack_clash_protection_guard_size;
  HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD;
  aarch64_isa_mode force_isa_mode = 0;
  if (aarch64_cfun_enables_pstate_sm ())
    force_isa_mode = AARCH64_ISA_MODE_SM_ON;

  /* We can re-use the registers when:

     (a) the deallocation amount is the same as the corresponding
	 allocation amount (which is false if we combine the initial
	 and SVE callee save allocations in the prologue); and

     (b) the allocation amount doesn't need a probe (which is false
	 if the amount is guard_size - guard_used_by_caller or greater).

     In such situations the register should remain live with the correct
     value.  */
  bool can_inherit_p = (initial_adjust.is_constant ()
			&& final_adjust.is_constant ()
			&& (!flag_stack_clash_protection
			    || (known_lt (initial_adjust,
					  guard_size - guard_used_by_caller)
				&& known_eq (sve_callee_adjust, 0))));

  /* We need to add a memory barrier to prevent reads from the deallocated
     stack.  */
  bool need_barrier_p
    = maybe_ne (get_frame_size ()
		+ frame.saved_varargs_size, 0);

  /* Reset PSTATE.SM, if required.  */
  if (aarch64_cfun_enables_pstate_sm ())
    {
      rtx_insn *guard_label = nullptr;
      if (known_ge (cfun->machine->frame.old_svcr_offset, 0))
	guard_label = aarch64_guard_switch_pstate_sm (IP0_REGNUM,
						      AARCH64_ISA_MODE);
      aarch64_sme_mode_switch_regs return_switch;
      if (sibcall)
	return_switch.add_call_args (sibcall);
      else if (crtl->return_rtx && REG_P (crtl->return_rtx))
	return_switch.add_reg (GET_MODE (crtl->return_rtx),
			       REGNO (crtl->return_rtx));
      return_switch.emit_prologue ();
      emit_insn (gen_aarch64_smstop_sm ());
      return_switch.emit_epilogue ();
      if (guard_label)
	emit_label (guard_label);
    }

  /* Emit a barrier to prevent loads from a deallocated stack.  */
  if (maybe_gt (final_adjust, crtl->outgoing_args_size)
      || cfun->calls_alloca
      || crtl->calls_eh_return)
    {
      aarch64_emit_stack_tie (stack_pointer_rtx);
      need_barrier_p = false;
    }

  /* Restore the stack pointer from the frame pointer if it may not
     be the same as the stack pointer.  */
  rtx tmp0_rtx = gen_rtx_REG (Pmode, EP0_REGNUM);
  rtx tmp1_rtx = gen_rtx_REG (Pmode, EP1_REGNUM);
  if (frame_pointer_needed
      && (maybe_ne (final_adjust, 0) || cfun->calls_alloca))
    /* If writeback is used when restoring callee-saves, the CFA
       is restored on the instruction doing the writeback.  */
    aarch64_add_offset (Pmode, stack_pointer_rtx,
			hard_frame_pointer_rtx,
			-bytes_below_hard_fp + final_adjust,
			tmp1_rtx, tmp0_rtx, force_isa_mode,
			callee_adjust == 0);
  else
    /* The case where we need to re-use the register here is very rare, so
       avoid the complicated condition and just always emit a move if the
       immediate doesn't fit.  */
    aarch64_add_sp (tmp1_rtx, tmp0_rtx, final_adjust, force_isa_mode, true);

  /* Restore the vector registers before the predicate registers,
     so that we can use P4 as a temporary for big-endian SVE frames.  */
  aarch64_restore_callee_saves (final_adjust, frame.saved_fprs, &cfi_ops);
  aarch64_restore_callee_saves (final_adjust, frame.saved_prs, &cfi_ops);
  if (maybe_ne (sve_callee_adjust, 0))
    aarch64_add_sp (NULL_RTX, NULL_RTX, sve_callee_adjust,
		    force_isa_mode, true);

  /* When the shadow call stack is enabled, the scs_pop in the epilogue will
     restore x30, so we don't need to restore x30 again in the traditional
     way.  */
  aarch64_restore_callee_saves (final_adjust + sve_callee_adjust,
				frame.saved_gprs, &cfi_ops);

  if (need_barrier_p)
    aarch64_emit_stack_tie (stack_pointer_rtx);

  if (callee_adjust != 0)
    aarch64_pop_regs (reg1, reg2, callee_adjust, &cfi_ops);

  /* If we have no register restore information, the CFA must have been
     defined in terms of the stack pointer since the end of the prologue.  */
  gcc_assert (cfi_ops || !frame_pointer_needed);

  if (cfi_ops && (callee_adjust != 0 || maybe_gt (initial_adjust, 65536)))
    {
      /* Emit delayed restores and set the CFA to be SP + initial_adjust.  */
      insn = get_last_insn ();
      rtx new_cfa = plus_constant (Pmode, stack_pointer_rtx, initial_adjust);
      REG_NOTES (insn) = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops);
      RTX_FRAME_RELATED_P (insn) = 1;
      cfi_ops = NULL;
    }

  /* Liveness of EP0_REGNUM cannot be trusted across function calls either, so
     restrict the emit_move optimization to leaf functions.  */
  aarch64_add_sp (tmp0_rtx, tmp1_rtx, initial_adjust, force_isa_mode,
		  (!can_inherit_p || !crtl->is_leaf
		   || df_regs_ever_live_p (EP0_REGNUM)));

  if (cfi_ops)
    {
      /* Emit delayed restores and reset the CFA to be SP.  */
      insn = get_last_insn ();
      cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, stack_pointer_rtx, cfi_ops);
      REG_NOTES (insn) = cfi_ops;
      RTX_FRAME_RELATED_P (insn) = 1;
    }

  /* Pop return address from shadow call stack.  */
  if (frame.is_scs_enabled)
    {
      machine_mode mode = aarch64_reg_save_mode (R30_REGNUM);
      rtx reg = gen_rtx_REG (mode, R30_REGNUM);

      insn = emit_insn (gen_scs_pop ());
      add_reg_note (insn, REG_CFA_RESTORE, reg);
      RTX_FRAME_RELATED_P (insn) = 1;
    }

  /* Stack adjustment for exception handler.  */
  if (crtl->calls_eh_return && !sibcall)
    {
      /* If the EH_RETURN_TAKEN_RTX flag is set then we need
	 to unwind the stack and jump to the handler, otherwise
	 skip this eh_return logic and continue with normal
	 return after the label.  We have already reset the CFA
	 to be SP; letting the CFA move during this adjustment
	 is just as correct as retaining the CFA from the body
	 of the function.  Therefore, do nothing special.  */
      rtx_code_label *label = gen_label_rtx ();
      rtx x = aarch64_gen_compare_zero_and_branch (EQ, EH_RETURN_TAKEN_RTX,
						   label);
      rtx jump = emit_jump_insn (x);
      JUMP_LABEL (jump) = label;
      LABEL_NUSES (label)++;
      emit_insn (gen_add2_insn (stack_pointer_rtx,
				EH_RETURN_STACKADJ_RTX));
      emit_jump_insn (gen_indirect_jump (EH_RETURN_HANDLER_RTX));
      emit_barrier ();
      emit_label (label);
    }

  /* We prefer to emit the combined return/authenticate instruction RETAA,
     however there are two cases in which we must instead emit an explicit
     authentication instruction.

	1) Sibcalls don't return in a normal way, so if we're about to call one
	   we must authenticate.

	2) The RETAA instruction is not available before ARMv8.3-A, so if we are
	   generating code for !TARGET_ARMV8_3 we can't use it and must
	   explicitly authenticate.
    */
  if (aarch64_return_address_signing_enabled ()
      && (sibcall || !TARGET_ARMV8_3))
    {
      switch (aarch64_ra_sign_key)
	{
	case AARCH64_KEY_A:
	  insn = emit_insn (gen_autiasp ());
	  break;
	case AARCH64_KEY_B:
	  insn = emit_insn (gen_autibsp ());
	  break;
	default:
	  gcc_unreachable ();
	}
      add_reg_note (insn, REG_CFA_NEGATE_RA_STATE, const0_rtx);
      RTX_FRAME_RELATED_P (insn) = 1;
    }

  emit_use (gen_rtx_REG (DImode, LR_REGNUM));
  if (!sibcall)
    emit_jump_insn (ret_rtx);
}
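/* For illustration only, a standalone sketch (not part of the build) of the
   decision made at the end of the epilogue above: a combined RETAA/RETAB can
   only be used for a normal return on targets that provide it; otherwise an
   explicit AUTIASP/AUTIBSP must be emitted before the return or sibcall.  */
#if 0
#include <stdbool.h>

static bool
need_explicit_authentication (bool signing_enabled, bool sibcall,
			      bool have_armv8_3)
{
  return signing_enabled && (sibcall || !have_armv8_3);
}
#endif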
/* Output code to add DELTA to the first argument, and then jump
   to FUNCTION.  Used for C++ multiple inheritance.  */

static void
aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
			 HOST_WIDE_INT delta,
			 HOST_WIDE_INT vcall_offset,
			 tree function)
{
  /* The this pointer is always in x0.  Note that this differs from
     Arm where the this pointer may be bumped to r1 if r0 is required
     to return a pointer to an aggregate.  On AArch64 a result value
     pointer will be in x8.  */
  int this_regno = R0_REGNUM;
  rtx this_rtx, temp0, temp1, addr, funexp;
  rtx_insn *insn;
  const char *fnname = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (thunk));

  if (aarch_bti_enabled ())
    emit_insn (gen_bti_c ());

  reload_completed = 1;
  emit_note (NOTE_INSN_PROLOGUE_END);

  this_rtx = gen_rtx_REG (Pmode, this_regno);
  temp0 = gen_rtx_REG (Pmode, EP0_REGNUM);
  temp1 = gen_rtx_REG (Pmode, EP1_REGNUM);

  if (vcall_offset == 0)
    aarch64_add_offset (Pmode, this_rtx, this_rtx, delta, temp1, temp0,
			0, false);
  else
    {
      gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);

      addr = this_rtx;
      if (delta != 0)
	{
	  if (delta >= -256 && delta < 256)
	    addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx,
				       plus_constant (Pmode, this_rtx, delta));
	  else
	    aarch64_add_offset (Pmode, this_rtx, this_rtx, delta,
				temp1, temp0, 0, false);
	}

      if (Pmode == ptr_mode)
	aarch64_emit_move (temp0, gen_rtx_MEM (ptr_mode, addr));
      else
	aarch64_emit_move (temp0,
			   gen_rtx_ZERO_EXTEND (Pmode,
						gen_rtx_MEM (ptr_mode, addr)));

      if (vcall_offset >= -256 && vcall_offset < 4096 * POINTER_BYTES)
	addr = plus_constant (Pmode, temp0, vcall_offset);
      else
	{
	  aarch64_internal_mov_immediate (temp1, GEN_INT (vcall_offset), true,
					  Pmode);
	  addr = gen_rtx_PLUS (Pmode, temp0, temp1);
	}

      if (Pmode == ptr_mode)
	aarch64_emit_move (temp1, gen_rtx_MEM (ptr_mode, addr));
      else
	aarch64_emit_move (temp1,
			   gen_rtx_SIGN_EXTEND (Pmode,
						gen_rtx_MEM (ptr_mode, addr)));

      emit_insn (gen_add2_insn (this_rtx, temp1));
    }

  /* Generate a tail call to the target function.  */
  if (!TREE_USED (function))
    {
      assemble_external (function);
      TREE_USED (function) = 1;
    }
  funexp = XEXP (DECL_RTL (function), 0);
  funexp = gen_rtx_MEM (FUNCTION_MODE, funexp);
  auto isa_mode = aarch64_fntype_isa_mode (TREE_TYPE (function));
  auto pcs_variant = arm_pcs (fndecl_abi (function).id ());
  rtx callee_abi = aarch64_gen_callee_cookie (isa_mode, pcs_variant);
  insn = emit_call_insn (gen_sibcall (funexp, const0_rtx, callee_abi));
  SIBLING_CALL_P (insn) = 1;

  insn = get_insns ();
  shorten_branches (insn);

  assemble_start_function (thunk, fnname);
  final_start_function (insn, file, 1);
  final (insn, file, 1);
  final_end_function ();
  assemble_end_function (thunk, fnname);

  /* Stop pretending to be a post-reload pass.  */
  reload_completed = 0;
}
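/* For illustration only, a standalone sketch (not part of the build) of the
   pointer adjustment that the thunk generated above performs on its first
   argument: add DELTA, and if VCALL_OFFSET is nonzero also add the value
   loaded from that offset in the adjusted object's vtable.  */
#if 0
static void *
thunk_adjust (void *this_ptr, long delta, long vcall_offset)
{
  char *p = (char *) this_ptr + delta;
  if (vcall_offset != 0)
    {
      /* The first word of the adjusted object points to its vtable.  */
      char *vtable = *(char **) p;
      p += *(long *) (vtable + vcall_offset);
    }
  return p;
}
#endif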
/* Return true if X contains a reference to a thread-local symbol.  */

static bool
aarch64_tls_referenced_p (rtx x)
{
  if (!TARGET_HAVE_TLS)
    return false;
  subrtx_iterator::array_type array;
  FOR_EACH_SUBRTX (iter, array, x, ALL)
    {
      const_rtx x = *iter;
      if (SYMBOL_REF_P (x) && SYMBOL_REF_TLS_MODEL (x) != 0)
	return true;
      /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
	 TLS offsets, not real symbol references.  */
      if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
	iter.skip_subrtxes ();
    }
  return false;
}
/* Implement TARGET_CANNOT_FORCE_CONST_MEM.  */

static bool
aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
{
  if (GET_CODE (x) == HIGH)
    return true;

  /* There's no way to calculate VL-based values using relocations.  */
  subrtx_iterator::array_type array;
  HOST_WIDE_INT factor;
  FOR_EACH_SUBRTX (iter, array, x, ALL)
    if (GET_CODE (*iter) == CONST_POLY_INT
	|| aarch64_sme_vq_unspec_p (x, &factor))
      return true;

  poly_int64 offset;
  rtx base = strip_offset_and_salt (x, &offset);
  if (SYMBOL_REF_P (base) || LABEL_REF_P (base))
    {
      /* We checked for POLY_INT_CST offsets above.  */
      if (aarch64_classify_symbol (base, offset.to_constant ())
	  != SYMBOL_FORCE_TO_MEM)
	return true;
      else
	/* Avoid generating a 64-bit relocation in ILP32; leave
	   to aarch64_expand_mov_immediate to handle it properly.  */
	return mode != ptr_mode;
    }

  return aarch64_tls_referenced_p (x);
}
/* Implement TARGET_CASE_VALUES_THRESHOLD.
   The expansion for a table switch is quite expensive due to the number
   of instructions, the table lookup and the hard-to-predict indirect jump.
   When optimizing for speed, and -O3 enabled, use the per-core tuning if
   set, otherwise use tables for >= 11 cases as a tradeoff between size and
   performance.  When optimizing for size, use 8 for smallest codesize.  */

static unsigned int
aarch64_case_values_threshold (void)
{
  /* Use the specified limit for the number of cases before using jump
     tables at higher optimization levels.  */
  if (optimize > 2
      && aarch64_tune_params.max_case_values != 0)
    return aarch64_tune_params.max_case_values;
  else
    return optimize_size ? 8 : 11;
}
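/* For illustration only, a standalone sketch (not part of the build) of the
   threshold selection above: with the defaults, a switch needs at least 11
   cases before a jump table is considered when optimizing for speed, and at
   least 8 when optimizing for size, unless the per-core tuning overrides it
   at -O3 and above.  */
#if 0
static unsigned int
example_case_values_threshold (int optimize_level, int optimize_size_p,
			       unsigned int tuned_max)
{
  if (optimize_level > 2 && tuned_max != 0)
    return tuned_max;
  return optimize_size_p ? 8 : 11;
}
#endif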
/* Return true if register REGNO is a valid index register.
   STRICT_P is true if REG_OK_STRICT is in effect.  */

bool
aarch64_regno_ok_for_index_p (int regno, bool strict_p)
{
  if (!HARD_REGISTER_NUM_P (regno))
    {
      if (!strict_p)
	return true;

      if (!reg_renumber)
	return false;

      regno = reg_renumber[regno];
    }
  return GP_REGNUM_P (regno);
}

/* Return true if register REGNO is a valid base register for mode MODE.
   STRICT_P is true if REG_OK_STRICT is in effect.  */

bool
aarch64_regno_ok_for_base_p (int regno, bool strict_p)
{
  if (!HARD_REGISTER_NUM_P (regno))
    {
      if (!strict_p)
	return true;

      if (!reg_renumber)
	return false;

      regno = reg_renumber[regno];
    }

  /* The fake registers will be eliminated to either the stack or
     hard frame pointer, both of which are usually valid base registers.
     Reload deals with the cases where the eliminated form isn't valid.  */
  return (GP_REGNUM_P (regno)
	  || regno == SP_REGNUM
	  || regno == FRAME_POINTER_REGNUM
	  || regno == ARG_POINTER_REGNUM);
}

/* Return true if X is a valid base register for mode MODE.
   STRICT_P is true if REG_OK_STRICT is in effect.  */

static bool
aarch64_base_register_rtx_p (rtx x, bool strict_p)
{
  if (!strict_p
      && SUBREG_P (x)
      && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (x))])
    x = SUBREG_REG (x);

  return (REG_P (x) && aarch64_regno_ok_for_base_p (REGNO (x), strict_p));
}
/* Return true if address offset is a valid index.  If it is, fill in INFO
   appropriately.  STRICT_P is true if REG_OK_STRICT is in effect.  */

static bool
aarch64_classify_index (struct aarch64_address_info *info, rtx x,
			machine_mode mode, bool strict_p)
{
  enum aarch64_address_type type;
  rtx index;
  int shift;

  /* (reg:P) */
  if ((REG_P (x) || SUBREG_P (x))
      && GET_MODE (x) == Pmode)
    {
      type = ADDRESS_REG_REG;
      index = x;
      shift = 0;
    }
  /* (sign_extend:DI (reg:SI)) */
  else if ((GET_CODE (x) == SIGN_EXTEND
	    || GET_CODE (x) == ZERO_EXTEND)
	   && GET_MODE (x) == DImode
	   && GET_MODE (XEXP (x, 0)) == SImode)
    {
      type = (GET_CODE (x) == SIGN_EXTEND)
	? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
      index = XEXP (x, 0);
      shift = 0;
    }
  /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
  else if (GET_CODE (x) == MULT
	   && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
	       || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
	   && GET_MODE (XEXP (x, 0)) == DImode
	   && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
	   && CONST_INT_P (XEXP (x, 1)))
    {
      type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
	? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
      index = XEXP (XEXP (x, 0), 0);
      shift = exact_log2 (INTVAL (XEXP (x, 1)));
    }
  /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
  else if (GET_CODE (x) == ASHIFT
	   && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
	       || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
	   && GET_MODE (XEXP (x, 0)) == DImode
	   && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
	   && CONST_INT_P (XEXP (x, 1)))
    {
      type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
	? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
      index = XEXP (XEXP (x, 0), 0);
      shift = INTVAL (XEXP (x, 1));
    }
  /* (and:DI (mult:DI (reg:DI) (const_int scale))
     (const_int 0xffffffff<<shift)) */
  else if (GET_CODE (x) == AND
	   && GET_MODE (x) == DImode
	   && GET_CODE (XEXP (x, 0)) == MULT
	   && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
	   && CONST_INT_P (XEXP (XEXP (x, 0), 1))
	   && CONST_INT_P (XEXP (x, 1)))
    {
      type = ADDRESS_REG_UXTW;
      index = XEXP (XEXP (x, 0), 0);
      shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
      /* Avoid undefined code dealing with shift being -1.  */
      if (shift != -1
	  && INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
	shift = -1;
    }
  /* (and:DI (ashift:DI (reg:DI) (const_int shift))
     (const_int 0xffffffff<<shift)) */
  else if (GET_CODE (x) == AND
	   && GET_MODE (x) == DImode
	   && GET_CODE (XEXP (x, 0)) == ASHIFT
	   && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
	   && CONST_INT_P (XEXP (XEXP (x, 0), 1))
	   && CONST_INT_P (XEXP (x, 1)))
    {
      type = ADDRESS_REG_UXTW;
      index = XEXP (XEXP (x, 0), 0);
      shift = INTVAL (XEXP (XEXP (x, 0), 1));
      if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
	shift = -1;
    }
  /* (mult:P (reg:P) (const_int scale)) */
  else if (GET_CODE (x) == MULT
	   && GET_MODE (x) == Pmode
	   && GET_MODE (XEXP (x, 0)) == Pmode
	   && CONST_INT_P (XEXP (x, 1)))
    {
      type = ADDRESS_REG_REG;
      index = XEXP (x, 0);
      shift = exact_log2 (INTVAL (XEXP (x, 1)));
    }
  /* (ashift:P (reg:P) (const_int shift)) */
  else if (GET_CODE (x) == ASHIFT
	   && GET_MODE (x) == Pmode
	   && GET_MODE (XEXP (x, 0)) == Pmode
	   && CONST_INT_P (XEXP (x, 1)))
    {
      type = ADDRESS_REG_REG;
      index = XEXP (x, 0);
      shift = INTVAL (XEXP (x, 1));
    }
  else
    return false;

  if (!strict_p
      && SUBREG_P (index)
      && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (index))])
    index = SUBREG_REG (index);

  if (aarch64_sve_data_mode_p (mode) || mode == VNx1TImode)
    {
      if (type != ADDRESS_REG_REG
	  || (1 << shift) != GET_MODE_UNIT_SIZE (mode))
	return false;
    }
  else
    {
      if (shift != 0
	  && !(IN_RANGE (shift, 1, 3)
	       && known_eq (1 << shift, GET_MODE_SIZE (mode))))
	return false;
    }

  if (REG_P (index)
      && aarch64_regno_ok_for_index_p (REGNO (index), strict_p))
    {
      info->type = type;
      info->offset = index;
      info->shift = shift;
      return true;
    }

  return false;
}
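/* For illustration only, a standalone sketch (not part of the build) of the
   shift check performed above for non-SVE modes: a scaled index is only
   accepted when the shift is between 1 and 3 and the implied scale matches
   the access size.  */
#if 0
#include <stdbool.h>

static bool
valid_scaled_index_shift (int shift, long long mode_size)
{
  if (shift == 0)
    return true;			/* Unscaled register index.  */
  return shift >= 1 && shift <= 3 && (1LL << shift) == mode_size;
}
#endif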
/* Return true if MODE is one of the modes for which we
   support LDP/STP operations.  */

static bool
aarch64_mode_valid_for_sched_fusion_p (machine_mode mode)
{
  return mode == SImode || mode == DImode
	 || mode == SFmode || mode == DFmode
	 || mode == SDmode || mode == DDmode
	 || (aarch64_vector_mode_supported_p (mode)
	     && (known_eq (GET_MODE_SIZE (mode), 8)
		 || known_eq (GET_MODE_SIZE (mode), 16)));
}

/* Return true if REGNO is a virtual pointer register, or an eliminable
   "soft" frame register.  Like REGNO_PTR_FRAME_P except that we don't
   include stack_pointer or hard_frame_pointer.  */
static bool
virt_or_elim_regno_p (unsigned regno)
{
  return ((regno >= FIRST_VIRTUAL_REGISTER
	   && regno <= LAST_VIRTUAL_POINTER_REGISTER)
	  || regno == FRAME_POINTER_REGNUM
	  || regno == ARG_POINTER_REGNUM);
}
/* Return true if X is a valid address of type TYPE for machine mode MODE.
   If it is, fill in INFO appropriately.  STRICT_P is true if
   REG_OK_STRICT is in effect.  */

bool
aarch64_classify_address (struct aarch64_address_info *info,
			  rtx x, machine_mode mode, bool strict_p,
			  aarch64_addr_query_type type)
{
  enum rtx_code code = GET_CODE (x);
  rtx op0, op1;
  poly_int64 offset;

  HOST_WIDE_INT const_size;

  /* Whether a vector mode is partial doesn't affect address legitimacy.
     Partial vectors like VNx8QImode allow the same indexed addressing
     mode and MUL VL addressing mode as full vectors like VNx16QImode;
     in both cases, MUL VL counts multiples of GET_MODE_SIZE.  */
  unsigned int vec_flags = aarch64_classify_vector_mode (mode);
  vec_flags &= ~VEC_PARTIAL;

  /* On BE, we use load/store pair for all large int mode load/stores.
     TI/TF/TDmode may also use a load/store pair.  */
  bool advsimd_struct_p = (vec_flags == (VEC_ADVSIMD | VEC_STRUCT));
  bool load_store_pair_p = (type == ADDR_QUERY_LDP_STP
			    || type == ADDR_QUERY_LDP_STP_N
			    || mode == TImode
			    || mode == TFmode
			    || mode == TDmode
			    || ((!TARGET_SIMD || BYTES_BIG_ENDIAN)
				&& advsimd_struct_p));
  /* If we are dealing with ADDR_QUERY_LDP_STP_N that means the incoming mode
     corresponds to the actual size of the memory being loaded/stored and the
     mode of the corresponding addressing mode is half of that.  */
  if (type == ADDR_QUERY_LDP_STP_N)
    {
      if (known_eq (GET_MODE_SIZE (mode), 32))
	mode = V16QImode;
      else if (known_eq (GET_MODE_SIZE (mode), 16))
	mode = DFmode;
      else if (known_eq (GET_MODE_SIZE (mode), 8))
	mode = SFmode;
      else
	return false;

      /* This isn't really an Advanced SIMD struct mode, but a mode
	 used to represent the complete mem in a load/store pair.  */
      advsimd_struct_p = false;
    }

  bool allow_reg_index_p = (!load_store_pair_p
			    && ((vec_flags == 0
				 && known_lt (GET_MODE_SIZE (mode), 16))
				|| vec_flags == VEC_ADVSIMD
				|| vec_flags & VEC_SVE_DATA
				|| mode == VNx1TImode));

  /* For SVE, only accept [Rn], [Rn, #offset, MUL VL] and [Rn, Rm, LSL #shift].
     The latter is not valid for SVE predicates, and that's rejected through
     allow_reg_index_p above.  */
  if ((vec_flags & (VEC_SVE_DATA | VEC_SVE_PRED)) != 0
      && (code != REG && code != PLUS))
    return false;

  /* On LE, for AdvSIMD, don't support anything other than POST_INC or
     REG addressing.  */
  if (advsimd_struct_p
      && TARGET_SIMD
      && !BYTES_BIG_ENDIAN
      && (code != POST_INC && code != REG))
    return false;

  gcc_checking_assert (GET_MODE (x) == VOIDmode
		       || SCALAR_INT_MODE_P (GET_MODE (x)));

  switch (code)
    {
    case REG:
    case SUBREG:
      info->type = ADDRESS_REG_IMM;
      info->base = x;
      info->offset = const0_rtx;
      info->const_offset = 0;
      return aarch64_base_register_rtx_p (x, strict_p);

    case PLUS:
      op0 = XEXP (x, 0);
      op1 = XEXP (x, 1);

      if (! strict_p
	  && REG_P (op0)
	  && virt_or_elim_regno_p (REGNO (op0))
	  && poly_int_rtx_p (op1, &offset))
	{
	  info->type = ADDRESS_REG_IMM;
	  info->base = op0;
	  info->offset = op1;
	  info->const_offset = offset;

	  return true;
	}

      if (maybe_ne (GET_MODE_SIZE (mode), 0)
	  && aarch64_base_register_rtx_p (op0, strict_p)
	  && poly_int_rtx_p (op1, &offset))
	{
	  info->type = ADDRESS_REG_IMM;
	  info->base = op0;
	  info->offset = op1;
	  info->const_offset = offset;

	  /* TImode, TFmode and TDmode values are allowed in both pairs of X
	     registers and individual Q registers.  The available
	     address modes are:
	     X,X: 7-bit signed scaled offset
	     Q:   9-bit signed offset
	     We conservatively require an offset representable in either mode.
	     When performing the check for pairs of X registers i.e.  LDP/STP
	     pass down DImode since that is the natural size of the LDP/STP
	     instruction memory accesses.  */
	  if (mode == TImode || mode == TFmode || mode == TDmode)
	    return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
		    && (aarch64_offset_9bit_signed_unscaled_p (mode, offset)
			|| offset_12bit_unsigned_scaled_p (mode, offset)));

	  if (mode == V8DImode)
	    return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
		    && aarch64_offset_7bit_signed_scaled_p (DImode,
							    offset + 48));

	  /* A 7bit offset check because OImode will emit a ldp/stp
	     instruction (only !TARGET_SIMD or big endian will get here).
	     For ldp/stp instructions, the offset is scaled for the size of a
	     single element of the pair.  */
	  if (aarch64_advsimd_partial_struct_mode_p (mode)
	      && known_eq (GET_MODE_SIZE (mode), 16))
	    return aarch64_offset_7bit_signed_scaled_p (DImode, offset);
	  if (aarch64_advsimd_full_struct_mode_p (mode)
	      && known_eq (GET_MODE_SIZE (mode), 32))
	    return aarch64_offset_7bit_signed_scaled_p (TImode, offset);

	  /* Three 9/12 bit offsets checks because CImode will emit three
	     ldr/str instructions (only !TARGET_SIMD or big endian will
	     get here).  */
	  if (aarch64_advsimd_partial_struct_mode_p (mode)
	      && known_eq (GET_MODE_SIZE (mode), 24))
	    return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
		    && (aarch64_offset_9bit_signed_unscaled_p (DImode,
							       offset + 16)
			|| offset_12bit_unsigned_scaled_p (DImode,
							   offset + 16)));
	  if (aarch64_advsimd_full_struct_mode_p (mode)
	      && known_eq (GET_MODE_SIZE (mode), 48))
	    return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
		    && (aarch64_offset_9bit_signed_unscaled_p (TImode,
							       offset + 32)
			|| offset_12bit_unsigned_scaled_p (TImode,
							   offset + 32)));

	  /* Two 7bit offsets checks because XImode will emit two ldp/stp
	     instructions (only big endian will get here).  */
	  if (aarch64_advsimd_partial_struct_mode_p (mode)
	      && known_eq (GET_MODE_SIZE (mode), 32))
	    return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
		    && aarch64_offset_7bit_signed_scaled_p (DImode,
							    offset + 16));
	  if (aarch64_advsimd_full_struct_mode_p (mode)
	      && known_eq (GET_MODE_SIZE (mode), 64))
	    return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
		    && aarch64_offset_7bit_signed_scaled_p (TImode,
							    offset + 32));

	  /* Make "m" use the LD1 offset range for SVE data modes, so
	     that pre-RTL optimizers like ivopts will work to that
	     instead of the wider LDR/STR range.  */
	  if (vec_flags == VEC_SVE_DATA || mode == VNx1TImode)
	    return (type == ADDR_QUERY_M
		    ? offset_4bit_signed_scaled_p (mode, offset)
		    : offset_9bit_signed_scaled_p (mode, offset));

	  if (vec_flags == (VEC_SVE_DATA | VEC_STRUCT))
	    {
	      poly_int64 end_offset = (offset
				       + GET_MODE_SIZE (mode)
				       - BYTES_PER_SVE_VECTOR);
	      return (type == ADDR_QUERY_M
		      ? offset_4bit_signed_scaled_p (mode, offset)
		      : (offset_9bit_signed_scaled_p (SVE_BYTE_MODE, offset)
			 && offset_9bit_signed_scaled_p (SVE_BYTE_MODE,
							 end_offset)));
	    }

	  if (vec_flags == VEC_SVE_PRED)
	    return offset_9bit_signed_scaled_p (mode, offset);

	  if (vec_flags == (VEC_SVE_PRED | VEC_STRUCT))
	    {
	      poly_int64 end_offset = (offset
				       + GET_MODE_SIZE (mode)
				       - BYTES_PER_SVE_PRED);
	      return (offset_9bit_signed_scaled_p (VNx16BImode, end_offset)
		      && offset_9bit_signed_scaled_p (VNx16BImode, offset));
	    }

	  if (load_store_pair_p)
	    return ((known_eq (GET_MODE_SIZE (mode), 4)
		     || known_eq (GET_MODE_SIZE (mode), 8)
		     || known_eq (GET_MODE_SIZE (mode), 16))
		    && aarch64_offset_7bit_signed_scaled_p (mode, offset));
	  else
	    return (aarch64_offset_9bit_signed_unscaled_p (mode, offset)
		    || offset_12bit_unsigned_scaled_p (mode, offset));
	}

      if (allow_reg_index_p)
	{
	  /* Look for base + (scaled/extended) index register.  */
	  if (aarch64_base_register_rtx_p (op0, strict_p)
	      && aarch64_classify_index (info, op1, mode, strict_p))
	    {
	      info->base = op0;
	      return true;
	    }
	  if (aarch64_base_register_rtx_p (op1, strict_p)
	      && aarch64_classify_index (info, op0, mode, strict_p))
	    {
	      info->base = op1;
	      return true;
	    }
	}

      return false;

    case POST_INC:
    case POST_DEC:
    case PRE_INC:
    case PRE_DEC:
      info->type = ADDRESS_REG_WB;
      info->base = XEXP (x, 0);
      info->offset = NULL_RTX;
      return aarch64_base_register_rtx_p (info->base, strict_p);

    case POST_MODIFY:
    case PRE_MODIFY:
      info->type = ADDRESS_REG_WB;
      info->base = XEXP (x, 0);
      if (GET_CODE (XEXP (x, 1)) == PLUS
	  && poly_int_rtx_p (XEXP (XEXP (x, 1), 1), &offset)
	  && rtx_equal_p (XEXP (XEXP (x, 1), 0), info->base)
	  && aarch64_base_register_rtx_p (info->base, strict_p))
	{
	  info->offset = XEXP (XEXP (x, 1), 1);
	  info->const_offset = offset;

	  /* TImode, TFmode and TDmode values are allowed in both pairs of X
	     registers and individual Q registers.  The available
	     address modes are:
	     X,X: 7-bit signed scaled offset
	     Q:   9-bit signed offset
	     We conservatively require an offset representable in either mode.
	   */
	  if (mode == TImode || mode == TFmode || mode == TDmode)
	    return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
		    && aarch64_offset_9bit_signed_unscaled_p (mode, offset));

	  if (load_store_pair_p)
	    return ((known_eq (GET_MODE_SIZE (mode), 4)
		     || known_eq (GET_MODE_SIZE (mode), 8)
		     || known_eq (GET_MODE_SIZE (mode), 16))
		    && aarch64_offset_7bit_signed_scaled_p (mode, offset));
	  else
	    return aarch64_offset_9bit_signed_unscaled_p (mode, offset);
	}
      return false;

    case CONST:
    case SYMBOL_REF:
    case LABEL_REF:
      /* load literal: pc-relative constant pool entry.  Only supported
	 for SI mode or larger.  */
      info->type = ADDRESS_SYMBOLIC;

      if (!load_store_pair_p
	  && GET_MODE_SIZE (mode).is_constant (&const_size)
	  && const_size >= 4)
	{
	  poly_int64 offset;
	  rtx sym = strip_offset_and_salt (x, &offset);
	  return ((LABEL_REF_P (sym)
		   || (SYMBOL_REF_P (sym)
		       && CONSTANT_POOL_ADDRESS_P (sym)
		       && aarch64_pcrelative_literal_loads)));
	}
      return false;

    case LO_SUM:
      info->type = ADDRESS_LO_SUM;
      info->base = XEXP (x, 0);
      info->offset = XEXP (x, 1);
      if (allow_reg_index_p
	  && aarch64_base_register_rtx_p (info->base, strict_p))
	{
	  poly_int64 offset;
	  HOST_WIDE_INT const_offset;
	  rtx sym = strip_offset_and_salt (info->offset, &offset);
	  if (SYMBOL_REF_P (sym)
	      && offset.is_constant (&const_offset)
	      && (aarch64_classify_symbol (sym, const_offset)
		  == SYMBOL_SMALL_ABSOLUTE))
	    {
	      /* The symbol and offset must be aligned to the access size.  */
	      unsigned int align;

	      if (CONSTANT_POOL_ADDRESS_P (sym))
		align = GET_MODE_ALIGNMENT (get_pool_mode (sym));
	      else if (TREE_CONSTANT_POOL_ADDRESS_P (sym))
		{
		  tree exp = SYMBOL_REF_DECL (sym);
		  align = TYPE_ALIGN (TREE_TYPE (exp));
		  align = aarch64_constant_alignment (exp, align);
		}
	      else if (SYMBOL_REF_DECL (sym))
		align = DECL_ALIGN (SYMBOL_REF_DECL (sym));
	      else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym)
		       && SYMBOL_REF_BLOCK (sym) != NULL)
		align = SYMBOL_REF_BLOCK (sym)->alignment;
	      else
		align = BITS_PER_UNIT;

	      poly_int64 ref_size = GET_MODE_SIZE (mode);
	      if (known_eq (ref_size, 0))
		ref_size = GET_MODE_SIZE (DImode);

	      return (multiple_p (const_offset, ref_size)
		      && multiple_p (align / BITS_PER_UNIT, ref_size));
	    }
	}
      return false;

    default:
      return false;
    }
}
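/* For illustration only, a standalone sketch (not part of the build) of the
   TImode/TFmode/TDmode offset rule used above: the offset must fit the
   LDP/STP 7-bit signed scaled form and also either the 9-bit signed unscaled
   or the 12-bit unsigned scaled form, so that either a pair of X registers
   or a single Q register can be used.  The ranges below assume an 8-byte
   LDP/STP element and a 16-byte scaled LDR/STR element.  */
#if 0
#include <stdbool.h>

static bool
timode_offset_ok (long long offset)
{
  bool pair_ok = offset % 8 == 0 && offset >= -64 * 8 && offset <= 63 * 8;
  bool unscaled_ok = offset >= -256 && offset <= 255;
  bool scaled_ok = offset % 16 == 0 && offset >= 0 && offset <= 4095 * 16;
  return pair_ok && (unscaled_ok || scaled_ok);
}
#endif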
/* Return true if the address X is valid for a PRFM instruction.
   STRICT_P is true if we should do strict checking with
   aarch64_classify_address.  */

bool
aarch64_address_valid_for_prefetch_p (rtx x, bool strict_p)
{
  struct aarch64_address_info addr;

  /* PRFM accepts the same addresses as DImode...  */
  bool res = aarch64_classify_address (&addr, x, DImode, strict_p);
  if (!res)
    return false;

  /* ... except writeback forms.  */
  return addr.type != ADDRESS_REG_WB;
}

bool
aarch64_symbolic_address_p (rtx x)
{
  poly_int64 offset;

  x = strip_offset_and_salt (x, &offset);
  return SYMBOL_REF_P (x) || LABEL_REF_P (x);
}

/* Classify the base of symbolic expression X.  */

enum aarch64_symbol_type
aarch64_classify_symbolic_expression (rtx x)
{
  rtx offset;

  split_const (x, &x, &offset);
  return aarch64_classify_symbol (x, INTVAL (offset));
}
/* Return TRUE if X is a legitimate address for accessing memory in
   mode MODE.  */
static bool
aarch64_legitimate_address_hook_p (machine_mode mode, rtx x, bool strict_p,
				   code_helper = ERROR_MARK)
{
  struct aarch64_address_info addr;

  return aarch64_classify_address (&addr, x, mode, strict_p);
}

/* Return TRUE if X is a legitimate address of type TYPE for accessing
   memory in mode MODE.  STRICT_P is true if REG_OK_STRICT is in effect.  */
bool
aarch64_legitimate_address_p (machine_mode mode, rtx x, bool strict_p,
			      aarch64_addr_query_type type)
{
  struct aarch64_address_info addr;

  return aarch64_classify_address (&addr, x, mode, strict_p, type);
}
/* Implement TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT.  */

static bool
aarch64_legitimize_address_displacement (rtx *offset1, rtx *offset2,
					 poly_int64 orig_offset,
					 machine_mode mode)
{
  HOST_WIDE_INT size;
  if (GET_MODE_SIZE (mode).is_constant (&size))
    {
      HOST_WIDE_INT const_offset, second_offset;

      /* A general SVE offset is A * VQ + B.  Remove the A component from
	 coefficient 0 in order to get the constant B.  */
      const_offset = orig_offset.coeffs[0] - orig_offset.coeffs[1];

      /* Split an out-of-range address displacement into a base and
	 offset.  Use 4KB range for 1- and 2-byte accesses and a 16KB
	 range otherwise to increase opportunities for sharing the base
	 address of different sizes.  Unaligned accesses use the signed
	 9-bit range, TImode/TFmode/TDmode use the intersection of signed
	 scaled 7-bit and signed 9-bit offset.  */
      if (mode == TImode || mode == TFmode || mode == TDmode)
	second_offset = ((const_offset + 0x100) & 0x1f8) - 0x100;
      else if ((const_offset & (size - 1)) != 0)
	second_offset = ((const_offset + 0x100) & 0x1ff) - 0x100;
      else
	second_offset = const_offset & (size < 4 ? 0xfff : 0x3ffc);

      if (second_offset == 0 || known_eq (orig_offset, second_offset))
	return false;

      /* Split the offset into second_offset and the rest.  */
      *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
      *offset2 = gen_int_mode (second_offset, Pmode);
      return true;
    }
  else
    {
      /* Get the mode we should use as the basis of the range.  For structure
	 modes this is the mode of one vector.  */
      unsigned int vec_flags = aarch64_classify_vector_mode (mode);
      machine_mode step_mode
	= (vec_flags & VEC_STRUCT) != 0 ? SVE_BYTE_MODE : mode;

      /* Get the "mul vl" multiplier we'd like to use.  */
      HOST_WIDE_INT factor = GET_MODE_SIZE (step_mode).coeffs[1];
      HOST_WIDE_INT vnum = orig_offset.coeffs[1] / factor;
      if (vec_flags & VEC_SVE_DATA)
	/* LDR supports a 9-bit range, but the move patterns for
	   structure modes require all vectors to be in range of the
	   same base.  The simplest way of accommodating that while still
	   promoting reuse of anchor points between different modes is
	   to use an 8-bit range unconditionally.  */
	vnum = ((vnum + 128) & 255) - 128;
      else
	/* Predicates are only handled singly, so we might as well use
	   the full range.  */
	vnum = ((vnum + 256) & 511) - 256;
      if (vnum == 0)
	return false;

      /* Convert the "mul vl" multiplier into a byte offset.  */
      poly_int64 second_offset = GET_MODE_SIZE (step_mode) * vnum;
      if (known_eq (second_offset, orig_offset))
	return false;

      /* Split the offset into second_offset and the rest.  */
      *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
      *offset2 = gen_int_mode (second_offset, Pmode);
      return true;
    }
}
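/* For illustration only, a worked example (not part of the build) of the
   constant splitting above for an aligned 4-byte access: the anchor keeps
   the low bits that the load/store can encode directly and the remainder is
   added to the base first.  The mask matches the one used above; the input
   displacement is made up.  */
#if 0
#include <stdio.h>

int
main (void)
{
  long long size = 4;			/* Access size in bytes.  */
  long long const_offset = 0x12340;	/* Hypothetical displacement.  */
  long long second_offset = const_offset & (size < 4 ? 0xfff : 0x3ffc);

  printf ("base += %#llx, mem offset %#llx\n",
	  const_offset - second_offset, second_offset);
  return 0;
}
#endif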
/* Return the binary representation of floating point constant VALUE in INTVAL.
   If the value cannot be converted, return false without setting INTVAL.
   The conversion is done in the given MODE.  */
bool
aarch64_reinterpret_float_as_int (rtx value, unsigned HOST_WIDE_INT *intval)
{
  /* We make a general exception for 0.  */
  if (aarch64_float_const_zero_rtx_p (value))
    {
      *intval = 0;
      return true;
    }

  scalar_float_mode mode;
  if (!CONST_DOUBLE_P (value)
      || !is_a <scalar_float_mode> (GET_MODE (value), &mode)
      || GET_MODE_BITSIZE (mode) > HOST_BITS_PER_WIDE_INT
      /* Only support up to DF mode.  */
      || GET_MODE_BITSIZE (mode) > GET_MODE_BITSIZE (DFmode))
    return false;

  unsigned HOST_WIDE_INT ival = 0;

  long res[2];
  real_to_target (res,
		  CONST_DOUBLE_REAL_VALUE (value),
		  REAL_MODE_FORMAT (mode));

  if (mode == DFmode || mode == DDmode)
    {
      int order = BYTES_BIG_ENDIAN ? 1 : 0;
      ival = zext_hwi (res[order], 32);
      ival |= (zext_hwi (res[1 - order], 32) << 32);
    }
  else
    ival = zext_hwi (res[0], 32);

  *intval = ival;
  return true;
}
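/* For illustration only, the kind of reinterpretation performed above, done
   directly with standard C (not part of the build): the IEEE double 1.0 has
   the 64-bit pattern 0x3ff0000000000000, which is what the mov/movk cost
   check below operates on.  */
#if 0
#include <stdint.h>
#include <stdio.h>
#include <string.h>

int
main (void)
{
  double d = 1.0;
  uint64_t bits;
  memcpy (&bits, &d, sizeof bits);
  printf ("%#llx\n", (unsigned long long) bits);  /* 0x3ff0000000000000 */
  return 0;
}
#endif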
/* Return TRUE if rtx X is an immediate constant that can be moved using a
   single MOV(+MOVK) followed by an FMOV.  */
bool
aarch64_float_const_rtx_p (rtx x)
{
  machine_mode mode = GET_MODE (x);
  if (mode == VOIDmode)
    return false;

  /* Determine whether it's cheaper to write float constants as
     mov/movk pairs over ldr/adrp pairs.  */
  unsigned HOST_WIDE_INT ival;

  if (CONST_DOUBLE_P (x)
      && SCALAR_FLOAT_MODE_P (mode)
      && aarch64_reinterpret_float_as_int (x, &ival))
    {
      machine_mode imode = known_eq (GET_MODE_SIZE (mode), 8) ? DImode : SImode;
      int num_instr = aarch64_internal_mov_immediate
			(NULL_RTX, gen_int_mode (ival, imode), false, imode);
      return num_instr < 3;
    }

  return false;
}
/* Return TRUE if rtx X is immediate constant 0.0 (but not in Decimal
   Floating Point).  */
bool
aarch64_float_const_zero_rtx_p (rtx x)
{
  /* 0.0 in Decimal Floating Point cannot be represented by #0 or
     zr as our callers expect, so no need to check the actual
     value if X is of Decimal Floating Point type.  */
  if (GET_MODE_CLASS (GET_MODE (x)) == MODE_DECIMAL_FLOAT)
    return false;

  if (REAL_VALUE_MINUS_ZERO (*CONST_DOUBLE_REAL_VALUE (x)))
    return !HONOR_SIGNED_ZEROS (GET_MODE (x));
  return real_equal (CONST_DOUBLE_REAL_VALUE (x), &dconst0);
}

/* Return true if X is any kind of constant zero rtx.  */

static bool
aarch64_const_zero_rtx_p (rtx x)
{
  return (x == CONST0_RTX (GET_MODE (x))
	  || (CONST_DOUBLE_P (x) && aarch64_float_const_zero_rtx_p (x)));
}
/* Return TRUE if rtx X is an immediate constant that fits in a single
   MOVI immediate operation.  */
bool
aarch64_can_const_movi_rtx_p (rtx x, machine_mode mode)
{
  if (!TARGET_SIMD)
    return false;

  machine_mode vmode;
  scalar_int_mode imode;
  unsigned HOST_WIDE_INT ival;

  if (CONST_DOUBLE_P (x)
      && SCALAR_FLOAT_MODE_P (mode))
    {
      if (!aarch64_reinterpret_float_as_int (x, &ival))
	return false;

      /* We make a general exception for 0.  */
      if (aarch64_float_const_zero_rtx_p (x))
	return true;

      imode = int_mode_for_mode (mode).require ();
    }
  else if (CONST_INT_P (x)
	   && is_a <scalar_int_mode> (mode, &imode))
    ival = INTVAL (x);
  else
    return false;

  /* Use a 64-bit mode for everything except for DI/DF/DD mode, where we use
     a 128-bit vector mode.  */
  int width = GET_MODE_BITSIZE (imode) == 64 ? 128 : 64;

  vmode = aarch64_simd_container_mode (imode, width);
  rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, ival);

  return aarch64_simd_valid_immediate (v_op, NULL);
}
/* Return the fixed registers used for condition codes.  */

static bool
aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
{
  *p1 = CC_REGNUM;
  *p2 = INVALID_REGNUM;
  return true;
}
/* Return a fresh memory reference to the current function's TPIDR2 block,
   creating a block if necessary.  */

static rtx
aarch64_get_tpidr2_block ()
{
  if (!cfun->machine->tpidr2_block)
    /* The TPIDR2 block is 16 bytes in size and must be aligned to a 128-bit
       boundary.  */
    cfun->machine->tpidr2_block = assign_stack_local (V16QImode, 16, 128);
  return copy_rtx (cfun->machine->tpidr2_block);
}

/* Return a fresh register that points to the current function's
   TPIDR2 block, creating a block if necessary.  */

static rtx
aarch64_get_tpidr2_ptr ()
{
  rtx block = aarch64_get_tpidr2_block ();
  return force_reg (Pmode, XEXP (block, 0));
}
/* Emit instructions to allocate a ZA lazy save buffer and initialize the
   current function's TPIDR2 block.  */

static void
aarch64_init_tpidr2_block ()
{
  rtx block = aarch64_get_tpidr2_block ();

  /* The ZA save buffer is SVL.B*SVL.B bytes in size.  */
  rtx svl_bytes = aarch64_sme_vq_immediate (Pmode, 16, AARCH64_ISA_MODE);
  rtx svl_bytes_reg = force_reg (DImode, svl_bytes);
  rtx za_size = expand_simple_binop (Pmode, MULT, svl_bytes_reg,
				     svl_bytes_reg, NULL, 0, OPTAB_LIB_WIDEN);
  rtx za_save_buffer = allocate_dynamic_stack_space (za_size, 128,
						     BITS_PER_UNIT, -1, true);
  za_save_buffer = force_reg (Pmode, za_save_buffer);
  cfun->machine->za_save_buffer = za_save_buffer;

  /* The first word of the block points to the save buffer and the second
     word is the number of ZA slices to save.  */
  rtx block_0 = adjust_address (block, DImode, 0);
  emit_insn (aarch64_gen_store_pair (block_0, za_save_buffer, svl_bytes_reg));

  if (!memory_operand (block, V16QImode))
    block = replace_equiv_address (block, force_reg (Pmode, XEXP (block, 0)));
  emit_insn (gen_aarch64_setup_local_tpidr2 (block));
}
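/* For illustration only, a standalone model (not part of the build) of the
   data set up above: the TPIDR2 block is a 16-byte record whose first word
   points to the lazy save buffer and whose second word holds the number of
   ZA slices to save, and the buffer itself is SVL.B * SVL.B bytes (shown
   here for a hypothetical SVL.B of 32 bytes).  */
#if 0
#include <stdint.h>
#include <stdio.h>

struct tpidr2_block
{
  void *za_save_buffer;		/* First 64-bit word.  */
  uint64_t num_za_save_slices;	/* Second 64-bit word.  */
};

int
main (void)
{
  uint64_t svl_bytes = 32;	/* Hypothetical streaming vector length.  */
  printf ("lazy save buffer size: %llu bytes\n",
	  (unsigned long long) (svl_bytes * svl_bytes));
  printf ("block size: %zu bytes\n", sizeof (struct tpidr2_block));
  return 0;
}
#endif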
/* Restore the contents of ZA from the lazy save buffer, given that
   register TPIDR2_BLOCK points to the current function's TPIDR2 block.
   PSTATE.ZA is known to be 0 and TPIDR2_EL0 is known to be null.  */

void
aarch64_restore_za (rtx tpidr2_block)
{
  emit_insn (gen_aarch64_smstart_za ());
  if (REGNO (tpidr2_block) != R0_REGNUM)
    emit_move_insn (gen_rtx_REG (Pmode, R0_REGNUM), tpidr2_block);
  emit_insn (gen_aarch64_tpidr2_restore ());
}
/* Return the ZT0 save buffer, creating one if necessary.  */

static rtx
aarch64_get_zt0_save_buffer ()
{
  if (!cfun->machine->zt0_save_buffer)
    cfun->machine->zt0_save_buffer = assign_stack_local (V8DImode, 64, 128);
  return cfun->machine->zt0_save_buffer;
}

/* Save ZT0 to the current function's save buffer.  */

static void
aarch64_save_zt0 ()
{
  rtx mem = aarch64_get_zt0_save_buffer ();
  mem = replace_equiv_address (mem, force_reg (Pmode, XEXP (mem, 0)));
  emit_insn (gen_aarch64_sme_str_zt0 (mem));
}

/* Restore ZT0 from the current function's save buffer.  FROM_LAZY_SAVE_P
   is true if the load is happening after a call to a private-ZA function,
   false if it can be treated as a normal load.  */

static void
aarch64_restore_zt0 (bool from_lazy_save_p)
{
  rtx mem = aarch64_get_zt0_save_buffer ();
  mem = replace_equiv_address (mem, force_reg (Pmode, XEXP (mem, 0)));
  emit_insn (from_lazy_save_p
             ? gen_aarch64_restore_zt0 (mem)
             : gen_aarch64_sme_ldr_zt0 (mem));
}
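/* Descriptive note: ZT0 is a 512-bit register, which is why the save buffer
   above is a 64-byte (V8DImode) stack slot.  */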
/* Implement TARGET_START_CALL_ARGS.  */

static void
aarch64_start_call_args (cumulative_args_t ca_v)
{
  CUMULATIVE_ARGS *ca = get_cumulative_args (ca_v);

  if (!TARGET_SME && (ca->isa_mode & AARCH64_ISA_MODE_SM_ON))
    {
      error ("calling a streaming function requires the ISA extension %qs",
             "sme");
      inform (input_location, "you can enable %qs using the command-line"
              " option %<-march%>, or by using the %<target%>"
              " attribute or pragma", "sme");
    }

  if ((ca->shared_za_flags & (AARCH64_STATE_IN | AARCH64_STATE_OUT))
      && !aarch64_cfun_has_state ("za"))
    error ("call to a function that shares %qs state from a function"
           " that has no %qs state", "za", "za");
  else if ((ca->shared_zt0_flags & (AARCH64_STATE_IN | AARCH64_STATE_OUT))
           && !aarch64_cfun_has_state ("zt0"))
    error ("call to a function that shares %qs state from a function"
           " that has no %qs state", "zt0", "zt0");
  else if (!TARGET_ZA && (ca->isa_mode & AARCH64_ISA_MODE_ZA_ON))
    error ("call to a function that shares SME state from a function"
           " that has no SME state");

  /* If this is a call to a private ZA function, emit a marker to
     indicate where any necessary set-up code could be inserted.
     The code itself is inserted by the mode-switching pass.  */
  if (TARGET_ZA && !(ca->isa_mode & AARCH64_ISA_MODE_ZA_ON))
    emit_insn (gen_aarch64_start_private_za_call ());

  /* If this is a call to a shared-ZA function that doesn't share ZT0,
     save and restore ZT0 around the call.  */
  if (aarch64_cfun_has_state ("zt0")
      && (ca->isa_mode & AARCH64_ISA_MODE_ZA_ON)
      && ca->shared_zt0_flags == 0)
    aarch64_save_zt0 ();
}
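/* As an illustration only (user-level source, not part of this file): with
   SME attributes available, code along the lines of

     void callee () __arm_streaming;
     void caller () { callee (); }

   compiled without the "sme" extension would take the error path above,
   since the call requires a switch into streaming mode.  */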
/* This function is used by the call expanders of the machine description.
   RESULT is the register in which the result is returned.  It's NULL for
   "call" and "sibcall".
   MEM is the location of the function call.
   COOKIE is either:
   - a const_int that gives the argument to the call's UNSPEC_CALLEE_ABI.
   - a PARALLEL that contains such a const_int as its first element.
     The second element is a PARALLEL that lists all the argument
     registers that need to be saved and restored around a change
     in PSTATE.SM, or const0_rtx if no such switch is needed.
     The third and fourth elements are const_ints that contain the
     sharing flags for ZA and ZT0 respectively.
   SIBCALL indicates whether this is a normal call or a sibling call;
   a different pattern is generated accordingly.  */
11285 aarch64_expand_call (rtx result
, rtx mem
, rtx cookie
, bool sibcall
)
11287 rtx call
, callee
, tmp
;
11291 rtx callee_abi
= cookie
;
11292 rtx sme_mode_switch_args
= const0_rtx
;
11293 unsigned int shared_za_flags
= 0;
11294 unsigned int shared_zt0_flags
= 0;
11295 if (GET_CODE (cookie
) == PARALLEL
)
11297 callee_abi
= XVECEXP (cookie
, 0, 0);
11298 sme_mode_switch_args
= XVECEXP (cookie
, 0, 1);
11299 shared_za_flags
= INTVAL (XVECEXP (cookie
, 0, 2));
11300 shared_zt0_flags
= INTVAL (XVECEXP (cookie
, 0, 3));
11303 gcc_assert (CONST_INT_P (callee_abi
));
11304 auto callee_isa_mode
= aarch64_callee_isa_mode (callee_abi
);
11306 if (aarch64_cfun_has_state ("za")
11307 && (callee_isa_mode
& AARCH64_ISA_MODE_ZA_ON
)
11308 && !shared_za_flags
)
11310 sorry ("call to a function that shares state other than %qs"
11311 " from a function that has %qs state", "za", "za");
11312 inform (input_location
, "use %<__arm_preserves(\"za\")%> if the"
11313 " callee preserves ZA");
11316 gcc_assert (MEM_P (mem
));
11317 callee
= XEXP (mem
, 0);
11320 tmp
= legitimize_pe_coff_symbol (callee
, false);
11325 mode
= GET_MODE (callee
);
11326 gcc_assert (mode
== Pmode
);
11328 /* Decide if we should generate indirect calls by loading the
11329 address of the callee into a register before performing
11330 the branch-and-link. */
11331 if (SYMBOL_REF_P (callee
)
11332 ? (aarch64_is_long_call_p (callee
)
11333 || aarch64_is_noplt_call_p (callee
))
11335 XEXP (mem
, 0) = force_reg (mode
, callee
);
11337 /* Accumulate the return values, including state that is shared via
11339 auto_vec
<rtx
, 8> return_values
;
11342 if (GET_CODE (result
) == PARALLEL
)
11343 for (int i
= 0; i
< XVECLEN (result
, 0); ++i
)
11344 return_values
.safe_push (XVECEXP (result
, 0, i
));
11346 return_values
.safe_push (result
);
11348 unsigned int orig_num_return_values
= return_values
.length ();
11349 if (shared_za_flags
& AARCH64_STATE_OUT
)
11350 return_values
.safe_push (gen_rtx_REG (VNx16BImode
, ZA_REGNUM
));
11351 /* When calling private-ZA functions from functions with ZA state,
11352 we want to know whether the call committed a lazy save. */
11353 if (TARGET_ZA
&& !shared_za_flags
)
11354 return_values
.safe_push (gen_rtx_REG (VNx16BImode
, ZA_SAVED_REGNUM
));
11355 if (shared_zt0_flags
& AARCH64_STATE_OUT
)
11356 return_values
.safe_push (gen_rtx_REG (V8DImode
, ZT0_REGNUM
));
11358 /* Create the new return value, if necessary. */
11359 if (orig_num_return_values
!= return_values
.length ())
11361 if (return_values
.length () == 1)
11362 result
= return_values
[0];
11365 for (rtx
&x
: return_values
)
11366 if (GET_CODE (x
) != EXPR_LIST
)
11367 x
= gen_rtx_EXPR_LIST (VOIDmode
, x
, const0_rtx
);
11368 rtvec v
= gen_rtvec_v (return_values
.length (),
11369 return_values
.address ());
11370 result
= gen_rtx_PARALLEL (VOIDmode
, v
);
11374 call
= gen_rtx_CALL (VOIDmode
, mem
, const0_rtx
);
11376 if (result
!= NULL_RTX
)
11377 call
= gen_rtx_SET (result
, call
);
11382 tmp
= gen_rtx_CLOBBER (VOIDmode
, gen_rtx_REG (Pmode
, LR_REGNUM
));
11384 callee_abi
= gen_rtx_UNSPEC (DImode
, gen_rtvec (1, callee_abi
),
11385 UNSPEC_CALLEE_ABI
);
11387 vec
= gen_rtvec (3, call
, callee_abi
, tmp
);
11388 call
= gen_rtx_PARALLEL (VOIDmode
, vec
);
11390 auto call_insn
= aarch64_emit_call_insn (call
);
11392 /* Check whether the call requires a change to PSTATE.SM. We can't
11393 emit the instructions to change PSTATE.SM yet, since they involve
11394 a change in vector length and a change in instruction set, which
11395 cannot be represented in RTL.
11397 For now, just record which registers will be clobbered and used
11398 by the changes to PSTATE.SM. */
11399 if (!sibcall
&& aarch64_call_switches_pstate_sm (callee_isa_mode
))
11401 aarch64_sme_mode_switch_regs args_switch
;
11402 if (sme_mode_switch_args
!= const0_rtx
)
11404 unsigned int num_args
= XVECLEN (sme_mode_switch_args
, 0);
11405 for (unsigned int i
= 0; i
< num_args
; ++i
)
11407 rtx x
= XVECEXP (sme_mode_switch_args
, 0, i
);
11408 args_switch
.add_reg (GET_MODE (x
), REGNO (x
));
11412 aarch64_sme_mode_switch_regs result_switch
;
11414 result_switch
.add_call_result (call_insn
);
11416 unsigned int num_gprs
= MAX (args_switch
.num_gprs (),
11417 result_switch
.num_gprs ());
11418 for (unsigned int i
= 0; i
< num_gprs
; ++i
)
11419 clobber_reg (&CALL_INSN_FUNCTION_USAGE (call_insn
),
11420 gen_rtx_REG (DImode
, args_switch
.FIRST_GPR
+ i
));
11422 for (int regno
= V0_REGNUM
; regno
< V0_REGNUM
+ 32; regno
+= 4)
11423 clobber_reg (&CALL_INSN_FUNCTION_USAGE (call_insn
),
11424 gen_rtx_REG (V4x16QImode
, regno
));
11426 for (int regno
= P0_REGNUM
; regno
< P0_REGNUM
+ 16; regno
+= 1)
11427 clobber_reg (&CALL_INSN_FUNCTION_USAGE (call_insn
),
11428 gen_rtx_REG (VNx16BImode
, regno
));
11430 /* Ensure that the VG save slot has been initialized. Also emit
11431 an instruction to model the effect of the temporary clobber
11432 of VG, so that the prologue/epilogue pass sees the need to
11433 save the old value. */
11434 use_reg (&CALL_INSN_FUNCTION_USAGE (call_insn
),
11435 gen_rtx_REG (DImode
, VG_REGNUM
));
11436 emit_insn_before (gen_aarch64_update_vg (), call_insn
);
11438 cfun
->machine
->call_switches_pstate_sm
= true;
11441 /* Add any ZA-related information.
11443 ZA_REGNUM represents the current function's ZA state, rather than
11444 the contents of the ZA register itself. We ensure that the function's
11445 ZA state is preserved by private-ZA call sequences, so the call itself
11446 does not use or clobber ZA_REGNUM. The same thing applies to
11450 /* The callee requires ZA to be active if the callee is shared-ZA,
11451 otherwise it requires ZA to be dormant or off. The state of ZA is
11452 captured by a combination of SME_STATE_REGNUM, TPIDR2_SETUP_REGNUM,
11453 and ZA_SAVED_REGNUM. */
11454 use_reg (&CALL_INSN_FUNCTION_USAGE (call_insn
),
11455 gen_rtx_REG (DImode
, SME_STATE_REGNUM
));
11456 use_reg (&CALL_INSN_FUNCTION_USAGE (call_insn
),
11457 gen_rtx_REG (DImode
, TPIDR2_SETUP_REGNUM
));
11458 use_reg (&CALL_INSN_FUNCTION_USAGE (call_insn
),
11459 gen_rtx_REG (VNx16BImode
, ZA_SAVED_REGNUM
));
11461 /* Keep the aarch64_start/end_private_za_call markers live. */
11462 if (!(callee_isa_mode
& AARCH64_ISA_MODE_ZA_ON
))
11463 use_reg (&CALL_INSN_FUNCTION_USAGE (call_insn
),
11464 gen_rtx_REG (VNx16BImode
, LOWERING_REGNUM
));
11466 /* If the callee is a shared-ZA function, record whether it uses the
11467 current value of ZA and ZT0. */
11468 if (shared_za_flags
& AARCH64_STATE_IN
)
11469 use_reg (&CALL_INSN_FUNCTION_USAGE (call_insn
),
11470 gen_rtx_REG (VNx16BImode
, ZA_REGNUM
));
11472 if (shared_zt0_flags
& AARCH64_STATE_IN
)
11473 use_reg (&CALL_INSN_FUNCTION_USAGE (call_insn
),
11474 gen_rtx_REG (V8DImode
, ZT0_REGNUM
));
/* Implement TARGET_END_CALL_ARGS.  */

static void
aarch64_end_call_args (cumulative_args_t ca_v)
{
  CUMULATIVE_ARGS *ca = get_cumulative_args (ca_v);

  /* If this is a call to a private ZA function, emit a marker to
     indicate where any necessary restoration code could be inserted.
     The code itself is inserted by the mode-switching pass.  */
  if (TARGET_ZA && !(ca->isa_mode & AARCH64_ISA_MODE_ZA_ON))
    emit_insn (gen_aarch64_end_private_za_call ());

  /* If this is a call to a shared-ZA function that doesn't share ZT0,
     save and restore ZT0 around the call.  */
  if (aarch64_cfun_has_state ("zt0")
      && (ca->isa_mode & AARCH64_ISA_MODE_ZA_ON)
      && ca->shared_zt0_flags == 0)
    aarch64_restore_zt0 (false);
}
/* Emit call insn with PAT and do aarch64-specific handling.  */

rtx_call_insn *
aarch64_emit_call_insn (rtx pat)
{
  auto insn = emit_call_insn (pat);

  rtx *fusage = &CALL_INSN_FUNCTION_USAGE (insn);
  clobber_reg (fusage, gen_rtx_REG (word_mode, IP0_REGNUM));
  clobber_reg (fusage, gen_rtx_REG (word_mode, IP1_REGNUM));
  return as_a<rtx_call_insn *> (insn);
}
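/* Descriptive note: the x16/x17 clobbers above reflect that linker-inserted
   veneers (for example for out-of-range branches or PLT stubs) are allowed
   to use the intra-procedure-call registers IP0 and IP1, so no call may
   assume they survive.  */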
11513 aarch64_select_cc_mode (RTX_CODE code
, rtx x
, rtx y
)
11515 machine_mode mode_x
= GET_MODE (x
);
11516 rtx_code code_x
= GET_CODE (x
);
11518 /* All floating point compares return CCFP if it is an equality
11519 comparison, and CCFPE otherwise. */
11520 if (GET_MODE_CLASS (mode_x
) == MODE_FLOAT
)
11543 gcc_unreachable ();
11547 /* Equality comparisons of short modes against zero can be performed
11548 using the TST instruction with the appropriate bitmask. */
11549 if (y
== const0_rtx
&& (REG_P (x
) || SUBREG_P (x
))
11550 && (code
== EQ
|| code
== NE
)
11551 && (mode_x
== HImode
|| mode_x
== QImode
))
11554 /* Similarly, comparisons of zero_extends from shorter modes can
11555 be performed using an ANDS with an immediate mask. */
11556 if (y
== const0_rtx
&& code_x
== ZERO_EXTEND
11557 && (mode_x
== SImode
|| mode_x
== DImode
)
11558 && (GET_MODE (XEXP (x
, 0)) == HImode
|| GET_MODE (XEXP (x
, 0)) == QImode
)
11559 && (code
== EQ
|| code
== NE
))
11562 /* Zero extracts support equality comparisons. */
11563 if ((mode_x
== SImode
|| mode_x
== DImode
)
11565 && (code_x
== ZERO_EXTRACT
&& CONST_INT_P (XEXP (x
, 1))
11566 && CONST_INT_P (XEXP (x
, 2)))
11567 && (code
== EQ
|| code
== NE
))
11570 /* ANDS/BICS/TST support equality and all signed comparisons. */
11571 if ((mode_x
== SImode
|| mode_x
== DImode
)
11574 && (code
== EQ
|| code
== NE
|| code
== LT
|| code
== GE
11575 || code
== GT
|| code
== LE
))
11578 /* ADDS/SUBS correctly set N and Z flags. */
11579 if ((mode_x
== SImode
|| mode_x
== DImode
)
11581 && (code
== EQ
|| code
== NE
|| code
== LT
|| code
== GE
)
11582 && (code_x
== PLUS
|| code_x
== MINUS
|| code_x
== NEG
))
11585 /* A compare with a shifted operand. Because of canonicalization,
11586 the comparison will have to be swapped when we emit the assembly
11588 if ((mode_x
== SImode
|| mode_x
== DImode
)
11589 && (REG_P (y
) || SUBREG_P (y
) || y
== const0_rtx
)
11590 && (code_x
== ASHIFT
|| code_x
== ASHIFTRT
11591 || code_x
== LSHIFTRT
11592 || code_x
== ZERO_EXTEND
|| code_x
== SIGN_EXTEND
))
11595 /* Similarly for a negated operand, but we can only do this for
11597 if ((mode_x
== SImode
|| mode_x
== DImode
)
11598 && (REG_P (y
) || SUBREG_P (y
))
11599 && (code
== EQ
|| code
== NE
)
11603 /* A test for unsigned overflow from an addition. */
11604 if ((mode_x
== DImode
|| mode_x
== TImode
)
11605 && (code
== LTU
|| code
== GEU
)
11607 && rtx_equal_p (XEXP (x
, 0), y
))
11610 /* A test for unsigned overflow from an add with carry. */
11611 if ((mode_x
== DImode
|| mode_x
== TImode
)
11612 && (code
== LTU
|| code
== GEU
)
11614 && CONST_SCALAR_INT_P (y
)
11615 && (rtx_mode_t (y
, mode_x
)
11616 == (wi::shwi (1, mode_x
)
11617 << (GET_MODE_BITSIZE (mode_x
).to_constant () / 2))))
11620 /* A test for signed overflow. */
11621 if ((mode_x
== DImode
|| mode_x
== TImode
)
11624 && GET_CODE (y
) == SIGN_EXTEND
)
11627 /* For everything else, return CCmode. */
static int
aarch64_get_condition_code_1 (machine_mode, enum rtx_code);

int
aarch64_get_condition_code (rtx x)
{
  machine_mode mode = GET_MODE (XEXP (x, 0));
  enum rtx_code comp_code = GET_CODE (x);

  if (GET_MODE_CLASS (mode) != MODE_CC)
    mode = SELECT_CC_MODE (comp_code, XEXP (x, 0), XEXP (x, 1));
  return aarch64_get_condition_code_1 (mode, comp_code);
}
11646 aarch64_get_condition_code_1 (machine_mode mode
, enum rtx_code comp_code
)
11654 case GE
: return AARCH64_GE
;
11655 case GT
: return AARCH64_GT
;
11656 case LE
: return AARCH64_LS
;
11657 case LT
: return AARCH64_MI
;
11658 case NE
: return AARCH64_NE
;
11659 case EQ
: return AARCH64_EQ
;
11660 case ORDERED
: return AARCH64_VC
;
11661 case UNORDERED
: return AARCH64_VS
;
11662 case UNLT
: return AARCH64_LT
;
11663 case UNLE
: return AARCH64_LE
;
11664 case UNGT
: return AARCH64_HI
;
11665 case UNGE
: return AARCH64_PL
;
11666 default: return -1;
11673 case NE
: return AARCH64_NE
;
11674 case EQ
: return AARCH64_EQ
;
11675 case GE
: return AARCH64_GE
;
11676 case GT
: return AARCH64_GT
;
11677 case LE
: return AARCH64_LE
;
11678 case LT
: return AARCH64_LT
;
11679 case GEU
: return AARCH64_CS
;
11680 case GTU
: return AARCH64_HI
;
11681 case LEU
: return AARCH64_LS
;
11682 case LTU
: return AARCH64_CC
;
11683 default: return -1;
11690 case NE
: return AARCH64_NE
;
11691 case EQ
: return AARCH64_EQ
;
11692 case GE
: return AARCH64_LE
;
11693 case GT
: return AARCH64_LT
;
11694 case LE
: return AARCH64_GE
;
11695 case LT
: return AARCH64_GT
;
11696 case GEU
: return AARCH64_LS
;
11697 case GTU
: return AARCH64_CC
;
11698 case LEU
: return AARCH64_CS
;
11699 case LTU
: return AARCH64_HI
;
11700 default: return -1;
11707 case NE
: return AARCH64_NE
; /* = any */
11708 case EQ
: return AARCH64_EQ
; /* = none */
11709 case GE
: return AARCH64_PL
; /* = nfrst */
11710 case LT
: return AARCH64_MI
; /* = first */
11711 case GEU
: return AARCH64_CS
; /* = nlast */
11712 case GTU
: return AARCH64_HI
; /* = pmore */
11713 case LEU
: return AARCH64_LS
; /* = plast */
11714 case LTU
: return AARCH64_CC
; /* = last */
11715 default: return -1;
11722 case NE
: return AARCH64_NE
;
11723 case EQ
: return AARCH64_EQ
;
11724 case GE
: return AARCH64_PL
;
11725 case LT
: return AARCH64_MI
;
11726 case GT
: return AARCH64_GT
;
11727 case LE
: return AARCH64_LE
;
11728 default: return -1;
11735 case NE
: return AARCH64_NE
;
11736 case EQ
: return AARCH64_EQ
;
11737 case GE
: return AARCH64_PL
;
11738 case LT
: return AARCH64_MI
;
11739 default: return -1;
11746 case NE
: return AARCH64_NE
;
11747 case EQ
: return AARCH64_EQ
;
11748 default: return -1;
11755 case LTU
: return AARCH64_CS
;
11756 case GEU
: return AARCH64_CC
;
11757 default: return -1;
11764 case GEU
: return AARCH64_CS
;
11765 case LTU
: return AARCH64_CC
;
11766 default: return -1;
11773 case NE
: return AARCH64_VS
;
11774 case EQ
: return AARCH64_VC
;
11775 default: return -1;
/* Return true if X is a CONST_INT, CONST_WIDE_INT or a constant vector
   duplicate of such constants.  If so, store in RET_WI the wide_int
   representation of the constant paired with the inner mode of the vector mode
   or MODE for scalar X constants.  If MODE is not provided then TImode is
   used.  */

static bool
aarch64_extract_vec_duplicate_wide_int (rtx x, wide_int *ret_wi,
                                        scalar_mode mode = TImode)
{
  rtx elt = unwrap_const_vec_duplicate (x);
  if (!CONST_SCALAR_INT_P (elt))
    return false;
  scalar_mode smode
    = CONST_SCALAR_INT_P (x) ? mode : GET_MODE_INNER (GET_MODE (x));
  *ret_wi = rtx_mode_t (elt, smode);
  return true;
}
/* Return true if X is a scalar or a constant vector of integer
   immediates that represent the rounding constant used in the fixed-point
   arithmetic instructions.
   The accepted form of the constant is (1 << (C - 1)) where C is in the range
   [1, MODE_WIDTH/2].  */

bool
aarch64_rnd_imm_p (rtx x)
{
  wide_int rnd_cst;
  if (!aarch64_extract_vec_duplicate_wide_int (x, &rnd_cst))
    return false;
  int log2 = wi::exact_log2 (rnd_cst);
  if (log2 < 0)
    return false;
  return IN_RANGE (log2, 0, rnd_cst.get_precision () / 2 - 1);
}
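/* For example (illustrative only): a rounding right shift by 3 on 32-bit
   elements uses the rounding constant 1 << (3 - 1) == 4.  A vector duplicate
   of 4 passes the test above because exact_log2 (4) == 2, which lies within
   [0, 32 / 2 - 1].  */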
/* Return true if RND is a constant vector of integer rounding constants
   corresponding to a constant vector of shifts, SHIFT.
   The relationship should be RND == (1 << (SHIFT - 1)).  */

bool
aarch64_const_vec_rnd_cst_p (rtx rnd, rtx shift)
{
  wide_int rnd_cst, shft_cst;
  if (!aarch64_extract_vec_duplicate_wide_int (rnd, &rnd_cst)
      || !aarch64_extract_vec_duplicate_wide_int (shift, &shft_cst))
    return false;

  return rnd_cst == (wi::shwi (1, rnd_cst.get_precision ()) << (shft_cst - 1));
}
static bool
aarch64_const_vec_all_same_in_range_p (rtx x,
                                       HOST_WIDE_INT minval,
                                       HOST_WIDE_INT maxval)
{
  rtx elt;
  return (const_vec_duplicate_p (x, &elt)
          && CONST_INT_P (elt)
          && IN_RANGE (INTVAL (elt), minval, maxval));
}
/* Some constants can't be made using normal mov instructions in Advanced SIMD
   but we can still create them in various ways.  If the constant in VAL can
   be created using an alternate method, return true; in addition, if TARGET
   is not NULL, set it to the rtx for the generated sequence.  Otherwise
   return false.  */
11856 aarch64_maybe_generate_simd_constant (rtx target
, rtx val
, machine_mode mode
)
11859 auto smode
= GET_MODE_INNER (mode
);
11860 if (!aarch64_extract_vec_duplicate_wide_int (val
, &wval
, smode
))
11863 /* For Advanced SIMD we can create an integer with only the top bit set
11864 using fneg (0.0f). */
11868 && wi::only_sign_bit_p (wval
))
11873 /* Use the same base type as aarch64_gen_shareable_zero. */
11874 rtx zero
= CONST0_RTX (V4SImode
);
11875 emit_move_insn (lowpart_subreg (V4SImode
, target
, mode
), zero
);
11876 rtx neg
= lowpart_subreg (V2DImode
, target
, mode
);
11877 emit_insn (gen_aarch64_fnegv2di2 (neg
, copy_rtx (neg
)));
/* Check if the value in VAL with mode MODE can be created using special
   instruction sequences.  */

bool aarch64_simd_special_constant_p (rtx val, machine_mode mode)
{
  return aarch64_maybe_generate_simd_constant (NULL_RTX, val, mode);
}
bool
aarch64_const_vec_all_same_int_p (rtx x, HOST_WIDE_INT val)
{
  return aarch64_const_vec_all_same_in_range_p (x, val, val);
}
/* Return true if VEC is a constant in which every element is in the range
   [MINVAL, MAXVAL].  The elements do not need to have the same value.  */

static bool
aarch64_const_vec_all_in_range_p (rtx vec,
                                  HOST_WIDE_INT minval,
                                  HOST_WIDE_INT maxval)
{
  if (!CONST_VECTOR_P (vec)
      || GET_MODE_CLASS (GET_MODE (vec)) != MODE_VECTOR_INT)
    return false;

  int nunits;
  if (!CONST_VECTOR_STEPPED_P (vec))
    nunits = const_vector_encoded_nelts (vec);
  else if (!CONST_VECTOR_NUNITS (vec).is_constant (&nunits))
    return false;

  for (int i = 0; i < nunits; i++)
    {
      rtx vec_elem = CONST_VECTOR_ELT (vec, i);
      if (!CONST_INT_P (vec_elem)
          || !IN_RANGE (INTVAL (vec_elem), minval, maxval))
        return false;
    }
  return true;
}
#define AARCH64_CC_V 1
#define AARCH64_CC_C (1 << 1)
#define AARCH64_CC_Z (1 << 2)
#define AARCH64_CC_N (1 << 3)

/* N Z C V flags for ccmp.  Indexed by AARCH64_COND_CODE.  */
static const int aarch64_nzcv_codes[] =
{
  0,            /* EQ, Z == 1.  */
  AARCH64_CC_Z, /* NE, Z == 0.  */
  0,            /* CS, C == 1.  */
  AARCH64_CC_C, /* CC, C == 0.  */
  0,            /* MI, N == 1.  */
  AARCH64_CC_N, /* PL, N == 0.  */
  0,            /* VS, V == 1.  */
  AARCH64_CC_V, /* VC, V == 0.  */
  0,            /* HI, C == 1 && Z == 0.  */
  AARCH64_CC_C, /* LS, !(C == 1 && Z == 0).  */
  AARCH64_CC_V, /* GE, N == V.  */
  0,            /* LT, N != V.  */
  AARCH64_CC_Z, /* GT, Z == 0 && N == V.  */
  0,            /* LE, !(Z == 0 && N == V).  */
  0,            /* AL, any.  */
  0             /* NV, any.  */
};
/* Print floating-point vector immediate operand X to F, negating it
   first if NEGATE is true.  Return true on success, false if it isn't
   a constant we can handle.  */

static bool
aarch64_print_vector_float_operand (FILE *f, rtx x, bool negate)
{
  rtx elt;

  if (!const_vec_duplicate_p (x, &elt))
    return false;

  REAL_VALUE_TYPE r = *CONST_DOUBLE_REAL_VALUE (elt);
  if (negate)
    r = real_value_negate (&r);

  /* Handle the SVE single-bit immediates specially, since they have a
     fixed form in the assembly syntax.  */
  if (real_equal (&r, &dconst0))
    asm_fprintf (f, "0.0");
  else if (real_equal (&r, &dconst2))
    asm_fprintf (f, "2.0");
  else if (real_equal (&r, &dconst1))
    asm_fprintf (f, "1.0");
  else if (real_equal (&r, &dconsthalf))
    asm_fprintf (f, "0.5");
  else
    {
      const int buf_size = 20;
      char float_buf[buf_size] = {'\0'};
      real_to_decimal_for_mode (float_buf, &r, buf_size, buf_size,
                                1, GET_MODE (elt));
      asm_fprintf (f, "%s", float_buf);
    }

  return true;
}
/* Return the equivalent letter for size.  */
static char
sizetochar (int size)
{
  switch (size)
    {
    case 64: return 'd';
    case 32: return 's';
    case 16: return 'h';
    case 8 : return 'b';
    default: gcc_unreachable ();
    }
}
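/* For instance, sizetochar (32) yields 's', the assembly suffix used for a
   32-bit element.  (Illustrative note only.)  */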
12005 /* Print operand X to file F in a target specific manner according to CODE.
12006 The acceptable formatting commands given by CODE are:
'c': An integer or symbol address without a preceding # sign.
12009 'C': Take the duplicated element in a vector constant
12010 and print it in hex.
12011 'D': Take the duplicated element in a vector constant
12012 and print it as an unsigned integer, in decimal.
12013 'e': Print the sign/zero-extend size as a character 8->b,
12014 16->h, 32->w. Can also be used for masks:
12015 0xff->b, 0xffff->h, 0xffffffff->w.
12016 'I': If the operand is a duplicated vector constant,
12017 replace it with the duplicated scalar. If the
12018 operand is then a floating-point constant, replace
12019 it with the integer bit representation. Print the
12020 transformed constant as a signed decimal number.
'p': Prints N such that 2^N == X (X must be a power of 2 and a const int).
12023 'P': Print the number of non-zero bits in X (a const_int).
'H': Print the higher numbered register of a pair (TImode) of regs.
12026 'm': Print a condition (eq, ne, etc).
12027 'M': Same as 'm', but invert condition.
12028 'N': Take the duplicated element in a vector constant
12029 and print the negative of it in decimal.
12030 'b/h/s/d/q': Print a scalar FP/SIMD register name.
12031 'Z': Same for SVE registers. ('z' was already taken.)
12032 Note that it is not necessary to use %Z for operands
12033 that have SVE modes. The convention is to use %Z
12034 only for non-SVE (or potentially non-SVE) modes.
12035 'S/T/U/V': Print a FP/SIMD register name for a register list.
12036 The register printed is the FP/SIMD register name
12037 of X + 0/1/2/3 for S/T/U/V.
12038 'R': Print a scalar Integer/FP/SIMD register name + 1.
12039 'X': Print bottom 16 bits of integer constant in hex.
12040 'w/x': Print a general register name or the zero register
12041 (32-bit or 64-bit).
12042 '0': Print a normal operand, if it's a general register,
12043 then we assume DImode.
12044 'k': Print NZCV for conditional compare instructions.
12045 'K': Print a predicate register as pn<N> rather than p<N>
12046 'A': Output address constant representing the first
argument of X, specifying a relocation offset if appropriate.
12049 'L': Output constant address specified by X
12050 with a relocation offset if appropriate.
12051 'G': Prints address of X, specifying a PC relative
12052 relocation mode if appropriate.
12053 'y': Output address of LDP or STP - this is used for
12054 some LDP/STPs which don't use a PARALLEL in their
12055 pattern (so the mode needs to be adjusted).
12056 'z': Output address of a typical LDP or STP. */
12059 aarch64_print_operand (FILE *f
, rtx x
, int code
)
12065 if (CONST_INT_P (x
))
12066 fprintf (f
, HOST_WIDE_INT_PRINT_DEC
, INTVAL (x
));
12070 rtx base
= strip_offset_and_salt (x
, &offset
);
12071 if (SYMBOL_REF_P (base
))
12072 output_addr_const (f
, x
);
12074 output_operand_lossage ("unsupported operand for code '%c'", code
);
12080 x
= unwrap_const_vec_duplicate (x
);
12081 if (!CONST_INT_P (x
))
12083 output_operand_lossage ("invalid operand for '%%%c'", code
);
12087 HOST_WIDE_INT val
= INTVAL (x
);
12088 if ((val
& ~7) == 8 || val
== 0xff)
12090 else if ((val
& ~7) == 16 || val
== 0xffff)
12092 else if ((val
& ~7) == 32 || val
== 0xffffffff)
12096 output_operand_lossage ("invalid operand for '%%%c'", code
);
12106 if (!CONST_INT_P (x
) || (n
= exact_log2 (INTVAL (x
))) < 0)
12108 output_operand_lossage ("invalid operand for '%%%c'", code
);
12112 asm_fprintf (f
, "%d", n
);
12117 if (!CONST_INT_P (x
))
12119 output_operand_lossage ("invalid operand for '%%%c'", code
);
12123 asm_fprintf (f
, "%u", popcount_hwi (INTVAL (x
)));
12127 if (x
== const0_rtx
)
12129 asm_fprintf (f
, "xzr");
12133 if (!REG_P (x
) || !GP_REGNUM_P (REGNO (x
) + 1))
12135 output_operand_lossage ("invalid operand for '%%%c'", code
);
12139 asm_fprintf (f
, "%s", reg_names
[REGNO (x
) + 1]);
12144 x
= aarch64_bit_representation (unwrap_const_vec_duplicate (x
));
12145 if (CONST_INT_P (x
))
12146 asm_fprintf (f
, "%wd", INTVAL (x
));
12149 output_operand_lossage ("invalid operand for '%%%c'", code
);
12159 /* CONST_TRUE_RTX means al/nv (al is the default, don't print it). */
12160 if (x
== const_true_rtx
)
12167 if (!COMPARISON_P (x
))
12169 output_operand_lossage ("invalid operand for '%%%c'", code
);
12173 cond_code
= aarch64_get_condition_code (x
);
12174 gcc_assert (cond_code
>= 0);
12176 cond_code
= AARCH64_INVERSE_CONDITION_CODE (cond_code
);
12177 if (GET_MODE (XEXP (x
, 0)) == CC_NZCmode
)
12178 fputs (aarch64_sve_condition_codes
[cond_code
], f
);
12180 fputs (aarch64_condition_codes
[cond_code
], f
);
12185 if (!const_vec_duplicate_p (x
, &elt
))
12187 output_operand_lossage ("invalid vector constant");
12191 if (GET_MODE_CLASS (GET_MODE (x
)) == MODE_VECTOR_INT
)
12192 asm_fprintf (f
, "%wd", (HOST_WIDE_INT
) -UINTVAL (elt
));
12193 else if (GET_MODE_CLASS (GET_MODE (x
)) == MODE_VECTOR_FLOAT
12194 && aarch64_print_vector_float_operand (f
, x
, true))
12198 output_operand_lossage ("invalid vector constant");
12209 code
= TOLOWER (code
);
12210 if (!REG_P (x
) || !FP_REGNUM_P (REGNO (x
)))
12212 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code
);
12215 asm_fprintf (f
, "%c%d", code
, REGNO (x
) - V0_REGNUM
);
12222 if (!REG_P (x
) || (!FP_REGNUM_P (REGNO (x
)) && !PR_REGNUM_P (REGNO (x
))))
12224 output_operand_lossage ("incompatible operand for '%%%c'", code
);
12227 if (PR_REGNUM_P (REGNO (x
)))
12228 asm_fprintf (f
, "p%d", REGNO (x
) - P0_REGNUM
+ (code
- 'S'));
12230 asm_fprintf (f
, "%c%d",
12231 aarch64_sve_data_mode_p (GET_MODE (x
)) ? 'z' : 'v',
12232 REGNO (x
) - V0_REGNUM
+ (code
- 'S'));
12236 if (REG_P (x
) && FP_REGNUM_P (REGNO (x
))
12237 && (aarch64_advsimd_partial_struct_mode_p (GET_MODE (x
))))
12238 asm_fprintf (f
, "d%d", REGNO (x
) - V0_REGNUM
+ 1);
12239 else if (REG_P (x
) && FP_REGNUM_P (REGNO (x
)))
12240 asm_fprintf (f
, "q%d", REGNO (x
) - V0_REGNUM
+ 1);
12241 else if (REG_P (x
) && GP_REGNUM_P (REGNO (x
)))
12242 asm_fprintf (f
, "x%d", REGNO (x
) - R0_REGNUM
+ 1);
12244 output_operand_lossage ("incompatible register operand for '%%%c'",
12249 if (!CONST_INT_P (x
))
12251 output_operand_lossage ("invalid operand for '%%%c'", code
);
12254 asm_fprintf (f
, "0x%wx", UINTVAL (x
) & 0xffff);
12259 /* Print a replicated constant in hex. */
12260 if (!const_vec_duplicate_p (x
, &elt
) || !CONST_INT_P (elt
))
12262 output_operand_lossage ("invalid operand for '%%%c'", code
);
12265 scalar_mode inner_mode
= GET_MODE_INNER (GET_MODE (x
));
12266 asm_fprintf (f
, "0x%wx", UINTVAL (elt
) & GET_MODE_MASK (inner_mode
));
12272 /* Print a replicated constant in decimal, treating it as
12274 if (!const_vec_duplicate_p (x
, &elt
) || !CONST_INT_P (elt
))
12276 output_operand_lossage ("invalid operand for '%%%c'", code
);
12279 scalar_mode inner_mode
= GET_MODE_INNER (GET_MODE (x
));
12280 asm_fprintf (f
, "%wd", UINTVAL (elt
) & GET_MODE_MASK (inner_mode
));
12286 if (aarch64_const_zero_rtx_p (x
))
12288 asm_fprintf (f
, "%czr", code
);
12292 if (REG_P (x
) && GP_REGNUM_P (REGNO (x
)))
12294 asm_fprintf (f
, "%c%d", code
, REGNO (x
) - R0_REGNUM
);
12298 if (REG_P (x
) && REGNO (x
) == SP_REGNUM
)
12300 asm_fprintf (f
, "%ssp", code
== 'w' ? "w" : "");
12309 output_operand_lossage ("missing operand");
12313 switch (GET_CODE (x
))
12317 asm_fprintf (f
, "%s", XSTR (x
, 0));
12321 if (aarch64_sve_data_mode_p (GET_MODE (x
)))
12323 if (REG_NREGS (x
) == 1)
12324 asm_fprintf (f
, "z%d", REGNO (x
) - V0_REGNUM
);
12328 = sizetochar (GET_MODE_UNIT_BITSIZE (GET_MODE (x
)));
12329 asm_fprintf (f
, "{z%d.%c - z%d.%c}",
12330 REGNO (x
) - V0_REGNUM
, suffix
,
12331 END_REGNO (x
) - V0_REGNUM
- 1, suffix
);
12335 asm_fprintf (f
, "%s", reg_names
[REGNO (x
)]);
12339 output_address (GET_MODE (x
), XEXP (x
, 0));
12344 output_addr_const (asm_out_file
, x
);
12348 asm_fprintf (f
, "%wd", INTVAL (x
));
12352 if (!VECTOR_MODE_P (GET_MODE (x
)))
12354 output_addr_const (asm_out_file
, x
);
12360 if (!const_vec_duplicate_p (x
, &elt
))
12362 output_operand_lossage ("invalid vector constant");
12366 if (GET_MODE_CLASS (GET_MODE (x
)) == MODE_VECTOR_INT
)
12367 asm_fprintf (f
, "%wd", INTVAL (elt
));
12368 else if (GET_MODE_CLASS (GET_MODE (x
)) == MODE_VECTOR_FLOAT
12369 && aarch64_print_vector_float_operand (f
, x
, false))
12373 output_operand_lossage ("invalid vector constant");
12379 /* Since we define TARGET_SUPPORTS_WIDE_INT we shouldn't ever
12380 be getting CONST_DOUBLEs holding integers. */
12381 gcc_assert (GET_MODE (x
) != VOIDmode
);
12382 if (aarch64_float_const_zero_rtx_p (x
))
12387 else if (aarch64_float_const_representable_p (x
))
12389 #define buf_size 20
12390 char float_buf
[buf_size
] = {'\0'};
12391 real_to_decimal_for_mode (float_buf
,
12392 CONST_DOUBLE_REAL_VALUE (x
),
12393 buf_size
, buf_size
,
12395 asm_fprintf (asm_out_file
, "%s", float_buf
);
12399 output_operand_lossage ("invalid constant");
12402 output_operand_lossage ("invalid operand");
12408 if (GET_CODE (x
) == HIGH
)
12411 switch (aarch64_classify_symbolic_expression (x
))
12413 case SYMBOL_SMALL_GOT_4G
:
12414 asm_fprintf (asm_out_file
, ":got:");
12417 case SYMBOL_SMALL_TLSGD
:
12418 asm_fprintf (asm_out_file
, ":tlsgd:");
12421 case SYMBOL_SMALL_TLSDESC
:
12422 asm_fprintf (asm_out_file
, ":tlsdesc:");
12425 case SYMBOL_SMALL_TLSIE
:
12426 asm_fprintf (asm_out_file
, ":gottprel:");
12429 case SYMBOL_TLSLE24
:
12430 asm_fprintf (asm_out_file
, ":tprel:");
12433 case SYMBOL_TINY_GOT
:
12434 gcc_unreachable ();
12440 output_addr_const (asm_out_file
, x
);
12444 switch (aarch64_classify_symbolic_expression (x
))
12446 case SYMBOL_SMALL_GOT_4G
:
12447 asm_fprintf (asm_out_file
, ":got_lo12:");
12450 case SYMBOL_SMALL_TLSGD
:
12451 asm_fprintf (asm_out_file
, ":tlsgd_lo12:");
12454 case SYMBOL_SMALL_TLSDESC
:
12455 asm_fprintf (asm_out_file
, ":tlsdesc_lo12:");
12458 case SYMBOL_SMALL_TLSIE
:
12459 asm_fprintf (asm_out_file
, ":gottprel_lo12:");
12462 case SYMBOL_TLSLE12
:
12463 asm_fprintf (asm_out_file
, ":tprel_lo12:");
12466 case SYMBOL_TLSLE24
:
12467 asm_fprintf (asm_out_file
, ":tprel_lo12_nc:");
12470 case SYMBOL_TINY_GOT
:
12471 asm_fprintf (asm_out_file
, ":got:");
12474 case SYMBOL_TINY_TLSIE
:
12475 asm_fprintf (asm_out_file
, ":gottprel:");
12481 output_addr_const (asm_out_file
, x
);
12485 switch (aarch64_classify_symbolic_expression (x
))
12487 case SYMBOL_TLSLE24
:
12488 asm_fprintf (asm_out_file
, ":tprel_hi12:");
12493 output_addr_const (asm_out_file
, x
);
12498 HOST_WIDE_INT cond_code
;
12500 if (!CONST_INT_P (x
))
12502 output_operand_lossage ("invalid operand for '%%%c'", code
);
12506 cond_code
= INTVAL (x
);
12507 gcc_assert (cond_code
>= 0 && cond_code
<= AARCH64_NV
);
12508 asm_fprintf (f
, "%d", aarch64_nzcv_codes
[cond_code
]);
12513 if (!REG_P (x
) || !PR_REGNUM_P (REGNO (x
)))
12515 output_operand_lossage ("invalid operand for '%%%c'", code
);
12518 asm_fprintf (f
, "pn%d", REGNO (x
) - P0_REGNUM
);
12524 machine_mode mode
= GET_MODE (x
);
12528 && maybe_ne (GET_MODE_SIZE (mode
), 8)
12529 && maybe_ne (GET_MODE_SIZE (mode
), 16)
12530 && maybe_ne (GET_MODE_SIZE (mode
), 32)))
12532 output_operand_lossage ("invalid operand for '%%%c'", code
);
12536 if (!aarch64_print_address_internal (f
, mode
, XEXP (x
, 0),
12538 ? ADDR_QUERY_LDP_STP_N
12539 : ADDR_QUERY_LDP_STP
))
12540 output_operand_lossage ("invalid operand prefix '%%%c'", code
);
12545 output_operand_lossage ("invalid operand prefix '%%%c'", code
);
12550 /* Print address 'x' of a memory access with mode 'mode'.
12551 'op' is the context required by aarch64_classify_address. It can either be
12552 MEM for a normal memory access or PARALLEL for LDP/STP. */
12554 aarch64_print_address_internal (FILE *f
, machine_mode mode
, rtx x
,
12555 aarch64_addr_query_type type
)
12557 struct aarch64_address_info addr
;
12558 unsigned int size
, vec_flags
;
12560 /* Check all addresses are Pmode - including ILP32. */
12561 if (GET_MODE (x
) != Pmode
12562 && (!CONST_INT_P (x
)
12563 || trunc_int_for_mode (INTVAL (x
), Pmode
) != INTVAL (x
)))
12565 output_operand_lossage ("invalid address mode");
12569 const bool load_store_pair_p
= (type
== ADDR_QUERY_LDP_STP
12570 || type
== ADDR_QUERY_LDP_STP_N
);
12572 if (aarch64_classify_address (&addr
, x
, mode
, true, type
))
12575 case ADDRESS_REG_IMM
:
12576 if (known_eq (addr
.const_offset
, 0))
12578 asm_fprintf (f
, "[%s]", reg_names
[REGNO (addr
.base
)]);
12582 vec_flags
= aarch64_classify_vector_mode (mode
);
12583 if ((vec_flags
& VEC_ANY_SVE
) && !load_store_pair_p
)
12586 = exact_div (addr
.const_offset
,
12587 aarch64_vl_bytes (mode
, vec_flags
)).to_constant ();
12588 asm_fprintf (f
, "[%s, #%wd, mul vl]",
12589 reg_names
[REGNO (addr
.base
)], vnum
);
12593 if (!CONST_INT_P (addr
.offset
))
12596 asm_fprintf (f
, "[%s, %wd]", reg_names
[REGNO (addr
.base
)],
12597 INTVAL (addr
.offset
));
12600 case ADDRESS_REG_REG
:
12601 if (addr
.shift
== 0)
12602 asm_fprintf (f
, "[%s, %s]", reg_names
[REGNO (addr
.base
)],
12603 reg_names
[REGNO (addr
.offset
)]);
12605 asm_fprintf (f
, "[%s, %s, lsl %u]", reg_names
[REGNO (addr
.base
)],
12606 reg_names
[REGNO (addr
.offset
)], addr
.shift
);
12609 case ADDRESS_REG_UXTW
:
12610 if (addr
.shift
== 0)
12611 asm_fprintf (f
, "[%s, w%d, uxtw]", reg_names
[REGNO (addr
.base
)],
12612 REGNO (addr
.offset
) - R0_REGNUM
);
12614 asm_fprintf (f
, "[%s, w%d, uxtw %u]", reg_names
[REGNO (addr
.base
)],
12615 REGNO (addr
.offset
) - R0_REGNUM
, addr
.shift
);
12618 case ADDRESS_REG_SXTW
:
12619 if (addr
.shift
== 0)
12620 asm_fprintf (f
, "[%s, w%d, sxtw]", reg_names
[REGNO (addr
.base
)],
12621 REGNO (addr
.offset
) - R0_REGNUM
);
12623 asm_fprintf (f
, "[%s, w%d, sxtw %u]", reg_names
[REGNO (addr
.base
)],
12624 REGNO (addr
.offset
) - R0_REGNUM
, addr
.shift
);
12627 case ADDRESS_REG_WB
:
12628 /* Writeback is only supported for fixed-width modes. */
12629 size
= GET_MODE_SIZE (mode
).to_constant ();
12630 switch (GET_CODE (x
))
12633 asm_fprintf (f
, "[%s, %d]!", reg_names
[REGNO (addr
.base
)], size
);
12636 asm_fprintf (f
, "[%s], %d", reg_names
[REGNO (addr
.base
)], size
);
12639 asm_fprintf (f
, "[%s, -%d]!", reg_names
[REGNO (addr
.base
)], size
);
12642 asm_fprintf (f
, "[%s], -%d", reg_names
[REGNO (addr
.base
)], size
);
12645 asm_fprintf (f
, "[%s, %wd]!", reg_names
[REGNO (addr
.base
)],
12646 INTVAL (addr
.offset
));
12649 asm_fprintf (f
, "[%s], %wd", reg_names
[REGNO (addr
.base
)],
12650 INTVAL (addr
.offset
));
12657 case ADDRESS_LO_SUM
:
12658 asm_fprintf (f
, "[%s, #:lo12:", reg_names
[REGNO (addr
.base
)]);
12659 output_addr_const (f
, addr
.offset
);
12660 asm_fprintf (f
, "]");
12663 case ADDRESS_SYMBOLIC
:
12664 output_addr_const (f
, x
);
12671 /* Print address 'x' of a memory access with mode 'mode'. */
12673 aarch64_print_operand_address (FILE *f
, machine_mode mode
, rtx x
)
12675 if (!aarch64_print_address_internal (f
, mode
, x
, ADDR_QUERY_ANY
))
12676 output_addr_const (f
, x
);
12679 /* Implement TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA. */
12682 aarch64_output_addr_const_extra (FILE *file
, rtx x
)
12684 if (GET_CODE (x
) == UNSPEC
&& XINT (x
, 1) == UNSPEC_SALT_ADDR
)
12686 output_addr_const (file
, XVECEXP (x
, 0, 0));
12693 aarch64_label_mentioned_p (rtx x
)
12698 if (LABEL_REF_P (x
))
12701 /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
12702 referencing instruction, but they are constant offsets, not
12704 if (GET_CODE (x
) == UNSPEC
&& XINT (x
, 1) == UNSPEC_TLS
)
12707 fmt
= GET_RTX_FORMAT (GET_CODE (x
));
12708 for (i
= GET_RTX_LENGTH (GET_CODE (x
)) - 1; i
>= 0; i
--)
12714 for (j
= XVECLEN (x
, i
) - 1; j
>= 0; j
--)
12715 if (aarch64_label_mentioned_p (XVECEXP (x
, i
, j
)))
12718 else if (fmt
[i
] == 'e' && aarch64_label_mentioned_p (XEXP (x
, i
)))
/* Implement REGNO_REG_CLASS.  */

enum reg_class
aarch64_regno_regclass (unsigned regno)
{
  if (W8_W11_REGNUM_P (regno))
    return W8_W11_REGS;

  if (W12_W15_REGNUM_P (regno))
    return W12_W15_REGS;

  if (STUB_REGNUM_P (regno))
    return STUB_REGS;

  if (GP_REGNUM_P (regno))
    return GENERAL_REGS;

  if (regno == SP_REGNUM)
    return STACK_REG;

  if (regno == FRAME_POINTER_REGNUM
      || regno == ARG_POINTER_REGNUM)
    return POINTER_REGS;

  if (FP_REGNUM_P (regno))
    return (FP_LO8_REGNUM_P (regno) ? FP_LO8_REGS
            : FP_LO_REGNUM_P (regno) ? FP_LO_REGS : FP_REGS);

  if (PR_REGNUM_P (regno))
    return PR_LO_REGNUM_P (regno) ? PR_LO_REGS : PR_HI_REGS;

  if (regno == FPM_REGNUM)
    return MOVEABLE_SYSREGS;

  if (regno == FFR_REGNUM || regno == FFRT_REGNUM)
    return FFR_REGS;

  if (FAKE_REGNUM_P (regno))
    return FAKE_REGS;

  return NO_REGS;
}
/* OFFSET is an address offset for mode MODE, which has SIZE bytes.
   If OFFSET is out of range, return an offset of an anchor point
   that is in range.  Return 0 otherwise.  */

static HOST_WIDE_INT
aarch64_anchor_offset (HOST_WIDE_INT offset, HOST_WIDE_INT size,
                       machine_mode mode)
{
  /* Does it look like we'll need a 16-byte load/store-pair operation?  */
  if (size > 16)
    return (offset + 0x400) & ~0x7f0;

  /* For offsets that aren't a multiple of the access size, the limit is
     -256...255.  */
  if (offset & (size - 1))
    {
      /* BLKmode typically uses LDP of X-registers.  */
      if (mode == BLKmode)
        return (offset + 512) & ~0x3ff;
      return (offset + 0x100) & ~0x1ff;
    }

  /* Small negative offsets are supported.  */
  if (IN_RANGE (offset, -256, 0))
    return 0;

  if (mode == TImode || mode == TFmode || mode == TDmode)
    return (offset + 0x100) & ~0x1ff;

  /* Use 12-bit offset by access size.  */
  return offset & (~0xfff * size);
}
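/* Worked example (arithmetic illustration only): for a 4-byte access with
   OFFSET == 0x12344, the offset is a multiple of the access size, so the
   final case applies and the anchor is 0x12344 & (~0xfff * 4) == 0x10000,
   leaving 0x2344 to be encoded as a scaled 12-bit LDR/STR offset.  */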
12802 aarch64_legitimize_address (rtx x
, rtx
/* orig_x */, machine_mode mode
)
12805 rtx tmp
= legitimize_pe_coff_symbol (x
, true);
12810 /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
12811 where mask is selected by alignment and size of the offset.
12812 We try to pick as large a range for the offset as possible to
12813 maximize the chance of a CSE. However, for aligned addresses
12814 we limit the range to 4k so that structures with different sized
12815 elements are likely to use the same base. We need to be careful
12816 not to split a CONST for some forms of address expression, otherwise
12817 it will generate sub-optimal code. */
12819 /* First split X + CONST (base, offset) into (base + X) + offset. */
12820 if (GET_CODE (x
) == PLUS
&& GET_CODE (XEXP (x
, 1)) == CONST
)
12823 rtx base
= strip_offset (XEXP (x
, 1), &offset
);
12825 base
= expand_binop (Pmode
, add_optab
, base
, XEXP (x
, 0),
12826 NULL_RTX
, true, OPTAB_DIRECT
);
12827 x
= plus_constant (Pmode
, base
, offset
);
12830 if (GET_CODE (x
) == PLUS
&& CONST_INT_P (XEXP (x
, 1)))
12832 rtx base
= XEXP (x
, 0);
12833 rtx offset_rtx
= XEXP (x
, 1);
12834 HOST_WIDE_INT offset
= INTVAL (offset_rtx
);
12836 if (GET_CODE (base
) == PLUS
)
12838 rtx op0
= XEXP (base
, 0);
12839 rtx op1
= XEXP (base
, 1);
12841 /* Force any scaling into a temp for CSE. */
12842 op0
= force_reg (Pmode
, op0
);
12843 op1
= force_reg (Pmode
, op1
);
12845 /* Let the pointer register be in op0. */
12846 if (REG_POINTER (op1
))
12847 std::swap (op0
, op1
);
12849 /* If the pointer is virtual or frame related, then we know that
12850 virtual register instantiation or register elimination is going
12851 to apply a second constant. We want the two constants folded
12852 together easily. Therefore, emit as (OP0 + CONST) + OP1. */
12853 if (virt_or_elim_regno_p (REGNO (op0
)))
12855 base
= expand_binop (Pmode
, add_optab
, op0
, offset_rtx
,
12856 NULL_RTX
, true, OPTAB_DIRECT
);
12857 return gen_rtx_PLUS (Pmode
, base
, op1
);
12860 /* Otherwise, in order to encourage CSE (and thence loop strength
12861 reduce) scaled addresses, emit as (OP0 + OP1) + CONST. */
12862 base
= expand_binop (Pmode
, add_optab
, op0
, op1
,
12863 NULL_RTX
, true, OPTAB_DIRECT
);
12864 x
= gen_rtx_PLUS (Pmode
, base
, offset_rtx
);
12867 HOST_WIDE_INT size
;
12868 if (GET_MODE_SIZE (mode
).is_constant (&size
))
12870 HOST_WIDE_INT base_offset
= aarch64_anchor_offset (offset
, size
,
12872 if (base_offset
!= 0)
12874 base
= plus_constant (Pmode
, base
, base_offset
);
12875 base
= force_operand (base
, NULL_RTX
);
12876 return plus_constant (Pmode
, base
, offset
- base_offset
);
12885 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED
, rtx x
,
12886 reg_class_t rclass
,
12888 secondary_reload_info
*sri
)
12890 /* Use aarch64_sve_reload_mem for SVE memory reloads that cannot use
12891 LDR and STR. See the comment at the head of aarch64-sve.md for
12892 more details about the big-endian handling. */
12893 unsigned int vec_flags
= aarch64_classify_vector_mode (mode
);
12894 if (reg_class_subset_p (rclass
, FP_REGS
)
12895 && !((REG_P (x
) && HARD_REGISTER_P (x
))
12896 || aarch64_simd_valid_immediate (x
, NULL
))
12897 && mode
!= VNx16QImode
12898 && (vec_flags
& VEC_SVE_DATA
)
12899 && ((vec_flags
& VEC_PARTIAL
) || BYTES_BIG_ENDIAN
))
12901 sri
->icode
= CODE_FOR_aarch64_sve_reload_mem
;
12905 /* If we have to disable direct literal pool loads and stores because the
12906 function is too big, then we need a scratch register. */
12907 if (MEM_P (x
) && SYMBOL_REF_P (x
) && CONSTANT_POOL_ADDRESS_P (x
)
12908 && (SCALAR_FLOAT_MODE_P (GET_MODE (x
))
12909 || targetm
.vector_mode_supported_p (GET_MODE (x
)))
12910 && !aarch64_pcrelative_literal_loads
)
12912 sri
->icode
= code_for_aarch64_reload_movcp (mode
, DImode
);
12916 /* Without the TARGET_SIMD or TARGET_SVE instructions we cannot move a
12917 Q register to a Q register directly. We need a scratch. */
12922 || (vec_flags
== VEC_ADVSIMD
&& known_eq (GET_MODE_SIZE (mode
), 16)))
12923 && mode
== GET_MODE (x
)
12925 && FP_REGNUM_P (REGNO (x
))
12926 && reg_class_subset_p (rclass
, FP_REGS
))
12928 sri
->icode
= code_for_aarch64_reload_mov (mode
);
12932 /* A TFmode, TImode or TDmode memory access should be handled via an FP_REGS
12933 because AArch64 has richer addressing modes for LDR/STR instructions
12934 than LDP/STP instructions. */
12935 if (TARGET_FLOAT
&& rclass
== GENERAL_REGS
12936 && known_eq (GET_MODE_SIZE (mode
), 16) && MEM_P (x
))
12939 if (rclass
== FP_REGS
12940 && (mode
== TImode
|| mode
== TFmode
|| mode
== TDmode
)
12942 return GENERAL_REGS
;
/* Implement TARGET_SECONDARY_MEMORY_NEEDED.  */

static bool
aarch64_secondary_memory_needed (machine_mode mode, reg_class_t class1,
                                 reg_class_t class2)
{
  if (!TARGET_SIMD
      && reg_classes_intersect_p (class1, FP_REGS)
      && reg_classes_intersect_p (class2, FP_REGS))
    {
      /* We can't do a 128-bit FPR-to-FPR move without TARGET_SIMD,
         so we can't easily split a move involving tuples of 128-bit
         vectors.  Force the copy through memory instead.

         (Tuples of 64-bit vectors are fine.)  */
      unsigned int vec_flags = aarch64_classify_vector_mode (mode);
      if (vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
        return true;
    }
  return false;
}

/* Implement TARGET_FRAME_POINTER_REQUIRED.  */

static bool
aarch64_frame_pointer_required ()
{
  /* If the function needs to record the incoming value of PSTATE.SM,
     make sure that the slot is accessible from the frame pointer.  */
  return aarch64_need_old_pstate_sm ();
}
bool
aarch64_can_eliminate (const int from ATTRIBUTE_UNUSED, const int to)
{
  gcc_assert (from == ARG_POINTER_REGNUM || from == FRAME_POINTER_REGNUM);

  /* If we need a frame pointer, ARG_POINTER_REGNUM and FRAME_POINTER_REGNUM
     can only eliminate to HARD_FRAME_POINTER_REGNUM.  */
  if (frame_pointer_needed)
    return to == HARD_FRAME_POINTER_REGNUM;
  return true;
}

poly_int64
aarch64_initial_elimination_offset (unsigned from, unsigned to)
{
  aarch64_frame &frame = cfun->machine->frame;

  if (to == HARD_FRAME_POINTER_REGNUM)
    {
      if (from == ARG_POINTER_REGNUM)
        return frame.bytes_above_hard_fp;

      if (from == FRAME_POINTER_REGNUM)
        return frame.bytes_above_hard_fp - frame.bytes_above_locals;
    }

  if (to == STACK_POINTER_REGNUM)
    {
      if (from == FRAME_POINTER_REGNUM)
        return frame.frame_size - frame.bytes_above_locals;
    }

  return frame.frame_size;
}
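/* Descriptive note derived from the arithmetic above: taking the incoming
   stack pointer (the argument pointer) as the reference point, the soft
   frame pointer sits bytes_above_locals below it, the hard frame pointer
   sits bytes_above_hard_fp below it, and the final stack pointer sits
   frame_size below it.  The offsets returned above are simply the
   differences between those positions.  */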
/* Get return address without mangling.  */

rtx
aarch64_return_addr_rtx (void)
{
  rtx val = get_hard_reg_initial_val (Pmode, LR_REGNUM);
  /* Note: aarch64_return_address_signing_enabled only
     works after cfun->machine->frame.laid_out is set,
     so here we don't know if the return address will
     be signed or not.  */
  rtx lr = gen_rtx_REG (Pmode, LR_REGNUM);
  emit_move_insn (lr, val);
  emit_insn (GEN_FCN (CODE_FOR_xpaclri) ());
  return lr;
}
13032 /* Implement RETURN_ADDR_RTX. We do not support moving back to a
13036 aarch64_return_addr (int count
, rtx frame ATTRIBUTE_UNUSED
)
13040 return aarch64_return_addr_rtx ();
13044 aarch64_asm_trampoline_template (FILE *f
)
13046 /* Even if the current function doesn't have branch protection, some
13047 later function might, so since this template is only generated once
13048 we have to add a BTI just in case. */
13049 asm_fprintf (f
, "\thint\t34 // bti c\n");
13053 asm_fprintf (f
, "\tldr\tw%d, .+20\n", IP1_REGNUM
- R0_REGNUM
);
13054 asm_fprintf (f
, "\tldr\tw%d, .+20\n", STATIC_CHAIN_REGNUM
- R0_REGNUM
);
13058 asm_fprintf (f
, "\tldr\t%s, .+20\n", reg_names
[IP1_REGNUM
]);
13059 asm_fprintf (f
, "\tldr\t%s, .+24\n", reg_names
[STATIC_CHAIN_REGNUM
]);
13061 asm_fprintf (f
, "\tbr\t%s\n", reg_names
[IP1_REGNUM
]);
13063 /* We always emit a speculation barrier.
13064 This is because the same trampoline template is used for every nested
function.  Since nested functions are not particularly common or
performant we don't worry too much about the extra instructions to copy
around.
13068 This is not yet a problem, since we have not yet implemented function
13069 specific attributes to choose between hardening against straight line
13070 speculation or not, but such function specific attributes are likely to
13071 happen in the future. */
13072 asm_fprintf (f
, "\tdsb\tsy\n\tisb\n");
13074 assemble_aligned_integer (POINTER_BYTES
, const0_rtx
);
13075 assemble_aligned_integer (POINTER_BYTES
, const0_rtx
);
13079 aarch64_trampoline_init (rtx m_tramp
, tree fndecl
, rtx chain_value
)
13081 rtx fnaddr
, mem
, a_tramp
;
13082 const int tramp_code_sz
= 24;
13084 /* Don't need to copy the trailing D-words, we fill those in below. */
13085 /* We create our own memory address in Pmode so that `emit_block_move` can
13086 use parts of the backend which expect Pmode addresses. */
13087 rtx temp
= convert_memory_address (Pmode
, XEXP (m_tramp
, 0));
13088 emit_block_move (gen_rtx_MEM (BLKmode
, temp
),
13089 assemble_trampoline_template (),
13090 GEN_INT (tramp_code_sz
), BLOCK_OP_NORMAL
);
13091 mem
= adjust_address (m_tramp
, ptr_mode
, tramp_code_sz
);
13092 fnaddr
= XEXP (DECL_RTL (fndecl
), 0);
13093 if (GET_MODE (fnaddr
) != ptr_mode
)
13094 fnaddr
= convert_memory_address (ptr_mode
, fnaddr
);
13095 emit_move_insn (mem
, fnaddr
);
13097 mem
= adjust_address (m_tramp
, ptr_mode
, tramp_code_sz
+ POINTER_BYTES
);
13098 emit_move_insn (mem
, chain_value
);
13100 /* XXX We should really define a "clear_cache" pattern and use
13101 gen_clear_cache(). */
13102 a_tramp
= XEXP (m_tramp
, 0);
13103 maybe_emit_call_builtin___clear_cache (a_tramp
,
13104 plus_constant (ptr_mode
,
13109 static unsigned char
13110 aarch64_class_max_nregs (reg_class_t regclass
, machine_mode mode
)
13112 /* ??? Logically we should only need to provide a value when
13113 HARD_REGNO_MODE_OK says that at least one register in REGCLASS
13114 can hold MODE, but at the moment we need to handle all modes.
13115 Just ignore any runtime parts for registers that can't store them. */
13116 HOST_WIDE_INT lowest_size
= constant_lower_bound (GET_MODE_SIZE (mode
));
13117 unsigned int nregs
, vec_flags
;
13123 case TAILCALL_ADDR_REGS
:
13127 case POINTER_AND_FP_REGS
:
13131 vec_flags
= aarch64_classify_vector_mode (mode
);
13132 if ((vec_flags
& VEC_SVE_DATA
)
13133 && constant_multiple_p (GET_MODE_SIZE (mode
),
13134 aarch64_vl_bytes (mode
, vec_flags
), &nregs
))
13136 if (vec_flags
== (VEC_ADVSIMD
| VEC_STRUCT
| VEC_PARTIAL
))
13137 return GET_MODE_SIZE (mode
).to_constant () / 8;
13138 return (vec_flags
& VEC_ADVSIMD
13139 ? CEIL (lowest_size
, UNITS_PER_VREG
)
13140 : CEIL (lowest_size
, UNITS_PER_WORD
));
13145 return mode
== VNx32BImode
? 2 : 1;
13147 case MOVEABLE_SYSREGS
:
13150 case PR_AND_FFR_REGS
:
13160 gcc_unreachable ();
13164 aarch64_preferred_reload_class (rtx x
, reg_class_t regclass
)
13166 if (regclass
== POINTER_REGS
)
13167 return GENERAL_REGS
;
13169 if (regclass
== STACK_REG
)
13172 && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x
)), POINTER_REGS
))
/* Register elimination can result in a request for
   SP+constant->FP_REGS.  We cannot support such operations which
   use SP as source and an FP_REG as destination, so reject out
   of hand.  */
13182 if (! reg_class_subset_p (regclass
, GENERAL_REGS
) && GET_CODE (x
) == PLUS
)
13184 rtx lhs
= XEXP (x
, 0);
13186 /* Look through a possible SUBREG introduced by ILP32. */
13187 if (SUBREG_P (lhs
))
13188 lhs
= SUBREG_REG (lhs
);
13190 gcc_assert (REG_P (lhs
));
13191 gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs
)),
13200 aarch64_asm_output_labelref (FILE* f
, const char *name
)
13202 asm_fprintf (f
, "%U%s", name
);
13206 aarch64_elf_asm_constructor (rtx symbol
, int priority
)
13208 if (priority
== DEFAULT_INIT_PRIORITY
)
13209 default_ctor_section_asm_out_constructor (symbol
, priority
);
13213 /* While priority is known to be in range [0, 65535], so 18 bytes
13214 would be enough, the compiler might not know that. To avoid
13215 -Wformat-truncation false positive, use a larger size. */
13217 snprintf (buf
, sizeof (buf
), ".init_array.%.5u", priority
);
13218 s
= get_section (buf
, SECTION_WRITE
| SECTION_NOTYPE
, NULL
);
13219 switch_to_section (s
);
13220 assemble_align (POINTER_SIZE
);
13221 assemble_aligned_integer (POINTER_BYTES
, symbol
);
13226 aarch64_elf_asm_destructor (rtx symbol
, int priority
)
13228 if (priority
== DEFAULT_INIT_PRIORITY
)
13229 default_dtor_section_asm_out_destructor (symbol
, priority
);
13233 /* While priority is known to be in range [0, 65535], so 18 bytes
13234 would be enough, the compiler might not know that. To avoid
13235 -Wformat-truncation false positive, use a larger size. */
13237 snprintf (buf
, sizeof (buf
), ".fini_array.%.5u", priority
);
13238 s
= get_section (buf
, SECTION_WRITE
| SECTION_NOTYPE
, NULL
);
13239 switch_to_section (s
);
13240 assemble_align (POINTER_SIZE
);
13241 assemble_aligned_integer (POINTER_BYTES
, symbol
);
13246 aarch64_output_casesi (rtx
*operands
)
13250 rtx diff_vec
= PATTERN (NEXT_INSN (as_a
<rtx_insn
*> (operands
[2])));
13252 static const char *const patterns
[4][2] =
13255 "ldrb\t%w3, [%0,%w1,uxtw]",
13256 "add\t%3, %4, %w3, sxtb #2"
13259 "ldrh\t%w3, [%0,%w1,uxtw #1]",
13260 "add\t%3, %4, %w3, sxth #2"
13263 "ldr\t%w3, [%0,%w1,uxtw #2]",
13264 "add\t%3, %4, %w3, sxtw #2"
13266 /* We assume that DImode is only generated when not optimizing and
13267 that we don't really need 64-bit address offsets. That would
13268 imply an object file with 8GB of code in a single function! */
13270 "ldr\t%w3, [%0,%w1,uxtw #2]",
13271 "add\t%3, %4, %w3, sxtw #2"
13275 gcc_assert (GET_CODE (diff_vec
) == ADDR_DIFF_VEC
);
13277 scalar_int_mode mode
= as_a
<scalar_int_mode
> (GET_MODE (diff_vec
));
13278 index
= exact_log2 (GET_MODE_SIZE (mode
));
13280 gcc_assert (index
>= 0 && index
<= 3);
/* Need to implement table size reduction, by changing the code below.  */
13283 output_asm_insn (patterns
[index
][0], operands
);
13284 ASM_GENERATE_INTERNAL_LABEL (label
, "Lrtx", CODE_LABEL_NUMBER (operands
[2]));
13285 snprintf (buf
, sizeof (buf
),
13286 "adr\t%%4, %s", targetm
.strip_name_encoding (label
));
13287 output_asm_insn (buf
, operands
);
13288 output_asm_insn (patterns
[index
][1], operands
);
13289 output_asm_insn ("br\t%3", operands
);
13290 output_asm_insn (aarch64_sls_barrier (aarch64_harden_sls_retbr_p ()),
13292 assemble_label (asm_out_file
, label
);
13296 /* Return the asm string for an SME ZERO instruction whose 8-bit mask
13297 operand is MASK. */
13299 aarch64_output_sme_zero_za (rtx mask
)
13301 auto mask_val
= UINTVAL (mask
);
13305 if (mask_val
== 0xff)
13306 return "zero\t{ za }";
13308 static constexpr struct { unsigned char mask
; char letter
; } tiles
[] = {
13314 /* The last entry in the list has the form "za7.d }", but that's the
13315 same length as "za7.d, ". */
13316 static char buffer
[sizeof("zero\t{ ") + sizeof ("za7.d, ") * 8 + 1];
13317 for (auto &tile
: tiles
)
13319 unsigned int tile_mask
= tile
.mask
;
13320 unsigned int tile_index
= 0;
13321 unsigned int i
= snprintf (buffer
, sizeof (buffer
), "zero\t");
13322 const char *prefix
= "{ ";
13323 auto remaining_mask
= mask_val
;
13324 while (tile_mask
< 0x100)
13326 if ((remaining_mask
& tile_mask
) == tile_mask
)
13328 i
+= snprintf (buffer
+ i
, sizeof (buffer
) - i
, "%sza%d.%c",
13329 prefix
, tile_index
, tile
.letter
);
13331 remaining_mask
&= ~tile_mask
;
13336 if (remaining_mask
== 0)
13338 gcc_assert (i
+ 3 <= sizeof (buffer
));
13339 snprintf (buffer
+ i
, sizeof (buffer
) - i
, " }");
13343 gcc_unreachable ();
/* Return size in bits of an arithmetic operand which is shifted/scaled and
   masked such that it is suitable for a UXTB, UXTH, or UXTW extend
   operator.  */

int
aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
{
  if (shift >= 0 && shift <= 4)
    {
      int size;
      for (size = 8; size <= 32; size *= 2)
        {
          HOST_WIDE_INT bits = ((HOST_WIDE_INT) 1U << size) - 1;
          if (mask == bits << shift)
            return size;
        }
    }
  return 0;
}
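/* For example, aarch64_uxt_size (2, 0x3fc) == 8, because 0x3fc is 0xff
   shifted left by 2, i.e. a byte-sized value scaled by 4, which maps onto a
   UXTB operand.  (Illustrative note only.)  */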
/* Constant pools are per function only when PC relative
   literal loads are true or we are in the large memory
   model.  */

static inline bool
aarch64_can_use_per_function_literal_pools_p (void)
{
  return (aarch64_pcrelative_literal_loads
          || aarch64_cmodel == AARCH64_CMODEL_LARGE);
}

static bool
aarch64_use_blocks_for_constant_p (machine_mode, const_rtx)
{
  /* We can't use blocks for constants when we're using a per-function
     literal pool.  */
  return !aarch64_can_use_per_function_literal_pools_p ();
}

/* Select appropriate section for constants depending
   on where we place literal pools.  */

static section *
aarch64_select_rtx_section (machine_mode mode,
                            rtx x,
                            unsigned HOST_WIDE_INT align)
{
  if (aarch64_can_use_per_function_literal_pools_p ())
    return function_section (current_function_decl);

  return default_elf_select_rtx_section (mode, x, align);
}

/* Implement ASM_OUTPUT_POOL_EPILOGUE.  */
void
aarch64_asm_output_pool_epilogue (FILE *f, const char *, tree,
                                  HOST_WIDE_INT offset)
{
  /* When using per-function literal pools, we must ensure that any code
     section is aligned to the minimal instruction length, lest we get
     errors from the assembler re "unaligned instructions".  */
  if ((offset & 3) && aarch64_can_use_per_function_literal_pools_p ())
    ASM_OUTPUT_ALIGN (f, 2);
}
/* Helper function for rtx cost calculation.  Strip a shift expression
   from X.  Returns the inner operand if successful, or the original
   expression on failure.  */
static rtx
aarch64_strip_shift (rtx x)
{
  rtx op = x;

  /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
     we can convert both to ROR during final output.  */
  if ((GET_CODE (op) == ASHIFT
       || GET_CODE (op) == ASHIFTRT
       || GET_CODE (op) == LSHIFTRT
       || GET_CODE (op) == ROTATERT
       || GET_CODE (op) == ROTATE)
      && CONST_INT_P (XEXP (op, 1)))
    return XEXP (op, 0);

  if (GET_CODE (op) == MULT
      && CONST_INT_P (XEXP (op, 1))
      && ((unsigned) exact_log2 (INTVAL (XEXP (op, 1)))) < 64)
    return XEXP (op, 0);

  return x;
}

/* Helper function for rtx cost calculation.  Strip an extend
   expression from X.  Returns the inner operand if successful, or the
   original expression on failure.  We deal with a number of possible
   canonicalization variations here.  If STRIP_SHIFT is true, then
   we can strip off a shift also.  */
static rtx
aarch64_strip_extend (rtx x, bool strip_shift)
{
  scalar_int_mode mode;
  rtx op = x;

  if (!is_a <scalar_int_mode> (GET_MODE (op), &mode))
    return op;

  if (GET_CODE (op) == AND
      && GET_CODE (XEXP (op, 0)) == MULT
      && CONST_INT_P (XEXP (XEXP (op, 0), 1))
      && CONST_INT_P (XEXP (op, 1))
      && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op, 0), 1))),
			   INTVAL (XEXP (op, 1))) != 0)
    return XEXP (XEXP (op, 0), 0);

  /* Now handle extended register, as this may also have an optional
     left shift by 1..4.  */
  if (strip_shift
      && GET_CODE (op) == ASHIFT
      && CONST_INT_P (XEXP (op, 1))
      && ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) <= 4)
    op = XEXP (op, 0);

  if (GET_CODE (op) == ZERO_EXTEND
      || GET_CODE (op) == SIGN_EXTEND)
    op = XEXP (op, 0);

  if (op != x)
    return op;

  return x;
}

/* Helper function for rtx cost calculation.  Strip extension as well as any
   inner VEC_SELECT high-half from X.  Returns the inner vector operand if
   successful, or the original expression on failure.  */
static rtx
aarch64_strip_extend_vec_half (rtx x)
{
  if (GET_CODE (x) == ZERO_EXTEND || GET_CODE (x) == SIGN_EXTEND)
    {
      x = XEXP (x, 0);
      if (GET_CODE (x) == VEC_SELECT
	  && vec_series_highpart_p (GET_MODE (x), GET_MODE (XEXP (x, 0)),
				    XEXP (x, 1)))
	x = XEXP (x, 0);
    }
  return x;
}

/* Helper function for rtx cost calculation.  Strip VEC_DUPLICATE as well as
   any subsequent extend and VEC_SELECT from X.  Returns the inner scalar
   operand if successful, or the original expression on failure.  */
static rtx
aarch64_strip_duplicate_vec_elt (rtx x)
{
  if (GET_CODE (x) == VEC_DUPLICATE
      && is_a<scalar_mode> (GET_MODE (XEXP (x, 0))))
    {
      x = XEXP (x, 0);
      if (GET_CODE (x) == VEC_SELECT)
	x = XEXP (x, 0);
      else if ((GET_CODE (x) == ZERO_EXTEND || GET_CODE (x) == SIGN_EXTEND)
	       && GET_CODE (XEXP (x, 0)) == VEC_SELECT)
	x = XEXP (XEXP (x, 0), 0);
    }
  return x;
}
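
/* Illustrative sketch, not part of the build: what the strip helpers
   above return for a few common shapes.  The register numbers are
   arbitrary; the snippet only shows intent and is not wired into any
   pass.  */
#if 0
static void
aarch64_strip_helpers_example (void)
{
  rtx reg = gen_rtx_REG (DImode, R0_REGNUM);

  /* (ashift (reg) (const_int 3)) -> (reg): the shift can be folded into
     the arithmetic instruction, so costing looks through it.  */
  gcc_assert (aarch64_strip_shift
	      (gen_rtx_ASHIFT (DImode, reg, GEN_INT (3))) == reg);

  /* (mult (reg) (const_int 8)) is the canonical form of the same shift
     and is stripped in the same way.  */
  gcc_assert (aarch64_strip_shift
	      (gen_rtx_MULT (DImode, reg, GEN_INT (8))) == reg);

  /* (zero_extend:DI (reg:SI)) -> (reg:SI): extends are free in the
     extended-register forms of ADD/SUB.  */
  rtx reg_si = gen_rtx_REG (SImode, R1_REGNUM);
  gcc_assert (aarch64_strip_extend
	      (gen_rtx_ZERO_EXTEND (DImode, reg_si), true) == reg_si);
}
#endif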
/* Return true iff CODE is a shift supported in combination
   with arithmetic instructions.  */
static bool
aarch64_shift_p (enum rtx_code code)
{
  return code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT;
}

/* Return true iff X is a cheap shift without a sign extend.  */
static bool
aarch64_cheap_mult_shift_p (rtx x)
{
  rtx op0, op1;

  op0 = XEXP (x, 0);
  op1 = XEXP (x, 1);

  if (!(aarch64_tune_params.extra_tuning_flags
	& AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND))
    return false;

  if (GET_CODE (op0) == SIGN_EXTEND)
    return false;

  if (GET_CODE (x) == ASHIFT && CONST_INT_P (op1)
      && UINTVAL (op1) <= 4)
    return true;

  if (GET_CODE (x) != MULT || !CONST_INT_P (op1))
    return false;

  HOST_WIDE_INT l2 = exact_log2 (INTVAL (op1));

  if (l2 > 0 && l2 <= 4)
    return true;

  return false;
}
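
/* Illustrative sketch, not part of the build: worked examples of what the
   predicate above accepts, assuming the tuning in use sets
   AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND (otherwise it always returns
   false).  Register numbers are arbitrary.  */
#if 0
static void
aarch64_cheap_mult_shift_example (void)
{
  rtx reg = gen_rtx_REG (DImode, R0_REGNUM);

  /* Shift by an immediate of at most 4: cheap.  */
  gcc_assert (aarch64_cheap_mult_shift_p
	      (gen_rtx_ASHIFT (DImode, reg, GEN_INT (3))));

  /* The canonical MULT form of the same shift: also cheap.  */
  gcc_assert (aarch64_cheap_mult_shift_p
	      (gen_rtx_MULT (DImode, reg, GEN_INT (8))));

  /* A shift of 5 no longer fits the LSL #0..4 operand field.  */
  gcc_assert (!aarch64_cheap_mult_shift_p
	      (gen_rtx_ASHIFT (DImode, reg, GEN_INT (5))));
}
#endif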
13557 /* Helper function for rtx cost calculation. Calculate the cost of
13558 a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
13559 Return the calculated cost of the expression, recursing manually in to
13560 operands where needed. */
13563 aarch64_rtx_mult_cost (rtx x
, enum rtx_code code
, int outer
, bool speed
)
13566 const struct cpu_cost_table
*extra_cost
13567 = aarch64_tune_params
.insn_extra_cost
;
13569 bool compound_p
= (outer
== PLUS
|| outer
== MINUS
);
13570 machine_mode mode
= GET_MODE (x
);
13572 gcc_checking_assert (code
== MULT
);
13577 if (VECTOR_MODE_P (mode
))
13579 unsigned int vec_flags
= aarch64_classify_vector_mode (mode
);
13580 if (TARGET_SIMD
&& (vec_flags
& VEC_ADVSIMD
))
13582 /* The select-operand-high-half versions of the instruction have the
13583 same cost as the three vector version - don't add the costs of the
13584 extension or selection into the costs of the multiply. */
13585 op0
= aarch64_strip_extend_vec_half (op0
);
13586 op1
= aarch64_strip_extend_vec_half (op1
);
13587 /* The by-element versions of the instruction have the same costs as
13588 the normal 3-vector version. We make an assumption that the input
13589 to the VEC_DUPLICATE is already on the FP & SIMD side. This means
13590 costing of a MUL by element pre RA is a bit optimistic. */
13591 op0
= aarch64_strip_duplicate_vec_elt (op0
);
13592 op1
= aarch64_strip_duplicate_vec_elt (op1
);
13594 cost
+= rtx_cost (op0
, mode
, MULT
, 0, speed
);
13595 cost
+= rtx_cost (op1
, mode
, MULT
, 1, speed
);
13598 if (GET_CODE (x
) == MULT
)
13599 cost
+= extra_cost
->vect
.mult
;
13600 /* This is to catch the SSRA costing currently flowing here. */
13602 cost
+= extra_cost
->vect
.alu
;
13607 /* Integer multiply/fma. */
13608 if (GET_MODE_CLASS (mode
) == MODE_INT
)
13610 /* The multiply will be canonicalized as a shift, cost it as such. */
13611 if (aarch64_shift_p (GET_CODE (x
))
13612 || (CONST_INT_P (op1
)
13613 && exact_log2 (INTVAL (op1
)) > 0))
13615 bool is_extend
= GET_CODE (op0
) == ZERO_EXTEND
13616 || GET_CODE (op0
) == SIGN_EXTEND
;
13621 /* If the shift is considered cheap,
13622 then don't add any cost. */
13623 if (aarch64_cheap_mult_shift_p (x
))
13625 else if (REG_P (op1
))
13626 /* ARITH + shift-by-register. */
13627 cost
+= extra_cost
->alu
.arith_shift_reg
;
13628 else if (is_extend
)
13629 /* ARITH + extended register. We don't have a cost field
13630 for ARITH+EXTEND+SHIFT, so use extend_arith here. */
13631 cost
+= extra_cost
->alu
.extend_arith
;
13633 /* ARITH + shift-by-immediate. */
13634 cost
+= extra_cost
->alu
.arith_shift
;
13637 /* LSL (immediate). */
13638 cost
+= extra_cost
->alu
.shift
;
13641 /* Strip extends as we will have costed them in the case above. */
13643 op0
= aarch64_strip_extend (op0
, true);
13645 cost
+= rtx_cost (op0
, VOIDmode
, code
, 0, speed
);
13650 /* MNEG or [US]MNEGL. Extract the NEG operand and indicate that it's a
13651 compound and let the below cases handle it. After all, MNEG is a
13652 special-case alias of MSUB. */
13653 if (GET_CODE (op0
) == NEG
)
13655 op0
= XEXP (op0
, 0);
13659 /* Integer multiplies or FMAs have zero/sign extending variants. */
13660 if ((GET_CODE (op0
) == ZERO_EXTEND
13661 && GET_CODE (op1
) == ZERO_EXTEND
)
13662 || (GET_CODE (op0
) == SIGN_EXTEND
13663 && GET_CODE (op1
) == SIGN_EXTEND
))
13665 cost
+= rtx_cost (XEXP (op0
, 0), VOIDmode
, MULT
, 0, speed
);
13666 cost
+= rtx_cost (XEXP (op1
, 0), VOIDmode
, MULT
, 1, speed
);
13671 /* SMADDL/UMADDL/UMSUBL/SMSUBL. */
13672 cost
+= extra_cost
->mult
[0].extend_add
;
13674 /* MUL/SMULL/UMULL. */
13675 cost
+= extra_cost
->mult
[0].extend
;
13681 /* This is either an integer multiply or a MADD. In both cases
13682 we want to recurse and cost the operands. */
13683 cost
+= rtx_cost (op0
, mode
, MULT
, 0, speed
);
13684 cost
+= rtx_cost (op1
, mode
, MULT
, 1, speed
);
13690 cost
+= extra_cost
->mult
[mode
== DImode
].add
;
13693 cost
+= extra_cost
->mult
[mode
== DImode
].simple
;
13702 /* Floating-point FMA/FMUL can also support negations of the
13703 operands, unless the rounding mode is upward or downward in
13704 which case FNMUL is different than FMUL with operand negation. */
13705 bool neg0
= GET_CODE (op0
) == NEG
;
13706 bool neg1
= GET_CODE (op1
) == NEG
;
13707 if (compound_p
|| !flag_rounding_math
|| (neg0
&& neg1
))
13710 op0
= XEXP (op0
, 0);
13712 op1
= XEXP (op1
, 0);
13716 /* FMADD/FNMADD/FNMSUB/FMSUB. */
13717 cost
+= extra_cost
->fp
[mode
== DFmode
].fma
;
13720 cost
+= extra_cost
->fp
[mode
== DFmode
].mult
;
13723 cost
+= rtx_cost (op0
, mode
, MULT
, 0, speed
);
13724 cost
+= rtx_cost (op1
, mode
, MULT
, 1, speed
);
13730 aarch64_address_cost (rtx x
,
13732 addr_space_t as ATTRIBUTE_UNUSED
,
13735 enum rtx_code c
= GET_CODE (x
);
13736 const struct cpu_addrcost_table
*addr_cost
= aarch64_tune_params
.addr_cost
;
13737 struct aarch64_address_info info
;
13741 if (!aarch64_classify_address (&info
, x
, mode
, false))
13743 if (GET_CODE (x
) == CONST
|| SYMBOL_REF_P (x
))
13745 /* This is a CONST or SYMBOL ref which will be split
13746 in a different way depending on the code model in use.
13747 Cost it through the generic infrastructure. */
13748 int cost_symbol_ref
= rtx_cost (x
, Pmode
, MEM
, 1, speed
);
13749 /* Divide through by the cost of one instruction to
13750 bring it to the same units as the address costs. */
13751 cost_symbol_ref
/= COSTS_N_INSNS (1);
13752 /* The cost is then the cost of preparing the address,
13753 followed by an immediate (possibly 0) offset. */
13754 return cost_symbol_ref
+ addr_cost
->imm_offset
;
13758 /* This is most likely a jump table from a case
13760 return addr_cost
->register_offset
;
13766 case ADDRESS_LO_SUM
:
13767 case ADDRESS_SYMBOLIC
:
13768 case ADDRESS_REG_IMM
:
13769 cost
+= addr_cost
->imm_offset
;
13772 case ADDRESS_REG_WB
:
13773 if (c
== PRE_INC
|| c
== PRE_DEC
|| c
== PRE_MODIFY
)
13774 cost
+= addr_cost
->pre_modify
;
13775 else if (c
== POST_INC
|| c
== POST_DEC
|| c
== POST_MODIFY
)
13777 unsigned int nvectors
= aarch64_ldn_stn_vectors (mode
);
13779 cost
+= addr_cost
->post_modify_ld3_st3
;
13780 else if (nvectors
== 4)
13781 cost
+= addr_cost
->post_modify_ld4_st4
;
13783 cost
+= addr_cost
->post_modify
;
13786 gcc_unreachable ();
13790 case ADDRESS_REG_REG
:
13791 cost
+= addr_cost
->register_offset
;
13794 case ADDRESS_REG_SXTW
:
13795 cost
+= addr_cost
->register_sextend
;
13798 case ADDRESS_REG_UXTW
:
13799 cost
+= addr_cost
->register_zextend
;
13803 gcc_unreachable ();
13807 if (info
.shift
> 0)
13809 /* For the sake of calculating the cost of the shifted register
13810 component, we can treat same sized modes in the same way. */
13811 if (known_eq (GET_MODE_BITSIZE (mode
), 16))
13812 cost
+= addr_cost
->addr_scale_costs
.hi
;
13813 else if (known_eq (GET_MODE_BITSIZE (mode
), 32))
13814 cost
+= addr_cost
->addr_scale_costs
.si
;
13815 else if (known_eq (GET_MODE_BITSIZE (mode
), 64))
13816 cost
+= addr_cost
->addr_scale_costs
.di
;
13818 /* We can't tell, or this is a 128-bit vector. */
13819 cost
+= addr_cost
->addr_scale_costs
.ti
;
/* Return the cost of a branch.  If SPEED_P is true then the compiler is
   optimizing for speed.  If PREDICTABLE_P is true then the branch is predicted
   to be taken.  */
int
aarch64_branch_cost (bool speed_p, bool predictable_p)
{
  /* When optimizing for speed, use the cost of unpredictable branches.  */
  const struct cpu_branch_cost *branch_costs =
    aarch64_tune_params.branch_costs;

  if (!speed_p || predictable_p)
    return branch_costs->predictable;
  else
    return branch_costs->unpredictable;
}
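
/* Illustrative sketch, not part of the build: how the two cost fields are
   selected.  The comparison in the comments reflects the usual tuning
   convention (unpredictable branches at least as expensive as predictable
   ones), not a guarantee about every cost table.  */
#if 0
static void
aarch64_branch_cost_example (void)
{
  /* Cold code, or a branch the predictor handles well, uses the
     predictable cost; only hot, unpredictable branches pay more, which
     biases if-conversion towards hot code.  */
  int cold = aarch64_branch_cost (/*speed_p=*/false, /*predictable_p=*/false);
  int hot = aarch64_branch_cost (/*speed_p=*/true, /*predictable_p=*/false);
  (void) cold;
  (void) hot;
}
#endif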
/* Return true if X is a zero or sign extract
   usable in an ADD or SUB (extended register) instruction.  */
static bool
aarch64_rtx_arith_op_extract_p (rtx x)
{
  /* The simple case <ARITH>, XD, XN, XM, [us]xt.
     No shift.  */
  if (GET_CODE (x) == SIGN_EXTEND
      || GET_CODE (x) == ZERO_EXTEND)
    return REG_P (XEXP (x, 0));

  return false;
}

static bool
aarch64_frint_unspec_p (unsigned int u)
{
  switch (u)
    {
      case UNSPEC_FRINTZ:
      case UNSPEC_FRINTP:
      case UNSPEC_FRINTM:
      case UNSPEC_FRINTA:
      case UNSPEC_FRINTN:
      case UNSPEC_FRINTX:
      case UNSPEC_FRINTI:
	return true;

      default:
	return false;
    }
}

/* Return true iff X is an rtx that will match an extr instruction
   i.e. as described in the *extr<mode>5_insn family of patterns.
   OP0 and OP1 will be set to the operands of the shifts involved
   on success and will be NULL_RTX otherwise.  */
static bool
aarch64_extr_rtx_p (rtx x, rtx *res_op0, rtx *res_op1)
{
  rtx op0, op1;
  scalar_int_mode mode;
  if (!is_a <scalar_int_mode> (GET_MODE (x), &mode))
    return false;

  *res_op0 = NULL_RTX;
  *res_op1 = NULL_RTX;

  if (GET_CODE (x) != IOR)
    return false;

  op0 = XEXP (x, 0);
  op1 = XEXP (x, 1);

  if ((GET_CODE (op0) == ASHIFT && GET_CODE (op1) == LSHIFTRT)
      || (GET_CODE (op1) == ASHIFT && GET_CODE (op0) == LSHIFTRT))
    {
      /* Canonicalise locally to ashift in op0, lshiftrt in op1.  */
      if (GET_CODE (op1) == ASHIFT)
	std::swap (op0, op1);

      if (!CONST_INT_P (XEXP (op0, 1)) || !CONST_INT_P (XEXP (op1, 1)))
	return false;

      unsigned HOST_WIDE_INT shft_amnt_0 = UINTVAL (XEXP (op0, 1));
      unsigned HOST_WIDE_INT shft_amnt_1 = UINTVAL (XEXP (op1, 1));

      if (shft_amnt_0 < GET_MODE_BITSIZE (mode)
	  && shft_amnt_0 + shft_amnt_1 == GET_MODE_BITSIZE (mode))
	{
	  *res_op0 = XEXP (op0, 0);
	  *res_op1 = XEXP (op1, 0);
	  return true;
	}
    }

  return false;
}
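
/* Illustrative sketch, not part of the build: the EXTR condition above
   says (x << a) | (y >> b) is a single EXTR when a + b equals the mode
   width, with the extract starting at bit b of the concatenation x:y.
   The constants below are a worked example in plain C++.  */
#if 0
#include <cassert>
#include <cstdint>

int
main ()
{
  uint64_t x = 0x1122334455667788ULL, y = 0x99aabbccddeeff00ULL;
  unsigned a = 48, b = 16;	/* a + b == 64, so EXTR applies.  */
  /* Equivalent to EXTR Xd, Xn, Xm, #16 with Xn = x, Xm = y.  */
  assert (((x << a) | (y >> b)) == 0x778899aabbccddeeULL);
}
#endif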
13922 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
13923 storing it in *COST. Result is true if the total cost of the operation
13924 has now been calculated. */
13926 aarch64_if_then_else_costs (rtx op0
, rtx op1
, rtx op2
, int *cost
, bool speed
)
13930 enum rtx_code cmpcode
;
13931 const struct cpu_cost_table
*extra_cost
13932 = aarch64_tune_params
.insn_extra_cost
;
13934 if (COMPARISON_P (op0
))
13936 inner
= XEXP (op0
, 0);
13937 comparator
= XEXP (op0
, 1);
13938 cmpcode
= GET_CODE (op0
);
13943 comparator
= const0_rtx
;
13947 if (GET_CODE (op1
) == PC
|| GET_CODE (op2
) == PC
)
13949 /* Conditional branch. */
13950 if (GET_MODE_CLASS (GET_MODE (inner
)) == MODE_CC
)
13954 if (cmpcode
== NE
|| cmpcode
== EQ
)
13956 if (comparator
== const0_rtx
)
13958 /* TBZ/TBNZ/CBZ/CBNZ. */
13959 if (GET_CODE (inner
) == ZERO_EXTRACT
)
13961 *cost
+= rtx_cost (XEXP (inner
, 0), VOIDmode
,
13962 ZERO_EXTRACT
, 0, speed
);
13965 *cost
+= rtx_cost (inner
, VOIDmode
, cmpcode
, 0, speed
);
13969 if (register_operand (inner
, VOIDmode
)
13970 && aarch64_imm24 (comparator
, VOIDmode
))
13972 /* SUB and SUBS. */
13973 *cost
+= COSTS_N_INSNS (2);
13975 *cost
+= extra_cost
->alu
.arith
* 2;
13979 else if (cmpcode
== LT
|| cmpcode
== GE
)
13982 if (comparator
== const0_rtx
)
13987 else if (GET_MODE_CLASS (GET_MODE (inner
)) == MODE_CC
)
13990 if (GET_CODE (op1
) == COMPARE
)
13992 /* Increase cost of CCMP reg, 0, imm, CC to prefer CMP reg, 0. */
13993 if (XEXP (op1
, 1) == const0_rtx
)
13997 machine_mode mode
= GET_MODE (XEXP (op1
, 0));
13999 if (GET_MODE_CLASS (mode
) == MODE_INT
)
14000 *cost
+= extra_cost
->alu
.arith
;
14002 *cost
+= extra_cost
->fp
[mode
== DFmode
].compare
;
14007 /* It's a conditional operation based on the status flags,
14008 so it must be some flavor of CSEL. */
14010 /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL. */
14011 if (GET_CODE (op1
) == NEG
14012 || GET_CODE (op1
) == NOT
14013 || (GET_CODE (op1
) == PLUS
&& XEXP (op1
, 1) == const1_rtx
))
14014 op1
= XEXP (op1
, 0);
14015 else if (GET_CODE (op1
) == ZERO_EXTEND
&& GET_CODE (op2
) == ZERO_EXTEND
)
14017 /* CSEL with zero-extension (*cmovdi_insn_uxtw). */
14018 op1
= XEXP (op1
, 0);
14019 op2
= XEXP (op2
, 0);
14021 else if (GET_CODE (op1
) == ZERO_EXTEND
&& op2
== const0_rtx
)
14023 inner
= XEXP (op1
, 0);
14024 if (GET_CODE (inner
) == NEG
|| GET_CODE (inner
) == NOT
)
14025 /* CSINV/NEG with zero extend + const 0 (*csinv3_uxtw_insn3). */
14026 op1
= XEXP (inner
, 0);
14028 else if (op1
== constm1_rtx
|| op1
== const1_rtx
)
14030 /* Use CSINV or CSINC. */
14031 *cost
+= rtx_cost (op2
, VOIDmode
, IF_THEN_ELSE
, 2, speed
);
14034 else if (op2
== constm1_rtx
|| op2
== const1_rtx
)
14036 /* Use CSINV or CSINC. */
14037 *cost
+= rtx_cost (op1
, VOIDmode
, IF_THEN_ELSE
, 1, speed
);
14041 *cost
+= rtx_cost (op1
, VOIDmode
, IF_THEN_ELSE
, 1, speed
);
14042 *cost
+= rtx_cost (op2
, VOIDmode
, IF_THEN_ELSE
, 2, speed
);
14046 /* We don't know what this is, cost all operands. */
/* Check whether X is a bitfield operation of the form shift + extend that
   maps down to a UBFIZ/SBFIZ/UBFX/SBFX instruction.  If so, return the
   operand to which the bitfield operation is applied.  Otherwise return
   NULL_RTX.  */
static rtx
aarch64_extend_bitfield_pattern_p (rtx x)
{
  rtx_code outer_code = GET_CODE (x);
  machine_mode outer_mode = GET_MODE (x);

  if (outer_code != ZERO_EXTEND && outer_code != SIGN_EXTEND
      && outer_mode != SImode && outer_mode != DImode)
    return NULL_RTX;

  rtx inner = XEXP (x, 0);
  rtx_code inner_code = GET_CODE (inner);
  machine_mode inner_mode = GET_MODE (inner);
  rtx op = NULL_RTX;

  switch (inner_code)
    {
      case ASHIFT:
	if (CONST_INT_P (XEXP (inner, 1))
	    && (inner_mode == QImode || inner_mode == HImode))
	  op = XEXP (inner, 0);
	break;
      case LSHIFTRT:
	if (outer_code == ZERO_EXTEND && CONST_INT_P (XEXP (inner, 1))
	    && (inner_mode == QImode || inner_mode == HImode))
	  op = XEXP (inner, 0);
	break;
      case ASHIFTRT:
	if (outer_code == SIGN_EXTEND && CONST_INT_P (XEXP (inner, 1))
	    && (inner_mode == QImode || inner_mode == HImode))
	  op = XEXP (inner, 0);
	break;
      default:
	break;
    }

  return op;
}
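
/* Illustrative sketch, not part of the build: the shift + extend shapes
   accepted above correspond to single bitfield instructions.  The plain
   C++ below works one case through: zero-extending a QImode logical
   right shift by 2 is the same as UBFX #2, #6.  */
#if 0
#include <cassert>
#include <cstdint>

int
main ()
{
  uint8_t q = 0xd7;			/* 1101'0111 */
  uint32_t via_shift_extend = (uint32_t) (uint8_t) (q >> 2);
  uint32_t via_ubfx = (q >> 2) & 0x3f;	/* UBFX w0, w1, #2, #6.  */
  assert (via_shift_extend == via_ubfx && via_ubfx == 0x35);
}
#endif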
/* Return true if the mask and a shift amount from an RTX of the form
   (x << SHFT_AMNT) & MASK are valid to combine into a UBFIZ instruction of
   mode MODE.  See the *andim_ashift<mode>_bfiz pattern.  */
bool
aarch64_mask_and_shift_for_ubfiz_p (scalar_int_mode mode, rtx mask,
				    rtx shft_amnt)
{
  return CONST_INT_P (mask) && CONST_INT_P (shft_amnt)
	 && INTVAL (mask) > 0
	 && UINTVAL (shft_amnt) < GET_MODE_BITSIZE (mode)
	 && exact_log2 ((UINTVAL (mask) >> UINTVAL (shft_amnt)) + 1) >= 0
	 && (UINTVAL (mask)
	     & ((HOST_WIDE_INT_1U << UINTVAL (shft_amnt)) - 1)) == 0;
}

/* Return true if the masks and a shift amount from an RTX of the form
   ((x & MASK1) | ((y << SHIFT_AMNT) & MASK2)) are valid to combine into
   a BFI instruction of mode MODE.  See *arch64_bfi patterns.  */

bool
aarch64_masks_and_shift_for_bfi_p (scalar_int_mode mode,
				   unsigned HOST_WIDE_INT mask1,
				   unsigned HOST_WIDE_INT shft_amnt,
				   unsigned HOST_WIDE_INT mask2)
{
  unsigned HOST_WIDE_INT t;

  /* Verify that there is no overlap in what bits are set in the two masks.  */
  if (mask1 != ~mask2)
    return false;

  /* Verify that mask2 is not all zeros or ones.  */
  if (mask2 == 0 || mask2 == HOST_WIDE_INT_M1U)
    return false;

  /* The shift amount should always be less than the mode size.  */
  gcc_assert (shft_amnt < GET_MODE_BITSIZE (mode));

  /* Verify that the mask being shifted is contiguous and would be in the
     least significant bits after shifting by shft_amnt.  */
  t = mask2 + (HOST_WIDE_INT_1U << shft_amnt);
  return (t == (t & -t));
}
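
/* Illustrative sketch, not part of the build: a worked example of the
   UBFIZ test above.  (x << 8) & 0x0000ff00 keeps a contiguous field whose
   low end sits exactly at the shift amount, so it is a single
   UBFIZ w0, w1, #8, #8; masks with bits below the shift amount or with a
   non-contiguous field are rejected.  */
#if 0
#include <cassert>
#include <cstdint>

static bool
ubfiz_ok (unsigned bits, uint64_t mask, unsigned shift)
{
  auto contiguous_from_zero = [] (uint64_t v) { return (v & (v + 1)) == 0; };
  return mask != 0
	 && shift < bits
	 /* MASK >> SHIFT must be 2^width - 1 (a contiguous field)...  */
	 && contiguous_from_zero (mask >> shift)
	 /* ...and no mask bits may sit below the shifted-in zeros.  */
	 && (mask & (((uint64_t) 1 << shift) - 1)) == 0;
}

int
main ()
{
  assert (ubfiz_ok (32, 0x0000ff00, 8));   /* UBFIZ #8, #8.  */
  assert (!ubfiz_ok (32, 0x0000ff0f, 8));  /* Bits below the shift.  */
  assert (!ubfiz_ok (32, 0x00ff0f00, 8));  /* Field not contiguous.  */
}
#endif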
/* Return true if X is an RTX representing an operation in the ABD family
   of instructions.  */
static bool
aarch64_abd_rtx_p (rtx x)
{
  if (GET_CODE (x) != MINUS)
    return false;
  rtx max_arm = XEXP (x, 0);
  rtx min_arm = XEXP (x, 1);
  if (GET_CODE (max_arm) != SMAX && GET_CODE (max_arm) != UMAX)
    return false;
  bool signed_p = GET_CODE (max_arm) == SMAX;
  if (signed_p && GET_CODE (min_arm) != SMIN)
    return false;
  else if (!signed_p && GET_CODE (min_arm) != UMIN)
    return false;

  rtx maxop0 = XEXP (max_arm, 0);
  rtx maxop1 = XEXP (max_arm, 1);
  rtx minop0 = XEXP (min_arm, 0);
  rtx minop1 = XEXP (min_arm, 1);
  return rtx_equal_p (maxop0, minop0) && rtx_equal_p (maxop1, minop1);
}
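
/* Illustrative sketch, not part of the build: the MAX/MIN shape matched
   above is the scalar identity behind SABD/UABD, namely
   max (a, b) - min (a, b) == |a - b|, checked here on plain integers.  */
#if 0
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <cstdlib>

int
main ()
{
  int8_t a = -100, b = 27;
  assert (std::max (a, b) - std::min (a, b) == std::abs (a - b));

  uint8_t ua = 200, ub = 13;
  assert (std::max (ua, ub) - std::min (ua, ub) == 187);
}
#endif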
14164 /* Calculate the cost of calculating X, storing it in *COST. Result
14165 is true if the total cost of the operation has now been calculated. */
14167 aarch64_rtx_costs (rtx x
, machine_mode mode
, int outer ATTRIBUTE_UNUSED
,
14168 int param ATTRIBUTE_UNUSED
, int *cost
, bool speed
)
14171 const struct cpu_cost_table
*extra_cost
14172 = aarch64_tune_params
.insn_extra_cost
;
14173 rtx_code code
= GET_CODE (x
);
14174 scalar_int_mode int_mode
;
14176 /* By default, assume that everything has equivalent cost to the
14177 cheapest instruction. Any additional costs are applied as a delta
14178 above this default. */
14179 *cost
= COSTS_N_INSNS (1);
14184 /* The cost depends entirely on the operands to SET. */
14186 op0
= SET_DEST (x
);
14189 switch (GET_CODE (op0
))
14194 rtx address
= XEXP (op0
, 0);
14195 if (VECTOR_MODE_P (mode
))
14196 *cost
+= extra_cost
->ldst
.storev
;
14197 else if (GET_MODE_CLASS (mode
) == MODE_INT
)
14198 *cost
+= extra_cost
->ldst
.store
;
14199 else if (mode
== SFmode
|| mode
== SDmode
)
14200 *cost
+= extra_cost
->ldst
.storef
;
14201 else if (mode
== DFmode
|| mode
== DDmode
)
14202 *cost
+= extra_cost
->ldst
.stored
;
14205 COSTS_N_INSNS (aarch64_address_cost (address
, mode
,
14209 *cost
+= rtx_cost (op1
, mode
, SET
, 1, speed
);
14213 if (! REG_P (SUBREG_REG (op0
)))
14214 *cost
+= rtx_cost (SUBREG_REG (op0
), VOIDmode
, SET
, 0, speed
);
14216 /* Fall through. */
14218 /* The cost is one per vector-register copied. */
14219 if (VECTOR_MODE_P (GET_MODE (op0
)) && REG_P (op1
))
14221 int nregs
= aarch64_hard_regno_nregs (V0_REGNUM
, GET_MODE (op0
));
14222 *cost
= COSTS_N_INSNS (nregs
);
14224 /* const0_rtx is in general free, but we will use an
14225 instruction to set a register to 0. */
14226 else if (REG_P (op1
) || op1
== const0_rtx
)
14228 /* The cost is 1 per register copied. */
14229 int nregs
= aarch64_hard_regno_nregs (R0_REGNUM
, GET_MODE (op0
));
14230 *cost
= COSTS_N_INSNS (nregs
);
14233 /* Cost is just the cost of the RHS of the set. */
14234 *cost
+= rtx_cost (op1
, mode
, SET
, 1, speed
);
14239 /* Bit-field insertion. Strip any redundant widening of
14240 the RHS to meet the width of the target. */
14241 if (SUBREG_P (op1
))
14242 op1
= SUBREG_REG (op1
);
14243 if ((GET_CODE (op1
) == ZERO_EXTEND
14244 || GET_CODE (op1
) == SIGN_EXTEND
)
14245 && CONST_INT_P (XEXP (op0
, 1))
14246 && is_a
<scalar_int_mode
> (GET_MODE (XEXP (op1
, 0)), &int_mode
)
14247 && GET_MODE_BITSIZE (int_mode
) >= INTVAL (XEXP (op0
, 1)))
14248 op1
= XEXP (op1
, 0);
14250 if (CONST_INT_P (op1
))
14252 /* MOV immediate is assumed to always be cheap. */
14253 *cost
= COSTS_N_INSNS (1);
14259 *cost
+= extra_cost
->alu
.bfi
;
14260 *cost
+= rtx_cost (op1
, VOIDmode
, (enum rtx_code
) code
, 1, speed
);
14266 /* We can't make sense of this, assume default cost. */
14267 *cost
= COSTS_N_INSNS (1);
14273 /* If an instruction can incorporate a constant within the
14274 instruction, the instruction's expression avoids calling
14275 rtx_cost() on the constant. If rtx_cost() is called on a
14276 constant, then it is usually because the constant must be
14277 moved into a register by one or more instructions.
14279 The exception is constant 0, which can be expressed
14280 as XZR/WZR and is therefore free. The exception to this is
14281 if we have (set (reg) (const0_rtx)) in which case we must cost
14282 the move. However, we can catch that when we cost the SET, so
14283 we don't need to consider that here. */
14284 if (x
== const0_rtx
)
14288 /* To an approximation, building any other constant is
14289 proportionally expensive to the number of instructions
14290 required to build that constant. This is true whether we
14291 are compiling for SPEED or otherwise. */
14292 machine_mode imode
= known_le (GET_MODE_SIZE (mode
), 4)
14294 *cost
= COSTS_N_INSNS (aarch64_internal_mov_immediate
14295 (NULL_RTX
, x
, false, imode
));
14301 /* First determine number of instructions to do the move
14302 as an integer constant. */
14303 if (!aarch64_float_const_representable_p (x
)
14304 && !aarch64_can_const_movi_rtx_p (x
, mode
)
14305 && aarch64_float_const_rtx_p (x
))
14307 unsigned HOST_WIDE_INT ival
;
14308 bool succeed
= aarch64_reinterpret_float_as_int (x
, &ival
);
14309 gcc_assert (succeed
);
14311 machine_mode imode
= known_eq (GET_MODE_SIZE (mode
), 8)
14313 int ncost
= aarch64_internal_mov_immediate
14314 (NULL_RTX
, gen_int_mode (ival
, imode
), false, imode
);
14315 *cost
+= COSTS_N_INSNS (ncost
);
14321 /* mov[df,sf]_aarch64. */
14322 if (aarch64_float_const_representable_p (x
))
14323 /* FMOV (scalar immediate). */
14324 *cost
+= extra_cost
->fp
[mode
== DFmode
|| mode
== DDmode
].fpconst
;
14325 else if (!aarch64_float_const_zero_rtx_p (x
))
14327 /* This will be a load from memory. */
14328 if (mode
== DFmode
|| mode
== DDmode
)
14329 *cost
+= extra_cost
->ldst
.loadd
;
14331 *cost
+= extra_cost
->ldst
.loadf
;
14334 /* Otherwise this is +0.0. We get this using MOVI d0, #0
14335 or MOV v0.s[0], wzr - neither of which are modeled by the
14336 cost tables. Just use the default cost. */
14346 /* For loads we want the base cost of a load, plus an
14347 approximation for the additional cost of the addressing
14349 rtx address
= XEXP (x
, 0);
14350 if (VECTOR_MODE_P (mode
))
14351 *cost
+= extra_cost
->ldst
.loadv
;
14352 else if (GET_MODE_CLASS (mode
) == MODE_INT
)
14353 *cost
+= extra_cost
->ldst
.load
;
14354 else if (mode
== SFmode
|| mode
== SDmode
)
14355 *cost
+= extra_cost
->ldst
.loadf
;
14356 else if (mode
== DFmode
|| mode
== DDmode
)
14357 *cost
+= extra_cost
->ldst
.loadd
;
14360 COSTS_N_INSNS (aarch64_address_cost (address
, mode
,
14369 if (VECTOR_MODE_P (mode
))
14371 /* Many vector comparison operations are represented as NEG
14372 of a comparison. */
14373 if (COMPARISON_P (op0
))
14375 rtx op00
= XEXP (op0
, 0);
14376 rtx op01
= XEXP (op0
, 1);
14377 machine_mode inner_mode
= GET_MODE (op00
);
14379 if (GET_MODE_CLASS (inner_mode
) == MODE_VECTOR_FLOAT
14380 && GET_CODE (op00
) == ABS
14381 && GET_CODE (op01
) == ABS
)
14383 op00
= XEXP (op00
, 0);
14384 op01
= XEXP (op01
, 0);
14386 *cost
+= rtx_cost (op00
, inner_mode
, GET_CODE (op0
), 0, speed
);
14387 *cost
+= rtx_cost (op01
, inner_mode
, GET_CODE (op0
), 1, speed
);
14389 *cost
+= extra_cost
->vect
.alu
;
14395 *cost
+= extra_cost
->vect
.alu
;
14400 if (GET_MODE_CLASS (mode
) == MODE_INT
)
14402 if (GET_RTX_CLASS (GET_CODE (op0
)) == RTX_COMPARE
14403 || GET_RTX_CLASS (GET_CODE (op0
)) == RTX_COMM_COMPARE
)
14406 *cost
+= rtx_cost (XEXP (op0
, 0), VOIDmode
, NEG
, 0, speed
);
14410 /* Cost this as SUB wzr, X. */
14411 op0
= CONST0_RTX (mode
);
14416 if (GET_MODE_CLASS (mode
) == MODE_FLOAT
)
14418 /* Support (neg(fma...)) as a single instruction only if
14419 sign of zeros is unimportant. This matches the decision
14420 making in aarch64.md. */
14421 if (GET_CODE (op0
) == FMA
&& !HONOR_SIGNED_ZEROS (GET_MODE (op0
)))
14424 *cost
= rtx_cost (op0
, mode
, NEG
, 0, speed
);
14427 if (GET_CODE (op0
) == MULT
)
14430 *cost
= rtx_cost (op0
, mode
, NEG
, 0, speed
);
14435 *cost
+= extra_cost
->fp
[mode
== DFmode
].neg
;
14445 if (VECTOR_MODE_P (mode
))
14446 *cost
+= extra_cost
->vect
.alu
;
14448 *cost
+= extra_cost
->alu
.clz
;
14454 if (VECTOR_MODE_P (mode
))
14456 *cost
= COSTS_N_INSNS (3);
14458 *cost
+= extra_cost
->vect
.alu
* 3;
14460 else if (TARGET_CSSC
)
14462 *cost
= COSTS_N_INSNS (1);
14464 *cost
+= extra_cost
->alu
.clz
;
14468 *cost
= COSTS_N_INSNS (2);
14470 *cost
+= extra_cost
->alu
.clz
+ extra_cost
->alu
.rev
;
14478 if (op1
== const0_rtx
14479 && GET_CODE (op0
) == AND
)
14482 mode
= GET_MODE (op0
);
14486 if (GET_MODE_CLASS (GET_MODE (op0
)) == MODE_INT
)
14488 /* TODO: A write to the CC flags possibly costs extra, this
14489 needs encoding in the cost tables. */
14491 mode
= GET_MODE (op0
);
14493 if (GET_CODE (op0
) == AND
)
14499 if (GET_CODE (op0
) == PLUS
)
14501 /* ADDS (and CMN alias). */
14506 if (GET_CODE (op0
) == MINUS
)
14513 if (GET_CODE (op0
) == ZERO_EXTRACT
&& op1
== const0_rtx
14514 && GET_MODE (x
) == CC_NZmode
&& CONST_INT_P (XEXP (op0
, 1))
14515 && CONST_INT_P (XEXP (op0
, 2)))
14517 /* COMPARE of ZERO_EXTRACT form of TST-immediate.
14518 Handle it here directly rather than going to cost_logic
14519 since we know the immediate generated for the TST is valid
14520 so we can avoid creating an intermediate rtx for it only
14521 for costing purposes. */
14523 *cost
+= extra_cost
->alu
.logical
;
14525 *cost
+= rtx_cost (XEXP (op0
, 0), GET_MODE (op0
),
14526 ZERO_EXTRACT
, 0, speed
);
14530 if (GET_CODE (op1
) == NEG
)
14534 *cost
+= extra_cost
->alu
.arith
;
14536 *cost
+= rtx_cost (op0
, mode
, COMPARE
, 0, speed
);
14537 *cost
+= rtx_cost (XEXP (op1
, 0), mode
, NEG
, 1, speed
);
14543 Compare can freely swap the order of operands, and
14544 canonicalization puts the more complex operation first.
14545 But the integer MINUS logic expects the shift/extend
14546 operation in op1. */
14548 || (SUBREG_P (op0
) && REG_P (SUBREG_REG (op0
)))))
14556 if (GET_MODE_CLASS (GET_MODE (op0
)) == MODE_FLOAT
)
14560 *cost
+= extra_cost
->fp
[mode
== DFmode
].compare
;
14562 if (CONST_DOUBLE_P (op1
) && aarch64_float_const_zero_rtx_p (op1
))
14564 *cost
+= rtx_cost (op0
, VOIDmode
, COMPARE
, 0, speed
);
14565 /* FCMP supports constant 0.0 for no extra cost. */
14571 if (VECTOR_MODE_P (mode
))
14573 /* Vector compare. */
14575 *cost
+= extra_cost
->vect
.alu
;
14577 if (aarch64_float_const_zero_rtx_p (op1
))
14579 /* Vector cm (eq|ge|gt|lt|le) supports constant 0.0 for no extra
14593 if (VECTOR_MODE_P (mode
))
14595 unsigned int vec_flags
= aarch64_classify_vector_mode (mode
);
14596 if (TARGET_SIMD
&& (vec_flags
& VEC_ADVSIMD
))
14598 /* Recognise the SABD and UABD operation here.
14599 Recursion from the PLUS case will catch the accumulating
14601 if (aarch64_abd_rtx_p (x
))
14604 *cost
+= extra_cost
->vect
.alu
;
14607 /* SUBL2 and SUBW2.
14608 The select-operand-high-half versions of the sub instruction
14609 have the same cost as the regular three vector version -
14610 don't add the costs of the select into the costs of the sub.
14612 op0
= aarch64_strip_extend_vec_half (op0
);
14613 op1
= aarch64_strip_extend_vec_half (op1
);
14617 *cost
+= rtx_cost (op0
, mode
, MINUS
, 0, speed
);
14619 /* Detect valid immediates. */
14620 if ((GET_MODE_CLASS (mode
) == MODE_INT
14621 || (GET_MODE_CLASS (mode
) == MODE_CC
14622 && GET_MODE_CLASS (GET_MODE (op0
)) == MODE_INT
))
14623 && CONST_INT_P (op1
)
14624 && aarch64_uimm12_shift (INTVAL (op1
)))
14627 /* SUB(S) (immediate). */
14628 *cost
+= extra_cost
->alu
.arith
;
14632 /* Look for SUB (extended register). */
14633 if (is_a
<scalar_int_mode
> (mode
)
14634 && aarch64_rtx_arith_op_extract_p (op1
))
14637 *cost
+= extra_cost
->alu
.extend_arith
;
14639 op1
= aarch64_strip_extend (op1
, true);
14640 *cost
+= rtx_cost (op1
, VOIDmode
,
14641 (enum rtx_code
) GET_CODE (op1
), 0, speed
);
14645 rtx new_op1
= aarch64_strip_extend (op1
, false);
14647 /* Cost this as an FMA-alike operation. */
14648 if ((GET_CODE (new_op1
) == MULT
14649 || aarch64_shift_p (GET_CODE (new_op1
)))
14650 && code
!= COMPARE
)
14652 *cost
+= aarch64_rtx_mult_cost (new_op1
, MULT
,
14653 (enum rtx_code
) code
,
14658 *cost
+= rtx_cost (new_op1
, VOIDmode
, MINUS
, 1, speed
);
14662 if (VECTOR_MODE_P (mode
))
14665 *cost
+= extra_cost
->vect
.alu
;
14667 else if (GET_MODE_CLASS (mode
) == MODE_INT
)
14670 *cost
+= extra_cost
->alu
.arith
;
14672 else if (GET_MODE_CLASS (mode
) == MODE_FLOAT
)
14675 *cost
+= extra_cost
->fp
[mode
== DFmode
].addsub
;
14689 if (VECTOR_MODE_P (mode
))
14691 /* ADDL2 and ADDW2. */
14692 unsigned int vec_flags
= aarch64_classify_vector_mode (mode
);
14693 if (TARGET_SIMD
&& (vec_flags
& VEC_ADVSIMD
))
14695 /* The select-operand-high-half versions of the add instruction
14696 have the same cost as the regular three vector version -
14697 don't add the costs of the select into the costs of the add.
14699 op0
= aarch64_strip_extend_vec_half (op0
);
14700 op1
= aarch64_strip_extend_vec_half (op1
);
14704 if (GET_RTX_CLASS (GET_CODE (op0
)) == RTX_COMPARE
14705 || GET_RTX_CLASS (GET_CODE (op0
)) == RTX_COMM_COMPARE
)
14708 *cost
+= rtx_cost (XEXP (op0
, 0), mode
, PLUS
, 0, speed
);
14709 *cost
+= rtx_cost (op1
, mode
, PLUS
, 1, speed
);
14713 if (GET_MODE_CLASS (mode
) == MODE_INT
14714 && (aarch64_plus_immediate (op1
, mode
)
14715 || aarch64_sve_addvl_addpl_immediate (op1
, mode
)))
14717 *cost
+= rtx_cost (op0
, mode
, PLUS
, 0, speed
);
14721 /* ADD (immediate). */
14722 *cost
+= extra_cost
->alu
.arith
;
14724 /* Some tunings prefer to not use the VL-based scalar ops.
14725 Increase the cost of the poly immediate to prevent their
14727 if (GET_CODE (op1
) == CONST_POLY_INT
14728 && (aarch64_tune_params
.extra_tuning_flags
14729 & AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS
))
14730 *cost
+= COSTS_N_INSNS (1);
14735 if (aarch64_pluslong_immediate (op1
, mode
))
14737 /* 24-bit add in 2 instructions or 12-bit shifted add. */
14738 if ((INTVAL (op1
) & 0xfff) != 0)
14739 *cost
+= COSTS_N_INSNS (1);
14741 *cost
+= rtx_cost (op0
, mode
, PLUS
, 0, speed
);
14745 *cost
+= rtx_cost (op1
, mode
, PLUS
, 1, speed
);
14747 /* Look for ADD (extended register). */
14748 if (is_a
<scalar_int_mode
> (mode
)
14749 && aarch64_rtx_arith_op_extract_p (op0
))
14752 *cost
+= extra_cost
->alu
.extend_arith
;
14754 op0
= aarch64_strip_extend (op0
, true);
14755 *cost
+= rtx_cost (op0
, VOIDmode
,
14756 (enum rtx_code
) GET_CODE (op0
), 0, speed
);
14760 /* Strip any extend, leave shifts behind as we will
14761 cost them through mult_cost. */
14762 new_op0
= aarch64_strip_extend (op0
, false);
14764 if (GET_CODE (new_op0
) == MULT
14765 || aarch64_shift_p (GET_CODE (new_op0
)))
14767 *cost
+= aarch64_rtx_mult_cost (new_op0
, MULT
, PLUS
,
14772 *cost
+= rtx_cost (new_op0
, VOIDmode
, PLUS
, 0, speed
);
14776 if (VECTOR_MODE_P (mode
))
14779 *cost
+= extra_cost
->vect
.alu
;
14781 else if (GET_MODE_CLASS (mode
) == MODE_INT
)
14784 *cost
+= extra_cost
->alu
.arith
;
14786 else if (GET_MODE_CLASS (mode
) == MODE_FLOAT
)
14789 *cost
+= extra_cost
->fp
[mode
== DFmode
].addsub
;
14797 *cost
= COSTS_N_INSNS (1);
14801 if (VECTOR_MODE_P (mode
))
14802 *cost
+= extra_cost
->vect
.alu
;
14804 *cost
+= extra_cost
->alu
.rev
;
14809 if (aarch_rev16_p (x
))
14811 *cost
= COSTS_N_INSNS (1);
14815 if (VECTOR_MODE_P (mode
))
14816 *cost
+= extra_cost
->vect
.alu
;
14818 *cost
+= extra_cost
->alu
.rev
;
14823 if (aarch64_extr_rtx_p (x
, &op0
, &op1
))
14825 *cost
+= rtx_cost (op0
, mode
, IOR
, 0, speed
);
14826 *cost
+= rtx_cost (op1
, mode
, IOR
, 1, speed
);
14828 *cost
+= extra_cost
->alu
.shift
;
14832 /* Fall through. */
14839 if (VECTOR_MODE_P (mode
))
14842 *cost
+= extra_cost
->vect
.alu
;
14847 && GET_CODE (op0
) == MULT
14848 && CONST_INT_P (XEXP (op0
, 1))
14849 && CONST_INT_P (op1
)
14850 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0
, 1))),
14851 INTVAL (op1
)) != 0)
14853 /* This is a UBFM/SBFM. */
14854 *cost
+= rtx_cost (XEXP (op0
, 0), mode
, ZERO_EXTRACT
, 0, speed
);
14856 *cost
+= extra_cost
->alu
.bfx
;
14860 if (is_int_mode (mode
, &int_mode
))
14862 if (CONST_INT_P (op1
))
14864 /* We have a mask + shift version of a UBFIZ
14865 i.e. the *andim_ashift<mode>_bfiz pattern. */
14866 if (GET_CODE (op0
) == ASHIFT
14867 && aarch64_mask_and_shift_for_ubfiz_p (int_mode
, op1
,
14870 *cost
+= rtx_cost (XEXP (op0
, 0), int_mode
,
14871 (enum rtx_code
) code
, 0, speed
);
14873 *cost
+= extra_cost
->alu
.bfx
;
14877 else if (aarch64_bitmask_imm (INTVAL (op1
), int_mode
))
14879 /* We possibly get the immediate for free, this is not
14881 *cost
+= rtx_cost (op0
, int_mode
,
14882 (enum rtx_code
) code
, 0, speed
);
14884 *cost
+= extra_cost
->alu
.logical
;
14893 /* Handle ORN, EON, or BIC. */
14894 if (GET_CODE (op0
) == NOT
)
14895 op0
= XEXP (op0
, 0);
14897 new_op0
= aarch64_strip_shift (op0
);
14899 /* If we had a shift on op0 then this is a logical-shift-
14900 by-register/immediate operation. Otherwise, this is just
14901 a logical operation. */
14904 if (new_op0
!= op0
)
14906 /* Shift by immediate. */
14907 if (CONST_INT_P (XEXP (op0
, 1)))
14908 *cost
+= extra_cost
->alu
.log_shift
;
14910 *cost
+= extra_cost
->alu
.log_shift_reg
;
14913 *cost
+= extra_cost
->alu
.logical
;
14916 /* In both cases we want to cost both operands. */
14917 *cost
+= rtx_cost (new_op0
, int_mode
, (enum rtx_code
) code
,
14919 *cost
+= rtx_cost (op1
, int_mode
, (enum rtx_code
) code
,
14929 op0
= aarch64_strip_shift (x
);
14931 if (VECTOR_MODE_P (mode
))
14934 *cost
+= extra_cost
->vect
.alu
;
14938 /* MVN-shifted-reg. */
14941 *cost
+= rtx_cost (op0
, mode
, (enum rtx_code
) code
, 0, speed
);
14944 *cost
+= extra_cost
->alu
.log_shift
;
14948 /* EON can have two forms: (xor (not a) b) but also (not (xor a b)).
14949 Handle the second form here taking care that 'a' in the above can
14951 else if (GET_CODE (op0
) == XOR
)
14953 rtx newop0
= XEXP (op0
, 0);
14954 rtx newop1
= XEXP (op0
, 1);
14955 rtx op0_stripped
= aarch64_strip_shift (newop0
);
14957 *cost
+= rtx_cost (newop1
, mode
, (enum rtx_code
) code
, 1, speed
);
14958 *cost
+= rtx_cost (op0_stripped
, mode
, XOR
, 0, speed
);
14962 if (op0_stripped
!= newop0
)
14963 *cost
+= extra_cost
->alu
.log_shift
;
14965 *cost
+= extra_cost
->alu
.logical
;
14972 *cost
+= extra_cost
->alu
.logical
;
14979 /* If a value is written in SI mode, then zero extended to DI
14980 mode, the operation will in general be free as a write to
14981 a 'w' register implicitly zeroes the upper bits of an 'x'
14982 register. However, if this is
14984 (set (reg) (zero_extend (reg)))
14986 we must cost the explicit register move. */
14988 && GET_MODE (op0
) == SImode
)
14990 int op_cost
= rtx_cost (op0
, VOIDmode
, ZERO_EXTEND
, 0, speed
);
14992 /* If OP_COST is non-zero, then the cost of the zero extend
14993 is effectively the cost of the inner operation. Otherwise
14994 we have a MOV instruction and we take the cost from the MOV
14995 itself. This is true independently of whether we are
14996 optimizing for space or time. */
15002 else if (MEM_P (op0
))
15004 /* All loads can zero extend to any size for free. */
15005 *cost
= rtx_cost (op0
, VOIDmode
, ZERO_EXTEND
, param
, speed
);
15009 op0
= aarch64_extend_bitfield_pattern_p (x
);
15012 *cost
+= rtx_cost (op0
, mode
, ZERO_EXTEND
, 0, speed
);
15014 *cost
+= extra_cost
->alu
.bfx
;
15020 if (VECTOR_MODE_P (mode
))
15023 *cost
+= extra_cost
->vect
.alu
;
15027 /* We generate an AND instead of UXTB/UXTH. */
15028 *cost
+= extra_cost
->alu
.logical
;
15034 if (MEM_P (XEXP (x
, 0)))
15039 rtx address
= XEXP (XEXP (x
, 0), 0);
15040 *cost
+= extra_cost
->ldst
.load_sign_extend
;
15043 COSTS_N_INSNS (aarch64_address_cost (address
, mode
,
15049 op0
= aarch64_extend_bitfield_pattern_p (x
);
15052 *cost
+= rtx_cost (op0
, mode
, SIGN_EXTEND
, 0, speed
);
15054 *cost
+= extra_cost
->alu
.bfx
;
15060 if (VECTOR_MODE_P (mode
))
15061 *cost
+= extra_cost
->vect
.alu
;
15063 *cost
+= extra_cost
->alu
.extend
;
15075 if (CONST_INT_P (op1
))
15079 if (VECTOR_MODE_P (mode
))
15081 /* Vector shift (immediate). */
15082 *cost
+= extra_cost
->vect
.alu
;
/* LSL (immediate), ASR (immediate), UBFM, UBFIZ and friends.
   These are all aliases.  */
15088 *cost
+= extra_cost
->alu
.shift
;
15092 /* We can incorporate zero/sign extend for free. */
15093 if (GET_CODE (op0
) == ZERO_EXTEND
15094 || GET_CODE (op0
) == SIGN_EXTEND
)
15095 op0
= XEXP (op0
, 0);
15097 *cost
+= rtx_cost (op0
, VOIDmode
, ASHIFT
, 0, speed
);
15102 if (VECTOR_MODE_P (mode
))
15105 /* Vector shift (register). */
15106 *cost
+= extra_cost
->vect
.alu
;
15112 *cost
+= extra_cost
->alu
.shift_reg
;
15114 /* The register shift amount may be in a shorter mode expressed
15115 as a lowpart SUBREG. For costing purposes just look inside. */
15116 if (SUBREG_P (op1
) && subreg_lowpart_p (op1
))
15117 op1
= SUBREG_REG (op1
);
15118 if (GET_CODE (op1
) == AND
&& REG_P (XEXP (op1
, 0))
15119 && CONST_INT_P (XEXP (op1
, 1))
15120 && known_eq (INTVAL (XEXP (op1
, 1)),
15121 GET_MODE_BITSIZE (mode
) - 1))
15123 *cost
+= rtx_cost (op0
, mode
, (rtx_code
) code
, 0, speed
);
15124 /* We already demanded XEXP (op1, 0) to be REG_P, so
15125 don't recurse into it. */
15129 return false; /* All arguments need to be in registers. */
15134 if (aarch64_cmodel
== AARCH64_CMODEL_LARGE
15135 || aarch64_cmodel
== AARCH64_CMODEL_SMALL_SPIC
)
15139 *cost
+= extra_cost
->ldst
.load
;
15141 else if (aarch64_cmodel
== AARCH64_CMODEL_SMALL
15142 || aarch64_cmodel
== AARCH64_CMODEL_SMALL_PIC
)
15144 /* ADRP, followed by ADD. */
15145 *cost
+= COSTS_N_INSNS (1);
15147 *cost
+= 2 * extra_cost
->alu
.arith
;
15149 else if (aarch64_cmodel
== AARCH64_CMODEL_TINY
15150 || aarch64_cmodel
== AARCH64_CMODEL_TINY_PIC
)
15154 *cost
+= extra_cost
->alu
.arith
;
15159 /* One extra load instruction, after accessing the GOT. */
15160 *cost
+= COSTS_N_INSNS (1);
15162 *cost
+= extra_cost
->ldst
.load
;
15168 /* ADRP/ADD (immediate). */
15170 *cost
+= extra_cost
->alu
.arith
;
15178 if (VECTOR_MODE_P (mode
))
15179 *cost
+= extra_cost
->vect
.alu
;
15181 *cost
+= extra_cost
->alu
.bfx
;
15184 /* We can trust that the immediates used will be correct (there
15185 are no by-register forms), so we need only cost op0. */
15186 *cost
+= rtx_cost (XEXP (x
, 0), VOIDmode
, (enum rtx_code
) code
, 0, speed
);
15190 *cost
+= aarch64_rtx_mult_cost (x
, MULT
, 0, speed
);
15191 /* aarch64_rtx_mult_cost always handles recursion to its
15196 /* We can expand signed mod by power of 2 using a NEGS, two parallel
15197 ANDs and a CSNEG. Assume here that CSNEG is the same as the cost of
15198 an unconditional negate. This case should only ever be reached through
15199 the set_smod_pow2_cheap check in expmed.cc. */
15200 if (CONST_INT_P (XEXP (x
, 1))
15201 && exact_log2 (INTVAL (XEXP (x
, 1))) > 0
15202 && (mode
== SImode
|| mode
== DImode
))
15204 /* We expand to 4 instructions. Reset the baseline. */
15205 *cost
= COSTS_N_INSNS (4);
15208 *cost
+= 2 * extra_cost
->alu
.logical
15209 + 2 * extra_cost
->alu
.arith
;
15214 /* Fall-through. */
/* Slightly prefer UMOD over SMOD.  */
15219 if (VECTOR_MODE_P (mode
))
15220 *cost
+= extra_cost
->vect
.alu
;
15221 else if (GET_MODE_CLASS (mode
) == MODE_INT
)
15222 *cost
+= (extra_cost
->mult
[mode
== DImode
].add
15223 + extra_cost
->mult
[mode
== DImode
].idiv
15224 + (code
== MOD
? 1 : 0));
15226 return false; /* All arguments need to be in registers. */
15233 if (VECTOR_MODE_P (mode
))
15234 *cost
+= extra_cost
->vect
.alu
;
15235 else if (GET_MODE_CLASS (mode
) == MODE_INT
)
15236 /* There is no integer SQRT, so only DIV and UDIV can get
15238 *cost
+= (extra_cost
->mult
[mode
== DImode
].idiv
/* Slightly prefer UDIV over SDIV.  */
15240 + (code
== DIV
? 1 : 0));
15242 *cost
+= extra_cost
->fp
[mode
== DFmode
].div
;
15244 return false; /* All arguments need to be in registers. */
15247 return aarch64_if_then_else_costs (XEXP (x
, 0), XEXP (x
, 1),
15248 XEXP (x
, 2), cost
, speed
);
15261 return false; /* All arguments must be in registers. */
15270 if (VECTOR_MODE_P (mode
))
15271 *cost
+= extra_cost
->vect
.alu
;
15273 *cost
+= extra_cost
->fp
[mode
== DFmode
].fma
;
15276 /* FMSUB, FNMADD, and FNMSUB are free. */
15277 if (GET_CODE (op0
) == NEG
)
15278 op0
= XEXP (op0
, 0);
15280 if (GET_CODE (op2
) == NEG
)
15281 op2
= XEXP (op2
, 0);
15283 /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
15284 and the by-element operand as operand 0. */
15285 if (GET_CODE (op1
) == NEG
)
15286 op1
= XEXP (op1
, 0);
15288 /* Catch vector-by-element operations. The by-element operand can
15289 either be (vec_duplicate (vec_select (x))) or just
15290 (vec_select (x)), depending on whether we are multiplying by
15291 a vector or a scalar.
15293 Canonicalization is not very good in these cases, FMA4 will put the
15294 by-element operand as operand 0, FNMA4 will have it as operand 1. */
15295 if (GET_CODE (op0
) == VEC_DUPLICATE
)
15296 op0
= XEXP (op0
, 0);
15297 else if (GET_CODE (op1
) == VEC_DUPLICATE
)
15298 op1
= XEXP (op1
, 0);
15300 if (GET_CODE (op0
) == VEC_SELECT
)
15301 op0
= XEXP (op0
, 0);
15302 else if (GET_CODE (op1
) == VEC_SELECT
)
15303 op1
= XEXP (op1
, 0);
15305 /* If the remaining parameters are not registers,
15306 get the cost to put them into registers. */
15307 *cost
+= rtx_cost (op0
, mode
, FMA
, 0, speed
);
15308 *cost
+= rtx_cost (op1
, mode
, FMA
, 1, speed
);
15309 *cost
+= rtx_cost (op2
, mode
, FMA
, 2, speed
);
15313 case UNSIGNED_FLOAT
:
15315 *cost
+= extra_cost
->fp
[mode
== DFmode
].fromint
;
15321 if (VECTOR_MODE_P (mode
))
/* Vector truncate.  */
15324 *cost
+= extra_cost
->vect
.alu
;
15327 *cost
+= extra_cost
->fp
[mode
== DFmode
].widen
;
15331 case FLOAT_TRUNCATE
:
15334 if (VECTOR_MODE_P (mode
))
/* Vector conversion.  */
15337 *cost
+= extra_cost
->vect
.alu
;
15340 *cost
+= extra_cost
->fp
[mode
== DFmode
].narrow
;
15347 /* Strip the rounding part. They will all be implemented
15348 by the fcvt* family of instructions anyway. */
15349 if (GET_CODE (x
) == UNSPEC
)
15351 unsigned int uns_code
= XINT (x
, 1);
15353 if (uns_code
== UNSPEC_FRINTA
15354 || uns_code
== UNSPEC_FRINTM
15355 || uns_code
== UNSPEC_FRINTN
15356 || uns_code
== UNSPEC_FRINTP
15357 || uns_code
== UNSPEC_FRINTZ
)
15358 x
= XVECEXP (x
, 0, 0);
15363 if (VECTOR_MODE_P (mode
))
15364 *cost
+= extra_cost
->vect
.alu
;
15366 *cost
+= extra_cost
->fp
[GET_MODE (x
) == DFmode
].toint
;
15369 /* We can combine fmul by a power of 2 followed by a fcvt into a single
15370 fixed-point fcvt. */
15371 if (GET_CODE (x
) == MULT
15372 && ((VECTOR_MODE_P (mode
)
15373 && aarch64_vec_fpconst_pow_of_2 (XEXP (x
, 1)) > 0)
15374 || aarch64_fpconst_pow_of_2 (XEXP (x
, 1)) > 0))
15376 *cost
+= rtx_cost (XEXP (x
, 0), VOIDmode
, (rtx_code
) code
,
15381 *cost
+= rtx_cost (x
, VOIDmode
, (enum rtx_code
) code
, 0, speed
);
15385 if (VECTOR_MODE_P (mode
))
15387 /* ABS (vector). */
15389 *cost
+= extra_cost
->vect
.alu
;
15391 else if (GET_MODE_CLASS (mode
) == MODE_FLOAT
)
15395 /* FABD, which is analogous to FADD. */
15396 if (GET_CODE (op0
) == MINUS
)
15398 *cost
+= rtx_cost (XEXP (op0
, 0), mode
, MINUS
, 0, speed
);
15399 *cost
+= rtx_cost (XEXP (op0
, 1), mode
, MINUS
, 1, speed
);
15401 *cost
+= extra_cost
->fp
[mode
== DFmode
].addsub
;
15405 /* Simple FABS is analogous to FNEG. */
15407 *cost
+= extra_cost
->fp
[mode
== DFmode
].neg
;
15411 /* Integer ABS will either be split to
15412 two arithmetic instructions, or will be an ABS
15413 (scalar), which we don't model. */
15414 *cost
= COSTS_N_INSNS (2);
15416 *cost
+= 2 * extra_cost
->alu
.arith
;
15424 if (VECTOR_MODE_P (mode
))
15425 *cost
+= extra_cost
->vect
.alu
;
15428 /* FMAXNM/FMINNM/FMAX/FMIN.
15429 TODO: This may not be accurate for all implementations, but
15430 we do not model this in the cost tables. */
15431 *cost
+= extra_cost
->fp
[mode
== DFmode
].addsub
;
15437 /* The floating point round to integer frint* instructions. */
15438 if (aarch64_frint_unspec_p (XINT (x
, 1)))
15441 *cost
+= extra_cost
->fp
[mode
== DFmode
].roundint
;
15449 /* Decompose <su>muldi3_highpart. */
15450 if (/* (truncate:DI */
15453 && GET_MODE (XEXP (x
, 0)) == TImode
15454 && GET_CODE (XEXP (x
, 0)) == LSHIFTRT
15456 && GET_CODE (XEXP (XEXP (x
, 0), 0)) == MULT
15457 /* (ANY_EXTEND:TI (reg:DI))
15458 (ANY_EXTEND:TI (reg:DI))) */
15459 && ((GET_CODE (XEXP (XEXP (XEXP (x
, 0), 0), 0)) == ZERO_EXTEND
15460 && GET_CODE (XEXP (XEXP (XEXP (x
, 0), 0), 1)) == ZERO_EXTEND
)
15461 || (GET_CODE (XEXP (XEXP (XEXP (x
, 0), 0), 0)) == SIGN_EXTEND
15462 && GET_CODE (XEXP (XEXP (XEXP (x
, 0), 0), 1)) == SIGN_EXTEND
))
15463 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x
, 0), 0), 0), 0)) == DImode
15464 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x
, 0), 0), 1), 0)) == DImode
15465 /* (const_int 64) */
15466 && CONST_INT_P (XEXP (XEXP (x
, 0), 1))
15467 && UINTVAL (XEXP (XEXP (x
, 0), 1)) == 64)
15471 *cost
+= extra_cost
->mult
[mode
== DImode
].extend
;
15472 *cost
+= rtx_cost (XEXP (XEXP (XEXP (XEXP (x
, 0), 0), 0), 0),
15473 mode
, MULT
, 0, speed
);
15474 *cost
+= rtx_cost (XEXP (XEXP (XEXP (XEXP (x
, 0), 0), 1), 0),
15475 mode
, MULT
, 1, speed
);
15481 /* Load using MOVI/MVNI. */
15482 if (aarch64_simd_valid_immediate (x
, NULL
))
15483 *cost
= extra_cost
->vect
.movi
;
15484 else /* Load using constant pool. */
15485 *cost
= extra_cost
->ldst
.load
;
15489 /* depending on the operation, either DUP or INS.
15490 For now, keep default costing. */
15492 case VEC_DUPLICATE
:
15493 /* Load using a DUP. */
15494 *cost
= extra_cost
->vect
.dup
;
15498 rtx op0
= XEXP (x
, 0);
15499 *cost
= rtx_cost (op0
, GET_MODE (op0
), VEC_SELECT
, 0, speed
);
15501 /* cost subreg of 0 as free, otherwise as DUP */
15502 rtx op1
= XEXP (x
, 1);
15503 if (vec_series_lowpart_p (mode
, GET_MODE (op1
), op1
))
15505 else if (vec_series_highpart_p (mode
, GET_MODE (op1
), op1
))
15506 *cost
= extra_cost
->vect
.dup
;
15508 *cost
= extra_cost
->vect
.extract
;
15516 && flag_aarch64_verbose_cost
)
15517 fprintf (dump_file
,
15518 "\nFailed to cost RTX. Assuming default cost.\n");
/* Wrapper around aarch64_rtx_costs, dumps the partial, or total cost
   calculated for X.  This cost is stored in *COST.  Returns true
   if the total cost of X was calculated.  */
static bool
aarch64_rtx_costs_wrapper (rtx x, machine_mode mode, int outer,
			   int param, int *cost, bool speed)
{
  bool result = aarch64_rtx_costs (x, mode, outer, param, cost, speed);

  if (dump_file
      && flag_aarch64_verbose_cost)
    {
      print_rtl_single (dump_file, x);
      fprintf (dump_file, "\n%s cost: %d (%s)\n",
	       speed ? "Hot" : "Cold",
	       *cost, result ? "final" : "partial");
    }

  return result;
}
15545 aarch64_register_move_cost (machine_mode mode
,
15546 reg_class_t from_i
, reg_class_t to_i
)
15548 enum reg_class from
= (enum reg_class
) from_i
;
15549 enum reg_class to
= (enum reg_class
) to_i
;
15550 const struct cpu_regmove_cost
*regmove_cost
15551 = aarch64_tune_params
.regmove_cost
;
/* Treat any subset of POINTER_REGS as though it were GENERAL_REGS.  */
15554 if (reg_class_subset_p (to
, POINTER_REGS
))
15557 if (reg_class_subset_p (from
, POINTER_REGS
))
15558 from
= GENERAL_REGS
;
15560 /* Make RDFFR very expensive. In particular, if we know that the FFR
15561 contains a PTRUE (e.g. after a SETFFR), we must never use RDFFR
15562 as a way of obtaining a PTRUE. */
15563 if (GET_MODE_CLASS (mode
) == MODE_VECTOR_BOOL
15564 && hard_reg_set_subset_p (reg_class_contents
[from_i
],
15565 reg_class_contents
[FFR_REGS
]))
15568 /* Moving between GPR and stack cost is the same as GP2GP. */
15569 if ((from
== GENERAL_REGS
&& to
== STACK_REG
)
15570 || (to
== GENERAL_REGS
&& from
== STACK_REG
))
15571 return regmove_cost
->GP2GP
;
15573 /* To/From the stack register, we move via the gprs. */
15574 if (to
== STACK_REG
|| from
== STACK_REG
)
15575 return aarch64_register_move_cost (mode
, from
, GENERAL_REGS
)
15576 + aarch64_register_move_cost (mode
, GENERAL_REGS
, to
);
15578 unsigned int vec_flags
= aarch64_classify_vector_mode (mode
);
15579 if (vec_flags
!= (VEC_ADVSIMD
| VEC_STRUCT
| VEC_PARTIAL
)
15580 && known_eq (GET_MODE_SIZE (mode
), 16))
15582 /* 128-bit operations on general registers require 2 instructions. */
15583 if (from
== GENERAL_REGS
&& to
== GENERAL_REGS
)
15584 return regmove_cost
->GP2GP
* 2;
15585 else if (from
== GENERAL_REGS
)
15586 return regmove_cost
->GP2FP
* 2;
15587 else if (to
== GENERAL_REGS
)
15588 return regmove_cost
->FP2GP
* 2;
15590 /* When AdvSIMD instructions are disabled it is not possible to move
15591 a 128-bit value directly between Q registers. This is handled in
15592 secondary reload. A general register is used as a scratch to move
15593 the upper DI value and the lower DI value is moved directly,
15594 hence the cost is the sum of three moves. */
15595 if (!TARGET_SIMD
&& !TARGET_SVE
)
15596 return regmove_cost
->GP2FP
+ regmove_cost
->FP2GP
+ regmove_cost
->FP2FP
;
15598 return regmove_cost
->FP2FP
;
15601 if (from
== GENERAL_REGS
&& to
== GENERAL_REGS
)
15602 return regmove_cost
->GP2GP
;
15603 else if (from
== GENERAL_REGS
)
15604 return regmove_cost
->GP2FP
;
15605 else if (to
== GENERAL_REGS
)
15606 return regmove_cost
->FP2GP
;
15608 if (!TARGET_SIMD
&& vec_flags
== (VEC_ADVSIMD
| VEC_STRUCT
))
15610 /* Needs a round-trip through memory, which can use LDP/STP for pairs.
15611 The cost must be greater than 2 units to indicate that direct
15612 moves aren't possible. */
15613 auto per_vector
= (aarch64_tune_params
.memmov_cost
.load_fp
15614 + aarch64_tune_params
.memmov_cost
.store_fp
);
15615 return MIN (CEIL (per_vector
, 2), 4);
15618 return regmove_cost
->FP2FP
;
15621 /* Implements TARGET_MEMORY_MOVE_COST. */
15623 aarch64_memory_move_cost (machine_mode mode
, reg_class_t rclass_i
, bool in
)
15625 enum reg_class rclass
= (enum reg_class
) rclass_i
;
15626 if (GET_MODE_CLASS (mode
) == MODE_VECTOR_BOOL
15627 ? reg_classes_intersect_p (rclass
, PR_REGS
)
15628 : reg_class_subset_p (rclass
, PR_REGS
))
15630 ? aarch64_tune_params
.memmov_cost
.load_pred
15631 : aarch64_tune_params
.memmov_cost
.store_pred
);
15633 if (VECTOR_MODE_P (mode
) || FLOAT_MODE_P (mode
)
15634 ? reg_classes_intersect_p (rclass
, FP_REGS
)
15635 : reg_class_subset_p (rclass
, FP_REGS
))
15637 ? aarch64_tune_params
.memmov_cost
.load_fp
15638 : aarch64_tune_params
.memmov_cost
.store_fp
);
15641 ? aarch64_tune_params
.memmov_cost
.load_int
15642 : aarch64_tune_params
.memmov_cost
.store_int
);
/* Implement TARGET_INSN_COST.  We have the opportunity to do something
   much more productive here, such as using insn attributes to cost things.
   But we don't, not yet.

   The main point of this current definition is to make calling insn_cost
   on one instruction equivalent to calling seq_cost on a sequence that
   contains only that instruction.  The default definition would instead
   only look at SET_SRCs, ignoring SET_DESTs.

   This ensures that, for example, storing a 128-bit zero vector is more
   expensive than storing a 128-bit vector register.  A move of zero
   into a 128-bit vector register followed by multiple stores of that
   register is then cheaper than multiple stores of zero (which would
   use STP of XZR).  This in turn allows STP Qs to be formed.  */
static int
aarch64_insn_cost (rtx_insn *insn, bool speed)
{
  if (rtx set = single_set (insn))
    return set_rtx_cost (set, speed);
  return pattern_cost (PATTERN (insn), speed);
}
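
/* Illustrative sketch, not part of the build: the property the comment
   above relies on.  seq_cost costs a single-set instruction through
   set_rtx_cost, which is exactly what the hook above returns, so the two
   agree for a lone instruction.  */
#if 0
static void
aarch64_insn_cost_example (rtx_insn *insn)
{
  /* Assumes INSN is a single-set instruction that is not linked to any
     following instruction; otherwise seq_cost would also cost its
     successors.  */
  gcc_checking_assert (insn_cost (insn, true) == seq_cost (insn, true));
}
#endif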
/* Implement TARGET_INIT_BUILTINS.  */
static void
aarch64_init_builtins ()
{
  aarch64_general_init_builtins ();
  aarch64_sve::init_builtins ();
#ifdef SUBTARGET_INIT_BUILTINS
  SUBTARGET_INIT_BUILTINS;
#endif
}

/* Implement TARGET_FOLD_BUILTIN.  */
static tree
aarch64_fold_builtin (tree fndecl, int nargs, tree *args, bool)
{
  unsigned int code = DECL_MD_FUNCTION_CODE (fndecl);
  unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
  tree type = TREE_TYPE (TREE_TYPE (fndecl));
  switch (code & AARCH64_BUILTIN_CLASS)
    {
    case AARCH64_BUILTIN_GENERAL:
      return aarch64_general_fold_builtin (subcode, type, nargs, args);

    case AARCH64_BUILTIN_SVE:
      return NULL_TREE;
    }
  gcc_unreachable ();
}

/* Implement TARGET_GIMPLE_FOLD_BUILTIN.  */
static bool
aarch64_gimple_fold_builtin (gimple_stmt_iterator *gsi)
{
  gcall *stmt = as_a <gcall *> (gsi_stmt (*gsi));
  tree fndecl = gimple_call_fndecl (stmt);
  unsigned int code = DECL_MD_FUNCTION_CODE (fndecl);
  unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
  gimple *new_stmt = NULL;
  switch (code & AARCH64_BUILTIN_CLASS)
    {
    case AARCH64_BUILTIN_GENERAL:
      new_stmt = aarch64_general_gimple_fold_builtin (subcode, stmt, gsi);
      break;

    case AARCH64_BUILTIN_SVE:
      new_stmt = aarch64_sve::gimple_fold_builtin (subcode, gsi, stmt);
      break;
    }

  if (!new_stmt)
    return false;

  gsi_replace (gsi, new_stmt, false);
  return true;
}

/* Implement TARGET_EXPAND_BUILTIN.  */
static rtx
aarch64_expand_builtin (tree exp, rtx target, rtx, machine_mode, int ignore)
{
  tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
  unsigned int code = DECL_MD_FUNCTION_CODE (fndecl);
  unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
  switch (code & AARCH64_BUILTIN_CLASS)
    {
    case AARCH64_BUILTIN_GENERAL:
      return aarch64_general_expand_builtin (subcode, exp, target, ignore);

    case AARCH64_BUILTIN_SVE:
      return aarch64_sve::expand_builtin (subcode, exp, target);
    }
  gcc_unreachable ();
}

/* Implement TARGET_BUILTIN_DECL.  */
static tree
aarch64_builtin_decl (unsigned int code, bool initialize_p)
{
  unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
  switch (code & AARCH64_BUILTIN_CLASS)
    {
    case AARCH64_BUILTIN_GENERAL:
      return aarch64_general_builtin_decl (subcode, initialize_p);

    case AARCH64_BUILTIN_SVE:
      return aarch64_sve::builtin_decl (subcode, initialize_p);
    }
  gcc_unreachable ();
}
/* Return true if it is safe and beneficial to use the approximate rsqrt optabs
   to optimize 1.0/sqrt.  */

static bool
use_rsqrt_p (machine_mode mode)
{
  return (!flag_trapping_math
	  && flag_unsafe_math_optimizations
	  && ((aarch64_tune_params.approx_modes->recip_sqrt
	       & AARCH64_APPROX_MODE (mode))
	      || flag_mrecip_low_precision_sqrt));
}
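
/* Note (editorial, based on the usual option-to-flag mapping):
   flag_trapping_math, flag_unsafe_math_optimizations and
   flag_mrecip_low_precision_sqrt correspond to -ftrapping-math,
   -funsafe-math-optimizations and -mlow-precision-recip-sqrt
   respectively, so in practice this path is enabled under -ffast-math
   together with either the per-CPU approximation tuning or the explicit
   low-precision option.  */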
/* Function to decide when to use the approximate reciprocal square root
   builtin.  */

static tree
aarch64_builtin_reciprocal (tree fndecl)
{
  machine_mode mode = TYPE_MODE (TREE_TYPE (fndecl));

  if (!use_rsqrt_p (mode))
    return NULL_TREE;
  unsigned int code = DECL_MD_FUNCTION_CODE (fndecl);
  unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
  switch (code & AARCH64_BUILTIN_CLASS)
    {
    case AARCH64_BUILTIN_GENERAL:
      return aarch64_general_builtin_rsqrt (subcode);

    case AARCH64_BUILTIN_SVE:
      return NULL_TREE;
    }
  gcc_unreachable ();
}
/* Emit code to perform the floating-point operation:

     DST = SRC1 * SRC2

   where all three operands are already known to be registers.
   If the operation is an SVE one, PTRUE is a suitable all-true
   predicate.  */

static void
aarch64_emit_mult (rtx dst, rtx ptrue, rtx src1, rtx src2)
{
  if (ptrue)
    emit_insn (gen_aarch64_pred (UNSPEC_COND_FMUL, GET_MODE (dst),
				 dst, ptrue, src1, src2,
				 gen_int_mode (SVE_RELAXED_GP, SImode)));
  else
    emit_set_insn (dst, gen_rtx_MULT (GET_MODE (dst), src1, src2));
}
/* Emit instruction sequence to compute either the approximate square root
   or its approximate reciprocal, depending on the flag RECP, and return
   whether the sequence was emitted or not.  */

bool
aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp)
{
  machine_mode mode = GET_MODE (dst);

  if (GET_MODE_INNER (mode) == HFmode)
    {
      gcc_assert (!recp);
      return false;
    }

  if (!recp)
    {
      if (!(flag_mlow_precision_sqrt
	    || (aarch64_tune_params.approx_modes->sqrt
		& AARCH64_APPROX_MODE (mode))))
	return false;

      if (!flag_finite_math_only
	  || flag_trapping_math
	  || !flag_unsafe_math_optimizations
	  || optimize_function_for_size_p (cfun))
	return false;
    }
  else
    /* Caller assumes we cannot fail.  */
    gcc_assert (use_rsqrt_p (mode));

  rtx pg = NULL_RTX;
  if (aarch64_sve_mode_p (mode))
    pg = aarch64_ptrue_reg (aarch64_sve_pred_mode (mode));
  machine_mode mmsk = (VECTOR_MODE_P (mode)
		       ? related_int_vector_mode (mode).require ()
		       : int_mode_for_mode (mode).require ());
  rtx xmsk = NULL_RTX;
  if (!recp)
    {
      /* When calculating the approximate square root, compare the
	 argument with 0.0 and create a mask.  */
      rtx zero = CONST0_RTX (mode);
      if (pg)
	{
	  xmsk = gen_reg_rtx (GET_MODE (pg));
	  rtx hint = gen_int_mode (SVE_KNOWN_PTRUE, SImode);
	  emit_insn (gen_aarch64_pred_fcm (UNSPEC_COND_FCMNE, mode,
					   xmsk, pg, hint, src, zero));
	}
      else
	{
	  xmsk = gen_reg_rtx (mmsk);
	  emit_insn (gen_rtx_SET (xmsk,
				  gen_rtx_NEG (mmsk,
					       gen_rtx_EQ (mmsk, src, zero))));
	}
    }

  /* Estimate the approximate reciprocal square root.  */
  rtx xdst = gen_reg_rtx (mode);
  emit_insn (gen_aarch64_rsqrte (mode, xdst, src));

  /* Iterate over the series twice for SF and thrice for DF.  */
  int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;

  /* Optionally iterate over the series once less for faster performance
     while sacrificing the accuracy.  */
  if ((recp && flag_mrecip_low_precision_sqrt)
      || (!recp && flag_mlow_precision_sqrt))
    iterations--;

  /* Iterate over the series to calculate the approximate reciprocal square
     root.  */
  rtx x1 = gen_reg_rtx (mode);
  while (iterations--)
    {
      rtx x2 = gen_reg_rtx (mode);
      aarch64_emit_mult (x2, pg, xdst, xdst);

      emit_insn (gen_aarch64_rsqrts (mode, x1, src, x2));

      if (iterations > 0)
	aarch64_emit_mult (xdst, pg, xdst, x1);
    }

  if (!recp)
    {
      if (pg)
	/* Multiply nonzero source values by the corresponding intermediate
	   result elements, so that the final calculation is the approximate
	   square root rather than its reciprocal.  Select a zero result for
	   zero source values, to avoid the Inf * 0 -> NaN that we'd get
	   otherwise.  */
	emit_insn (gen_cond (UNSPEC_COND_FMUL, mode,
			     xdst, xmsk, xdst, src, CONST0_RTX (mode)));
      else
	{
	  /* Qualify the approximate reciprocal square root when the
	     argument is 0.0 by squashing the intermediary result to 0.0.  */
	  rtx xtmp = gen_reg_rtx (mmsk);
	  emit_set_insn (xtmp, gen_rtx_AND (mmsk, gen_rtx_NOT (mmsk, xmsk),
					    gen_rtx_SUBREG (mmsk, xdst, 0)));
	  emit_move_insn (xdst, gen_rtx_SUBREG (mode, xtmp, 0));

	  /* Calculate the approximate square root.  */
	  aarch64_emit_mult (xdst, pg, xdst, src);
	}
    }

  /* Finalize the approximation.  */
  aarch64_emit_mult (dst, pg, xdst, x1);

  return true;
}
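
/* Illustrative note on the math above (editorial, not part of the original
   code): FRSQRTE provides an initial estimate x0 ~= 1/sqrt(d), and each
   FRSQRTS step computes (3 - d * x * x) / 2, so the loop performs the
   Newton-Raphson iteration

       x_{n+1} = x_n * (3 - d * x_n^2) / 2

   which roughly doubles the number of correct bits per step; hence two
   iterations for SFmode and three for DFmode.  */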
/* Emit the instruction sequence to compute the approximation for the division
   of NUM by DEN in QUO and return whether the sequence was emitted or not.  */

bool
aarch64_emit_approx_div (rtx quo, rtx num, rtx den)
{
  machine_mode mode = GET_MODE (quo);

  if (GET_MODE_INNER (mode) == HFmode)
    return false;

  bool use_approx_division_p = (flag_mlow_precision_div
				|| (aarch64_tune_params.approx_modes->division
				    & AARCH64_APPROX_MODE (mode)));

  if (!flag_finite_math_only
      || flag_trapping_math
      || !flag_unsafe_math_optimizations
      || optimize_function_for_size_p (cfun)
      || !use_approx_division_p)
    return false;

  if (!TARGET_SIMD && VECTOR_MODE_P (mode))
    return false;

  rtx pg = NULL_RTX;
  if (aarch64_sve_mode_p (mode))
    pg = aarch64_ptrue_reg (aarch64_sve_pred_mode (mode));

  /* Estimate the approximate reciprocal.  */
  rtx xrcp = gen_reg_rtx (mode);
  emit_insn (gen_aarch64_frecpe (mode, xrcp, den));

  /* Iterate over the series twice for SF and thrice for DF.  */
  int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;

  /* Optionally iterate over the series less for faster performance,
     while sacrificing the accuracy.  The default is 2 for DF and 1 for SF.  */
  if (flag_mlow_precision_div)
    iterations = (GET_MODE_INNER (mode) == DFmode
		  ? aarch64_double_recp_precision
		  : aarch64_float_recp_precision);

  /* Iterate over the series to calculate the approximate reciprocal.  */
  rtx xtmp = gen_reg_rtx (mode);
  while (iterations--)
    {
      emit_insn (gen_aarch64_frecps (mode, xtmp, xrcp, den));

      if (iterations > 0)
	aarch64_emit_mult (xrcp, pg, xrcp, xtmp);
    }

  if (num != CONST1_RTX (mode))
    {
      /* As the approximate reciprocal of DEN is already calculated, only
	 calculate the approximate division when NUM is not 1.0.  */
      rtx xnum = force_reg (mode, num);
      aarch64_emit_mult (xrcp, pg, xrcp, xnum);
    }

  /* Finalize the approximation.  */
  aarch64_emit_mult (quo, pg, xrcp, xtmp);
  return true;
}
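
/* Illustrative note on the math above (editorial, not part of the original
   code): FRECPE provides an initial estimate x0 ~= 1/d, and each FRECPS
   step computes 2 - d * x, so the loop performs the Newton-Raphson
   iteration

       x_{n+1} = x_n * (2 - d * x_n)

   for the reciprocal; the final multiplication by NUM (when NUM is not
   1.0) turns the reciprocal into the quotient.  */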
/* Return the number of instructions that can be issued per cycle.  */
static int
aarch64_sched_issue_rate (void)
{
  return aarch64_tune_params.issue_rate;
}
/* Implement TARGET_SCHED_VARIABLE_ISSUE.  */
static int
aarch64_sched_variable_issue (FILE *, int, rtx_insn *insn, int more)
{
  if (DEBUG_INSN_P (insn))
    return more;

  rtx_code code = GET_CODE (PATTERN (insn));
  if (code == USE || code == CLOBBER)
    return more;

  if (get_attr_type (insn) == TYPE_NO_INSN)
    return more;

  return more - 1;
}

static int
aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
{
  int issue_rate = aarch64_sched_issue_rate ();

  return issue_rate > 1 && !sched_fusion ? issue_rate : 0;
}
/* Implement TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD as
   autopref_multipass_dfa_lookahead_guard from haifa-sched.cc.  It only
   has an effect if PARAM_SCHED_AUTOPREF_QUEUE_DEPTH > 0.  */

static int
aarch64_first_cycle_multipass_dfa_lookahead_guard (rtx_insn *insn,
						   int ready_index)
{
  return autopref_multipass_dfa_lookahead_guard (insn, ready_index);
}
/* Vectorizer cost model target hooks.  */

/* If a vld1 from address ADDR should be recorded in vector_load_decls,
   return the decl that should be recorded.  Return null otherwise.  */
static tree
aarch64_vector_load_decl (tree addr)
{
  if (TREE_CODE (addr) != ADDR_EXPR)
    return NULL_TREE;
  tree base = get_base_address (TREE_OPERAND (addr, 0));
  if (TREE_CODE (base) != VAR_DECL)
    return NULL_TREE;
  return base;
}
/* Return true if STMT_INFO accesses a decl that is known to be the
   argument to a vld1 in the same function.  */
static bool
aarch64_accesses_vector_load_decl_p (stmt_vec_info stmt_info)
{
  if (!cfun->machine->vector_load_decls)
    return false;
  auto dr = STMT_VINFO_DATA_REF (stmt_info);
  if (!dr)
    return false;
  tree decl = aarch64_vector_load_decl (DR_BASE_ADDRESS (dr));
  return decl && cfun->machine->vector_load_decls->contains (decl);
}
16069 /* Information about how the CPU would issue the scalar, Advanced SIMD
16070 or SVE version of a vector loop, using the scheme defined by the
16071 aarch64_base_vec_issue_info hierarchy of structures. */
16072 class aarch64_vec_op_count
16075 aarch64_vec_op_count () = default;
16076 aarch64_vec_op_count (const aarch64_vec_issue_info
*, unsigned int,
16079 unsigned int vec_flags () const { return m_vec_flags
; }
16080 unsigned int vf_factor () const { return m_vf_factor
; }
16082 const aarch64_base_vec_issue_info
*base_issue_info () const;
16083 const aarch64_simd_vec_issue_info
*simd_issue_info () const;
16084 const aarch64_sve_vec_issue_info
*sve_issue_info () const;
16086 fractional_cost
rename_cycles_per_iter () const;
16087 fractional_cost
min_nonpred_cycles_per_iter () const;
16088 fractional_cost
min_pred_cycles_per_iter () const;
16089 fractional_cost
min_cycles_per_iter () const;
16091 void dump () const;
16093 /* The number of individual "general" operations. See the comments
16094 in aarch64_base_vec_issue_info for details. */
16095 unsigned int general_ops
= 0;
16097 /* The number of load and store operations, under the same scheme
16099 unsigned int loads
= 0;
16100 unsigned int stores
= 0;
16102 /* The minimum number of cycles needed to execute all loop-carried
16103 operations, which in the vector code become associated with
16105 unsigned int reduction_latency
= 0;
16107 /* The number of individual predicate operations. See the comments
16108 in aarch64_sve_vec_issue_info for details. */
16109 unsigned int pred_ops
= 0;
16112 /* The issue information for the core. */
16113 const aarch64_vec_issue_info
*m_issue_info
= nullptr;
16115 /* - If M_VEC_FLAGS is zero then this structure describes scalar code
16116 - If M_VEC_FLAGS & VEC_ADVSIMD is nonzero then this structure describes
16117 Advanced SIMD code.
16118 - If M_VEC_FLAGS & VEC_ANY_SVE is nonzero then this structure describes
16120 unsigned int m_vec_flags
= 0;
16122 /* Assume that, when the code is executing on the core described
16123 by M_ISSUE_INFO, one iteration of the loop will handle M_VF_FACTOR
16124 times more data than the vectorizer anticipates.
16126 This is only ever different from 1 for SVE. It allows us to consider
16127 what would happen on a 256-bit SVE target even when the -mtune
16128 parameters say that the “likely” SVE length is 128 bits. */
16129 unsigned int m_vf_factor
= 1;
aarch64_vec_op_count::
aarch64_vec_op_count (const aarch64_vec_issue_info *issue_info,
		      unsigned int vec_flags, unsigned int vf_factor)
  : m_issue_info (issue_info),
    m_vec_flags (vec_flags),
    m_vf_factor (vf_factor)
{
}
/* Return the base issue information (i.e. the parts that make sense
   for both scalar and vector code).  Return null if we have no issue
   information.  */
const aarch64_base_vec_issue_info *
aarch64_vec_op_count::base_issue_info () const
{
  if (auto *ret = simd_issue_info ())
    return ret;
  return m_issue_info->scalar;
}

/* If the structure describes vector code and we have associated issue
   information, return that issue information, otherwise return null.  */
const aarch64_simd_vec_issue_info *
aarch64_vec_op_count::simd_issue_info () const
{
  if (auto *ret = sve_issue_info ())
    return ret;
  if (m_vec_flags)
    return m_issue_info->advsimd;
  return nullptr;
}

/* If the structure describes SVE code and we have associated issue
   information, return that issue information, otherwise return null.  */
const aarch64_sve_vec_issue_info *
aarch64_vec_op_count::sve_issue_info () const
{
  if (m_vec_flags & VEC_ANY_SVE)
    return m_issue_info->sve;
  return nullptr;
}
/* Estimate the minimum number of cycles per iteration needed to rename
   the instructions.

   ??? For now this is done inline rather than via cost tables, since it
   isn't clear how it should be parameterized for the general case.  */
fractional_cost
aarch64_vec_op_count::rename_cycles_per_iter () const
{
  if (sve_issue_info () == &neoverse512tvb_sve_issue_info
      || sve_issue_info () == &neoversen2_sve_issue_info
      || sve_issue_info () == &neoversev2_sve_issue_info)
    /* + 1 for an addition.  We've already counted a general op for each
       store, so we don't need to account for stores separately.  The branch
       reads no registers and so does not need to be counted either.

       ??? This value is very much on the pessimistic side, but seems to work
       pretty well in practice.  */
    return { general_ops + loads + pred_ops + 1, 5 };

  return 0;
}
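
/* Worked example (editorial, illustrative only): with general_ops = 6,
   loads = 2 and pred_ops = 2, the formula above gives
   (6 + 2 + 2 + 1) / 5, i.e. an estimated 2.2 rename cycles per
   iteration.  */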
/* Like min_cycles_per_iter, but excluding predicate operations.  */
fractional_cost
aarch64_vec_op_count::min_nonpred_cycles_per_iter () const
{
  auto *issue_info = base_issue_info ();

  fractional_cost cycles = MAX (reduction_latency, 1);
  cycles = std::max (cycles, { stores, issue_info->stores_per_cycle });
  cycles = std::max (cycles, { loads + stores,
			       issue_info->loads_stores_per_cycle });
  cycles = std::max (cycles, { general_ops,
			       issue_info->general_ops_per_cycle });
  cycles = std::max (cycles, rename_cycles_per_iter ());
  return cycles;
}

/* Like min_cycles_per_iter, but including only the predicate operations.  */
fractional_cost
aarch64_vec_op_count::min_pred_cycles_per_iter () const
{
  if (auto *issue_info = sve_issue_info ())
    return { pred_ops, issue_info->pred_ops_per_cycle };
  return 0;
}

/* Estimate the minimum number of cycles needed to issue the operations.
   This is a very simplistic model!  */
fractional_cost
aarch64_vec_op_count::min_cycles_per_iter () const
{
  return std::max (min_nonpred_cycles_per_iter (),
		   min_pred_cycles_per_iter ());
}
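
/* Worked example (editorial, illustrative only): on a core that can issue
   2 loads/stores, 1 store and 4 general ops per cycle, a loop body with
   6 general ops, 3 loads, 1 store and a reduction latency of 2 gives
   max (2, 1/1, 4/2, 6/4) = 2 non-predicate cycles per iteration, before
   the rename and predicate terms are taken into account.  */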
/* Dump information about the structure.  */
void
aarch64_vec_op_count::dump () const
{
  dump_printf_loc (MSG_NOTE, vect_location,
		   "  load operations = %d\n", loads);
  dump_printf_loc (MSG_NOTE, vect_location,
		   "  store operations = %d\n", stores);
  dump_printf_loc (MSG_NOTE, vect_location,
		   "  general operations = %d\n", general_ops);
  if (sve_issue_info ())
    dump_printf_loc (MSG_NOTE, vect_location,
		     "  predicate operations = %d\n", pred_ops);
  dump_printf_loc (MSG_NOTE, vect_location,
		   "  reduction latency = %d\n", reduction_latency);
  if (auto rcpi = rename_cycles_per_iter ())
    dump_printf_loc (MSG_NOTE, vect_location,
		     "  estimated cycles per iteration to rename = %f\n",
		     rcpi.as_double ());
  if (auto pred_cpi = min_pred_cycles_per_iter ())
    {
      dump_printf_loc (MSG_NOTE, vect_location,
		       "  estimated min cycles per iteration"
		       " without predication = %f\n",
		       min_nonpred_cycles_per_iter ().as_double ());
      dump_printf_loc (MSG_NOTE, vect_location,
		       "  estimated min cycles per iteration"
		       " for predication = %f\n", pred_cpi.as_double ());
    }
  if (auto cpi = min_cycles_per_iter ())
    dump_printf_loc (MSG_NOTE, vect_location,
		     "  estimated min cycles per iteration = %f\n",
		     cpi.as_double ());
}
16265 /* Information about vector code that we're in the process of costing. */
16266 class aarch64_vector_costs
: public vector_costs
16269 aarch64_vector_costs (vec_info
*, bool);
16271 unsigned int add_stmt_cost (int count
, vect_cost_for_stmt kind
,
16272 stmt_vec_info stmt_info
, slp_tree
, tree vectype
,
16274 vect_cost_model_location where
) override
;
16275 void finish_cost (const vector_costs
*) override
;
16276 bool better_main_loop_than_p (const vector_costs
*other
) const override
;
16279 void record_potential_advsimd_unrolling (loop_vec_info
);
16280 void analyze_loop_vinfo (loop_vec_info
);
16281 void count_ops (unsigned int, vect_cost_for_stmt
, stmt_vec_info
, slp_tree
,
16282 aarch64_vec_op_count
*);
16283 fractional_cost
adjust_body_cost_sve (const aarch64_vec_op_count
*,
16284 fractional_cost
, unsigned int,
16285 unsigned int *, bool *);
16286 unsigned int adjust_body_cost (loop_vec_info
, const aarch64_vector_costs
*,
16288 bool prefer_unrolled_loop () const;
16289 unsigned int determine_suggested_unroll_factor ();
16291 /* True if we have performed one-time initialization based on the
16293 bool m_analyzed_vinfo
= false;
16295 /* This loop uses an average operation that is not supported by SVE, but is
16296 supported by Advanced SIMD and SVE2. */
16297 bool m_has_avg
= false;
16299 /* Additional initialization costs for using gather or scatter operation in
16300 the current loop. */
16301 unsigned int m_sve_gather_scatter_init_cost
= 0;
16303 /* True if the vector body contains a store to a decl and if the
16304 function is known to have a vld1 from the same decl.
16306 In the Advanced SIMD ACLE, the recommended endian-agnostic way of
16307 initializing a vector is:
16309 float f[4] = { elts };
16310 float32x4_t x = vld1q_f32(f);
16312 We should strongly prefer vectorization of the initialization of f,
16313 so that the store to f and the load back can be optimized away,
16314 leaving a vectorization of { elts }. */
16315 bool m_stores_to_vector_load_decl
= false;
16317 /* Non-zero if the last operation we costed is a vector promotion or demotion.
16318 In this case the value is the number of insns in the last operation.
16320 On AArch64 vector promotion and demotions require us to first widen or
16321 narrow the input and only after that emit conversion instructions. For
16322 costing this means we need to emit the cost of the final conversions as
16324 unsigned int m_num_last_promote_demote
= 0;
16326 /* - If M_VEC_FLAGS is zero then we're costing the original scalar code.
16327 - If M_VEC_FLAGS & VEC_ADVSIMD is nonzero then we're costing Advanced
16329 - If M_VEC_FLAGS & VEC_ANY_SVE is nonzero then we're costing SVE code. */
16330 unsigned int m_vec_flags
= 0;
16332 /* At the moment, we do not model LDP and STP in the vector and scalar costs.
16333 This means that code such as:
16338 will be costed as two scalar instructions and two vector instructions
16339 (a scalar_to_vec and an unaligned_store). For SLP, the vector form
16340 wins if the costs are equal, because of the fact that the vector costs
16341 include constant initializations whereas the scalar costs don't.
16342 We would therefore tend to vectorize the code above, even though
16343 the scalar version can use a single STP.
16345 We should eventually fix this and model LDP and STP in the main costs;
16346 see the comment in aarch64_sve_adjust_stmt_cost for some of the problems.
16347 Until then, we look specifically for code that does nothing more than
16348 STP-like operations. We cost them on that basis in addition to the
16349 normal latency-based costs.
16351 If the scalar or vector code could be a sequence of STPs +
16352 initialization, this variable counts the cost of the sequence,
16353 with 2 units per instruction. The variable is ~0U for other
16355 unsigned int m_stp_sequence_cost
= 0;
16357 /* On some CPUs, SVE and Advanced SIMD provide the same theoretical vector
16358 throughput, such as 4x128 Advanced SIMD vs. 2x256 SVE. In those
16359 situations, we try to predict whether an Advanced SIMD implementation
16360 of the loop could be completely unrolled and become straight-line code.
16361 If so, it is generally better to use the Advanced SIMD version rather
16362 than length-agnostic SVE, since the SVE loop would execute an unknown
16363 number of times and so could not be completely unrolled in the same way.
16365 If we're applying this heuristic, M_UNROLLED_ADVSIMD_NITERS is the
16366 number of Advanced SIMD loop iterations that would be unrolled and
16367 M_UNROLLED_ADVSIMD_STMTS estimates the total number of statements
16368 in the unrolled loop. Both values are zero if we're not applying
16370 unsigned HOST_WIDE_INT m_unrolled_advsimd_niters
= 0;
16371 unsigned HOST_WIDE_INT m_unrolled_advsimd_stmts
= 0;
16373 /* If we're vectorizing a loop that executes a constant number of times,
16374 this variable gives the number of times that the vector loop would
16375 iterate, otherwise it is zero. */
16376 uint64_t m_num_vector_iterations
= 0;
16378 /* Used only when vectorizing loops. Estimates the number and kind of
16379 operations that would be needed by one iteration of the scalar
16380 or vector loop. There is one entry for each tuning option of
16382 auto_vec
<aarch64_vec_op_count
, 2> m_ops
;
16385 aarch64_vector_costs::aarch64_vector_costs (vec_info
*vinfo
,
16386 bool costing_for_scalar
)
16387 : vector_costs (vinfo
, costing_for_scalar
),
16388 m_vec_flags (costing_for_scalar
? 0
16389 : aarch64_classify_vector_mode (vinfo
->vector_mode
))
16391 if (auto *issue_info
= aarch64_tune_params
.vec_costs
->issue_info
)
16393 m_ops
.quick_push ({ issue_info
, m_vec_flags
});
16394 if (aarch64_tune_params
.vec_costs
== &neoverse512tvb_vector_cost
)
16396 unsigned int vf_factor
= (m_vec_flags
& VEC_ANY_SVE
) ? 2 : 1;
16397 m_ops
.quick_push ({ &neoversev1_vec_issue_info
, m_vec_flags
,
/* Implement TARGET_VECTORIZE_CREATE_COSTS.  */
vector_costs *
aarch64_vectorize_create_costs (vec_info *vinfo, bool costing_for_scalar)
{
  return new aarch64_vector_costs (vinfo, costing_for_scalar);
}

/* Return true if the current CPU should use the new costs defined
   in GCC 11.  This should be removed for GCC 12 and above, with the
   costs applying to all CPUs instead.  */
static bool
aarch64_use_new_vector_costs_p ()
{
  return (aarch64_tune_params.extra_tuning_flags
	  & AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS);
}
/* Return the appropriate SIMD costs for vectors of type VECTYPE.  */
static const simd_vec_cost *
aarch64_simd_vec_costs (tree vectype)
{
  const cpu_vector_cost *costs = aarch64_tune_params.vec_costs;
  if (vectype != NULL
      && aarch64_sve_mode_p (TYPE_MODE (vectype))
      && costs->sve != NULL)
    return costs->sve;
  return costs->advsimd;
}

/* Return the appropriate SIMD costs for vectors with VEC_* flags FLAGS.  */
static const simd_vec_cost *
aarch64_simd_vec_costs_for_flags (unsigned int flags)
{
  const cpu_vector_cost *costs = aarch64_tune_params.vec_costs;
  if ((flags & VEC_ANY_SVE) && costs->sve)
    return costs->sve;
  return costs->advsimd;
}

/* If STMT_INFO is a memory reference, return the scalar memory type,
   otherwise return null.  */
static tree
aarch64_dr_type (stmt_vec_info stmt_info)
{
  if (auto dr = STMT_VINFO_DATA_REF (stmt_info))
    return TREE_TYPE (DR_REF (dr));
  return NULL_TREE;
}
16452 /* Decide whether to use the unrolling heuristic described above
16453 m_unrolled_advsimd_niters, updating that field if so. LOOP_VINFO
16454 describes the loop that we're vectorizing. */
16456 aarch64_vector_costs::
16457 record_potential_advsimd_unrolling (loop_vec_info loop_vinfo
)
16459 /* The heuristic only makes sense on targets that have the same
16460 vector throughput for SVE and Advanced SIMD. */
16461 if (!(aarch64_tune_params
.extra_tuning_flags
16462 & AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT
))
16465 /* We only want to apply the heuristic if LOOP_VINFO is being
16466 vectorized for SVE. */
16467 if (!(m_vec_flags
& VEC_ANY_SVE
))
16470 /* Check whether it is possible in principle to use Advanced SIMD
16472 if (aarch64_autovec_preference
== AARCH64_AUTOVEC_SVE_ONLY
)
16475 /* We don't want to apply the heuristic to outer loops, since it's
16476 harder to track two levels of unrolling. */
16477 if (LOOP_VINFO_LOOP (loop_vinfo
)->inner
)
16480 /* Only handle cases in which the number of Advanced SIMD iterations
16481 would be known at compile time but the number of SVE iterations
16483 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo
)
16484 || aarch64_sve_vg
.is_constant ())
16487 /* Guess how many times the Advanced SIMD loop would iterate and make
16488 sure that it is within the complete unrolling limit. Even if the
16489 number of iterations is small enough, the number of statements might
16490 not be, which is why we need to estimate the number of statements too. */
16491 unsigned int estimated_vq
= aarch64_estimated_sve_vq ();
16492 unsigned int advsimd_vf
= CEIL (vect_vf_for_cost (loop_vinfo
), estimated_vq
);
16493 unsigned HOST_WIDE_INT unrolled_advsimd_niters
16494 = LOOP_VINFO_INT_NITERS (loop_vinfo
) / advsimd_vf
;
16495 if (unrolled_advsimd_niters
> (unsigned int) param_max_completely_peel_times
)
16498 /* Record that we're applying the heuristic and should try to estimate
16499 the number of statements in the Advanced SIMD loop. */
16500 m_unrolled_advsimd_niters
= unrolled_advsimd_niters
;
16503 /* Do one-time initialization of the aarch64_vector_costs given that we're
16504 costing the loop vectorization described by LOOP_VINFO. */
16506 aarch64_vector_costs::analyze_loop_vinfo (loop_vec_info loop_vinfo
)
16508 /* Record the number of times that the vector loop would execute,
16510 class loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
16511 auto scalar_niters
= max_stmt_executions_int (loop
);
16512 if (scalar_niters
>= 0)
16514 unsigned int vf
= vect_vf_for_cost (loop_vinfo
);
16515 if (LOOP_VINFO_MASKS (loop_vinfo
).is_empty ())
16516 m_num_vector_iterations
= scalar_niters
/ vf
;
16518 m_num_vector_iterations
= CEIL (scalar_niters
, vf
);
16521 /* Detect whether we're vectorizing for SVE and should apply the unrolling
16522 heuristic described above m_unrolled_advsimd_niters. */
16523 record_potential_advsimd_unrolling (loop_vinfo
);
16526 /* Implement targetm.vectorize.builtin_vectorization_cost. */
16528 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost
,
16530 int misalign ATTRIBUTE_UNUSED
)
16533 const cpu_vector_cost
*costs
= aarch64_tune_params
.vec_costs
;
16536 if (vectype
!= NULL
)
16537 fp
= FLOAT_TYPE_P (vectype
);
16539 const simd_vec_cost
*simd_costs
= aarch64_simd_vec_costs (vectype
);
16541 switch (type_of_cost
)
16544 return fp
? costs
->scalar_fp_stmt_cost
: costs
->scalar_int_stmt_cost
;
16547 return costs
->scalar_load_cost
;
16550 return costs
->scalar_store_cost
;
16553 return fp
? simd_costs
->fp_stmt_cost
16554 : simd_costs
->int_stmt_cost
;
16557 return simd_costs
->align_load_cost
;
16560 return simd_costs
->store_cost
;
16562 case vec_to_scalar
:
16563 return simd_costs
->vec_to_scalar_cost
;
16565 case scalar_to_vec
:
16566 return simd_costs
->scalar_to_vec_cost
;
16568 case unaligned_load
:
16569 case vector_gather_load
:
16570 return simd_costs
->unalign_load_cost
;
16572 case unaligned_store
:
16573 case vector_scatter_store
:
16574 return simd_costs
->unalign_store_cost
;
16576 case cond_branch_taken
:
16577 return costs
->cond_taken_branch_cost
;
16579 case cond_branch_not_taken
:
16580 return costs
->cond_not_taken_branch_cost
;
16583 return simd_costs
->permute_cost
;
16585 case vec_promote_demote
:
16586 return fp
? simd_costs
->fp_stmt_cost
16587 : simd_costs
->int_stmt_cost
;
16589 case vec_construct
:
16590 elements
= estimated_poly_value (TYPE_VECTOR_SUBPARTS (vectype
));
16591 return elements
/ 2 + 1;
16594 gcc_unreachable ();
16598 /* Return true if an access of kind KIND for STMT_INFO (or NODE if SLP)
16599 represents one vector of an LD[234] or ST[234] operation. Return the total
16600 number of vectors (2, 3 or 4) if so, otherwise return a value outside that
16603 aarch64_ld234_st234_vectors (vect_cost_for_stmt kind
, stmt_vec_info stmt_info
,
16606 if ((kind
== vector_load
16607 || kind
== unaligned_load
16608 || kind
== vector_store
16609 || kind
== unaligned_store
)
16610 && STMT_VINFO_DATA_REF (stmt_info
))
16612 stmt_info
= DR_GROUP_FIRST_ELEMENT (stmt_info
);
16614 && vect_mem_access_type (stmt_info
, node
) == VMAT_LOAD_STORE_LANES
)
16615 return DR_GROUP_SIZE (stmt_info
);
16620 /* Return true if creating multiple copies of STMT_INFO for Advanced SIMD
16621 vectors would produce a series of LDP or STP operations. KIND is the
16622 kind of statement that STMT_INFO represents. */
16624 aarch64_advsimd_ldp_stp_p (enum vect_cost_for_stmt kind
,
16625 stmt_vec_info stmt_info
)
16631 case unaligned_load
:
16632 case unaligned_store
:
16639 return is_gimple_assign (stmt_info
->stmt
);
16642 /* Return true if STMT_INFO is the second part of a two-statement multiply-add
16643 or multiply-subtract sequence that might be suitable for fusing into a
16644 single instruction. If VEC_FLAGS is zero, analyze the operation as
16645 a scalar one, otherwise analyze it as an operation on vectors with those
16648 aarch64_multiply_add_p (vec_info
*vinfo
, stmt_vec_info stmt_info
,
16649 unsigned int vec_flags
)
16651 gassign
*assign
= dyn_cast
<gassign
*> (stmt_info
->stmt
);
16654 tree_code code
= gimple_assign_rhs_code (assign
);
16655 if (code
!= PLUS_EXPR
&& code
!= MINUS_EXPR
)
16658 auto is_mul_result
= [&](int i
)
16660 tree rhs
= gimple_op (assign
, i
);
16661 /* ??? Should we try to check for a single use as well? */
16662 if (TREE_CODE (rhs
) != SSA_NAME
)
16665 stmt_vec_info def_stmt_info
= vinfo
->lookup_def (rhs
);
16667 || STMT_VINFO_DEF_TYPE (def_stmt_info
) != vect_internal_def
)
16669 gassign
*rhs_assign
= dyn_cast
<gassign
*> (def_stmt_info
->stmt
);
16670 if (!rhs_assign
|| gimple_assign_rhs_code (rhs_assign
) != MULT_EXPR
)
16673 if (vec_flags
& VEC_ADVSIMD
)
16675 /* Scalar and SVE code can tie the result to any FMLA input (or none,
16676 although that requires a MOVPRFX for SVE). However, Advanced SIMD
16677 only supports MLA forms, so will require a move if the result
16678 cannot be tied to the accumulator. The most important case in
16679 which this is true is when the accumulator input is invariant. */
16680 rhs
= gimple_op (assign
, 3 - i
);
16681 if (TREE_CODE (rhs
) != SSA_NAME
)
16683 def_stmt_info
= vinfo
->lookup_def (rhs
);
16685 || STMT_VINFO_DEF_TYPE (def_stmt_info
) == vect_external_def
16686 || STMT_VINFO_DEF_TYPE (def_stmt_info
) == vect_constant_def
)
16693 if (code
== MINUS_EXPR
&& (vec_flags
& VEC_ADVSIMD
))
16694 /* Advanced SIMD doesn't have FNMADD/FNMSUB/FNMLA/FNMLS, so the
16695 multiplication must be on the second operand (to form an FMLS).
16696 But if both operands are multiplications and the second operand
16697 is used more than once, we'll instead negate the second operand
16698 and use it as an accumulator for the first operand. */
16699 return (is_mul_result (2)
16700 && (has_single_use (gimple_assign_rhs2 (assign
))
16701 || !is_mul_result (1)));
16703 return is_mul_result (1) || is_mul_result (2);
16706 /* Return true if STMT_INFO is the second part of a two-statement boolean AND
16707 expression sequence that might be suitable for fusing into a
16708 single instruction. If VEC_FLAGS is zero, analyze the operation as
16709 a scalar one, otherwise analyze it as an operation on vectors with those
16713 aarch64_bool_compound_p (vec_info
*vinfo
, stmt_vec_info stmt_info
,
16714 unsigned int vec_flags
)
16716 gassign
*assign
= dyn_cast
<gassign
*> (stmt_info
->stmt
);
16718 || gimple_assign_rhs_code (assign
) != BIT_AND_EXPR
16719 || !STMT_VINFO_VECTYPE (stmt_info
)
16720 || !VECTOR_BOOLEAN_TYPE_P (STMT_VINFO_VECTYPE (stmt_info
)))
16723 for (int i
= 1; i
< 3; ++i
)
16725 tree rhs
= gimple_op (assign
, i
);
16727 if (TREE_CODE (rhs
) != SSA_NAME
)
16730 stmt_vec_info def_stmt_info
= vinfo
->lookup_def (rhs
);
16732 || STMT_VINFO_DEF_TYPE (def_stmt_info
) != vect_internal_def
)
16735 gassign
*rhs_assign
= dyn_cast
<gassign
*> (def_stmt_info
->stmt
);
16737 || TREE_CODE_CLASS (gimple_assign_rhs_code (rhs_assign
))
16741 if (vec_flags
& VEC_ADVSIMD
)
/* We are considering implementing STMT_INFO using SVE.  If STMT_INFO is an
   in-loop reduction that SVE supports directly, return its latency in cycles,
   otherwise return zero.  SVE_COSTS specifies the latencies of the relevant
   instructions.  */
static unsigned int
aarch64_sve_in_loop_reduction_latency (vec_info *vinfo,
				       stmt_vec_info stmt_info,
				       const sve_vec_cost *sve_costs)
{
  switch (vect_reduc_type (vinfo, stmt_info))
    {
    case EXTRACT_LAST_REDUCTION:
      return sve_costs->clast_cost;

    case FOLD_LEFT_REDUCTION:
      switch (TYPE_MODE (TREE_TYPE (gimple_get_lhs (stmt_info->stmt))))
	{
	case E_HFmode:
	case E_BFmode:
	  return sve_costs->fadda_f16_cost;

	case E_SFmode:
	  return sve_costs->fadda_f32_cost;

	case E_DFmode:
	  return sve_costs->fadda_f64_cost;

	default:
	  break;
	}
      break;
    }

  return 0;
}
/* STMT_INFO describes a loop-carried operation in the original scalar code
   that we are considering implementing as a reduction.  Return one of the
   following values, depending on VEC_FLAGS:

   - If VEC_FLAGS is zero, return the loop carry latency of the original
     scalar operation.

   - If VEC_FLAGS & VEC_ADVSIMD, return the loop carry latency of the
     Advanced SIMD implementation.

   - If VEC_FLAGS & VEC_ANY_SVE, return the loop carry latency of the
     SVE implementation.  */
static unsigned int
aarch64_in_loop_reduction_latency (vec_info *vinfo, stmt_vec_info stmt_info,
				   unsigned int vec_flags)
{
  const cpu_vector_cost *vec_costs = aarch64_tune_params.vec_costs;
  const sve_vec_cost *sve_costs = nullptr;
  if (vec_flags & VEC_ANY_SVE)
    sve_costs = aarch64_tune_params.vec_costs->sve;

  /* If the caller is asking for the SVE latency, check for forms of reduction
     that only SVE can handle directly.  */
  if (sve_costs)
    {
      unsigned int latency
	= aarch64_sve_in_loop_reduction_latency (vinfo, stmt_info, sve_costs);
      if (latency)
	return latency;
    }

  /* Handle scalar costs.  */
  bool is_float = FLOAT_TYPE_P (TREE_TYPE (gimple_get_lhs (stmt_info->stmt)));
  if (vec_flags == 0)
    {
      if (is_float)
	return vec_costs->scalar_fp_stmt_cost;
      return vec_costs->scalar_int_stmt_cost;
    }

  /* Otherwise, the loop body just contains normal integer or FP operations,
     with a vector reduction outside the loop.  */
  const simd_vec_cost *simd_costs
    = aarch64_simd_vec_costs_for_flags (vec_flags);
  if (is_float)
    return simd_costs->fp_stmt_cost;
  return simd_costs->int_stmt_cost;
}
/* STMT_COST is the cost calculated by aarch64_builtin_vectorization_cost
   for STMT_INFO, which has cost kind KIND.  If this is a scalar operation,
   try to subdivide the target-independent categorization provided by KIND
   to get a more accurate cost.  */
static fractional_cost
aarch64_detect_scalar_stmt_subtype (vec_info *vinfo, vect_cost_for_stmt kind,
				    stmt_vec_info stmt_info,
				    fractional_cost stmt_cost)
{
  /* Detect an extension of a loaded value.  In general, we'll be able to fuse
     the extension with the load.  */
  if (kind == scalar_stmt && vect_is_extending_load (vinfo, stmt_info))
    return 0;

  return stmt_cost;
}
16851 /* STMT_COST is the cost calculated by aarch64_builtin_vectorization_cost
16852 for the vectorized form of STMT_INFO possibly using SLP node NODE, which has
16853 cost kind KIND and which when vectorized would operate on vector type
16854 VECTYPE. Try to subdivide the target-independent categorization provided by
16855 KIND to get a more accurate cost. WHERE specifies where the cost associated
16856 with KIND occurs. */
16857 static fractional_cost
16858 aarch64_detect_vector_stmt_subtype (vec_info
*vinfo
, vect_cost_for_stmt kind
,
16859 stmt_vec_info stmt_info
, slp_tree node
,
16861 enum vect_cost_model_location where
,
16862 fractional_cost stmt_cost
)
16864 const simd_vec_cost
*simd_costs
= aarch64_simd_vec_costs (vectype
);
16865 const sve_vec_cost
*sve_costs
= nullptr;
16866 if (aarch64_sve_mode_p (TYPE_MODE (vectype
)))
16867 sve_costs
= aarch64_tune_params
.vec_costs
->sve
;
16869 /* It's generally better to avoid costing inductions, since the induction
16870 will usually be hidden by other operations. This is particularly true
16871 for things like COND_REDUCTIONS. */
16872 if (is_a
<gphi
*> (stmt_info
->stmt
))
16875 /* Detect cases in which vec_to_scalar is describing the extraction of a
16876 vector element in preparation for a scalar store. The store itself is
16877 costed separately. */
16878 if (vect_is_store_elt_extraction (kind
, stmt_info
))
16879 return simd_costs
->store_elt_extra_cost
;
16881 /* Detect SVE gather loads, which are costed as a single scalar_load
16882 for each element. We therefore need to divide the full-instruction
16883 cost by the number of elements in the vector. */
16884 if (kind
== scalar_load
16886 && vect_mem_access_type (stmt_info
, node
) == VMAT_GATHER_SCATTER
)
16888 unsigned int nunits
= vect_nunits_for_cost (vectype
);
16889 /* Test for VNx2 modes, which have 64-bit containers. */
16890 if (known_eq (GET_MODE_NUNITS (TYPE_MODE (vectype
)), aarch64_sve_vg
))
16891 return { sve_costs
->gather_load_x64_cost
, nunits
};
16892 return { sve_costs
->gather_load_x32_cost
, nunits
};
16895 /* Detect cases in which a scalar_store is really storing one element
16896 in a scatter operation. */
16897 if (kind
== scalar_store
16899 && vect_mem_access_type (stmt_info
, node
) == VMAT_GATHER_SCATTER
)
16900 return sve_costs
->scatter_store_elt_cost
;
16902 /* Detect cases in which vec_to_scalar represents an in-loop reduction. */
16903 if (kind
== vec_to_scalar
16904 && where
== vect_body
16907 unsigned int latency
16908 = aarch64_sve_in_loop_reduction_latency (vinfo
, stmt_info
, sve_costs
);
16913 /* Detect cases in which vec_to_scalar represents a single reduction
16914 instruction like FADDP or MAXV. */
16915 if (kind
== vec_to_scalar
16916 && where
== vect_epilogue
16917 && vect_is_reduction (stmt_info
))
16918 switch (GET_MODE_INNER (TYPE_MODE (vectype
)))
16921 return simd_costs
->reduc_i8_cost
;
16924 return simd_costs
->reduc_i16_cost
;
16927 return simd_costs
->reduc_i32_cost
;
16930 return simd_costs
->reduc_i64_cost
;
16934 return simd_costs
->reduc_f16_cost
;
16937 return simd_costs
->reduc_f32_cost
;
16940 return simd_costs
->reduc_f64_cost
;
16946 /* Otherwise stick with the original categorization. */
16950 /* STMT_COST is the cost calculated by aarch64_builtin_vectorization_cost
16951 for STMT_INFO, which has cost kind KIND and which when vectorized would
16952 operate on vector type VECTYPE. Adjust the cost as necessary for SVE
16954 static fractional_cost
16955 aarch64_sve_adjust_stmt_cost (class vec_info
*vinfo
, vect_cost_for_stmt kind
,
16956 stmt_vec_info stmt_info
, tree vectype
,
16957 fractional_cost stmt_cost
)
16959 /* Unlike vec_promote_demote, vector_stmt conversions do not change the
16960 vector register size or number of units. Integer promotions of this
16961 type therefore map to SXT[BHW] or UXT[BHW].
16963 Most loads have extending forms that can do the sign or zero extension
16964 on the fly. Optimistically assume that a load followed by an extension
16965 will fold to this form during combine, and that the extension therefore
16967 if (kind
== vector_stmt
&& vect_is_extending_load (vinfo
, stmt_info
))
16970 /* For similar reasons, vector_stmt integer truncations are a no-op,
16971 because we can just ignore the unused upper bits of the source. */
16972 if (kind
== vector_stmt
&& vect_is_integer_truncation (stmt_info
))
16975 /* Advanced SIMD can load and store pairs of registers using LDP and STP,
16976 but there are no equivalent instructions for SVE. This means that
16977 (all other things being equal) 128-bit SVE needs twice as many load
16978 and store instructions as Advanced SIMD in order to process vector pairs.
16980 Also, scalar code can often use LDP and STP to access pairs of values,
16981 so it is too simplistic to say that one SVE load or store replaces
16982 VF scalar loads and stores.
16984 Ideally we would account for this in the scalar and Advanced SIMD
16985 costs by making suitable load/store pairs as cheap as a single
16986 load/store. However, that would be a very invasive change and in
16987 practice it tends to stress other parts of the cost model too much.
16988 E.g. stores of scalar constants currently count just a store,
16989 whereas stores of vector constants count a store and a vec_init.
16990 This is an artificial distinction for AArch64, where stores of
16991 nonzero scalar constants need the same kind of register invariant
16994 An alternative would be to double the cost of any SVE loads and stores
16995 that could be paired in Advanced SIMD (and possibly also paired in
16996 scalar code). But this tends to stress other parts of the cost model
16997 in the same way. It also means that we can fall back to Advanced SIMD
16998 even if full-loop predication would have been useful.
17000 Here we go for a more conservative version: double the costs of SVE
17001 loads and stores if one iteration of the scalar loop processes enough
17002 elements for it to use a whole number of Advanced SIMD LDP or STP
17003 instructions. This makes it very likely that the VF would be 1 for
17004 Advanced SIMD, and so no epilogue should be needed. */
17005 if (STMT_VINFO_GROUPED_ACCESS (stmt_info
))
17007 stmt_vec_info first
= DR_GROUP_FIRST_ELEMENT (stmt_info
);
17008 unsigned int count
= DR_GROUP_SIZE (first
) - DR_GROUP_GAP (first
);
17009 unsigned int elt_bits
= GET_MODE_UNIT_BITSIZE (TYPE_MODE (vectype
));
17010 if (multiple_p (count
* elt_bits
, 256)
17011 && aarch64_advsimd_ldp_stp_p (kind
, stmt_info
))
17018 /* STMT_COST is the cost calculated for STMT_INFO, which has cost kind KIND
17019 and which when vectorized would operate on vector type VECTYPE. Add the
17020 cost of any embedded operations. */
17021 static fractional_cost
17022 aarch64_adjust_stmt_cost (vec_info
*vinfo
, vect_cost_for_stmt kind
,
17023 stmt_vec_info stmt_info
, slp_tree node
, tree vectype
,
17024 unsigned vec_flags
, fractional_cost stmt_cost
)
17028 const simd_vec_cost
*simd_costs
= aarch64_simd_vec_costs (vectype
);
17030 /* Detect cases in which a vector load or store represents an
17031 LD[234] or ST[234] instruction. */
17032 switch (aarch64_ld234_st234_vectors (kind
, stmt_info
, node
))
17035 stmt_cost
+= simd_costs
->ld2_st2_permute_cost
;
17039 stmt_cost
+= simd_costs
->ld3_st3_permute_cost
;
17043 stmt_cost
+= simd_costs
->ld4_st4_permute_cost
;
17047 gassign
*assign
= dyn_cast
<gassign
*> (STMT_VINFO_STMT (stmt_info
));
17048 if ((kind
== scalar_stmt
|| kind
== vector_stmt
) && assign
)
17050 /* For MLA we need to reduce the cost since MLA is 1 instruction. */
17051 if (!vect_is_reduction (stmt_info
)
17052 && aarch64_multiply_add_p (vinfo
, stmt_info
, vec_flags
))
17055 /* For vector boolean ANDs with a compare operand we just need
17057 if (aarch64_bool_compound_p (vinfo
, stmt_info
, vec_flags
))
17061 if (kind
== vector_stmt
|| kind
== vec_to_scalar
)
17062 if (tree cmp_type
= vect_embedded_comparison_type (stmt_info
))
17064 if (FLOAT_TYPE_P (cmp_type
))
17065 stmt_cost
+= simd_costs
->fp_stmt_cost
;
17067 stmt_cost
+= simd_costs
->int_stmt_cost
;
17071 if (kind
== scalar_stmt
)
17072 if (tree cmp_type
= vect_embedded_comparison_type (stmt_info
))
17074 if (FLOAT_TYPE_P (cmp_type
))
17075 stmt_cost
+= aarch64_tune_params
.vec_costs
->scalar_fp_stmt_cost
;
17077 stmt_cost
+= aarch64_tune_params
.vec_costs
->scalar_int_stmt_cost
;
/* Return true if STMT_INFO is part of a reduction that has the form:

      r = r op ...;
      r = r op ...;

   with the single accumulator being read and written multiple times.  */
static bool
aarch64_force_single_cycle (vec_info *vinfo, stmt_vec_info stmt_info)
{
  if (!STMT_VINFO_REDUC_DEF (stmt_info))
    return false;

  auto reduc_info = info_for_reduction (vinfo, stmt_info);
  return STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info);
}
17099 /* COUNT, KIND and STMT_INFO are the same as for vector_costs::add_stmt_cost
17100 and they describe an operation in the body of a vector loop. Record issue
17101 information relating to the vector operation in OPS. */
17103 aarch64_vector_costs::count_ops (unsigned int count
, vect_cost_for_stmt kind
,
17104 stmt_vec_info stmt_info
, slp_tree node
,
17105 aarch64_vec_op_count
*ops
)
17107 const aarch64_base_vec_issue_info
*base_issue
= ops
->base_issue_info ();
17110 const aarch64_simd_vec_issue_info
*simd_issue
= ops
->simd_issue_info ();
17111 const aarch64_sve_vec_issue_info
*sve_issue
= ops
->sve_issue_info ();
17113 /* Calculate the minimum cycles per iteration imposed by a reduction
17115 if ((kind
== scalar_stmt
|| kind
== vector_stmt
|| kind
== vec_to_scalar
)
17116 && vect_is_reduction (stmt_info
))
17119 = aarch64_in_loop_reduction_latency (m_vinfo
, stmt_info
, m_vec_flags
);
17120 if (aarch64_force_single_cycle (m_vinfo
, stmt_info
))
17121 /* ??? Ideally we'd use a tree to reduce the copies down to 1 vector,
17122 and then accumulate that, but at the moment the loop-carried
17123 dependency includes all copies. */
17124 ops
->reduction_latency
= MAX (ops
->reduction_latency
, base
* count
);
17126 ops
->reduction_latency
= MAX (ops
->reduction_latency
, base
);
17129 if (stmt_info
&& (kind
== scalar_stmt
|| kind
== vector_stmt
))
17131 /* Assume that multiply-adds will become a single operation. */
17132 if (aarch64_multiply_add_p (m_vinfo
, stmt_info
, m_vec_flags
))
17135 /* Assume that bool AND with compare operands will become a single
17137 if (aarch64_bool_compound_p (m_vinfo
, stmt_info
, m_vec_flags
))
17142 /* Count the basic operation cost associated with KIND. */
17145 case cond_branch_taken
:
17146 case cond_branch_not_taken
:
17147 case vector_gather_load
:
17148 case vector_scatter_store
:
17149 /* We currently don't expect these to be used in a loop body. */
17153 case vec_promote_demote
:
17154 case vec_construct
:
17155 case vec_to_scalar
:
17156 case scalar_to_vec
:
17159 ops
->general_ops
+= count
;
17164 case unaligned_load
:
17165 ops
->loads
+= count
;
17166 if (m_vec_flags
|| FLOAT_TYPE_P (aarch64_dr_type (stmt_info
)))
17167 ops
->general_ops
+= base_issue
->fp_simd_load_general_ops
* count
;
17171 case unaligned_store
:
17173 ops
->stores
+= count
;
17174 if (m_vec_flags
|| FLOAT_TYPE_P (aarch64_dr_type (stmt_info
)))
17175 ops
->general_ops
+= base_issue
->fp_simd_store_general_ops
* count
;
17179 /* Add any embedded comparison operations. */
17180 if ((kind
== scalar_stmt
|| kind
== vector_stmt
|| kind
== vec_to_scalar
)
17181 && vect_embedded_comparison_type (stmt_info
))
17182 ops
->general_ops
+= count
;
17184 /* COND_REDUCTIONS need two sets of VEC_COND_EXPRs, whereas so far we
17185 have only accounted for one. */
17186 if ((kind
== vector_stmt
|| kind
== vec_to_scalar
)
17187 && vect_reduc_type (m_vinfo
, stmt_info
) == COND_REDUCTION
)
17188 ops
->general_ops
+= count
;
17190 /* Count the predicate operations needed by an SVE comparison. */
17191 if (sve_issue
&& (kind
== vector_stmt
|| kind
== vec_to_scalar
))
17192 if (tree type
= vect_comparison_type (stmt_info
))
17194 unsigned int base
= (FLOAT_TYPE_P (type
)
17195 ? sve_issue
->fp_cmp_pred_ops
17196 : sve_issue
->int_cmp_pred_ops
);
17197 ops
->pred_ops
+= base
* count
;
17200 /* Add any extra overhead associated with LD[234] and ST[234] operations. */
17202 switch (aarch64_ld234_st234_vectors (kind
, stmt_info
, node
))
17205 ops
->general_ops
+= simd_issue
->ld2_st2_general_ops
* count
;
17209 ops
->general_ops
+= simd_issue
->ld3_st3_general_ops
* count
;
17213 ops
->general_ops
+= simd_issue
->ld4_st4_general_ops
* count
;
17217 /* Add any overhead associated with gather loads and scatter stores. */
17219 && (kind
== scalar_load
|| kind
== scalar_store
)
17220 && vect_mem_access_type (stmt_info
, node
) == VMAT_GATHER_SCATTER
)
17222 unsigned int pairs
= CEIL (count
, 2);
17223 ops
->pred_ops
+= sve_issue
->gather_scatter_pair_pred_ops
* pairs
;
17224 ops
->general_ops
+= sve_issue
->gather_scatter_pair_general_ops
* pairs
;
/* Return true if STMT_INFO contains a memory access and if the constant
   component of the memory address is aligned to SIZE bytes.  */
static bool
aarch64_aligned_constant_offset_p (stmt_vec_info stmt_info,
				   poly_uint64 size)
{
  if (!STMT_VINFO_DATA_REF (stmt_info))
    return false;

  if (auto first_stmt = DR_GROUP_FIRST_ELEMENT (stmt_info))
    stmt_info = first_stmt;
  tree constant_offset = DR_INIT (STMT_VINFO_DATA_REF (stmt_info));
  /* Needed for gathers & scatters, for example.  */
  if (!constant_offset)
    return false;

  return multiple_p (wi::to_poly_offset (constant_offset), size);
}
17247 /* Check if a scalar or vector stmt could be part of a region of code
17248 that does nothing more than store values to memory, in the scalar
17249 case using STP. Return the cost of the stmt if so, counting 2 for
17250 one instruction. Return ~0U otherwise.
17252 The arguments are a subset of those passed to add_stmt_cost. */
17254 aarch64_stp_sequence_cost (unsigned int count
, vect_cost_for_stmt kind
,
17255 stmt_vec_info stmt_info
, tree vectype
)
17257 /* Code that stores vector constants uses a vector_load to create
17258 the constant. We don't apply the heuristic to that case for two
17261 - At the moment, STPs are only formed via peephole2, and the
17262 constant scalar moves would often come between STRs and so
17263 prevent STP formation.
17265 - The scalar code also has to load the constant somehow, and that
17269 case scalar_to_vec
:
17270 /* Count 2 insns for a GPR->SIMD dup and 1 insn for a FPR->SIMD dup. */
17271 return (FLOAT_TYPE_P (vectype
) ? 2 : 4) * count
;
17273 case vec_construct
:
17274 if (FLOAT_TYPE_P (vectype
))
17275 /* Count 1 insn for the maximum number of FP->SIMD INS
17277 return (vect_nunits_for_cost (vectype
) - 1) * 2 * count
;
17279 /* Count 2 insns for a GPR->SIMD move and 2 insns for the
17280 maximum number of GPR->SIMD INS instructions. */
17281 return vect_nunits_for_cost (vectype
) * 4 * count
;
17284 case unaligned_store
:
17285 /* Count 1 insn per vector if we can't form STP Q pairs. */
17286 if (aarch64_sve_mode_p (TYPE_MODE (vectype
)))
17291 /* Assume we won't be able to use STP if the constant offset
17292 component of the address is misaligned. ??? This could be
17293 removed if we formed STP pairs earlier, rather than relying
17295 auto size
= GET_MODE_SIZE (TYPE_MODE (vectype
));
17296 if (!aarch64_aligned_constant_offset_p (stmt_info
, size
))
17299 return CEIL (count
, 2) * 2;
17302 if (stmt_info
&& STMT_VINFO_DATA_REF (stmt_info
))
17304 /* Check for a mode in which STP pairs can be formed. */
17305 auto size
= GET_MODE_SIZE (TYPE_MODE (aarch64_dr_type (stmt_info
)));
17306 if (maybe_ne (size
, 4) && maybe_ne (size
, 8))
17309 /* Assume we won't be able to use STP if the constant offset
17310 component of the address is misaligned. ??? This could be
17311 removed if we formed STP pairs earlier, rather than relying
17313 if (!aarch64_aligned_constant_offset_p (stmt_info
, size
))
17324 aarch64_vector_costs::add_stmt_cost (int count
, vect_cost_for_stmt kind
,
17325 stmt_vec_info stmt_info
, slp_tree node
,
17326 tree vectype
, int misalign
,
17327 vect_cost_model_location where
)
17329 fractional_cost stmt_cost
17330 = aarch64_builtin_vectorization_cost (kind
, vectype
, misalign
);
17332 bool in_inner_loop_p
= (where
== vect_body
17334 && stmt_in_inner_loop_p (m_vinfo
, stmt_info
));
17336 /* Do one-time initialization based on the vinfo. */
17337 loop_vec_info loop_vinfo
= dyn_cast
<loop_vec_info
> (m_vinfo
);
17338 if (!m_analyzed_vinfo
&& aarch64_use_new_vector_costs_p ())
17341 analyze_loop_vinfo (loop_vinfo
);
17343 m_analyzed_vinfo
= true;
17346 /* Apply the heuristic described above m_stp_sequence_cost. */
17347 if (m_stp_sequence_cost
!= ~0U)
17349 uint64_t cost
= aarch64_stp_sequence_cost (count
, kind
,
17350 stmt_info
, vectype
);
17351 m_stp_sequence_cost
= MIN (m_stp_sequence_cost
+ cost
, ~0U);
17354 /* Try to get a more accurate cost by looking at STMT_INFO instead
17355 of just looking at KIND. */
17356 if (stmt_info
&& aarch64_use_new_vector_costs_p ())
17358 /* If we scalarize a strided store, the vectorizer costs one
17359 vec_to_scalar for each element. However, we can store the first
17360 element using an FP store without a separate extract step. */
17361 if (vect_is_store_elt_extraction (kind
, stmt_info
))
17364 stmt_cost
= aarch64_detect_scalar_stmt_subtype (m_vinfo
, kind
,
17365 stmt_info
, stmt_cost
);
17367 if (vectype
&& m_vec_flags
)
17368 stmt_cost
= aarch64_detect_vector_stmt_subtype (m_vinfo
, kind
,
17373 /* Check if we've seen an SVE gather/scatter operation and which size. */
17374 if (kind
== scalar_load
17375 && aarch64_sve_mode_p (TYPE_MODE (vectype
))
17376 && vect_mem_access_type (stmt_info
, node
) == VMAT_GATHER_SCATTER
)
17378 const sve_vec_cost
*sve_costs
= aarch64_tune_params
.vec_costs
->sve
;
17381 /* Test for VNx2 modes, which have 64-bit containers. */
17382 if (known_eq (GET_MODE_NUNITS (TYPE_MODE (vectype
)),
17384 m_sve_gather_scatter_init_cost
17385 += sve_costs
->gather_load_x64_init_cost
;
17387 m_sve_gather_scatter_init_cost
17388 += sve_costs
->gather_load_x32_init_cost
;
17393 /* Do any SVE-specific adjustments to the cost. */
17394 if (stmt_info
&& vectype
&& aarch64_sve_mode_p (TYPE_MODE (vectype
)))
17395 stmt_cost
= aarch64_sve_adjust_stmt_cost (m_vinfo
, kind
, stmt_info
,
17396 vectype
, stmt_cost
);
17398 /* Vector promotion and demotion requires us to widen the operation first
17399 and only after that perform the conversion. Unfortunately the mid-end
17400 expects this to be doable as a single operation and doesn't pass on
17401 enough context here for us to tell which operation is happening. To
17402 account for this we count every promote-demote operation twice and if
17403 the previously costed operation was also a promote-demote we reduce
17404 the cost of the currently being costed operation to simulate the final
17405 conversion cost. Note that for SVE we can do better here if the converted
17406 value comes from a load since the widening load would consume the widening
17407 operations. However since we're in stage 3 we can't change the helper
17408 vect_is_extending_load and duplicating the code seems not useful. */
17409 gassign
*assign
= NULL
;
17410 if (kind
== vec_promote_demote
17411 && (assign
= dyn_cast
<gassign
*> (STMT_VINFO_STMT (stmt_info
)))
17412 && gimple_assign_rhs_code (assign
) == FLOAT_EXPR
)
17414 auto new_count
= count
* 2 - m_num_last_promote_demote
;
17415 m_num_last_promote_demote
= count
;
17419 m_num_last_promote_demote
= 0;
17421 if (stmt_info
&& aarch64_use_new_vector_costs_p ())
17423 /* Account for any extra "embedded" costs that apply additively
17424 to the base cost calculated above. */
17425 stmt_cost
= aarch64_adjust_stmt_cost (m_vinfo
, kind
, stmt_info
, node
,
17426 vectype
, m_vec_flags
, stmt_cost
);
17428 /* If we're recording a nonzero vector loop body cost for the
17429 innermost loop, also estimate the operations that would need
17430 to be issued by all relevant implementations of the loop. */
17432 && (m_costing_for_scalar
|| where
== vect_body
)
17433 && (!LOOP_VINFO_LOOP (loop_vinfo
)->inner
|| in_inner_loop_p
)
17435 for (auto &ops
: m_ops
)
17436 count_ops (count
, kind
, stmt_info
, node
, &ops
);
17438 /* If we're applying the SVE vs. Advanced SIMD unrolling heuristic,
17439 estimate the number of statements in the unrolled Advanced SIMD
17440 loop. For simplicitly, we assume that one iteration of the
17441 Advanced SIMD loop would need the same number of statements
17442 as one iteration of the SVE loop. */
17443 if (where
== vect_body
&& m_unrolled_advsimd_niters
)
17444 m_unrolled_advsimd_stmts
+= count
* m_unrolled_advsimd_niters
;
17446 /* Detect the use of an averaging operation. */
17447 gimple
*stmt
= stmt_info
->stmt
;
17448 if (is_gimple_call (stmt
)
17449 && gimple_call_internal_p (stmt
))
17451 switch (gimple_call_internal_fn (stmt
))
17453 case IFN_AVG_FLOOR
:
17462 /* If the statement stores to a decl that is known to be the argument
17463 to a vld1 in the same function, ignore the store for costing purposes.
17464 See the comment above m_stores_to_vector_load_decl for more details. */
17466 && (kind
== vector_store
|| kind
== unaligned_store
)
17467 && aarch64_accesses_vector_load_decl_p (stmt_info
))
17470 m_stores_to_vector_load_decl
= true;
17473 return record_stmt_cost (stmt_info
, where
, (count
* stmt_cost
).ceil ());
/* Return true if (a) we're applying the Advanced SIMD vs. SVE unrolling
   heuristic described above m_unrolled_advsimd_niters and (b) the heuristic
   says that we should prefer the Advanced SIMD loop.  */
bool
aarch64_vector_costs::prefer_unrolled_loop () const
{
  if (!m_unrolled_advsimd_stmts)
    return false;

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location, "Number of insns in"
		     " unrolled Advanced SIMD loop = "
		     HOST_WIDE_INT_PRINT_UNSIGNED "\n",
		     m_unrolled_advsimd_stmts);

  /* The balance here is tricky.  On the one hand, we can't be sure whether
     the code is vectorizable with Advanced SIMD or not.  However, even if
     it isn't vectorizable with Advanced SIMD, there's a possibility that
     the scalar code could also be unrolled.  Some of the code might then
     benefit from SLP, or from using LDP and STP.  We therefore apply
     the heuristic regardless of can_use_advsimd_p.  */
  return (m_unrolled_advsimd_stmts
	  && (m_unrolled_advsimd_stmts
	      <= (unsigned int) param_max_completely_peeled_insns));
}
17502 /* Subroutine of adjust_body_cost for handling SVE. Use ISSUE_INFO to work out
17503 how fast the SVE code can be issued and compare it to the equivalent value
17504 for scalar code (SCALAR_CYCLES_PER_ITER). If COULD_USE_ADVSIMD is true,
17505 also compare it to the issue rate of Advanced SIMD code
17506 (ADVSIMD_CYCLES_PER_ITER).
17508 ORIG_BODY_COST is the cost originally passed to adjust_body_cost and
17509 *BODY_COST is the current value of the adjusted cost. *SHOULD_DISPARAGE
17510 is true if we think the loop body is too expensive. */
17513 aarch64_vector_costs::
17514 adjust_body_cost_sve (const aarch64_vec_op_count
*ops
,
17515 fractional_cost scalar_cycles_per_iter
,
17516 unsigned int orig_body_cost
, unsigned int *body_cost
,
17517 bool *should_disparage
)
17519 if (dump_enabled_p ())
17522 fractional_cost sve_pred_cycles_per_iter
= ops
->min_pred_cycles_per_iter ();
17523 fractional_cost sve_cycles_per_iter
= ops
->min_cycles_per_iter ();
17525 /* If the scalar version of the loop could issue at least as
17526 quickly as the predicate parts of the SVE loop, make the SVE loop
17527 prohibitively expensive. In this case vectorization is adding an
17528 overhead that the original scalar code didn't have.
17530 This is mostly intended to detect cases in which WHILELOs dominate
17531 for very tight loops, which is something that normal latency-based
17532 costs would not model. Adding this kind of cliff edge would be
17533 too drastic for scalar_cycles_per_iter vs. sve_cycles_per_iter;
17534 code in the caller handles that case in a more conservative way. */
17535 fractional_cost sve_estimate
= sve_pred_cycles_per_iter
+ 1;
17536 if (scalar_cycles_per_iter
< sve_estimate
)
17538 unsigned int min_cost
17539 = orig_body_cost
* estimated_poly_value (BYTES_PER_SVE_VECTOR
);
17540 if (*body_cost
< min_cost
)
17542 if (dump_enabled_p ())
17543 dump_printf_loc (MSG_NOTE
, vect_location
,
17544 "Increasing body cost to %d because the"
17545 " scalar code could issue within the limit"
17546 " imposed by predicate operations\n",
17548 *body_cost
= min_cost
;
17549 *should_disparage
= true;
17553 return sve_cycles_per_iter
;
17557 aarch64_vector_costs::determine_suggested_unroll_factor ()
17559 bool sve
= m_vec_flags
& VEC_ANY_SVE
;
17560 /* If we are trying to unroll an Advanced SIMD main loop that contains
17561 an averaging operation that we do not support with SVE and we might use a
17562 predicated epilogue, we need to be conservative and block unrolling as
17563 this might lead to a less optimal loop for the first and only epilogue
17564 using the original loop's vectorization factor.
17565 TODO: Remove this constraint when we add support for multiple epilogue
17567 if (!sve
&& !TARGET_SVE2
&& m_has_avg
)
17570 unsigned int max_unroll_factor
= 1;
17571 for (auto vec_ops
: m_ops
)
17573 aarch64_simd_vec_issue_info
const *vec_issue
17574 = vec_ops
.simd_issue_info ();
17577 /* Limit unroll factor to a value adjustable by the user, the default
17579 unsigned int unroll_factor
= aarch64_vect_unroll_limit
;
17580 unsigned int factor
17581 = vec_ops
.reduction_latency
> 1 ? vec_ops
.reduction_latency
: 1;
17584 /* Sanity check, this should never happen. */
17585 if ((vec_ops
.stores
+ vec_ops
.loads
+ vec_ops
.general_ops
) == 0)
17588 /* Check stores. */
17589 if (vec_ops
.stores
> 0)
17591 temp
= CEIL (factor
* vec_issue
->stores_per_cycle
,
17593 unroll_factor
= MIN (unroll_factor
, temp
);
17596 /* Check loads + stores. */
17597 if (vec_ops
.loads
> 0)
17599 temp
= CEIL (factor
* vec_issue
->loads_stores_per_cycle
,
17600 vec_ops
.loads
+ vec_ops
.stores
);
17601 unroll_factor
= MIN (unroll_factor
, temp
);
17604 /* Check general ops. */
17605 if (vec_ops
.general_ops
> 0)
17607 temp
= CEIL (factor
* vec_issue
->general_ops_per_cycle
,
17608 vec_ops
.general_ops
);
17609 unroll_factor
= MIN (unroll_factor
, temp
);
17611 max_unroll_factor
= MAX (max_unroll_factor
, unroll_factor
);
17614 /* Make sure unroll factor is power of 2. */
17615 return 1 << ceil_log2 (max_unroll_factor
);
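/* Worked example for the resource checks above (all figures assumed rather
   than taken from any particular core): with reduction_latency == 2 the
   scaling factor is 2.  If the issue info advertises 2 stores per cycle and
   the loop body has 1 store, the store check allows CEIL (2 * 2, 1) = 4;
   with 3 loads+stores per cycle and 4 such accesses it allows
   CEIL (2 * 3, 4) = 2; with 4 general ops per cycle and 6 general ops it
   allows CEIL (2 * 4, 6) = 2.  The unroll factor for this subtuning is the
   minimum of these values and aarch64_vect_unroll_limit, and the final
   result across all subtunings is rounded to a power of two via
   1 << ceil_log2.  */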
17618 /* BODY_COST is the cost of a vector loop body. Adjust the cost as necessary
17619 and return the new cost. */
17621 aarch64_vector_costs::
17622 adjust_body_cost (loop_vec_info loop_vinfo
,
17623 const aarch64_vector_costs
*scalar_costs
,
17624 unsigned int body_cost
)
17626 if (scalar_costs
->m_ops
.is_empty () || m_ops
.is_empty ())
17629 const auto &scalar_ops
= scalar_costs
->m_ops
[0];
17630 const auto &vector_ops
= m_ops
[0];
17631 unsigned int estimated_vf
= vect_vf_for_cost (loop_vinfo
);
17632 unsigned int orig_body_cost
= body_cost
;
17633 bool should_disparage
= false;
17635 if (dump_enabled_p ())
17636 dump_printf_loc (MSG_NOTE
, vect_location
,
17637 "Original vector body cost = %d\n", body_cost
);
17639 /* If we know we have a single partial vector iteration, cap the VF
17640 to the number of scalar iterations for costing purposes. */
17641 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo
))
17643 auto niters
= LOOP_VINFO_INT_NITERS (loop_vinfo
);
17644 if (niters
< estimated_vf
&& dump_enabled_p ())
17645 dump_printf_loc (MSG_NOTE
, vect_location
,
17646 "Scalar loop iterates at most %wd times. Capping VF "
17647 " from %d to %wd\n", niters
, estimated_vf
, niters
);
17649 estimated_vf
= MIN (estimated_vf
, niters
);
17652 fractional_cost scalar_cycles_per_iter
17653 = scalar_ops
.min_cycles_per_iter () * estimated_vf
;
17655 fractional_cost vector_cycles_per_iter
= vector_ops
.min_cycles_per_iter ();
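/* Illustrative comparison (cycle counts assumed): if the scalar loop can
   issue one iteration every 2 cycles and the estimated VF is 4, the scalar
   equivalent of one vector iteration is 2 * 4 = 8 cycles.  A vector body
   that needs 6 cycles per iteration therefore looks like an issue-rate win,
   whereas one that needs 10 cycles triggers the scaling further down.  */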
17657 if (dump_enabled_p ())
17659 if (IN_RANGE (m_num_vector_iterations
, 0, 65536))
17660 dump_printf_loc (MSG_NOTE
, vect_location
,
17661 "Vector loop iterates at most %wd times\n",
17662 m_num_vector_iterations
);
17663 dump_printf_loc (MSG_NOTE
, vect_location
, "Scalar issue estimate:\n");
17664 scalar_ops
.dump ();
17665 dump_printf_loc (MSG_NOTE
, vect_location
,
17666 " estimated cycles per vector iteration"
17667 " (for VF %d) = %f\n",
17668 estimated_vf
, scalar_cycles_per_iter
.as_double ());
17671 if (vector_ops
.sve_issue_info ())
17673 if (dump_enabled_p ())
17674 dump_printf_loc (MSG_NOTE
, vect_location
, "SVE issue estimate:\n");
17675 vector_cycles_per_iter
17676 = adjust_body_cost_sve (&vector_ops
, scalar_cycles_per_iter
,
17677 orig_body_cost
, &body_cost
, &should_disparage
);
17679 if (aarch64_tune_params
.vec_costs
== &neoverse512tvb_vector_cost
)
17681 /* Also take Neoverse V1 tuning into account, doubling the
17682 scalar and Advanced SIMD estimates to account for the
17683 doubling in SVE vector length. */
17684 if (dump_enabled_p ())
17685 dump_printf_loc (MSG_NOTE
, vect_location
,
17686 "Neoverse V1 estimate:\n");
17687 auto vf_factor
= m_ops
[1].vf_factor ();
17688 adjust_body_cost_sve (&m_ops
[1], scalar_cycles_per_iter
* vf_factor
,
17689 orig_body_cost
, &body_cost
, &should_disparage
);
17694 if (dump_enabled_p ())
17696 dump_printf_loc (MSG_NOTE
, vect_location
,
17697 "Vector issue estimate:\n");
17698 vector_ops
.dump ();
17702 /* Decide whether to stick to latency-based costs or whether to try to
17703 take issue rates into account. */
17704 unsigned int threshold
= aarch64_loop_vect_issue_rate_niters
;
17705 if (m_vec_flags
& VEC_ANY_SVE
)
17706 threshold
= CEIL (threshold
, aarch64_estimated_sve_vq ());
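/* For example (parameter values assumed): with an issue-rate threshold of
   6 iterations and an estimated SVE VQ of 2, the SVE threshold becomes
   CEIL (6, 2) = 3, so only loops known to run fewer than 3 vector
   iterations fall back to pure latency-based costing here.  */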
17708 if (m_num_vector_iterations
>= 1
17709 && m_num_vector_iterations
< threshold
)
17711 if (dump_enabled_p ())
17712 dump_printf_loc (MSG_NOTE
, vect_location
,
17713 "Low iteration count, so using pure latency"
17716 /* Increase the cost of the vector code if it looks like the scalar code
17717 could issue more quickly. These values are only rough estimates,
17718 so minor differences should only result in minor changes. */
17719 else if (scalar_cycles_per_iter
< vector_cycles_per_iter
)
17721 body_cost
= fractional_cost::scale (body_cost
, vector_cycles_per_iter
,
17722 scalar_cycles_per_iter
);
17723 if (dump_enabled_p ())
17724 dump_printf_loc (MSG_NOTE
, vect_location
,
17725 "Increasing body cost to %d because scalar code"
17726 " would issue more quickly\n", body_cost
);
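/* Sketch of the scaling above (costs assumed): with body_cost == 100,
   vector_cycles_per_iter == 6 and scalar_cycles_per_iter == 4, the body
   cost is scaled to 100 * 6 / 4 = 150, penalising the vector loop in
   proportion to how much more slowly it issues.  */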
17728 /* In general, it's expected that the proposed vector code would be able
17729 to issue more quickly than the original scalar code. This should
17730 already be reflected to some extent in the latency-based costs.
17732 However, the latency-based costs effectively assume that the scalar
17733 code and the vector code execute serially, which tends to underplay
17734 one important case: if the real (non-serialized) execution time of
17735 a scalar iteration is dominated by loop-carried dependencies,
17736 and if the vector code is able to reduce both the length of
17737 the loop-carried dependencies *and* the number of cycles needed
17738 to issue the code in general, we can be more confident that the
17739 vector code is an improvement, even if adding the other (non-loop-carried)
17740 latencies tends to hide this saving. We therefore reduce the cost of the
17741 vector loop body in proportion to the saving. */
17742 else if (scalar_ops
.reduction_latency
> vector_ops
.reduction_latency
17743 && scalar_ops
.reduction_latency
== scalar_cycles_per_iter
17744 && scalar_cycles_per_iter
> vector_cycles_per_iter
17745 && !should_disparage
)
17747 body_cost
= fractional_cost::scale (body_cost
, vector_cycles_per_iter
,
17748 scalar_cycles_per_iter
);
17749 if (dump_enabled_p ())
17750 dump_printf_loc (MSG_NOTE
, vect_location
,
17751 "Decreasing body cost to %d account for smaller"
17752 " reduction latency\n", body_cost
);
17759 aarch64_vector_costs::finish_cost (const vector_costs
*uncast_scalar_costs
)
17761 /* Record the issue information for any SVE WHILE instructions that the
17763 loop_vec_info loop_vinfo
= dyn_cast
<loop_vec_info
> (m_vinfo
);
17764 if (!m_ops
.is_empty ()
17766 && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo
))
17768 unsigned int num_masks
= 0;
17769 rgroup_controls
*rgm
;
17770 unsigned int num_vectors_m1
;
17771 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo
).rgc_vec
,
17772 num_vectors_m1
, rgm
)
17774 num_masks
+= num_vectors_m1
+ 1;
17775 for (auto &ops
: m_ops
)
17776 if (auto *issue
= ops
.sve_issue_info ())
17777 ops
.pred_ops
+= num_masks
* issue
->while_pred_ops
;
17781 = static_cast<const aarch64_vector_costs
*> (uncast_scalar_costs
);
17784 && aarch64_use_new_vector_costs_p ())
17786 m_costs
[vect_body
] = adjust_body_cost (loop_vinfo
, scalar_costs
,
17787 m_costs
[vect_body
]);
17788 m_suggested_unroll_factor
= determine_suggested_unroll_factor ();
17790 /* For gather and scatters there's an additional overhead for the first
17791 iteration. For low count loops they're not beneficial so model the
17792 overhead as loop prologue costs. */
17793 m_costs
[vect_prologue
] += m_sve_gather_scatter_init_cost
;
17796 /* Apply the heuristic described above m_stp_sequence_cost. Prefer
17797 the scalar code in the event of a tie, since there is more chance
17798 of scalar code being optimized with surrounding operations.
17800 In addition, if the vector body is a simple store to a decl that
17801 is elsewhere loaded using vld1, strongly prefer the vector form,
17802 to the extent of giving the prologue a zero cost. See the comment
17803 above m_stores_to_vector_load_decl for details. */
17806 && m_stp_sequence_cost
!= ~0U)
17808 if (m_stores_to_vector_load_decl
)
17809 m_costs
[vect_prologue
] = 0;
17810 else if (m_stp_sequence_cost
>= scalar_costs
->m_stp_sequence_cost
)
17811 m_costs
[vect_body
] = 2 * scalar_costs
->total_cost ();
17814 vector_costs::finish_cost (scalar_costs
);
17818 aarch64_vector_costs::
17819 better_main_loop_than_p (const vector_costs
*uncast_other
) const
17821 auto other
= static_cast<const aarch64_vector_costs
*> (uncast_other
);
17823 auto this_loop_vinfo
= as_a
<loop_vec_info
> (this->m_vinfo
);
17824 auto other_loop_vinfo
= as_a
<loop_vec_info
> (other
->m_vinfo
);
17826 if (dump_enabled_p ())
17827 dump_printf_loc (MSG_NOTE
, vect_location
,
17828 "Comparing two main loops (%s at VF %d vs %s at VF %d)\n",
17829 GET_MODE_NAME (this_loop_vinfo
->vector_mode
),
17830 vect_vf_for_cost (this_loop_vinfo
),
17831 GET_MODE_NAME (other_loop_vinfo
->vector_mode
),
17832 vect_vf_for_cost (other_loop_vinfo
));
17834 /* Apply the unrolling heuristic described above
17835 m_unrolled_advsimd_niters. */
17836 if (bool (m_unrolled_advsimd_stmts
)
17837 != bool (other
->m_unrolled_advsimd_stmts
))
17839 bool this_prefer_unrolled
= this->prefer_unrolled_loop ();
17840 bool other_prefer_unrolled
= other
->prefer_unrolled_loop ();
17841 if (this_prefer_unrolled
!= other_prefer_unrolled
)
17843 if (dump_enabled_p ())
17844 dump_printf_loc (MSG_NOTE
, vect_location
,
17845 "Preferring Advanced SIMD loop because"
17846 " it can be unrolled\n");
17847 return other_prefer_unrolled
;
17851 for (unsigned int i
= 0; i
< m_ops
.length (); ++i
)
17853 if (dump_enabled_p ())
17856 dump_printf_loc (MSG_NOTE
, vect_location
,
17857 "Reconsidering with subtuning %d\n", i
);
17858 dump_printf_loc (MSG_NOTE
, vect_location
,
17859 "Issue info for %s loop:\n",
17860 GET_MODE_NAME (this_loop_vinfo
->vector_mode
));
17861 this->m_ops
[i
].dump ();
17862 dump_printf_loc (MSG_NOTE
, vect_location
,
17863 "Issue info for %s loop:\n",
17864 GET_MODE_NAME (other_loop_vinfo
->vector_mode
));
17865 other
->m_ops
[i
].dump ();
17868 auto this_estimated_vf
= (vect_vf_for_cost (this_loop_vinfo
)
17869 * this->m_ops
[i
].vf_factor ());
17870 auto other_estimated_vf
= (vect_vf_for_cost (other_loop_vinfo
)
17871 * other
->m_ops
[i
].vf_factor ());
17873 /* If it appears that one loop could process the same amount of data
17874 in fewer cycles, prefer that loop over the other one. */
17875 fractional_cost this_cost
17876 = this->m_ops
[i
].min_cycles_per_iter () * other_estimated_vf
;
17877 fractional_cost other_cost
17878 = other
->m_ops
[i
].min_cycles_per_iter () * this_estimated_vf
;
17879 if (dump_enabled_p ())
17881 dump_printf_loc (MSG_NOTE
, vect_location
,
17882 "Weighted cycles per iteration of %s loop ~= %f\n",
17883 GET_MODE_NAME (this_loop_vinfo
->vector_mode
),
17884 this_cost
.as_double ());
17885 dump_printf_loc (MSG_NOTE
, vect_location
,
17886 "Weighted cycles per iteration of %s loop ~= %f\n",
17887 GET_MODE_NAME (other_loop_vinfo
->vector_mode
),
17888 other_cost
.as_double ());
17890 if (this_cost
!= other_cost
)
17892 if (dump_enabled_p ())
17893 dump_printf_loc (MSG_NOTE
, vect_location
,
17894 "Preferring loop with lower cycles"
17895 " per iteration\n");
17896 return this_cost
< other_cost
;
17899 /* If the issue rate of SVE code is limited by predicate operations
17900 (i.e. if sve_pred_cycles_per_iter > sve_nonpred_cycles_per_iter),
17901 and if Advanced SIMD code could issue within the limit imposed
17902 by the predicate operations, the predicate operations are adding an
17903 overhead that the original code didn't have and so we should prefer
17904 the Advanced SIMD version. */
17905 auto better_pred_limit_p
= [](const aarch64_vec_op_count
&a
,
17906 const aarch64_vec_op_count
&b
) -> bool
17908 if (a
.pred_ops
== 0
17909 && (b
.min_pred_cycles_per_iter ()
17910 > b
.min_nonpred_cycles_per_iter ()))
17912 if (dump_enabled_p ())
17913 dump_printf_loc (MSG_NOTE
, vect_location
,
17914 "Preferring Advanced SIMD loop since"
17915 " SVE loop is predicate-limited\n");
17920 if (better_pred_limit_p (this->m_ops
[i
], other
->m_ops
[i
]))
17922 if (better_pred_limit_p (other
->m_ops
[i
], this->m_ops
[i
]))
17926 return vector_costs::better_main_loop_than_p (other
);
17929 static void initialize_aarch64_code_model (struct gcc_options
*);
17931 /* Parse the TO_PARSE string and put the architecture struct that it
17932 selects into RES and the architectural features into ISA_FLAGS.
17933 Return an aarch_parse_opt_result describing the parse result.
17934 If there is an error parsing, RES and ISA_FLAGS are left unchanged.
17935 When the TO_PARSE string contains an invalid extension,
17936 a copy of the string is created and stored to INVALID_EXTENSION. */
17938 static enum aarch_parse_opt_result
17939 aarch64_parse_arch (const char *to_parse
, const struct processor
**res
,
17940 aarch64_feature_flags
*isa_flags
,
17941 std::string
*invalid_extension
)
17944 const struct processor
*arch
;
17947 ext
= strchr (to_parse
, '+');
17950 len
= ext
- to_parse
;
17952 len
= strlen (to_parse
);
17955 return AARCH_PARSE_MISSING_ARG
;
17958 /* Loop through the list of supported ARCHes to find a match. */
17959 for (arch
= all_architectures
; arch
->name
!= NULL
; arch
++)
17961 if (strlen (arch
->name
) == len
17962 && strncmp (arch
->name
, to_parse
, len
) == 0)
17964 auto isa_temp
= arch
->flags
;
17968 /* TO_PARSE string contains at least one extension. */
17969 enum aarch_parse_opt_result ext_res
17970 = aarch64_parse_extension (ext
, &isa_temp
, invalid_extension
);
17972 if (ext_res
!= AARCH_PARSE_OK
)
17975 /* Extension parsing was successful. Confirm the result
17976 arch and ISA flags. */
17978 *isa_flags
= isa_temp
;
17979 return AARCH_PARSE_OK
;
17983 /* ARCH name not found in list. */
17984 return AARCH_PARSE_INVALID_ARG
;
17987 /* Parse the TO_PARSE string and put the result tuning in RES and the
17988 architecture flags in ISA_FLAGS. Return an aarch_parse_opt_result
17989 describing the parse result. If there is an error parsing, RES and
17990 ISA_FLAGS are left unchanged.
17991 When the TO_PARSE string contains an invalid extension,
17992 a copy of the string is created and stored to INVALID_EXTENSION. */
17994 static enum aarch_parse_opt_result
17995 aarch64_parse_cpu (const char *to_parse
, const struct processor
**res
,
17996 aarch64_feature_flags
*isa_flags
,
17997 std::string
*invalid_extension
)
18000 const struct processor
*cpu
;
18003 ext
= strchr (to_parse
, '+');
18006 len
= ext
- to_parse
;
18008 len
= strlen (to_parse
);
18011 return AARCH_PARSE_MISSING_ARG
;
18014 /* Loop through the list of supported CPUs to find a match. */
18015 for (cpu
= all_cores
; cpu
->name
!= NULL
; cpu
++)
18017 if (strlen (cpu
->name
) == len
&& strncmp (cpu
->name
, to_parse
, len
) == 0)
18019 auto isa_temp
= cpu
->flags
;
18023 /* TO_PARSE string contains at least one extension. */
18024 enum aarch_parse_opt_result ext_res
18025 = aarch64_parse_extension (ext
, &isa_temp
, invalid_extension
);
18027 if (ext_res
!= AARCH_PARSE_OK
)
18030 /* Extension parsing was successful. Confirm the result
18031 cpu and ISA flags. */
18033 *isa_flags
= isa_temp
;
18034 return AARCH_PARSE_OK
;
18038 /* CPU name not found in list. */
18039 return AARCH_PARSE_INVALID_ARG
;
/* Parse the TO_PARSE string and put the cpu it selects into RES.
   Return an aarch_parse_opt_result describing the parse result.
   If the parsing fails the RES does not change.  */

static enum aarch_parse_opt_result
aarch64_parse_tune (const char *to_parse, const struct processor **res)
{
  const struct processor *cpu;

  /* Loop through the list of supported CPUs to find a match.  */
  for (cpu = all_cores; cpu->name != NULL; cpu++)
    {
      if (strcmp (cpu->name, to_parse) == 0)
        {
          *res = cpu;
          return AARCH_PARSE_OK;
        }
    }

  /* CPU name not found in list.  */
  return AARCH_PARSE_INVALID_ARG;
}
/* Parse TOKEN, which has length LENGTH to see if it is an option
   described in FLAG.  If it is, return the index bit for that fusion type.
   If not, error (printing OPTION_NAME) and return zero.  */

static unsigned int
aarch64_parse_one_option_token (const char *token,
                                size_t length,
                                const struct aarch64_flag_desc *flag,
                                const char *option_name)
{
  for (; flag->name != NULL; flag++)
    {
      if (length == strlen (flag->name)
          && !strncmp (flag->name, token, length))
        return flag->flag;
    }

  error ("unknown flag passed in %<-moverride=%s%> (%s)", option_name, token);
  return 0;
}
18086 /* Parse OPTION which is a comma-separated list of flags to enable.
18087 FLAGS gives the list of flags we understand, INITIAL_STATE gives any
18088 default state we inherit from the CPU tuning structures. OPTION_NAME
18089 gives the top-level option we are parsing in the -moverride string,
18090 for use in error messages. */
18092 static unsigned int
18093 aarch64_parse_boolean_options (const char *option
,
18094 const struct aarch64_flag_desc
*flags
,
18095 unsigned int initial_state
,
18096 const char *option_name
)
18098 const char separator
= '.';
18099 const char* specs
= option
;
18100 const char* ntoken
= option
;
18101 unsigned int found_flags
= initial_state
;
18103 while ((ntoken
= strchr (specs
, separator
)))
18105 size_t token_length
= ntoken
- specs
;
18106 unsigned token_ops
= aarch64_parse_one_option_token (specs
,
18110 /* If we find "none" (or, for simplicity's sake, an error) anywhere
18111 in the token stream, reset the supported operations. So:
18113 adrp+add.cmp+branch.none.adrp+add
18115 would have the result of turning on only adrp+add fusion. */
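/* As an illustration (the exact flag names come from aarch64_fusible_pairs):
   a command line such as -moverride=fuse=adrp+add.cmp+branch would accept
   "adrp+add" and "cmp+branch" as two '.'-separated tokens and OR their
   index bits into found_flags.  */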
18119 found_flags
|= token_ops
;
18123 /* We ended with a comma, print something. */
18126 error ("%qs string ill-formed", option_name
);
18130 /* We still have one more token to parse. */
18131 size_t token_length
= strlen (specs
);
18132 unsigned token_ops
= aarch64_parse_one_option_token (specs
,
18139 found_flags
|= token_ops
;
18140 return found_flags
;
18143 /* Support for overriding instruction fusion. */
18146 aarch64_parse_fuse_string (const char *fuse_string
,
18147 struct tune_params
*tune
)
18149 tune
->fusible_ops
= aarch64_parse_boolean_options (fuse_string
,
18150 aarch64_fusible_pairs
,
18155 /* Support for overriding other tuning flags. */
18158 aarch64_parse_tune_string (const char *tune_string
,
18159 struct tune_params
*tune
)
18161 tune
->extra_tuning_flags
18162 = aarch64_parse_boolean_options (tune_string
,
18163 aarch64_tuning_flags
,
18164 tune
->extra_tuning_flags
,
18168 /* Parse the sve_width tuning moverride string in TUNE_STRING.
18169 Accept the valid SVE vector widths allowed by
18170 aarch64_sve_vector_bits_enum and use it to override sve_width
18174 aarch64_parse_sve_width_string (const char *tune_string
,
18175 struct tune_params
*tune
)
18179 int n
= sscanf (tune_string
, "%d", &width
);
18182 error ("invalid format for %<sve_width%>");
18194 error ("invalid %<sve_width%> value: %d", width
);
18196 tune
->sve_width
= (enum aarch64_sve_vector_bits_enum
) width
;
18199 /* Parse TOKEN, which has length LENGTH to see if it is a tuning option
18200 we understand. If it is, extract the option string and hand it off to
18201 the appropriate function. */
18204 aarch64_parse_one_override_token (const char* token
,
18206 struct tune_params
*tune
)
18208 const struct aarch64_tuning_override_function
*fn
18209 = aarch64_tuning_override_functions
;
18211 const char *option_part
= strchr (token
, '=');
18214 error ("tuning string missing in option (%s)", token
);
18218 /* Get the length of the option name. */
18219 length
= option_part
- token
;
18220 /* Skip the '=' to get to the option string. */
18223 for (; fn
->name
!= NULL
; fn
++)
18225 if (!strncmp (fn
->name
, token
, length
))
18227 fn
->parse_override (option_part
, tune
);
18232 error ("unknown tuning option (%s)",token
);
18236 /* A checking mechanism for the implementation of the tls size. */
18239 initialize_aarch64_tls_size (struct gcc_options
*opts
)
18241 if (aarch64_tls_size
== 0)
18242 aarch64_tls_size
= 24;
18244 switch (opts
->x_aarch64_cmodel_var
)
18246 case AARCH64_CMODEL_TINY
:
18247 /* Both the default and maximum TLS size allowed under tiny is 1M which
18248 needs two instructions to address, so we clamp the size to 24. */
18249 if (aarch64_tls_size
> 24)
18250 aarch64_tls_size
= 24;
18252 case AARCH64_CMODEL_SMALL
:
18253 /* The maximum TLS size allowed under small is 4G. */
18254 if (aarch64_tls_size
> 32)
18255 aarch64_tls_size
= 32;
18257 case AARCH64_CMODEL_LARGE
:
18258 /* The maximum TLS size allowed under large is 16E.
18259 FIXME: 16E should be 64bit, we only support 48bit offset now. */
18260 if (aarch64_tls_size
> 48)
18261 aarch64_tls_size
= 48;
18264 gcc_unreachable ();
/* Return the CPU corresponding to the enum CPU.  */

static const struct processor *
aarch64_get_tune_cpu (enum aarch64_processor cpu)
{
  gcc_assert (cpu != aarch64_none);

  return &all_cores[cpu];
}

/* Return the architecture corresponding to the enum ARCH.  */

static const struct processor *
aarch64_get_arch (enum aarch64_arch arch)
{
  gcc_assert (arch != aarch64_no_arch);

  return &all_architectures[arch];
}
/* Parse STRING looking for options in the format:
     string    :: option:string
     option    :: name=substring
     name      :: {a-z}
     substring :: defined by option.  */
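/* For example (illustrative; the option names correspond to entries in
   aarch64_tuning_override_functions): -moverride=fuse=adrp+add:sve_width=256
   would be split at ':' into two name=substring options, handed to
   aarch64_parse_fuse_string and aarch64_parse_sve_width_string
   respectively.  */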
18297 aarch64_parse_override_string (const char* input_string
,
18298 struct tune_params
* tune
)
18300 const char separator
= ':';
18301 size_t string_length
= strlen (input_string
) + 1;
18302 char *string_root
= (char *) xmalloc (sizeof (*string_root
) * string_length
);
18303 char *string
= string_root
;
18304 strncpy (string
, input_string
, string_length
);
18305 string
[string_length
- 1] = '\0';
18307 char* ntoken
= string
;
18309 while ((ntoken
= strchr (string
, separator
)))
18311 size_t token_length
= ntoken
- string
;
18312 /* Make this substring look like a string. */
18314 aarch64_parse_one_override_token (string
, token_length
, tune
);
18318 /* One last option to parse. */
18319 aarch64_parse_one_override_token (string
, strlen (string
), tune
);
18320 free (string_root
);
18323 /* Adjust CURRENT_TUNE (a generic tuning struct) with settings that
18324 are best for a generic target with the currently-enabled architecture
18327 aarch64_adjust_generic_arch_tuning (struct tune_params
¤t_tune
)
18329 /* Neoverse V1 is the only core that is known to benefit from
18330 AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS. There is therefore no
18331 point enabling it for SVE2 and above. */
18333 current_tune
.extra_tuning_flags
18334 &= ~AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS
;
18338 aarch64_override_options_after_change_1 (struct gcc_options
*opts
)
18340 /* PR 70044: We have to be careful about being called multiple times for the
18341 same function. This means all changes should be repeatable. */
18343 /* Set aarch64_use_frame_pointer based on -fno-omit-frame-pointer.
18344 Disable the frame pointer flag so the mid-end will not use a frame
18345 pointer in leaf functions in order to support -fomit-leaf-frame-pointer.
18346 Set x_flag_omit_frame_pointer to the special value 2 to differentiate
18347 between -fomit-frame-pointer (1) and -fno-omit-frame-pointer (2). */
18348 aarch64_use_frame_pointer
= opts
->x_flag_omit_frame_pointer
!= 1;
18349 if (opts
->x_flag_omit_frame_pointer
== 0)
18350 opts
->x_flag_omit_frame_pointer
= 2;
18352 /* If not optimizing for size, set the default
18353 alignment to what the target wants. */
18354 if (!opts
->x_optimize_size
)
18356 if (opts
->x_flag_align_loops
&& !opts
->x_str_align_loops
)
18357 opts
->x_str_align_loops
= aarch64_tune_params
.loop_align
;
18358 if (opts
->x_flag_align_jumps
&& !opts
->x_str_align_jumps
)
18359 opts
->x_str_align_jumps
= aarch64_tune_params
.jump_align
;
18360 if (opts
->x_flag_align_functions
&& !opts
->x_str_align_functions
)
18361 opts
->x_str_align_functions
= aarch64_tune_params
.function_align
;
18364 /* We default to no pc-relative literal loads. */
18366 aarch64_pcrelative_literal_loads
= false;
18368 /* If -mpc-relative-literal-loads is set on the command line, this
18369 implies that the user asked for PC relative literal loads. */
18370 if (opts
->x_pcrelative_literal_loads
== 1)
18371 aarch64_pcrelative_literal_loads
= true;
18373 /* In the tiny memory model it makes no sense to disallow PC relative
18374 literal pool loads. */
18375 if (aarch64_cmodel
== AARCH64_CMODEL_TINY
18376 || aarch64_cmodel
== AARCH64_CMODEL_TINY_PIC
)
18377 aarch64_pcrelative_literal_loads
= true;
18379 /* When enabling the lower precision Newton series for the square root, also
18380 enable it for the reciprocal square root, since the latter is an
18381 intermediary step for the former. */
18382 if (flag_mlow_precision_sqrt
)
18383 flag_mrecip_low_precision_sqrt
= true;
18386 /* 'Unpack' up the internal tuning structs and update the options
18387 in OPTS. The caller must have set up selected_tune and selected_arch
18388 as all the other target-specific codegen decisions are
18389 derived from them. */
18392 aarch64_override_options_internal (struct gcc_options
*opts
)
18394 const struct processor
*tune
= aarch64_get_tune_cpu (opts
->x_selected_tune
);
18395 aarch64_tune
= tune
->sched_core
;
18396 /* Make a copy of the tuning parameters attached to the core, which
18397 we may later overwrite. */
18398 aarch64_tune_params
= *(tune
->tune
);
18399 if (tune
->tune
== &generic_tunings
)
18400 aarch64_adjust_generic_arch_tuning (aarch64_tune_params
);
18402 if (opts
->x_aarch64_override_tune_string
)
18403 aarch64_parse_override_string (opts
->x_aarch64_override_tune_string
,
18404 &aarch64_tune_params
);
18406 if (opts
->x_aarch64_ldp_policy_param
)
18407 aarch64_tune_params
.ldp_policy_model
= opts
->x_aarch64_ldp_policy_param
;
18409 if (opts
->x_aarch64_stp_policy_param
)
18410 aarch64_tune_params
.stp_policy_model
= opts
->x_aarch64_stp_policy_param
;
18412 /* This target defaults to strict volatile bitfields. */
18413 if (opts
->x_flag_strict_volatile_bitfields
< 0 && abi_version_at_least (2))
18414 opts
->x_flag_strict_volatile_bitfields
= 1;
18416 if (aarch64_stack_protector_guard
== SSP_GLOBAL
18417 && opts
->x_aarch64_stack_protector_guard_offset_str
)
18419 error ("incompatible options %<-mstack-protector-guard=global%> and "
18420 "%<-mstack-protector-guard-offset=%s%>",
18421 aarch64_stack_protector_guard_offset_str
);
18424 if (aarch64_stack_protector_guard
== SSP_SYSREG
18425 && !(opts
->x_aarch64_stack_protector_guard_offset_str
18426 && opts
->x_aarch64_stack_protector_guard_reg_str
))
18428 error ("both %<-mstack-protector-guard-offset%> and "
18429 "%<-mstack-protector-guard-reg%> must be used "
18430 "with %<-mstack-protector-guard=sysreg%>");
18433 if (opts
->x_aarch64_stack_protector_guard_reg_str
)
18435 if (strlen (opts
->x_aarch64_stack_protector_guard_reg_str
) > 100)
18436 error ("specify a system register with a small string length");
18439 if (opts
->x_aarch64_stack_protector_guard_offset_str
)
18442 const char *str
= aarch64_stack_protector_guard_offset_str
;
18444 long offs
= strtol (aarch64_stack_protector_guard_offset_str
, &end
, 0);
18445 if (!*str
|| *end
|| errno
)
18446 error ("%qs is not a valid offset in %qs", str
,
18447 "-mstack-protector-guard-offset=");
18448 aarch64_stack_protector_guard_offset
= offs
;
18451 if ((flag_sanitize
& SANITIZE_SHADOW_CALL_STACK
)
18452 && !fixed_regs
[R18_REGNUM
])
18453 error ("%<-fsanitize=shadow-call-stack%> requires %<-ffixed-x18%>");
18455 aarch64_feature_flags isa_flags
= aarch64_get_isa_flags (opts
);
18456 if ((isa_flags
& (AARCH64_FL_SM_ON
| AARCH64_FL_ZA_ON
))
18457 && !(isa_flags
& AARCH64_FL_SME
))
18459 if (isa_flags
& AARCH64_FL_SM_ON
)
18460 error ("streaming functions require the ISA extension %qs", "sme");
18462 error ("functions with SME state require the ISA extension %qs",
18464 inform (input_location
, "you can enable %qs using the command-line"
18465 " option %<-march%>, or by using the %<target%>"
18466 " attribute or pragma", "sme");
18467 opts
->x_target_flags
&= ~MASK_GENERAL_REGS_ONLY
;
18468 auto new_flags
= isa_flags
| feature_deps::SME ().enable
;
18469 aarch64_set_asm_isa_flags (opts
, new_flags
);
18472 initialize_aarch64_code_model (opts
);
18473 initialize_aarch64_tls_size (opts
);
18474 aarch64_tpidr_register
= opts
->x_aarch64_tpidr_reg
;
18476 int queue_depth
= 0;
18477 switch (aarch64_tune_params
.autoprefetcher_model
)
18479 case tune_params::AUTOPREFETCHER_OFF
:
18482 case tune_params::AUTOPREFETCHER_WEAK
:
18485 case tune_params::AUTOPREFETCHER_STRONG
:
18486 queue_depth
= max_insn_queue_index
+ 1;
18489 gcc_unreachable ();
18492 /* We don't mind passing in global_options_set here as we don't use
18493 the *options_set structs anyway. */
18494 SET_OPTION_IF_UNSET (opts
, &global_options_set
,
18495 param_sched_autopref_queue_depth
, queue_depth
);
18497 /* Set up parameters to be used in prefetching algorithm. Do not
18498 override the defaults unless we are tuning for a core we have
18499 researched values for. */
18500 if (aarch64_tune_params
.prefetch
->num_slots
> 0)
18501 SET_OPTION_IF_UNSET (opts
, &global_options_set
,
18502 param_simultaneous_prefetches
,
18503 aarch64_tune_params
.prefetch
->num_slots
);
18504 if (aarch64_tune_params
.prefetch
->l1_cache_size
>= 0)
18505 SET_OPTION_IF_UNSET (opts
, &global_options_set
,
18506 param_l1_cache_size
,
18507 aarch64_tune_params
.prefetch
->l1_cache_size
);
18508 if (aarch64_tune_params
.prefetch
->l1_cache_line_size
>= 0)
18509 SET_OPTION_IF_UNSET (opts
, &global_options_set
,
18510 param_l1_cache_line_size
,
18511 aarch64_tune_params
.prefetch
->l1_cache_line_size
);
18513 if (aarch64_tune_params
.prefetch
->l1_cache_line_size
>= 0)
18515 SET_OPTION_IF_UNSET (opts
, &global_options_set
,
18516 param_destruct_interfere_size
,
18517 aarch64_tune_params
.prefetch
->l1_cache_line_size
);
18518 SET_OPTION_IF_UNSET (opts
, &global_options_set
,
18519 param_construct_interfere_size
,
18520 aarch64_tune_params
.prefetch
->l1_cache_line_size
);
18524 /* For a generic AArch64 target, cover the current range of cache line
18526 SET_OPTION_IF_UNSET (opts
, &global_options_set
,
18527 param_destruct_interfere_size
,
18529 SET_OPTION_IF_UNSET (opts
, &global_options_set
,
18530 param_construct_interfere_size
,
18534 if (aarch64_tune_params
.prefetch
->l2_cache_size
>= 0)
18535 SET_OPTION_IF_UNSET (opts
, &global_options_set
,
18536 param_l2_cache_size
,
18537 aarch64_tune_params
.prefetch
->l2_cache_size
);
18538 if (!aarch64_tune_params
.prefetch
->prefetch_dynamic_strides
)
18539 SET_OPTION_IF_UNSET (opts
, &global_options_set
,
18540 param_prefetch_dynamic_strides
, 0);
18541 if (aarch64_tune_params
.prefetch
->minimum_stride
>= 0)
18542 SET_OPTION_IF_UNSET (opts
, &global_options_set
,
18543 param_prefetch_minimum_stride
,
18544 aarch64_tune_params
.prefetch
->minimum_stride
);
18546 /* Use the alternative scheduling-pressure algorithm by default. */
18547 SET_OPTION_IF_UNSET (opts
, &global_options_set
,
18548 param_sched_pressure_algorithm
,
18549 SCHED_PRESSURE_MODEL
);
18551 /* Validate the guard size. */
18552 int guard_size
= param_stack_clash_protection_guard_size
;
18554 if (guard_size
!= 12 && guard_size
!= 16)
18555 error ("only values 12 (4 KB) and 16 (64 KB) are supported for guard "
18556 "size. Given value %d (%llu KB) is out of range",
18557 guard_size
, (1ULL << guard_size
) / 1024ULL);
18559 /* Enforce that interval is the same size as size so the mid-end does the
18561 SET_OPTION_IF_UNSET (opts
, &global_options_set
,
18562 param_stack_clash_protection_probe_interval
,
18565 /* The maybe_set calls won't update the value if the user has explicitly set
18566 one. Which means we need to validate that probing interval and guard size
18569 = param_stack_clash_protection_probe_interval
;
18570 if (guard_size
!= probe_interval
)
18571 error ("stack clash guard size %<%d%> must be equal to probing interval "
18572 "%<%d%>", guard_size
, probe_interval
);
18574 /* Enable sw prefetching at specified optimization level for
18575 CPUS that have prefetch. Lower optimization level threshold by 1
18576 when profiling is enabled. */
18577 if (opts
->x_flag_prefetch_loop_arrays
< 0
18578 && !opts
->x_optimize_size
18579 && aarch64_tune_params
.prefetch
->default_opt_level
>= 0
18580 && opts
->x_optimize
>= aarch64_tune_params
.prefetch
->default_opt_level
)
18581 opts
->x_flag_prefetch_loop_arrays
= 1;
18583 /* Avoid loop-dependent FMA chains. */
18584 if (aarch64_tune_params
.extra_tuning_flags
18585 & AARCH64_EXTRA_TUNE_AVOID_CROSS_LOOP_FMA
)
18586 SET_OPTION_IF_UNSET (opts
, &global_options_set
, param_avoid_fma_max_bits
,
18589 /* Consider fully pipelined FMA in reassociation. */
18590 if (aarch64_tune_params
.extra_tuning_flags
18591 & AARCH64_EXTRA_TUNE_FULLY_PIPELINED_FMA
)
18592 SET_OPTION_IF_UNSET (opts
, &global_options_set
, param_fully_pipelined_fma
,
18595 aarch64_override_options_after_change_1 (opts
);
18598 /* Print a hint with a suggestion for a core or architecture name that
18599 most closely resembles what the user passed in STR. ARCH is true if
18600 the user is asking for an architecture name. ARCH is false if the user
18601 is asking for a core name. */
18604 aarch64_print_hint_for_core_or_arch (const char *str
, bool arch
)
18606 auto_vec
<const char *> candidates
;
18607 const struct processor
*entry
= arch
? all_architectures
: all_cores
;
18608 for (; entry
->name
!= NULL
; entry
++)
18609 candidates
.safe_push (entry
->name
);
18611 #ifdef HAVE_LOCAL_CPU_DETECT
18612 /* Add also "native" as possible value. */
18614 candidates
.safe_push ("native");
18618 const char *hint
= candidates_list_and_hint (str
, s
, candidates
);
18620 inform (input_location
, "valid arguments are: %s;"
18621 " did you mean %qs?", s
, hint
);
18623 inform (input_location
, "valid arguments are: %s", s
);
/* Print a hint with a suggestion for a core name that most closely resembles
   what the user passed in STR.  */

inline static void
aarch64_print_hint_for_core (const char *str)
{
  aarch64_print_hint_for_core_or_arch (str, false);
}

/* Print a hint with a suggestion for an architecture name that most closely
   resembles what the user passed in STR.  */

inline static void
aarch64_print_hint_for_arch (const char *str)
{
  aarch64_print_hint_for_core_or_arch (str, true);
}
18647 /* Print a hint with a suggestion for an extension name
18648 that most closely resembles what the user passed in STR. */
18651 aarch64_print_hint_for_extensions (const std::string
&str
)
18653 auto_vec
<const char *> candidates
;
18654 aarch64_get_all_extension_candidates (&candidates
);
18656 const char *hint
= candidates_list_and_hint (str
.c_str (), s
, candidates
);
18658 inform (input_location
, "valid arguments are: %s;"
18659 " did you mean %qs?", s
, hint
);
18661 inform (input_location
, "valid arguments are: %s", s
);
18666 /* Validate a command-line -mcpu option. Parse the cpu and extensions (if any)
18667 specified in STR and throw errors if appropriate. Put the results if
18668 they are valid in RES and ISA_FLAGS. Return whether the option is
18672 aarch64_validate_mcpu (const char *str
, const struct processor
**res
,
18673 aarch64_feature_flags
*isa_flags
)
18675 std::string invalid_extension
;
18676 enum aarch_parse_opt_result parse_res
18677 = aarch64_parse_cpu (str
, res
, isa_flags
, &invalid_extension
);
18679 if (parse_res
== AARCH_PARSE_OK
)
18684 case AARCH_PARSE_MISSING_ARG
:
18685 error ("missing cpu name in %<-mcpu=%s%>", str
);
18687 case AARCH_PARSE_INVALID_ARG
:
18688 error ("unknown value %qs for %<-mcpu%>", str
);
18689 aarch64_print_hint_for_core (str
);
18690 /* A common user error is confusing -march and -mcpu.
18691 If the -mcpu string matches a known architecture then suggest
18693 parse_res
= aarch64_parse_arch (str
, res
, isa_flags
, &invalid_extension
);
18694 if (parse_res
== AARCH_PARSE_OK
)
18695 inform (input_location
, "did you mean %<-march=%s%>?", str
);
18697 case AARCH_PARSE_INVALID_FEATURE
:
18698 error ("invalid feature modifier %qs in %<-mcpu=%s%>",
18699 invalid_extension
.c_str (), str
);
18700 aarch64_print_hint_for_extensions (invalid_extension
);
18703 gcc_unreachable ();
/* Straight line speculation indicators.  */
enum aarch64_sls_hardening_type
{
  SLS_NONE = 0,
  SLS_RETBR = 1,
  SLS_BLR = 2,
  SLS_ALL = 3,
};
static enum aarch64_sls_hardening_type aarch64_sls_hardening;

/* Return whether we should mitigate Straight Line Speculation for the RET
   and BR instructions.  */
bool
aarch64_harden_sls_retbr_p (void)
{
  return aarch64_sls_hardening & SLS_RETBR;
}

/* Return whether we should mitigate Straight Line Speculation for the BLR
   instruction.  */
bool
aarch64_harden_sls_blr_p (void)
{
  return aarch64_sls_hardening & SLS_BLR;
}
18735 /* As of yet we only allow setting these options globally, in the future we may
18736 allow setting them per function. */
18738 aarch64_validate_sls_mitigation (const char *const_str
)
18740 char *token_save
= NULL
;
18743 if (strcmp (const_str
, "none") == 0)
18745 aarch64_sls_hardening
= SLS_NONE
;
18748 if (strcmp (const_str
, "all") == 0)
18750 aarch64_sls_hardening
= SLS_ALL
;
18754 char *str_root
= xstrdup (const_str
);
18755 str
= strtok_r (str_root
, ",", &token_save
);
18757 error ("invalid argument given to %<-mharden-sls=%>");
18759 int temp
= SLS_NONE
;
18762 if (strcmp (str
, "blr") == 0)
18764 else if (strcmp (str
, "retbr") == 0)
18766 else if (strcmp (str
, "none") == 0 || strcmp (str
, "all") == 0)
18768 error ("%qs must be by itself for %<-mharden-sls=%>", str
);
18773 error ("invalid argument %<%s%> for %<-mharden-sls=%>", str
);
18776 str
= strtok_r (NULL
, ",", &token_save
);
18778 aarch64_sls_hardening
= (aarch64_sls_hardening_type
) temp
;
18782 /* Validate a command-line -march option. Parse the arch and extensions
18783 (if any) specified in STR and throw errors if appropriate. Put the
18784 results, if they are valid, in RES and ISA_FLAGS. Return whether the
18785 option is valid. */
18788 aarch64_validate_march (const char *str
, const struct processor
**res
,
18789 aarch64_feature_flags
*isa_flags
)
18791 std::string invalid_extension
;
18792 enum aarch_parse_opt_result parse_res
18793 = aarch64_parse_arch (str
, res
, isa_flags
, &invalid_extension
);
18795 if (parse_res
== AARCH_PARSE_OK
)
18800 case AARCH_PARSE_MISSING_ARG
:
18801 error ("missing arch name in %<-march=%s%>", str
);
18803 case AARCH_PARSE_INVALID_ARG
:
18804 error ("unknown value %qs for %<-march%>", str
);
18805 aarch64_print_hint_for_arch (str
);
18806 /* A common user error is confusing -march and -mcpu.
18807 If the -march string matches a known CPU suggest -mcpu. */
18808 parse_res
= aarch64_parse_cpu (str
, res
, isa_flags
, &invalid_extension
);
18809 if (parse_res
== AARCH_PARSE_OK
)
18810 inform (input_location
, "did you mean %<-mcpu=%s%>?", str
);
18812 case AARCH_PARSE_INVALID_FEATURE
:
18813 error ("invalid feature modifier %qs in %<-march=%s%>",
18814 invalid_extension
.c_str (), str
);
18815 aarch64_print_hint_for_extensions (invalid_extension
);
18818 gcc_unreachable ();
18824 /* Validate a command-line -mtune option. Parse the cpu
18825 specified in STR and throw errors if appropriate. Put the
18826 result, if it is valid, in RES. Return whether the option is
18830 aarch64_validate_mtune (const char *str
, const struct processor
**res
)
18832 enum aarch_parse_opt_result parse_res
18833 = aarch64_parse_tune (str
, res
);
18835 if (parse_res
== AARCH_PARSE_OK
)
18840 case AARCH_PARSE_MISSING_ARG
:
18841 error ("missing cpu name in %<-mtune=%s%>", str
);
18843 case AARCH_PARSE_INVALID_ARG
:
18844 error ("unknown value %qs for %<-mtune%>", str
);
18845 aarch64_print_hint_for_core (str
);
18848 gcc_unreachable ();
/* Return the VG value associated with -msve-vector-bits= value VALUE.  */

static poly_uint16
aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits_enum value)
{
  /* 128-bit SVE and Advanced SIMD modes use different register layouts
     on big-endian targets, so we would need to forbid subregs that convert
     from one to the other.  By default a reinterpret sequence would then
     involve a store to memory in one mode and a load back in the other.
     Even if we optimize that sequence using reverse instructions,
     it would still be a significant potential overhead.

     For now, it seems better to generate length-agnostic code for that
     case instead.  */
  if (value == SVE_SCALABLE
      || (value == SVE_128 && BYTES_BIG_ENDIAN))
    return poly_uint16 (2, 2);

  return (int) value / 64;
}
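/* For example, -msve-vector-bits=256 gives (int) SVE_256 / 64 = 4, i.e.
   VG = 4, since VG counts the 64-bit granules in an SVE vector;
   SVE_SCALABLE (and 128-bit big-endian) instead yields the
   poly_uint16 (2, 2) used for length-agnostic code.  */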
/* Set the global aarch64_asm_isa_flags to FLAGS and update
   aarch64_isa_flags accordingly.  */

void
aarch64_set_asm_isa_flags (aarch64_feature_flags flags)
{
  aarch64_set_asm_isa_flags (&global_options, flags);
}

static void
aarch64_handle_no_branch_protection (void)
{
  aarch_ra_sign_scope = AARCH_FUNCTION_NONE;
  aarch_enable_bti = 0;
}

static void
aarch64_handle_standard_branch_protection (void)
{
  aarch_ra_sign_scope = AARCH_FUNCTION_NON_LEAF;
  aarch64_ra_sign_key = AARCH64_KEY_A;
  aarch_enable_bti = 1;
}

static void
aarch64_handle_pac_ret_protection (void)
{
  aarch_ra_sign_scope = AARCH_FUNCTION_NON_LEAF;
  aarch64_ra_sign_key = AARCH64_KEY_A;
}

static void
aarch64_handle_pac_ret_leaf (void)
{
  aarch_ra_sign_scope = AARCH_FUNCTION_ALL;
}

static void
aarch64_handle_pac_ret_b_key (void)
{
  aarch64_ra_sign_key = AARCH64_KEY_B;
}

static void
aarch64_handle_bti_protection (void)
{
  aarch_enable_bti = 1;
}
static const struct aarch_branch_protect_type aarch64_pac_ret_subtypes[] = {
  { "leaf", false, aarch64_handle_pac_ret_leaf, NULL, 0 },
  { "b-key", false, aarch64_handle_pac_ret_b_key, NULL, 0 },
  { NULL, false, NULL, NULL, 0 }
};

static const struct aarch_branch_protect_type aarch64_branch_protect_types[] =
{
  { "none", true, aarch64_handle_no_branch_protection, NULL, 0 },
  { "standard", true, aarch64_handle_standard_branch_protection, NULL, 0 },
  { "pac-ret", false, aarch64_handle_pac_ret_protection,
    aarch64_pac_ret_subtypes, ARRAY_SIZE (aarch64_pac_ret_subtypes) },
  { "bti", false, aarch64_handle_bti_protection, NULL, 0 },
  { NULL, false, NULL, NULL, 0 }
};
18939 /* Implement TARGET_OPTION_OVERRIDE. This is called once in the beginning
18940 and is used to parse the -m{cpu,tune,arch} strings and setup the initial
18941 tuning structs. In particular it must set selected_tune and
18942 aarch64_asm_isa_flags that define the available ISA features and tuning
18943 decisions. It must also set selected_arch as this will be used to
18944 output the .arch asm tags for each function. */
18947 aarch64_override_options (void)
18949 aarch64_feature_flags cpu_isa
= 0;
18950 aarch64_feature_flags arch_isa
= 0;
18951 aarch64_set_asm_isa_flags (0);
18953 const struct processor
*cpu
= NULL
;
18954 const struct processor
*arch
= NULL
;
18955 const struct processor
*tune
= NULL
;
18957 if (aarch64_harden_sls_string
)
18958 aarch64_validate_sls_mitigation (aarch64_harden_sls_string
);
18960 if (aarch64_branch_protection_string
)
18961 aarch_validate_mbranch_protection (aarch64_branch_protect_types
,
18962 aarch64_branch_protection_string
,
18963 "-mbranch-protection=");
18965 /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
18966 If either of -march or -mtune is given, they override their
18967 respective component of -mcpu. */
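/* For example (illustrative): -mcpu=cortex-a76 -mtune=cortex-a55 selects
   the architecture and ISA features implied by cortex-a76 but the tuning
   tables of cortex-a55, while -mcpu=cortex-a76 -march=armv8-a keeps the
   cortex-a76 tuning and takes the ISA flags from -march (subject to the
   compatibility warning below).  */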
18968 if (aarch64_cpu_string
)
18969 aarch64_validate_mcpu (aarch64_cpu_string
, &cpu
, &cpu_isa
);
18971 if (aarch64_arch_string
)
18972 aarch64_validate_march (aarch64_arch_string
, &arch
, &arch_isa
);
18974 if (aarch64_tune_string
)
18975 aarch64_validate_mtune (aarch64_tune_string
, &tune
);
18977 #ifdef SUBTARGET_OVERRIDE_OPTIONS
18978 SUBTARGET_OVERRIDE_OPTIONS
;
18983 /* If both -mcpu and -march are specified, warn if they are not
18984 feature compatible. Feature compatible means that the inclusion of the
18985 cpu features would not end up disabling an architecture feature. In
18986 other words the cpu features need to be a strict superset of the arch
18987 features; if they are not, the -march ISA flags are preferred. */
18988 auto full_arch_flags
= arch
->flags
| arch_isa
;
18989 auto full_cpu_flags
= cpu
->flags
| cpu_isa
;
18990 if (~full_cpu_flags
& full_arch_flags
)
18992 std::string ext_diff
18993 = aarch64_get_extension_string_for_isa_flags (full_arch_flags
,
18995 warning (0, "switch %<-mcpu=%s%> conflicts with %<-march=%s%> switch "
18996 "and resulted in options %<%s%> being added",
18997 aarch64_cpu_string
,
18998 aarch64_arch_string
,
18999 ext_diff
.c_str ());
19002 selected_arch
= arch
->arch
;
19003 aarch64_set_asm_isa_flags (arch_isa
| AARCH64_FL_DEFAULT_ISA_MODE
);
19007 selected_arch
= cpu
->arch
;
19008 aarch64_set_asm_isa_flags (cpu_isa
| AARCH64_FL_DEFAULT_ISA_MODE
);
19012 cpu
= &all_cores
[arch
->ident
];
19013 selected_arch
= arch
->arch
;
19014 aarch64_set_asm_isa_flags (arch_isa
| AARCH64_FL_DEFAULT_ISA_MODE
);
19018 /* No -mcpu or -march specified, so use the default CPU. */
19019 cpu
= &all_cores
[TARGET_CPU_DEFAULT
];
19020 selected_arch
= cpu
->arch
;
19021 aarch64_set_asm_isa_flags (cpu
->flags
| AARCH64_FL_DEFAULT_ISA_MODE
);
19024 selected_tune
= tune
? tune
->ident
: cpu
->ident
;
19026 if (aarch_enable_bti
== 2)
19028 #ifdef TARGET_ENABLE_BTI
19029 aarch_enable_bti
= 1;
19031 aarch_enable_bti
= 0;
19035 /* Return address signing is currently not supported for ILP32 targets. For
19036 LP64 targets use the configured option in the absence of a command-line
19037 option for -mbranch-protection. */
19038 if (!TARGET_ILP32
&& aarch64_branch_protection_string
== NULL
)
19040 #ifdef TARGET_ENABLE_PAC_RET
19041 aarch_ra_sign_scope
= AARCH_FUNCTION_NON_LEAF
;
19043 aarch_ra_sign_scope
= AARCH_FUNCTION_NONE
;
19047 #ifndef HAVE_AS_MABI_OPTION
19048 /* The compiler may have been configured with 2.23.* binutils, which does
19049 not have support for ILP32. */
19051 error ("assembler does not support %<-mabi=ilp32%>");
19054 /* Convert -msve-vector-bits to a VG count. */
19055 aarch64_sve_vg
= aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits
);
19057 if (aarch_ra_sign_scope
!= AARCH_FUNCTION_NONE
&& TARGET_ILP32
)
19058 sorry ("return address signing is only supported for %<-mabi=lp64%>");
19060 /* The pass to insert speculation tracking runs before
19061 shrink-wrapping and the latter does not know how to update the
19062 tracking status. So disable it in this case. */
19063 if (aarch64_track_speculation
)
19064 flag_shrink_wrap
= 0;
19066 aarch64_override_options_internal (&global_options
);
19068 /* Save these options as the default ones in case we push and pop them later
19069 while processing functions with potential target attributes. */
19070 target_option_default_node
= target_option_current_node
19071 = build_target_option_node (&global_options
, &global_options_set
);
19074 /* Implement targetm.override_options_after_change. */
19077 aarch64_override_options_after_change (void)
19079 aarch64_override_options_after_change_1 (&global_options
);
19082 /* Implement the TARGET_OFFLOAD_OPTIONS hook. */
19084 aarch64_offload_options (void)
19087 return xstrdup ("-foffload-abi=ilp32 -foffload-abi-host-opts=-mabi=ilp32");
19089 return xstrdup ("-foffload-abi=lp64 -foffload-abi-host-opts=-mabi=lp64");
19092 static struct machine_function
*
19093 aarch64_init_machine_status (void)
19095 struct machine_function
*machine
;
19096 machine
= ggc_cleared_alloc
<machine_function
> ();
19101 aarch64_init_expanders (void)
19103 init_machine_status
= aarch64_init_machine_status
;
19106 /* A checking mechanism for the implementation of the various code models. */
19108 initialize_aarch64_code_model (struct gcc_options
*opts
)
19110 aarch64_cmodel
= opts
->x_aarch64_cmodel_var
;
19111 switch (opts
->x_aarch64_cmodel_var
)
19113 case AARCH64_CMODEL_TINY
:
19114 if (opts
->x_flag_pic
)
19115 aarch64_cmodel
= AARCH64_CMODEL_TINY_PIC
;
19117 case AARCH64_CMODEL_SMALL
:
19118 if (opts
->x_flag_pic
)
19120 #ifdef HAVE_AS_SMALL_PIC_RELOCS
19121 aarch64_cmodel
= (flag_pic
== 2
19122 ? AARCH64_CMODEL_SMALL_PIC
19123 : AARCH64_CMODEL_SMALL_SPIC
);
19125 aarch64_cmodel
= AARCH64_CMODEL_SMALL_PIC
;
19129 case AARCH64_CMODEL_LARGE
:
19130 if (opts
->x_flag_pic
)
19131 sorry ("code model %qs with %<-f%s%>", "large",
19132 opts
->x_flag_pic
> 1 ? "PIC" : "pic");
19133 if (opts
->x_aarch64_abi
== AARCH64_ABI_ILP32
)
19134 sorry ("code model %qs not supported in ilp32 mode", "large");
19136 case AARCH64_CMODEL_TINY_PIC
:
19137 case AARCH64_CMODEL_SMALL_PIC
:
19138 case AARCH64_CMODEL_SMALL_SPIC
:
19139 gcc_unreachable ();
19143 /* Implements TARGET_OPTION_RESTORE. Restore the backend codegen decisions
19144 using the information saved in PTR. */
19147 aarch64_option_restore (struct gcc_options
*opts
,
19148 struct gcc_options
* /* opts_set */,
19149 struct cl_target_option
* /* ptr */)
19151 aarch64_override_options_internal (opts
);
19154 /* Implement TARGET_OPTION_PRINT. */
19157 aarch64_option_print (FILE *file
, int indent
, struct cl_target_option
*ptr
)
19159 const struct processor
*cpu
19160 = aarch64_get_tune_cpu (ptr
->x_selected_tune
);
19161 const struct processor
*arch
= aarch64_get_arch (ptr
->x_selected_arch
);
19162 aarch64_feature_flags isa_flags
= aarch64_get_asm_isa_flags(ptr
);
19163 std::string extension
19164 = aarch64_get_extension_string_for_isa_flags (isa_flags
, arch
->flags
);
19166 fprintf (file
, "%*sselected tune = %s\n", indent
, "", cpu
->name
);
19167 fprintf (file
, "%*sselected arch = %s%s\n", indent
, "",
19168 arch
->name
, extension
.c_str ());
19171 static GTY(()) tree aarch64_previous_fndecl
;
19174 aarch64_reset_previous_fndecl (void)
19176 aarch64_previous_fndecl
= NULL
;
19179 /* Restore or save the TREE_TARGET_GLOBALS from or to NEW_TREE.
19180 Used by aarch64_set_current_function and aarch64_pragma_target_parse to
19181 make sure optab availability predicates are recomputed when necessary. */
19184 aarch64_save_restore_target_globals (tree new_tree
)
19186 if (TREE_TARGET_GLOBALS (new_tree
))
19187 restore_target_globals (TREE_TARGET_GLOBALS (new_tree
));
19188 else if (new_tree
== target_option_default_node
)
19189 restore_target_globals (&default_target_globals
);
19191 TREE_TARGET_GLOBALS (new_tree
) = save_target_globals_default_opts ();
19194 /* Return the target_option_node for FNDECL, or the current options
19195 if FNDECL is null. */
19198 aarch64_fndecl_options (tree fndecl
)
19201 return target_option_current_node
;
19203 if (tree options
= DECL_FUNCTION_SPECIFIC_TARGET (fndecl
))
19206 return target_option_default_node
;
/* Implement TARGET_SET_CURRENT_FUNCTION.  Unpack the codegen decisions
   like tuning and ISA features from the DECL_FUNCTION_SPECIFIC_TARGET
   of the function, if such exists.  This function may be called multiple
   times on a single function so use aarch64_previous_fndecl to avoid
   setting up identical state.  */

static void
aarch64_set_current_function (tree fndecl)
{
  tree old_tree = aarch64_fndecl_options (aarch64_previous_fndecl);
  tree new_tree = aarch64_fndecl_options (fndecl);

  auto new_isa_mode = (fndecl
		       ? aarch64_fndecl_isa_mode (fndecl)
		       : AARCH64_DEFAULT_ISA_MODE);
  auto isa_flags = aarch64_get_isa_flags (TREE_TARGET_OPTION (new_tree));

  static bool reported_zt0_p;
  if (!reported_zt0_p
      && !(isa_flags & AARCH64_FL_SME2)
      && fndecl
      && aarch64_fndecl_has_state (fndecl, "zt0"))
    {
      error ("functions with %qs state require the ISA extension %qs",
	     "zt0", "sme2");
      inform (input_location, "you can enable %qs using the command-line"
	      " option %<-march%>, or by using the %<target%>"
	      " attribute or pragma", "sme2");
      reported_zt0_p = true;
    }

  /* If nothing to do, return.  #pragma GCC reset or #pragma GCC pop to
     the default have been handled by aarch64_save_restore_target_globals from
     aarch64_pragma_target_parse.  */
  if (old_tree == new_tree
      && (!fndecl || aarch64_previous_fndecl)
      && (isa_flags & AARCH64_FL_ISA_MODES).val[0] == new_isa_mode)
    {
      gcc_assert (AARCH64_ISA_MODE == new_isa_mode);
      return;
    }

  aarch64_previous_fndecl = fndecl;

  /* First set the target options.  */
  cl_target_option_restore (&global_options, &global_options_set,
			    TREE_TARGET_OPTION (new_tree));

  /* The ISA mode can vary based on function type attributes and
     function declaration attributes.  Make sure that the target
     options correctly reflect these attributes.  */
  if ((isa_flags & AARCH64_FL_ISA_MODES).val[0] != new_isa_mode)
    {
      auto base_flags = (aarch64_asm_isa_flags & ~AARCH64_FL_ISA_MODES);
      aarch64_set_asm_isa_flags (base_flags
				 | aarch64_feature_flags (new_isa_mode));

      aarch64_override_options_internal (&global_options);
      new_tree = build_target_option_node (&global_options,
					   &global_options_set);
      DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_tree;

      tree new_optimize = build_optimization_node (&global_options,
						    &global_options_set);
      if (new_optimize != optimization_default_node)
	DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
    }

  aarch64_save_restore_target_globals (new_tree);

  gcc_assert (AARCH64_ISA_MODE == new_isa_mode);
}
/* Enum describing the various ways we can handle attributes.
   In many cases we can reuse the generic option handling machinery.  */

enum aarch64_attr_opt_type
{
  aarch64_attr_mask,	/* Attribute should set a bit in target_flags.  */
  aarch64_attr_bool,	/* Attribute sets or unsets a boolean variable.  */
  aarch64_attr_enum,	/* Attribute sets an enum variable.  */
  aarch64_attr_custom	/* Attribute requires a custom handling function.  */
};

/* All the information needed to handle a target attribute.
   NAME is the name of the attribute.
   ATTR_TYPE specifies the type of behavior of the attribute as described
   in the definition of enum aarch64_attr_opt_type.
   ALLOW_NEG is true if the attribute supports a "no-" form.
   HANDLER is the function that takes the attribute string as an argument.
   It is needed only when the ATTR_TYPE is aarch64_attr_custom.
   OPT_NUM is the enum specifying the option that the attribute modifies.
   This is needed for attributes that mirror the behavior of a command-line
   option, that is it has ATTR_TYPE aarch64_attr_mask, aarch64_attr_bool or
   aarch64_attr_enum.  */

struct aarch64_attribute_info
{
  const char *name;
  enum aarch64_attr_opt_type attr_type;
  bool allow_neg;
  bool (*handler) (const char *);
  enum opt_code opt_num;
};
19314 /* Handle the ARCH_STR argument to the arch= target attribute. */
19317 aarch64_handle_attr_arch (const char *str
)
19319 const struct processor
*tmp_arch
= NULL
;
19320 std::string invalid_extension
;
19321 aarch64_feature_flags tmp_flags
;
19322 enum aarch_parse_opt_result parse_res
19323 = aarch64_parse_arch (str
, &tmp_arch
, &tmp_flags
, &invalid_extension
);
19325 if (parse_res
== AARCH_PARSE_OK
)
19327 gcc_assert (tmp_arch
);
19328 selected_arch
= tmp_arch
->arch
;
19329 aarch64_set_asm_isa_flags (tmp_flags
| (aarch64_asm_isa_flags
19330 & AARCH64_FL_ISA_MODES
));
19336 case AARCH_PARSE_MISSING_ARG
:
19337 error ("missing name in %<target(\"arch=\")%> pragma or attribute");
19339 case AARCH_PARSE_INVALID_ARG
:
19340 error ("invalid name %qs in %<target(\"arch=\")%> pragma or attribute", str
);
19341 aarch64_print_hint_for_arch (str
);
19343 case AARCH_PARSE_INVALID_FEATURE
:
19344 error ("invalid feature modifier %s of value %qs in "
19345 "%<target()%> pragma or attribute", invalid_extension
.c_str (), str
);
19346 aarch64_print_hint_for_extensions (invalid_extension
);
19349 gcc_unreachable ();
19355 /* Handle the argument CPU_STR to the cpu= target attribute. */
19358 aarch64_handle_attr_cpu (const char *str
)
19360 const struct processor
*tmp_cpu
= NULL
;
19361 std::string invalid_extension
;
19362 aarch64_feature_flags tmp_flags
;
19363 enum aarch_parse_opt_result parse_res
19364 = aarch64_parse_cpu (str
, &tmp_cpu
, &tmp_flags
, &invalid_extension
);
19366 if (parse_res
== AARCH_PARSE_OK
)
19368 gcc_assert (tmp_cpu
);
19369 selected_tune
= tmp_cpu
->ident
;
19370 selected_arch
= tmp_cpu
->arch
;
19371 aarch64_set_asm_isa_flags (tmp_flags
| (aarch64_asm_isa_flags
19372 & AARCH64_FL_ISA_MODES
));
19378 case AARCH_PARSE_MISSING_ARG
:
19379 error ("missing name in %<target(\"cpu=\")%> pragma or attribute");
19381 case AARCH_PARSE_INVALID_ARG
:
19382 error ("invalid name %qs in %<target(\"cpu=\")%> pragma or attribute", str
);
19383 aarch64_print_hint_for_core (str
);
19385 case AARCH_PARSE_INVALID_FEATURE
:
19386 error ("invalid feature modifier %qs of value %qs in "
19387 "%<target()%> pragma or attribute", invalid_extension
.c_str (), str
);
19388 aarch64_print_hint_for_extensions (invalid_extension
);
19391 gcc_unreachable ();
19397 /* Handle the argument STR to the branch-protection= attribute. */
19400 aarch64_handle_attr_branch_protection (const char* str
)
19402 return aarch_validate_mbranch_protection (aarch64_branch_protect_types
, str
,
19403 "target(\"branch-protection=\")");
19406 /* Handle the argument STR to the tune= target attribute. */
19409 aarch64_handle_attr_tune (const char *str
)
19411 const struct processor
*tmp_tune
= NULL
;
19412 enum aarch_parse_opt_result parse_res
19413 = aarch64_parse_tune (str
, &tmp_tune
);
19415 if (parse_res
== AARCH_PARSE_OK
)
19417 gcc_assert (tmp_tune
);
19418 selected_tune
= tmp_tune
->ident
;
19424 case AARCH_PARSE_INVALID_ARG
:
19425 error ("invalid name %qs in %<target(\"tune=\")%> pragma or attribute", str
);
19426 aarch64_print_hint_for_core (str
);
19429 gcc_unreachable ();
/* Parse an architecture extensions target attribute string specified in STR.
   For example "+fp+nosimd".  Show any errors if needed.  Return TRUE
   if successful.  Update aarch64_isa_flags to reflect the ISA features
   that were requested.  */

static bool
aarch64_handle_attr_isa_flags (char *str)
{
19443 enum aarch_parse_opt_result parse_res
;
19444 auto isa_flags
= aarch64_asm_isa_flags
;
19446 /* We allow "+nothing" in the beginning to clear out all architectural
19447 features if the user wants to handpick specific features. */
19448 if (strncmp ("+nothing", str
, 8) == 0)
19450 isa_flags
&= AARCH64_FL_ISA_MODES
;
19454 std::string invalid_extension
;
19455 parse_res
= aarch64_parse_extension (str
, &isa_flags
, &invalid_extension
);
19457 if (parse_res
== AARCH_PARSE_OK
)
19459 aarch64_set_asm_isa_flags (isa_flags
);
19465 case AARCH_PARSE_MISSING_ARG
:
19466 error ("missing value in %<target()%> pragma or attribute");
19469 case AARCH_PARSE_INVALID_FEATURE
:
19470 error ("invalid feature modifier %qs of value %qs in "
19471 "%<target()%> pragma or attribute", invalid_extension
.c_str (), str
);
19475 gcc_unreachable ();
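/* Illustrative sketch only, not part of the implementation: the strings
   accepted by the function above are the bare extension form of the target
   attribute, for example (the function names below are hypothetical)

     __attribute__ ((target ("+crc+crypto")))
     int with_crypto (int x);

     __attribute__ ((target ("+nothing+simd")))
     int simd_only (int x);

   where "+nothing" first clears the architectural features and the later
   modifiers add back exactly the ones the user wants.  */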
/* The target attributes that we support.  On top of these we also support just
   ISA extensions, like  __attribute__ ((target ("+crc"))), but that case is
   handled explicitly in aarch64_process_one_target_attr.  */

static const struct aarch64_attribute_info aarch64_attributes[] =
{
  { "general-regs-only", aarch64_attr_mask, false, NULL,
     OPT_mgeneral_regs_only },
  { "fix-cortex-a53-835769", aarch64_attr_bool, true, NULL,
     OPT_mfix_cortex_a53_835769 },
  { "fix-cortex-a53-843419", aarch64_attr_bool, true, NULL,
     OPT_mfix_cortex_a53_843419 },
  { "cmodel", aarch64_attr_enum, false, NULL, OPT_mcmodel_ },
  { "strict-align", aarch64_attr_mask, true, NULL, OPT_mstrict_align },
  { "omit-leaf-frame-pointer", aarch64_attr_bool, true, NULL,
     OPT_momit_leaf_frame_pointer },
  { "tls-dialect", aarch64_attr_enum, false, NULL, OPT_mtls_dialect_ },
  { "arch", aarch64_attr_custom, false, aarch64_handle_attr_arch,
     OPT_march_ },
  { "cpu", aarch64_attr_custom, false, aarch64_handle_attr_cpu, OPT_mcpu_ },
  { "tune", aarch64_attr_custom, false, aarch64_handle_attr_tune,
     OPT_mtune_ },
  { "branch-protection", aarch64_attr_custom, false,
     aarch64_handle_attr_branch_protection, OPT_mbranch_protection_ },
  { "sign-return-address", aarch64_attr_enum, false, NULL,
     OPT_msign_return_address_ },
  { "outline-atomics", aarch64_attr_bool, true, NULL,
     OPT_moutline_atomics },
  { NULL, aarch64_attr_custom, false, NULL, OPT____ }
};
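/* A hedged example of how the table entries above surface in user code
   (the function names are hypothetical, not taken from this file):

     __attribute__ ((target ("arch=armv8.2-a+sve")))          // aarch64_attr_custom
     void for_sve (void);

     __attribute__ ((target ("no-omit-leaf-frame-pointer")))  // uses the "no-" form
     void keep_frame_pointer (void);

     __attribute__ ((target ("cmodel=small")))                // aarch64_attr_enum
     void small_model (void);
*/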
19512 /* Parse ARG_STR which contains the definition of one target attribute.
19513 Show appropriate errors if any or return true if the attribute is valid. */
19516 aarch64_process_one_target_attr (char *arg_str
)
19518 bool invert
= false;
19520 size_t len
= strlen (arg_str
);
19524 error ("malformed %<target()%> pragma or attribute");
19528 auto_vec
<char, 32> buffer
;
19529 buffer
.safe_grow (len
+ 1);
19530 char *str_to_check
= buffer
.address ();
19531 memcpy (str_to_check
, arg_str
, len
+ 1);
  /* We have something like __attribute__ ((target ("+fp+nosimd"))).
     It is easier to detect and handle it explicitly here rather than going
     through the machinery for the rest of the target attributes in this
     function.  */
19537 if (*str_to_check
== '+')
19538 return aarch64_handle_attr_isa_flags (str_to_check
);
19540 if (len
> 3 && startswith (str_to_check
, "no-"))
19545 char *arg
= strchr (str_to_check
, '=');
19547 /* If we found opt=foo then terminate STR_TO_CHECK at the '='
19548 and point ARG to "foo". */
19554 const struct aarch64_attribute_info
*p_attr
;
19555 bool found
= false;
19556 for (p_attr
= aarch64_attributes
; p_attr
->name
; p_attr
++)
19558 /* If the names don't match up, or the user has given an argument
19559 to an attribute that doesn't accept one, or didn't give an argument
19560 to an attribute that expects one, fail to match. */
19561 if (strcmp (str_to_check
, p_attr
->name
) != 0)
19565 bool attr_need_arg_p
= p_attr
->attr_type
== aarch64_attr_custom
19566 || p_attr
->attr_type
== aarch64_attr_enum
;
19568 if (attr_need_arg_p
^ (arg
!= NULL
))
19570 error ("pragma or attribute %<target(\"%s\")%> does not accept an argument", str_to_check
);
19574 /* If the name matches but the attribute does not allow "no-" versions
19575 then we can't match. */
19576 if (invert
&& !p_attr
->allow_neg
)
19578 error ("pragma or attribute %<target(\"%s\")%> does not allow a negated form", str_to_check
);
19582 switch (p_attr
->attr_type
)
19584 /* Has a custom handler registered.
19585 For example, cpu=, arch=, tune=. */
19586 case aarch64_attr_custom
:
19587 gcc_assert (p_attr
->handler
);
19588 if (!p_attr
->handler (arg
))
19592 /* Either set or unset a boolean option. */
19593 case aarch64_attr_bool
:
19595 struct cl_decoded_option decoded
;
19597 generate_option (p_attr
->opt_num
, NULL
, !invert
,
19598 CL_TARGET
, &decoded
);
19599 aarch64_handle_option (&global_options
, &global_options_set
,
19600 &decoded
, input_location
);
19603 /* Set or unset a bit in the target_flags. aarch64_handle_option
19604 should know what mask to apply given the option number. */
19605 case aarch64_attr_mask
:
19607 struct cl_decoded_option decoded
;
19608 /* We only need to specify the option number.
19609 aarch64_handle_option will know which mask to apply. */
19610 decoded
.opt_index
= p_attr
->opt_num
;
19611 decoded
.value
= !invert
;
19612 aarch64_handle_option (&global_options
, &global_options_set
,
19613 &decoded
, input_location
);
19616 /* Use the option setting machinery to set an option to an enum. */
19617 case aarch64_attr_enum
:
19622 valid
= opt_enum_arg_to_value (p_attr
->opt_num
, arg
,
19623 &value
, CL_TARGET
);
19626 set_option (&global_options
, NULL
, p_attr
->opt_num
, value
,
19627 NULL
, DK_UNSPECIFIED
, input_location
,
19632 error ("pragma or attribute %<target(\"%s=%s\")%> is not valid", str_to_check
, arg
);
19637 gcc_unreachable ();
19641 /* If we reached here we either have found an attribute and validated
19642 it or didn't match any. If we matched an attribute but its arguments
19643 were malformed we will have returned false already. */
19647 /* Count how many times the character C appears in
19648 NULL-terminated string STR. */
19650 static unsigned int
19651 num_occurences_in_str (char c
, char *str
)
19653 unsigned int res
= 0;
19654 while (*str
!= '\0')
19665 /* Parse the tree in ARGS that contains the target attribute information
19666 and update the global target options space. */
19669 aarch64_process_target_attr (tree args
)
19671 if (TREE_CODE (args
) == TREE_LIST
)
19675 tree head
= TREE_VALUE (args
);
19678 if (!aarch64_process_target_attr (head
))
19681 args
= TREE_CHAIN (args
);
19687 if (TREE_CODE (args
) != STRING_CST
)
19689 error ("attribute %<target%> argument not a string");
19693 size_t len
= strlen (TREE_STRING_POINTER (args
));
19694 auto_vec
<char, 32> buffer
;
19695 buffer
.safe_grow (len
+ 1);
19696 char *str_to_check
= buffer
.address ();
19697 memcpy (str_to_check
, TREE_STRING_POINTER (args
), len
+ 1);
19701 error ("malformed %<target()%> pragma or attribute");
19705 /* Used to catch empty spaces between commas i.e.
19706 attribute ((target ("attr1,,attr2"))). */
19707 unsigned int num_commas
= num_occurences_in_str (',', str_to_check
);
19709 /* Handle multiple target attributes separated by ','. */
19710 char *token
= strtok_r (str_to_check
, ",", &str_to_check
);
19712 unsigned int num_attrs
= 0;
19716 if (!aarch64_process_one_target_attr (token
))
19718 /* Check if token is possibly an arch extension without
19720 aarch64_feature_flags isa_temp
= 0;
19721 auto with_plus
= std::string ("+") + token
;
19722 enum aarch_parse_opt_result ext_res
19723 = aarch64_parse_extension (with_plus
.c_str (), &isa_temp
, nullptr);
19725 if (ext_res
== AARCH_PARSE_OK
)
19726 error ("arch extension %<%s%> should be prefixed by %<+%>",
19729 error ("pragma or attribute %<target(\"%s\")%> is not valid", token
);
19733 token
= strtok_r (NULL
, ",", &str_to_check
);
19736 if (num_attrs
!= num_commas
+ 1)
19738 error ("malformed %<target(\"%s\")%> pragma or attribute", TREE_STRING_POINTER (args
));
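/* For illustration only (hypothetical declarations, not part of this file):
   several attributes can be combined in one string, separated by commas, and
   a bare extension name triggers the "should be prefixed by +" hint emitted
   above:

     __attribute__ ((target ("arch=armv8-a,strict-align")))
     void two_attrs (void);          // OK: two comma-separated attributes

     __attribute__ ((target ("sve")))
     void wrong (void);              // rejected, with a hint to write "+sve"
*/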
19745 static bool aarch64_process_target_version_attr (tree args
);
19747 /* Implement TARGET_OPTION_VALID_ATTRIBUTE_P. This is used to
19748 process attribute ((target ("..."))). */
19751 aarch64_option_valid_attribute_p (tree fndecl
, tree
, tree args
, int)
19753 struct cl_target_option cur_target
;
19756 tree new_target
, new_optimize
;
19757 tree existing_target
= DECL_FUNCTION_SPECIFIC_TARGET (fndecl
);
19759 /* If what we're processing is the current pragma string then the
19760 target option node is already stored in target_option_current_node
19761 by aarch64_pragma_target_parse in aarch64-c.cc. Use that to avoid
19762 having to re-parse the string. This is especially useful to keep
19763 arm_neon.h compile times down since that header contains a lot
19764 of intrinsics enclosed in pragmas. */
19765 if (!existing_target
&& args
== current_target_pragma
)
19767 DECL_FUNCTION_SPECIFIC_TARGET (fndecl
) = target_option_current_node
;
19770 tree func_optimize
= DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl
);
19773 = build_optimization_node (&global_options
, &global_options_set
);
19774 func_optimize
= DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl
);
19776 /* If the function changed the optimization levels as well as setting
19777 target options, start with the optimizations specified. */
19778 if (func_optimize
&& func_optimize
!= old_optimize
)
19779 cl_optimization_restore (&global_options
, &global_options_set
,
19780 TREE_OPTIMIZATION (func_optimize
));
19782 /* Save the current target options to restore at the end. */
19783 cl_target_option_save (&cur_target
, &global_options
, &global_options_set
);
19785 /* If fndecl already has some target attributes applied to it, unpack
19786 them so that we add this attribute on top of them, rather than
19787 overwriting them. */
19788 if (existing_target
)
19790 struct cl_target_option
*existing_options
19791 = TREE_TARGET_OPTION (existing_target
);
19793 if (existing_options
)
19794 cl_target_option_restore (&global_options
, &global_options_set
,
19798 cl_target_option_restore (&global_options
, &global_options_set
,
19799 TREE_TARGET_OPTION (target_option_current_node
));
19801 ret
= aarch64_process_target_attr (args
);
19804 tree version_attr
= lookup_attribute ("target_version",
19805 DECL_ATTRIBUTES (fndecl
));
19806 if (version_attr
!= NULL_TREE
)
19808 /* Reapply any target_version attribute after target attribute.
19809 This should be equivalent to applying the target_version once
19810 after processing all target attributes. */
19811 tree version_args
= TREE_VALUE (version_attr
);
19812 ret
= aarch64_process_target_version_attr (version_args
);
19816 /* Set up any additional state. */
19819 aarch64_override_options_internal (&global_options
);
19820 new_target
= build_target_option_node (&global_options
,
19821 &global_options_set
);
19826 new_optimize
= build_optimization_node (&global_options
,
19827 &global_options_set
);
19831 DECL_FUNCTION_SPECIFIC_TARGET (fndecl
) = new_target
;
19833 if (old_optimize
!= new_optimize
)
19834 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl
) = new_optimize
;
19837 cl_target_option_restore (&global_options
, &global_options_set
, &cur_target
);
19839 if (old_optimize
!= new_optimize
)
19840 cl_optimization_restore (&global_options
, &global_options_set
,
19841 TREE_OPTIMIZATION (old_optimize
));
19845 typedef unsigned long long aarch64_fmv_feature_mask
;
19850 aarch64_fmv_feature_mask feature_mask
;
19851 aarch64_feature_flags opt_flags
;
19852 } aarch64_fmv_feature_datum
;
19854 #define AARCH64_FMV_FEATURE(NAME, FEAT_NAME, C) \
19855 {NAME, 1ULL << FEAT_##FEAT_NAME, ::feature_deps::fmv_deps_##FEAT_NAME},
19857 /* The "rdma" alias uses a different FEAT_NAME to avoid a duplicate
19858 feature_deps name. */
19859 #define FEAT_RDMA FEAT_RDM
/* FMV features are listed in priority order, to make it easier to sort target
   strings.  */
19863 static aarch64_fmv_feature_datum aarch64_fmv_feature_data
[] = {
19864 #include "config/aarch64/aarch64-option-extensions.def"
19867 /* Parse a function multiversioning feature string STR, as found in a
19868 target_version or target_clones attribute.
19870 If ISA_FLAGS is nonnull, then update it with the specified architecture
19871 features turned on. If FEATURE_MASK is nonnull, then assign to it a bitmask
19872 representing the set of features explicitly specified in the feature string.
19873 Return an aarch_parse_opt_result describing the result.
19875 When the STR string contains an invalid or duplicate extension, a copy of
19876 the extension string is created and stored to INVALID_EXTENSION. */
19878 static enum aarch_parse_opt_result
19879 aarch64_parse_fmv_features (const char *str
, aarch64_feature_flags
*isa_flags
,
19880 aarch64_fmv_feature_mask
*feature_mask
,
19881 std::string
*invalid_extension
)
19884 *feature_mask
= 0ULL;
19886 if (strcmp (str
, "default") == 0)
19887 return AARCH_PARSE_OK
;
19889 while (str
!= NULL
&& *str
!= 0)
19894 ext
= strchr (str
, '+');
19899 len
= strlen (str
);
19902 return AARCH_PARSE_MISSING_ARG
;
19904 int num_features
= ARRAY_SIZE (aarch64_fmv_feature_data
);
19906 for (i
= 0; i
< num_features
; i
++)
19908 if (strlen (aarch64_fmv_feature_data
[i
].name
) == len
19909 && strncmp (aarch64_fmv_feature_data
[i
].name
, str
, len
) == 0)
19912 *isa_flags
|= aarch64_fmv_feature_data
[i
].opt_flags
;
19915 auto old_feature_mask
= *feature_mask
;
19916 *feature_mask
|= aarch64_fmv_feature_data
[i
].feature_mask
;
19917 if (*feature_mask
== old_feature_mask
)
19919 /* Duplicate feature. */
19920 if (invalid_extension
)
19921 *invalid_extension
= std::string (str
, len
);
19922 return AARCH_PARSE_DUPLICATE_FEATURE
;
19929 if (i
== num_features
)
19931 /* Feature not found in list. */
19932 if (invalid_extension
)
19933 *invalid_extension
= std::string (str
, len
);
19934 return AARCH_PARSE_INVALID_FEATURE
;
19939 /* Skip over the next '+'. */
19943 return AARCH_PARSE_OK
;
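/* Illustrative sketch of the feature strings parsed above, as they appear in
   user code (the function name "f" is hypothetical):

     __attribute__ ((target_version ("default")))        int f (void);
     __attribute__ ((target_version ("sve2")))           int f (void);
     __attribute__ ((target_version ("dotprod+fp16")))   int f (void);

   "default" is accepted as-is, while "dotprod+fp16" is split on '+' and each
   name is looked up in aarch64_fmv_feature_data.  */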
19946 /* Parse the tree in ARGS that contains the target_version attribute
19947 information and update the global target options space. */
19950 aarch64_process_target_version_attr (tree args
)
19952 if (TREE_CODE (args
) == TREE_LIST
)
19954 if (TREE_CHAIN (args
))
19956 error ("attribute %<target_version%> has multiple values");
19959 args
= TREE_VALUE (args
);
19962 if (!args
|| TREE_CODE (args
) != STRING_CST
)
19964 error ("attribute %<target_version%> argument not a string");
19968 const char *str
= TREE_STRING_POINTER (args
);
19970 enum aarch_parse_opt_result parse_res
;
19971 auto isa_flags
= aarch64_asm_isa_flags
;
19973 std::string invalid_extension
;
19974 parse_res
= aarch64_parse_fmv_features (str
, &isa_flags
, NULL
,
19975 &invalid_extension
);
19977 if (parse_res
== AARCH_PARSE_OK
)
19979 aarch64_set_asm_isa_flags (isa_flags
);
19985 case AARCH_PARSE_MISSING_ARG
:
19986 error ("missing value in %<target_version%> attribute");
19989 case AARCH_PARSE_INVALID_FEATURE
:
19990 error ("invalid feature modifier %qs of value %qs in "
19991 "%<target_version%> attribute", invalid_extension
.c_str (),
19995 case AARCH_PARSE_DUPLICATE_FEATURE
:
19996 error ("duplicate feature modifier %qs of value %qs in "
19997 "%<target_version%> attribute", invalid_extension
.c_str (),
20002 gcc_unreachable ();
20008 /* Implement TARGET_OPTION_VALID_VERSION_ATTRIBUTE_P. This is used to
20009 process attribute ((target_version ("..."))). */
20012 aarch64_option_valid_version_attribute_p (tree fndecl
, tree
, tree args
, int)
20014 struct cl_target_option cur_target
;
20017 tree existing_target
= DECL_FUNCTION_SPECIFIC_TARGET (fndecl
);
20019 /* Save the current target options to restore at the end. */
20020 cl_target_option_save (&cur_target
, &global_options
, &global_options_set
);
20022 /* If fndecl already has some target attributes applied to it, unpack
20023 them so that we add this attribute on top of them, rather than
20024 overwriting them. */
20025 if (existing_target
)
20027 struct cl_target_option
*existing_options
20028 = TREE_TARGET_OPTION (existing_target
);
20030 if (existing_options
)
20031 cl_target_option_restore (&global_options
, &global_options_set
,
20035 cl_target_option_restore (&global_options
, &global_options_set
,
20036 TREE_TARGET_OPTION (target_option_current_node
));
20038 ret
= aarch64_process_target_version_attr (args
);
20040 /* Set up any additional state. */
20043 aarch64_override_options_internal (&global_options
);
20044 new_target
= build_target_option_node (&global_options
,
20045 &global_options_set
);
20051 DECL_FUNCTION_SPECIFIC_TARGET (fndecl
) = new_target
;
20053 cl_target_option_restore (&global_options
, &global_options_set
, &cur_target
);
20058 /* This parses the attribute arguments to target_version in DECL and the
20059 feature mask required to select those targets. No adjustments are made to
20060 add or remove redundant feature requirements. */
20062 static aarch64_fmv_feature_mask
20063 get_feature_mask_for_version (tree decl
)
20065 tree version_attr
= lookup_attribute ("target_version",
20066 DECL_ATTRIBUTES (decl
));
20067 if (version_attr
== NULL
)
20070 const char *version_string
= TREE_STRING_POINTER (TREE_VALUE (TREE_VALUE
20072 enum aarch_parse_opt_result parse_res
;
20073 aarch64_fmv_feature_mask feature_mask
;
20075 parse_res
= aarch64_parse_fmv_features (version_string
, NULL
, &feature_mask
,
20078 /* We should have detected any errors before getting here. */
20079 gcc_assert (parse_res
== AARCH_PARSE_OK
);
20081 return feature_mask
;
20084 /* Compare priorities of two feature masks. Return:
20085 1: mask1 is higher priority
20086 -1: mask2 is higher priority
20087 0: masks are equal. */
20090 compare_feature_masks (aarch64_fmv_feature_mask mask1
,
20091 aarch64_fmv_feature_mask mask2
)
20093 int pop1
= popcount_hwi (mask1
);
20094 int pop2
= popcount_hwi (mask2
);
20100 auto diff_mask
= mask1
^ mask2
;
20101 if (diff_mask
== 0ULL)
20103 int num_features
= ARRAY_SIZE (aarch64_fmv_feature_data
);
20104 for (int i
= num_features
- 1; i
>= 0; i
--)
20106 auto bit_mask
= aarch64_fmv_feature_data
[i
].feature_mask
;
20107 if (diff_mask
& bit_mask
)
20108 return (mask1
& bit_mask
) ? 1 : -1;
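/* A small worked example of the rule implemented above (the masks are
   hypothetical): the version with more feature bits set wins, and ties are
   broken by the highest-priority feature in which the two masks differ.

     mask1 = 0b0110, mask2 = 0b0001  -> mask1 wins (two features versus one)
     mask1 = 0b0101, mask2 = 0b0011  -> same popcount; diff_mask = 0b0110, so
					the higher-priority differing bit
					decides which mask is preferred.  */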
20113 /* Compare priorities of two version decls. */
20116 aarch64_compare_version_priority (tree decl1
, tree decl2
)
20118 auto mask1
= get_feature_mask_for_version (decl1
);
20119 auto mask2
= get_feature_mask_for_version (decl2
);
20121 return compare_feature_masks (mask1
, mask2
);
20124 /* Build the struct __ifunc_arg_t type:
20126 struct __ifunc_arg_t
20128 unsigned long _size; // Size of the struct, so it can grow.
20129 unsigned long _hwcap;
20130 unsigned long _hwcap2;
20135 build_ifunc_arg_type ()
20137 tree ifunc_arg_type
= lang_hooks
.types
.make_type (RECORD_TYPE
);
20138 tree field1
= build_decl (UNKNOWN_LOCATION
, FIELD_DECL
,
20139 get_identifier ("_size"),
20140 long_unsigned_type_node
);
20141 tree field2
= build_decl (UNKNOWN_LOCATION
, FIELD_DECL
,
20142 get_identifier ("_hwcap"),
20143 long_unsigned_type_node
);
20144 tree field3
= build_decl (UNKNOWN_LOCATION
, FIELD_DECL
,
20145 get_identifier ("_hwcap2"),
20146 long_unsigned_type_node
);
20148 DECL_FIELD_CONTEXT (field1
) = ifunc_arg_type
;
20149 DECL_FIELD_CONTEXT (field2
) = ifunc_arg_type
;
20150 DECL_FIELD_CONTEXT (field3
) = ifunc_arg_type
;
20152 TYPE_FIELDS (ifunc_arg_type
) = field1
;
20153 DECL_CHAIN (field1
) = field2
;
20154 DECL_CHAIN (field2
) = field3
;
20156 layout_type (ifunc_arg_type
);
20158 tree const_type
= build_qualified_type (ifunc_arg_type
, TYPE_QUAL_CONST
);
20159 tree pointer_type
= build_pointer_type (const_type
);
20161 return pointer_type
;
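/* For illustration only: seen from user-level C, the type built above and the
   resolver signature that consumes it (see the resolver creation code further
   down) look roughly like this; the resolver name is hypothetical:

     typedef struct __ifunc_arg_t
     {
       unsigned long _size;    // size of the struct, so it can grow
       unsigned long _hwcap;
       unsigned long _hwcap2;
     } __ifunc_arg_t;

     void *foo_resolver (uint64_t hwcap, const __ifunc_arg_t *arg);
*/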
20164 /* Implement TARGET_MANGLE_DECL_ASSEMBLER_NAME, to add function multiversioning
20168 aarch64_mangle_decl_assembler_name (tree decl
, tree id
)
20170 /* For function version, add the target suffix to the assembler name. */
20171 if (TREE_CODE (decl
) == FUNCTION_DECL
20172 && DECL_FUNCTION_VERSIONED (decl
))
20174 aarch64_fmv_feature_mask feature_mask
= get_feature_mask_for_version (decl
);
20176 std::string name
= IDENTIFIER_POINTER (id
);
20178 /* For the default version, append ".default". */
20179 if (feature_mask
== 0ULL)
20181 name
+= ".default";
20182 return get_identifier (name
.c_str());
20187 int num_features
= ARRAY_SIZE (aarch64_fmv_feature_data
);
20188 for (int i
= 0; i
< num_features
; i
++)
20190 if (feature_mask
& aarch64_fmv_feature_data
[i
].feature_mask
)
20193 name
+= aarch64_fmv_feature_data
[i
].name
;
20197 if (DECL_ASSEMBLER_NAME_SET_P (decl
))
20198 SET_DECL_RTL (decl
, NULL
);
20200 id
= get_identifier (name
.c_str());
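/* Hedged example of the resulting assembler names ("foo" is hypothetical and
   the exact suffix spelling follows the ACLE FMV mangling rules rather than
   anything stated here):

     __attribute__ ((target_version ("default")))  int foo (void);  // foo.default
     __attribute__ ((target_version ("sve2")))     int foo (void);  // roughly foo._Msve2

   i.e. the default version gets a ".default" suffix and every other version
   gets a suffix built from the names of its selected features.  */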
20205 /* Return an identifier for the base assembler name of a versioned function.
20206 This is computed by taking the default version's assembler name, and
20207 stripping off the ".default" suffix if it's already been appended. */
20210 get_suffixed_assembler_name (tree default_decl
, const char *suffix
)
20212 std::string name
= IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (default_decl
));
20214 auto size
= name
.size ();
20215 if (size
>= 8 && name
.compare (size
- 8, 8, ".default") == 0)
20216 name
.resize (size
- 8);
20218 return get_identifier (name
.c_str());
20221 /* Make the resolver function decl to dispatch the versions of
20222 a multi-versioned function, DEFAULT_DECL. IFUNC_ALIAS_DECL is
20223 ifunc alias that will point to the created resolver. Create an
20224 empty basic block in the resolver and store the pointer in
20225 EMPTY_BB. Return the decl of the resolver function. */
20228 make_resolver_func (const tree default_decl
,
20229 const tree ifunc_alias_decl
,
20230 basic_block
*empty_bb
)
20232 tree decl
, type
, t
;
20234 /* Create resolver function name based on default_decl. We need to remove an
20235 existing ".default" suffix if this has already been appended. */
20236 tree decl_name
= get_suffixed_assembler_name (default_decl
, ".resolver");
20237 const char *resolver_name
= IDENTIFIER_POINTER (decl_name
);
20239 /* The resolver function should have signature
20240 (void *) resolver (uint64_t, const __ifunc_arg_t *) */
20241 type
= build_function_type_list (ptr_type_node
,
20243 build_ifunc_arg_type (),
20246 decl
= build_fn_decl (resolver_name
, type
);
20247 SET_DECL_ASSEMBLER_NAME (decl
, decl_name
);
20249 DECL_NAME (decl
) = decl_name
;
20250 TREE_USED (decl
) = 1;
20251 DECL_ARTIFICIAL (decl
) = 1;
20252 DECL_IGNORED_P (decl
) = 1;
20253 TREE_PUBLIC (decl
) = 0;
20254 DECL_UNINLINABLE (decl
) = 1;
20256 /* Resolver is not external, body is generated. */
20257 DECL_EXTERNAL (decl
) = 0;
20258 DECL_EXTERNAL (ifunc_alias_decl
) = 0;
20260 DECL_CONTEXT (decl
) = NULL_TREE
;
20261 DECL_INITIAL (decl
) = make_node (BLOCK
);
20262 DECL_STATIC_CONSTRUCTOR (decl
) = 0;
20264 if (DECL_COMDAT_GROUP (default_decl
)
20265 || TREE_PUBLIC (default_decl
))
20267 /* In this case, each translation unit with a call to this
20268 versioned function will put out a resolver. Ensure it
20269 is comdat to keep just one copy. */
20270 DECL_COMDAT (decl
) = 1;
20271 make_decl_one_only (decl
, DECL_ASSEMBLER_NAME (decl
));
20274 TREE_PUBLIC (ifunc_alias_decl
) = 0;
20276 /* Build result decl and add to function_decl. */
20277 t
= build_decl (UNKNOWN_LOCATION
, RESULT_DECL
, NULL_TREE
, ptr_type_node
);
20278 DECL_CONTEXT (t
) = decl
;
20279 DECL_ARTIFICIAL (t
) = 1;
20280 DECL_IGNORED_P (t
) = 1;
20281 DECL_RESULT (decl
) = t
;
20283 /* Build parameter decls and add to function_decl. */
20284 tree arg1
= build_decl (UNKNOWN_LOCATION
, PARM_DECL
,
20285 get_identifier ("hwcap"),
20287 tree arg2
= build_decl (UNKNOWN_LOCATION
, PARM_DECL
,
20288 get_identifier ("arg"),
20289 build_ifunc_arg_type());
20290 DECL_CONTEXT (arg1
) = decl
;
20291 DECL_CONTEXT (arg2
) = decl
;
20292 DECL_ARTIFICIAL (arg1
) = 1;
20293 DECL_ARTIFICIAL (arg2
) = 1;
20294 DECL_IGNORED_P (arg1
) = 1;
20295 DECL_IGNORED_P (arg2
) = 1;
20296 DECL_ARG_TYPE (arg1
) = uint64_type_node
;
20297 DECL_ARG_TYPE (arg2
) = build_ifunc_arg_type ();
20298 DECL_ARGUMENTS (decl
) = arg1
;
20299 TREE_CHAIN (arg1
) = arg2
;
20301 gimplify_function_tree (decl
);
20302 push_cfun (DECL_STRUCT_FUNCTION (decl
));
20303 *empty_bb
= init_lowered_empty_function (decl
, false,
20304 profile_count::uninitialized ());
20306 cgraph_node::add_new_function (decl
, true);
20307 symtab
->call_cgraph_insertion_hooks (cgraph_node::get_create (decl
));
20311 gcc_assert (ifunc_alias_decl
!= NULL
);
20312 /* Mark ifunc_alias_decl as "ifunc" with resolver as resolver_name. */
20313 DECL_ATTRIBUTES (ifunc_alias_decl
)
20314 = make_attribute ("ifunc", resolver_name
,
20315 DECL_ATTRIBUTES (ifunc_alias_decl
));
20317 /* Create the alias for dispatch to resolver here. */
20318 cgraph_node::create_same_body_alias (ifunc_alias_decl
, decl
);
20322 /* This adds a condition to the basic_block NEW_BB in function FUNCTION_DECL
20323 to return a pointer to VERSION_DECL if all feature bits specified in
20324 FEATURE_MASK are not set in MASK_VAR. This function will be called during
20325 version dispatch to decide which function version to execute. It returns
20326 the basic block at the end, to which more conditions can be added. */
20328 add_condition_to_bb (tree function_decl
, tree version_decl
,
20329 aarch64_fmv_feature_mask feature_mask
,
20330 tree mask_var
, basic_block new_bb
)
20332 gimple
*return_stmt
;
20333 tree convert_expr
, result_var
;
20334 gimple
*convert_stmt
;
20335 gimple
*if_else_stmt
;
20337 basic_block bb1
, bb2
, bb3
;
20342 push_cfun (DECL_STRUCT_FUNCTION (function_decl
));
20344 gcc_assert (new_bb
!= NULL
);
20345 gseq
= bb_seq (new_bb
);
20347 convert_expr
= build1 (CONVERT_EXPR
, ptr_type_node
,
20348 build_fold_addr_expr (version_decl
));
20349 result_var
= create_tmp_var (ptr_type_node
);
20350 convert_stmt
= gimple_build_assign (result_var
, convert_expr
);
20351 return_stmt
= gimple_build_return (result_var
);
20353 if (feature_mask
== 0ULL)
20355 /* Default version. */
20356 gimple_seq_add_stmt (&gseq
, convert_stmt
);
20357 gimple_seq_add_stmt (&gseq
, return_stmt
);
20358 set_bb_seq (new_bb
, gseq
);
20359 gimple_set_bb (convert_stmt
, new_bb
);
20360 gimple_set_bb (return_stmt
, new_bb
);
20365 tree and_expr_var
= create_tmp_var (long_long_unsigned_type_node
);
20366 tree and_expr
= build2 (BIT_AND_EXPR
,
20367 long_long_unsigned_type_node
,
20369 build_int_cst (long_long_unsigned_type_node
,
20371 gimple
*and_stmt
= gimple_build_assign (and_expr_var
, and_expr
);
20372 gimple_set_block (and_stmt
, DECL_INITIAL (function_decl
));
20373 gimple_set_bb (and_stmt
, new_bb
);
20374 gimple_seq_add_stmt (&gseq
, and_stmt
);
20376 tree zero_llu
= build_int_cst (long_long_unsigned_type_node
, 0);
20377 if_else_stmt
= gimple_build_cond (EQ_EXPR
, and_expr_var
, zero_llu
,
20378 NULL_TREE
, NULL_TREE
);
20379 gimple_set_block (if_else_stmt
, DECL_INITIAL (function_decl
));
20380 gimple_set_bb (if_else_stmt
, new_bb
);
20381 gimple_seq_add_stmt (&gseq
, if_else_stmt
);
20383 gimple_seq_add_stmt (&gseq
, convert_stmt
);
20384 gimple_seq_add_stmt (&gseq
, return_stmt
);
20385 set_bb_seq (new_bb
, gseq
);
20388 e12
= split_block (bb1
, if_else_stmt
);
20390 e12
->flags
&= ~EDGE_FALLTHRU
;
20391 e12
->flags
|= EDGE_TRUE_VALUE
;
20393 e23
= split_block (bb2
, return_stmt
);
20395 gimple_set_bb (convert_stmt
, bb2
);
20396 gimple_set_bb (return_stmt
, bb2
);
20399 make_edge (bb1
, bb3
, EDGE_FALSE_VALUE
);
20402 make_edge (bb2
, EXIT_BLOCK_PTR_FOR_FN (cfun
), 0);
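/* Roughly, each call to the function above appends a test of this shape to
   the resolver (illustrative C rather than the GIMPLE actually built; note
   that MASK_VAR already holds the complement of the CPU feature word):

     if ((mask_var & FEATURE_MASK) == 0)   // every required bit is present
       return (void *) &version_decl;

   falling through to the next, lower-priority version otherwise.  */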
20409 /* This function generates the dispatch function for
20410 multi-versioned functions. DISPATCH_DECL is the function which will
20411 contain the dispatch logic. FNDECLS are the function choices for
20412 dispatch, and is a tree chain. EMPTY_BB is the basic block pointer
20413 in DISPATCH_DECL in which the dispatch code is generated. */
20416 dispatch_function_versions (tree dispatch_decl
,
20418 basic_block
*empty_bb
)
20420 gimple
*ifunc_cpu_init_stmt
;
20422 vec
<tree
> *fndecls
;
20424 gcc_assert (dispatch_decl
!= NULL
20425 && fndecls_p
!= NULL
20426 && empty_bb
!= NULL
);
20428 push_cfun (DECL_STRUCT_FUNCTION (dispatch_decl
));
20430 gseq
= bb_seq (*empty_bb
);
  /* Function version dispatch is via IFUNC.  IFUNC resolvers fire before
     constructors, so explicitly call __init_cpu_features_resolver here.  */
20433 tree init_fn_type
= build_function_type_list (void_type_node
,
20434 long_unsigned_type_node
,
20435 build_ifunc_arg_type(),
20437 tree init_fn_id
= get_identifier ("__init_cpu_features_resolver");
20438 tree init_fn_decl
= build_decl (UNKNOWN_LOCATION
, FUNCTION_DECL
,
20439 init_fn_id
, init_fn_type
);
20440 tree arg1
= DECL_ARGUMENTS (dispatch_decl
);
20441 tree arg2
= TREE_CHAIN (arg1
);
20442 ifunc_cpu_init_stmt
= gimple_build_call (init_fn_decl
, 2, arg1
, arg2
);
20443 gimple_seq_add_stmt (&gseq
, ifunc_cpu_init_stmt
);
20444 gimple_set_bb (ifunc_cpu_init_stmt
, *empty_bb
);
20446 /* Build the struct type for __aarch64_cpu_features. */
20447 tree global_type
= lang_hooks
.types
.make_type (RECORD_TYPE
);
20448 tree field1
= build_decl (UNKNOWN_LOCATION
, FIELD_DECL
,
20449 get_identifier ("features"),
20450 long_long_unsigned_type_node
);
20451 DECL_FIELD_CONTEXT (field1
) = global_type
;
20452 TYPE_FIELDS (global_type
) = field1
;
20453 layout_type (global_type
);
20455 tree global_var
= build_decl (UNKNOWN_LOCATION
, VAR_DECL
,
20456 get_identifier ("__aarch64_cpu_features"),
20458 DECL_EXTERNAL (global_var
) = 1;
20459 tree mask_var
= create_tmp_var (long_long_unsigned_type_node
);
20461 tree component_expr
= build3 (COMPONENT_REF
, long_long_unsigned_type_node
,
20462 global_var
, field1
, NULL_TREE
);
20463 gimple
*component_stmt
= gimple_build_assign (mask_var
, component_expr
);
20464 gimple_set_block (component_stmt
, DECL_INITIAL (dispatch_decl
));
20465 gimple_set_bb (component_stmt
, *empty_bb
);
20466 gimple_seq_add_stmt (&gseq
, component_stmt
);
20468 tree not_expr
= build1 (BIT_NOT_EXPR
, long_long_unsigned_type_node
, mask_var
);
20469 gimple
*not_stmt
= gimple_build_assign (mask_var
, not_expr
);
20470 gimple_set_block (not_stmt
, DECL_INITIAL (dispatch_decl
));
20471 gimple_set_bb (not_stmt
, *empty_bb
);
20472 gimple_seq_add_stmt (&gseq
, not_stmt
);
20474 set_bb_seq (*empty_bb
, gseq
);
20478 /* fndecls_p is actually a vector. */
20479 fndecls
= static_cast<vec
<tree
> *> (fndecls_p
);
20481 /* At least one more version other than the default. */
20482 unsigned int num_versions
= fndecls
->length ();
20483 gcc_assert (num_versions
>= 2);
20485 struct function_version_info
20488 aarch64_fmv_feature_mask feature_mask
;
20489 } *function_versions
;
20491 function_versions
= (struct function_version_info
*)
20492 XNEWVEC (struct function_version_info
, (num_versions
));
20494 unsigned int actual_versions
= 0;
20496 for (tree version_decl
: *fndecls
)
20498 aarch64_fmv_feature_mask feature_mask
;
20499 /* Get attribute string, parse it and find the right features. */
20500 feature_mask
= get_feature_mask_for_version (version_decl
);
20501 function_versions
[actual_versions
].version_decl
= version_decl
;
20502 function_versions
[actual_versions
].feature_mask
= feature_mask
;
20506 auto compare_feature_version_info
= [](const void *p1
, const void *p2
) {
20507 const function_version_info v1
= *(const function_version_info
*)p1
;
20508 const function_version_info v2
= *(const function_version_info
*)p2
;
20509 return - compare_feature_masks (v1
.feature_mask
, v2
.feature_mask
);
20512 /* Sort the versions according to descending order of dispatch priority. */
20513 qsort (function_versions
, actual_versions
,
20514 sizeof (struct function_version_info
), compare_feature_version_info
);
20516 for (unsigned int i
= 0; i
< actual_versions
; ++i
)
20517 *empty_bb
= add_condition_to_bb (dispatch_decl
,
20518 function_versions
[i
].version_decl
,
20519 function_versions
[i
].feature_mask
,
20523 free (function_versions
);
20527 /* Implement TARGET_GENERATE_VERSION_DISPATCHER_BODY. */
20530 aarch64_generate_version_dispatcher_body (void *node_p
)
20532 tree resolver_decl
;
20533 basic_block empty_bb
;
20534 tree default_ver_decl
;
20535 struct cgraph_node
*versn
;
20536 struct cgraph_node
*node
;
20538 struct cgraph_function_version_info
*node_version_info
= NULL
;
20539 struct cgraph_function_version_info
*versn_info
= NULL
;
20541 node
= (cgraph_node
*)node_p
;
20543 node_version_info
= node
->function_version ();
20544 gcc_assert (node
->dispatcher_function
20545 && node_version_info
!= NULL
);
20547 if (node_version_info
->dispatcher_resolver
)
20548 return node_version_info
->dispatcher_resolver
;
20550 /* The first version in the chain corresponds to the default version. */
20551 default_ver_decl
= node_version_info
->next
->this_node
->decl
;
20553 /* node is going to be an alias, so remove the finalized bit. */
20554 node
->definition
= false;
20556 resolver_decl
= make_resolver_func (default_ver_decl
,
20557 node
->decl
, &empty_bb
);
20559 node_version_info
->dispatcher_resolver
= resolver_decl
;
20561 push_cfun (DECL_STRUCT_FUNCTION (resolver_decl
));
20563 auto_vec
<tree
, 2> fn_ver_vec
;
20565 for (versn_info
= node_version_info
->next
; versn_info
;
20566 versn_info
= versn_info
->next
)
20568 versn
= versn_info
->this_node
;
20569 /* Check for virtual functions here again, as by this time it should
20570 have been determined if this function needs a vtable index or
20571 not. This happens for methods in derived classes that override
	 virtual methods in base classes but are not explicitly marked as
	 virtual.  */
20574 if (DECL_VINDEX (versn
->decl
))
20575 sorry ("virtual function multiversioning not supported");
20577 fn_ver_vec
.safe_push (versn
->decl
);
20580 dispatch_function_versions (resolver_decl
, &fn_ver_vec
, &empty_bb
);
20581 cgraph_edge::rebuild_edges ();
20584 /* Fix up symbol names. First we need to obtain the base name, which may
20585 have already been mangled. */
20586 tree base_name
= get_suffixed_assembler_name (default_ver_decl
, "");
20588 /* We need to redo the version mangling on the non-default versions for the
20589 target_clones case. Redoing the mangling for the target_version case is
20590 redundant but does no harm. We need to skip the default version, because
20591 expand_clones will append ".default" later; fortunately that suffix is the
20592 one we want anyway. */
20593 for (versn_info
= node_version_info
->next
->next
; versn_info
;
20594 versn_info
= versn_info
->next
)
20596 tree version_decl
= versn_info
->this_node
->decl
;
20597 tree name
= aarch64_mangle_decl_assembler_name (version_decl
,
20599 symtab
->change_decl_assembler_name (version_decl
, name
);
20602 /* We also need to use the base name for the ifunc declaration. */
20603 symtab
->change_decl_assembler_name (node
->decl
, base_name
);
20605 return resolver_decl
;
20608 /* Make a dispatcher declaration for the multi-versioned function DECL.
20609 Calls to DECL function will be replaced with calls to the dispatcher
20610 by the front-end. Returns the decl of the dispatcher function. */
20613 aarch64_get_function_versions_dispatcher (void *decl
)
20615 tree fn
= (tree
) decl
;
20616 struct cgraph_node
*node
= NULL
;
20617 struct cgraph_node
*default_node
= NULL
;
20618 struct cgraph_function_version_info
*node_v
= NULL
;
20619 struct cgraph_function_version_info
*first_v
= NULL
;
20621 tree dispatch_decl
= NULL
;
20623 struct cgraph_function_version_info
*default_version_info
= NULL
;
20625 gcc_assert (fn
!= NULL
&& DECL_FUNCTION_VERSIONED (fn
));
20627 node
= cgraph_node::get (fn
);
20628 gcc_assert (node
!= NULL
);
20630 node_v
= node
->function_version ();
20631 gcc_assert (node_v
!= NULL
);
20633 if (node_v
->dispatcher_resolver
!= NULL
)
20634 return node_v
->dispatcher_resolver
;
20636 /* Find the default version and make it the first node. */
20638 /* Go to the beginning of the chain. */
20639 while (first_v
->prev
!= NULL
)
20640 first_v
= first_v
->prev
;
20641 default_version_info
= first_v
;
20642 while (default_version_info
!= NULL
)
20644 if (get_feature_mask_for_version
20645 (default_version_info
->this_node
->decl
) == 0ULL)
20647 default_version_info
= default_version_info
->next
;
20650 /* If there is no default node, just return NULL. */
20651 if (default_version_info
== NULL
)
20654 /* Make default info the first node. */
20655 if (first_v
!= default_version_info
)
20657 default_version_info
->prev
->next
= default_version_info
->next
;
20658 if (default_version_info
->next
)
20659 default_version_info
->next
->prev
= default_version_info
->prev
;
20660 first_v
->prev
= default_version_info
;
20661 default_version_info
->next
= first_v
;
20662 default_version_info
->prev
= NULL
;
20665 default_node
= default_version_info
->this_node
;
20667 if (targetm
.has_ifunc_p ())
20669 struct cgraph_function_version_info
*it_v
= NULL
;
20670 struct cgraph_node
*dispatcher_node
= NULL
;
20671 struct cgraph_function_version_info
*dispatcher_version_info
= NULL
;
20673 /* Right now, the dispatching is done via ifunc. */
20674 dispatch_decl
= make_dispatcher_decl (default_node
->decl
);
20675 TREE_NOTHROW (dispatch_decl
) = TREE_NOTHROW (fn
);
20677 dispatcher_node
= cgraph_node::get_create (dispatch_decl
);
20678 gcc_assert (dispatcher_node
!= NULL
);
20679 dispatcher_node
->dispatcher_function
= 1;
20680 dispatcher_version_info
20681 = dispatcher_node
->insert_new_function_version ();
20682 dispatcher_version_info
->next
= default_version_info
;
20683 dispatcher_node
->definition
= 1;
20685 /* Set the dispatcher for all the versions. */
20686 it_v
= default_version_info
;
20687 while (it_v
!= NULL
)
20689 it_v
->dispatcher_resolver
= dispatch_decl
;
20695 error_at (DECL_SOURCE_LOCATION (default_node
->decl
),
20696 "multiversioning needs %<ifunc%> which is not supported "
20700 return dispatch_decl
;
20703 /* This function returns true if FN1 and FN2 are versions of the same function,
20704 that is, the target_version attributes of the function decls are different.
20705 This assumes that FN1 and FN2 have the same signature. */
20708 aarch64_common_function_versions (tree fn1
, tree fn2
)
20710 if (TREE_CODE (fn1
) != FUNCTION_DECL
20711 || TREE_CODE (fn2
) != FUNCTION_DECL
)
20714 return (aarch64_compare_version_priority (fn1
, fn2
) != 0);
20717 /* Implement TARGET_FUNCTION_ATTRIBUTE_INLINABLE_P. Use an opt-out
20718 rather than an opt-in list. */
20721 aarch64_function_attribute_inlinable_p (const_tree fndecl
)
20723 /* A function that has local SME state cannot be inlined into its caller,
20724 since we only support managing PSTATE.ZA switches at function scope. */
20725 return (!aarch64_fndecl_has_new_state (fndecl
, "za")
20726 && !aarch64_fndecl_has_new_state (fndecl
, "zt0"));
20729 /* Helper for aarch64_can_inline_p. In the case where CALLER and CALLEE are
20730 tri-bool options (yes, no, don't care) and the default value is
20731 DEF, determine whether to reject inlining. */
20734 aarch64_tribools_ok_for_inlining_p (int caller
, int callee
,
20735 int dont_care
, int def
)
20737 /* If the callee doesn't care, always allow inlining. */
20738 if (callee
== dont_care
)
20741 /* If the caller doesn't care, always allow inlining. */
20742 if (caller
== dont_care
)
20745 /* Otherwise, allow inlining if either the callee and caller values
20746 agree, or if the callee is using the default value. */
20747 return (callee
== caller
|| callee
== def
);
20750 /* Bit allocations for ipa_fn_summary::target_info. */
20752 /* Set if the function contains a stmt that relies on the function's
20753 choice of PSTATE.SM setting (0 for non-streaming, 1 for streaming).
20754 Not meaningful for streaming-compatible functions. */
20755 constexpr auto AARCH64_IPA_SM_FIXED
= 1U << 0;
20757 /* Set if the function clobbers ZA and ZT0. Not meaningful for functions that
20759 constexpr auto AARCH64_IPA_CLOBBERS_ZA
= 1U << 1;
20760 constexpr auto AARCH64_IPA_CLOBBERS_ZT0
= 1U << 2;
20762 /* Implement TARGET_NEED_IPA_FN_TARGET_INFO. */
20765 aarch64_need_ipa_fn_target_info (const_tree
, unsigned int &)
20767 /* We could in principle skip this for streaming-compatible functions
20768 that have ZA state, but that's a rare combination. */
20772 /* Implement TARGET_UPDATE_IPA_FN_TARGET_INFO. */
20775 aarch64_update_ipa_fn_target_info (unsigned int &info
, const gimple
*stmt
)
20777 if (auto *ga
= dyn_cast
<const gasm
*> (stmt
))
20779 /* We don't know what the asm does, so conservatively assume that
20780 it requires the function's current SM mode. */
20781 info
|= AARCH64_IPA_SM_FIXED
;
20782 for (unsigned int i
= 0; i
< gimple_asm_nclobbers (ga
); ++i
)
20784 tree op
= gimple_asm_clobber_op (ga
, i
);
20785 const char *clobber
= TREE_STRING_POINTER (TREE_VALUE (op
));
20786 if (strcmp (clobber
, "za") == 0)
20787 info
|= AARCH64_IPA_CLOBBERS_ZA
;
20788 if (strcmp (clobber
, "zt0") == 0)
20789 info
|= AARCH64_IPA_CLOBBERS_ZT0
;
20792 if (auto *call
= dyn_cast
<const gcall
*> (stmt
))
20794 if (gimple_call_builtin_p (call
, BUILT_IN_MD
))
20796 /* The attributes on AArch64 builtins are supposed to be accurate.
20797 If the function isn't marked streaming-compatible then it
20798 needs whichever SM mode it selects. */
20799 tree decl
= gimple_call_fndecl (call
);
20800 if (aarch64_fndecl_pstate_sm (decl
) != 0)
20801 info
|= AARCH64_IPA_SM_FIXED
;
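/* For example (illustrative user code), an asm statement such as

     asm volatile ("" ::: "za");

   makes the loop above record AARCH64_IPA_CLOBBERS_ZA for the containing
   function, in addition to AARCH64_IPA_SM_FIXED, which is set for any asm.  */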
20807 /* Implement TARGET_CAN_INLINE_P. Decide whether it is valid
20808 to inline CALLEE into CALLER based on target-specific info.
20809 Make sure that the caller and callee have compatible architectural
20810 features. Then go through the other possible target attributes
20811 and see if they can block inlining. Try not to reject always_inline
20812 callees unless they are incompatible architecturally. */
20815 aarch64_can_inline_p (tree caller
, tree callee
)
20817 tree caller_tree
= DECL_FUNCTION_SPECIFIC_TARGET (caller
);
20818 tree callee_tree
= DECL_FUNCTION_SPECIFIC_TARGET (callee
);
20820 struct cl_target_option
*caller_opts
20821 = TREE_TARGET_OPTION (caller_tree
? caller_tree
20822 : target_option_default_node
);
20824 struct cl_target_option
*callee_opts
20825 = TREE_TARGET_OPTION (callee_tree
? callee_tree
20826 : target_option_default_node
);
20828 /* Callee's ISA flags should be a subset of the caller's. */
20829 auto caller_asm_isa
= (aarch64_get_asm_isa_flags (caller_opts
)
20830 & ~AARCH64_FL_ISA_MODES
);
20831 auto callee_asm_isa
= (aarch64_get_asm_isa_flags (callee_opts
)
20832 & ~AARCH64_FL_ISA_MODES
);
20833 if (callee_asm_isa
& ~caller_asm_isa
)
20836 auto caller_isa
= (aarch64_get_isa_flags (caller_opts
)
20837 & ~AARCH64_FL_ISA_MODES
);
20838 auto callee_isa
= (aarch64_get_isa_flags (callee_opts
)
20839 & ~AARCH64_FL_ISA_MODES
);
20840 if (callee_isa
& ~caller_isa
)
20843 /* Return true if the callee might have target_info property PROPERTY.
20844 The answer must be true unless we have positive proof to the contrary. */
20845 auto callee_has_property
= [&](unsigned int property
)
20847 if (ipa_fn_summaries
)
20848 if (auto *summary
= ipa_fn_summaries
->get (cgraph_node::get (callee
)))
20849 if (!(summary
->target_info
& property
))
20854 /* Streaming-compatible code can be inlined into functions with any
20855 PSTATE.SM mode. Otherwise the caller and callee must agree on
20856 PSTATE.SM mode, unless we can prove that the callee is naturally
20857 streaming-compatible. */
20858 auto caller_sm
= (aarch64_get_isa_flags (caller_opts
) & AARCH64_FL_SM_STATE
);
20859 auto callee_sm
= (aarch64_get_isa_flags (callee_opts
) & AARCH64_FL_SM_STATE
);
20861 && caller_sm
!= callee_sm
20862 && callee_has_property (AARCH64_IPA_SM_FIXED
))
20865 /* aarch64_function_attribute_inlinable_p prevents new-ZA and new-ZT0
20866 functions from being inlined into others. We also need to prevent
20867 inlining of shared-ZA functions into functions without ZA state,
20868 since this is an error condition.
20870 The only other problematic case for ZA is inlining a function that
20871 directly clobbers ZA or ZT0 into a function that has ZA or ZT0 state. */
20872 auto caller_za
= (aarch64_get_isa_flags (caller_opts
) & AARCH64_FL_ZA_ON
);
20873 auto callee_za
= (aarch64_get_isa_flags (callee_opts
) & AARCH64_FL_ZA_ON
);
20874 if (!caller_za
&& callee_za
)
20877 && aarch64_fndecl_has_state (caller
, "za")
20878 && callee_has_property (AARCH64_IPA_CLOBBERS_ZA
))
20881 && aarch64_fndecl_has_state (caller
, "zt0")
20882 && callee_has_property (AARCH64_IPA_CLOBBERS_ZT0
))
  /* Allow non-strict aligned functions inlining into strict
     aligned ones.  */
20887 if ((TARGET_STRICT_ALIGN_P (caller_opts
->x_target_flags
)
20888 != TARGET_STRICT_ALIGN_P (callee_opts
->x_target_flags
))
20889 && !(!TARGET_STRICT_ALIGN_P (callee_opts
->x_target_flags
)
20890 && TARGET_STRICT_ALIGN_P (caller_opts
->x_target_flags
)))
20893 bool always_inline
= lookup_attribute ("always_inline",
20894 DECL_ATTRIBUTES (callee
));
20896 /* If the architectural features match up and the callee is always_inline
20897 then the other attributes don't matter. */
20901 if (caller_opts
->x_aarch64_cmodel_var
20902 != callee_opts
->x_aarch64_cmodel_var
)
20905 if (caller_opts
->x_aarch64_tls_dialect
20906 != callee_opts
->x_aarch64_tls_dialect
)
20909 /* Honour explicit requests to workaround errata. */
20910 if (!aarch64_tribools_ok_for_inlining_p (
20911 caller_opts
->x_aarch64_fix_a53_err835769
,
20912 callee_opts
->x_aarch64_fix_a53_err835769
,
20913 2, TARGET_FIX_ERR_A53_835769_DEFAULT
))
20916 if (!aarch64_tribools_ok_for_inlining_p (
20917 caller_opts
->x_aarch64_fix_a53_err843419
,
20918 callee_opts
->x_aarch64_fix_a53_err843419
,
20919 2, TARGET_FIX_ERR_A53_843419
))
  /* If the user explicitly specified -momit-leaf-frame-pointer for the
     caller and callee and they don't match up, reject inlining.  */
20924 if (!aarch64_tribools_ok_for_inlining_p (
20925 caller_opts
->x_flag_omit_leaf_frame_pointer
,
20926 callee_opts
->x_flag_omit_leaf_frame_pointer
,
20930 /* If the callee has specific tuning overrides, respect them. */
20931 if (callee_opts
->x_aarch64_override_tune_string
!= NULL
20932 && caller_opts
->x_aarch64_override_tune_string
== NULL
)
20935 /* If the user specified tuning override strings for the
20936 caller and callee and they don't match up, reject inlining.
20937 We just do a string compare here, we don't analyze the meaning
20938 of the string, as it would be too costly for little gain. */
20939 if (callee_opts
->x_aarch64_override_tune_string
20940 && caller_opts
->x_aarch64_override_tune_string
20941 && (strcmp (callee_opts
->x_aarch64_override_tune_string
,
20942 caller_opts
->x_aarch64_override_tune_string
) != 0))
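/* An illustrative example of the ISA-subset rule near the top of the function
   above (hypothetical functions, not part of this file):

     __attribute__ ((target ("+sve"))) static int callee_sve (int x);
     int plain_caller (int x) { return callee_sve (x); }   // not inlinable:
							    // callee needs SVE

     static int plain_callee (int x);
     __attribute__ ((target ("+sve"))) int sve_caller (int x)
     { return plain_callee (x); }                           // may be inlined
*/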
/* Return the ID of the TLSDESC ABI, initializing the descriptor if it hasn't
   been used yet.  */

unsigned int
aarch64_tlsdesc_abi_id ()
20954 predefined_function_abi
&tlsdesc_abi
= function_abis
[ARM_PCS_TLSDESC
];
20955 if (!tlsdesc_abi
.initialized_p ())
20957 HARD_REG_SET full_reg_clobbers
;
20958 CLEAR_HARD_REG_SET (full_reg_clobbers
);
20959 SET_HARD_REG_BIT (full_reg_clobbers
, R0_REGNUM
);
20960 SET_HARD_REG_BIT (full_reg_clobbers
, CC_REGNUM
);
20961 for (int regno
= P0_REGNUM
; regno
<= P15_REGNUM
; ++regno
)
20962 SET_HARD_REG_BIT (full_reg_clobbers
, regno
);
20963 tlsdesc_abi
.initialize (ARM_PCS_TLSDESC
, full_reg_clobbers
);
20965 return ARM_PCS_TLSDESC
;
20968 /* Return true if SYMBOL_REF X binds locally. */
20971 aarch64_symbol_binds_local_p (const_rtx x
)
20973 return (SYMBOL_REF_DECL (x
)
20974 ? targetm
.binds_local_p (SYMBOL_REF_DECL (x
))
20975 : SYMBOL_REF_LOCAL_P (x
));
/* Return true if SYMBOL_REF X is thread-local.  */
20980 aarch64_tls_symbol_p (rtx x
)
20982 if (! TARGET_HAVE_TLS
)
20985 x
= strip_salt (x
);
20986 if (!SYMBOL_REF_P (x
))
20989 return SYMBOL_REF_TLS_MODEL (x
) != 0;
20992 /* Classify a TLS symbol into one of the TLS kinds. */
20993 enum aarch64_symbol_type
20994 aarch64_classify_tls_symbol (rtx x
)
20996 enum tls_model tls_kind
= tls_symbolic_operand_type (x
);
21000 case TLS_MODEL_GLOBAL_DYNAMIC
:
21001 case TLS_MODEL_LOCAL_DYNAMIC
:
21002 return TARGET_TLS_DESC
? SYMBOL_SMALL_TLSDESC
: SYMBOL_SMALL_TLSGD
;
21004 case TLS_MODEL_INITIAL_EXEC
:
21005 switch (aarch64_cmodel
)
21007 case AARCH64_CMODEL_TINY
:
21008 case AARCH64_CMODEL_TINY_PIC
:
21009 return SYMBOL_TINY_TLSIE
;
21011 return SYMBOL_SMALL_TLSIE
;
21014 case TLS_MODEL_LOCAL_EXEC
:
21015 if (aarch64_tls_size
== 12)
21016 return SYMBOL_TLSLE12
;
21017 else if (aarch64_tls_size
== 24)
21018 return SYMBOL_TLSLE24
;
21019 else if (aarch64_tls_size
== 32)
21020 return SYMBOL_TLSLE32
;
21021 else if (aarch64_tls_size
== 48)
21022 return SYMBOL_TLSLE48
;
21024 gcc_unreachable ();
21026 case TLS_MODEL_EMULATED
:
21027 case TLS_MODEL_NONE
:
21028 return SYMBOL_FORCE_TO_MEM
;
21031 gcc_unreachable ();
21035 /* Return the correct method for accessing X + OFFSET, where X is either
21036 a SYMBOL_REF or LABEL_REF. */
21038 enum aarch64_symbol_type
21039 aarch64_classify_symbol (rtx x
, HOST_WIDE_INT offset
)
21041 x
= strip_salt (x
);
21043 if (LABEL_REF_P (x
))
21045 switch (aarch64_cmodel
)
21047 case AARCH64_CMODEL_LARGE
:
21048 return SYMBOL_FORCE_TO_MEM
;
21050 case AARCH64_CMODEL_TINY_PIC
:
21051 case AARCH64_CMODEL_TINY
:
21052 return SYMBOL_TINY_ABSOLUTE
;
21054 case AARCH64_CMODEL_SMALL_SPIC
:
21055 case AARCH64_CMODEL_SMALL_PIC
:
21056 case AARCH64_CMODEL_SMALL
:
21057 return SYMBOL_SMALL_ABSOLUTE
;
21060 gcc_unreachable ();
21064 if (SYMBOL_REF_P (x
))
21066 if (aarch64_tls_symbol_p (x
))
21067 return aarch64_classify_tls_symbol (x
);
21069 switch (aarch64_cmodel
)
21071 case AARCH64_CMODEL_TINY_PIC
:
21072 case AARCH64_CMODEL_TINY
:
21073 /* With -fPIC non-local symbols use the GOT. For orthogonality
21074 always use the GOT for extern weak symbols. */
21075 if ((flag_pic
|| SYMBOL_REF_WEAK (x
))
21076 && !aarch64_symbol_binds_local_p (x
))
21077 return SYMBOL_TINY_GOT
;
	  /* When we retrieve symbol + offset address, we have to make sure
	     the offset does not cause overflow of the final address.  But
	     we have no way of knowing the address of symbol at compile time
	     so we can't accurately say if the distance between the PC and
	     symbol + offset is outside the addressable range of +/-1MB in the
	     TINY code model.  So we limit the maximum offset to +/-64KB and
	     assume the offset to the symbol is not larger than +/-(1MB - 64KB).
	     If offset_within_block_p is true we allow larger offsets.  */
21087 if (!(IN_RANGE (offset
, -0x10000, 0x10000)
21088 || offset_within_block_p (x
, offset
)))
21089 return SYMBOL_FORCE_TO_MEM
;
21091 return SYMBOL_TINY_ABSOLUTE
;
21094 case AARCH64_CMODEL_SMALL_SPIC
:
21095 case AARCH64_CMODEL_SMALL_PIC
:
21096 case AARCH64_CMODEL_SMALL
:
21097 if ((flag_pic
|| SYMBOL_REF_WEAK (x
))
21098 && !aarch64_symbol_binds_local_p (x
))
21099 return aarch64_cmodel
== AARCH64_CMODEL_SMALL_SPIC
21100 ? SYMBOL_SMALL_GOT_28K
: SYMBOL_SMALL_GOT_4G
;
21102 /* Same reasoning as the tiny code model, but the offset cap here is
21103 1MB, allowing +/-3.9GB for the offset to the symbol. */
21104 if (!(IN_RANGE (offset
, -0x100000, 0x100000)
21105 || offset_within_block_p (x
, offset
)))
21106 return SYMBOL_FORCE_TO_MEM
;
21108 return SYMBOL_SMALL_ABSOLUTE
;
21110 case AARCH64_CMODEL_LARGE
:
21111 /* This is alright even in PIC code as the constant
21112 pool reference is always PC relative and within
21113 the same translation unit. */
21114 if (!aarch64_pcrelative_literal_loads
&& CONSTANT_POOL_ADDRESS_P (x
))
21115 return SYMBOL_SMALL_ABSOLUTE
;
21117 return SYMBOL_FORCE_TO_MEM
;
21120 gcc_unreachable ();
21124 /* By default push everything into the constant pool. */
21125 return SYMBOL_FORCE_TO_MEM
;
21129 aarch64_constant_address_p (rtx x
)
21131 return (CONSTANT_P (x
) && memory_address_p (DImode
, x
));
21135 aarch64_legitimate_pic_operand_p (rtx x
)
21138 x
= strip_offset_and_salt (x
, &offset
);
21139 if (SYMBOL_REF_P (x
))
/* Implement TARGET_LEGITIMATE_CONSTANT_P hook.  Return true for constants
   that should be rematerialized rather than spilled.  */

static bool
aarch64_legitimate_constant_p (machine_mode mode, rtx x)
{
21151 /* Support CSE and rematerialization of common constants. */
21152 if (CONST_INT_P (x
)
21153 || CONST_DOUBLE_P (x
))
21156 /* Only accept variable-length vector constants if they can be
21159 ??? It would be possible (but complex) to handle rematerialization
21160 of other constants via secondary reloads. */
21161 if (!GET_MODE_SIZE (mode
).is_constant ())
21162 return aarch64_simd_valid_immediate (x
, NULL
);
21164 /* Otherwise, accept any CONST_VECTOR that, if all else fails, can at
21165 least be forced to memory and loaded from there. */
21166 if (CONST_VECTOR_P (x
))
21167 return !targetm
.cannot_force_const_mem (mode
, x
);
21169 /* Do not allow vector struct mode constants for Advanced SIMD.
21170 We could support 0 and -1 easily, but they need support in
21171 aarch64-simd.md. */
21172 unsigned int vec_flags
= aarch64_classify_vector_mode (mode
);
21173 if (vec_flags
== (VEC_ADVSIMD
| VEC_STRUCT
))
21176 if (GET_CODE (x
) == HIGH
)
21179 /* Accept polynomial constants that can be calculated by using the
21180 destination of a move as the sole temporary. Constants that
21181 require a second temporary cannot be rematerialized (they can't be
21182 forced to memory and also aren't legitimate constants). */
21184 if (poly_int_rtx_p (x
, &offset
))
21185 return aarch64_offset_temporaries (false, offset
) <= 1;
21187 /* If an offset is being added to something else, we need to allow the
21188 base to be moved into the destination register, meaning that there
21189 are no free temporaries for the offset. */
21190 x
= strip_offset_and_salt (x
, &offset
);
21191 if (!offset
.is_constant () && aarch64_offset_temporaries (true, offset
) > 0)
21194 /* Do not allow const (plus (anchor_symbol, const_int)). */
21195 if (maybe_ne (offset
, 0) && SYMBOL_REF_P (x
) && SYMBOL_REF_ANCHOR_P (x
))
21198 /* Treat symbols as constants. Avoid TLS symbols as they are complex,
21199 so spilling them is better than rematerialization. */
21200 if (SYMBOL_REF_P (x
) && !SYMBOL_REF_TLS_MODEL (x
))
21203 /* Label references are always constant. */
21204 if (LABEL_REF_P (x
))
21211 aarch64_load_tp (rtx target
)
21214 || GET_MODE (target
) != Pmode
21215 || !register_operand (target
, Pmode
))
21216 target
= gen_reg_rtx (Pmode
);
21218 /* Can return in any reg. */
21219 emit_insn (gen_aarch64_load_tp_hard (target
));
21223 /* On AAPCS systems, this is the "struct __va_list". */
21224 static GTY(()) tree va_list_type
;
/* Implement TARGET_BUILD_BUILTIN_VA_LIST.
   Return the type to use as __builtin_va_list.

   AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:

   struct __va_list
   {
     void *__stack;
     void *__gr_top;
     void *__vr_top;
     int   __gr_offs;
     int   __vr_offs;
   };  */

static tree
aarch64_build_builtin_va_list (void)
{
  tree va_list_name;
  tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
21246 /* Create the type. */
21247 va_list_type
= lang_hooks
.types
.make_type (RECORD_TYPE
);
21248 /* Give it the required name. */
21249 va_list_name
= build_decl (BUILTINS_LOCATION
,
21251 get_identifier ("__va_list"),
21253 DECL_ARTIFICIAL (va_list_name
) = 1;
21254 TYPE_NAME (va_list_type
) = va_list_name
;
21255 TYPE_STUB_DECL (va_list_type
) = va_list_name
;
21257 /* Create the fields. */
21258 f_stack
= build_decl (BUILTINS_LOCATION
,
21259 FIELD_DECL
, get_identifier ("__stack"),
21261 f_grtop
= build_decl (BUILTINS_LOCATION
,
21262 FIELD_DECL
, get_identifier ("__gr_top"),
21264 f_vrtop
= build_decl (BUILTINS_LOCATION
,
21265 FIELD_DECL
, get_identifier ("__vr_top"),
21267 f_groff
= build_decl (BUILTINS_LOCATION
,
21268 FIELD_DECL
, get_identifier ("__gr_offs"),
21269 integer_type_node
);
21270 f_vroff
= build_decl (BUILTINS_LOCATION
,
21271 FIELD_DECL
, get_identifier ("__vr_offs"),
21272 integer_type_node
);
  /* Tell the tree-stdarg pass about our internal offset fields.
     NOTE: va_list_gpr/fpr_counter_field are only used for tree comparison
     purposes, to identify whether the code is updating the va_list internal
     offset fields in an irregular way.  */
21278 va_list_gpr_counter_field
= f_groff
;
21279 va_list_fpr_counter_field
= f_vroff
;
21281 DECL_ARTIFICIAL (f_stack
) = 1;
21282 DECL_ARTIFICIAL (f_grtop
) = 1;
21283 DECL_ARTIFICIAL (f_vrtop
) = 1;
21284 DECL_ARTIFICIAL (f_groff
) = 1;
21285 DECL_ARTIFICIAL (f_vroff
) = 1;
21287 DECL_FIELD_CONTEXT (f_stack
) = va_list_type
;
21288 DECL_FIELD_CONTEXT (f_grtop
) = va_list_type
;
21289 DECL_FIELD_CONTEXT (f_vrtop
) = va_list_type
;
21290 DECL_FIELD_CONTEXT (f_groff
) = va_list_type
;
21291 DECL_FIELD_CONTEXT (f_vroff
) = va_list_type
;
21293 TYPE_FIELDS (va_list_type
) = f_stack
;
21294 DECL_CHAIN (f_stack
) = f_grtop
;
21295 DECL_CHAIN (f_grtop
) = f_vrtop
;
21296 DECL_CHAIN (f_vrtop
) = f_groff
;
21297 DECL_CHAIN (f_groff
) = f_vroff
;
21299 /* Compute its layout. */
21300 layout_type (va_list_type
);
21302 return va_list_type
;
21305 /* Implement TARGET_EXPAND_BUILTIN_VA_START. */
21307 aarch64_expand_builtin_va_start (tree valist
, rtx nextarg ATTRIBUTE_UNUSED
)
21309 const CUMULATIVE_ARGS
*cum
;
21310 tree f_stack
, f_grtop
, f_vrtop
, f_groff
, f_vroff
;
21311 tree stack
, grtop
, vrtop
, groff
, vroff
;
21313 int gr_save_area_size
= cfun
->va_list_gpr_size
;
21314 int vr_save_area_size
= cfun
->va_list_fpr_size
;
21317 cum
= &crtl
->args
.info
;
21318 if (cfun
->va_list_gpr_size
)
21319 gr_save_area_size
= MIN ((NUM_ARG_REGS
- cum
->aapcs_ncrn
) * UNITS_PER_WORD
,
21320 cfun
->va_list_gpr_size
);
21321 if (cfun
->va_list_fpr_size
)
21322 vr_save_area_size
= MIN ((NUM_FP_ARG_REGS
- cum
->aapcs_nvrn
)
21323 * UNITS_PER_VREG
, cfun
->va_list_fpr_size
);
21327 gcc_assert (cum
->aapcs_nvrn
== 0);
21328 vr_save_area_size
= 0;
21331 f_stack
= TYPE_FIELDS (va_list_type_node
);
21332 f_grtop
= DECL_CHAIN (f_stack
);
21333 f_vrtop
= DECL_CHAIN (f_grtop
);
21334 f_groff
= DECL_CHAIN (f_vrtop
);
21335 f_vroff
= DECL_CHAIN (f_groff
);
21337 stack
= build3 (COMPONENT_REF
, TREE_TYPE (f_stack
), valist
, f_stack
,
21339 grtop
= build3 (COMPONENT_REF
, TREE_TYPE (f_grtop
), valist
, f_grtop
,
21341 vrtop
= build3 (COMPONENT_REF
, TREE_TYPE (f_vrtop
), valist
, f_vrtop
,
21343 groff
= build3 (COMPONENT_REF
, TREE_TYPE (f_groff
), valist
, f_groff
,
21345 vroff
= build3 (COMPONENT_REF
, TREE_TYPE (f_vroff
), valist
, f_vroff
,
21348 /* Emit code to initialize STACK, which points to the next varargs stack
21349 argument. CUM->AAPCS_STACK_SIZE gives the number of stack words used
21350 by named arguments. STACK is 8-byte aligned. */
21351 t
= make_tree (TREE_TYPE (stack
), virtual_incoming_args_rtx
);
21352 if (cum
->aapcs_stack_size
> 0)
21353 t
= fold_build_pointer_plus_hwi (t
, cum
->aapcs_stack_size
* UNITS_PER_WORD
);
21354 t
= build2 (MODIFY_EXPR
, TREE_TYPE (stack
), stack
, t
);
21355 expand_expr (t
, const0_rtx
, VOIDmode
, EXPAND_NORMAL
);
21357 /* Emit code to initialize GRTOP, the top of the GR save area.
21358 virtual_incoming_args_rtx should have been 16 byte aligned. */
21359 t
= make_tree (TREE_TYPE (grtop
), virtual_incoming_args_rtx
);
21360 t
= build2 (MODIFY_EXPR
, TREE_TYPE (grtop
), grtop
, t
);
21361 expand_expr (t
, const0_rtx
, VOIDmode
, EXPAND_NORMAL
);
21363 /* Emit code to initialize VRTOP, the top of the VR save area.
21364 This address is gr_save_area_bytes below GRTOP, rounded
21365 down to the next 16-byte boundary. */
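  /* Illustrative sketch of the resulting layout (not part of the
     implementation); higher addresses are at the top:

	 ...anonymous stack arguments	<-- __stack
	 named stack arguments
					<-- __gr_top (virtual_incoming_args_rtx)
	 GP register save area
	 padding to a 16-byte boundary
					<-- __vr_top
	 FP/SIMD register save area

     with __gr_offs = -gr_save_area_size and __vr_offs = -vr_save_area_size,
     both counting up towards zero as anonymous arguments are consumed.  */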
21366 t
= make_tree (TREE_TYPE (vrtop
), virtual_incoming_args_rtx
);
21367 vr_offset
= ROUND_UP (gr_save_area_size
,
21368 STACK_BOUNDARY
/ BITS_PER_UNIT
);
21371 t
= fold_build_pointer_plus_hwi (t
, -vr_offset
);
21372 t
= build2 (MODIFY_EXPR
, TREE_TYPE (vrtop
), vrtop
, t
);
21373 expand_expr (t
, const0_rtx
, VOIDmode
, EXPAND_NORMAL
);
21375 /* Emit code to initialize GROFF, the offset from GRTOP of the
21376 next GPR argument. */
21377 t
= build2 (MODIFY_EXPR
, TREE_TYPE (groff
), groff
,
21378 build_int_cst (TREE_TYPE (groff
), -gr_save_area_size
));
21379 expand_expr (t
, const0_rtx
, VOIDmode
, EXPAND_NORMAL
);
21381 /* Likewise emit code to initialize VROFF, the offset from FTOP
21382 of the next VR argument. */
21383 t
= build2 (MODIFY_EXPR
, TREE_TYPE (vroff
), vroff
,
21384 build_int_cst (TREE_TYPE (vroff
), -vr_save_area_size
));
21385 expand_expr (t
, const0_rtx
, VOIDmode
, EXPAND_NORMAL
);
21388 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR. */
21391 aarch64_gimplify_va_arg_expr (tree valist
, tree type
, gimple_seq
*pre_p
,
21392 gimple_seq
*post_p ATTRIBUTE_UNUSED
)
21396 bool is_ha
; /* is HFA or HVA. */
21397 bool dw_align
; /* double-word align. */
21398 machine_mode ag_mode
= VOIDmode
;
21402 tree f_stack
, f_grtop
, f_vrtop
, f_groff
, f_vroff
;
21403 tree stack
, f_top
, f_off
, off
, arg
, roundup
, on_stack
;
21404 HOST_WIDE_INT size
, rsize
, adjust
, align
;
21405 tree t
, u
, cond1
, cond2
;
21407 indirect_p
= pass_va_arg_by_reference (type
);
21409 type
= build_pointer_type (type
);
21411 mode
= TYPE_MODE (type
);
21413 f_stack
= TYPE_FIELDS (va_list_type_node
);
21414 f_grtop
= DECL_CHAIN (f_stack
);
21415 f_vrtop
= DECL_CHAIN (f_grtop
);
21416 f_groff
= DECL_CHAIN (f_vrtop
);
21417 f_vroff
= DECL_CHAIN (f_groff
);
21419 stack
= build3 (COMPONENT_REF
, TREE_TYPE (f_stack
), unshare_expr (valist
),
21420 f_stack
, NULL_TREE
);
21421 size
= int_size_in_bytes (type
);
21423 unsigned int abi_break_gcc_9
;
21424 unsigned int abi_break_gcc_13
;
21425 unsigned int abi_break_gcc_14
;
21427 = aarch64_function_arg_alignment (mode
, type
, &abi_break_gcc_9
,
21428 &abi_break_gcc_13
, &abi_break_gcc_14
)
21433 if (aarch64_vfp_is_call_or_return_candidate (mode
, type
, &ag_mode
, &nregs
,
21436 /* No frontends can create types with variable-sized modes, so we
21437 shouldn't be asked to pass or return them. */
21438 unsigned int ag_size
= GET_MODE_SIZE (ag_mode
).to_constant ();
21440 /* TYPE passed in fp/simd registers. */
21442 aarch64_err_no_fpadvsimd (mode
);
21444 f_top
= build3 (COMPONENT_REF
, TREE_TYPE (f_vrtop
),
21445 unshare_expr (valist
), f_vrtop
, NULL_TREE
);
21446 f_off
= build3 (COMPONENT_REF
, TREE_TYPE (f_vroff
),
21447 unshare_expr (valist
), f_vroff
, NULL_TREE
);
21449 rsize
= nregs
* UNITS_PER_VREG
;
21453 if (BYTES_BIG_ENDIAN
&& ag_size
< UNITS_PER_VREG
)
21454 adjust
= UNITS_PER_VREG
- ag_size
;
21456 else if (BLOCK_REG_PADDING (mode
, type
, 1) == PAD_DOWNWARD
21457 && size
< UNITS_PER_VREG
)
21459 adjust
= UNITS_PER_VREG
- size
;
21464 /* TYPE passed in general registers. */
21465 f_top
= build3 (COMPONENT_REF
, TREE_TYPE (f_grtop
),
21466 unshare_expr (valist
), f_grtop
, NULL_TREE
);
21467 f_off
= build3 (COMPONENT_REF
, TREE_TYPE (f_groff
),
21468 unshare_expr (valist
), f_groff
, NULL_TREE
);
21469 rsize
= ROUND_UP (size
, UNITS_PER_WORD
);
21470 nregs
= rsize
/ UNITS_PER_WORD
;
21473 && abi_break_gcc_13
21475 && !bitint_or_aggr_of_bitint_p (type
))
21476 inform (input_location
, "parameter passing for argument of type "
21477 "%qT changed in GCC 13.1", type
);
21480 && abi_break_gcc_14
21481 && (abi_break_gcc_14
> 8 * BITS_PER_UNIT
) != (align
> 8)
21482 && !bitint_or_aggr_of_bitint_p (type
))
21483 inform (input_location
, "parameter passing for argument of type "
21484 "%qT changed in GCC 14.1", type
);
21488 if (abi_break_gcc_9
21490 && !bitint_or_aggr_of_bitint_p (type
))
21491 inform (input_location
, "parameter passing for argument of type "
21492 "%qT changed in GCC 9.1", type
);
21496 if (BLOCK_REG_PADDING (mode
, type
, 1) == PAD_DOWNWARD
21497 && size
< UNITS_PER_WORD
)
21499 adjust
= UNITS_PER_WORD
- size
;
21503 /* Get a local temporary for the field value. */
21504 off
= get_initialized_tmp_var (f_off
, pre_p
, NULL
);
21506 /* Emit code to branch if off >= 0. */
21507 t
= build2 (GE_EXPR
, boolean_type_node
, off
,
21508 build_int_cst (TREE_TYPE (off
), 0));
21509 cond1
= build3 (COND_EXPR
, ptr_type_node
, t
, NULL_TREE
, NULL_TREE
);
21513 /* Emit: offs = (offs + 15) & -16. */
21514 t
= build2 (PLUS_EXPR
, TREE_TYPE (off
), off
,
21515 build_int_cst (TREE_TYPE (off
), 15));
21516 t
= build2 (BIT_AND_EXPR
, TREE_TYPE (off
), t
,
21517 build_int_cst (TREE_TYPE (off
), -16));
21518 roundup
= build2 (MODIFY_EXPR
, TREE_TYPE (off
), off
, t
);
21523 /* Update ap.__[g|v]r_offs */
21524 t
= build2 (PLUS_EXPR
, TREE_TYPE (off
), off
,
21525 build_int_cst (TREE_TYPE (off
), rsize
));
21526 t
= build2 (MODIFY_EXPR
, TREE_TYPE (f_off
), unshare_expr (f_off
), t
);
21530 t
= build2 (COMPOUND_EXPR
, TREE_TYPE (t
), roundup
, t
);
21532 /* [cond2] if (ap.__[g|v]r_offs > 0) */
21533 u
= build2 (GT_EXPR
, boolean_type_node
, unshare_expr (f_off
),
21534 build_int_cst (TREE_TYPE (f_off
), 0));
21535 cond2
= build3 (COND_EXPR
, ptr_type_node
, u
, NULL_TREE
, NULL_TREE
);
21537 /* String up: make sure the assignment happens before the use. */
21538 t
= build2 (COMPOUND_EXPR
, TREE_TYPE (cond2
), t
, cond2
);
21539 COND_EXPR_ELSE (cond1
) = t
;
21541 /* Prepare the trees handling the argument that is passed on the stack;
21542 the top level node will store in ON_STACK. */
21543 arg
= get_initialized_tmp_var (stack
, pre_p
, NULL
);
21546 /* if (alignof(type) > 8) (arg = arg + 15) & -16; */
21547 t
= fold_build_pointer_plus_hwi (arg
, 15);
21548 t
= build2 (BIT_AND_EXPR
, TREE_TYPE (t
), t
,
21549 build_int_cst (TREE_TYPE (t
), -16));
21550 roundup
= build2 (MODIFY_EXPR
, TREE_TYPE (arg
), arg
, t
);
21554 /* Advance ap.__stack */
21555 t
= fold_build_pointer_plus_hwi (arg
, size
+ 7);
21556 t
= build2 (BIT_AND_EXPR
, TREE_TYPE (t
), t
,
21557 build_int_cst (TREE_TYPE (t
), -8));
21558 t
= build2 (MODIFY_EXPR
, TREE_TYPE (stack
), unshare_expr (stack
), t
);
21559 /* String up roundup and advance. */
21561 t
= build2 (COMPOUND_EXPR
, TREE_TYPE (t
), roundup
, t
);
21562 /* String up with arg */
21563 on_stack
= build2 (COMPOUND_EXPR
, TREE_TYPE (arg
), t
, arg
);
21564 /* Big-endianness related address adjustment. */
21565 if (BLOCK_REG_PADDING (mode
, type
, 1) == PAD_DOWNWARD
21566 && size
< UNITS_PER_WORD
)
21568 t
= build2 (POINTER_PLUS_EXPR
, TREE_TYPE (arg
), arg
,
21569 size_int (UNITS_PER_WORD
- size
));
21570 on_stack
= build2 (COMPOUND_EXPR
, TREE_TYPE (arg
), on_stack
, t
);
21573 COND_EXPR_THEN (cond1
) = unshare_expr (on_stack
);
21574 COND_EXPR_THEN (cond2
) = unshare_expr (on_stack
);
21576 /* Adjustment to OFFSET in the case of BIG_ENDIAN. */
21579 t
= build2 (PREINCREMENT_EXPR
, TREE_TYPE (off
), off
,
21580 build_int_cst (TREE_TYPE (off
), adjust
));
21582 t
= fold_convert (sizetype
, t
);
21583 t
= build2 (POINTER_PLUS_EXPR
, TREE_TYPE (f_top
), f_top
, t
);
      /* type ha; // treat as "struct {ftype field[n];}"
	 ... [computing offs]
	 for (i = 0; i < nregs; ++i, offs += 16)
	   ha.field[i] = *((ftype *)(ap.__vr_top + offs));
	 return ha;  */
21593 tree tmp_ha
, field_t
, field_ptr_t
;
21595 /* Declare a local variable. */
21596 tmp_ha
= create_tmp_var_raw (type
, "ha");
21597 gimple_add_tmp_var (tmp_ha
);
21599 /* Establish the base type. */
21603 field_t
= float_type_node
;
21604 field_ptr_t
= float_ptr_type_node
;
21607 field_t
= double_type_node
;
21608 field_ptr_t
= double_ptr_type_node
;
21611 field_t
= long_double_type_node
;
21612 field_ptr_t
= long_double_ptr_type_node
;
21615 field_t
= dfloat32_type_node
;
21616 field_ptr_t
= build_pointer_type (dfloat32_type_node
);
21619 field_t
= dfloat64_type_node
;
21620 field_ptr_t
= build_pointer_type (dfloat64_type_node
);
21623 field_t
= dfloat128_type_node
;
21624 field_ptr_t
= build_pointer_type (dfloat128_type_node
);
21627 field_t
= aarch64_fp16_type_node
;
21628 field_ptr_t
= aarch64_fp16_ptr_type_node
;
21631 field_t
= bfloat16_type_node
;
21632 field_ptr_t
= aarch64_bf16_ptr_type_node
;
21637 tree innertype
= make_signed_type (GET_MODE_PRECISION (SImode
));
21638 field_t
= build_vector_type_for_mode (innertype
, ag_mode
);
21639 field_ptr_t
= build_pointer_type (field_t
);
21646 /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area */
21647 TREE_ADDRESSABLE (tmp_ha
) = 1;
21648 tmp_ha
= build1 (ADDR_EXPR
, field_ptr_t
, tmp_ha
);
21650 t
= fold_convert (field_ptr_t
, addr
);
21651 t
= build2 (MODIFY_EXPR
, field_t
,
21652 build1 (INDIRECT_REF
, field_t
, tmp_ha
),
21653 build1 (INDIRECT_REF
, field_t
, t
));
21655 /* ha.field[i] = *((field_ptr_t)vr_saved_area + i) */
21656 for (i
= 1; i
< nregs
; ++i
)
21658 addr
= fold_build_pointer_plus_hwi (addr
, UNITS_PER_VREG
);
21659 u
= fold_convert (field_ptr_t
, addr
);
21660 u
= build2 (MODIFY_EXPR
, field_t
,
21661 build2 (MEM_REF
, field_t
, tmp_ha
,
21662 build_int_cst (field_ptr_t
,
21664 int_size_in_bytes (field_t
)))),
21665 build1 (INDIRECT_REF
, field_t
, u
));
21666 t
= build2 (COMPOUND_EXPR
, TREE_TYPE (t
), t
, u
);
21669 u
= fold_convert (TREE_TYPE (f_top
), tmp_ha
);
21670 t
= build2 (COMPOUND_EXPR
, TREE_TYPE (f_top
), t
, u
);
21673 COND_EXPR_ELSE (cond2
) = t
;
21674 addr
= fold_convert (build_pointer_type (type
), cond1
);
21675 addr
= build_va_arg_indirect_ref (addr
);
21678 addr
= build_va_arg_indirect_ref (addr
);
21683 /* Implement TARGET_SETUP_INCOMING_VARARGS. */
21686 aarch64_setup_incoming_varargs (cumulative_args_t cum_v
,
21687 const function_arg_info
&arg
,
21688 int *pretend_size ATTRIBUTE_UNUSED
, int no_rtl
)
21690 CUMULATIVE_ARGS
*cum
= get_cumulative_args (cum_v
);
21691 CUMULATIVE_ARGS local_cum
;
21692 int gr_saved
= cfun
->va_list_gpr_size
;
21693 int vr_saved
= cfun
->va_list_fpr_size
;
21695 /* The caller has advanced CUM up to, but not beyond, the last named
21696 argument. Advance a local copy of CUM past the last "real" named
21697 argument, to find out how many registers are left over. */
21699 if (!TYPE_NO_NAMED_ARGS_STDARG_P (TREE_TYPE (current_function_decl
)))
21700 aarch64_function_arg_advance (pack_cumulative_args(&local_cum
), arg
);
  /* Find out how many registers we need to save.
     Honor the tree-stdarg analysis results.  */
21704 if (cfun
->va_list_gpr_size
)
21705 gr_saved
= MIN (NUM_ARG_REGS
- local_cum
.aapcs_ncrn
,
21706 cfun
->va_list_gpr_size
/ UNITS_PER_WORD
);
21707 if (cfun
->va_list_fpr_size
)
21708 vr_saved
= MIN (NUM_FP_ARG_REGS
- local_cum
.aapcs_nvrn
,
21709 cfun
->va_list_fpr_size
/ UNITS_PER_VREG
);
21713 gcc_assert (local_cum
.aapcs_nvrn
== 0);
21723 /* virtual_incoming_args_rtx should have been 16-byte aligned. */
21724 ptr
= plus_constant (Pmode
, virtual_incoming_args_rtx
,
21725 - gr_saved
* UNITS_PER_WORD
);
21726 mem
= gen_frame_mem (BLKmode
, ptr
);
21727 set_mem_alias_set (mem
, get_varargs_alias_set ());
21729 move_block_from_reg (local_cum
.aapcs_ncrn
+ R0_REGNUM
,
21734 /* We can't use move_block_from_reg, because it will use
21735 the wrong mode, storing D regs only. */
21736 machine_mode mode
= TImode
;
21737 int off
, i
, vr_start
;
21739 /* Set OFF to the offset from virtual_incoming_args_rtx of
21740 the first vector register. The VR save area lies below
21741 the GR one, and is aligned to 16 bytes. */
21742 off
= -ROUND_UP (gr_saved
* UNITS_PER_WORD
,
21743 STACK_BOUNDARY
/ BITS_PER_UNIT
);
21744 off
-= vr_saved
* UNITS_PER_VREG
;
21746 vr_start
= V0_REGNUM
+ local_cum
.aapcs_nvrn
;
21747 for (i
= 0; i
< vr_saved
; ++i
)
21751 ptr
= plus_constant (Pmode
, virtual_incoming_args_rtx
, off
);
21752 mem
= gen_frame_mem (mode
, ptr
);
21753 set_mem_alias_set (mem
, get_varargs_alias_set ());
21754 aarch64_emit_move (mem
, gen_rtx_REG (mode
, vr_start
+ i
));
21755 off
+= UNITS_PER_VREG
;
21760 /* We don't save the size into *PRETEND_SIZE because we want to avoid
21761 any complication of having crtl->args.pretend_args_size changed. */
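  /* Worked example (illustrative only): for a hypothetical prototype such
     as "void f (int a, int b, int c, double d, ...)" we have
     aapcs_ncrn == 3 and aapcs_nvrn == 1, so gr_saved == 5 and
     vr_saved == 7, giving

       saved_varargs_size = ROUND_UP (5 * 8, 16) + 7 * 16 = 48 + 112 = 160

     bytes, assuming the tree-stdarg limits do not shrink the areas.  */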
21762 cfun
->machine
->frame
.saved_varargs_size
21763 = (ROUND_UP (gr_saved
* UNITS_PER_WORD
,
21764 STACK_BOUNDARY
/ BITS_PER_UNIT
)
21765 + vr_saved
* UNITS_PER_VREG
);
21769 aarch64_conditional_register_usage (void)
21774 for (i
= V0_REGNUM
; i
<= V31_REGNUM
; i
++)
21777 call_used_regs
[i
] = 1;
21778 CLEAR_HARD_REG_BIT (operand_reg_set
, i
);
21782 for (i
= P0_REGNUM
; i
<= P15_REGNUM
; i
++)
21785 call_used_regs
[i
] = 1;
21788 /* Only allow these registers to be accessed via special patterns. */
21789 CLEAR_HARD_REG_BIT (operand_reg_set
, VG_REGNUM
);
21790 CLEAR_HARD_REG_BIT (operand_reg_set
, FFR_REGNUM
);
21791 CLEAR_HARD_REG_BIT (operand_reg_set
, FFRT_REGNUM
);
21792 for (int i
= FIRST_FAKE_REGNUM
; i
<= LAST_FAKE_REGNUM
; ++i
)
21793 CLEAR_HARD_REG_BIT (operand_reg_set
, i
);
21795 /* When tracking speculation, we need a couple of call-clobbered registers
21796 to track the speculation state. It would be nice to just use
21797 IP0 and IP1, but currently there are numerous places that just
21798 assume these registers are free for other uses (eg pointer
21799 authentication). */
21800 if (aarch64_track_speculation
)
21802 fixed_regs
[SPECULATION_TRACKER_REGNUM
] = 1;
21803 call_used_regs
[SPECULATION_TRACKER_REGNUM
] = 1;
21804 fixed_regs
[SPECULATION_SCRATCH_REGNUM
] = 1;
21805 call_used_regs
[SPECULATION_SCRATCH_REGNUM
] = 1;
21809 /* Implement TARGET_MEMBER_TYPE_FORCES_BLK. */
21812 aarch64_member_type_forces_blk (const_tree field_or_array
, machine_mode mode
)
21814 /* For records we're passed a FIELD_DECL, for arrays we're passed
21815 an ARRAY_TYPE. In both cases we're interested in the TREE_TYPE. */
21816 const_tree type
= TREE_TYPE (field_or_array
);
  /* Assign BLKmode to anything that contains more than 2 SVE predicates.
     For structures, the "multiple" case is indicated by MODE being
     VOIDmode.  */
21821 unsigned int num_zr
, num_pr
;
21822 if (aarch64_sve::builtin_type_p (type
, &num_zr
, &num_pr
) && num_pr
> 2)
21824 if (TREE_CODE (field_or_array
) == ARRAY_TYPE
)
21825 return !simple_cst_equal (TYPE_SIZE (field_or_array
),
21827 return mode
== VOIDmode
;
21830 return default_member_type_forces_blk (field_or_array
, mode
);
21833 /* Bitmasks that indicate whether earlier versions of GCC would have
21834 taken a different path through the ABI logic. This should result in
21835 a -Wpsabi warning if the earlier path led to a different ABI decision.
21837 WARN_PSABI_EMPTY_CXX17_BASE
21838 Indicates that the type includes an artificial empty C++17 base field
21839 that, prior to GCC 10.1, would prevent the type from being treated as
21840 a HFA or HVA. See PR94383 for details.
   WARN_PSABI_NO_UNIQUE_ADDRESS
      Indicates that the type includes an empty [[no_unique_address]] field
      that, prior to GCC 10.1, would prevent the type from being treated as
      a HFA or HVA.  */
21846 const unsigned int WARN_PSABI_EMPTY_CXX17_BASE
= 1U << 0;
21847 const unsigned int WARN_PSABI_NO_UNIQUE_ADDRESS
= 1U << 1;
21848 const unsigned int WARN_PSABI_ZERO_WIDTH_BITFIELD
= 1U << 2;
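/* For example (an illustrative sketch, not from the sources): given

     struct empty {};
     struct hfa { [[no_unique_address]] empty e; float x; float y; };

   GCC releases before 10.1 did not treat "hfa" as a homogeneous
   floating-point aggregate, so aapcs_vfp_sub_candidate records
   WARN_PSABI_NO_UNIQUE_ADDRESS for it and a -Wpsabi note is emitted.  */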
21850 /* Walk down the type tree of TYPE counting consecutive base elements.
21851 If *MODEP is VOIDmode, then set it to the first valid floating point
21852 type. If a non-floating point type is found, or if a floating point
21853 type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
21854 otherwise return the count in the sub-tree.
21856 The WARN_PSABI_FLAGS argument allows the caller to check whether this
21857 function has changed its behavior relative to earlier versions of GCC.
21858 Normally the argument should be nonnull and point to a zero-initialized
21859 variable. The function then records whether the ABI decision might
21860 be affected by a known fix to the ABI logic, setting the associated
21861 WARN_PSABI_* bits if so.
21863 When the argument is instead a null pointer, the function tries to
21864 simulate the behavior of GCC before all such ABI fixes were made.
21865 This is useful to check whether the function returns something
21866 different after the ABI fixes. */
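/* For instance (illustrative only), for

     struct pt { double x; double y; };

   the walk sets *MODEP to DFmode and returns 2, so "pt" is a homogeneous
   floating-point aggregate passed in two D registers; adding an "int"
   member would make the walk return -1 instead.  */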
21868 aapcs_vfp_sub_candidate (const_tree type
, machine_mode
*modep
,
21869 unsigned int *warn_psabi_flags
)
21872 HOST_WIDE_INT size
;
21874 if (aarch64_sve::builtin_type_p (type
))
21877 switch (TREE_CODE (type
))
21880 mode
= TYPE_MODE (type
);
21881 if (mode
!= DFmode
&& mode
!= SFmode
21882 && mode
!= TFmode
&& mode
!= HFmode
21883 && mode
!= SDmode
&& mode
!= DDmode
&& mode
!= TDmode
)
21886 if (*modep
== VOIDmode
)
21889 if (*modep
== mode
)
21895 mode
= TYPE_MODE (TREE_TYPE (type
));
21896 if (mode
!= DFmode
&& mode
!= SFmode
21897 && mode
!= TFmode
&& mode
!= HFmode
)
21900 if (*modep
== VOIDmode
)
21903 if (*modep
== mode
)
21909 /* Use V2SImode and V4SImode as representatives of all 64-bit
21910 and 128-bit vector types. */
21911 size
= int_size_in_bytes (type
);
21924 if (*modep
== VOIDmode
)
21927 /* Vector modes are considered to be opaque: two vectors are
21928 equivalent for the purposes of being homogeneous aggregates
21929 if they are the same size. */
21930 if (*modep
== mode
)
21938 tree index
= TYPE_DOMAIN (type
);
	/* Can't handle incomplete types nor sizes that are not
	   fixed size.  */
21942 if (!COMPLETE_TYPE_P (type
)
21943 || TREE_CODE (TYPE_SIZE (type
)) != INTEGER_CST
)
21946 count
= aapcs_vfp_sub_candidate (TREE_TYPE (type
), modep
,
21950 || !TYPE_MAX_VALUE (index
)
21951 || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index
))
21952 || !TYPE_MIN_VALUE (index
)
21953 || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index
))
21957 count
*= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index
))
21958 - tree_to_uhwi (TYPE_MIN_VALUE (index
)));
21960 /* There must be no padding. */
21961 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type
)),
21962 count
* GET_MODE_BITSIZE (*modep
)))
	/* Can't handle incomplete types nor sizes that are not
	   fixed size.  */
21976 if (!COMPLETE_TYPE_P (type
)
21977 || TREE_CODE (TYPE_SIZE (type
)) != INTEGER_CST
)
21980 for (field
= TYPE_FIELDS (type
); field
; field
= TREE_CHAIN (field
))
21982 if (TREE_CODE (field
) != FIELD_DECL
)
21985 if (DECL_FIELD_ABI_IGNORED (field
))
21987 /* See whether this is something that earlier versions of
21988 GCC failed to ignore. */
21990 if (lookup_attribute ("no_unique_address",
21991 DECL_ATTRIBUTES (field
)))
21992 flag
= WARN_PSABI_NO_UNIQUE_ADDRESS
;
21993 else if (cxx17_empty_base_field_p (field
))
21994 flag
= WARN_PSABI_EMPTY_CXX17_BASE
;
21996 /* No compatibility problem. */
21999 /* Simulate the old behavior when WARN_PSABI_FLAGS is null. */
22000 if (warn_psabi_flags
)
22002 *warn_psabi_flags
|= flag
;
22006 /* A zero-width bitfield may affect layout in some
22007 circumstances, but adds no members. The determination
22008 of whether or not a type is an HFA is performed after
22009 layout is complete, so if the type still looks like an
22010 HFA afterwards, it is still classed as one. This is
22011 potentially an ABI break for the hard-float ABI. */
22012 else if (DECL_BIT_FIELD (field
)
22013 && integer_zerop (DECL_SIZE (field
)))
		/* Prior to GCC 12 these fields were stripped early,
		   hiding them from the back end entirely and
		   resulting in the correct behaviour for argument
		   passing.  Simulate that old behaviour without
		   generating a warning.  */
22020 if (DECL_FIELD_CXX_ZERO_WIDTH_BIT_FIELD (field
))
22022 if (warn_psabi_flags
)
22024 *warn_psabi_flags
|= WARN_PSABI_ZERO_WIDTH_BITFIELD
;
22029 sub_count
= aapcs_vfp_sub_candidate (TREE_TYPE (field
), modep
,
22033 count
+= sub_count
;
22036 /* There must be no padding. */
22037 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type
)),
22038 count
* GET_MODE_BITSIZE (*modep
)))
22045 case QUAL_UNION_TYPE
:
22047 /* These aren't very interesting except in a degenerate case. */
	/* Can't handle incomplete types nor sizes that are not
	   fixed size.  */
22054 if (!COMPLETE_TYPE_P (type
)
22055 || TREE_CODE (TYPE_SIZE (type
)) != INTEGER_CST
)
22058 for (field
= TYPE_FIELDS (type
); field
; field
= TREE_CHAIN (field
))
22060 if (TREE_CODE (field
) != FIELD_DECL
)
22063 sub_count
= aapcs_vfp_sub_candidate (TREE_TYPE (field
), modep
,
22067 count
= count
> sub_count
? count
: sub_count
;
22070 /* There must be no padding. */
22071 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type
)),
22072 count
* GET_MODE_BITSIZE (*modep
)))
22085 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
22086 type as described in AAPCS64 \S 4.1.2.
22088 See the comment above aarch64_composite_type_p for the notes on MODE. */
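/* For example (illustrative only), a 64-bit int32x2_t (V2SImode) and a
   128-bit int32x4_t (V4SImode) are short vectors in this sense, whereas
   an SVE svint32_t is not, even if the runtime vector length happens to
   be 128 bits.  */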
22091 aarch64_short_vector_p (const_tree type
,
22094 poly_int64 size
= -1;
22096 if (type
&& VECTOR_TYPE_P (type
))
22098 if (aarch64_sve::builtin_type_p (type
))
22100 size
= int_size_in_bytes (type
);
22102 else if (GET_MODE_CLASS (mode
) == MODE_VECTOR_INT
22103 || GET_MODE_CLASS (mode
) == MODE_VECTOR_FLOAT
)
22105 /* The containing "else if" is too loose: it means that we look at TYPE
22106 if the type is a vector type (good), but that we otherwise ignore TYPE
22107 and look only at the mode. This is wrong because the type describes
22108 the language-level information whereas the mode is purely an internal
22109 GCC concept. We can therefore reach here for types that are not
22110 vectors in the AAPCS64 sense.
22112 We can't "fix" that for the traditional Advanced SIMD vector modes
22113 without breaking backwards compatibility. However, there's no such
22114 baggage for the structure modes, which were introduced in GCC 12. */
22115 if (aarch64_advsimd_struct_mode_p (mode
))
22118 /* For similar reasons, rely only on the type, not the mode, when
22119 processing SVE types. */
22120 if (type
&& aarch64_some_values_include_pst_objects_p (type
))
22121 /* Leave later code to report an error if SVE is disabled. */
22122 gcc_assert (!TARGET_SVE
|| aarch64_sve_mode_p (mode
));
22124 size
= GET_MODE_SIZE (mode
);
22126 if (known_eq (size
, 8) || known_eq (size
, 16))
22128 /* 64-bit and 128-bit vectors should only acquire an SVE mode if
22129 they are being treated as scalable AAPCS64 types. */
22130 gcc_assert (!aarch64_sve_mode_p (mode
)
22131 && !aarch64_advsimd_struct_mode_p (mode
));
22137 /* Return TRUE if the type, as described by TYPE and MODE, is a composite
22138 type as described in AAPCS64 \S 4.3. This includes aggregate, union and
22139 array types. The C99 floating-point complex types are also considered
22140 as composite types, according to AAPCS64 \S 7.1.1. The complex integer
22141 types, which are GCC extensions and out of the scope of AAPCS64, are
22142 treated as composite types here as well.
22144 Note that MODE itself is not sufficient in determining whether a type
22145 is such a composite type or not. This is because
22146 stor-layout.cc:compute_record_mode may have already changed the MODE
22147 (BLKmode) of a RECORD_TYPE TYPE to some other mode. For example, a
22148 structure with only one field may have its MODE set to the mode of the
22149 field. Also an integer mode whose size matches the size of the
22150 RECORD_TYPE type may be used to substitute the original mode
22151 (i.e. BLKmode) in certain circumstances. In other words, MODE cannot be
22152 solely relied on. */
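/* For example (illustrative only), "struct s { double d; };" may end up
   with DFmode after compute_record_mode, but it is still a composite type
   for the AAPCS64 and must therefore be classified from TYPE, not MODE.  */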
22155 aarch64_composite_type_p (const_tree type
,
22158 if (aarch64_short_vector_p (type
, mode
))
22161 if (type
&& (AGGREGATE_TYPE_P (type
) || TREE_CODE (type
) == COMPLEX_TYPE
))
22165 && TREE_CODE (type
) == BITINT_TYPE
22166 && int_size_in_bytes (type
) > 16)
22169 if (mode
== BLKmode
22170 || GET_MODE_CLASS (mode
) == MODE_COMPLEX_FLOAT
22171 || GET_MODE_CLASS (mode
) == MODE_COMPLEX_INT
)
22177 /* Return TRUE if an argument, whose type is described by TYPE and MODE,
22178 shall be passed or returned in simd/fp register(s) (providing these
22179 parameter passing registers are available).
22181 Upon successful return, *COUNT returns the number of needed registers,
22182 *BASE_MODE returns the mode of the individual register and when IS_HA
22183 is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
22184 floating-point aggregate or a homogeneous short-vector aggregate.
22186 SILENT_P is true if the function should refrain from reporting any
22187 diagnostics. This should only be used if the caller is certain that
22188 any ABI decisions would eventually come through this function with
22189 SILENT_P set to false. */
22192 aarch64_vfp_is_call_or_return_candidate (machine_mode mode
,
22194 machine_mode
*base_mode
,
22199 if (is_ha
!= NULL
) *is_ha
= false;
22201 machine_mode new_mode
= VOIDmode
;
22202 bool composite_p
= aarch64_composite_type_p (type
, mode
);
22205 && (GET_MODE_CLASS (mode
) == MODE_FLOAT
22206 || GET_MODE_CLASS (mode
) == MODE_DECIMAL_FLOAT
))
22207 || aarch64_short_vector_p (type
, mode
))
22212 else if (GET_MODE_CLASS (mode
) == MODE_COMPLEX_FLOAT
)
22214 if (is_ha
!= NULL
) *is_ha
= true;
22216 new_mode
= GET_MODE_INNER (mode
);
22218 else if (type
&& composite_p
)
22220 unsigned int warn_psabi_flags
= 0;
22221 int ag_count
= aapcs_vfp_sub_candidate (type
, &new_mode
,
22222 &warn_psabi_flags
);
22223 if (ag_count
> 0 && ag_count
<= HA_MAX_NUM_FLDS
)
22225 static unsigned last_reported_type_uid
;
22226 unsigned uid
= TYPE_UID (TYPE_MAIN_VARIANT (type
));
22230 && warn_psabi_flags
22231 && uid
!= last_reported_type_uid
22232 && ((alt
= aapcs_vfp_sub_candidate (type
, &new_mode
, NULL
))
22236 = CHANGES_ROOT_URL
"gcc-10/changes.html#empty_base";
22238 = CHANGES_ROOT_URL
"gcc-12/changes.html#zero_width_bitfields";
22239 gcc_assert (alt
== -1);
22240 last_reported_type_uid
= uid
;
22241 /* Use TYPE_MAIN_VARIANT to strip any redundant const
22243 if (warn_psabi_flags
& WARN_PSABI_NO_UNIQUE_ADDRESS
)
22244 inform (input_location
, "parameter passing for argument of "
22245 "type %qT with %<[[no_unique_address]]%> members "
22246 "changed %{in GCC 10.1%}",
22247 TYPE_MAIN_VARIANT (type
), url10
);
22248 else if (warn_psabi_flags
& WARN_PSABI_EMPTY_CXX17_BASE
)
22249 inform (input_location
, "parameter passing for argument of "
22250 "type %qT when C++17 is enabled changed to match "
22251 "C++14 %{in GCC 10.1%}",
22252 TYPE_MAIN_VARIANT (type
), url10
);
22253 else if (warn_psabi_flags
& WARN_PSABI_ZERO_WIDTH_BITFIELD
)
22254 inform (input_location
, "parameter passing for argument of "
22255 "type %qT changed %{in GCC 12.1%}",
22256 TYPE_MAIN_VARIANT (type
), url12
);
22259 if (is_ha
!= NULL
) *is_ha
= true;
22268 gcc_assert (!aarch64_sve_mode_p (new_mode
));
22269 *base_mode
= new_mode
;
22273 /* Implement TARGET_STRUCT_VALUE_RTX. */
22276 aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED
,
22277 int incoming ATTRIBUTE_UNUSED
)
22279 return gen_rtx_REG (Pmode
, AARCH64_STRUCT_VALUE_REGNUM
);
22282 /* Implements target hook vector_mode_supported_p. */
22284 aarch64_vector_mode_supported_p (machine_mode mode
)
22286 unsigned int vec_flags
= aarch64_classify_vector_mode (mode
);
22287 return vec_flags
!= 0 && (vec_flags
& VEC_STRUCT
) == 0;
22290 /* Implements target hook vector_mode_supported_any_target_p. */
22292 aarch64_vector_mode_supported_any_target_p (machine_mode mode
)
22294 unsigned int vec_flags
= aarch64_classify_vector_mode (mode
, true);
22295 return vec_flags
!= 0 && (vec_flags
& VEC_STRUCT
) == 0;
/* Return the full-width SVE vector mode for element mode MODE, if one
   exists.  */
22301 aarch64_full_sve_mode (scalar_mode mode
)
22320 return VNx16QImode
;
22322 return opt_machine_mode ();
/* Return the 128-bit Advanced SIMD vector mode for element mode MODE,
   if it exists.  */
22329 aarch64_vq_mode (scalar_mode mode
)
22350 return opt_machine_mode ();
22354 /* Return appropriate SIMD container
22355 for MODE within a vector of WIDTH bits. */
22356 static machine_mode
22357 aarch64_simd_container_mode (scalar_mode mode
, poly_int64 width
)
22360 && maybe_ne (width
, 128)
22361 && known_eq (width
, BITS_PER_SVE_VECTOR
))
22362 return aarch64_full_sve_mode (mode
).else_mode (word_mode
);
22364 gcc_assert (known_eq (width
, 64) || known_eq (width
, 128));
22365 if (TARGET_BASE_SIMD
)
22367 if (known_eq (width
, 128))
22368 return aarch64_vq_mode (mode
).else_mode (word_mode
);
22391 /* Compare an SVE mode SVE_M and an Advanced SIMD mode ASIMD_M
22392 and return whether the SVE mode should be preferred over the
22393 Advanced SIMD one in aarch64_autovectorize_vector_modes. */
22395 aarch64_cmp_autovec_modes (machine_mode sve_m
, machine_mode asimd_m
)
22397 /* Take into account the aarch64-autovec-preference param if non-zero. */
22398 bool only_asimd_p
= aarch64_autovec_preference
== AARCH64_AUTOVEC_ASIMD_ONLY
;
22399 bool only_sve_p
= aarch64_autovec_preference
== AARCH64_AUTOVEC_SVE_ONLY
;
22406 /* The preference in case of a tie in costs. */
22407 bool prefer_asimd
= aarch64_autovec_preference
== AARCH64_AUTOVEC_PREFER_ASIMD
;
22408 bool prefer_sve
= aarch64_autovec_preference
== AARCH64_AUTOVEC_PREFER_SVE
;
22410 poly_int64 nunits_sve
= GET_MODE_NUNITS (sve_m
);
22411 poly_int64 nunits_asimd
= GET_MODE_NUNITS (asimd_m
);
22412 /* If the CPU information does not have an SVE width registered use the
22413 generic poly_int comparison that prefers SVE. If a preference is
22414 explicitly requested avoid this path. */
22415 if (aarch64_tune_params
.sve_width
== SVE_SCALABLE
22418 return maybe_gt (nunits_sve
, nunits_asimd
);
22420 /* Otherwise estimate the runtime width of the modes involved. */
22421 HOST_WIDE_INT est_sve
= estimated_poly_value (nunits_sve
);
22422 HOST_WIDE_INT est_asimd
= estimated_poly_value (nunits_asimd
);
22424 /* Preferring SVE means picking it first unless the Advanced SIMD mode
22425 is clearly wider. */
22427 return est_sve
>= est_asimd
;
22428 /* Conversely, preferring Advanced SIMD means picking SVE only if SVE
22429 is clearly wider. */
22431 return est_sve
> est_asimd
;
22433 /* In the default case prefer Advanced SIMD over SVE in case of a tie. */
22434 return est_sve
> est_asimd
;
22437 /* Return 128-bit container as the preferred SIMD mode for MODE. */
22438 static machine_mode
22439 aarch64_preferred_simd_mode (scalar_mode mode
)
22441 /* Take into account explicit auto-vectorization ISA preferences through
22442 aarch64_cmp_autovec_modes. */
22443 if (TARGET_SVE
&& aarch64_cmp_autovec_modes (VNx16QImode
, V16QImode
))
22444 return aarch64_full_sve_mode (mode
).else_mode (word_mode
);
22446 return aarch64_vq_mode (mode
).else_mode (word_mode
);
22450 /* Return a list of possible vector sizes for the vectorizer
22451 to iterate over. */
22452 static unsigned int
22453 aarch64_autovectorize_vector_modes (vector_modes
*modes
, bool)
22455 static const machine_mode sve_modes
[] = {
22456 /* Try using full vectors for all element types. */
22459 /* Try using 16-bit containers for 8-bit elements and full vectors
22460 for wider elements. */
22463 /* Try using 32-bit containers for 8-bit and 16-bit elements and
22464 full vectors for wider elements. */
22467 /* Try using 64-bit containers for all element types. */
22471 static const machine_mode advsimd_modes
[] = {
22472 /* Try using 128-bit vectors for all element types. */
22475 /* Try using 64-bit vectors for 8-bit elements and 128-bit vectors
22476 for wider elements. */
22479 /* Try using 64-bit vectors for 16-bit elements and 128-bit vectors
22480 for wider elements.
22482 TODO: We could support a limited form of V4QImode too, so that
22483 we use 32-bit vectors for 8-bit elements. */
22486 /* Try using 64-bit vectors for 32-bit elements and 128-bit vectors
22487 for 64-bit elements.
22489 TODO: We could similarly support limited forms of V2QImode and V2HImode
22494 /* Try using N-byte SVE modes only after trying N-byte Advanced SIMD mode.
22497 - If we can't use N-byte Advanced SIMD vectors then the placement
22498 doesn't matter; we'll just continue as though the Advanced SIMD
22499 entry didn't exist.
22501 - If an SVE main loop with N bytes ends up being cheaper than an
22502 Advanced SIMD main loop with N bytes then by default we'll replace
22503 the Advanced SIMD version with the SVE one.
22505 - If an Advanced SIMD main loop with N bytes ends up being cheaper
22506 than an SVE main loop with N bytes then by default we'll try to
22507 use the SVE loop to vectorize the epilogue instead. */
22509 bool only_asimd_p
= aarch64_autovec_preference
== AARCH64_AUTOVEC_ASIMD_ONLY
;
22510 bool only_sve_p
= aarch64_autovec_preference
== AARCH64_AUTOVEC_SVE_ONLY
;
22512 unsigned int sve_i
= (TARGET_SVE
&& !only_asimd_p
) ? 0 : ARRAY_SIZE (sve_modes
);
22513 unsigned int advsimd_i
= 0;
22515 while (!only_sve_p
&& advsimd_i
< ARRAY_SIZE (advsimd_modes
))
22517 if (sve_i
< ARRAY_SIZE (sve_modes
)
22518 && aarch64_cmp_autovec_modes (sve_modes
[sve_i
],
22519 advsimd_modes
[advsimd_i
]))
22520 modes
->safe_push (sve_modes
[sve_i
++]);
22522 modes
->safe_push (advsimd_modes
[advsimd_i
++]);
22524 while (sve_i
< ARRAY_SIZE (sve_modes
))
22525 modes
->safe_push (sve_modes
[sve_i
++]);
22527 unsigned int flags
= 0;
22528 if (aarch64_vect_compare_costs
)
22529 flags
|= VECT_COMPARE_COSTS
;
22533 /* Implement TARGET_MANGLE_TYPE. */
22535 static const char *
22536 aarch64_mangle_type (const_tree type
)
22538 /* The AArch64 ABI documents say that "__va_list" has to be
22539 mangled as if it is in the "std" namespace. */
22540 if (lang_hooks
.types_compatible_p (CONST_CAST_TREE (type
), va_list_type
))
22541 return "St9__va_list";
22543 /* Half-precision floating point types. */
22544 if (SCALAR_FLOAT_TYPE_P (type
) && TYPE_PRECISION (type
) == 16)
22546 if (TYPE_MAIN_VARIANT (type
) == float16_type_node
)
22548 if (TYPE_MODE (type
) == BFmode
)
22554 /* Modal 8 bit floating point types. */
22555 if (TYPE_MAIN_VARIANT (type
) == aarch64_mfp8_type_node
)
  /* Mangle AArch64-specific internal types.  TYPE_NAME is non-NULL_TREE for
     builtin types.  */
22560 if (TYPE_NAME (type
) != NULL
)
22563 if ((res
= aarch64_general_mangle_builtin_type (type
))
22564 || (res
= aarch64_sve::mangle_builtin_type (type
)))
22568 /* Use the default mangling. */
22572 /* Implement TARGET_INVALID_CONVERSION. */
22574 static const char *
22575 aarch64_invalid_conversion (const_tree fromtype
, const_tree totype
)
22577 /* Do not allow conversions to/from FP8. But do allow conversions between
22578 volatile and const variants of __mfp8. */
22579 bool fromtype_is_fp8
22580 = (TYPE_MAIN_VARIANT (fromtype
) == aarch64_mfp8_type_node
);
22581 bool totype_is_fp8
= (TYPE_MAIN_VARIANT (totype
) == aarch64_mfp8_type_node
);
22583 if (fromtype_is_fp8
&& totype_is_fp8
)
22586 if (fromtype_is_fp8
)
22587 return N_ ("invalid conversion from type %<mfloat8_t%>");
22589 return N_ ("invalid conversion to type %<mfloat8_t%>");
22591 /* Conversion allowed. */
22595 /* Implement TARGET_VERIFY_TYPE_CONTEXT. */
22598 aarch64_verify_type_context (location_t loc
, type_context_kind context
,
22599 const_tree type
, bool silent_p
)
22601 return aarch64_sve::verify_type_context (loc
, context
, type
, silent_p
);
/* Find the first rtx_insn before insn that will generate an assembly
   instruction.  */
22608 aarch64_prev_real_insn (rtx_insn
*insn
)
22615 insn
= prev_real_insn (insn
);
22617 while (insn
&& recog_memoized (insn
) < 0);
22623 is_madd_op (enum attr_type t1
)
22626 /* A number of these may be AArch32 only. */
22627 enum attr_type mlatypes
[] = {
22628 TYPE_MLA
, TYPE_MLAS
, TYPE_SMLAD
, TYPE_SMLADX
, TYPE_SMLAL
, TYPE_SMLALD
,
22629 TYPE_SMLALS
, TYPE_SMLALXY
, TYPE_SMLAWX
, TYPE_SMLAWY
, TYPE_SMLAXY
,
22630 TYPE_SMMLA
, TYPE_UMLAL
, TYPE_UMLALS
,TYPE_SMLSD
, TYPE_SMLSDX
, TYPE_SMLSLD
22633 for (i
= 0; i
< ARRAY_SIZE (mlatypes
); i
++)
22635 if (t1
== mlatypes
[i
])
22642 /* Check if there is a register dependency between a load and the insn
22643 for which we hold recog_data. */
22646 dep_between_memop_and_curr (rtx memop
)
22651 gcc_assert (GET_CODE (memop
) == SET
);
22653 if (!REG_P (SET_DEST (memop
)))
22656 load_reg
= SET_DEST (memop
);
22657 for (opno
= 1; opno
< recog_data
.n_operands
; opno
++)
22659 rtx operand
= recog_data
.operand
[opno
];
22660 if (REG_P (operand
)
22661 && reg_overlap_mentioned_p (load_reg
, operand
))
22669 /* When working around the Cortex-A53 erratum 835769,
22670 given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
22671 instruction and has a preceding memory instruction such that a NOP
22672 should be inserted between them. */
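/* Illustrative only: for a sequence such as

     ldr  x1, [x2]
     madd x0, x3, x4, x0

   the workaround makes the final output read

     ldr  x1, [x2]
     nop		// between mem op and mult-accumulate
     madd x0, x3, x4, x0

   so the load and the 64-bit multiply-accumulate never appear
   back-to-back in the issue stream.  */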
22675 aarch64_madd_needs_nop (rtx_insn
* insn
)
22677 enum attr_type attr_type
;
22681 if (!TARGET_FIX_ERR_A53_835769
)
22684 if (!INSN_P (insn
) || recog_memoized (insn
) < 0)
22687 attr_type
= get_attr_type (insn
);
22688 if (!is_madd_op (attr_type
))
22691 prev
= aarch64_prev_real_insn (insn
);
22692 /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
22693 Restore recog state to INSN to avoid state corruption. */
22694 extract_constrain_insn_cached (insn
);
22696 if (!prev
|| !contains_mem_rtx_p (PATTERN (prev
)))
22699 body
= single_set (prev
);
22701 /* If the previous insn is a memory op and there is no dependency between
22702 it and the DImode madd, emit a NOP between them. If body is NULL then we
22703 have a complex memory operation, probably a load/store pair.
22704 Be conservative for now and emit a NOP. */
22705 if (GET_MODE (recog_data
.operand
[0]) == DImode
22706 && (!body
|| !dep_between_memop_and_curr (body
)))
22714 /* Implement FINAL_PRESCAN_INSN. */
22717 aarch64_final_prescan_insn (rtx_insn
*insn
)
22719 if (aarch64_madd_needs_nop (insn
))
22720 fprintf (asm_out_file
, "\tnop // between mem op and mult-accumulate\n");
/* Return true if BASE_OR_STEP is a valid immediate operand for an SVE INDEX
   instruction.  */
22728 aarch64_sve_index_immediate_p (rtx base_or_step
)
22730 return (CONST_INT_P (base_or_step
)
22731 && IN_RANGE (INTVAL (base_or_step
), -16, 15));
22734 /* Return true if X is a valid immediate for the SVE ADD and SUB instructions
22735 when applied to mode MODE. Negate X first if NEGATE_P is true. */
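/* Illustrative only: the immediate is an unsigned 8-bit value, optionally
   shifted left by 8.  So 0xff and 0x1100 (0x11, LSL #8) are accepted for
   .h or wider elements, while 0x101 is rejected because it needs nonzero
   bits in both bytes.  */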
22738 aarch64_sve_arith_immediate_p (machine_mode mode
, rtx x
, bool negate_p
)
22740 rtx elt
= unwrap_const_vec_duplicate (x
);
22741 if (!CONST_INT_P (elt
))
22744 HOST_WIDE_INT val
= INTVAL (elt
);
22747 val
&= GET_MODE_MASK (GET_MODE_INNER (mode
));
22750 return IN_RANGE (val
, 0, 0xff);
22751 return IN_RANGE (val
, 0, 0xff00);
/* Return true if X is a valid immediate for the SVE SQADD and SQSUB
   instructions when applied to mode MODE.  Negate X first if NEGATE_P
   is true.  */
22759 aarch64_sve_sqadd_sqsub_immediate_p (machine_mode mode
, rtx x
, bool negate_p
)
22761 if (!aarch64_sve_arith_immediate_p (mode
, x
, negate_p
))
22764 /* After the optional negation, the immediate must be nonnegative.
22765 E.g. a saturating add of -127 must be done via SQSUB Zn.B, Zn.B, #127
22766 instead of SQADD Zn.B, Zn.B, #129. */
22767 rtx elt
= unwrap_const_vec_duplicate (x
);
22768 return negate_p
== (INTVAL (elt
) < 0);
22771 /* Return true if X is a valid immediate operand for an SVE logical
22772 instruction such as AND. */
22775 aarch64_sve_bitmask_immediate_p (rtx x
)
22779 return (const_vec_duplicate_p (x
, &elt
)
22780 && CONST_INT_P (elt
)
22781 && aarch64_bitmask_imm (INTVAL (elt
),
22782 GET_MODE_INNER (GET_MODE (x
))));
/* Return true if X is a valid immediate for the SVE DUP and CPY
   instructions.  */
22789 aarch64_sve_dup_immediate_p (rtx x
)
22791 x
= aarch64_bit_representation (unwrap_const_vec_duplicate (x
));
22792 if (!CONST_INT_P (x
))
22795 HOST_WIDE_INT val
= INTVAL (x
);
22797 return IN_RANGE (val
, -0x80, 0x7f);
22798 return IN_RANGE (val
, -0x8000, 0x7f00);
22801 /* Return true if X is a valid immediate operand for an SVE CMP instruction.
22802 SIGNED_P says whether the operand is signed rather than unsigned. */
22805 aarch64_sve_cmp_immediate_p (rtx x
, bool signed_p
)
22807 x
= unwrap_const_vec_duplicate (x
);
22808 return (CONST_INT_P (x
)
22810 ? IN_RANGE (INTVAL (x
), -16, 15)
22811 : IN_RANGE (INTVAL (x
), 0, 127)));
22814 /* Return true if X is a valid immediate operand for an SVE FADD or FSUB
22815 instruction. Negate X first if NEGATE_P is true. */
22818 aarch64_sve_float_arith_immediate_p (rtx x
, bool negate_p
)
22823 if (!const_vec_duplicate_p (x
, &elt
)
22824 || !CONST_DOUBLE_P (elt
))
22827 r
= *CONST_DOUBLE_REAL_VALUE (elt
);
22830 r
= real_value_negate (&r
);
22832 if (real_equal (&r
, &dconst1
))
22834 if (real_equal (&r
, &dconsthalf
))
/* Return true if X is a valid immediate operand for an SVE FMUL
   instruction.  */
22843 aarch64_sve_float_mul_immediate_p (rtx x
)
22847 return (const_vec_duplicate_p (x
, &elt
)
22848 && CONST_DOUBLE_P (elt
)
22849 && (real_equal (CONST_DOUBLE_REAL_VALUE (elt
), &dconsthalf
)
22850 || real_equal (CONST_DOUBLE_REAL_VALUE (elt
), &dconst2
)));
22853 /* Return true if replicating VAL32 is a valid 2-byte or 4-byte immediate
22854 for the Advanced SIMD operation described by WHICH and INSN. If INFO
22855 is nonnull, use it to describe valid immediates. */
22857 aarch64_advsimd_valid_immediate_hs (unsigned int val32
,
22858 simd_immediate_info
*info
,
22859 enum simd_immediate_check which
,
22860 simd_immediate_info::insn_type insn
)
22862 /* Try a 4-byte immediate with LSL. */
22863 for (unsigned int shift
= 0; shift
< 32; shift
+= 8)
22864 if ((val32
& (0xff << shift
)) == val32
)
22867 *info
= simd_immediate_info (SImode
, val32
>> shift
, insn
,
22868 simd_immediate_info::LSL
, shift
);
22872 /* Try a 2-byte immediate with LSL. */
22873 unsigned int imm16
= val32
& 0xffff;
22874 if (imm16
== (val32
>> 16))
22875 for (unsigned int shift
= 0; shift
< 16; shift
+= 8)
22876 if ((imm16
& (0xff << shift
)) == imm16
)
22879 *info
= simd_immediate_info (HImode
, imm16
>> shift
, insn
,
22880 simd_immediate_info::LSL
, shift
);
  /* Try a 4-byte immediate with MSL, except for cases that MVN
     can handle.  */
22886 if (which
== AARCH64_CHECK_MOV
)
22887 for (unsigned int shift
= 8; shift
< 24; shift
+= 8)
22889 unsigned int low
= (1 << shift
) - 1;
22890 if (((val32
& (0xff << shift
)) | low
) == val32
)
22893 *info
= simd_immediate_info (SImode
, val32
>> shift
, insn
,
22894 simd_immediate_info::MSL
, shift
);
22902 /* Return true if replicating VAL64 is a valid immediate for the
22903 Advanced SIMD operation described by WHICH. If INFO is nonnull,
22904 use it to describe valid immediates. */
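/* Worked examples (illustrative only, not from the sources):

     0x0001000100010001  ->  2-byte pattern, e.g. "movi vD.8h, #1" for a
			     128-bit vector.
     0x4242424242424242  ->  replicated byte, "movi vD.16b, #0x42".
     0xff0000ff00ffff00  ->  every byte is 0x00 or 0xff, so the 64-bit
			     bit-to-bytemask form of MOVI is used.

   A value such as 0x0102030405060708 matches none of the cases and is
   not a valid Advanced SIMD immediate.  */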
22906 aarch64_advsimd_valid_immediate (unsigned HOST_WIDE_INT val64
,
22907 simd_immediate_info
*info
,
22908 enum simd_immediate_check which
)
22910 unsigned int val32
= val64
& 0xffffffff;
22911 unsigned int val16
= val64
& 0xffff;
22912 unsigned int val8
= val64
& 0xff;
22914 if (val32
== (val64
>> 32))
22916 if ((which
& AARCH64_CHECK_ORR
) != 0
22917 && aarch64_advsimd_valid_immediate_hs (val32
, info
, which
,
22918 simd_immediate_info::MOV
))
22921 if ((which
& AARCH64_CHECK_BIC
) != 0
22922 && aarch64_advsimd_valid_immediate_hs (~val32
, info
, which
,
22923 simd_immediate_info::MVN
))
22926 /* Try using a replicated byte. */
22927 if (which
== AARCH64_CHECK_MOV
22928 && val16
== (val32
>> 16)
22929 && val8
== (val16
>> 8))
22932 *info
= simd_immediate_info (QImode
, val8
);
22937 /* Try using a bit-to-bytemask. */
22938 if (which
== AARCH64_CHECK_MOV
)
22941 for (i
= 0; i
< 64; i
+= 8)
22943 unsigned char byte
= (val64
>> i
) & 0xff;
22944 if (byte
!= 0 && byte
!= 0xff)
22950 *info
= simd_immediate_info (DImode
, val64
);
22957 /* Return true if replicating VAL64 gives a valid immediate for an SVE MOV
22958 instruction. If INFO is nonnull, use it to describe valid immediates. */
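/* Worked example (illustrative only): VAL64 = 0x0101010101010101 narrows
   to the byte 0x01, which lies in [-0x80, 0x7f], so it is matched by the
   plain DUP case ("mov zD.b, #1"); VAL64 = 0xff00ff00ff00ff00 narrows to
   HImode -256 and is matched by the DUP-with-LSL-#8 case instead.  */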
22961 aarch64_sve_valid_immediate (unsigned HOST_WIDE_INT val64
,
22962 simd_immediate_info
*info
)
22964 scalar_int_mode mode
= DImode
;
22965 unsigned int val32
= val64
& 0xffffffff;
22966 if (val32
== (val64
>> 32))
22969 unsigned int val16
= val32
& 0xffff;
22970 if (val16
== (val32
>> 16))
22973 unsigned int val8
= val16
& 0xff;
22974 if (val8
== (val16
>> 8))
22978 HOST_WIDE_INT val
= trunc_int_for_mode (val64
, mode
);
22979 if (IN_RANGE (val
, -0x80, 0x7f))
22981 /* DUP with no shift. */
22983 *info
= simd_immediate_info (mode
, val
);
22986 if ((val
& 0xff) == 0 && IN_RANGE (val
, -0x8000, 0x7f00))
22988 /* DUP with LSL #8. */
22990 *info
= simd_immediate_info (mode
, val
);
22993 if (aarch64_bitmask_imm (val64
, mode
))
22997 *info
= simd_immediate_info (mode
, val
);
23003 /* Return true if X is an UNSPEC_PTRUE constant of the form:
23005 (const (unspec [PATTERN ZERO] UNSPEC_PTRUE))
23007 where PATTERN is the svpattern as a CONST_INT and where ZERO
23008 is a zero constant of the required PTRUE mode (which can have
23009 fewer elements than X's mode, if zero bits are significant).
23011 If so, and if INFO is nonnull, describe the immediate in INFO. */
23013 aarch64_sve_ptrue_svpattern_p (rtx x
, struct simd_immediate_info
*info
)
23015 if (GET_CODE (x
) != CONST
)
23019 if (GET_CODE (x
) != UNSPEC
|| XINT (x
, 1) != UNSPEC_PTRUE
)
23024 aarch64_svpattern pattern
23025 = (aarch64_svpattern
) INTVAL (XVECEXP (x
, 0, 0));
23026 machine_mode pred_mode
= GET_MODE (XVECEXP (x
, 0, 1));
23027 scalar_int_mode int_mode
= aarch64_sve_element_int_mode (pred_mode
);
23028 *info
= simd_immediate_info (int_mode
, pattern
);
23033 /* Return true if X is a valid SVE predicate. If INFO is nonnull, use
23034 it to describe valid immediates. */
23037 aarch64_sve_pred_valid_immediate (rtx x
, simd_immediate_info
*info
)
23039 if (aarch64_sve_ptrue_svpattern_p (x
, info
))
23042 if (x
== CONST0_RTX (GET_MODE (x
)))
23045 *info
= simd_immediate_info (DImode
, 0);
23049 /* Analyze the value as a VNx16BImode. This should be relatively
23050 efficient, since rtx_vector_builder has enough built-in capacity
23051 to store all VLA predicate constants without needing the heap. */
23052 rtx_vector_builder builder
;
23053 if (!aarch64_get_sve_pred_bits (builder
, x
))
23056 unsigned int elt_size
= aarch64_widest_sve_pred_elt_size (builder
);
23057 if (int vl
= aarch64_partial_ptrue_length (builder
, elt_size
))
23059 machine_mode mode
= aarch64_sve_pred_mode (elt_size
).require ();
23060 aarch64_svpattern pattern
= aarch64_svpattern_for_vl (mode
, vl
);
23061 if (pattern
!= AARCH64_NUM_SVPATTERNS
)
23065 scalar_int_mode int_mode
= aarch64_sve_element_int_mode (mode
);
23066 *info
= simd_immediate_info (int_mode
, pattern
);
/* Return true if OP is a valid SIMD immediate for the operation
   described by WHICH.  If INFO is nonnull, use it to describe valid
   immediates.  */
bool
aarch64_simd_valid_immediate (rtx op, simd_immediate_info *info,
                              enum simd_immediate_check which)
{
  machine_mode mode = GET_MODE (op);
  unsigned int vec_flags = aarch64_classify_vector_mode (mode);
  if (vec_flags == 0 || vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
    return false;

  if ((vec_flags & VEC_ADVSIMD) && !TARGET_SIMD)
    return false;

  if (vec_flags == (VEC_SVE_PRED | VEC_STRUCT))
    return op == CONST0_RTX (mode) || op == CONSTM1_RTX (mode);

  if (vec_flags & VEC_SVE_PRED)
    return aarch64_sve_pred_valid_immediate (op, info);

  scalar_mode elt_mode = GET_MODE_INNER (mode);
  rtx base, step;
  unsigned int n_elts;
  if (CONST_VECTOR_P (op)
      && CONST_VECTOR_DUPLICATE_P (op))
    n_elts = CONST_VECTOR_NPATTERNS (op);
  else if (which == AARCH64_CHECK_MOV
           && TARGET_SVE
           && const_vec_series_p (op, &base, &step))
    {
      gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_INT);
      if (!aarch64_sve_index_immediate_p (base)
          || !aarch64_sve_index_immediate_p (step))
        return false;

      if (info)
        {
          /* Get the corresponding container mode.  E.g. an INDEX on V2SI
             should yield two integer values per 128-bit block, meaning
             that we need to treat it in the same way as V2DI and then
             ignore the upper 32 bits of each element.  */
          elt_mode = aarch64_sve_container_int_mode (mode);
          *info = simd_immediate_info (elt_mode, base, step);
        }
      return true;
    }
  else if (CONST_VECTOR_P (op)
           && CONST_VECTOR_NUNITS (op).is_constant (&n_elts))
    /* N_ELTS set above.  */;
  else
    return false;

  scalar_float_mode elt_float_mode;
  if (n_elts == 1
      && is_a <scalar_float_mode> (elt_mode, &elt_float_mode))
    {
      rtx elt = CONST_VECTOR_ENCODED_ELT (op, 0);
      if (aarch64_float_const_zero_rtx_p (elt)
          || aarch64_float_const_representable_p (elt))
        {
          if (info)
            *info = simd_immediate_info (elt_float_mode, elt);
          return true;
        }
    }

  /* If all elements in an SVE vector have the same value, we have a free
     choice between using the element mode and using the container mode.
     Using the element mode means that unused parts of the vector are
     duplicates of the used elements, while using the container mode means
     that the unused parts are an extension of the used elements.  Using the
     element mode is better for (say) VNx4HI 0x101, since 0x01010101 is valid
     for its container mode VNx4SI while 0x00000101 isn't.

     If not all elements in an SVE vector have the same value, we need the
     transition from one element to the next to occur at container boundaries.
     E.g. a fixed-length VNx4HI containing { 1, 2, 3, 4 } should be treated
     in the same way as a VNx4SI containing { 1, 2, 3, 4 }.  */
  scalar_int_mode elt_int_mode;
  if ((vec_flags & VEC_SVE_DATA) && n_elts > 1)
    elt_int_mode = aarch64_sve_container_int_mode (mode);
  else
    elt_int_mode = int_mode_for_mode (elt_mode).require ();

  unsigned int elt_size = GET_MODE_SIZE (elt_int_mode);

  /* Expand the vector constant out into a byte vector, with the least
     significant byte of the register first.  */
  auto_vec<unsigned char, 16> bytes;
  bytes.reserve (n_elts * elt_size);
  for (unsigned int i = 0; i < n_elts; i++)
    {
      /* The vector is provided in gcc endian-neutral fashion.
         For aarch64_be Advanced SIMD, it must be laid out in the vector
         register in reverse order.  */
      bool swap_p = ((vec_flags & VEC_ADVSIMD) != 0 && BYTES_BIG_ENDIAN);
      rtx elt = CONST_VECTOR_ELT (op, swap_p ? (n_elts - 1 - i) : i);

      if (elt_mode != elt_int_mode)
        elt = gen_lowpart (elt_int_mode, elt);

      if (!CONST_INT_P (elt))
        return false;

      unsigned HOST_WIDE_INT elt_val = INTVAL (elt);
      for (unsigned int byte = 0; byte < elt_size; byte++)
        {
          bytes.quick_push (elt_val & 0xff);
          elt_val >>= BITS_PER_UNIT;
        }
    }

  /* The immediate must repeat every eight bytes.  */
  unsigned int nbytes = bytes.length ();
  for (unsigned i = 8; i < nbytes; ++i)
    if (bytes[i] != bytes[i - 8])
      return false;

  /* Get the repeating 8-byte value as an integer.  No endian correction
     is needed here because bytes is already in lsb-first order.  */
  unsigned HOST_WIDE_INT val64 = 0;
  for (unsigned int i = 0; i < 8; i++)
    val64 |= ((unsigned HOST_WIDE_INT) bytes[i % nbytes]
              << (i * BITS_PER_UNIT));

  if (vec_flags & VEC_SVE_DATA)
    return aarch64_sve_valid_immediate (val64, info);
  else
    return aarch64_advsimd_valid_immediate (val64, info, which);
}
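/* Worked example (illustrative): a V8QI duplicate of the byte 0x2a has
   n_elts == 1 and elt_size == 1, so "bytes" holds the single value 0x2a,
   the repeat-every-eight-bytes check passes trivially, and val64 becomes
   0x2a2a2a2a2a2a2a2a, which the Advanced SIMD check accepts as a MOVI
   byte immediate.  */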
/* Check whether X is a VEC_SERIES-like constant that starts at 0 and
   has a step in the range of INDEX.  Return the index expression if so,
   otherwise return null.  */
rtx
aarch64_check_zero_based_sve_index_immediate (rtx x)
{
  rtx base, step;
  if (const_vec_series_p (x, &base, &step)
      && base == const0_rtx
      && aarch64_sve_index_immediate_p (step))
    return step;
  return NULL_RTX;
}
/* Check if immediate shift constants are within range.  */
bool
aarch64_simd_shift_imm_p (rtx x, machine_mode mode, bool left)
{
  x = unwrap_const_vec_duplicate (x);
  if (!CONST_INT_P (x))
    return false;

  int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;
  if (left)
    return IN_RANGE (INTVAL (x), 0, bit_width - 1);
  else
    return IN_RANGE (INTVAL (x), 1, bit_width);
}
/* Return the bitmask CONST_INT to select the bits required by a zero extract
   operation of width WIDTH at bit position POS.  */
rtx
aarch64_mask_from_zextract_ops (rtx width, rtx pos)
{
  gcc_assert (CONST_INT_P (width));
  gcc_assert (CONST_INT_P (pos));

  unsigned HOST_WIDE_INT mask
    = ((unsigned HOST_WIDE_INT) 1 << UINTVAL (width)) - 1;
  return GEN_INT (mask << UINTVAL (pos));
}
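/* Worked example (illustrative): WIDTH = 8 and POS = 16 give
   mask = (1 << 8) - 1 = 0xff, so the returned CONST_INT is
   0xff << 16 = 0xff0000, i.e. it selects bits [23:16].  */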
bool
aarch64_mov_operand_p (rtx x, machine_mode mode)
{
  if (GET_CODE (x) == HIGH
      && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
    return true;

  if (CONST_INT_P (x))
    return true;

  if (VECTOR_MODE_P (GET_MODE (x)))
    {
      /* Require predicate constants to be VNx16BI before RA, so that we
         force everything to have a canonical form.  */
      if (!lra_in_progress
          && !reload_completed
          && aarch64_sve_pred_mode_p (GET_MODE (x))
          && known_eq (GET_MODE_SIZE (GET_MODE (x)), BYTES_PER_SVE_PRED)
          && GET_MODE (x) != VNx16BImode)
        return false;

      return aarch64_simd_valid_immediate (x, NULL);
    }

  /* Remove UNSPEC_SALT_ADDR before checking symbol reference.  */
  x = strip_salt (x);

  /* GOT accesses are valid moves.  */
  if (SYMBOL_REF_P (x)
      && aarch64_classify_symbolic_expression (x) == SYMBOL_SMALL_GOT_4G)
    return true;

  if (SYMBOL_REF_P (x) && mode == DImode && CONSTANT_ADDRESS_P (x))
    return true;

  if (TARGET_SVE
      && (aarch64_sve_cnt_immediate_p (x)
          || aarch64_sve_rdvl_immediate_p (x)))
    return true;

  if (aarch64_rdsvl_immediate_p (x))
    return true;

  return aarch64_classify_symbolic_expression (x)
    == SYMBOL_TINY_ABSOLUTE;
}
/* Return a function-invariant register that contains VALUE.  *CACHED_INSN
   caches instructions that set up such registers, so that they can be
   reused by future calls.  */
static rtx
aarch64_get_shareable_reg (rtx_insn **cached_insn, rtx value)
{
  rtx_insn *insn = *cached_insn;
  if (insn && INSN_P (insn) && !insn->deleted ())
    {
      rtx pat = PATTERN (insn);
      if (GET_CODE (pat) == SET)
        {
          rtx dest = SET_DEST (pat);
          if (REG_P (dest)
              && !HARD_REGISTER_P (dest)
              && rtx_equal_p (SET_SRC (pat), value))
            return dest;
        }
    }
  rtx reg = gen_reg_rtx (GET_MODE (value));
  *cached_insn = emit_insn_before (gen_rtx_SET (reg, value),
                                   function_beg_insn);
  return reg;
}
/* Create a 0 constant that is based on V4SI to allow CSE to optimally share
   the constant creation.  */
rtx
aarch64_gen_shareable_zero (machine_mode mode)
{
  rtx reg = aarch64_get_shareable_reg (&cfun->machine->advsimd_zero_insn,
                                       CONST0_RTX (V4SImode));
  return lowpart_subreg (mode, reg, GET_MODE (reg));
}
/* INSN is some form of extension or shift that can be split into a
   permutation involving a shared zero.  Return true if we should
   perform such a split.

   ??? For now, make sure that the split instruction executes more
   frequently than the zero that feeds it.  In future it would be good
   to split without that restriction and instead recombine shared zeros
   if they turn out not to be worthwhile.  This would allow splits in
   single-block functions and would also cope more naturally with
   rematerialization.  The downside of not doing this is that we lose the
   optimizations for vector epilogues as well.  */
bool
aarch64_split_simd_shift_p (rtx_insn *insn)
{
  return (can_create_pseudo_p ()
          && optimize_bb_for_speed_p (BLOCK_FOR_INSN (insn))
          && (ENTRY_BLOCK_PTR_FOR_FN (cfun)->count
              < BLOCK_FOR_INSN (insn)->count));
}
/* Return a const_int vector of VAL.  */
rtx
aarch64_simd_gen_const_vector_dup (machine_mode mode, HOST_WIDE_INT val)
{
  rtx c = gen_int_mode (val, GET_MODE_INNER (mode));
  return gen_const_vec_duplicate (mode, c);
}
/* Check OP is a legal scalar immediate for the MOVI instruction.  */
bool
aarch64_simd_scalar_immediate_valid_for_move (rtx op, scalar_int_mode mode)
{
  machine_mode vmode;

  vmode = aarch64_simd_container_mode (mode, 64);
  rtx op_v = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (op));
  return aarch64_simd_valid_immediate (op_v, NULL);
}
/* Construct and return a PARALLEL RTX vector with elements numbering the
   lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
   the vector - from the perspective of the architecture.  This does not
   line up with GCC's perspective on lane numbers, so we end up with
   different masks depending on our target endian-ness.  The diagram
   below may help.  We must draw the distinction when building masks
   which select one half of the vector.  An instruction selecting
   architectural low-lanes for a big-endian target, must be described using
   a mask selecting GCC high-lanes.

                    Big-Endian              Little-Endian

   GCC              0   1   2   3           3   2   1   0
                  | x | x | x | x |       | x | x | x | x |
   Architecture     3   2   1   0           3   2   1   0

   Low Mask:          { 2, 3 }                { 0, 1 }
   High Mask:         { 0, 1 }                { 2, 3 }

   MODE Is the mode of the vector and NUNITS is the number of units in it.  */
rtx
aarch64_simd_vect_par_cnst_half (machine_mode mode, int nunits, bool high)
{
  rtvec v = rtvec_alloc (nunits / 2);
  int high_base = nunits / 2;
  int low_base = 0;
  int base;
  rtx t1;
  int i;

  if (BYTES_BIG_ENDIAN)
    base = high ? low_base : high_base;
  else
    base = high ? high_base : low_base;

  for (i = 0; i < nunits / 2; i++)
    RTVEC_ELT (v, i) = GEN_INT (base + i);

  t1 = gen_rtx_PARALLEL (mode, v);
  return t1;
}
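/* Worked example (illustrative): for V4SI with NUNITS == 4 and HIGH == true,
   this returns the PARALLEL { 2, 3 } on little-endian targets but { 0, 1 }
   on big-endian targets, matching the High Mask row in the table above.  */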
/* Check OP for validity as a PARALLEL RTX vector with elements
   numbering the lanes of either the high (HIGH == TRUE) or low lanes,
   from the perspective of the architecture.  See the diagram above
   aarch64_simd_vect_par_cnst_half for more details.  */
bool
aarch64_simd_check_vect_par_cnst_half (rtx op, machine_mode mode,
                                       bool high)
{
  int nelts;
  if (!VECTOR_MODE_P (mode) || !GET_MODE_NUNITS (mode).is_constant (&nelts))
    return false;

  rtx ideal = aarch64_simd_vect_par_cnst_half (mode, nelts, high);
  HOST_WIDE_INT count_op = XVECLEN (op, 0);
  HOST_WIDE_INT count_ideal = XVECLEN (ideal, 0);
  int i;

  if (count_op != count_ideal)
    return false;

  for (i = 0; i < count_ideal; i++)
    {
      rtx elt_op = XVECEXP (op, 0, i);
      rtx elt_ideal = XVECEXP (ideal, 0, i);

      if (!CONST_INT_P (elt_op)
          || INTVAL (elt_ideal) != INTVAL (elt_op))
        return false;
    }
  return true;
}
/* Return a PARALLEL containing NELTS elements, with element I equal
   to BASE + I * STEP.  */
rtx
aarch64_gen_stepped_int_parallel (unsigned int nelts, int base, int step)
{
  rtvec vec = rtvec_alloc (nelts);
  for (unsigned int i = 0; i < nelts; ++i)
    RTVEC_ELT (vec, i) = gen_int_mode (base + i * step, DImode);
  return gen_rtx_PARALLEL (VOIDmode, vec);
}
/* Return true if OP is a PARALLEL of CONST_INTs that form a linear
   series with step STEP.  */
bool
aarch64_stepped_int_parallel_p (rtx op, int step)
{
  if (GET_CODE (op) != PARALLEL || !CONST_INT_P (XVECEXP (op, 0, 0)))
    return false;

  unsigned HOST_WIDE_INT base = UINTVAL (XVECEXP (op, 0, 0));
  for (int i = 1; i < XVECLEN (op, 0); ++i)
    if (!CONST_INT_P (XVECEXP (op, 0, i))
        || UINTVAL (XVECEXP (op, 0, i)) != base + i * step)
      return false;

  return true;
}
/* Return true if OPERANDS[0] to OPERANDS[NUM_OPERANDS - 1] form a
   sequence of strided registers, with the stride being equal to STRIDE.
   The operands are already known to be FPRs.  */
bool
aarch64_strided_registers_p (rtx *operands, unsigned int num_operands,
                             unsigned int stride)
{
  for (unsigned int i = 1; i < num_operands; ++i)
    if (REGNO (operands[i]) != REGNO (operands[0]) + i * stride)
      return false;
  return true;
}
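/* Worked example (illustrative): with STRIDE == 4, operands whose register
   numbers are { V0, V4, V8, V12 } pass the check, whereas
   { V0, V4, V9, V12 } fail at i == 2 because V9 != V0 + 2 * 4.  */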
/* Bounds-check lanes.  Ensure OPERAND lies between LOW (inclusive) and
   HIGH (exclusive).  */
void
aarch64_simd_lane_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high,
                          const_tree exp)
{
  HOST_WIDE_INT lane;
  gcc_assert (CONST_INT_P (operand));
  lane = INTVAL (operand);

  if (lane < low || lane >= high)
    {
      if (exp)
        error_at (EXPR_LOCATION (exp), "lane %wd out of range %wd - %wd",
                  lane, low, high - 1);
      else
        error ("lane %wd out of range %wd - %wd", lane, low, high - 1);
    }
}
/* Perform endian correction on lane number N, which indexes a vector
   of mode MODE, and return the result as an SImode rtx.  */
rtx
aarch64_endian_lane_rtx (machine_mode mode, unsigned int n)
{
  return gen_int_mode (ENDIAN_LANE_N (GET_MODE_NUNITS (mode), n), SImode);
}
/* Return TRUE if OP is a valid vector addressing mode.  */
bool
aarch64_simd_mem_operand_p (rtx op)
{
  return (MEM_P (op)
          && (GET_CODE (XEXP (op, 0)) == POST_INC || REG_P (XEXP (op, 0)))
          && memory_operand (op, VOIDmode));
}
/* Return true if OP is a valid MEM operand for an SVE LD1R instruction.  */
bool
aarch64_sve_ld1r_operand_p (rtx op)
{
  struct aarch64_address_info addr;
  scalar_mode mode;

  return (MEM_P (op)
          && is_a <scalar_mode> (GET_MODE (op), &mode)
          && aarch64_classify_address (&addr, XEXP (op, 0), mode, false)
          && addr.type == ADDRESS_REG_IMM
          && offset_6bit_unsigned_scaled_p (mode, addr.const_offset));
}
/* Return true if OP is a valid MEM operand for an SVE LD1R{Q,O} instruction,
   where the size of the read data is specified by `mode` and the size of
   the vector elements is specified by `elem_mode`.  */
static bool
aarch64_sve_ld1rq_ld1ro_operand_p (rtx op, machine_mode mode,
                                   scalar_mode elem_mode)
{
  struct aarch64_address_info addr;
  if (!MEM_P (op)
      || !aarch64_classify_address (&addr, XEXP (op, 0), elem_mode, false))
    return false;

  if (addr.type == ADDRESS_REG_IMM)
    return offset_4bit_signed_scaled_p (mode, addr.const_offset);

  if (addr.type == ADDRESS_REG_REG)
    return (1U << addr.shift) == GET_MODE_SIZE (elem_mode);

  return false;
}
/* Return true if OP is a valid MEM operand for an SVE LD1RQ instruction.  */
bool
aarch64_sve_ld1rq_operand_p (rtx op)
{
  return aarch64_sve_ld1rq_ld1ro_operand_p (op, TImode,
                                            GET_MODE_INNER (GET_MODE (op)));
}

/* Return true if OP is a valid MEM operand for an SVE LD1RO instruction for
   accessing a vector where the element size is specified by `elem_mode`.  */
bool
aarch64_sve_ld1ro_operand_p (rtx op, scalar_mode elem_mode)
{
  return aarch64_sve_ld1rq_ld1ro_operand_p (op, OImode, elem_mode);
}

/* Return true if OP is a valid MEM operand for an SVE LDFF1 instruction.  */
bool
aarch64_sve_ldff1_operand_p (rtx op)
{
  if (!MEM_P (op))
    return false;

  struct aarch64_address_info addr;
  if (!aarch64_classify_address (&addr, XEXP (op, 0), GET_MODE (op), false))
    return false;

  if (addr.type == ADDRESS_REG_IMM)
    return known_eq (addr.const_offset, 0);

  return addr.type == ADDRESS_REG_REG;
}

/* Return true if OP is a valid MEM operand for an SVE LDNF1 instruction.  */
bool
aarch64_sve_ldnf1_operand_p (rtx op)
{
  struct aarch64_address_info addr;

  return (MEM_P (op)
          && aarch64_classify_address (&addr, XEXP (op, 0),
                                       GET_MODE (op), false)
          && addr.type == ADDRESS_REG_IMM);
}

/* Return true if OP is a valid MEM operand for an SVE LDR instruction.
   The conditions for STR are the same.  */
bool
aarch64_sve_ldr_operand_p (rtx op)
{
  struct aarch64_address_info addr;

  return (MEM_P (op)
          && aarch64_classify_address (&addr, XEXP (op, 0), GET_MODE (op),
                                       false, ADDR_QUERY_ANY)
          && addr.type == ADDRESS_REG_IMM);
}

/* Return true if OP is a valid address for an SVE PRF[BHWD] instruction,
   addressing memory of mode MODE.  */
bool
aarch64_sve_prefetch_operand_p (rtx op, machine_mode mode)
{
  struct aarch64_address_info addr;
  if (!aarch64_classify_address (&addr, op, mode, false, ADDR_QUERY_ANY))
    return false;

  if (addr.type == ADDRESS_REG_IMM)
    return offset_6bit_signed_scaled_p (mode, addr.const_offset);

  return addr.type == ADDRESS_REG_REG;
}
/* Return true if OP is a valid MEM operand for an SVE_STRUCT mode.
   We need to be able to access the individual pieces, so the range
   is different from LD[234] and ST[234].  */
bool
aarch64_sve_struct_memory_operand_p (rtx op)
{
  if (!MEM_P (op))
    return false;

  machine_mode mode = GET_MODE (op);
  struct aarch64_address_info addr;
  if (!aarch64_classify_address (&addr, XEXP (op, 0), SVE_BYTE_MODE, false,
                                 ADDR_QUERY_ANY)
      || addr.type != ADDRESS_REG_IMM)
    return false;

  poly_int64 first = addr.const_offset;
  poly_int64 last = first + GET_MODE_SIZE (mode) - BYTES_PER_SVE_VECTOR;
  return (offset_4bit_signed_scaled_p (SVE_BYTE_MODE, first)
          && offset_4bit_signed_scaled_p (SVE_BYTE_MODE, last));
}
/* Return true if OFFSET is a constant integer and if VNUM is
   OFFSET * the number of bytes in an SVE vector.  This is the requirement
   that exists in SME LDR and STR instructions, where the VL offset must
   equal the ZA slice offset.  */
bool
aarch64_sme_ldr_vnum_offset_p (rtx offset, rtx vnum)
{
  if (!CONST_INT_P (offset) || !IN_RANGE (INTVAL (offset), 0, 15))
    return false;

  if (TARGET_STREAMING)
    {
      poly_int64 const_vnum;
      return (poly_int_rtx_p (vnum, &const_vnum)
              && known_eq (const_vnum,
                           INTVAL (offset) * BYTES_PER_SVE_VECTOR));
    }
  else
    {
      HOST_WIDE_INT factor;
      return (aarch64_sme_vq_unspec_p (vnum, &factor)
              && factor == INTVAL (offset) * 16);
    }
}
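/* Worked example (illustrative): in streaming mode, OFFSET = 3 requires
   VNUM to be the poly_int 3 * BYTES_PER_SVE_VECTOR; in the non-streaming
   case VNUM must be an SME VQ unspec whose factor equals 3 * 16 = 48.  */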
/* Emit a register copy from operand to operand, taking care not to
   early-clobber source registers in the process.

   COUNT is the number of components into which the copy needs to be
   decomposed.  */
void
aarch64_simd_emit_reg_reg_move (rtx *operands, machine_mode mode,
                                unsigned int count)
{
  unsigned int i;
  int rdest = REGNO (operands[0]);
  int rsrc = REGNO (operands[1]);

  if (!reg_overlap_mentioned_p (operands[0], operands[1])
      || rdest < rsrc)
    for (i = 0; i < count; i++)
      emit_move_insn (gen_rtx_REG (mode, rdest + i),
                      gen_rtx_REG (mode, rsrc + i));
  else
    for (i = 0; i < count; i++)
      emit_move_insn (gen_rtx_REG (mode, rdest + count - i - 1),
                      gen_rtx_REG (mode, rsrc + count - i - 1));
}
/* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
   one of VSTRUCT modes: OI, CI, or XI.  */
int
aarch64_simd_attr_length_rglist (machine_mode mode)
{
  /* This is only used (and only meaningful) for Advanced SIMD, not SVE.  */
  return (GET_MODE_SIZE (mode).to_constant () / UNITS_PER_VREG) * 4;
}
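/* Worked example (illustrative): XImode is four 128-bit vectors, so
   GET_MODE_SIZE / UNITS_PER_VREG == 4 and the register-list move is costed
   as four 4-byte instructions, giving a length of 16.  */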
/* Implement target hook TARGET_VECTOR_ALIGNMENT.  The AAPCS64 sets the maximum
   alignment of a vector to 128 bits.  SVE predicates have an alignment of
   16 bits.  */
static HOST_WIDE_INT
aarch64_simd_vector_alignment (const_tree type)
{
  /* ??? Checking the mode isn't ideal, but VECTOR_BOOLEAN_TYPE_P can
     be set for non-predicate vectors of booleans.  Modes are the most
     direct way we have of identifying real SVE predicate types.  */
  if (GET_MODE_CLASS (TYPE_MODE (type)) == MODE_VECTOR_BOOL)
    return 16;
  widest_int min_size
    = constant_lower_bound (wi::to_poly_widest (TYPE_SIZE (type)));
  return wi::umin (min_size, 128).to_uhwi ();
}
/* Implement target hook TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT.  */
static poly_uint64
aarch64_vectorize_preferred_vector_alignment (const_tree type)
{
  if (aarch64_sve_data_mode_p (TYPE_MODE (type)))
    {
      /* If the length of the vector is a fixed power of 2, try to align
         to that length, otherwise don't try to align at all.  */
      HOST_WIDE_INT result;
      if (!GET_MODE_BITSIZE (TYPE_MODE (type)).is_constant (&result)
          || !pow2p_hwi (result))
        result = TYPE_ALIGN (TREE_TYPE (type));
      return result;
    }
  return TYPE_ALIGN (type);
}
/* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE.  */
static bool
aarch64_simd_vector_alignment_reachable (const_tree type, bool is_packed)
{
  if (is_packed)
    return false;

  /* For fixed-length vectors, check that the vectorizer will aim for
     full-vector alignment.  This isn't true for generic GCC vectors
     that are wider than the ABI maximum of 128 bits.  */
  poly_uint64 preferred_alignment =
    aarch64_vectorize_preferred_vector_alignment (type);
  if (TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
      && maybe_ne (wi::to_widest (TYPE_SIZE (type)),
                   preferred_alignment))
    return false;

  /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned.  */
  return true;
}
/* Return true if the vector misalignment factor is supported by the
   target.  */
static bool
aarch64_builtin_support_vector_misalignment (machine_mode mode,
                                             const_tree type, int misalignment,
                                             bool is_packed)
{
  if (TARGET_SIMD && STRICT_ALIGNMENT)
    {
      /* Return if movmisalign pattern is not supported for this mode.  */
      if (optab_handler (movmisalign_optab, mode) == CODE_FOR_nothing)
        return false;

      /* Misalignment factor is unknown at compile time.  */
      if (misalignment == -1)
        return false;
    }
  return default_builtin_support_vector_misalignment (mode, type, misalignment,
                                                      is_packed);
}

/* If VALS is a vector constant that can be loaded into a register
   using DUP, generate instructions to do so and return an RTX to
   assign to the register.  Otherwise return NULL_RTX.  */
static rtx
aarch64_simd_dup_constant (rtx vals)
{
  machine_mode mode = GET_MODE (vals);
  machine_mode inner_mode = GET_MODE_INNER (mode);
  rtx x;

  if (!const_vec_duplicate_p (vals, &x))
    return NULL_RTX;

  /* We can load this constant by using DUP and a constant in a
     single ARM register.  This will be cheaper than a vector
     load.  */
  x = force_reg (inner_mode, x);
  return gen_vec_duplicate (mode, x);
}

/* Generate code to load VALS, which is a PARALLEL containing only
   constants (for vec_init) or CONST_VECTOR, efficiently into a
   register.  Returns an RTX to copy into the register, or NULL_RTX
   for a PARALLEL that cannot be converted into a CONST_VECTOR.  */
static rtx
aarch64_simd_make_constant (rtx vals)
{
  machine_mode mode = GET_MODE (vals);
  rtx const_dup;
  rtx const_vec = NULL_RTX;
  int n_const = 0;
  int i;

  if (CONST_VECTOR_P (vals))
    const_vec = vals;
  else if (GET_CODE (vals) == PARALLEL)
    {
      /* A CONST_VECTOR must contain only CONST_INTs and
         CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
         Only store valid constants in a CONST_VECTOR.  */
      int n_elts = XVECLEN (vals, 0);
      for (i = 0; i < n_elts; ++i)
        {
          rtx x = XVECEXP (vals, 0, i);
          if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
            n_const++;
        }
      if (n_const == n_elts)
        const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
    }
  else
    gcc_unreachable ();

  if (const_vec != NULL_RTX
      && aarch64_simd_valid_immediate (const_vec, NULL))
    /* Load using MOVI/MVNI.  */
    return const_vec;
  else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX)
    /* Loaded using DUP.  */
    return const_dup;
  else if (const_vec != NULL_RTX)
    /* Load from constant pool.  We cannot take advantage of single-cycle
       LD1 because we need a PC-relative addressing mode.  */
    return const_vec;
  else
    /* A PARALLEL containing something not valid inside CONST_VECTOR.
       We cannot construct an initializer.  */
    return NULL_RTX;
}
23868 /* A subroutine of aarch64_expand_vector_init, with the same interface.
23869 The caller has already tried a divide-and-conquer approach, so do
23870 not consider that case here. */
23873 aarch64_expand_vector_init_fallback (rtx target
, rtx vals
)
23875 machine_mode mode
= GET_MODE (target
);
23876 scalar_mode inner_mode
= GET_MODE_INNER (mode
);
23877 /* The number of vector elements. */
23878 int n_elts
= XVECLEN (vals
, 0);
23879 /* The number of vector elements which are not constant. */
23881 rtx any_const
= NULL_RTX
;
23882 /* The first element of vals. */
23883 rtx v0
= XVECEXP (vals
, 0, 0);
23884 bool all_same
= true;
23886 /* This is a special vec_init<M><N> where N is not an element mode but a
23887 vector mode with half the elements of M. We expect to find two entries
   of mode N in VALS and we must put their concatenation into TARGET.  */
23889 if (XVECLEN (vals
, 0) == 2 && VECTOR_MODE_P (GET_MODE (XVECEXP (vals
, 0, 0))))
23891 machine_mode narrow_mode
= GET_MODE (XVECEXP (vals
, 0, 0));
23892 gcc_assert (GET_MODE_INNER (narrow_mode
) == inner_mode
23893 && known_eq (GET_MODE_SIZE (mode
),
23894 2 * GET_MODE_SIZE (narrow_mode
)));
23895 emit_insn (gen_aarch64_vec_concat (narrow_mode
, target
,
23896 XVECEXP (vals
, 0, 0),
23897 XVECEXP (vals
, 0, 1)));
23901 /* Count the number of variable elements to initialise. */
23902 for (int i
= 0; i
< n_elts
; ++i
)
23904 rtx x
= XVECEXP (vals
, 0, i
);
23905 if (!(CONST_INT_P (x
) || CONST_DOUBLE_P (x
)))
23910 all_same
&= rtx_equal_p (x
, v0
);
23913 /* No variable elements, hand off to aarch64_simd_make_constant which knows
23914 how best to handle this. */
23917 rtx constant
= aarch64_simd_make_constant (vals
);
23918 if (constant
!= NULL_RTX
)
23920 emit_move_insn (target
, constant
);
23925 /* Splat a single non-constant element if we can. */
23928 rtx x
= force_reg (inner_mode
, v0
);
23929 aarch64_emit_move (target
, gen_vec_duplicate (mode
, x
));
23933 enum insn_code icode
= optab_handler (vec_set_optab
, mode
);
23934 gcc_assert (icode
!= CODE_FOR_nothing
);
23936 /* If there are only variable elements, try to optimize
23937 the insertion using dup for the most common element
23938 followed by insertions. */
23940 /* The algorithm will fill matches[*][0] with the earliest matching element,
23941 and matches[X][1] with the count of duplicate elements (if X is the
23942 earliest element which has duplicates). */
23944 if (n_var
>= n_elts
- 1 && n_elts
<= 16)
23946 int matches
[16][2] = {0};
23947 for (int i
= 0; i
< n_elts
; i
++)
23949 for (int j
= 0; j
<= i
; j
++)
23951 if (rtx_equal_p (XVECEXP (vals
, 0, i
), XVECEXP (vals
, 0, j
)))
23959 int maxelement
= 0;
23961 rtx const_elem
= NULL_RTX
;
23962 int const_elem_pos
= 0;
23964 for (int i
= 0; i
< n_elts
; i
++)
23966 if (matches
[i
][1] > maxv
)
23969 maxv
= matches
[i
][1];
23971 if (CONST_INT_P (XVECEXP (vals
, 0, i
))
23972 || CONST_DOUBLE_P (XVECEXP (vals
, 0, i
)))
23974 const_elem_pos
= i
;
23975 const_elem
= XVECEXP (vals
, 0, i
);
23979 /* Create a duplicate of the most common element, unless all elements
23980 are equally useless to us, in which case just immediately set the
23981 vector register using the first element. */
23985 /* For vectors of two 64-bit elements, we can do even better. */
23987 && (inner_mode
== E_DImode
23988 || inner_mode
== E_DFmode
))
23991 rtx x0
= XVECEXP (vals
, 0, 0);
23992 rtx x1
= XVECEXP (vals
, 0, 1);
23993 /* Combine can pick up this case, but handling it directly
23994 here leaves clearer RTL.
23996 This is load_pair_lanes<mode>, and also gives us a clean-up
23997 for store_pair_lanes<mode>. */
23998 if (memory_operand (x0
, inner_mode
)
23999 && memory_operand (x1
, inner_mode
)
24000 && aarch64_mergeable_load_pair_p (mode
, x0
, x1
))
24003 if (inner_mode
== DFmode
)
24004 t
= gen_load_pair_lanesdf (target
, x0
, x1
);
24006 t
= gen_load_pair_lanesdi (target
, x0
, x1
);
24011 /* The subreg-move sequence below will move into lane zero of the
24012 vector register. For big-endian we want that position to hold
24013 the last element of VALS. */
24014 maxelement
= BYTES_BIG_ENDIAN
? n_elts
- 1 : 0;
24016 /* If we have a single constant element, use that for duplicating
24020 maxelement
= const_elem_pos
;
24021 aarch64_emit_move (target
, gen_vec_duplicate (mode
, const_elem
));
24025 rtx x
= force_reg (inner_mode
, XVECEXP (vals
, 0, maxelement
));
24026 aarch64_emit_move (target
, lowpart_subreg (mode
, x
, inner_mode
));
24031 rtx x
= force_reg (inner_mode
, XVECEXP (vals
, 0, maxelement
));
24032 aarch64_emit_move (target
, gen_vec_duplicate (mode
, x
));
24035 /* Insert the rest. */
24036 for (int i
= 0; i
< n_elts
; i
++)
24038 rtx x
= XVECEXP (vals
, 0, i
);
24039 if (matches
[i
][0] == maxelement
)
24041 x
= force_reg (inner_mode
, x
);
24042 emit_insn (GEN_FCN (icode
) (target
, x
, GEN_INT (i
)));
24047 /* Initialise a vector which is part-variable. We want to first try
24048 to build those lanes which are constant in the most efficient way we
24050 if (n_var
!= n_elts
)
24052 rtx copy
= copy_rtx (vals
);
24054 /* Load constant part of vector. We really don't care what goes into the
24055 parts we will overwrite, but we're more likely to be able to load the
24056 constant efficiently if it has fewer, larger, repeating parts
24057 (see aarch64_simd_valid_immediate). */
24058 for (int i
= 0; i
< n_elts
; i
++)
24060 rtx x
= XVECEXP (vals
, 0, i
);
24061 if (CONST_INT_P (x
) || CONST_DOUBLE_P (x
))
24063 rtx subst
= any_const
;
24064 for (int bit
= n_elts
/ 2; bit
> 0; bit
/= 2)
24066 /* Look in the copied vector, as more elements are const. */
24067 rtx test
= XVECEXP (copy
, 0, i
^ bit
);
24068 if (CONST_INT_P (test
) || CONST_DOUBLE_P (test
))
24074 XVECEXP (copy
, 0, i
) = subst
;
24076 aarch64_expand_vector_init_fallback (target
, copy
);
24079 /* Insert the variable lanes directly. */
24080 for (int i
= 0; i
< n_elts
; i
++)
24082 rtx x
= XVECEXP (vals
, 0, i
);
24083 if (CONST_INT_P (x
) || CONST_DOUBLE_P (x
))
24085 x
= force_reg (inner_mode
, x
);
24086 emit_insn (GEN_FCN (icode
) (target
, x
, GEN_INT (i
)));
/* Return even or odd half of VALS depending on EVEN_P.  */
static rtx
aarch64_unzip_vector_init (machine_mode mode, rtx vals, bool even_p)
{
  int n = XVECLEN (vals, 0);
  machine_mode new_mode
    = aarch64_simd_container_mode (GET_MODE_INNER (mode),
                                   GET_MODE_BITSIZE (mode).to_constant () / 2);
  rtvec vec = rtvec_alloc (n / 2);
  for (int i = 0; i < n / 2; i++)
    RTVEC_ELT (vec, i) = (even_p) ? XVECEXP (vals, 0, 2 * i)
                                  : XVECEXP (vals, 0, 2 * i + 1);
  return gen_rtx_PARALLEL (new_mode, vec);
}
/* Return true if SET is a scalar move.  */
static bool
scalar_move_insn_p (rtx set)
{
  rtx src = SET_SRC (set);
  rtx dest = SET_DEST (set);
  return (is_a <scalar_mode> (GET_MODE (dest))
          && aarch64_mov_operand (src, GET_MODE (dest)));
}
24117 /* Similar to seq_cost, but ignore cost for scalar moves. */
24120 seq_cost_ignoring_scalar_moves (const rtx_insn
*seq
, bool speed
)
24124 for (; seq
; seq
= NEXT_INSN (seq
))
24125 if (NONDEBUG_INSN_P (seq
))
24127 if (rtx set
= single_set (seq
))
24129 if (!scalar_move_insn_p (set
))
24130 cost
+= set_rtx_cost (set
, speed
);
24134 int this_cost
= insn_cost (CONST_CAST_RTX_INSN (seq
), speed
);
24145 /* Expand a vector initialization sequence, such that TARGET is
24146 initialized to contain VALS. */
24149 aarch64_expand_vector_init (rtx target
, rtx vals
)
24151 /* Try decomposing the initializer into even and odd halves and
24152 then ZIP them together. Use the resulting sequence if it is
24153 strictly cheaper than loading VALS directly.
24155 Prefer the fallback sequence in the event of a tie, since it
24156 will tend to use fewer registers. */
24158 machine_mode mode
= GET_MODE (target
);
24159 int n_elts
= XVECLEN (vals
, 0);
24162 || maybe_ne (GET_MODE_BITSIZE (mode
), 128))
24164 aarch64_expand_vector_init_fallback (target
, vals
);
24171 for (int i
= 0; i
< 2; i
++)
24174 rtx new_vals
= aarch64_unzip_vector_init (mode
, vals
, i
== 0);
24175 rtx tmp_reg
= gen_reg_rtx (GET_MODE (new_vals
));
24176 aarch64_expand_vector_init (tmp_reg
, new_vals
);
24177 halves
[i
] = gen_rtx_SUBREG (mode
, tmp_reg
, 0);
24178 rtx_insn
*rec_seq
= get_insns ();
24180 costs
[i
] = seq_cost_ignoring_scalar_moves (rec_seq
, !optimize_size
);
24181 emit_insn (rec_seq
);
24184 rtvec v
= gen_rtvec (2, halves
[0], halves
[1]);
24185 rtx_insn
*zip1_insn
24186 = emit_set_insn (target
, gen_rtx_UNSPEC (mode
, v
, UNSPEC_ZIP1
));
24187 unsigned seq_total_cost
24188 = (!optimize_size
) ? std::max (costs
[0], costs
[1]) : costs
[0] + costs
[1];
24189 seq_total_cost
+= insn_cost (zip1_insn
, !optimize_size
);
24191 rtx_insn
*seq
= get_insns ();
24195 aarch64_expand_vector_init_fallback (target
, vals
);
24196 rtx_insn
*fallback_seq
= get_insns ();
24197 unsigned fallback_seq_cost
24198 = seq_cost_ignoring_scalar_moves (fallback_seq
, !optimize_size
);
24201 emit_insn (seq_total_cost
< fallback_seq_cost
? seq
: fallback_seq
);
/* Emit RTL corresponding to:
   insr TARGET, ELEM.  */
static void
emit_insr (rtx target, rtx elem)
{
  machine_mode mode = GET_MODE (target);
  scalar_mode elem_mode = GET_MODE_INNER (mode);
  elem = force_reg (elem_mode, elem);

  insn_code icode = optab_handler (vec_shl_insert_optab, mode);
  gcc_assert (icode != CODE_FOR_nothing);
  emit_insn (GEN_FCN (icode) (target, target, elem));
}
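/* Illustrative note: the INSR instruction shifts the vector up by one
   element and writes ELEM into lane 0, so calling emit_insr repeatedly
   with elements in reverse order builds a vector from its last element
   backwards, which is how the SVE vector-init helpers below use it.  */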
24219 /* Subroutine of aarch64_sve_expand_vector_init for handling
24220 trailing constants.
24221 This function works as follows:
24222 (a) Create a new vector consisting of trailing constants.
24223 (b) Initialize TARGET with the constant vector using emit_move_insn.
24224 (c) Insert remaining elements in TARGET using insr.
24225 NELTS is the total number of elements in original vector while
24226 while NELTS_REQD is the number of elements that are actually
24229 ??? The heuristic used is to do above only if number of constants
24230 is at least half the total number of elements. May need fine tuning. */
24233 aarch64_sve_expand_vector_init_handle_trailing_constants
24234 (rtx target
, const rtx_vector_builder
&builder
, int nelts
, int nelts_reqd
)
24236 machine_mode mode
= GET_MODE (target
);
24237 scalar_mode elem_mode
= GET_MODE_INNER (mode
);
24238 int n_trailing_constants
= 0;
24240 for (int i
= nelts_reqd
- 1;
24241 i
>= 0 && valid_for_const_vector_p (elem_mode
, builder
.elt (i
));
24243 n_trailing_constants
++;
24245 if (n_trailing_constants
>= nelts_reqd
/ 2)
24247 /* Try to use the natural pattern of BUILDER to extend the trailing
24248 constant elements to a full vector. Replace any variables in the
24249 extra elements with zeros.
24251 ??? It would be better if the builders supported "don't care"
24252 elements, with the builder filling in whichever elements
24253 give the most compact encoding. */
24254 rtx_vector_builder
v (mode
, nelts
, 1);
24255 for (int i
= 0; i
< nelts
; i
++)
24257 rtx x
= builder
.elt (i
+ nelts_reqd
- n_trailing_constants
);
24258 if (!valid_for_const_vector_p (elem_mode
, x
))
24259 x
= CONST0_RTX (elem_mode
);
24262 rtx const_vec
= v
.build ();
24263 emit_move_insn (target
, const_vec
);
24265 for (int i
= nelts_reqd
- n_trailing_constants
- 1; i
>= 0; i
--)
24266 emit_insr (target
, builder
.elt (i
));
24274 /* Subroutine of aarch64_sve_expand_vector_init.
24276 (a) Initialize TARGET by broadcasting element NELTS_REQD - 1 of BUILDER.
24277 (b) Skip trailing elements from BUILDER, which are the same as
24278 element NELTS_REQD - 1.
24279 (c) Insert earlier elements in reverse order in TARGET using insr. */
24282 aarch64_sve_expand_vector_init_insert_elems (rtx target
,
24283 const rtx_vector_builder
&builder
,
24286 machine_mode mode
= GET_MODE (target
);
24287 scalar_mode elem_mode
= GET_MODE_INNER (mode
);
24289 struct expand_operand ops
[2];
24290 enum insn_code icode
= optab_handler (vec_duplicate_optab
, mode
);
24291 gcc_assert (icode
!= CODE_FOR_nothing
);
24293 create_output_operand (&ops
[0], target
, mode
);
24294 create_input_operand (&ops
[1], builder
.elt (nelts_reqd
- 1), elem_mode
);
24295 expand_insn (icode
, 2, ops
);
24297 int ndups
= builder
.count_dups (nelts_reqd
- 1, -1, -1);
24298 for (int i
= nelts_reqd
- ndups
- 1; i
>= 0; i
--)
24299 emit_insr (target
, builder
.elt (i
));
24302 /* Subroutine of aarch64_sve_expand_vector_init to handle case
24303 when all trailing elements of builder are same.
24304 This works as follows:
24305 (a) Use expand_insn interface to broadcast last vector element in TARGET.
24306 (b) Insert remaining elements in TARGET using insr.
24308 ??? The heuristic used is to do above if number of same trailing elements
24309 is at least 3/4 of total number of elements, loosely based on
24310 heuristic from mostly_zeros_p. May need fine-tuning. */
24313 aarch64_sve_expand_vector_init_handle_trailing_same_elem
24314 (rtx target
, const rtx_vector_builder
&builder
, int nelts_reqd
)
24316 int ndups
= builder
.count_dups (nelts_reqd
- 1, -1, -1);
24317 if (ndups
>= (3 * nelts_reqd
) / 4)
24319 aarch64_sve_expand_vector_init_insert_elems (target
, builder
,
24320 nelts_reqd
- ndups
+ 1);
24327 /* Initialize register TARGET from BUILDER. NELTS is the constant number
24328 of elements in BUILDER.
24330 The function tries to initialize TARGET from BUILDER if it fits one
24331 of the special cases outlined below.
24333 Failing that, the function divides BUILDER into two sub-vectors:
24334 v_even = even elements of BUILDER;
24335 v_odd = odd elements of BUILDER;
24337 and recursively calls itself with v_even and v_odd.
24339 if (recursive call succeeded for v_even or v_odd)
24340 TARGET = zip (v_even, v_odd)
24342 The function returns true if it managed to build TARGET from BUILDER
24343 with one of the special cases, false otherwise.
24345 Example: {a, 1, b, 2, c, 3, d, 4}
24347 The vector gets divided into:
24348 v_even = {a, b, c, d}
24349 v_odd = {1, 2, 3, 4}
24351 aarch64_sve_expand_vector_init(v_odd) hits case 1 and
24352 initialize tmp2 from constant vector v_odd using emit_move_insn.
24354 aarch64_sve_expand_vector_init(v_even) fails since v_even contains
24355 4 elements, so we construct tmp1 from v_even using insr:
24362 TARGET = zip (tmp1, tmp2)
24363 which sets TARGET to {a, 1, b, 2, c, 3, d, 4}. */
24366 aarch64_sve_expand_vector_init (rtx target
, const rtx_vector_builder
&builder
,
24367 int nelts
, int nelts_reqd
)
24369 machine_mode mode
= GET_MODE (target
);
24371 /* Case 1: Vector contains trailing constants. */
24373 if (aarch64_sve_expand_vector_init_handle_trailing_constants
24374 (target
, builder
, nelts
, nelts_reqd
))
24377 /* Case 2: Vector contains leading constants. */
24379 rtx_vector_builder
rev_builder (mode
, nelts_reqd
, 1);
24380 for (int i
= 0; i
< nelts_reqd
; i
++)
24381 rev_builder
.quick_push (builder
.elt (nelts_reqd
- i
- 1));
24382 rev_builder
.finalize ();
24384 if (aarch64_sve_expand_vector_init_handle_trailing_constants
24385 (target
, rev_builder
, nelts
, nelts_reqd
))
24387 emit_insn (gen_aarch64_sve_rev (mode
, target
, target
));
24391 /* Case 3: Vector contains trailing same element. */
24393 if (aarch64_sve_expand_vector_init_handle_trailing_same_elem
24394 (target
, builder
, nelts_reqd
))
24397 /* Case 4: Vector contains leading same element. */
24399 if (aarch64_sve_expand_vector_init_handle_trailing_same_elem
24400 (target
, rev_builder
, nelts_reqd
) && nelts_reqd
== nelts
)
24402 emit_insn (gen_aarch64_sve_rev (mode
, target
, target
));
24406 /* Avoid recursing below 4-elements.
24407 ??? The threshold 4 may need fine-tuning. */
24409 if (nelts_reqd
<= 4)
24412 rtx_vector_builder
v_even (mode
, nelts
, 1);
24413 rtx_vector_builder
v_odd (mode
, nelts
, 1);
24415 for (int i
= 0; i
< nelts
* 2; i
+= 2)
24417 v_even
.quick_push (builder
.elt (i
));
24418 v_odd
.quick_push (builder
.elt (i
+ 1));
24421 v_even
.finalize ();
24424 rtx tmp1
= gen_reg_rtx (mode
);
24425 bool did_even_p
= aarch64_sve_expand_vector_init (tmp1
, v_even
,
24426 nelts
, nelts_reqd
/ 2);
24428 rtx tmp2
= gen_reg_rtx (mode
);
24429 bool did_odd_p
= aarch64_sve_expand_vector_init (tmp2
, v_odd
,
24430 nelts
, nelts_reqd
/ 2);
24432 if (!did_even_p
&& !did_odd_p
)
24435 /* Initialize v_even and v_odd using INSR if it didn't match any of the
24436 special cases and zip v_even, v_odd. */
24439 aarch64_sve_expand_vector_init_insert_elems (tmp1
, v_even
, nelts_reqd
/ 2);
24442 aarch64_sve_expand_vector_init_insert_elems (tmp2
, v_odd
, nelts_reqd
/ 2);
24444 rtvec v
= gen_rtvec (2, tmp1
, tmp2
);
24445 emit_set_insn (target
, gen_rtx_UNSPEC (mode
, v
, UNSPEC_ZIP1
));
24449 /* Initialize register TARGET from the elements in PARALLEL rtx VALS. */
24452 aarch64_sve_expand_vector_init (rtx target
, rtx vals
)
24454 machine_mode mode
= GET_MODE (target
);
24455 int nelts
= XVECLEN (vals
, 0);
24457 rtx_vector_builder
v (mode
, nelts
, 1);
24458 for (int i
= 0; i
< nelts
; i
++)
24459 v
.quick_push (XVECEXP (vals
, 0, i
));
24462 /* If neither sub-vectors of v could be initialized specially,
24463 then use INSR to insert all elements from v into TARGET.
24464 ??? This might not be optimal for vectors with large
24465 initializers like 16-element or above.
24466 For nelts < 4, it probably isn't useful to handle specially. */
24469 || !aarch64_sve_expand_vector_init (target
, v
, nelts
, nelts
))
24470 aarch64_sve_expand_vector_init_insert_elems (target
, v
, nelts
);
24473 /* Check whether VALUE is a vector constant in which every element
24474 is either a power of 2 or a negated power of 2. If so, return
24475 a constant vector of log2s, and flip CODE between PLUS and MINUS
24476 if VALUE contains negated powers of 2. Return NULL_RTX otherwise. */
24479 aarch64_convert_mult_to_shift (rtx value
, rtx_code
&code
)
24481 if (!CONST_VECTOR_P (value
))
24484 rtx_vector_builder builder
;
24485 if (!builder
.new_unary_operation (GET_MODE (value
), value
, false))
24488 scalar_mode int_mode
= GET_MODE_INNER (GET_MODE (value
));
24489 /* 1 if the result of the multiplication must be negated,
24490 0 if it mustn't, or -1 if we don't yet care. */
24492 unsigned int encoded_nelts
= const_vector_encoded_nelts (value
);
24493 for (unsigned int i
= 0; i
< encoded_nelts
; ++i
)
24495 rtx elt
= CONST_VECTOR_ENCODED_ELT (value
, i
);
24496 if (!CONST_SCALAR_INT_P (elt
))
24498 rtx_mode_t
val (elt
, int_mode
);
24499 wide_int pow2
= wi::neg (val
);
24502 /* It matters whether we negate or not. Make that choice,
24503 and make sure that it's consistent with previous elements. */
24504 if (negate
== !wi::neg_p (val
))
24506 negate
= wi::neg_p (val
);
24510 /* POW2 is now the value that we want to be a power of 2. */
24511 int shift
= wi::exact_log2 (pow2
);
24514 builder
.quick_push (gen_int_mode (shift
, int_mode
));
24517 /* PLUS and MINUS are equivalent; canonicalize on PLUS. */
24519 else if (negate
== 1)
24520 code
= code
== PLUS
? MINUS
: PLUS
;
24521 return builder
.build ();
24524 /* Prepare for an integer SVE multiply-add or multiply-subtract pattern;
24525 CODE is PLUS for the former and MINUS for the latter. OPERANDS is the
24526 operands array, in the same order as for fma_optab. Return true if
24527 the function emitted all the necessary instructions, false if the caller
24528 should generate the pattern normally with the new OPERANDS array. */
24531 aarch64_prepare_sve_int_fma (rtx
*operands
, rtx_code code
)
24533 machine_mode mode
= GET_MODE (operands
[0]);
24534 if (rtx shifts
= aarch64_convert_mult_to_shift (operands
[2], code
))
24536 rtx product
= expand_binop (mode
, vashl_optab
, operands
[1], shifts
,
24537 NULL_RTX
, true, OPTAB_DIRECT
);
24538 force_expand_binop (mode
, code
== PLUS
? add_optab
: sub_optab
,
24539 operands
[3], product
, operands
[0], true,
24543 operands
[2] = force_reg (mode
, operands
[2]);
24547 /* Likewise, but for a conditional pattern. */
24550 aarch64_prepare_sve_cond_int_fma (rtx
*operands
, rtx_code code
)
24552 machine_mode mode
= GET_MODE (operands
[0]);
24553 if (rtx shifts
= aarch64_convert_mult_to_shift (operands
[3], code
))
24555 rtx product
= expand_binop (mode
, vashl_optab
, operands
[2], shifts
,
24556 NULL_RTX
, true, OPTAB_DIRECT
);
24557 emit_insn (gen_cond (code
, mode
, operands
[0], operands
[1],
24558 operands
[4], product
, operands
[5]));
24561 operands
[3] = force_reg (mode
, operands
[3]);
static unsigned HOST_WIDE_INT
aarch64_shift_truncation_mask (machine_mode mode)
{
  if (!SHIFT_COUNT_TRUNCATED || aarch64_vector_data_mode_p (mode))
    return 0;
  return GET_MODE_UNIT_BITSIZE (mode) - 1;
}
24573 /* Select a format to encode pointers in exception handling data. */
24575 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED
, int global
)
24578 switch (aarch64_cmodel
)
24580 case AARCH64_CMODEL_TINY
:
24581 case AARCH64_CMODEL_TINY_PIC
:
24582 case AARCH64_CMODEL_SMALL
:
24583 case AARCH64_CMODEL_SMALL_PIC
:
24584 case AARCH64_CMODEL_SMALL_SPIC
:
24585 /* text+got+data < 4Gb. 4-byte signed relocs are sufficient
24587 type
= DW_EH_PE_sdata4
;
24590 /* No assumptions here. 8-byte relocs required. */
24591 type
= DW_EH_PE_sdata8
;
24594 return (global
? DW_EH_PE_indirect
: 0) | DW_EH_PE_pcrel
| type
;
/* Output .variant_pcs for aarch64_vector_pcs function symbols.  */
static void
aarch64_asm_output_variant_pcs (FILE *stream, const tree decl, const char* name)
{
  if (TREE_CODE (decl) == FUNCTION_DECL)
    {
      arm_pcs pcs = (arm_pcs) fndecl_abi (decl).id ();
      if (pcs == ARM_PCS_SIMD || pcs == ARM_PCS_SVE)
        {
          fprintf (stream, "\t.variant_pcs\t");
          assemble_name (stream, name);
          fprintf (stream, "\n");
        }
    }
}

/* The last .arch and .tune assembly strings that we printed.  */
static std::string aarch64_last_printed_arch_string;
static std::string aarch64_last_printed_tune_string;
24618 /* Implement ASM_DECLARE_FUNCTION_NAME. Output the ISA features used
24619 by the function fndecl. */
24622 aarch64_declare_function_name (FILE *stream
, const char* name
,
24625 tree target_parts
= DECL_FUNCTION_SPECIFIC_TARGET (fndecl
);
24627 struct cl_target_option
*targ_options
;
24629 targ_options
= TREE_TARGET_OPTION (target_parts
);
24631 targ_options
= TREE_TARGET_OPTION (target_option_current_node
);
24632 gcc_assert (targ_options
);
24634 const struct processor
*this_arch
24635 = aarch64_get_arch (targ_options
->x_selected_arch
);
24637 auto isa_flags
= aarch64_get_asm_isa_flags (targ_options
);
24638 std::string extension
24639 = aarch64_get_extension_string_for_isa_flags (isa_flags
,
24641 /* Only update the assembler .arch string if it is distinct from the last
24642 such string we printed. */
24643 std::string to_print
= this_arch
->name
+ extension
;
24644 if (to_print
!= aarch64_last_printed_arch_string
)
24646 asm_fprintf (asm_out_file
, "\t.arch %s\n", to_print
.c_str ());
24647 aarch64_last_printed_arch_string
= to_print
;
24650 /* Print the cpu name we're tuning for in the comments, might be
24651 useful to readers of the generated asm. Do it only when it changes
24652 from function to function and verbose assembly is requested. */
24653 const struct processor
*this_tune
24654 = aarch64_get_tune_cpu (targ_options
->x_selected_tune
);
24656 if (flag_debug_asm
&& aarch64_last_printed_tune_string
!= this_tune
->name
)
24658 asm_fprintf (asm_out_file
, "\t" ASM_COMMENT_START
".tune %s\n",
24660 aarch64_last_printed_tune_string
= this_tune
->name
;
24663 aarch64_asm_output_variant_pcs (stream
, fndecl
, name
);
24665 /* Don't forget the type directive for ELF. */
24666 ASM_OUTPUT_TYPE_DIRECTIVE (stream
, name
, "function");
24667 ASM_OUTPUT_FUNCTION_LABEL (stream
, name
, fndecl
);
24669 cfun
->machine
->label_is_assembled
= true;
24672 /* Implement PRINT_PATCHABLE_FUNCTION_ENTRY. */
24675 aarch64_print_patchable_function_entry (FILE *file
,
24676 unsigned HOST_WIDE_INT patch_area_size
,
24679 if (!cfun
->machine
->label_is_assembled
)
24681 /* Emit the patching area before the entry label, if any. */
24682 default_print_patchable_function_entry (file
, patch_area_size
,
24687 rtx pa
= gen_patchable_area (GEN_INT (patch_area_size
),
24688 GEN_INT (record_p
));
24689 basic_block bb
= ENTRY_BLOCK_PTR_FOR_FN (cfun
)->next_bb
;
24691 if (!aarch_bti_enabled ()
24692 || cgraph_node::get (cfun
->decl
)->only_called_directly_p ())
24694 /* Emit the patchable_area at the beginning of the function. */
24695 rtx_insn
*insn
= emit_insn_before (pa
, BB_HEAD (bb
));
24696 INSN_ADDRESSES_NEW (insn
, -1);
24700 rtx_insn
*insn
= next_real_nondebug_insn (get_insns ());
24703 || GET_CODE (PATTERN (insn
)) != UNSPEC_VOLATILE
24704 || XINT (PATTERN (insn
), 1) != UNSPECV_BTI_C
)
24706 /* Emit a BTI_C. */
24707 insn
= emit_insn_before (gen_bti_c (), BB_HEAD (bb
));
24710 /* Emit the patchable_area after BTI_C. */
24711 insn
= emit_insn_after (pa
, insn
);
24712 INSN_ADDRESSES_NEW (insn
, -1);
24715 /* Output patchable area. */
24718 aarch64_output_patchable_area (unsigned int patch_area_size
, bool record_p
)
24720 default_print_patchable_function_entry (asm_out_file
, patch_area_size
,
24724 /* Implement ASM_OUTPUT_DEF_FROM_DECLS. Output .variant_pcs for aliases. */
24727 aarch64_asm_output_alias (FILE *stream
, const tree decl
, const tree target
)
24729 const char *name
= XSTR (XEXP (DECL_RTL (decl
), 0), 0);
24730 const char *value
= IDENTIFIER_POINTER (target
);
24731 aarch64_asm_output_variant_pcs (stream
, decl
, name
);
24732 ASM_OUTPUT_DEF (stream
, name
, value
);
24735 /* Implement ASM_OUTPUT_EXTERNAL. Output .variant_pcs for undefined
24736 function symbol references. */
24739 aarch64_asm_output_external (FILE *stream
, tree decl
, const char* name
)
24741 default_elf_asm_output_external (stream
, decl
, name
);
24742 aarch64_asm_output_variant_pcs (stream
, decl
, name
);
24745 /* Triggered after a .cfi_startproc directive is emitted into the assembly file.
24746 Used to output the .cfi_b_key_frame directive when signing the current
24747 function with the B key. */
24750 aarch64_post_cfi_startproc (FILE *f
, tree ignored ATTRIBUTE_UNUSED
)
24752 if (cfun
->machine
->frame
.laid_out
&& aarch64_return_address_signing_enabled ()
24753 && aarch64_ra_sign_key
== AARCH64_KEY_B
)
24754 asm_fprintf (f
, "\t.cfi_b_key_frame\n");
24757 /* Implements TARGET_ASM_FILE_START. Output the assembly header. */
24760 aarch64_start_file (void)
24762 struct cl_target_option
*default_options
24763 = TREE_TARGET_OPTION (target_option_default_node
);
24765 const struct processor
*default_arch
24766 = aarch64_get_arch (default_options
->x_selected_arch
);
24767 auto default_isa_flags
= aarch64_get_asm_isa_flags (default_options
);
24768 std::string extension
24769 = aarch64_get_extension_string_for_isa_flags (default_isa_flags
,
24770 default_arch
->flags
);
24772 aarch64_last_printed_arch_string
= default_arch
->name
+ extension
;
24773 aarch64_last_printed_tune_string
= "";
24774 asm_fprintf (asm_out_file
, "\t.arch %s\n",
24775 aarch64_last_printed_arch_string
.c_str ());
24777 default_file_start ();
24780 /* Emit load exclusive. */
24783 aarch64_emit_load_exclusive (machine_mode mode
, rtx rval
,
24784 rtx mem
, rtx model_rtx
)
24786 if (mode
== TImode
)
24787 emit_insn (gen_aarch64_load_exclusive_pair (gen_lowpart (DImode
, rval
),
24788 gen_highpart (DImode
, rval
),
24791 emit_insn (gen_aarch64_load_exclusive (mode
, rval
, mem
, model_rtx
));
24794 /* Emit store exclusive. */
24797 aarch64_emit_store_exclusive (machine_mode mode
, rtx bval
,
24798 rtx mem
, rtx rval
, rtx model_rtx
)
24800 if (mode
== TImode
)
24801 emit_insn (gen_aarch64_store_exclusive_pair
24802 (bval
, mem
, operand_subword (rval
, 0, 0, TImode
),
24803 operand_subword (rval
, 1, 0, TImode
), model_rtx
));
24805 emit_insn (gen_aarch64_store_exclusive (mode
, bval
, mem
, rval
, model_rtx
));
/* Mark the previous jump instruction as unlikely.  */
static void
aarch64_emit_unlikely_jump (rtx insn)
{
  rtx_insn *jump = emit_jump_insn (insn);
  add_reg_br_prob_note (jump, profile_probability::very_unlikely ());
}
24817 /* We store the names of the various atomic helpers in a 5x5 array.
24818 Return the libcall function given MODE, MODEL and NAMES. */
24821 aarch64_atomic_ool_func(machine_mode mode
, rtx model_rtx
,
24822 const atomic_ool_names
*names
)
24824 memmodel model
= memmodel_from_int (INTVAL (model_rtx
));
24825 int mode_idx
, model_idx
;
24845 gcc_unreachable ();
24850 case MEMMODEL_RELAXED
:
24853 case MEMMODEL_CONSUME
:
24854 case MEMMODEL_ACQUIRE
:
24857 case MEMMODEL_RELEASE
:
24860 case MEMMODEL_ACQ_REL
:
24861 case MEMMODEL_SEQ_CST
:
24864 case MEMMODEL_SYNC_ACQUIRE
:
24865 case MEMMODEL_SYNC_RELEASE
:
24866 case MEMMODEL_SYNC_SEQ_CST
:
24870 gcc_unreachable ();
24873 return init_one_libfunc_visibility (names
->str
[mode_idx
][model_idx
],
24874 VISIBILITY_HIDDEN
);
24877 #define DEF0(B, N) \
24878 { "__aarch64_" #B #N "_relax", \
24879 "__aarch64_" #B #N "_acq", \
24880 "__aarch64_" #B #N "_rel", \
24881 "__aarch64_" #B #N "_acq_rel", \
24882 "__aarch64_" #B #N "_sync" }
24884 #define DEF4(B) DEF0(B, 1), DEF0(B, 2), DEF0(B, 4), DEF0(B, 8), \
24885 { NULL, NULL, NULL, NULL }
24886 #define DEF5(B) DEF0(B, 1), DEF0(B, 2), DEF0(B, 4), DEF0(B, 8), DEF0(B, 16)
24888 static const atomic_ool_names aarch64_ool_cas_names
= { { DEF5(cas
) } };
24889 const atomic_ool_names aarch64_ool_swp_names
= { { DEF4(swp
) } };
24890 const atomic_ool_names aarch64_ool_ldadd_names
= { { DEF4(ldadd
) } };
24891 const atomic_ool_names aarch64_ool_ldset_names
= { { DEF4(ldset
) } };
24892 const atomic_ool_names aarch64_ool_ldclr_names
= { { DEF4(ldclr
) } };
24893 const atomic_ool_names aarch64_ool_ldeor_names
= { { DEF4(ldeor
) } };
24899 /* Expand a compare and swap pattern. */
24902 aarch64_expand_compare_and_swap (rtx operands
[])
24904 rtx bval
, rval
, mem
, oldval
, newval
, is_weak
, mod_s
, mod_f
, x
, cc_reg
;
24905 machine_mode mode
, r_mode
;
24907 bval
= operands
[0];
24908 rval
= operands
[1];
24910 oldval
= operands
[3];
24911 newval
= operands
[4];
24912 is_weak
= operands
[5];
24913 mod_s
= operands
[6];
24914 mod_f
= operands
[7];
24915 mode
= GET_MODE (mem
);
24917 /* Normally the succ memory model must be stronger than fail, but in the
24918 unlikely event of fail being ACQUIRE and succ being RELEASE we need to
24919 promote succ to ACQ_REL so that we don't lose the acquire semantics. */
24920 if (is_mm_acquire (memmodel_from_int (INTVAL (mod_f
)))
24921 && is_mm_release (memmodel_from_int (INTVAL (mod_s
))))
24922 mod_s
= GEN_INT (MEMMODEL_ACQ_REL
);
24925 if (mode
== QImode
|| mode
== HImode
)
24928 rval
= gen_reg_rtx (r_mode
);
24933 /* The CAS insn requires oldval and rval overlap, but we need to
24934 have a copy of oldval saved across the operation to tell if
24935 the operation is successful. */
24936 if (reg_overlap_mentioned_p (rval
, oldval
))
24937 rval
= copy_to_mode_reg (r_mode
, oldval
);
24939 emit_move_insn (rval
, gen_lowpart (r_mode
, oldval
));
24940 if (mode
== TImode
)
24941 newval
= force_reg (mode
, newval
);
24943 emit_insn (gen_aarch64_compare_and_swap_lse (mode
, rval
, mem
,
24945 cc_reg
= aarch64_gen_compare_reg_maybe_ze (NE
, rval
, oldval
, mode
);
24947 else if (TARGET_OUTLINE_ATOMICS
)
24949 /* Oldval must satisfy compare afterward. */
24950 if (!aarch64_plus_operand (oldval
, mode
))
24951 oldval
= force_reg (mode
, oldval
);
24952 rtx func
= aarch64_atomic_ool_func (mode
, mod_s
, &aarch64_ool_cas_names
);
24953 rval
= emit_library_call_value (func
, NULL_RTX
, LCT_NORMAL
, r_mode
,
24954 oldval
, mode
, newval
, mode
,
24955 XEXP (mem
, 0), Pmode
);
24956 cc_reg
= aarch64_gen_compare_reg_maybe_ze (NE
, rval
, oldval
, mode
);
24960 /* The oldval predicate varies by mode. Test it and force to reg. */
24961 insn_code code
= code_for_aarch64_compare_and_swap (mode
);
24962 if (!insn_data
[code
].operand
[2].predicate (oldval
, mode
))
24963 oldval
= force_reg (mode
, oldval
);
24965 emit_insn (GEN_FCN (code
) (rval
, mem
, oldval
, newval
,
24966 is_weak
, mod_s
, mod_f
));
24967 cc_reg
= gen_rtx_REG (CCmode
, CC_REGNUM
);
24970 if (r_mode
!= mode
)
24971 rval
= gen_lowpart (mode
, rval
);
24972 emit_move_insn (operands
[1], rval
);
24974 x
= gen_rtx_EQ (SImode
, cc_reg
, const0_rtx
);
24975 emit_insn (gen_rtx_SET (bval
, x
));
24978 /* Emit a barrier, that is appropriate for memory model MODEL, at the end of a
24979 sequence implementing an atomic operation. */
24982 aarch64_emit_post_barrier (enum memmodel model
)
24984 const enum memmodel base_model
= memmodel_base (model
);
24986 if (is_mm_sync (model
)
24987 && (base_model
== MEMMODEL_ACQUIRE
24988 || base_model
== MEMMODEL_ACQ_REL
24989 || base_model
== MEMMODEL_SEQ_CST
))
24991 emit_insn (gen_mem_thread_fence (GEN_INT (MEMMODEL_SEQ_CST
)));
24995 /* Split a compare and swap pattern. */
24998 aarch64_split_compare_and_swap (rtx operands
[])
25000 /* Split after prolog/epilog to avoid interactions with shrinkwrapping. */
25001 gcc_assert (epilogue_completed
);
25003 rtx rval
, mem
, oldval
, newval
, scratch
, x
, model_rtx
;
25006 rtx_code_label
*label1
, *label2
;
25007 enum memmodel model
;
25009 rval
= operands
[0];
25011 oldval
= operands
[2];
25012 newval
= operands
[3];
25013 model_rtx
= operands
[5];
25014 scratch
= operands
[7];
25015 mode
= GET_MODE (mem
);
25016 model
= memmodel_from_int (INTVAL (model_rtx
));
25017 is_weak
= operands
[4] != const0_rtx
&& mode
!= TImode
;
25019 /* When OLDVAL is zero and we want the strong version we can emit a tighter
25022 LD[A]XR rval, [mem]
25024 ST[L]XR scratch, newval, [mem]
25025 CBNZ scratch, .label1
25028 bool strong_zero_p
= (!is_weak
&& !aarch64_track_speculation
&&
25029 oldval
== const0_rtx
&& mode
!= TImode
);
25034 label1
= gen_label_rtx ();
25035 emit_label (label1
);
25037 label2
= gen_label_rtx ();
25039 /* The initial load can be relaxed for a __sync operation since a final
25040 barrier will be emitted to stop code hoisting. */
25041 if (is_mm_sync (model
))
25042 aarch64_emit_load_exclusive (mode
, rval
, mem
, GEN_INT (MEMMODEL_RELAXED
));
25044 aarch64_emit_load_exclusive (mode
, rval
, mem
, model_rtx
);
25047 x
= gen_rtx_NE (VOIDmode
, rval
, const0_rtx
);
25050 rtx cc_reg
= aarch64_gen_compare_reg_maybe_ze (NE
, rval
, oldval
, mode
);
25051 x
= gen_rtx_NE (VOIDmode
, cc_reg
, const0_rtx
);
25053 x
= gen_rtx_IF_THEN_ELSE (VOIDmode
, x
,
25054 gen_rtx_LABEL_REF (Pmode
, label2
), pc_rtx
);
25055 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx
, x
));
25057 aarch64_emit_store_exclusive (mode
, scratch
, mem
, newval
, model_rtx
);
25061 x
= aarch64_gen_compare_zero_and_branch (NE
, scratch
, label1
);
25062 aarch64_emit_unlikely_jump (x
);
25065 aarch64_gen_compare_reg (NE
, scratch
, const0_rtx
);
25067 /* 128-bit LDAXP is not atomic unless STLXP succeeds. So for a mismatch,
25068 store the returned value and loop if the STLXP fails. */
25069 if (mode
== TImode
)
25071 rtx_code_label
*label3
= gen_label_rtx ();
25072 emit_jump_insn (gen_rtx_SET (pc_rtx
, gen_rtx_LABEL_REF (Pmode
, label3
)));
25075 emit_label (label2
);
25076 aarch64_emit_store_exclusive (mode
, scratch
, mem
, rval
, model_rtx
);
25078 x
= aarch64_gen_compare_zero_and_branch (NE
, scratch
, label1
);
25079 aarch64_emit_unlikely_jump (x
);
25084 emit_label (label2
);
25086 /* If we used a CBNZ in the exchange loop emit an explicit compare with RVAL
25087 to set the condition flags. If this is not used it will be removed by
25090 aarch64_gen_compare_reg (NE
, rval
, const0_rtx
);
25092 /* Emit any final barrier needed for a __sync operation. */
25093 if (is_mm_sync (model
))
25094 aarch64_emit_post_barrier (model
);
/* Split an atomic operation.  */

void
aarch64_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
			 rtx value, rtx model_rtx, rtx cond)
{
  /* Split after prolog/epilog to avoid interactions with shrinkwrapping.  */
  gcc_assert (epilogue_completed);

  machine_mode mode = GET_MODE (mem);
  machine_mode wmode = (mode == DImode ? DImode : SImode);
  const enum memmodel model = memmodel_from_int (INTVAL (model_rtx));
  const bool is_sync = is_mm_sync (model);
  rtx_code_label *label;
  rtx x;

  /* Split the atomic operation into a sequence.  */
  label = gen_label_rtx ();
  emit_label (label);

  if (new_out)
    new_out = gen_lowpart (wmode, new_out);
  if (old_out)
    old_out = gen_lowpart (wmode, old_out);
  else
    old_out = new_out;
  value = simplify_gen_subreg (wmode, value, mode, 0);

  /* The initial load can be relaxed for a __sync operation since a final
     barrier will be emitted to stop code hoisting.  */
  if (is_sync)
    aarch64_emit_load_exclusive (mode, old_out, mem,
				 GEN_INT (MEMMODEL_RELAXED));
  else
    aarch64_emit_load_exclusive (mode, old_out, mem, model_rtx);

  switch (code)
    {
    case SET:
      new_out = value;
      break;

    case NOT:
      x = gen_rtx_AND (wmode, old_out, value);
      emit_insn (gen_rtx_SET (new_out, x));
      x = gen_rtx_NOT (wmode, new_out);
      emit_insn (gen_rtx_SET (new_out, x));
      break;

    case MINUS:
      if (CONST_INT_P (value))
	{
	  value = GEN_INT (-UINTVAL (value));
	  code = PLUS;
	}
      /* Fall through.  */

    default:
      x = gen_rtx_fmt_ee (code, wmode, old_out, value);
      emit_insn (gen_rtx_SET (new_out, x));
      break;
    }

  aarch64_emit_store_exclusive (mode, cond, mem,
				gen_lowpart (mode, new_out), model_rtx);

  x = aarch64_gen_compare_zero_and_branch (NE, cond, label);
  aarch64_emit_unlikely_jump (x);

  /* Emit any final barrier needed for a __sync operation.  */
  if (is_sync)
    aarch64_emit_post_barrier (model);
}
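/* Illustrative note (not part of the original source): for example, a
   32-bit __atomic_fetch_add with RELAXED ordering is split into a
   load/store-exclusive loop along the lines of (register numbering is
   arbitrary):

	.L1:	ldxr	w0, [x2]	// old_out = *mem
		add	w1, w0, w3	// new_out = old_out + value
		stxr	w4, w1, [x2]	// cond = store-exclusive status
		cbnz	w4, .L1		// retry if the store failed

   SET, NOT and MINUS get the special handling shown in the switch
   statement above; every other rtx code maps directly onto the
   corresponding data-processing instruction.  */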
static void
aarch64_init_libfuncs (void)
{
  /* Half-precision float operations.  The compiler handles all operations
     with NULL libfuncs by converting to SFmode.  */

  /* Conversions.  */
  set_conv_libfunc (trunc_optab, HFmode, SFmode, "__gnu_f2h_ieee");
  set_conv_libfunc (sext_optab, SFmode, HFmode, "__gnu_h2f_ieee");

  /* Arithmetic.  */
  set_optab_libfunc (add_optab, HFmode, NULL);
  set_optab_libfunc (sdiv_optab, HFmode, NULL);
  set_optab_libfunc (smul_optab, HFmode, NULL);
  set_optab_libfunc (neg_optab, HFmode, NULL);
  set_optab_libfunc (sub_optab, HFmode, NULL);

  /* Comparisons.  */
  set_optab_libfunc (eq_optab, HFmode, NULL);
  set_optab_libfunc (ne_optab, HFmode, NULL);
  set_optab_libfunc (lt_optab, HFmode, NULL);
  set_optab_libfunc (le_optab, HFmode, NULL);
  set_optab_libfunc (ge_optab, HFmode, NULL);
  set_optab_libfunc (gt_optab, HFmode, NULL);
  set_optab_libfunc (unord_optab, HFmode, NULL);
}
/* Target hook for c_mode_for_suffix.  */
static machine_mode
aarch64_c_mode_for_suffix (char suffix)
{
  if (suffix == 'q')
    return TFmode;

  return VOIDmode;
}

/* We can only represent floating point constants which will fit in
   "quarter-precision" values.  These values are characterised by
   a sign bit, a 4-bit mantissa and a 3-bit exponent.  And are given
   by:

     (-1)^s * (n/16) * 2^r

   where:
     's' is the sign bit.
     'n' is an integer in the range 16 <= n <= 31.
     'r' is an integer in the range -3 <= r <= 4.  */
/* Return true iff X can be represented by a quarter-precision
   floating point immediate operand X.  Note, we cannot represent 0.0.  */

bool
aarch64_float_const_representable_p (rtx x)
{
  /* This represents our current view of how many bits
     make up the mantissa.  */
  int point_pos = 2 * HOST_BITS_PER_WIDE_INT - 1;
  int exponent;
  unsigned HOST_WIDE_INT mantissa, mask;
  REAL_VALUE_TYPE r, m;
  bool fail;

  x = unwrap_const_vec_duplicate (x);
  if (!CONST_DOUBLE_P (x))
    return false;

  if (GET_MODE (x) == VOIDmode
      || (GET_MODE (x) == HFmode && !TARGET_FP_F16INST))
    return false;

  r = *CONST_DOUBLE_REAL_VALUE (x);

  /* We cannot represent infinities, NaNs or +/-zero.  We won't
     know if we have +zero until we analyse the mantissa, but we
     can reject the other invalid values.  */
  if (REAL_VALUE_ISINF (r) || REAL_VALUE_ISNAN (r)
      || REAL_VALUE_MINUS_ZERO (r))
    return false;

  /* For BFmode, only handle 0.0.  */
  if (GET_MODE (x) == BFmode)
    return real_iszero (&r, false);

  /* Extract exponent.  */
  r = real_value_abs (&r);
  exponent = REAL_EXP (&r);

  /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
     highest (sign) bit, with a fixed binary point at bit point_pos.
     m1 holds the low part of the mantissa, m2 the high part.
     WARNING: If we ever have a representation using more than 2 * H_W_I - 1
     bits for the mantissa, this can fail (low bits will be lost).  */
  real_ldexp (&m, &r, point_pos - exponent);
  wide_int w = real_to_integer (&m, &fail, HOST_BITS_PER_WIDE_INT * 2);

  /* If the low part of the mantissa has bits set we cannot represent
     the value.  */
  if (w.ulow () != 0)
    return false;
  /* We have rejected the lower HOST_WIDE_INT, so update our
     understanding of how many bits lie in the mantissa and
     look only at the high HOST_WIDE_INT.  */
  mantissa = w.elt (1);
  point_pos -= HOST_BITS_PER_WIDE_INT;

  /* We can only represent values with a mantissa of the form 1.xxxx.  */
  mask = ((unsigned HOST_WIDE_INT)1 << (point_pos - 5)) - 1;
  if ((mantissa & mask) != 0)
    return false;

  /* Having filtered unrepresentable values, we may now remove all
     but the highest 5 bits.  */
  mantissa >>= point_pos - 5;

  /* We cannot represent the value 0.0, so reject it.  This is handled
     elsewhere.  */
  if (mantissa == 0)
    return false;

  /* Then, as bit 4 is always set, we can mask it off, leaving
     the mantissa in the range [0, 15].  */
  mantissa &= ~(1 << 4);
  gcc_assert (mantissa <= 15);

  /* GCC internally does not use IEEE754-like encoding (where normalized
     significands are in the range [1, 2).  GCC uses [0.5, 1) (see real.cc).
     Our mantissa values are shifted 4 places to the left relative to
     normalized IEEE754 so we must modify the exponent returned by REAL_EXP
     by 5 places to correct for GCC's representation.  */
  exponent = 5 - exponent;

  return (exponent >= 0 && exponent <= 7);
}
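/* Illustrative note (not part of the original source): plugging numbers
   into the (-1)^s * (n/16) * 2^r form above, the representable range is
   +/-0.125 (n = 16, r = -3) up to +/-31.0 (n = 31, r = 4).  For example
   1.0 = (16/16) * 2^0 and 0.5 = (16/16) * 2^-1 are accepted, whereas
   0.1 has no exact (n, r) pair, and 0.0 is rejected here and handled by
   the other move paths instead.  */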
25305 /* Returns the string with the instruction for AdvSIMD MOVI, MVNI, ORR or BIC
25306 immediate with a CONST_VECTOR of MODE and WIDTH. WHICH selects whether to
25307 output MOVI/MVNI, ORR or BIC immediate. */
25309 aarch64_output_simd_mov_immediate (rtx const_vector
, unsigned width
,
25310 enum simd_immediate_check which
)
25313 static char templ
[40];
25314 const char *mnemonic
;
25315 const char *shift_op
;
25316 unsigned int lane_count
= 0;
25319 struct simd_immediate_info info
;
25321 /* This will return true to show const_vector is legal for use as either
25322 a AdvSIMD MOVI instruction (or, implicitly, MVNI), ORR or BIC immediate.
25323 It will also update INFO to show how the immediate should be generated.
25324 WHICH selects whether to check for MOVI/MVNI, ORR or BIC. */
25325 is_valid
= aarch64_simd_valid_immediate (const_vector
, &info
, which
);
25326 gcc_assert (is_valid
);
25328 element_char
= sizetochar (GET_MODE_BITSIZE (info
.elt_mode
));
25329 lane_count
= width
/ GET_MODE_BITSIZE (info
.elt_mode
);
25331 if (GET_MODE_CLASS (info
.elt_mode
) == MODE_FLOAT
)
25333 gcc_assert (info
.insn
== simd_immediate_info::MOV
25334 && info
.u
.mov
.shift
== 0);
25335 /* For FP zero change it to a CONST_INT 0 and use the integer SIMD
25336 move immediate path. */
25337 if (aarch64_float_const_zero_rtx_p (info
.u
.mov
.value
))
25338 info
.u
.mov
.value
= GEN_INT (0);
25341 const unsigned int buf_size
= 20;
25342 char float_buf
[buf_size
] = {'\0'};
25343 real_to_decimal_for_mode (float_buf
,
25344 CONST_DOUBLE_REAL_VALUE (info
.u
.mov
.value
),
25345 buf_size
, buf_size
, 1, info
.elt_mode
);
25347 if (lane_count
== 1)
25348 snprintf (templ
, sizeof (templ
), "fmov\t%%d0, %s", float_buf
);
25350 snprintf (templ
, sizeof (templ
), "fmov\t%%0.%d%c, %s",
25351 lane_count
, element_char
, float_buf
);
25356 gcc_assert (CONST_INT_P (info
.u
.mov
.value
));
25358 if (which
== AARCH64_CHECK_MOV
)
25360 if (info
.insn
== simd_immediate_info::INDEX
)
25362 gcc_assert (TARGET_SVE
);
25363 snprintf (templ
, sizeof (templ
), "index\t%%Z0.%c, #"
25364 HOST_WIDE_INT_PRINT_DEC
", #" HOST_WIDE_INT_PRINT_DEC
,
25365 element_char
, INTVAL (info
.u
.index
.base
),
25366 INTVAL (info
.u
.index
.step
));
25370 mnemonic
= info
.insn
== simd_immediate_info::MVN
? "mvni" : "movi";
25371 shift_op
= (info
.u
.mov
.modifier
== simd_immediate_info::MSL
25373 if (lane_count
== 1)
25374 snprintf (templ
, sizeof (templ
), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX
,
25375 mnemonic
, UINTVAL (info
.u
.mov
.value
));
25376 else if (info
.u
.mov
.shift
)
25377 snprintf (templ
, sizeof (templ
), "%s\t%%0.%d%c, "
25378 HOST_WIDE_INT_PRINT_HEX
", %s %d", mnemonic
, lane_count
,
25379 element_char
, UINTVAL (info
.u
.mov
.value
), shift_op
,
25382 snprintf (templ
, sizeof (templ
), "%s\t%%0.%d%c, "
25383 HOST_WIDE_INT_PRINT_HEX
, mnemonic
, lane_count
,
25384 element_char
, UINTVAL (info
.u
.mov
.value
));
25388 /* For AARCH64_CHECK_BIC and AARCH64_CHECK_ORR. */
25389 mnemonic
= info
.insn
== simd_immediate_info::MVN
? "bic" : "orr";
25390 if (info
.u
.mov
.shift
)
25391 snprintf (templ
, sizeof (templ
), "%s\t%%0.%d%c, #"
25392 HOST_WIDE_INT_PRINT_DEC
", %s #%d", mnemonic
, lane_count
,
25393 element_char
, UINTVAL (info
.u
.mov
.value
), "lsl",
25396 snprintf (templ
, sizeof (templ
), "%s\t%%0.%d%c, #"
25397 HOST_WIDE_INT_PRINT_DEC
, mnemonic
, lane_count
,
25398 element_char
, UINTVAL (info
.u
.mov
.value
));
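/* Illustrative note (not part of the original source): typical strings
   returned by aarch64_output_simd_mov_immediate include, for example,
   "movi\tv0.4s, 0x1, lsl 8" for a V4SI splat of 256, "mvni\tv0.4s, 0x1"
   for a splat of ~1, and "fmov\tv0.4s, 1.0e+0" for a floating-point
   splat that fits the 8-bit FP immediate encoding.  The operand
   placeholders (%0, %d0, %Z0) in the stored templates are substituted
   later by the output pass.  */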
25404 aarch64_output_scalar_simd_mov_immediate (rtx immediate
, scalar_int_mode mode
)
25407 /* If a floating point number was passed and we desire to use it in an
25408 integer mode do the conversion to integer. */
25409 if (CONST_DOUBLE_P (immediate
) && GET_MODE_CLASS (mode
) == MODE_INT
)
25411 unsigned HOST_WIDE_INT ival
;
25412 if (!aarch64_reinterpret_float_as_int (immediate
, &ival
))
25413 gcc_unreachable ();
25414 immediate
= gen_int_mode (ival
, mode
);
25417 machine_mode vmode
;
25418 /* use a 64 bit mode for everything except for DI/DF/DD mode, where we use
25419 a 128 bit vector mode. */
25420 int width
= GET_MODE_BITSIZE (mode
) == 64 ? 128 : 64;
25422 vmode
= aarch64_simd_container_mode (mode
, width
);
25423 rtx v_op
= aarch64_simd_gen_const_vector_dup (vmode
, INTVAL (immediate
));
25424 return aarch64_output_simd_mov_immediate (v_op
, width
);
25427 /* Return the output string to use for moving immediate CONST_VECTOR
25428 into an SVE register. */
25431 aarch64_output_sve_mov_immediate (rtx const_vector
)
25433 static char templ
[40];
25434 struct simd_immediate_info info
;
25437 bool is_valid
= aarch64_simd_valid_immediate (const_vector
, &info
);
25438 gcc_assert (is_valid
);
25440 element_char
= sizetochar (GET_MODE_BITSIZE (info
.elt_mode
));
25442 machine_mode vec_mode
= GET_MODE (const_vector
);
25443 if (aarch64_sve_pred_mode_p (vec_mode
))
25445 static char buf
[sizeof ("ptrue\t%0.N, vlNNNNN")];
25446 if (info
.insn
== simd_immediate_info::MOV
)
25448 gcc_assert (info
.u
.mov
.value
== const0_rtx
);
25449 snprintf (buf
, sizeof (buf
), "pfalse\t%%0.b");
25453 gcc_assert (info
.insn
== simd_immediate_info::PTRUE
);
25454 unsigned int total_bytes
;
25455 if (info
.u
.pattern
== AARCH64_SV_ALL
25456 && BYTES_PER_SVE_VECTOR
.is_constant (&total_bytes
))
25457 snprintf (buf
, sizeof (buf
), "ptrue\t%%0.%c, vl%d", element_char
,
25458 total_bytes
/ GET_MODE_SIZE (info
.elt_mode
));
25460 snprintf (buf
, sizeof (buf
), "ptrue\t%%0.%c, %s", element_char
,
25461 svpattern_token (info
.u
.pattern
));
25466 if (info
.insn
== simd_immediate_info::INDEX
)
25468 snprintf (templ
, sizeof (templ
), "index\t%%0.%c, #"
25469 HOST_WIDE_INT_PRINT_DEC
", #" HOST_WIDE_INT_PRINT_DEC
,
25470 element_char
, INTVAL (info
.u
.index
.base
),
25471 INTVAL (info
.u
.index
.step
));
25475 if (GET_MODE_CLASS (info
.elt_mode
) == MODE_FLOAT
)
25477 if (aarch64_float_const_zero_rtx_p (info
.u
.mov
.value
))
25478 info
.u
.mov
.value
= GEN_INT (0);
25481 const int buf_size
= 20;
25482 char float_buf
[buf_size
] = {};
25483 real_to_decimal_for_mode (float_buf
,
25484 CONST_DOUBLE_REAL_VALUE (info
.u
.mov
.value
),
25485 buf_size
, buf_size
, 1, info
.elt_mode
);
25487 snprintf (templ
, sizeof (templ
), "fmov\t%%0.%c, #%s",
25488 element_char
, float_buf
);
25493 snprintf (templ
, sizeof (templ
), "mov\t%%0.%c, #" HOST_WIDE_INT_PRINT_DEC
,
25494 element_char
, INTVAL (info
.u
.mov
.value
));
25498 /* Return the asm template for a PTRUES. CONST_UNSPEC is the
25499 aarch64_sve_ptrue_svpattern_immediate that describes the predicate
25503 aarch64_output_sve_ptrues (rtx const_unspec
)
25505 static char templ
[40];
25507 struct simd_immediate_info info
;
25508 bool is_valid
= aarch64_simd_valid_immediate (const_unspec
, &info
);
25509 gcc_assert (is_valid
&& info
.insn
== simd_immediate_info::PTRUE
);
25511 char element_char
= sizetochar (GET_MODE_BITSIZE (info
.elt_mode
));
25512 snprintf (templ
, sizeof (templ
), "ptrues\t%%0.%c, %s", element_char
,
25513 svpattern_token (info
.u
.pattern
));
/* Split operands into moves from op[1] + op[2] into op[0].  */

void
aarch64_split_combinev16qi (rtx operands[3])
{
  machine_mode halfmode = GET_MODE (operands[1]);

  gcc_assert (halfmode == V16QImode);

  rtx destlo = simplify_gen_subreg (halfmode, operands[0],
				    GET_MODE (operands[0]), 0);
  rtx desthi = simplify_gen_subreg (halfmode, operands[0],
				    GET_MODE (operands[0]),
				    GET_MODE_SIZE (halfmode));

  bool skiplo = rtx_equal_p (destlo, operands[1]);
  bool skiphi = rtx_equal_p (desthi, operands[2]);

  if (skiplo && skiphi)
    {
      /* No-op move.  Can't split to nothing; emit something.  */
      emit_note (NOTE_INSN_DELETED);
      return;
    }

  /* Special case of reversed high/low parts.  */
  if (reg_overlap_mentioned_p (operands[2], destlo)
      && reg_overlap_mentioned_p (operands[1], desthi))
    {
      emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
      emit_insn (gen_xorv16qi3 (operands[2], operands[1], operands[2]));
      emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
    }
  else if (!reg_overlap_mentioned_p (operands[2], destlo))
    {
      /* Try to avoid unnecessary moves if part of the result
	 is in the right place already.  */
      if (!skiplo)
	emit_move_insn (destlo, operands[1]);
      if (!skiphi)
	emit_move_insn (desthi, operands[2]);
    }
  else
    {
      if (!skiphi)
	emit_move_insn (desthi, operands[2]);
      if (!skiplo)
	emit_move_insn (destlo, operands[1]);
    }
}
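/* Illustrative note (not part of the original source): the three XOR
   instructions in the reversed-halves case above implement the classic
   swap-without-a-temporary idiom on V16QI registers:

	a ^= b;  b ^= a;  a ^= b;

   which exchanges the contents of operands[1] and operands[2] without
   needing a scratch register.  */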
/* vec_perm support.  */

struct expand_vec_perm_d
{
  rtx target, op0, op1;
  vec_perm_indices perm;
  machine_mode vmode;
  machine_mode op_mode;
  unsigned int vec_flags;
  unsigned int op_vec_flags;

  bool zero_op0_p, zero_op1_p;
  bool one_vector_p;
  bool testing_p;
};

static bool aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d);
25585 /* Generate a variable permutation. */
25588 aarch64_expand_vec_perm_1 (rtx target
, rtx op0
, rtx op1
, rtx sel
)
25590 machine_mode vmode
= GET_MODE (target
);
25591 bool one_vector_p
= rtx_equal_p (op0
, op1
);
25593 gcc_checking_assert (vmode
== V8QImode
|| vmode
== V16QImode
);
25594 gcc_checking_assert (GET_MODE (op0
) == vmode
);
25595 gcc_checking_assert (GET_MODE (op1
) == vmode
);
25596 gcc_checking_assert (GET_MODE (sel
) == vmode
);
25597 gcc_checking_assert (TARGET_SIMD
);
25601 if (vmode
== V8QImode
)
25603 /* Expand the argument to a V16QI mode by duplicating it. */
25604 rtx pair
= gen_reg_rtx (V16QImode
);
25605 emit_insn (gen_aarch64_combinev8qi (pair
, op0
, op0
));
25606 emit_insn (gen_aarch64_qtbl1v8qi (target
, pair
, sel
));
25610 emit_insn (gen_aarch64_qtbl1v16qi (target
, op0
, sel
));
25617 if (vmode
== V8QImode
)
25619 pair
= gen_reg_rtx (V16QImode
);
25620 emit_insn (gen_aarch64_combinev8qi (pair
, op0
, op1
));
25621 emit_insn (gen_aarch64_qtbl1v8qi (target
, pair
, sel
));
25625 pair
= gen_reg_rtx (V2x16QImode
);
25626 emit_insn (gen_aarch64_combinev16qi (pair
, op0
, op1
));
25627 emit_insn (gen_aarch64_qtbl2v16qi (target
, pair
, sel
));
25632 /* Expand a vec_perm with the operands given by TARGET, OP0, OP1 and SEL.
25633 NELT is the number of elements in the vector. */
25636 aarch64_expand_vec_perm (rtx target
, rtx op0
, rtx op1
, rtx sel
,
25639 machine_mode vmode
= GET_MODE (target
);
25640 bool one_vector_p
= rtx_equal_p (op0
, op1
);
25643 /* The TBL instruction does not use a modulo index, so we must take care
25644 of that ourselves. */
25645 mask
= aarch64_simd_gen_const_vector_dup (vmode
,
25646 one_vector_p
? nelt
- 1 : 2 * nelt
- 1);
25647 sel
= expand_simple_binop (vmode
, AND
, sel
, mask
, NULL
, 0, OPTAB_LIB_WIDEN
);
25649 /* For big-endian, we also need to reverse the index within the vector
25650 (but not which vector). */
25651 if (BYTES_BIG_ENDIAN
)
25653 /* If one_vector_p, mask is a vector of (nelt - 1)'s already. */
25655 mask
= aarch64_simd_gen_const_vector_dup (vmode
, nelt
- 1);
25656 sel
= expand_simple_binop (vmode
, XOR
, sel
, mask
,
25657 NULL
, 0, OPTAB_LIB_WIDEN
);
25659 aarch64_expand_vec_perm_1 (target
, op0
, op1
, sel
);
/* Generate (set TARGET (unspec [OP0 OP1] CODE)).  */

static void
emit_unspec2 (rtx target, int code, rtx op0, rtx op1)
{
  emit_insn (gen_rtx_SET (target,
			  gen_rtx_UNSPEC (GET_MODE (target),
					  gen_rtvec (2, op0, op1), code)));
}
25672 /* Expand an SVE vec_perm with the given operands. */
25675 aarch64_expand_sve_vec_perm (rtx target
, rtx op0
, rtx op1
, rtx sel
)
25677 machine_mode data_mode
= GET_MODE (target
);
25678 machine_mode sel_mode
= GET_MODE (sel
);
25679 /* Enforced by the pattern condition. */
25680 int nunits
= GET_MODE_NUNITS (sel_mode
).to_constant ();
25682 /* Note: vec_perm indices are supposed to wrap when they go beyond the
25683 size of the two value vectors, i.e. the upper bits of the indices
25684 are effectively ignored. SVE TBL instead produces 0 for any
25685 out-of-range indices, so we need to modulo all the vec_perm indices
25686 to ensure they are all in range. */
25687 rtx sel_reg
= force_reg (sel_mode
, sel
);
25689 /* Check if the sel only references the first values vector. */
25690 if (CONST_VECTOR_P (sel
)
25691 && aarch64_const_vec_all_in_range_p (sel
, 0, nunits
- 1))
25693 emit_unspec2 (target
, UNSPEC_TBL
, op0
, sel_reg
);
25697 /* Check if the two values vectors are the same. */
25698 if (rtx_equal_p (op0
, op1
))
25700 rtx max_sel
= aarch64_simd_gen_const_vector_dup (sel_mode
, nunits
- 1);
25701 rtx sel_mod
= expand_simple_binop (sel_mode
, AND
, sel_reg
, max_sel
,
25702 NULL
, 0, OPTAB_DIRECT
);
25703 emit_unspec2 (target
, UNSPEC_TBL
, op0
, sel_mod
);
25707 /* Run TBL on for each value vector and combine the results. */
25709 rtx res0
= gen_reg_rtx (data_mode
);
25710 rtx res1
= gen_reg_rtx (data_mode
);
25711 rtx neg_num_elems
= aarch64_simd_gen_const_vector_dup (sel_mode
, -nunits
);
25712 if (!CONST_VECTOR_P (sel
)
25713 || !aarch64_const_vec_all_in_range_p (sel
, 0, 2 * nunits
- 1))
25715 rtx max_sel
= aarch64_simd_gen_const_vector_dup (sel_mode
,
25717 sel_reg
= expand_simple_binop (sel_mode
, AND
, sel_reg
, max_sel
,
25718 NULL
, 0, OPTAB_DIRECT
);
25720 emit_unspec2 (res0
, UNSPEC_TBL
, op0
, sel_reg
);
25721 rtx sel_sub
= expand_simple_binop (sel_mode
, PLUS
, sel_reg
, neg_num_elems
,
25722 NULL
, 0, OPTAB_DIRECT
);
25723 emit_unspec2 (res1
, UNSPEC_TBL
, op1
, sel_sub
);
25724 if (GET_MODE_CLASS (data_mode
) == MODE_VECTOR_INT
)
25725 emit_insn (gen_rtx_SET (target
, gen_rtx_IOR (data_mode
, res0
, res1
)));
25727 emit_unspec2 (target
, UNSPEC_IORF
, res0
, res1
);
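/* Illustrative note (not part of the original source): the general
   two-input case above computes

	res0 = TBL (op0, sel & (2 * nunits - 1))
	res1 = TBL (op1, sel - nunits)

   and then combines the results with an IOR (or its floating-point
   unspec equivalent).  This works because SVE TBL returns zero for any
   out-of-range index, so each lane of the combined result receives a
   contribution from exactly one of the two value vectors.  */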
25730 /* Recognize patterns suitable for the TRN instructions. */
25732 aarch64_evpc_trn (struct expand_vec_perm_d
*d
)
25735 poly_uint64 nelt
= d
->perm
.length ();
25737 machine_mode vmode
= d
->vmode
;
25739 if (GET_MODE_UNIT_SIZE (vmode
) > 8)
25742 /* Note that these are little-endian tests.
25743 We correct for big-endian later. */
25744 if (!d
->perm
[0].is_constant (&odd
)
25745 || (odd
!= 0 && odd
!= 1)
25746 || !d
->perm
.series_p (0, 2, odd
, 2)
25747 || !d
->perm
.series_p (1, 2, nelt
+ odd
, 2))
25756 /* We don't need a big-endian lane correction for SVE; see the comment
25757 at the head of aarch64-sve.md for details. */
25758 if (BYTES_BIG_ENDIAN
&& d
->vec_flags
== VEC_ADVSIMD
)
25760 std::swap (in0
, in1
);
25765 emit_set_insn (out
, gen_rtx_UNSPEC (vmode
, gen_rtvec (2, in0
, in1
),
25766 odd
? UNSPEC_TRN2
: UNSPEC_TRN1
));
25770 /* Try to re-encode the PERM constant so it combines odd and even elements.
25771 This rewrites constants such as {0, 1, 4, 5}/V4SF to {0, 2}/V2DI.
25772 We retry with this new constant with the full suite of patterns. */
25774 aarch64_evpc_reencode (struct expand_vec_perm_d
*d
)
25776 expand_vec_perm_d newd
;
25778 /* The subregs that we'd create are not supported for big-endian SVE;
25779 see aarch64_modes_compatible_p for details. */
25780 if (BYTES_BIG_ENDIAN
&& (d
->vec_flags
& VEC_ANY_SVE
))
25783 /* Get the new mode. Always twice the size of the inner
25784 and half the elements. */
25785 machine_mode new_mode
;
25786 if (!aarch64_coalesce_units (d
->vmode
, 2).exists (&new_mode
))
25789 vec_perm_indices newpermindices
;
25790 if (!newpermindices
.new_shrunk_vector (d
->perm
, 2))
25793 newd
.vmode
= new_mode
;
25794 newd
.vec_flags
= d
->vec_flags
;
25795 newd
.op_mode
= newd
.vmode
;
25796 newd
.op_vec_flags
= newd
.vec_flags
;
25797 newd
.target
= d
->target
? gen_lowpart (new_mode
, d
->target
) : NULL
;
25798 newd
.op0
= d
->op0
? gen_lowpart (new_mode
, d
->op0
) : NULL
;
25799 newd
.op1
= d
->op1
? gen_lowpart (new_mode
, d
->op1
) : NULL
;
25800 newd
.testing_p
= d
->testing_p
;
25801 newd
.one_vector_p
= d
->one_vector_p
;
25803 newd
.perm
.new_vector (newpermindices
.encoding (), newd
.one_vector_p
? 1 : 2,
25804 newpermindices
.nelts_per_input ());
25805 return aarch64_expand_vec_perm_const_1 (&newd
);
25808 /* Recognize patterns suitable for the UZP instructions. */
25810 aarch64_evpc_uzp (struct expand_vec_perm_d
*d
)
25814 machine_mode vmode
= d
->vmode
;
25816 if (GET_MODE_UNIT_SIZE (vmode
) > 8)
25819 /* Note that these are little-endian tests.
25820 We correct for big-endian later. */
25821 if (!d
->perm
[0].is_constant (&odd
)
25822 || (odd
!= 0 && odd
!= 1)
25823 || !d
->perm
.series_p (0, 1, odd
, 2))
25832 /* We don't need a big-endian lane correction for SVE; see the comment
25833 at the head of aarch64-sve.md for details. */
25834 if (BYTES_BIG_ENDIAN
&& d
->vec_flags
== VEC_ADVSIMD
)
25836 std::swap (in0
, in1
);
25841 emit_set_insn (out
, gen_rtx_UNSPEC (vmode
, gen_rtvec (2, in0
, in1
),
25842 odd
? UNSPEC_UZP2
: UNSPEC_UZP1
));
25846 /* Recognize patterns suitable for the ZIP instructions. */
25848 aarch64_evpc_zip (struct expand_vec_perm_d
*d
)
25851 poly_uint64 nelt
= d
->perm
.length ();
25853 machine_mode vmode
= d
->vmode
;
25855 if (GET_MODE_UNIT_SIZE (vmode
) > 8)
25858 /* Note that these are little-endian tests.
25859 We correct for big-endian later. */
25860 poly_uint64 first
= d
->perm
[0];
25861 if ((maybe_ne (first
, 0U) && maybe_ne (first
* 2, nelt
))
25862 || !d
->perm
.series_p (0, 2, first
, 1)
25863 || !d
->perm
.series_p (1, 2, first
+ nelt
, 1))
25865 high
= maybe_ne (first
, 0U);
25873 /* We don't need a big-endian lane correction for SVE; see the comment
25874 at the head of aarch64-sve.md for details. */
25875 if (BYTES_BIG_ENDIAN
&& d
->vec_flags
== VEC_ADVSIMD
)
25877 std::swap (in0
, in1
);
25882 emit_set_insn (out
, gen_rtx_UNSPEC (vmode
, gen_rtvec (2, in0
, in1
),
25883 high
? UNSPEC_ZIP2
: UNSPEC_ZIP1
));
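/* Illustrative note (not part of the original source): for a V4SI
   permute the index patterns recognised by the TRN, UZP and ZIP
   routines above are, in little-endian element numbering:

	ZIP1 {0, 4, 1, 5}   ZIP2 {2, 6, 3, 7}
	UZP1 {0, 2, 4, 6}   UZP2 {1, 3, 5, 7}
	TRN1 {0, 4, 2, 6}   TRN2 {1, 5, 3, 7}

   where indices 0-3 select from the first operand and 4-7 from the
   second.  */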
25887 /* Recognize patterns for the EXT insn. */
25890 aarch64_evpc_ext (struct expand_vec_perm_d
*d
)
25892 HOST_WIDE_INT location
;
25895 /* The first element always refers to the first vector.
25896 Check if the extracted indices are increasing by one. */
25897 if ((d
->vec_flags
& VEC_SVE_PRED
)
25898 || !d
->perm
[0].is_constant (&location
)
25899 || !d
->perm
.series_p (0, 1, location
, 1))
25906 /* The case where (location == 0) is a no-op for both big- and little-endian,
25907 and is removed by the mid-end at optimization levels -O1 and higher.
25909 We don't need a big-endian lane correction for SVE; see the comment
25910 at the head of aarch64-sve.md for details. */
25911 if (BYTES_BIG_ENDIAN
&& location
!= 0 && d
->vec_flags
== VEC_ADVSIMD
)
25913 /* After setup, we want the high elements of the first vector (stored
25914 at the LSB end of the register), and the low elements of the second
25915 vector (stored at the MSB end of the register). So swap. */
25916 std::swap (d
->op0
, d
->op1
);
25917 /* location != 0 (above), so safe to assume (nelt - location) < nelt.
25918 to_constant () is safe since this is restricted to Advanced SIMD
25920 location
= d
->perm
.length ().to_constant () - location
;
25923 offset
= GEN_INT (location
);
25924 emit_set_insn (d
->target
,
25925 gen_rtx_UNSPEC (d
->vmode
,
25926 gen_rtvec (3, d
->op0
, d
->op1
, offset
),
25931 /* Recognize patterns for the REV{64,32,16} insns, which reverse elements
25932 within each 64-bit, 32-bit or 16-bit granule. */
25935 aarch64_evpc_rev_local (struct expand_vec_perm_d
*d
)
25937 HOST_WIDE_INT diff
;
25938 unsigned int i
, size
, unspec
;
25939 machine_mode pred_mode
;
25941 if ((d
->vec_flags
& VEC_SVE_PRED
)
25942 || !d
->one_vector_p
25943 || !d
->perm
[0].is_constant (&diff
)
25947 if (d
->vec_flags
& VEC_SVE_DATA
)
25948 size
= (diff
+ 1) * aarch64_sve_container_bits (d
->vmode
);
25950 size
= (diff
+ 1) * GET_MODE_UNIT_BITSIZE (d
->vmode
);
25953 unspec
= UNSPEC_REV64
;
25954 pred_mode
= VNx2BImode
;
25956 else if (size
== 32)
25958 unspec
= UNSPEC_REV32
;
25959 pred_mode
= VNx4BImode
;
25961 else if (size
== 16)
25963 unspec
= UNSPEC_REV16
;
25964 pred_mode
= VNx8BImode
;
25969 unsigned int step
= diff
+ 1;
25970 for (i
= 0; i
< step
; ++i
)
25971 if (!d
->perm
.series_p (i
, step
, diff
- i
, step
))
25978 if (d
->vec_flags
& VEC_SVE_DATA
)
25980 rtx pred
= aarch64_ptrue_reg (pred_mode
);
25981 emit_insn (gen_aarch64_sve_revbhw (d
->vmode
, pred_mode
,
25982 d
->target
, pred
, d
->op0
));
25985 rtx src
= gen_rtx_UNSPEC (d
->vmode
, gen_rtvec (1, d
->op0
), unspec
);
25986 emit_set_insn (d
->target
, src
);
25990 /* Recognize patterns for the REV insn, which reverses elements within
25994 aarch64_evpc_rev_global (struct expand_vec_perm_d
*d
)
25996 poly_uint64 nelt
= d
->perm
.length ();
25998 if (!d
->one_vector_p
|| d
->vec_flags
== VEC_ADVSIMD
)
26001 if (!d
->perm
.series_p (0, 1, nelt
- 1, -1))
26008 rtx src
= gen_rtx_UNSPEC (d
->vmode
, gen_rtvec (1, d
->op0
), UNSPEC_REV
);
26009 emit_set_insn (d
->target
, src
);
26014 aarch64_evpc_dup (struct expand_vec_perm_d
*d
)
26016 rtx out
= d
->target
;
26019 machine_mode vmode
= d
->vmode
;
26022 if ((d
->vec_flags
& VEC_SVE_PRED
)
26023 || d
->perm
.encoding ().encoded_nelts () != 1
26024 || !d
->perm
[0].is_constant (&elt
))
26027 if ((d
->vec_flags
& VEC_SVE_DATA
)
26028 && elt
* (aarch64_sve_container_bits (vmode
) / 8) >= 64)
26035 /* The generic preparation in aarch64_expand_vec_perm_const_1
26036 swaps the operand order and the permute indices if it finds
26037 d->perm[0] to be in the second operand. Thus, we can always
26038 use d->op0 and need not do any extra arithmetic to get the
26039 correct lane number. */
26041 lane
= GEN_INT (elt
); /* The pattern corrects for big-endian. */
26043 rtx parallel
= gen_rtx_PARALLEL (vmode
, gen_rtvec (1, lane
));
26044 rtx select
= gen_rtx_VEC_SELECT (GET_MODE_INNER (vmode
), in0
, parallel
);
26045 emit_set_insn (out
, gen_rtx_VEC_DUPLICATE (vmode
, select
));
26050 aarch64_evpc_tbl (struct expand_vec_perm_d
*d
)
26052 rtx rperm
[MAX_COMPILE_TIME_VEC_BYTES
], sel
;
26053 machine_mode vmode
= d
->vmode
;
26055 /* Make sure that the indices are constant. */
26056 unsigned int encoded_nelts
= d
->perm
.encoding ().encoded_nelts ();
26057 for (unsigned int i
= 0; i
< encoded_nelts
; ++i
)
26058 if (!d
->perm
[i
].is_constant ())
26064 /* Generic code will try constant permutation twice. Once with the
26065 original mode and again with the elements lowered to QImode.
26066 So wait and don't do the selector expansion ourselves. */
26067 if (vmode
!= V8QImode
&& vmode
!= V16QImode
)
26070 /* to_constant is safe since this routine is specific to Advanced SIMD
26072 unsigned int nelt
= d
->perm
.length ().to_constant ();
26074 /* If one register is the constant vector of 0 then we only need
26075 a one reg TBL and we map any accesses to the vector of 0 to -1. We can't
26076 do this earlier since vec_perm_indices clamps elements to within range so
26077 we can only do it during codegen. */
26080 else if (d
->zero_op1_p
)
26083 for (unsigned int i
= 0; i
< nelt
; ++i
)
26085 auto val
= d
->perm
[i
].to_constant ();
26087 /* If we're selecting from a 0 vector, we can just use an out of range
26089 if ((d
->zero_op0_p
&& val
< nelt
) || (d
->zero_op1_p
&& val
>= nelt
))
26090 rperm
[i
] = constm1_rtx
;
26093 /* If we are remapping a zero register as the first parameter we need
26094 to adjust the indices of the non-zero register. */
26098 /* If big-endian and two vectors we end up with a weird mixed-endian
26099 mode on NEON. Reverse the index within each word but not the word
26100 itself. to_constant is safe because we checked is_constant
26102 rperm
[i
] = GEN_INT (BYTES_BIG_ENDIAN
? val
^ (nelt
- 1) : val
);
26106 sel
= gen_rtx_CONST_VECTOR (vmode
, gen_rtvec_v (nelt
, rperm
));
26107 sel
= force_reg (vmode
, sel
);
26109 aarch64_expand_vec_perm_1 (d
->target
, d
->op0
, d
->op1
, sel
);
26113 /* Try to implement D using an SVE TBL instruction. */
26116 aarch64_evpc_sve_tbl (struct expand_vec_perm_d
*d
)
26118 unsigned HOST_WIDE_INT nelt
;
26120 /* Permuting two variable-length vectors could overflow the
26122 if (!d
->one_vector_p
&& !d
->perm
.length ().is_constant (&nelt
))
26128 machine_mode sel_mode
= related_int_vector_mode (d
->vmode
).require ();
26129 rtx sel
= vec_perm_indices_to_rtx (sel_mode
, d
->perm
);
26130 if (d
->one_vector_p
)
26131 emit_unspec2 (d
->target
, UNSPEC_TBL
, d
->op0
, force_reg (sel_mode
, sel
));
26133 aarch64_expand_sve_vec_perm (d
->target
, d
->op0
, d
->op1
, sel
);
26137 /* Try to implement D using SVE dup instruction. */
26140 aarch64_evpc_sve_dup (struct expand_vec_perm_d
*d
)
26142 if (BYTES_BIG_ENDIAN
26143 || !d
->one_vector_p
26144 || d
->vec_flags
!= VEC_SVE_DATA
26145 || d
->op_vec_flags
!= VEC_ADVSIMD
26146 || d
->perm
.encoding ().nelts_per_pattern () != 1
26147 || !known_eq (d
->perm
.encoding ().npatterns (),
26148 GET_MODE_NUNITS (d
->op_mode
))
26149 || !known_eq (GET_MODE_BITSIZE (d
->op_mode
), 128))
26152 int npatterns
= d
->perm
.encoding ().npatterns ();
26153 for (int i
= 0; i
< npatterns
; i
++)
26154 if (!known_eq (d
->perm
[i
], i
))
26160 aarch64_expand_sve_dupq (d
->target
, GET_MODE (d
->target
), d
->op0
);
26164 /* Try to implement D using SVE SEL instruction. */
26167 aarch64_evpc_sel (struct expand_vec_perm_d
*d
)
26169 machine_mode vmode
= d
->vmode
;
26170 int unit_size
= GET_MODE_UNIT_SIZE (vmode
);
26172 if (d
->vec_flags
!= VEC_SVE_DATA
26176 int n_patterns
= d
->perm
.encoding ().npatterns ();
26177 poly_int64 vec_len
= d
->perm
.length ();
26179 for (int i
= 0; i
< n_patterns
; ++i
)
26180 if (!known_eq (d
->perm
[i
], i
)
26181 && !known_eq (d
->perm
[i
], vec_len
+ i
))
26184 for (int i
= n_patterns
; i
< n_patterns
* 2; i
++)
26185 if (!d
->perm
.series_p (i
, n_patterns
, i
, n_patterns
)
26186 && !d
->perm
.series_p (i
, n_patterns
, vec_len
+ i
, n_patterns
))
26192 machine_mode pred_mode
= aarch64_sve_pred_mode (vmode
);
26194 /* Build a predicate that is true when op0 elements should be used. */
26195 rtx_vector_builder
builder (pred_mode
, n_patterns
, 2);
26196 for (int i
= 0; i
< n_patterns
* 2; i
++)
26198 rtx elem
= known_eq (d
->perm
[i
], i
) ? CONST1_RTX (BImode
)
26199 : CONST0_RTX (BImode
);
26200 builder
.quick_push (elem
);
26203 rtx const_vec
= builder
.build ();
26204 rtx pred
= force_reg (pred_mode
, const_vec
);
26205 /* TARGET = PRED ? OP0 : OP1. */
26206 emit_insn (gen_vcond_mask (vmode
, vmode
, d
->target
, d
->op0
, d
->op1
, pred
));
26210 /* Recognize patterns suitable for the INS instructions. */
26212 aarch64_evpc_ins (struct expand_vec_perm_d
*d
)
26214 machine_mode mode
= d
->vmode
;
26215 unsigned HOST_WIDE_INT nelt
;
26217 if (d
->vec_flags
!= VEC_ADVSIMD
)
26220 /* to_constant is safe since this routine is specific to Advanced SIMD
26222 nelt
= d
->perm
.length ().to_constant ();
26225 HOST_WIDE_INT idx
= -1;
26227 for (unsigned HOST_WIDE_INT i
= 0; i
< nelt
; i
++)
26230 if (!d
->perm
[i
].is_constant (&elt
))
26232 if (elt
== (HOST_WIDE_INT
) i
)
26245 for (unsigned HOST_WIDE_INT i
= 0; i
< nelt
; i
++)
26247 if (d
->perm
[i
].to_constant () == (HOST_WIDE_INT
) (i
+ nelt
))
26261 gcc_assert (idx
!= -1);
26263 unsigned extractindex
= d
->perm
[idx
].to_constant ();
26264 rtx extractv
= d
->op0
;
26265 if (extractindex
>= nelt
)
26268 extractindex
-= nelt
;
26270 gcc_assert (extractindex
< nelt
);
26272 insn_code icode
= code_for_aarch64_simd_vec_copy_lane (mode
);
26273 expand_operand ops
[5];
26274 create_output_operand (&ops
[0], d
->target
, mode
);
26275 create_input_operand (&ops
[1], insv
, mode
);
26276 create_integer_operand (&ops
[2], 1 << idx
);
26277 create_input_operand (&ops
[3], extractv
, mode
);
26278 create_integer_operand (&ops
[4], extractindex
);
26279 expand_insn (icode
, 5, ops
);
26285 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d
*d
)
26287 gcc_assert (d
->op_mode
!= E_VOIDmode
);
26289 /* The pattern matching functions above are written to look for a small
26290 number to begin the sequence (0, 1, N/2). If we begin with an index
26291 from the second operand, we can swap the operands. */
26292 poly_int64 nelt
= d
->perm
.length ();
26293 if (known_ge (d
->perm
[0], nelt
))
26295 d
->perm
.rotate_inputs (1);
26296 std::swap (d
->op0
, d
->op1
);
26299 if (((d
->vec_flags
== VEC_ADVSIMD
&& TARGET_SIMD
)
26300 || d
->vec_flags
== VEC_SVE_DATA
26301 || d
->vec_flags
== (VEC_SVE_DATA
| VEC_PARTIAL
)
26302 || d
->vec_flags
== VEC_SVE_PRED
)
26303 && known_gt (nelt
, 1))
26305 if (d
->vmode
== d
->op_mode
)
26307 if (aarch64_evpc_rev_local (d
))
26309 else if (aarch64_evpc_rev_global (d
))
26311 else if (aarch64_evpc_ext (d
))
26313 else if (aarch64_evpc_dup (d
))
26315 else if (aarch64_evpc_zip (d
))
26317 else if (aarch64_evpc_uzp (d
))
26319 else if (aarch64_evpc_trn (d
))
26321 else if (aarch64_evpc_sel (d
))
26323 else if (aarch64_evpc_ins (d
))
26325 else if (aarch64_evpc_reencode (d
))
26328 if (d
->vec_flags
== VEC_SVE_DATA
)
26329 return aarch64_evpc_sve_tbl (d
);
26330 else if (d
->vec_flags
== VEC_ADVSIMD
)
26331 return aarch64_evpc_tbl (d
);
26335 if (aarch64_evpc_sve_dup (d
))
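/* Illustrative note (not part of the original source): as a concrete
   example of the dispatch order above, the V4SI permute {1, 0, 3, 2}
   is caught early by aarch64_evpc_rev_local (a REV64), {0, 4, 1, 5}
   falls through to aarch64_evpc_zip (a ZIP1), and an arbitrary
   constant selector such as {3, 1, 4, 6} ends up in the TBL fallback,
   after the target-independent code retries with the indices lowered
   to byte granularity.  */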
26342 /* Implement TARGET_VECTORIZE_VEC_PERM_CONST. */
26345 aarch64_vectorize_vec_perm_const (machine_mode vmode
, machine_mode op_mode
,
26346 rtx target
, rtx op0
, rtx op1
,
26347 const vec_perm_indices
&sel
)
26349 struct expand_vec_perm_d d
;
26351 /* Check whether the mask can be applied to a single vector. */
26352 if (sel
.ninputs () == 1
26353 || (op0
&& rtx_equal_p (op0
, op1
)))
26354 d
.one_vector_p
= true;
26355 else if (sel
.all_from_input_p (0))
26357 d
.one_vector_p
= true;
26360 else if (sel
.all_from_input_p (1))
26362 d
.one_vector_p
= true;
26366 d
.one_vector_p
= false;
26368 d
.zero_op0_p
= op0
== CONST0_RTX (op_mode
);
26369 d
.zero_op1_p
= op1
== CONST0_RTX (op_mode
);
26370 d
.perm
.new_vector (sel
.encoding (), d
.one_vector_p
? 1 : 2,
26371 sel
.nelts_per_input ());
26373 d
.vec_flags
= aarch64_classify_vector_mode (d
.vmode
);
26374 d
.op_mode
= op_mode
;
26375 d
.op_vec_flags
= aarch64_classify_vector_mode (d
.op_mode
);
26377 d
.op0
= op0
? force_reg (op_mode
, op0
) : NULL_RTX
;
26381 d
.op1
= op1
? force_reg (op_mode
, op1
) : NULL_RTX
;
26382 d
.testing_p
= !target
;
26385 return aarch64_expand_vec_perm_const_1 (&d
);
26387 rtx_insn
*last
= get_last_insn ();
26388 bool ret
= aarch64_expand_vec_perm_const_1 (&d
);
26389 gcc_assert (last
== get_last_insn ());
26393 /* Generate a byte permute mask for a register of mode MODE,
26394 which has NUNITS units. */
26397 aarch64_reverse_mask (machine_mode mode
, unsigned int nunits
)
26399 /* We have to reverse each vector because we dont have
26400 a permuted load that can reverse-load according to ABI rules. */
26402 rtvec v
= rtvec_alloc (16);
26404 unsigned int usize
= GET_MODE_UNIT_SIZE (mode
);
26406 gcc_assert (BYTES_BIG_ENDIAN
);
26407 gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode
));
26409 for (i
= 0; i
< nunits
; i
++)
26410 for (j
= 0; j
< usize
; j
++)
26411 RTVEC_ELT (v
, i
* usize
+ j
) = GEN_INT ((i
+ 1) * usize
- 1 - j
);
26412 mask
= gen_rtx_CONST_VECTOR (V16QImode
, v
);
26413 return force_reg (V16QImode
, mask
);
/* Expand an SVE integer comparison using the SVE equivalent of:

     (set TARGET (CODE OP0 OP1)).  */

void
aarch64_expand_sve_vec_cmp_int (rtx target, rtx_code code, rtx op0, rtx op1)
{
  machine_mode pred_mode = GET_MODE (target);
  machine_mode data_mode = GET_MODE (op0);
  rtx res = aarch64_sve_emit_int_cmp (target, pred_mode, code, data_mode,
				      op0, op1);
  if (!rtx_equal_p (target, res))
    emit_move_insn (target, res);
}

/* Return the UNSPEC_COND_* code for comparison CODE.  */

static unsigned int
aarch64_unspec_cond_code (rtx_code code)
{
  switch (code)
    {
    case NE:
      return UNSPEC_COND_FCMNE;
    case EQ:
      return UNSPEC_COND_FCMEQ;
    case LT:
      return UNSPEC_COND_FCMLT;
    case GT:
      return UNSPEC_COND_FCMGT;
    case LE:
      return UNSPEC_COND_FCMLE;
    case GE:
      return UNSPEC_COND_FCMGE;
    case UNORDERED:
      return UNSPEC_COND_FCMUO;
    default:
      gcc_unreachable ();
    }
}

/* Emit the SVE equivalent of:

     (set TARGET (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X>))

   where <X> is the operation associated with comparison CODE.
   KNOWN_PTRUE_P is true if PRED is known to be a PTRUE.  */

static void
aarch64_emit_sve_fp_cond (rtx target, rtx_code code, rtx pred,
			  bool known_ptrue_p, rtx op0, rtx op1)
{
  rtx flag = gen_int_mode (known_ptrue_p, SImode);
  rtx unspec = gen_rtx_UNSPEC (GET_MODE (pred),
			       gen_rtvec (4, pred, flag, op0, op1),
			       aarch64_unspec_cond_code (code));
  emit_set_insn (target, unspec);
}
26475 /* Emit the SVE equivalent of:
26477 (set TMP1 (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X1>))
26478 (set TMP2 (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X2>))
26479 (set TARGET (ior:PRED_MODE TMP1 TMP2))
26481 where <Xi> is the operation associated with comparison CODEi.
26482 KNOWN_PTRUE_P is true if PRED is known to be a PTRUE. */
26485 aarch64_emit_sve_or_fp_conds (rtx target
, rtx_code code1
, rtx_code code2
,
26486 rtx pred
, bool known_ptrue_p
, rtx op0
, rtx op1
)
26488 machine_mode pred_mode
= GET_MODE (pred
);
26489 rtx tmp1
= gen_reg_rtx (pred_mode
);
26490 aarch64_emit_sve_fp_cond (tmp1
, code1
, pred
, known_ptrue_p
, op0
, op1
);
26491 rtx tmp2
= gen_reg_rtx (pred_mode
);
26492 aarch64_emit_sve_fp_cond (tmp2
, code2
, pred
, known_ptrue_p
, op0
, op1
);
26493 aarch64_emit_binop (target
, ior_optab
, tmp1
, tmp2
);
26496 /* Emit the SVE equivalent of:
26498 (set TMP (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X>))
26499 (set TARGET (not TMP))
26501 where <X> is the operation associated with comparison CODE.
26502 KNOWN_PTRUE_P is true if PRED is known to be a PTRUE. */
26505 aarch64_emit_sve_invert_fp_cond (rtx target
, rtx_code code
, rtx pred
,
26506 bool known_ptrue_p
, rtx op0
, rtx op1
)
26508 machine_mode pred_mode
= GET_MODE (pred
);
26509 rtx tmp
= gen_reg_rtx (pred_mode
);
26510 aarch64_emit_sve_fp_cond (tmp
, code
, pred
, known_ptrue_p
, op0
, op1
);
26511 aarch64_emit_unop (target
, one_cmpl_optab
, tmp
);
26514 /* Expand an SVE floating-point comparison using the SVE equivalent of:
26516 (set TARGET (CODE OP0 OP1))
26518 If CAN_INVERT_P is true, the caller can also handle inverted results;
26519 return true if the result is in fact inverted. */
26522 aarch64_expand_sve_vec_cmp_float (rtx target
, rtx_code code
,
26523 rtx op0
, rtx op1
, bool can_invert_p
)
26525 machine_mode pred_mode
= GET_MODE (target
);
26526 machine_mode data_mode
= GET_MODE (op0
);
26528 rtx ptrue
= aarch64_ptrue_reg (pred_mode
);
26532 /* UNORDERED has no immediate form. */
26533 op1
= force_reg (data_mode
, op1
);
26542 /* There is native support for the comparison. */
26543 aarch64_emit_sve_fp_cond (target
, code
, ptrue
, true, op0
, op1
);
26548 /* This is a trapping operation (LT or GT). */
26549 aarch64_emit_sve_or_fp_conds (target
, LT
, GT
, ptrue
, true, op0
, op1
);
26553 if (!flag_trapping_math
)
26555 /* This would trap for signaling NaNs. */
26556 op1
= force_reg (data_mode
, op1
);
26557 aarch64_emit_sve_or_fp_conds (target
, UNORDERED
, EQ
,
26558 ptrue
, true, op0
, op1
);
26566 if (flag_trapping_math
)
26568 /* Work out which elements are ordered. */
26569 rtx ordered
= gen_reg_rtx (pred_mode
);
26570 op1
= force_reg (data_mode
, op1
);
26571 aarch64_emit_sve_invert_fp_cond (ordered
, UNORDERED
,
26572 ptrue
, true, op0
, op1
);
26574 /* Test the opposite condition for the ordered elements,
26575 then invert the result. */
26579 code
= reverse_condition_maybe_unordered (code
);
26582 aarch64_emit_sve_fp_cond (target
, code
,
26583 ordered
, false, op0
, op1
);
26586 aarch64_emit_sve_invert_fp_cond (target
, code
,
26587 ordered
, false, op0
, op1
);
26593 /* ORDERED has no immediate form. */
26594 op1
= force_reg (data_mode
, op1
);
26598 gcc_unreachable ();
26601 /* There is native support for the inverse comparison. */
26602 code
= reverse_condition_maybe_unordered (code
);
26605 aarch64_emit_sve_fp_cond (target
, code
, ptrue
, true, op0
, op1
);
26608 aarch64_emit_sve_invert_fp_cond (target
, code
, ptrue
, true, op0
, op1
);
26612 /* Expand an SVE vcond pattern with operands OPS. DATA_MODE is the mode
26613 of the data being selected and CMP_MODE is the mode of the values being
26617 aarch64_expand_sve_vcond (machine_mode data_mode
, machine_mode cmp_mode
,
26620 machine_mode pred_mode
= aarch64_get_mask_mode (cmp_mode
).require ();
26621 rtx pred
= gen_reg_rtx (pred_mode
);
26622 if (FLOAT_MODE_P (cmp_mode
))
26624 if (aarch64_expand_sve_vec_cmp_float (pred
, GET_CODE (ops
[3]),
26625 ops
[4], ops
[5], true))
26626 std::swap (ops
[1], ops
[2]);
26629 aarch64_expand_sve_vec_cmp_int (pred
, GET_CODE (ops
[3]), ops
[4], ops
[5]);
26631 if (!aarch64_sve_reg_or_dup_imm (ops
[1], data_mode
))
26632 ops
[1] = force_reg (data_mode
, ops
[1]);
26633 /* The "false" value can only be zero if the "true" value is a constant. */
26634 if (register_operand (ops
[1], data_mode
)
26635 || !aarch64_simd_reg_or_zero (ops
[2], data_mode
))
26636 ops
[2] = force_reg (data_mode
, ops
[2]);
26638 rtvec vec
= gen_rtvec (3, pred
, ops
[1], ops
[2]);
26639 emit_set_insn (ops
[0], gen_rtx_UNSPEC (data_mode
, vec
, UNSPEC_SEL
));
26644 (a) MODE1 and MODE2 use the same layout for bytes that are common
26647 (b) subregs involving the two modes behave as the target-independent
26648 subreg rules require; and
26650 (c) there is at least one register that can hold both modes.
26652 Return false otherwise. */
26655 aarch64_modes_compatible_p (machine_mode mode1
, machine_mode mode2
)
26657 unsigned int flags1
= aarch64_classify_vector_mode (mode1
);
26658 unsigned int flags2
= aarch64_classify_vector_mode (mode2
);
26660 bool sve1_p
= (flags1
& VEC_ANY_SVE
);
26661 bool sve2_p
= (flags2
& VEC_ANY_SVE
);
26663 bool partial_sve1_p
= sve1_p
&& (flags1
& VEC_PARTIAL
);
26664 bool partial_sve2_p
= sve2_p
&& (flags2
& VEC_PARTIAL
);
26666 bool pred1_p
= (flags1
& VEC_SVE_PRED
);
26667 bool pred2_p
= (flags2
& VEC_SVE_PRED
);
26669 bool partial_advsimd_struct1_p
= (flags1
== (VEC_ADVSIMD
| VEC_STRUCT
26671 bool partial_advsimd_struct2_p
= (flags2
== (VEC_ADVSIMD
| VEC_STRUCT
26674 /* Don't allow changes between predicate modes and other modes.
26675 Only predicate registers can hold predicate modes and only
26676 non-predicate registers can hold non-predicate modes, so any
26677 attempt to mix them would require a round trip through memory. */
26678 if (pred1_p
!= pred2_p
)
26681 /* The contents of partial SVE modes are distributed evenly across
26682 the register, whereas GCC expects them to be clustered together.
26683 We therefore need to be careful about mode changes involving them. */
26684 if (partial_sve1_p
&& partial_sve2_p
)
26686 /* Reject changes between partial SVE modes that have different
26687 patterns of significant and insignificant bits. */
26688 if ((aarch64_sve_container_bits (mode1
)
26689 != aarch64_sve_container_bits (mode2
))
26690 || GET_MODE_UNIT_SIZE (mode1
) != GET_MODE_UNIT_SIZE (mode2
))
26693 else if (partial_sve1_p
)
26695 /* The first lane of MODE1 is where GCC expects it, but anything
26696 bigger than that is not. */
26697 if (maybe_gt (GET_MODE_SIZE (mode2
), GET_MODE_UNIT_SIZE (mode1
)))
26700 else if (partial_sve2_p
)
26702 /* Similarly in reverse. */
26703 if (maybe_gt (GET_MODE_SIZE (mode1
), GET_MODE_UNIT_SIZE (mode2
)))
26707 /* Don't allow changes between partial Advanced SIMD structure modes
26708 and other modes that are bigger than 8 bytes. E.g. V16QI and V2x8QI
26709 are the same size, but the former occupies one Q register while the
26710 latter occupies two D registers. */
26711 if (partial_advsimd_struct1_p
!= partial_advsimd_struct2_p
26712 && maybe_gt (GET_MODE_SIZE (mode1
), 8)
26713 && maybe_gt (GET_MODE_SIZE (mode2
), 8))
26716 if (maybe_ne (BITS_PER_SVE_VECTOR
, 128u))
26718 /* Don't allow changes between SVE modes and other modes that might
26719 be bigger than 128 bits. In particular, OImode, CImode and XImode
26720 divide into 128-bit quantities while SVE modes divide into
26721 BITS_PER_SVE_VECTOR quantities. */
26722 if (sve1_p
&& !sve2_p
&& maybe_gt (GET_MODE_BITSIZE (mode2
), 128))
26724 if (sve2_p
&& !sve1_p
&& maybe_gt (GET_MODE_BITSIZE (mode1
), 128))
26728 if (BYTES_BIG_ENDIAN
)
26730 /* Don't allow changes between SVE data modes and non-SVE modes.
26731 See the comment at the head of aarch64-sve.md for details. */
26732 if (sve1_p
!= sve2_p
)
26735 /* Don't allow changes in element size: lane 0 of the new vector
26736 would not then be lane 0 of the old vector. See the comment
26737 above aarch64_maybe_expand_sve_subreg_move for a more detailed
26740 In the worst case, this forces a register to be spilled in
26741 one mode and reloaded in the other, which handles the
26742 endianness correctly. */
26743 if (sve1_p
&& GET_MODE_UNIT_SIZE (mode1
) != GET_MODE_UNIT_SIZE (mode2
))
/* Implement TARGET_MODES_TIEABLE_P.  In principle we should always defer
   to aarch64_modes_compatible_p.  However due to issues with register
   allocation it is preferable to avoid tieing integer scalar and FP
   scalar modes.  Executing integer operations in general registers is
   better than treating them as scalar vector operations.  This reduces
   latency and avoids redundant int<->FP moves.  So tie modes if they
   are either the same class, or one of them is a vector mode.  */

static bool
aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2)
{
  if (aarch64_modes_compatible_p (mode1, mode2))
    {
      if (GET_MODE_CLASS (mode1) == GET_MODE_CLASS (mode2))
	return true;
      if (VECTOR_MODE_P (mode1) || VECTOR_MODE_P (mode2))
	return true;
    }

  return false;
}

/* Return a new RTX holding the result of moving POINTER forward by
   AMOUNT bytes.  */

static rtx
aarch64_move_pointer (rtx pointer, poly_int64 amount)
{
  rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);

  return adjust_automodify_address (pointer, GET_MODE (pointer),
				    next, amount);
}
/* Expand a cpymem/movmem using the MOPS extension.  OPERANDS are taken
   from the cpymem/movmem pattern.  IS_MEMMOVE is true if this is a memmove
   rather than memcpy.  Return true iff we succeeded.  */
bool
aarch64_expand_cpymem_mops (rtx *operands, bool is_memmove)
{
  if (!TARGET_MOPS)
    return false;

  /* All three registers are changed by the instruction, so each one
     must be a fresh pseudo.  */
  rtx dst_addr = copy_to_mode_reg (Pmode, XEXP (operands[0], 0));
  rtx src_addr = copy_to_mode_reg (Pmode, XEXP (operands[1], 0));
  rtx dst_mem = replace_equiv_address (operands[0], dst_addr);
  rtx src_mem = replace_equiv_address (operands[1], src_addr);
  rtx sz_reg = copy_to_mode_reg (DImode, operands[2]);
  if (is_memmove)
    emit_insn (gen_aarch64_movmemdi (dst_mem, src_mem, sz_reg));
  else
    emit_insn (gen_aarch64_cpymemdi (dst_mem, src_mem, sz_reg));

  return true;
}
26805 /* Expand cpymem/movmem, as if from a __builtin_memcpy/memmove.
26806 OPERANDS are taken from the cpymem/movmem pattern. IS_MEMMOVE is true
26807 if this is a memmove rather than memcpy. Return true if we succeed,
26808 otherwise return false, indicating that a libcall should be emitted. */
26810 aarch64_expand_cpymem (rtx
*operands
, bool is_memmove
)
26813 rtx dst
= operands
[0];
26814 rtx src
= operands
[1];
26815 unsigned align
= UINTVAL (operands
[3]);
26817 machine_mode mode
= BLKmode
, next_mode
;
26819 /* Variable-sized or strict-align copies may use the MOPS expansion. */
26820 if (!CONST_INT_P (operands
[2]) || (STRICT_ALIGNMENT
&& align
< 16))
26821 return aarch64_expand_cpymem_mops (operands
, is_memmove
);
26823 unsigned HOST_WIDE_INT size
= UINTVAL (operands
[2]);
26825 /* Set inline limits for memmove/memcpy. MOPS has a separate threshold. */
26826 unsigned max_copy_size
= TARGET_SIMD
? 256 : 128;
26827 unsigned mops_threshold
= is_memmove
? aarch64_mops_memmove_size_threshold
26828 : aarch64_mops_memcpy_size_threshold
;
26830 /* Reduce the maximum size with -Os. */
26831 if (optimize_function_for_size_p (cfun
))
26832 max_copy_size
/= 4;
26834 /* Large copies use MOPS when available or a library call. */
26835 if (size
> max_copy_size
|| (TARGET_MOPS
&& size
> mops_threshold
))
26836 return aarch64_expand_cpymem_mops (operands
, is_memmove
);
26838 /* Default to 32-byte LDP/STP on large copies, however small copies or
26839 no SIMD support fall back to 16-byte chunks.
26840 ??? Although it would be possible to use LDP/STP Qn in streaming mode
26841 (so using TARGET_BASE_SIMD instead of TARGET_SIMD), it isn't clear
26842 whether that would improve performance. */
26843 bool use_qregs
= size
> 24 && TARGET_SIMD
;
26845 base
= copy_to_mode_reg (Pmode
, XEXP (dst
, 0));
26846 dst
= adjust_automodify_address (dst
, VOIDmode
, base
, 0);
26848 base
= copy_to_mode_reg (Pmode
, XEXP (src
, 0));
26849 src
= adjust_automodify_address (src
, VOIDmode
, base
, 0);
26851 auto_vec
<std::pair
<rtx
, rtx
>, 16> ops
;
26856 /* Find the largest mode in which to do the copy in without over reading
26858 opt_scalar_int_mode mode_iter
;
26859 FOR_EACH_MODE_IN_CLASS (mode_iter
, MODE_INT
)
26860 if (GET_MODE_SIZE (mode_iter
.require ()) <= MIN (size
, 16))
26861 mode
= mode_iter
.require ();
26863 gcc_assert (mode
!= BLKmode
);
26865 mode_bytes
= GET_MODE_SIZE (mode
).to_constant ();
26867 /* Prefer Q-register accesses. */
26868 if (mode_bytes
== 16 && use_qregs
)
26871 rtx reg
= gen_reg_rtx (mode
);
26872 rtx load
= gen_move_insn (reg
, adjust_address (src
, mode
, offset
));
26873 rtx store
= gen_move_insn (adjust_address (dst
, mode
, offset
), reg
);
26874 ops
.safe_push ({ load
, store
});
26875 size
-= mode_bytes
;
26876 offset
+= mode_bytes
;
26878 /* Emit trailing copies using overlapping unaligned accesses
26879 (when !STRICT_ALIGNMENT) - this is smaller and faster. */
26880 if (size
> 0 && size
< 16 && !STRICT_ALIGNMENT
)
26882 next_mode
= smallest_mode_for_size
26883 (size
* BITS_PER_UNIT
, MODE_INT
).require ();
26884 int n_bytes
= GET_MODE_SIZE (next_mode
).to_constant ();
26885 gcc_assert (n_bytes
<= mode_bytes
);
26886 offset
-= n_bytes
- size
;
26891 /* Memcpy interleaves loads with stores, memmove emits all loads first. */
26892 int nops
= ops
.length();
26893 int inc
= is_memmove
|| nops
<= 8 ? nops
: 6;
26895 for (int i
= 0; i
< nops
; i
+= inc
)
26897 int m
= MIN (nops
, i
+ inc
);
26899 for (int j
= i
; j
< m
; j
++)
26900 emit_insn (ops
[j
].first
);
26902 for (int j
= i
; j
< m
; j
++)
26903 emit_insn (ops
[j
].second
);
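/* Illustrative note (not part of the original source): with SIMD
   enabled, a constant 29-byte memcpy is expanded by the code above as
   one 16-byte Q-register copy at offset 0 plus one overlapping 16-byte
   copy at offset 13, roughly:

	ldr	q0, [x1]
	ldr	q1, [x1, 13]
	str	q0, [x0]
	str	q1, [x0, 13]

   The trailing access is stepped back by (access size - remaining
   bytes) so the tail is covered by a single unaligned access instead
   of a chain of byte/halfword/word copies.  */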
/* Expand a setmem using the MOPS instructions.  OPERANDS are the same
   as for the setmem pattern.  Return true iff we succeed.  */
static bool
aarch64_expand_setmem_mops (rtx *operands)
{
  if (!TARGET_MOPS)
    return false;

  /* The first two registers are changed by the instruction, so both
     of them must be a fresh pseudo.  */
  rtx dst_addr = copy_to_mode_reg (Pmode, XEXP (operands[0], 0));
  rtx dst_mem = replace_equiv_address (operands[0], dst_addr);
  rtx sz_reg = copy_to_mode_reg (DImode, operands[1]);
  rtx val = operands[2];
  if (val != CONST0_RTX (QImode))
    val = force_reg (QImode, val);
  emit_insn (gen_aarch64_setmemdi (dst_mem, val, sz_reg));

  return true;
}
26928 /* Expand setmem, as if from a __builtin_memset. Return true if
26929 we succeed, otherwise return false. */
26932 aarch64_expand_setmem (rtx
*operands
)
26935 unsigned HOST_WIDE_INT len
;
26936 rtx dst
= operands
[0];
26937 rtx val
= operands
[2], src
;
26938 unsigned align
= UINTVAL (operands
[3]);
26940 machine_mode mode
= BLKmode
, next_mode
;
26942 /* Variable-sized or strict-align memset may use the MOPS expansion. */
26943 if (!CONST_INT_P (operands
[1]) || !TARGET_SIMD
26944 || (STRICT_ALIGNMENT
&& align
< 16))
26945 return aarch64_expand_setmem_mops (operands
);
26947 /* Set inline limits for memset. MOPS has a separate threshold. */
26948 unsigned max_set_size
= MAX_SET_SIZE (optimize_function_for_speed_p (cfun
));
26949 unsigned mops_threshold
= aarch64_mops_memset_size_threshold
;
26951 len
= UINTVAL (operands
[1]);
26953 /* Large memset uses MOPS when available or a library call. */
26954 if (len
> max_set_size
|| (TARGET_MOPS
&& len
> mops_threshold
))
26955 return aarch64_expand_setmem_mops (operands
);
26957 base
= copy_to_mode_reg (Pmode
, XEXP (dst
, 0));
26958 dst
= adjust_automodify_address (dst
, VOIDmode
, base
, 0);
26960 /* Prepare the val using a DUP/MOVI v0.16B, val. */
26961 val
= expand_vector_broadcast (V16QImode
, val
);
26962 val
= force_reg (V16QImode
, val
);
26967 /* Find the largest mode in which to do the copy without
26969 opt_scalar_int_mode mode_iter
;
26970 FOR_EACH_MODE_IN_CLASS (mode_iter
, MODE_INT
)
26971 if (GET_MODE_SIZE (mode_iter
.require ()) <= MIN (len
, 16))
26972 mode
= mode_iter
.require ();
26974 gcc_assert (mode
!= BLKmode
);
26976 mode_bytes
= GET_MODE_SIZE (mode
).to_constant ();
26980 /* Prefer Q-register accesses. */
26981 if (mode_bytes
== 16)
26984 src
= lowpart_subreg (mode
, src
, GET_MODE (val
));
26986 emit_move_insn (adjust_address (dst
, mode
, offset
), src
);
26988 offset
+= mode_bytes
;
26990 /* Emit trailing writes using overlapping unaligned accesses
26991 (when !STRICT_ALIGNMENT) - this is smaller and faster. */
26992 if (len
> 0 && len
< 16 && !STRICT_ALIGNMENT
)
26994 next_mode
= smallest_mode_for_size
26995 (len
* BITS_PER_UNIT
, MODE_INT
).require ();
26996 int n_bytes
= GET_MODE_SIZE (next_mode
).to_constant ();
26997 gcc_assert (n_bytes
<= mode_bytes
);
26998 offset
-= n_bytes
- len
;
/* Split a DImode store of a CONST_INT SRC to MEM DST as two
   SImode stores.  Handle the case when the constant has identical
   bottom and top halves.  This is beneficial when the two stores can be
   merged into an STP and we avoid synthesising potentially expensive
   immediates twice.  Return true if such a split is possible.  */

bool
aarch64_split_dimode_const_store (rtx dst, rtx src)
{
  rtx lo = gen_lowpart (SImode, src);
  rtx hi = gen_highpart_mode (SImode, DImode, src);

  if (!rtx_equal_p (lo, hi))
    return false;

  unsigned int orig_cost
    = aarch64_internal_mov_immediate (NULL_RTX, src, false, DImode);
  unsigned int lo_cost
    = aarch64_internal_mov_immediate (NULL_RTX, lo, false, SImode);

  /* We want to transform:
     MOV	x1, 49370
     MOVK	x1, 0x140, lsl 16
     MOVK	x1, 0xc0da, lsl 32
     MOVK	x1, 0x140, lsl 48
     STR	x1, [x0]
   into:
     MOV	w1, 49370
     MOVK	w1, 0x140, lsl 16
     STP	w1, w1, [x0]
   So we want to perform this when we save at least one instruction.  */
  if (orig_cost <= lo_cost)
    return false;

  rtx mem_lo = adjust_address (dst, SImode, 0);
  if (!aarch64_mem_pair_operand (mem_lo, SImode))
    return false;

  rtx tmp_reg = gen_reg_rtx (SImode);
  aarch64_expand_mov_immediate (tmp_reg, lo);
  rtx mem_hi = aarch64_move_pointer (mem_lo, GET_MODE_SIZE (SImode));
  /* Don't emit an explicit store pair as this may not be always profitable.
     Let the sched-fusion logic decide whether to merge them.  */
  emit_move_insn (mem_lo, tmp_reg);
  emit_move_insn (mem_hi, tmp_reg);

  return true;
}
27056 /* Generate RTL for a conditional branch with rtx comparison CODE in
27057 mode CC_MODE. The destination of the unlikely conditional branch
27061 aarch64_gen_unlikely_cbranch (enum rtx_code code
, machine_mode cc_mode
,
27065 x
= gen_rtx_fmt_ee (code
, VOIDmode
,
27066 gen_rtx_REG (cc_mode
, CC_REGNUM
),
27069 x
= gen_rtx_IF_THEN_ELSE (VOIDmode
, x
,
27070 gen_rtx_LABEL_REF (VOIDmode
, label_ref
),
27072 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx
, x
));
/* Generate DImode scratch registers for 128-bit (TImode) addition.

   OP1 represents the TImode destination operand 1
   OP2 represents the TImode destination operand 2
   LOW_DEST represents the low half (DImode) of TImode operand 0
   LOW_IN1 represents the low half (DImode) of TImode operand 1
   LOW_IN2 represents the low half (DImode) of TImode operand 2
   HIGH_DEST represents the high half (DImode) of TImode operand 0
   HIGH_IN1 represents the high half (DImode) of TImode operand 1
   HIGH_IN2 represents the high half (DImode) of TImode operand 2.  */

void
aarch64_addti_scratch_regs (rtx op1, rtx op2, rtx *low_dest,
			    rtx *low_in1, rtx *low_in2,
			    rtx *high_dest, rtx *high_in1,
			    rtx *high_in2)
{
  *low_dest = gen_reg_rtx (DImode);
  *low_in1 = force_lowpart_subreg (DImode, op1, TImode);
  *low_in2 = force_lowpart_subreg (DImode, op2, TImode);
  *high_dest = gen_reg_rtx (DImode);
  *high_in1 = force_highpart_subreg (DImode, op1, TImode);
  *high_in2 = force_highpart_subreg (DImode, op2, TImode);
}
/* Generate DImode scratch registers for 128-bit (TImode) subtraction.

   OP1 represents the TImode destination operand 1
   OP2 represents the TImode destination operand 2
   LOW_DEST represents the low half (DImode) of TImode operand 0
   LOW_IN1 represents the low half (DImode) of TImode operand 1
   LOW_IN2 represents the low half (DImode) of TImode operand 2
   HIGH_DEST represents the high half (DImode) of TImode operand 0
   HIGH_IN1 represents the high half (DImode) of TImode operand 1
   HIGH_IN2 represents the high half (DImode) of TImode operand 2.  */

void
aarch64_subvti_scratch_regs (rtx op1, rtx op2, rtx *low_dest,
			     rtx *low_in1, rtx *low_in2,
			     rtx *high_dest, rtx *high_in1,
			     rtx *high_in2)
{
  *low_dest = gen_reg_rtx (DImode);
  *low_in1 = force_lowpart_subreg (DImode, op1, TImode);
  *low_in2 = force_lowpart_subreg (DImode, op2, TImode);
  *high_dest = gen_reg_rtx (DImode);

  *high_in1 = force_highpart_subreg (DImode, op1, TImode);
  *high_in2 = force_highpart_subreg (DImode, op2, TImode);
}
/* Generate RTL for 128-bit (TImode) subtraction with overflow.

   OP0 represents the TImode destination operand 0
   LOW_DEST represents the low half (DImode) of TImode operand 0
   LOW_IN1 represents the low half (DImode) of TImode operand 1
   LOW_IN2 represents the low half (DImode) of TImode operand 2
   HIGH_DEST represents the high half (DImode) of TImode operand 0
   HIGH_IN1 represents the high half (DImode) of TImode operand 1
   HIGH_IN2 represents the high half (DImode) of TImode operand 2
   UNSIGNED_P is true if the operation is being performed on unsigned
   values.  */

void
aarch64_expand_subvti (rtx op0, rtx low_dest, rtx low_in1,
		       rtx low_in2, rtx high_dest, rtx high_in1,
		       rtx high_in2, bool unsigned_p)
{
  if (low_in2 == const0_rtx)
    {
      low_dest = low_in1;
      high_in2 = force_reg (DImode, high_in2);
      if (unsigned_p)
	emit_insn (gen_subdi3_compare1 (high_dest, high_in1, high_in2));
      else
	emit_insn (gen_subvdi_insn (high_dest, high_in1, high_in2));
    }
  else
    {
      if (aarch64_plus_immediate (low_in2, DImode))
	emit_insn (gen_subdi3_compare1_imm (low_dest, low_in1, low_in2,
					    GEN_INT (-UINTVAL (low_in2))));
      else
	{
	  low_in2 = force_reg (DImode, low_in2);
	  emit_insn (gen_subdi3_compare1 (low_dest, low_in1, low_in2));
	}
      high_in2 = force_reg (DImode, high_in2);

      if (unsigned_p)
	emit_insn (gen_usubdi3_carryinC (high_dest, high_in1, high_in2));
      else
	emit_insn (gen_subdi3_carryinV (high_dest, high_in1, high_in2));
    }

  emit_move_insn (gen_lowpart (DImode, op0), low_dest);
  emit_move_insn (gen_highpart (DImode, op0), high_dest);
}
/* Implement the TARGET_ASAN_SHADOW_OFFSET hook.  */

static unsigned HOST_WIDE_INT
aarch64_asan_shadow_offset (void)
{
  if (TARGET_ILP32)
    return (HOST_WIDE_INT_1 << 29);
  else
    return (HOST_WIDE_INT_1 << 36);
}
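/* Note: 1 << 36 is the shadow offset used for the LP64 address-space
   layout, while the smaller 1 << 29 value is used for ILP32; whichever is
   chosen has to agree with the offset the ASan runtime expects.  */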
27187 aarch64_gen_ccmp_first (rtx_insn
**prep_seq
, rtx_insn
**gen_seq
,
27188 rtx_code code
, tree treeop0
, tree treeop1
)
27190 machine_mode op_mode
, cmp_mode
, cc_mode
= CCmode
;
27192 int unsignedp
= TYPE_UNSIGNED (TREE_TYPE (treeop0
));
27194 struct expand_operand ops
[4];
27197 expand_operands (treeop0
, treeop1
, NULL_RTX
, &op0
, &op1
, EXPAND_NORMAL
);
27199 op_mode
= GET_MODE (op0
);
27200 if (op_mode
== VOIDmode
)
27201 op_mode
= GET_MODE (op1
);
27209 icode
= CODE_FOR_cmpsi
;
27214 icode
= CODE_FOR_cmpdi
;
27219 cc_mode
= aarch64_select_cc_mode ((rtx_code
) code
, op0
, op1
);
27220 icode
= cc_mode
== CCFPEmode
? CODE_FOR_fcmpesf
: CODE_FOR_fcmpsf
;
27225 cc_mode
= aarch64_select_cc_mode ((rtx_code
) code
, op0
, op1
);
27226 icode
= cc_mode
== CCFPEmode
? CODE_FOR_fcmpedf
: CODE_FOR_fcmpdf
;
27234 op0
= prepare_operand (icode
, op0
, 0, op_mode
, cmp_mode
, unsignedp
);
27235 op1
= prepare_operand (icode
, op1
, 1, op_mode
, cmp_mode
, unsignedp
);
27241 *prep_seq
= get_insns ();
27244 create_fixed_operand (&ops
[0], op0
);
27245 create_fixed_operand (&ops
[1], op1
);
27248 if (!maybe_expand_insn (icode
, 2, ops
))
27253 *gen_seq
= get_insns ();
27256 return gen_rtx_fmt_ee ((rtx_code
) code
, cc_mode
,
27257 gen_rtx_REG (cc_mode
, CC_REGNUM
), const0_rtx
);
27261 aarch64_gen_ccmp_next (rtx_insn
**prep_seq
, rtx_insn
**gen_seq
, rtx prev
,
27262 rtx_code cmp_code
, tree treeop0
, tree treeop1
,
27265 rtx op0
, op1
, target
;
27266 machine_mode op_mode
, cmp_mode
, cc_mode
= CCmode
;
27267 int unsignedp
= TYPE_UNSIGNED (TREE_TYPE (treeop0
));
27269 struct expand_operand ops
[6];
27272 push_to_sequence (*prep_seq
);
27273 expand_operands (treeop0
, treeop1
, NULL_RTX
, &op0
, &op1
, EXPAND_NORMAL
);
27275 op_mode
= GET_MODE (op0
);
27276 if (op_mode
== VOIDmode
)
27277 op_mode
= GET_MODE (op1
);
27293 cc_mode
= aarch64_select_cc_mode ((rtx_code
) cmp_code
, op0
, op1
);
27298 cc_mode
= aarch64_select_cc_mode ((rtx_code
) cmp_code
, op0
, op1
);
27306 icode
= code_for_ccmp (cc_mode
, cmp_mode
);
27308 op0
= prepare_operand (icode
, op0
, 2, op_mode
, cmp_mode
, unsignedp
);
27309 op1
= prepare_operand (icode
, op1
, 3, op_mode
, cmp_mode
, unsignedp
);
27315 *prep_seq
= get_insns ();
27318 target
= gen_rtx_REG (cc_mode
, CC_REGNUM
);
27319 aarch64_cond
= aarch64_get_condition_code_1 (cc_mode
, (rtx_code
) cmp_code
);
27321 if (bit_code
!= AND
)
27323 /* Treat the ccmp patterns as canonical and use them where possible,
27324 but fall back to ccmp_rev patterns if there's no other option. */
27325 rtx_code prev_code
= GET_CODE (prev
);
27326 machine_mode prev_mode
= GET_MODE (XEXP (prev
, 0));
27327 if ((prev_mode
== CCFPmode
|| prev_mode
== CCFPEmode
)
27328 && !(prev_code
== EQ
27330 || prev_code
== ORDERED
27331 || prev_code
== UNORDERED
))
27332 icode
= code_for_ccmp_rev (cc_mode
, cmp_mode
);
27335 rtx_code code
= reverse_condition (prev_code
);
27336 prev
= gen_rtx_fmt_ee (code
, VOIDmode
, XEXP (prev
, 0), const0_rtx
);
27338 aarch64_cond
= AARCH64_INVERSE_CONDITION_CODE (aarch64_cond
);
27341 create_fixed_operand (&ops
[0], XEXP (prev
, 0));
27342 create_fixed_operand (&ops
[1], target
);
27343 create_fixed_operand (&ops
[2], op0
);
27344 create_fixed_operand (&ops
[3], op1
);
27345 create_fixed_operand (&ops
[4], prev
);
27346 create_fixed_operand (&ops
[5], GEN_INT (aarch64_cond
));
27348 push_to_sequence (*gen_seq
);
27349 if (!maybe_expand_insn (icode
, 6, ops
))
27355 *gen_seq
= get_insns ();
27358 return gen_rtx_fmt_ee ((rtx_code
) cmp_code
, VOIDmode
, target
, const0_rtx
);
27361 #undef TARGET_GEN_CCMP_FIRST
27362 #define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first
27364 #undef TARGET_GEN_CCMP_NEXT
27365 #define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next
/* Implement TARGET_SCHED_MACRO_FUSION_P.  Return true if target supports
   instruction fusion of some sort.  */

static bool
aarch64_macro_fusion_p (void)
{
  return aarch64_tune_params.fusible_ops != AARCH64_FUSE_NOTHING;
}
27377 /* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P. Return true if PREV and CURR
27378 should be kept together during scheduling. */
27381 aarch_macro_fusion_pair_p (rtx_insn
*prev
, rtx_insn
*curr
)
27384 rtx prev_set
= single_set (prev
);
27385 rtx curr_set
= single_set (curr
);
27386 /* prev and curr are simple SET insns i.e. no flag setting or branching. */
27387 bool simple_sets_p
= prev_set
&& curr_set
&& !any_condjump_p (curr
);
27389 if (!aarch64_macro_fusion_p ())
27392 if (simple_sets_p
&& aarch64_fusion_enabled_p (AARCH64_FUSE_MOV_MOVK
))
27394 /* We are trying to match:
27395 prev (mov) == (set (reg r0) (const_int imm16))
27396 curr (movk) == (set (zero_extract (reg r0)
27399 (const_int imm16_1)) */
27401 set_dest
= SET_DEST (curr_set
);
27403 if (GET_CODE (set_dest
) == ZERO_EXTRACT
27404 && CONST_INT_P (SET_SRC (curr_set
))
27405 && CONST_INT_P (SET_SRC (prev_set
))
27406 && CONST_INT_P (XEXP (set_dest
, 2))
27407 && INTVAL (XEXP (set_dest
, 2)) == 16
27408 && REG_P (XEXP (set_dest
, 0))
27409 && REG_P (SET_DEST (prev_set
))
27410 && REGNO (XEXP (set_dest
, 0)) == REGNO (SET_DEST (prev_set
)))
27416 if (simple_sets_p
&& aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_ADD
))
27419 /* We're trying to match:
27420 prev (adrp) == (set (reg r1)
27421 (high (symbol_ref ("SYM"))))
27422 curr (add) == (set (reg r0)
27424 (symbol_ref ("SYM"))))
27425 Note that r0 need not necessarily be the same as r1, especially
27426 during pre-regalloc scheduling. */
27428 if (satisfies_constraint_Ush (SET_SRC (prev_set
))
27429 && REG_P (SET_DEST (prev_set
)) && REG_P (SET_DEST (curr_set
)))
27431 if (GET_CODE (SET_SRC (curr_set
)) == LO_SUM
27432 && REG_P (XEXP (SET_SRC (curr_set
), 0))
27433 && REGNO (XEXP (SET_SRC (curr_set
), 0))
27434 == REGNO (SET_DEST (prev_set
))
27435 && rtx_equal_p (XEXP (SET_SRC (prev_set
), 0),
27436 XEXP (SET_SRC (curr_set
), 1)))
27441 if (simple_sets_p
&& aarch64_fusion_enabled_p (AARCH64_FUSE_MOVK_MOVK
))
27444 /* We're trying to match:
27445 prev (movk) == (set (zero_extract (reg r0)
27448 (const_int imm16_1))
27449 curr (movk) == (set (zero_extract (reg r0)
27452 (const_int imm16_2)) */
27454 if (GET_CODE (SET_DEST (prev_set
)) == ZERO_EXTRACT
27455 && GET_CODE (SET_DEST (curr_set
)) == ZERO_EXTRACT
27456 && REG_P (XEXP (SET_DEST (prev_set
), 0))
27457 && REG_P (XEXP (SET_DEST (curr_set
), 0))
27458 && REGNO (XEXP (SET_DEST (prev_set
), 0))
27459 == REGNO (XEXP (SET_DEST (curr_set
), 0))
27460 && CONST_INT_P (XEXP (SET_DEST (prev_set
), 2))
27461 && CONST_INT_P (XEXP (SET_DEST (curr_set
), 2))
27462 && INTVAL (XEXP (SET_DEST (prev_set
), 2)) == 32
27463 && INTVAL (XEXP (SET_DEST (curr_set
), 2)) == 48
27464 && CONST_INT_P (SET_SRC (prev_set
))
27465 && CONST_INT_P (SET_SRC (curr_set
)))
27469 if (simple_sets_p
&& aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_LDR
))
27471 /* We're trying to match:
27472 prev (adrp) == (set (reg r0)
27473 (high (symbol_ref ("SYM"))))
27474 curr (ldr) == (set (reg r1)
27475 (mem (lo_sum (reg r0)
27476 (symbol_ref ("SYM")))))
27478 curr (ldr) == (set (reg r1)
27481 (symbol_ref ("SYM")))))) */
27482 if (satisfies_constraint_Ush (SET_SRC (prev_set
))
27483 && REG_P (SET_DEST (prev_set
)) && REG_P (SET_DEST (curr_set
)))
27485 rtx curr_src
= SET_SRC (curr_set
);
27487 if (GET_CODE (curr_src
) == ZERO_EXTEND
)
27488 curr_src
= XEXP (curr_src
, 0);
27490 if (MEM_P (curr_src
) && GET_CODE (XEXP (curr_src
, 0)) == LO_SUM
27491 && REG_P (XEXP (XEXP (curr_src
, 0), 0))
27492 && REGNO (XEXP (XEXP (curr_src
, 0), 0))
27493 == REGNO (SET_DEST (prev_set
))
27494 && rtx_equal_p (XEXP (XEXP (curr_src
, 0), 1),
27495 XEXP (SET_SRC (prev_set
), 0)))
27500 /* Fuse compare (CMP/CMN/TST/BICS) and conditional branch. */
27501 if (aarch64_fusion_enabled_p (AARCH64_FUSE_CMP_BRANCH
)
27502 && prev_set
&& curr_set
&& any_condjump_p (curr
)
27503 && GET_CODE (SET_SRC (prev_set
)) == COMPARE
27504 && SCALAR_INT_MODE_P (GET_MODE (XEXP (SET_SRC (prev_set
), 0)))
27505 && reg_referenced_p (SET_DEST (prev_set
), PATTERN (curr
)))
27508 /* Fuse CMP and CSEL/CSET. */
27509 if (prev_set
&& curr_set
27510 && GET_CODE (SET_SRC (prev_set
)) == COMPARE
27511 && SCALAR_INT_MODE_P (GET_MODE (XEXP (SET_SRC (prev_set
), 0)))
27512 && reg_referenced_p (SET_DEST (prev_set
), PATTERN (curr
)))
27514 enum attr_type prev_type
= get_attr_type (prev
);
27515 if ((prev_type
== TYPE_ALUS_SREG
|| prev_type
== TYPE_ALUS_IMM
)
27516 && ((aarch64_fusion_enabled_p (AARCH64_FUSE_CMP_CSEL
)
27517 && GET_CODE (SET_SRC (curr_set
)) == IF_THEN_ELSE
27518 && aarch64_reg_or_zero (XEXP (SET_SRC (curr_set
), 1), VOIDmode
)
27519 && aarch64_reg_or_zero (XEXP (SET_SRC (curr_set
), 2), VOIDmode
)
27520 && SCALAR_INT_MODE_P (GET_MODE (XEXP (SET_SRC (curr_set
), 1))))
27521 || (aarch64_fusion_enabled_p (AARCH64_FUSE_CMP_CSET
)
27522 && GET_RTX_CLASS (GET_CODE (SET_SRC (curr_set
)))
27524 && REG_P (SET_DEST (curr_set
)))))
27528 /* Fuse flag-setting ALU instructions and conditional branch. */
27529 if (aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_BRANCH
)
27530 && any_condjump_p (curr
))
27532 unsigned int condreg1
, condreg2
;
27534 aarch64_fixed_condition_code_regs (&condreg1
, &condreg2
);
27535 cc_reg_1
= gen_rtx_REG (CCmode
, condreg1
);
27537 if (reg_referenced_p (cc_reg_1
, PATTERN (curr
))
27539 && modified_in_p (cc_reg_1
, prev
))
27541 enum attr_type prev_type
= get_attr_type (prev
);
	  /* FIXME: this misses some instructions which are considered
	     simple arithmetic for ThunderX.  Simple shifts are missed
	     here.  */
27545 if (prev_type
== TYPE_ALUS_SREG
27546 || prev_type
== TYPE_ALUS_IMM
27547 || prev_type
== TYPE_LOGICS_REG
27548 || prev_type
== TYPE_LOGICS_IMM
)
27553 /* Fuse ALU instructions and CBZ/CBNZ. */
27556 && aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_CBZ
)
27557 && any_condjump_p (curr
))
27559 /* We're trying to match:
27560 prev (alu_insn) == (set (r0) plus ((r0) (r1/imm)))
27561 curr (cbz) == (set (pc) (if_then_else (eq/ne) (r0)
27563 (label_ref ("SYM"))
27565 if (SET_DEST (curr_set
) == (pc_rtx
)
27566 && GET_CODE (SET_SRC (curr_set
)) == IF_THEN_ELSE
27567 && REG_P (XEXP (XEXP (SET_SRC (curr_set
), 0), 0))
27568 && REG_P (SET_DEST (prev_set
))
27569 && REGNO (SET_DEST (prev_set
))
27570 == REGNO (XEXP (XEXP (SET_SRC (curr_set
), 0), 0)))
27572 /* Fuse ALU operations followed by conditional branch instruction. */
27573 switch (get_attr_type (prev
))
27576 case TYPE_ALU_SREG
:
27579 case TYPE_ADCS_REG
:
27580 case TYPE_ADCS_IMM
:
27581 case TYPE_LOGIC_REG
:
27582 case TYPE_LOGIC_IMM
:
27586 case TYPE_SHIFT_REG
:
27587 case TYPE_SHIFT_IMM
:
27599 /* Fuse A+B+1 and A-B-1 */
27601 && aarch64_fusion_enabled_p (AARCH64_FUSE_ADDSUB_2REG_CONST1
))
27603 /* We're trying to match:
27604 prev == (set (r0) (plus (r0) (r1)))
27605 curr == (set (r0) (plus (r0) (const_int 1)))
27607 prev == (set (r0) (minus (r0) (r1)))
27608 curr == (set (r0) (plus (r0) (const_int -1))) */
27610 rtx prev_src
= SET_SRC (prev_set
);
27611 rtx curr_src
= SET_SRC (curr_set
);
27614 if (GET_CODE (prev_src
) == MINUS
)
27617 if (GET_CODE (curr_src
) == PLUS
27618 && (GET_CODE (prev_src
) == PLUS
|| GET_CODE (prev_src
) == MINUS
)
27619 && CONST_INT_P (XEXP (curr_src
, 1))
27620 && INTVAL (XEXP (curr_src
, 1)) == polarity
27621 && REG_P (XEXP (curr_src
, 0))
27622 && REG_P (SET_DEST (prev_set
))
27623 && REGNO (SET_DEST (prev_set
)) == REGNO (XEXP (curr_src
, 0)))
/* Return true iff the instruction fusion described by OP is enabled.  */

bool
aarch64_fusion_enabled_p (enum aarch64_fusion_pairs op)
{
  return (aarch64_tune_params.fusible_ops & op) != 0;
}
/* If MEM is in the form of [base+offset], extract the two parts
   of address and set to BASE and OFFSET, otherwise return false
   after clearing BASE and OFFSET.  */

static bool
extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
{
  rtx addr;

  gcc_assert (MEM_P (mem));

  addr = XEXP (mem, 0);

  if (REG_P (addr))
    {
      *base = addr;
      *offset = const0_rtx;
      return true;
    }

  if (GET_CODE (addr) == PLUS
      && REG_P (XEXP (addr, 0)) && CONST_INT_P (XEXP (addr, 1)))
    {
      *base = XEXP (addr, 0);
      *offset = XEXP (addr, 1);
      return true;
    }

  *base = NULL_RTX;
  *offset = NULL_RTX;

  return false;
}
/* Types for scheduling fusion.  */
enum sched_fusion_type
{
  SCHED_FUSION_NONE = 0,
  SCHED_FUSION_LD_SIGN_EXTEND,
  SCHED_FUSION_LD_ZERO_EXTEND,
  SCHED_FUSION_LD,
  SCHED_FUSION_ST,
  SCHED_FUSION_NUM
};
27683 /* If INSN is a load or store of address in the form of [base+offset],
27684 extract the two parts and set to BASE and OFFSET. Return scheduling
27685 fusion type this INSN is. */
27687 static enum sched_fusion_type
27688 fusion_load_store (rtx_insn
*insn
, rtx
*base
, rtx
*offset
)
27691 enum sched_fusion_type fusion
= SCHED_FUSION_LD
;
27693 gcc_assert (INSN_P (insn
));
27694 x
= PATTERN (insn
);
27695 if (GET_CODE (x
) != SET
)
27696 return SCHED_FUSION_NONE
;
27699 dest
= SET_DEST (x
);
27701 machine_mode dest_mode
= GET_MODE (dest
);
27703 if (!aarch64_mode_valid_for_sched_fusion_p (dest_mode
))
27704 return SCHED_FUSION_NONE
;
27706 if (GET_CODE (src
) == SIGN_EXTEND
)
27708 fusion
= SCHED_FUSION_LD_SIGN_EXTEND
;
27709 src
= XEXP (src
, 0);
27710 if (!MEM_P (src
) || GET_MODE (src
) != SImode
)
27711 return SCHED_FUSION_NONE
;
27713 else if (GET_CODE (src
) == ZERO_EXTEND
)
27715 fusion
= SCHED_FUSION_LD_ZERO_EXTEND
;
27716 src
= XEXP (src
, 0);
27717 if (!MEM_P (src
) || GET_MODE (src
) != SImode
)
27718 return SCHED_FUSION_NONE
;
27721 if (MEM_P (src
) && REG_P (dest
))
27722 extract_base_offset_in_addr (src
, base
, offset
);
27723 else if (MEM_P (dest
) && (REG_P (src
) || src
== const0_rtx
))
27725 fusion
= SCHED_FUSION_ST
;
27726 extract_base_offset_in_addr (dest
, base
, offset
);
27729 return SCHED_FUSION_NONE
;
27731 if (*base
== NULL_RTX
|| *offset
== NULL_RTX
)
27732 fusion
= SCHED_FUSION_NONE
;
27737 /* Implement the TARGET_SCHED_FUSION_PRIORITY hook.
   Currently we only support fusing ldr or str instructions, so FUSION_PRI
   and PRI are only calculated for these instructions.  For other
   instructions, FUSION_PRI and PRI are simply set to MAX_PRI - 1.  In the
   future, other kinds of instruction fusion can be added by returning
   different priorities.
27744 It's important that irrelevant instructions get the largest FUSION_PRI. */
27747 aarch64_sched_fusion_priority (rtx_insn
*insn
, int max_pri
,
27748 int *fusion_pri
, int *pri
)
27752 enum sched_fusion_type fusion
;
27754 gcc_assert (INSN_P (insn
));
27757 fusion
= fusion_load_store (insn
, &base
, &offset
);
27758 if (fusion
== SCHED_FUSION_NONE
)
27765 /* Set FUSION_PRI according to fusion type and base register. */
27766 *fusion_pri
= tmp
- fusion
* FIRST_PSEUDO_REGISTER
- REGNO (base
);
27768 /* Calculate PRI. */
27771 /* INSN with smaller offset goes first. */
27772 off_val
= (int)(INTVAL (offset
));
27774 tmp
-= (off_val
& 0xfffff);
27776 tmp
+= ((- off_val
) & 0xfffff);
27782 /* Implement the TARGET_SCHED_ADJUST_PRIORITY hook.
27783 Adjust priority of sha1h instructions so they are scheduled before
27784 other SHA1 instructions. */
27787 aarch64_sched_adjust_priority (rtx_insn
*insn
, int priority
)
27789 rtx x
= PATTERN (insn
);
27791 if (GET_CODE (x
) == SET
)
27795 if (GET_CODE (x
) == UNSPEC
&& XINT (x
, 1) == UNSPEC_SHA1H
)
27796 return priority
+ 10;
27802 /* If REVERSED is null, return true if memory reference *MEM2 comes
27803 immediately after memory reference *MEM1. Do not change the references
27806 Otherwise, check if *MEM1 and *MEM2 are consecutive memory references and,
27807 if they are, try to make them use constant offsets from the same base
27808 register. Return true on success. When returning true, set *REVERSED
27809 to true if *MEM1 comes after *MEM2, false if *MEM1 comes before *MEM2. */
27811 aarch64_check_consecutive_mems (rtx
*mem1
, rtx
*mem2
, bool *reversed
)
27816 if (GET_RTX_CLASS (GET_CODE (XEXP (*mem1
, 0))) == RTX_AUTOINC
27817 || GET_RTX_CLASS (GET_CODE (XEXP (*mem2
, 0))) == RTX_AUTOINC
)
27820 if (!MEM_SIZE_KNOWN_P (*mem1
) || !MEM_SIZE_KNOWN_P (*mem2
))
27823 auto size1
= MEM_SIZE (*mem1
);
27824 auto size2
= MEM_SIZE (*mem2
);
27826 rtx base1
, base2
, offset1
, offset2
;
27827 extract_base_offset_in_addr (*mem1
, &base1
, &offset1
);
27828 extract_base_offset_in_addr (*mem2
, &base2
, &offset2
);
27830 /* Make sure at least one memory is in base+offset form. */
27831 if (!(base1
&& offset1
) && !(base2
&& offset2
))
27834 /* If both mems already use the same base register, just check the
27836 if (base1
&& base2
&& rtx_equal_p (base1
, base2
))
27838 if (!offset1
|| !offset2
)
27841 if (known_eq (UINTVAL (offset1
) + size1
, UINTVAL (offset2
)))
27844 if (known_eq (UINTVAL (offset2
) + size2
, UINTVAL (offset1
)) && reversed
)
27853 /* Otherwise, check whether the MEM_EXPRs and MEM_OFFSETs together
27854 guarantee that the values are consecutive. */
27855 if (MEM_EXPR (*mem1
)
27856 && MEM_EXPR (*mem2
)
27857 && MEM_OFFSET_KNOWN_P (*mem1
)
27858 && MEM_OFFSET_KNOWN_P (*mem2
))
27860 poly_int64 expr_offset1
;
27861 poly_int64 expr_offset2
;
27862 tree expr_base1
= get_addr_base_and_unit_offset (MEM_EXPR (*mem1
),
27864 tree expr_base2
= get_addr_base_and_unit_offset (MEM_EXPR (*mem2
),
27868 || !DECL_P (expr_base1
)
27869 || !operand_equal_p (expr_base1
, expr_base2
, OEP_ADDRESS_OF
))
27872 expr_offset1
+= MEM_OFFSET (*mem1
);
27873 expr_offset2
+= MEM_OFFSET (*mem2
);
27875 if (known_eq (expr_offset1
+ size1
, expr_offset2
))
27877 else if (known_eq (expr_offset2
+ size2
, expr_offset1
) && reversed
)
27886 rtx addr1
= plus_constant (Pmode
, XEXP (*mem2
, 0),
27887 expr_offset1
- expr_offset2
);
27888 *mem1
= replace_equiv_address_nv (*mem1
, addr1
);
27892 rtx addr2
= plus_constant (Pmode
, XEXP (*mem1
, 0),
27893 expr_offset2
- expr_offset1
);
27894 *mem2
= replace_equiv_address_nv (*mem2
, addr2
);
/* Test if MODE is suitable for a single transfer register in an ldp or stp
   instruction.  */

bool
aarch64_ldpstp_operand_mode_p (machine_mode mode)
{
  if (!targetm.hard_regno_mode_ok (V0_REGNUM, mode)
      || hard_regno_nregs (V0_REGNUM, mode) > 1)
    return false;

  const auto size = GET_MODE_SIZE (mode);
  return known_eq (size, 4) || known_eq (size, 8) || known_eq (size, 16);
}
/* Return true if MEM1 and MEM2 can be combined into a single access
   of mode MODE, with the combined access having the same address as MEM1.  */

bool
aarch64_mergeable_load_pair_p (machine_mode mode, rtx mem1, rtx mem2)
{
  if (STRICT_ALIGNMENT && MEM_ALIGN (mem1) < GET_MODE_ALIGNMENT (mode))
    return false;
  return aarch64_check_consecutive_mems (&mem1, &mem2, nullptr);
}
/* Return true if MEM agrees with the ldp-stp policy model.
   Otherwise, false.  */

bool
aarch64_mem_ok_with_ldpstp_policy_model (rtx mem, bool load, machine_mode mode)
{
  auto policy = (load
		 ? aarch64_tune_params.ldp_policy_model
		 : aarch64_tune_params.stp_policy_model);

  /* If we have AARCH64_LDP_STP_POLICY_NEVER, reject the load pair.  */
  if (policy == AARCH64_LDP_STP_POLICY_NEVER)
    return false;

  /* If we have AARCH64_LDP_STP_POLICY_ALIGNED,
     do not emit the load pair unless the alignment is checked to be
     at least double the alignment of the type.  */
  if (policy == AARCH64_LDP_STP_POLICY_ALIGNED
      && !optimize_function_for_size_p (cfun)
      && MEM_ALIGN (mem) < 2 * GET_MODE_ALIGNMENT (mode))
    return false;

  return true;
}
27953 /* Given OPERANDS of consecutive load/store, check if we can merge
27954 them into ldp/stp. LOAD is true if they are load instructions. */
27957 aarch64_operands_ok_for_ldpstp (rtx
*operands
, bool load
)
27959 enum reg_class rclass_1
, rclass_2
;
27960 rtx mem_1
, mem_2
, reg_1
, reg_2
;
27964 mem_1
= operands
[1];
27965 mem_2
= operands
[3];
27966 reg_1
= operands
[0];
27967 reg_2
= operands
[2];
27968 gcc_assert (REG_P (reg_1
) && REG_P (reg_2
));
27969 if (REGNO (reg_1
) == REGNO (reg_2
))
27971 if (reg_overlap_mentioned_p (reg_1
, mem_2
))
27976 mem_1
= operands
[0];
27977 mem_2
= operands
[2];
27978 reg_1
= operands
[1];
27979 reg_2
= operands
[3];
27982 /* The mems cannot be volatile. */
27983 if (MEM_VOLATILE_P (mem_1
) || MEM_VOLATILE_P (mem_2
))
27986 /* Check if the addresses are in the form of [base+offset]. */
27987 bool reversed
= false;
27988 if (!aarch64_check_consecutive_mems (&mem_1
, &mem_2
, &reversed
))
27991 /* The operands must be of the same size. */
27992 gcc_assert (known_eq (GET_MODE_SIZE (GET_MODE (mem_1
)),
27993 GET_MODE_SIZE (GET_MODE (mem_2
))));
27995 /* The lower memory access must be a mem-pair operand. */
27996 rtx lower_mem
= reversed
? mem_2
: mem_1
;
27997 machine_mode lower_mem_mode
= GET_MODE (lower_mem
);
27998 if (!aarch64_mem_pair_operand (lower_mem
, lower_mem_mode
))
28001 /* Check if lower_mem is ok with the ldp-stp policy model. */
28002 if (!aarch64_mem_ok_with_ldpstp_policy_model (lower_mem
, load
,
28006 if (REG_P (reg_1
) && FP_REGNUM_P (REGNO (reg_1
)))
28007 rclass_1
= FP_REGS
;
28009 rclass_1
= GENERAL_REGS
;
28011 if (REG_P (reg_2
) && FP_REGNUM_P (REGNO (reg_2
)))
28012 rclass_2
= FP_REGS
;
28014 rclass_2
= GENERAL_REGS
;
28016 /* Check if the registers are of same class. */
28017 if (rclass_1
!= rclass_2
)
28023 /* Given OPERANDS of consecutive load/store that can be merged,
28024 swap them if they are not in ascending order. */
28026 aarch64_swap_ldrstr_operands (rtx
* operands
, bool load
)
28028 int mem_op
= load
? 1 : 0;
28029 bool reversed
= false;
28030 if (!aarch64_check_consecutive_mems (operands
+ mem_op
,
28031 operands
+ mem_op
+ 2, &reversed
))
28032 gcc_unreachable ();
28036 /* Irrespective of whether this is a load or a store,
28037 we do the same swap. */
28038 std::swap (operands
[0], operands
[2]);
28039 std::swap (operands
[1], operands
[3]);
28043 /* Helper function used for generation of load/store pair instructions, called
28044 from peepholes in aarch64-ldpstp.md. OPERANDS is an array of
28045 operands as matched by the peepholes in that file. LOAD_P is true if we're
28046 generating a load pair, otherwise we're generating a store pair. CODE is
28047 either {ZERO,SIGN}_EXTEND for extending loads or UNKNOWN if we're generating a
28048 standard load/store pair. */
28051 aarch64_finish_ldpstp_peephole (rtx
*operands
, bool load_p
, enum rtx_code code
)
28053 aarch64_swap_ldrstr_operands (operands
, load_p
);
28056 emit_insn (aarch64_gen_load_pair (operands
[0], operands
[2],
28057 operands
[1], code
));
28060 gcc_assert (code
== UNKNOWN
);
28061 emit_insn (aarch64_gen_store_pair (operands
[0], operands
[1],
/* Taking X and Y to be HOST_WIDE_INT pointers, return the result of a
   comparison between the two.  */

static int
aarch64_host_wide_int_compare (const void *x, const void *y)
{
  return wi::cmps (* ((const HOST_WIDE_INT *) x),
		   * ((const HOST_WIDE_INT *) y));
}
28075 /* Taking X and Y to be pairs of RTX, one pointing to a MEM rtx and the
28076 other pointing to a REG rtx containing an offset, compare the offsets
28081 1 iff offset (X) > offset (Y)
28082 0 iff offset (X) == offset (Y)
28083 -1 iff offset (X) < offset (Y) */
28085 aarch64_ldrstr_offset_compare (const void *x
, const void *y
)
28087 const rtx
* operands_1
= (const rtx
*) x
;
28088 const rtx
* operands_2
= (const rtx
*) y
;
28089 rtx mem_1
, mem_2
, base
, offset_1
, offset_2
;
28091 if (MEM_P (operands_1
[0]))
28092 mem_1
= operands_1
[0];
28094 mem_1
= operands_1
[1];
28096 if (MEM_P (operands_2
[0]))
28097 mem_2
= operands_2
[0];
28099 mem_2
= operands_2
[1];
28101 /* Extract the offsets. */
28102 extract_base_offset_in_addr (mem_1
, &base
, &offset_1
);
28103 extract_base_offset_in_addr (mem_2
, &base
, &offset_2
);
28105 gcc_assert (offset_1
!= NULL_RTX
&& offset_2
!= NULL_RTX
);
28107 return wi::cmps (INTVAL (offset_1
), INTVAL (offset_2
));
28110 /* Given OPERANDS of consecutive load/store, check if we can merge
28111 them into ldp/stp by adjusting the offset. LOAD is true if they
28112 are load instructions. MODE is the mode of memory operands.
28114 Given below consecutive stores:
28116 str w1, [xb, 0x100]
28117 str w1, [xb, 0x104]
28118 str w1, [xb, 0x108]
28119 str w1, [xb, 0x10c]
28121 Though the offsets are out of the range supported by stp, we can
28122 still pair them after adjusting the offset, like:
28124 add scratch, xb, 0x100
28125 stp w1, w1, [scratch]
28126 stp w1, w1, [scratch, 0x8]
   The peephole patterns detecting this opportunity should guarantee
   the scratch register is available.  */
28132 aarch64_operands_adjust_ok_for_ldpstp (rtx
*operands
, bool load
,
28135 const int num_insns
= 4;
28136 enum reg_class rclass
;
28137 HOST_WIDE_INT offvals
[num_insns
], msize
;
28138 rtx mem
[num_insns
], reg
[num_insns
], base
[num_insns
], offset
[num_insns
];
28142 for (int i
= 0; i
< num_insns
; i
++)
28144 reg
[i
] = operands
[2 * i
];
28145 mem
[i
] = operands
[2 * i
+ 1];
28147 gcc_assert (REG_P (reg
[i
]));
28150 /* Do not attempt to merge the loads if the loads clobber each other. */
28151 for (int i
= 0; i
< 8; i
+= 2)
28152 for (int j
= i
+ 2; j
< 8; j
+= 2)
28153 if (reg_overlap_mentioned_p (operands
[i
], operands
[j
]))
28157 for (int i
= 0; i
< num_insns
; i
++)
28159 mem
[i
] = operands
[2 * i
];
28160 reg
[i
] = operands
[2 * i
+ 1];
28163 /* Skip if memory operand is by itself valid for ldp/stp. */
28164 if (!MEM_P (mem
[0]) || aarch64_mem_pair_operand (mem
[0], mode
))
28167 for (int i
= 0; i
< num_insns
; i
++)
28169 /* The mems cannot be volatile. */
28170 if (MEM_VOLATILE_P (mem
[i
]))
28173 /* Check if the addresses are in the form of [base+offset]. */
28174 extract_base_offset_in_addr (mem
[i
], base
+ i
, offset
+ i
);
28175 if (base
[i
] == NULL_RTX
|| offset
[i
] == NULL_RTX
)
28179 /* Check if the registers are of same class. */
28180 rclass
= REG_P (reg
[0]) && FP_REGNUM_P (REGNO (reg
[0]))
28181 ? FP_REGS
: GENERAL_REGS
;
28183 for (int i
= 1; i
< num_insns
; i
++)
28184 if (REG_P (reg
[i
]) && FP_REGNUM_P (REGNO (reg
[i
])))
28186 if (rclass
!= FP_REGS
)
28191 if (rclass
!= GENERAL_REGS
)
28195 /* Only the last register in the order in which they occur
28196 may be clobbered by the load. */
28197 if (rclass
== GENERAL_REGS
&& load
)
28198 for (int i
= 0; i
< num_insns
- 1; i
++)
28199 if (reg_mentioned_p (reg
[i
], mem
[i
]))
28202 /* Check if the bases are same. */
28203 for (int i
= 0; i
< num_insns
- 1; i
++)
28204 if (!rtx_equal_p (base
[i
], base
[i
+ 1]))
28207 for (int i
= 0; i
< num_insns
; i
++)
28208 offvals
[i
] = INTVAL (offset
[i
]);
28210 msize
= GET_MODE_SIZE (mode
).to_constant ();
28212 /* Check if the offsets can be put in the right order to do a ldp/stp. */
28213 qsort (offvals
, num_insns
, sizeof (HOST_WIDE_INT
),
28214 aarch64_host_wide_int_compare
);
28216 if (!(offvals
[1] == offvals
[0] + msize
28217 && offvals
[3] == offvals
[2] + msize
))
  /* Check that offsets are within range of each other.  The ldp/stp
     instructions have 7 bit immediate offsets, so use 0x80.  */
  if (offvals[2] - offvals[0] >= msize * 0x80)
    return false;
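  /* For example, with msize == 4 (SImode accesses) each ldp/stp immediate
     can reach offsets in [-64 * 4, 63 * 4]; if the sorted pairs start
     4 * 0x80 == 512 bytes or more apart, no single adjusted base can serve
     both pairs, so the transformation is rejected.  */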
28225 /* The offsets must be aligned with respect to each other. */
28226 if (offvals
[0] % msize
!= offvals
[2] % msize
)
28229 /* Check if mem[0] is ok with the ldp-stp policy model. */
28230 if (!aarch64_mem_ok_with_ldpstp_policy_model (mem
[0], load
, mode
))
28236 /* Given OPERANDS of consecutive load/store, this function pairs them
28237 into LDP/STP after adjusting the offset. It depends on the fact
28238 that the operands can be sorted so the offsets are correct for STP.
28239 MODE is the mode of memory operands. CODE is the rtl operator
28240 which should be applied to all memory operands, it's SIGN_EXTEND,
28241 ZERO_EXTEND or UNKNOWN. */
28244 aarch64_gen_adjusted_ldpstp (rtx
*operands
, bool load
,
28245 machine_mode mode
, RTX_CODE code
)
28247 rtx base
, offset_1
, offset_2
;
28249 rtx temp_operands
[8];
28250 HOST_WIDE_INT off_val_1
, off_val_2
, base_off
, new_off_1
, new_off_2
,
28251 stp_off_upper_limit
, stp_off_lower_limit
, msize
;
28253 /* We make changes on a copy as we may still bail out. */
28254 for (int i
= 0; i
< 8; i
++)
28255 temp_operands
[i
] = operands
[i
];
  /* Sort the operands.  When two of the accesses use the same offset (say
     two stores to [base + 0x320]), we need a stable sort, otherwise the
     wrong data may be stored to offset 0x320.  Also note the dead store in
     that case should be optimized away, but there are no guarantees here.  */
28265 gcc_stablesort(temp_operands
, 4, 2 * sizeof (rtx
*),
28266 aarch64_ldrstr_offset_compare
);
28268 /* Copy the memory operands so that if we have to bail for some
28269 reason the original addresses are unchanged. */
28272 mem_1
= copy_rtx (temp_operands
[1]);
28273 mem_2
= copy_rtx (temp_operands
[5]);
28277 mem_1
= copy_rtx (temp_operands
[0]);
28278 mem_2
= copy_rtx (temp_operands
[4]);
28279 gcc_assert (code
== UNKNOWN
);
28282 extract_base_offset_in_addr (mem_1
, &base
, &offset_1
);
28283 extract_base_offset_in_addr (mem_2
, &base
, &offset_2
);
28284 gcc_assert (base
!= NULL_RTX
&& offset_1
!= NULL_RTX
28285 && offset_2
!= NULL_RTX
);
28287 /* Adjust offset so it can fit in LDP/STP instruction. */
28288 msize
= GET_MODE_SIZE (mode
).to_constant();
28289 stp_off_upper_limit
= msize
* (0x40 - 1);
28290 stp_off_lower_limit
= - msize
* 0x40;
28292 off_val_1
= INTVAL (offset_1
);
28293 off_val_2
= INTVAL (offset_2
);
28295 /* The base offset is optimally half way between the two STP/LDP offsets. */
28297 base_off
= (off_val_1
+ off_val_2
) / 2;
  /* However, due to issues with negative LDP/STP offset generation for
     larger modes (DF, DD, DI and vector modes), we must not use negative
     addresses smaller than 9 signed unadjusted bits can store.  This
     provides the most range in this case.  */
28303 base_off
= off_val_1
;
28305 /* Adjust the base so that it is aligned with the addresses but still
28307 if (base_off
% msize
!= off_val_1
% msize
)
28308 /* Fix the offset, bearing in mind we want to make it bigger not
28310 base_off
+= (((base_off
% msize
) - (off_val_1
% msize
)) + msize
) % msize
;
28311 else if (msize
<= 4)
28312 /* The negative range of LDP/STP is one larger than the positive range. */
28315 /* Check if base offset is too big or too small. We can attempt to resolve
28316 this issue by setting it to the maximum value and seeing if the offsets
28318 if (base_off
>= 0x1000)
28320 base_off
= 0x1000 - 1;
28321 /* We must still make sure that the base offset is aligned with respect
28322 to the address. But it may not be made any bigger. */
28323 base_off
-= (((base_off
% msize
) - (off_val_1
% msize
)) + msize
) % msize
;
28326 /* Likewise for the case where the base is too small. */
28327 if (base_off
<= -0x1000)
28329 base_off
= -0x1000 + 1;
28330 base_off
+= (((base_off
% msize
) - (off_val_1
% msize
)) + msize
) % msize
;
28333 /* Offset of the first STP/LDP. */
28334 new_off_1
= off_val_1
- base_off
;
28336 /* Offset of the second STP/LDP. */
28337 new_off_2
= off_val_2
- base_off
;
28339 /* The offsets must be within the range of the LDP/STP instructions. */
28340 if (new_off_1
> stp_off_upper_limit
|| new_off_1
< stp_off_lower_limit
28341 || new_off_2
> stp_off_upper_limit
|| new_off_2
< stp_off_lower_limit
)
28344 replace_equiv_address_nv (mem_1
, plus_constant (Pmode
, operands
[8],
28346 replace_equiv_address_nv (mem_2
, plus_constant (Pmode
, operands
[8],
28349 if (!aarch64_mem_pair_operand (mem_1
, mode
)
28350 || !aarch64_mem_pair_operand (mem_2
, mode
))
28355 operands
[0] = temp_operands
[0];
28356 operands
[1] = mem_1
;
28357 operands
[2] = temp_operands
[2];
28358 operands
[4] = temp_operands
[4];
28359 operands
[5] = mem_2
;
28360 operands
[6] = temp_operands
[6];
28364 operands
[0] = mem_1
;
28365 operands
[1] = temp_operands
[1];
28366 operands
[3] = temp_operands
[3];
28367 operands
[4] = mem_2
;
28368 operands
[5] = temp_operands
[5];
28369 operands
[7] = temp_operands
[7];
28372 /* Emit adjusting instruction. */
28373 emit_insn (gen_rtx_SET (operands
[8], plus_constant (DImode
, base
, base_off
)));
28374 /* Emit ldp/stp instructions. */
28377 emit_insn (aarch64_gen_load_pair (operands
[0], operands
[2],
28378 operands
[1], code
));
28379 emit_insn (aarch64_gen_load_pair (operands
[4], operands
[6],
28380 operands
[5], code
));
28384 emit_insn (aarch64_gen_store_pair (operands
[0], operands
[1],
28386 emit_insn (aarch64_gen_store_pair (operands
[4], operands
[5],
28392 /* Implement TARGET_VECTORIZE_CONDITIONAL_OPERATION_IS_EXPENSIVE. Assume that
28393 predicated operations when available are beneficial. */
28396 aarch64_conditional_operation_is_expensive (unsigned)
28401 /* Implement TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE. Assume for now that
28402 it isn't worth branching around empty masked ops (including masked
28406 aarch64_empty_mask_is_expensive (unsigned)
28411 /* Return 1 if pseudo register should be created and used to hold
28412 GOT address for PIC code. */
28415 aarch64_use_pseudo_pic_reg (void)
28417 return aarch64_cmodel
== AARCH64_CMODEL_SMALL_SPIC
;
28420 /* Implement TARGET_UNSPEC_MAY_TRAP_P. */
28423 aarch64_unspec_may_trap_p (const_rtx x
, unsigned flags
)
28425 switch (XINT (x
, 1))
28427 case UNSPEC_GOTSMALLPIC
:
28428 case UNSPEC_GOTSMALLPIC28K
:
28429 case UNSPEC_GOTTINYPIC
:
28435 return default_unspec_may_trap_p (x
, flags
);
/* If X is a positive CONST_DOUBLE with a value that is a power of 2
   return the log2 of that value.  Otherwise return -1.  */

int
aarch64_fpconst_pow_of_2 (rtx x)
{
  const REAL_VALUE_TYPE *r;

  if (!CONST_DOUBLE_P (x))
    return -1;

  r = CONST_DOUBLE_REAL_VALUE (x);

  if (REAL_VALUE_NEGATIVE (*r)
      || REAL_VALUE_ISNAN (*r)
      || REAL_VALUE_ISINF (*r)
      || !real_isinteger (r, DFmode))
    return -1;

  return exact_log2 (real_to_integer (r));
}
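/* For example, aarch64_fpconst_pow_of_2 returns 3 for a CONST_DOUBLE of
   8.0 (exact_log2 (8) == 3) and -1 for 0.75, since 0.75 is not an integer
   value.  */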
/* If X is a positive CONST_DOUBLE with a value that is the reciprocal of a
   power of 2 (i.e. 1/2^n), return the number of float bits, e.g. for
   x == 1/2^n return n.  Otherwise return -1.  */
28466 aarch64_fpconst_pow2_recip (rtx x
)
28468 REAL_VALUE_TYPE r0
;
28470 if (!CONST_DOUBLE_P (x
))
28473 r0
= *CONST_DOUBLE_REAL_VALUE (x
);
28474 if (exact_real_inverse (DFmode
, &r0
)
28475 && !REAL_VALUE_NEGATIVE (r0
))
28477 int ret
= exact_log2 (real_to_integer (&r0
));
28478 if (ret
>= 1 && ret
<= 32)
28484 /* If X is a vector of equal CONST_DOUBLE values and that value is
28485 Y, return the aarch64_fpconst_pow_of_2 of Y. Otherwise return -1. */
28488 aarch64_vec_fpconst_pow_of_2 (rtx x
)
28491 if (!CONST_VECTOR_P (x
)
28492 || !CONST_VECTOR_NUNITS (x
).is_constant (&nelts
))
28495 if (GET_MODE_CLASS (GET_MODE (x
)) != MODE_VECTOR_FLOAT
)
28498 int firstval
= aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x
, 0));
28502 for (int i
= 1; i
< nelts
; i
++)
28503 if (aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x
, i
)) != firstval
)
28509 /* Implement TARGET_PROMOTED_TYPE to promote 16-bit floating point types
28512 __fp16 always promotes through this hook.
28513 _Float16 may promote if TARGET_FLT_EVAL_METHOD is 16, but we do that
28514 through the generic excess precision logic rather than here. */
28517 aarch64_promoted_type (const_tree t
)
28519 if (SCALAR_FLOAT_TYPE_P (t
)
28520 && TYPE_MAIN_VARIANT (t
) == aarch64_fp16_type_node
)
28521 return float_type_node
;
28526 /* Implement the TARGET_OPTAB_SUPPORTED_P hook. */
28529 aarch64_optab_supported_p (int op
, machine_mode mode1
, machine_mode
,
28530 optimization_type opt_type
)
28535 return opt_type
== OPTIMIZE_FOR_SPEED
&& use_rsqrt_p (mode1
);
28542 /* Implement the TARGET_DWARF_POLY_INDETERMINATE_VALUE hook. */
28544 static unsigned int
28545 aarch64_dwarf_poly_indeterminate_value (unsigned int i
, unsigned int *factor
,
28548 /* Polynomial invariant 1 == (VG / 2) - 1. */
28549 gcc_assert (i
== 1);
28552 return AARCH64_DWARF_VG
;
28555 /* Implement TARGET_LIBGCC_FLOATING_POINT_MODE_SUPPORTED_P - return TRUE
28556 if MODE is [BH]Fmode, and punt to the generic implementation otherwise. */
28559 aarch64_libgcc_floating_mode_supported_p (scalar_float_mode mode
)
28561 return ((mode
== HFmode
|| mode
== BFmode
)
28563 : default_libgcc_floating_mode_supported_p (mode
));
28566 /* Implement TARGET_SCALAR_MODE_SUPPORTED_P - return TRUE
28567 if MODE is [BH]Fmode, and punt to the generic implementation otherwise. */
28570 aarch64_scalar_mode_supported_p (scalar_mode mode
)
28572 if (DECIMAL_FLOAT_MODE_P (mode
))
28573 return default_decimal_float_supported_p ();
28575 return ((mode
== HFmode
|| mode
== BFmode
)
28577 : default_scalar_mode_supported_p (mode
));
28580 /* Set the value of FLT_EVAL_METHOD.
28581 ISO/IEC TS 18661-3 defines two values that we'd like to make use of:
28583 0: evaluate all operations and constants, whose semantic type has at
28584 most the range and precision of type float, to the range and
28585 precision of float; evaluate all other operations and constants to
28586 the range and precision of the semantic type;
28588 N, where _FloatN is a supported interchange floating type
28589 evaluate all operations and constants, whose semantic type has at
28590 most the range and precision of _FloatN type, to the range and
28591 precision of the _FloatN type; evaluate all other operations and
28592 constants to the range and precision of the semantic type;
   If we have the ARMv8.2-A extensions then we support _Float16 in native
   precision, so we should set this to 16.  Otherwise, we support the type,
   but want to evaluate expressions in float precision, so set this to 0.  */
28599 static enum flt_eval_method
28600 aarch64_excess_precision (enum excess_precision_type type
)
28604 case EXCESS_PRECISION_TYPE_FAST
:
28605 case EXCESS_PRECISION_TYPE_STANDARD
:
28606 /* We can calculate either in 16-bit range and precision or
28607 32-bit range and precision. Make that decision based on whether
28608 we have native support for the ARMv8.2-A 16-bit floating-point
28609 instructions or not. */
28610 return (TARGET_FP_F16INST
28611 ? FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16
28612 : FLT_EVAL_METHOD_PROMOTE_TO_FLOAT
);
28613 case EXCESS_PRECISION_TYPE_IMPLICIT
:
28614 case EXCESS_PRECISION_TYPE_FLOAT16
:
28615 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16
;
28617 gcc_unreachable ();
28619 return FLT_EVAL_METHOD_UNPREDICTABLE
;
28622 /* Implement TARGET_C_BITINT_TYPE_INFO.
28623 Return true if _BitInt(N) is supported and fill its details into *INFO. */
28625 aarch64_bitint_type_info (int n
, struct bitint_info
*info
)
28627 if (TARGET_BIG_END
)
28631 info
->limb_mode
= QImode
;
28633 info
->limb_mode
= HImode
;
28635 info
->limb_mode
= SImode
;
28637 info
->limb_mode
= DImode
;
28639 info
->limb_mode
= TImode
;
28641 /* The AAPCS for AArch64 defines _BitInt(N > 128) as an array with
28642 type {signed,unsigned} __int128[M] where M*128 >= N. However, to be
28643 able to use libgcc's implementation to support large _BitInt's we need
28644 to use a LIMB_MODE that is no larger than 'long long'. This is why we
28645 use DImode for our internal LIMB_MODE and we define the ABI_LIMB_MODE to
28646 be TImode to ensure we are ABI compliant. */
28647 info
->limb_mode
= DImode
;
28650 info
->abi_limb_mode
= TImode
;
28652 info
->abi_limb_mode
= info
->limb_mode
;
28653 info
->big_endian
= TARGET_BIG_END
;
28654 info
->extended
= false;
28658 /* Implement TARGET_C_MODE_FOR_FLOATING_TYPE. Return TFmode for
28659 TI_LONG_DOUBLE_TYPE which is for long double type, go with the default
28660 one for the others. */
28662 static machine_mode
28663 aarch64_c_mode_for_floating_type (enum tree_index ti
)
28665 if (ti
== TI_LONG_DOUBLE_TYPE
)
28667 return default_mode_for_floating_type (ti
);
28670 /* Implement TARGET_SCHED_CAN_SPECULATE_INSN. Return true if INSN can be
28671 scheduled for speculative execution. Reject the long-running division
28672 and square-root instructions. */
28675 aarch64_sched_can_speculate_insn (rtx_insn
*insn
)
28677 switch (get_attr_type (insn
))
28685 case TYPE_NEON_FP_SQRT_S
:
28686 case TYPE_NEON_FP_SQRT_D
:
28687 case TYPE_NEON_FP_SQRT_S_Q
:
28688 case TYPE_NEON_FP_SQRT_D_Q
:
28689 case TYPE_NEON_FP_DIV_S
:
28690 case TYPE_NEON_FP_DIV_D
:
28691 case TYPE_NEON_FP_DIV_S_Q
:
28692 case TYPE_NEON_FP_DIV_D_Q
:
28699 /* Implement TARGET_COMPUTE_PRESSURE_CLASSES. */
28702 aarch64_compute_pressure_classes (reg_class
*classes
)
28705 classes
[i
++] = GENERAL_REGS
;
28706 classes
[i
++] = FP_REGS
;
28707 /* PR_REGS isn't a useful pressure class because many predicate pseudo
28708 registers need to go in PR_LO_REGS at some point during their
28709 lifetime. Splitting it into two halves has the effect of making
28710 all predicates count against PR_LO_REGS, so that we try whenever
28711 possible to restrict the number of live predicates to 8. This
28712 greatly reduces the amount of spilling in certain loops. */
28713 classes
[i
++] = PR_LO_REGS
;
28714 classes
[i
++] = PR_HI_REGS
;
/* Implement TARGET_CAN_CHANGE_MODE_CLASS.  */

static bool
aarch64_can_change_mode_class (machine_mode from,
			       machine_mode to, reg_class_t)
{
  return aarch64_modes_compatible_p (from, to);
}
28727 /* Implement TARGET_EARLY_REMAT_MODES. */
28730 aarch64_select_early_remat_modes (sbitmap modes
)
28732 /* SVE values are not normally live across a call, so it should be
28733 worth doing early rematerialization even in VL-specific mode. */
28734 for (int i
= 0; i
< NUM_MACHINE_MODES
; ++i
)
28735 if (aarch64_sve_mode_p ((machine_mode
) i
))
28736 bitmap_set_bit (modes
, i
);
28739 /* Override the default target speculation_safe_value. */
28741 aarch64_speculation_safe_value (machine_mode mode
,
28742 rtx result
, rtx val
, rtx failval
)
28744 /* Maybe we should warn if falling back to hard barriers. They are
28745 likely to be noticably more expensive than the alternative below. */
28746 if (!aarch64_track_speculation
)
28747 return default_speculation_safe_value (mode
, result
, val
, failval
);
28750 val
= copy_to_mode_reg (mode
, val
);
28752 if (!aarch64_reg_or_zero (failval
, mode
))
28753 failval
= copy_to_mode_reg (mode
, failval
);
28755 emit_insn (gen_despeculate_copy (mode
, result
, val
, failval
));
/* Implement TARGET_ESTIMATED_POLY_VALUE.
   Look into the tuning structure for an estimate.
   KIND specifies the type of requested estimate: min, max or likely.
   For cores with a known SVE width all three estimates are the same.
   For generic SVE tuning we want to distinguish the maximum estimate from
   the minimum and likely ones.
   The likely estimate is the same as the minimum in that case to give a
   conservative behavior of auto-vectorizing with SVE when it is a win
   even for 128-bit SVE.
   When SVE width information is available VAL.coeffs[1] is multiplied by
   the number of VQ chunks over the initial Advanced SIMD 128 bits.  */
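/* For example, a poly_int64 of 4 + 4x (such as the element count of a
   VNx4SI vector) is estimated as 4 + 4 * (256 - 128) / 128 == 8 when the
   tuned sve_width is 256 bits, while with SVE_SCALABLE tuning the
   minimum/likely estimate is 4 and the maximum is 4 + 4 * 15 == 64.  */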
28771 static HOST_WIDE_INT
28772 aarch64_estimated_poly_value (poly_int64 val
,
28773 poly_value_estimate_kind kind
28774 = POLY_VALUE_LIKELY
)
28776 unsigned int width_source
= aarch64_tune_params
.sve_width
;
28778 /* If there is no core-specific information then the minimum and likely
28779 values are based on 128-bit vectors and the maximum is based on
28780 the architectural maximum of 2048 bits. */
28781 if (width_source
== SVE_SCALABLE
)
28784 case POLY_VALUE_MIN
:
28785 case POLY_VALUE_LIKELY
:
28786 return val
.coeffs
[0];
28787 case POLY_VALUE_MAX
:
28788 return val
.coeffs
[0] + val
.coeffs
[1] * 15;
28791 /* Allow sve_width to be a bitmask of different VL, treating the lowest
28792 as likely. This could be made more general if future -mtune options
28794 if (kind
== POLY_VALUE_MAX
)
28795 width_source
= 1 << floor_log2 (width_source
);
28797 width_source
= least_bit_hwi (width_source
);
28799 /* If the core provides width information, use that. */
28800 HOST_WIDE_INT over_128
= width_source
- 128;
28801 return val
.coeffs
[0] + val
.coeffs
[1] * over_128
/ 128;
28805 /* Return true for types that could be supported as SIMD return or
28809 supported_simd_type (tree t
)
28811 if (SCALAR_FLOAT_TYPE_P (t
) || INTEGRAL_TYPE_P (t
) || POINTER_TYPE_P (t
))
28813 HOST_WIDE_INT s
= tree_to_shwi (TYPE_SIZE_UNIT (t
));
28814 return s
== 1 || s
== 2 || s
== 4 || s
== 8;
28819 /* Determine the lane size for the clone argument/return type. This follows
28820 the LS(P) rule in the VFABIA64. */
28823 lane_size (cgraph_simd_clone_arg_type clone_arg_type
, tree type
)
28825 gcc_assert (clone_arg_type
!= SIMD_CLONE_ARG_TYPE_MASK
);
28827 /* For non map-to-vector types that are pointers we use the element type it
28829 if (POINTER_TYPE_P (type
))
28830 switch (clone_arg_type
)
28834 case SIMD_CLONE_ARG_TYPE_UNIFORM
:
28835 case SIMD_CLONE_ARG_TYPE_LINEAR_CONSTANT_STEP
:
28836 case SIMD_CLONE_ARG_TYPE_LINEAR_VARIABLE_STEP
:
28837 type
= TREE_TYPE (type
);
28841 /* For types (or pointers of non map-to-vector types point to) that are
28842 integers or floating point, we use their size if they are 1, 2, 4 or 8.
28844 if (INTEGRAL_TYPE_P (type
)
28845 || SCALAR_FLOAT_TYPE_P (type
))
28846 switch (TYPE_PRECISION (type
) / BITS_PER_UNIT
)
28854 return TYPE_PRECISION (type
);
28856 /* For any other we use the size of uintptr_t. For map-to-vector types that
28857 are pointers, using the size of uintptr_t is the same as using the size of
28858 their type, seeing all pointers are the same size as uintptr_t. */
28859 return POINTER_SIZE
;
28863 /* Implement TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN. */
28866 aarch64_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node
*node
,
28867 struct cgraph_simd_clone
*clonei
,
28868 tree base_type ATTRIBUTE_UNUSED
,
28869 int num
, bool explicit_p
)
28872 unsigned int nds_elt_bits
;
28873 unsigned HOST_WIDE_INT const_simdlen
;
28878 /* For now, SVE simdclones won't produce illegal simdlen, So only check
28879 const simdlens here. */
28880 if (maybe_ne (clonei
->simdlen
, 0U)
28881 && clonei
->simdlen
.is_constant (&const_simdlen
)
28882 && (const_simdlen
< 2
28883 || const_simdlen
> 1024
28884 || (const_simdlen
& (const_simdlen
- 1)) != 0))
28887 warning_at (DECL_SOURCE_LOCATION (node
->decl
), 0,
28888 "unsupported simdlen %wd", const_simdlen
);
28892 ret_type
= TREE_TYPE (TREE_TYPE (node
->decl
));
28893 /* According to AArch64's Vector ABI the type that determines the simdlen is
28894 the narrowest of types, so we ignore base_type for AArch64. */
28895 if (TREE_CODE (ret_type
) != VOID_TYPE
28896 && !supported_simd_type (ret_type
))
28900 else if (COMPLEX_FLOAT_TYPE_P (ret_type
))
28901 warning_at (DECL_SOURCE_LOCATION (node
->decl
), 0,
28902 "GCC does not currently support return type %qT "
28903 "for simd", ret_type
);
28905 warning_at (DECL_SOURCE_LOCATION (node
->decl
), 0,
28906 "unsupported return type %qT for simd",
28911 auto_vec
<std::pair
<tree
, unsigned int>> vec_elts (clonei
->nargs
+ 1);
28913 /* We are looking for the NDS type here according to the VFABIA64. */
28914 if (TREE_CODE (ret_type
) != VOID_TYPE
)
28916 nds_elt_bits
= lane_size (SIMD_CLONE_ARG_TYPE_VECTOR
, ret_type
);
28917 vec_elts
.safe_push (std::make_pair (ret_type
, nds_elt_bits
));
28920 nds_elt_bits
= POINTER_SIZE
;
28923 tree type_arg_types
= TYPE_ARG_TYPES (TREE_TYPE (node
->decl
));
28924 bool decl_arg_p
= (node
->definition
|| type_arg_types
== NULL_TREE
);
28925 for (t
= (decl_arg_p
? DECL_ARGUMENTS (node
->decl
) : type_arg_types
), i
= 0;
28926 t
&& t
!= void_list_node
; t
= TREE_CHAIN (t
), i
++)
28928 tree arg_type
= decl_arg_p
? TREE_TYPE (t
) : TREE_VALUE (t
);
28929 if (clonei
->args
[i
].arg_type
!= SIMD_CLONE_ARG_TYPE_UNIFORM
28930 && !supported_simd_type (arg_type
))
28934 else if (COMPLEX_FLOAT_TYPE_P (ret_type
))
28935 warning_at (DECL_SOURCE_LOCATION (node
->decl
), 0,
28936 "GCC does not currently support argument type %qT "
28937 "for simd", arg_type
);
28939 warning_at (DECL_SOURCE_LOCATION (node
->decl
), 0,
28940 "unsupported argument type %qT for simd",
28944 unsigned lane_bits
= lane_size (clonei
->args
[i
].arg_type
, arg_type
);
28945 if (clonei
->args
[i
].arg_type
== SIMD_CLONE_ARG_TYPE_VECTOR
)
28946 vec_elts
.safe_push (std::make_pair (arg_type
, lane_bits
));
28947 if (nds_elt_bits
> lane_bits
)
28948 nds_elt_bits
= lane_bits
;
28951 clonei
->vecsize_mangle
= 'n';
28952 clonei
->mask_mode
= VOIDmode
;
28953 poly_uint64 simdlen
;
28954 auto_vec
<poly_uint64
> simdlens (2);
28955 /* Keep track of the possible simdlens the clones of this function can have,
28956 and check them later to see if we support them. */
28957 if (known_eq (clonei
->simdlen
, 0U))
28959 simdlen
= exact_div (poly_uint64 (64), nds_elt_bits
);
28960 if (maybe_ne (simdlen
, 1U))
28961 simdlens
.safe_push (simdlen
);
28962 simdlens
.safe_push (simdlen
* 2);
28965 simdlens
.safe_push (clonei
->simdlen
);
28967 clonei
->vecsize_int
= 0;
28968 clonei
->vecsize_float
= 0;
28970 /* We currently do not support generating simdclones where vector arguments
28971 do not fit into a single vector register, i.e. vector types that are more
28972 than 128-bits large. This is because of how we currently represent such
28973 types in ACLE, where we use a struct to allow us to pass them as arguments
28975 Hence why we have to check whether the simdlens available for this
28976 simdclone would cause a vector type to be larger than 128-bits, and reject
28979 while (j
< simdlens
.length ())
28981 bool remove_simdlen
= false;
28982 for (auto elt
: vec_elts
)
28983 if (known_gt (simdlens
[j
] * elt
.second
, 128U))
28985 /* Don't issue a warning for every simdclone when there is no
28986 specific simdlen clause. */
28987 if (explicit_p
&& maybe_ne (clonei
->simdlen
, 0U))
28988 warning_at (DECL_SOURCE_LOCATION (node
->decl
), 0,
28989 "GCC does not currently support simdlen %wd for "
28991 constant_lower_bound (simdlens
[j
]), elt
.first
);
28992 remove_simdlen
= true;
28995 if (remove_simdlen
)
28996 simdlens
.ordered_remove (j
);
29002 int count
= simdlens
.length ();
29005 if (explicit_p
&& known_eq (clonei
->simdlen
, 0U))
29007 /* Warn the user if we can't generate any simdclone. */
29008 simdlen
= exact_div (poly_uint64 (64), nds_elt_bits
);
29009 warning_at (DECL_SOURCE_LOCATION (node
->decl
), 0,
29010 "GCC does not currently support a simdclone with simdlens"
29011 " %wd and %wd for these types.",
29012 constant_lower_bound (simdlen
),
29013 constant_lower_bound (simdlen
*2));
29018 gcc_assert (num
< count
);
29019 clonei
->simdlen
= simdlens
[num
];
29023 /* Implement TARGET_SIMD_CLONE_ADJUST. */
29026 aarch64_simd_clone_adjust (struct cgraph_node
*node
)
29028 /* Add aarch64_vector_pcs target attribute to SIMD clones so they
29029 use the correct ABI. */
29031 tree t
= TREE_TYPE (node
->decl
);
29032 TYPE_ATTRIBUTES (t
) = make_attribute ("aarch64_vector_pcs", "default",
29033 TYPE_ATTRIBUTES (t
));
29036 /* Implement TARGET_SIMD_CLONE_USABLE. */
29039 aarch64_simd_clone_usable (struct cgraph_node
*node
)
29041 switch (node
->simdclone
->vecsize_mangle
)
29048 gcc_unreachable ();
29052 /* Implement TARGET_COMP_TYPE_ATTRIBUTES */
29055 aarch64_comp_type_attributes (const_tree type1
, const_tree type2
)
29057 auto check_attr
= [&](const char *ns
, const char *name
) {
29058 tree attr1
= lookup_attribute (ns
, name
, TYPE_ATTRIBUTES (type1
));
29059 tree attr2
= lookup_attribute (ns
, name
, TYPE_ATTRIBUTES (type2
));
29060 if (!attr1
&& !attr2
)
29063 return attr1
&& attr2
&& attribute_value_equal (attr1
, attr2
);
29066 if (!check_attr ("gnu", "aarch64_vector_pcs"))
29068 if (!check_attr ("gnu", "Advanced SIMD type"))
29070 if (!check_attr ("gnu", "SVE type"))
29072 if (!check_attr ("gnu", "SVE sizeless type"))
29074 if (!check_attr ("arm", "streaming"))
29076 if (!check_attr ("arm", "streaming_compatible"))
29078 if (aarch64_lookup_shared_state_flags (TYPE_ATTRIBUTES (type1
), "za")
29079 != aarch64_lookup_shared_state_flags (TYPE_ATTRIBUTES (type2
), "za"))
29081 if (aarch64_lookup_shared_state_flags (TYPE_ATTRIBUTES (type1
), "zt0")
29082 != aarch64_lookup_shared_state_flags (TYPE_ATTRIBUTES (type2
), "zt0"))
29087 /* Implement TARGET_MERGE_DECL_ATTRIBUTES. */
29090 aarch64_merge_decl_attributes (tree olddecl
, tree newdecl
)
29092 tree old_attrs
= DECL_ATTRIBUTES (olddecl
);
29093 tree old_new
= lookup_attribute ("arm", "new", old_attrs
);
29095 tree new_attrs
= DECL_ATTRIBUTES (newdecl
);
29096 tree new_new
= lookup_attribute ("arm", "new", new_attrs
);
29098 if (DECL_INITIAL (olddecl
) && new_new
)
29100 error ("cannot apply attribute %qs to %q+D after the function"
29101 " has been defined", "new", newdecl
);
29102 inform (DECL_SOURCE_LOCATION (olddecl
), "%q+D defined here",
29107 if (old_new
&& new_new
)
29109 old_attrs
= remove_attribute ("arm", "new", old_attrs
);
29110 TREE_VALUE (new_new
) = chainon (TREE_VALUE (new_new
),
29111 TREE_VALUE (old_new
));
29114 aarch64_check_arm_new_against_type (TREE_VALUE (new_new
), newdecl
);
29117 return merge_attributes (old_attrs
, new_attrs
);
/* Implement TARGET_GET_MULTILIB_ABI_NAME */

static const char *
aarch64_get_multilib_abi_name (void)
{
  if (TARGET_BIG_END)
    return TARGET_ILP32 ? "aarch64_be_ilp32" : "aarch64_be";
  return TARGET_ILP32 ? "aarch64_ilp32" : "aarch64";
}
29130 /* Implement TARGET_STACK_PROTECT_GUARD. In case of a
29131 global variable based guard use the default else
29132 return a null tree. */
29134 aarch64_stack_protect_guard (void)
29136 if (aarch64_stack_protector_guard
== SSP_GLOBAL
)
29137 return default_stack_protect_guard ();
/* Implement TARGET_INVALID_UNARY_OP.  */

static const char *
aarch64_invalid_unary_op (int op, const_tree type)
{
  /* Reject all single-operand operations on __mfp8 except for &.  */
  if (TYPE_MAIN_VARIANT (type) == aarch64_mfp8_type_node && op != ADDR_EXPR)
    return N_ ("operation not permitted on type %<mfloat8_t%>");

  /* Operation allowed.  */
  return NULL;
}
29155 /* Implement TARGET_INVALID_BINARY_OP. */
29157 static const char *
29158 aarch64_invalid_binary_op (int op ATTRIBUTE_UNUSED
, const_tree type1
,
29161 if (VECTOR_TYPE_P (type1
)
29162 && VECTOR_TYPE_P (type2
)
29163 && !TYPE_INDIVISIBLE_P (type1
)
29164 && !TYPE_INDIVISIBLE_P (type2
)
29165 && (aarch64_sve::builtin_type_p (type1
)
29166 != aarch64_sve::builtin_type_p (type2
)))
29167 return N_("cannot combine GNU and SVE vectors in a binary operation");
29169 /* Reject all 2-operand operations on __mfp8. */
29170 if (TYPE_MAIN_VARIANT (type1
) == aarch64_mfp8_type_node
29171 || TYPE_MAIN_VARIANT (type2
) == aarch64_mfp8_type_node
)
29172 return N_ ("operation not permitted on type %<mfloat8_t%>");
29174 /* Operation allowed. */
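/* Hedged user-level illustration (not compiled here) of the binary-op check
   above: mixing an SVE ACLE vector with a GNU/Advanced SIMD vector, for
   example adding an svint32_t to an int32x4_t, is diagnosed with "cannot
   combine GNU and SVE vectors in a binary operation" rather than being
   treated as an ordinary vector operation.  */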
/* Implement TARGET_MEMTAG_CAN_TAG_ADDRESSES.  Here we tell the rest of the
   compiler that we automatically ignore the top byte of our pointers, which
   allows using -fsanitize=hwaddress.  */
static bool
aarch64_can_tag_addresses ()
{
  return !TARGET_ILP32;
}

/* Implement TARGET_ASM_FILE_END for AArch64.  This adds the AArch64 GNU NOTE
   section at the end if needed.  */
#define GNU_PROPERTY_AARCH64_FEATURE_1_AND	0xc0000000
#define GNU_PROPERTY_AARCH64_FEATURE_1_BTI	(1U << 0)
#define GNU_PROPERTY_AARCH64_FEATURE_1_PAC	(1U << 1)
void
aarch64_file_end_indicate_exec_stack ()
{
  file_end_indicate_exec_stack ();

  unsigned feature_1_and = 0;
  if (aarch_bti_enabled ())
    feature_1_and |= GNU_PROPERTY_AARCH64_FEATURE_1_BTI;

  if (aarch_ra_sign_scope != AARCH_FUNCTION_NONE)
    feature_1_and |= GNU_PROPERTY_AARCH64_FEATURE_1_PAC;

  if (feature_1_and)
    {
      /* Generate .note.gnu.property section.  */
      switch_to_section (get_section (".note.gnu.property",
				      SECTION_NOTYPE, NULL));

      /* PT_NOTE header: namesz, descsz, type.
	 namesz = 4 ("GNU\0")
	 descsz = 16 (Size of the program property array)
		  [(12 + padding) * Number of array elements]
	 type   = 5 (NT_GNU_PROPERTY_TYPE_0).  */
      assemble_align (POINTER_SIZE);
      assemble_integer (GEN_INT (4), 4, 32, 1);
      assemble_integer (GEN_INT (ROUND_UP (12, POINTER_BYTES)), 4, 32, 1);
      assemble_integer (GEN_INT (5), 4, 32, 1);

      /* PT_NOTE name.  */
      assemble_string ("GNU", 4);

      /* PT_NOTE contents for NT_GNU_PROPERTY_TYPE_0:
	 type   = GNU_PROPERTY_AARCH64_FEATURE_1_AND
	 datasz = 4
	 data   = feature_1_and.  */
      assemble_integer (GEN_INT (GNU_PROPERTY_AARCH64_FEATURE_1_AND), 4, 32, 1);
      assemble_integer (GEN_INT (4), 4, 32, 1);
      assemble_integer (GEN_INT (feature_1_and), 4, 32, 1);

      /* Pad the size of the note to the required alignment.  */
      assemble_align (POINTER_SIZE);
    }
}
#undef GNU_PROPERTY_AARCH64_FEATURE_1_PAC
#undef GNU_PROPERTY_AARCH64_FEATURE_1_BTI
#undef GNU_PROPERTY_AARCH64_FEATURE_1_AND
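/* An illustrative sketch (not used by the port) of the note emitted above,
   assuming LP64 so that POINTER_BYTES == 8 and the 12-byte property is
   padded to 16 bytes.  The fields mirror the assemble_integer and
   assemble_string calls in aarch64_file_end_indicate_exec_stack.  */
struct aarch64_gnu_property_note_sketch
{
  unsigned int namesz;		/* 4: sizeof "GNU".  */
  unsigned int descsz;		/* 16: ROUND_UP (12, POINTER_BYTES).  */
  unsigned int type;		/* 5: NT_GNU_PROPERTY_TYPE_0.  */
  char name[4];			/* "GNU\0".  */
  unsigned int pr_type;		/* GNU_PROPERTY_AARCH64_FEATURE_1_AND.  */
  unsigned int pr_datasz;	/* 4.  */
  unsigned int pr_data;		/* feature_1_and (BTI and/or PAC bits).  */
  unsigned int pr_pad;		/* Padding up to 8-byte alignment.  */
};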
/* Helper function for straight line speculation.
   Return what barrier should be emitted for straight line speculation
   mitigation.
   When not mitigating against straight line speculation this function returns
   an empty string.
   When mitigating against straight line speculation, use:
   * SB when the v8.5-A SB extension is enabled.
   * DSB+ISB otherwise.  */
const char *
aarch64_sls_barrier (int mitigation_required)
{
  return mitigation_required
    ? (TARGET_SB ? "sb" : "dsb\tsy\n\tisb")
    : "";
}

static GTY (()) tree aarch64_sls_shared_thunks[30];
static GTY (()) bool aarch64_sls_shared_thunks_needed = false;
const char *indirect_symbol_names[30] = {
    "__call_indirect_x0",
    "__call_indirect_x1",
    "__call_indirect_x2",
    "__call_indirect_x3",
    "__call_indirect_x4",
    "__call_indirect_x5",
    "__call_indirect_x6",
    "__call_indirect_x7",
    "__call_indirect_x8",
    "__call_indirect_x9",
    "__call_indirect_x10",
    "__call_indirect_x11",
    "__call_indirect_x12",
    "__call_indirect_x13",
    "__call_indirect_x14",
    "__call_indirect_x15",
    "", /* "__call_indirect_x16", */
    "", /* "__call_indirect_x17", */
    "__call_indirect_x18",
    "__call_indirect_x19",
    "__call_indirect_x20",
    "__call_indirect_x21",
    "__call_indirect_x22",
    "__call_indirect_x23",
    "__call_indirect_x24",
    "__call_indirect_x25",
    "__call_indirect_x26",
    "__call_indirect_x27",
    "__call_indirect_x28",
    "__call_indirect_x29",
};
/* Function to create a BLR thunk.  This thunk is used to mitigate straight
   line speculation.  Instead of a simple BLR that can be speculated past,
   we emit a BL to this thunk, and this thunk contains a BR to the relevant
   register.  These thunks have the relevant speculation barriers put after
   their indirect branch so that speculation is blocked.

   We use such a thunk so the speculation barriers are kept off the
   architecturally executed path in order to reduce the performance overhead.

   When optimizing for size we use stubs shared by the linked object.
   When optimizing for performance we emit stubs for each function in the hope
   that the branch predictor can better train on jumps specific for a given
   function.  */
rtx
aarch64_sls_create_blr_label (int regnum)
{
  gcc_assert (STUB_REGNUM_P (regnum));
  if (optimize_function_for_size_p (cfun))
    {
      /* For the thunks shared between different functions in this compilation
	 unit we use a named symbol -- this is just for users to more easily
	 understand the generated assembly.  */
      aarch64_sls_shared_thunks_needed = true;
      const char *thunk_name = indirect_symbol_names[regnum];
      if (aarch64_sls_shared_thunks[regnum] == NULL)
	{
	  /* Build a decl representing this function stub and record it for
	     later.  We build a decl here so we can use the GCC machinery for
	     handling sections automatically (through `get_named_section` and
	     `make_decl_one_only`).  That saves us a lot of trouble handling
	     the specifics of different output file formats.  */
	  tree decl = build_decl (BUILTINS_LOCATION, FUNCTION_DECL,
				  get_identifier (thunk_name),
				  build_function_type_list (void_type_node,
							    NULL_TREE));
	  DECL_RESULT (decl) = build_decl (BUILTINS_LOCATION, RESULT_DECL,
					   NULL_TREE, void_type_node);
	  TREE_PUBLIC (decl) = 1;
	  TREE_STATIC (decl) = 1;
	  DECL_IGNORED_P (decl) = 1;
	  DECL_ARTIFICIAL (decl) = 1;
	  make_decl_one_only (decl, DECL_ASSEMBLER_NAME (decl));
	  resolve_unique_section (decl, 0, false);
	  aarch64_sls_shared_thunks[regnum] = decl;
	}

      return gen_rtx_SYMBOL_REF (Pmode, thunk_name);
    }

  if (cfun->machine->call_via[regnum] == NULL)
    cfun->machine->call_via[regnum]
      = gen_rtx_LABEL_REF (Pmode, gen_label_rtx ());
  return cfun->machine->call_via[regnum];
}
/* Helper function for aarch64_sls_emit_blr_function_thunks and
   aarch64_sls_emit_shared_blr_thunks below.  */
static void
aarch64_sls_emit_function_stub (FILE *out_file, int regnum)
{
  /* Save in x16 and branch to that function so this transformation does
     not prevent jumping to `BTI c` instructions.  */
  asm_fprintf (out_file, "\tmov\tx16, x%d\n", regnum);
  asm_fprintf (out_file, "\tbr\tx16\n");
}

/* Emit all BLR stubs for this particular function.
   Here we emit all the BLR stubs needed for the current function.  Since we
   emit these stubs in a consecutive block we know there will be no speculation
   gadgets between each stub, and hence we only emit a speculation barrier at
   the end of the stub sequences.

   This is called in the TARGET_ASM_FUNCTION_EPILOGUE hook.  */
void
aarch64_sls_emit_blr_function_thunks (FILE *out_file)
{
  if (! aarch64_harden_sls_blr_p ())
    return;

  bool any_functions_emitted = false;
  /* We must save and restore the current function section since this assembly
     is emitted at the end of the function.  This means it can be emitted *just
     after* the cold section of a function.  That cold part would be emitted in
     a different section.  That switch would trigger a `.cfi_endproc` directive
     to be emitted in the original section and a `.cfi_startproc` directive to
     be emitted in the new section.  Switching to the original section without
     restoring would mean that the `.cfi_endproc` emitted as a function ends
     would happen in a different section -- leaving an unmatched
     `.cfi_startproc` in the cold text section and an unmatched `.cfi_endproc`
     in the standard text section.  */
  section *save_text_section = in_section;
  switch_to_section (function_section (current_function_decl));
  for (int regnum = 0; regnum < 30; ++regnum)
    {
      rtx specu_label = cfun->machine->call_via[regnum];
      if (specu_label == NULL)
	continue;

      targetm.asm_out.print_operand (out_file, specu_label, 0);
      asm_fprintf (out_file, ":\n");
      aarch64_sls_emit_function_stub (out_file, regnum);
      any_functions_emitted = true;
    }
  if (any_functions_emitted)
    /* Can use the SB if needs be here, since this stub will only be used
       by the current function, and hence for the current target.  */
    asm_fprintf (out_file, "\t%s\n", aarch64_sls_barrier (true));
  switch_to_section (save_text_section);
}
/* Emit shared BLR stubs for the current compilation unit.
   Over the course of compiling this unit we may have converted some BLR
   instructions to a BL to a shared stub function.  This is where we emit those
   stub functions.
   This function is for the stubs shared between different functions in this
   compilation unit.  We share when optimizing for size instead of speed.

   This function is called through the TARGET_ASM_FILE_END hook.  */
void
aarch64_sls_emit_shared_blr_thunks (FILE *out_file)
{
  if (! aarch64_sls_shared_thunks_needed)
    return;

  for (int regnum = 0; regnum < 30; ++regnum)
    {
      tree decl = aarch64_sls_shared_thunks[regnum];
      if (!decl)
	continue;

      const char *name = indirect_symbol_names[regnum];
      switch_to_section (get_named_section (decl, NULL, 0));
      ASM_OUTPUT_ALIGN (out_file, 2);
      targetm.asm_out.globalize_label (out_file, name);
      /* Only emits if the compiler is configured for an assembler that can
	 handle visibility directives.  */
      targetm.asm_out.assemble_visibility (decl, VISIBILITY_HIDDEN);
      ASM_OUTPUT_TYPE_DIRECTIVE (out_file, name, "function");
      ASM_OUTPUT_LABEL (out_file, name);
      aarch64_sls_emit_function_stub (out_file, regnum);
      /* Use the most conservative target to ensure it can always be used by any
	 function in the translation unit.  */
      asm_fprintf (out_file, "\tdsb\tsy\n\tisb\n");
      ASM_DECLARE_FUNCTION_SIZE (out_file, name, decl);
    }
}

/* Implement TARGET_ASM_FILE_END.  */
static void
aarch64_asm_file_end ()
{
  aarch64_sls_emit_shared_blr_thunks (asm_out_file);
  /* Since this function will be called for the ASM_FILE_END hook, we ensure
     that what would be called otherwise (e.g. `file_end_indicate_exec_stack`
     for FreeBSD) still gets called.  */
#ifdef TARGET_ASM_FILE_END
  TARGET_ASM_FILE_END ();
#endif
}
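/* Purely illustrative (not used by the compiler): the text that the SLS
   hardening above produces for the shared stub on x1, assuming
   -mharden-sls=blr and no SB extension, so the conservative DSB+ISB barrier
   is used.  */
static const char *const aarch64_sls_example_stub_x1 ATTRIBUTE_UNUSED =
  "__call_indirect_x1:\n"
  "\tmov\tx16, x1\n"
  "\tbr\tx16\n"
  "\tdsb\tsy\n"
  "\tisb\n";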
const char *
aarch64_indirect_call_asm (rtx addr)
{
  gcc_assert (REG_P (addr));
  if (aarch64_harden_sls_blr_p ())
    {
      rtx stub_label = aarch64_sls_create_blr_label (REGNO (addr));
      output_asm_insn ("bl\t%0", &stub_label);
    }
  else
    output_asm_insn ("blr\t%0", &addr);
  return "";
}

/* Emit the assembly instruction to load the thread pointer into DEST.
   Select between different tpidr_elN registers depending on -mtp= setting.  */

const char *
aarch64_output_load_tp (rtx dest)
{
  const char *tpidrs[] = {"tpidr_el0", "tpidr_el1", "tpidr_el2",
			  "tpidr_el3", "tpidrro_el0"};
  static char buffer[64];
  snprintf (buffer, sizeof (buffer), "mrs\t%%0, %s",
	    tpidrs[aarch64_tpidr_register]);
  output_asm_insn (buffer, &dest);
  return "";
}
/* Set up the value of REG_ALLOC_ORDER from scratch.

   It was previously good practice to put call-clobbered registers ahead
   of call-preserved registers, but that isn't necessary these days.
   IRA's model of register save/restore costs is much more sophisticated
   than the model that a simple ordering could provide.  We leave
   HONOR_REG_ALLOC_ORDER undefined so that we can get the full benefit
   of IRA's model.

   However, it is still useful to list registers that are members of
   multiple classes after registers that are members of fewer classes.
   For example, we have:

   - FP_LO8_REGS: v0-v7
   - FP_LO_REGS: v0-v15
   - FP_REGS: v0-v31

   If, as a tie-breaker, we allocate FP_REGS in the order v0-v31,
   we run the risk of starving other (lower-priority) pseudos that
   require FP_LO8_REGS or FP_LO_REGS.  Allocating FP_LO_REGS in the
   order v0-v15 could similarly starve pseudos that require FP_LO8_REGS.
   Allocating downwards rather than upwards avoids this problem, at least
   in code that has reasonable register pressure.

   The situation for predicate registers is similar.  */

void
aarch64_adjust_reg_alloc_order ()
{
  for (int i = 0; i < FIRST_PSEUDO_REGISTER; ++i)
    if (IN_RANGE (i, V0_REGNUM, V31_REGNUM))
      reg_alloc_order[i] = V31_REGNUM - (i - V0_REGNUM);
    else if (IN_RANGE (i, P0_REGNUM, P15_REGNUM))
      reg_alloc_order[i] = P15_REGNUM - (i - P0_REGNUM);
    else
      reg_alloc_order[i] = i;
}
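/* A minimal self-check sketch of the downward ordering installed above; it is
   not wired into the build or the selftests, and the function name is
   invented for illustration.  */
static void ATTRIBUTE_UNUSED
aarch64_sketch_check_reg_alloc_order ()
{
  aarch64_adjust_reg_alloc_order ();
  /* FP registers are tried from v31 down to v0...  */
  gcc_checking_assert (reg_alloc_order[V0_REGNUM] == V31_REGNUM);
  gcc_checking_assert (reg_alloc_order[V31_REGNUM] == V0_REGNUM);
  /* ...and predicate registers from p15 down to p0.  */
  gcc_checking_assert (reg_alloc_order[P0_REGNUM] == P15_REGNUM);
  gcc_checking_assert (reg_alloc_order[P15_REGNUM] == P0_REGNUM);
}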
/* Return true if the PARALLEL PAR can be used in a VEC_SELECT expression
   of vector mode MODE to select half the elements of that vector.
   Allow any combination of indices except duplicates (or out of range of
   the mode units).  */

bool
aarch64_parallel_select_half_p (machine_mode mode, rtx par)
{
  int nunits = XVECLEN (par, 0);
  if (!known_eq (GET_MODE_NUNITS (mode), nunits * 2))
    return false;
  int mode_nunits = nunits * 2;
  /* Put all the elements of PAR into a hash_set and use its
     uniqueness guarantees to check that we don't try to insert the same
     element twice.  */
  hash_set<rtx> parset;
  for (int i = 0; i < nunits; ++i)
    {
      rtx elt = XVECEXP (par, 0, i);
      if (!CONST_INT_P (elt)
	  || !IN_RANGE (INTVAL (elt), 0, mode_nunits - 1)
	  || parset.add (elt))
	return false;
    }
  return true;
}

/* Return true if PAR1 and PAR2, two PARALLEL rtxes of CONST_INT values,
   contain any common elements.  */

bool
aarch64_pars_overlap_p (rtx par1, rtx par2)
{
  int len1 = XVECLEN (par1, 0);
  int len2 = XVECLEN (par2, 0);
  hash_set<rtx> parset;
  for (int i = 0; i < len1; ++i)
    parset.add (XVECEXP (par1, 0, i));
  for (int i = 0; i < len2; ++i)
    if (parset.contains (XVECEXP (par2, 0, i)))
      return true;
  return false;
}
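/* Illustration only (not part of the port or its selftests): for V4SI, a
   PARALLEL selecting elements 3 and 0 satisfies
   aarch64_parallel_select_half_p, whereas one that repeats element 0 does
   not.  The helper name is invented for this sketch.  */
static void ATTRIBUTE_UNUSED
aarch64_sketch_parallel_select_half ()
{
  rtx ok = gen_rtx_PARALLEL (VOIDmode,
			     gen_rtvec (2, GEN_INT (3), GEN_INT (0)));
  rtx dup = gen_rtx_PARALLEL (VOIDmode,
			      gen_rtvec (2, GEN_INT (0), GEN_INT (0)));
  gcc_checking_assert (aarch64_parallel_select_half_p (V4SImode, ok));
  gcc_checking_assert (!aarch64_parallel_select_half_p (V4SImode, dup));
}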
/* Implement OPTIMIZE_MODE_SWITCHING.  */

bool
aarch64_optimize_mode_switching (aarch64_mode_entity entity)
{
  bool have_sme_state = (aarch64_cfun_incoming_pstate_za () != 0
			 || (aarch64_cfun_has_new_state ("za")
			     && df_regs_ever_live_p (ZA_REGNUM))
			 || (aarch64_cfun_has_new_state ("zt0")
			     && df_regs_ever_live_p (ZT0_REGNUM)));

  if (have_sme_state && nonlocal_goto_handler_labels)
    {
      static bool reported;
      if (!reported)
	{
	  sorry ("non-local gotos in functions with SME state");
	  reported = true;
	}
    }

  switch (entity)
    {
    case aarch64_mode_entity::HAVE_ZA_SAVE_BUFFER:
    case aarch64_mode_entity::LOCAL_SME_STATE:
      return have_sme_state && !nonlocal_goto_handler_labels;
    }
  gcc_unreachable ();
}
29591 /* Implement TARGET_MODE_EMIT for ZA_SAVE_BUFFER. */
29594 aarch64_mode_emit_za_save_buffer (aarch64_tristate_mode mode
,
29595 aarch64_tristate_mode prev_mode
)
29597 if (mode
== aarch64_tristate_mode::YES
)
29599 gcc_assert (prev_mode
== aarch64_tristate_mode::NO
);
29600 aarch64_init_tpidr2_block ();
29603 gcc_unreachable ();
29606 /* Implement TARGET_MODE_EMIT for LOCAL_SME_STATE. */
29609 aarch64_mode_emit_local_sme_state (aarch64_local_sme_state mode
,
29610 aarch64_local_sme_state prev_mode
)
29612 /* Back-propagation should ensure that we're always starting from
29614 gcc_assert (prev_mode
!= aarch64_local_sme_state::ANY
);
29616 if (prev_mode
== aarch64_local_sme_state::INACTIVE_CALLER
)
29618 /* Commit any uncommitted lazy save. This leaves ZA either active
29619 and zero (lazy save case) or off (normal case).
29623 mrs <temp>, tpidr2_el0
29624 cbz <temp>, no_save
29625 bl __arm_tpidr2_save
29626 msr tpidr2_el0, xzr
29627 zero { za } // Only if ZA is live
29628 zero { zt0 } // Only if ZT0 is live
29630 auto tmp_reg
= gen_reg_rtx (DImode
);
29631 emit_insn (gen_aarch64_read_tpidr2 (tmp_reg
));
29632 auto label
= gen_label_rtx ();
29633 rtx branch
= aarch64_gen_compare_zero_and_branch (EQ
, tmp_reg
, label
);
29634 auto jump
= emit_jump_insn (branch
);
29635 JUMP_LABEL (jump
) = label
;
29636 emit_insn (gen_aarch64_tpidr2_save ());
29637 emit_insn (gen_aarch64_clear_tpidr2 ());
29638 if (mode
== aarch64_local_sme_state::ACTIVE_LIVE
29639 || mode
== aarch64_local_sme_state::ACTIVE_DEAD
)
29641 if (aarch64_cfun_has_state ("za"))
29642 emit_insn (gen_aarch64_initial_zero_za ());
29643 if (aarch64_cfun_has_state ("zt0"))
29644 emit_insn (gen_aarch64_sme_zero_zt0 ());
29646 emit_label (label
);
29649 if (mode
== aarch64_local_sme_state::ACTIVE_LIVE
29650 || mode
== aarch64_local_sme_state::ACTIVE_DEAD
)
29652 if (prev_mode
== aarch64_local_sme_state::INACTIVE_LOCAL
)
29654 /* Make ZA active after being inactive.
29656 First handle the case in which the lazy save we set up was
29657 committed by a callee. If the function's source-level ZA state
29658 is live then we must conditionally restore it from the lazy
29659 save buffer. Otherwise we can just force PSTATE.ZA to 1. */
29660 if (mode
== aarch64_local_sme_state::ACTIVE_LIVE
)
29661 emit_insn (gen_aarch64_restore_za (aarch64_get_tpidr2_ptr ()));
29663 emit_insn (gen_aarch64_smstart_za ());
29665 /* Now handle the case in which the lazy save was not committed.
29666 In that case, ZA still contains the current function's ZA state,
29667 and we just need to cancel the lazy save. */
29668 emit_insn (gen_aarch64_clear_tpidr2 ());
29670 /* Restore the ZT0 state, if we have some. */
29671 if (aarch64_cfun_has_state ("zt0"))
29672 aarch64_restore_zt0 (true);
29677 if (prev_mode
== aarch64_local_sme_state::SAVED_LOCAL
)
29679 /* Retrieve the current function's ZA state from the lazy save
29681 aarch64_restore_za (aarch64_get_tpidr2_ptr ());
29683 /* Restore the ZT0 state, if we have some. */
29684 if (aarch64_cfun_has_state ("zt0"))
29685 aarch64_restore_zt0 (true);
29689 if (prev_mode
== aarch64_local_sme_state::INACTIVE_CALLER
29690 || prev_mode
== aarch64_local_sme_state::OFF
)
29692 /* INACTIVE_CALLER means that we are enabling ZA for the first
29693 time in this function. The code above means that ZA is either
29694 active and zero (if we committed a lazy save) or off. Handle
29695 the latter case by forcing ZA on.
29697 OFF means that PSTATE.ZA is guaranteed to be 0. We just need
29700 Both cases leave ZA zeroed. */
29701 emit_insn (gen_aarch64_smstart_za ());
29703 /* Restore the ZT0 state, if we have some. */
29704 if (prev_mode
== aarch64_local_sme_state::OFF
29705 && aarch64_cfun_has_state ("zt0"))
29706 aarch64_restore_zt0 (true);
29710 if (prev_mode
== aarch64_local_sme_state::ACTIVE_DEAD
29711 || prev_mode
== aarch64_local_sme_state::ACTIVE_LIVE
)
29712 /* A simple change in liveness, such as in a CFG structure where
29713 ZA is only conditionally defined. No code is needed. */
29716 gcc_unreachable ();
29719 if (mode
== aarch64_local_sme_state::INACTIVE_LOCAL
)
29721 if (prev_mode
== aarch64_local_sme_state::ACTIVE_LIVE
29722 || prev_mode
== aarch64_local_sme_state::ACTIVE_DEAD
29723 || prev_mode
== aarch64_local_sme_state::INACTIVE_CALLER
)
29725 /* Save the ZT0 state, if we have some. */
29726 if (aarch64_cfun_has_state ("zt0"))
29727 aarch64_save_zt0 ();
29729 /* A transition from ACTIVE_LIVE to INACTIVE_LOCAL is the usual
29730 case of setting up a lazy save buffer before a call.
29731 A transition from INACTIVE_CALLER is similar, except that
29732 the contents of ZA are known to be zero.
29734 A transition from ACTIVE_DEAD means that ZA is live at the
29735 point of the transition, but is dead on at least one incoming
29736 edge. (That is, ZA is only conditionally initialized.)
29737 For efficiency, we want to set up a lazy save even for
29738 dead contents, since forcing ZA off would make later code
29739 restore ZA from the lazy save buffer. */
29740 emit_insn (gen_aarch64_write_tpidr2 (aarch64_get_tpidr2_ptr ()));
29744 if (prev_mode
== aarch64_local_sme_state::SAVED_LOCAL
29745 || prev_mode
== aarch64_local_sme_state::OFF
)
29746 /* We're simply discarding the information about which inactive
29750 gcc_unreachable ();
29753 if (mode
== aarch64_local_sme_state::INACTIVE_CALLER
29754 || mode
== aarch64_local_sme_state::OFF
)
29756 /* Save the ZT0 state, if we have some. */
29757 if ((prev_mode
== aarch64_local_sme_state::ACTIVE_LIVE
29758 || prev_mode
== aarch64_local_sme_state::ACTIVE_DEAD
)
29759 && mode
== aarch64_local_sme_state::OFF
29760 && aarch64_cfun_has_state ("zt0"))
29761 aarch64_save_zt0 ();
29763 /* The transition to INACTIVE_CALLER is used before returning from
29764 new("za") functions. Any state in ZA belongs to the current
29765 function rather than a caller, but that state is no longer
29766 needed. Clear any pending lazy save and turn ZA off.
29768 The transition to OFF is used before calling a private-ZA function.
29769 We committed any incoming lazy save above, so at this point any
29770 contents in ZA belong to the current function. */
29771 if (prev_mode
== aarch64_local_sme_state::INACTIVE_LOCAL
)
29772 emit_insn (gen_aarch64_clear_tpidr2 ());
29774 if (prev_mode
!= aarch64_local_sme_state::OFF
29775 && prev_mode
!= aarch64_local_sme_state::SAVED_LOCAL
)
29776 emit_insn (gen_aarch64_smstop_za ());
29781 if (mode
== aarch64_local_sme_state::SAVED_LOCAL
)
29783 /* This is a transition to an exception handler. */
29784 gcc_assert (prev_mode
== aarch64_local_sme_state::OFF
29785 || prev_mode
== aarch64_local_sme_state::INACTIVE_LOCAL
);
29789 gcc_unreachable ();
29792 /* Implement TARGET_MODE_EMIT. */
29795 aarch64_mode_emit (int entity
, int mode
, int prev_mode
, HARD_REG_SET live
)
29797 if (mode
== prev_mode
)
29801 switch (aarch64_mode_entity (entity
))
29803 case aarch64_mode_entity::HAVE_ZA_SAVE_BUFFER
:
29804 aarch64_mode_emit_za_save_buffer (aarch64_tristate_mode (mode
),
29805 aarch64_tristate_mode (prev_mode
));
29808 case aarch64_mode_entity::LOCAL_SME_STATE
:
29809 aarch64_mode_emit_local_sme_state (aarch64_local_sme_state (mode
),
29810 aarch64_local_sme_state (prev_mode
));
29813 rtx_insn
*seq
= get_insns ();
29816 /* Get the set of clobbered registers that are currently live. */
29817 HARD_REG_SET clobbers
= {};
29818 for (rtx_insn
*insn
= seq
; insn
; insn
= NEXT_INSN (insn
))
29820 if (!NONDEBUG_INSN_P (insn
))
29822 vec_rtx_properties properties
;
29823 properties
.add_insn (insn
, false);
29824 for (rtx_obj_reference ref
: properties
.refs ())
29825 if (ref
.is_write () && HARD_REGISTER_NUM_P (ref
.regno
))
29826 SET_HARD_REG_BIT (clobbers
, ref
.regno
);
  /* Emit instructions to save clobbered registers to pseudos.  Queue
     instructions to restore the registers afterwards.

     This should only be needed in rare situations.  */
29834 auto_vec
<rtx
, 33> after
;
29835 for (unsigned int regno
= R0_REGNUM
; regno
< R30_REGNUM
; ++regno
)
29836 if (TEST_HARD_REG_BIT (clobbers
, regno
))
29838 rtx hard_reg
= gen_rtx_REG (DImode
, regno
);
29839 rtx pseudo_reg
= gen_reg_rtx (DImode
);
29840 emit_move_insn (pseudo_reg
, hard_reg
);
29841 after
.quick_push (gen_move_insn (hard_reg
, pseudo_reg
));
29843 if (TEST_HARD_REG_BIT (clobbers
, CC_REGNUM
))
29845 rtx pseudo_reg
= gen_reg_rtx (DImode
);
29846 emit_insn (gen_aarch64_save_nzcv (pseudo_reg
));
29847 after
.quick_push (gen_aarch64_restore_nzcv (pseudo_reg
));
29850 /* Emit the transition instructions themselves. */
29853 /* Restore the clobbered registers. */
29854 for (auto *insn
: after
)
29858 /* Return true if INSN references the SME state represented by hard register
29862 aarch64_insn_references_sme_state_p (rtx_insn
*insn
, unsigned int regno
)
29865 FOR_EACH_INSN_DEF (ref
, insn
)
29866 if (!DF_REF_FLAGS_IS_SET (ref
, DF_REF_MUST_CLOBBER
)
29867 && DF_REF_REGNO (ref
) == regno
)
29869 FOR_EACH_INSN_USE (ref
, insn
)
29870 if (DF_REF_REGNO (ref
) == regno
)
29875 /* Implement TARGET_MODE_NEEDED for LOCAL_SME_STATE. */
29877 static aarch64_local_sme_state
29878 aarch64_mode_needed_local_sme_state (rtx_insn
*insn
, HARD_REG_SET live
)
29881 && find_reg_note (insn
, REG_EH_REGION
, NULL_RTX
))
29883 static bool reported
;
29886 sorry ("catching non-call exceptions in functions with SME state");
29889 /* Aim for graceful error recovery by picking the value that is
29890 least likely to generate an ICE. */
29891 return aarch64_local_sme_state::INACTIVE_LOCAL
;
29894 /* A non-local goto is equivalent to a return. We disallow non-local
29895 receivers in functions with SME state, so we know that the target
29896 expects ZA to be dormant or off. */
29898 && find_reg_note (insn
, REG_NON_LOCAL_GOTO
, NULL_RTX
))
29899 return aarch64_local_sme_state::INACTIVE_CALLER
;
29901 /* start_private_za_call and end_private_za_call bracket a sequence
29902 that calls a private-ZA function. Force ZA to be turned off if the
29903 function doesn't have any live ZA state, otherwise require ZA to be
29905 auto icode
= recog_memoized (insn
);
29906 if (icode
== CODE_FOR_aarch64_start_private_za_call
29907 || icode
== CODE_FOR_aarch64_end_private_za_call
)
29908 return (TEST_HARD_REG_BIT (live
, ZA_REGNUM
)
29909 ? aarch64_local_sme_state::INACTIVE_LOCAL
29910 : aarch64_local_sme_state::OFF
);
29912 /* Force ZA to contain the current function's ZA state if INSN wants
29913 to access it. Do the same for accesses to ZT0, since ZA and ZT0
29914 are both controlled by PSTATE.ZA. */
29915 if (aarch64_insn_references_sme_state_p (insn
, ZA_REGNUM
)
29916 || aarch64_insn_references_sme_state_p (insn
, ZT0_REGNUM
))
29917 return (TEST_HARD_REG_BIT (live
, ZA_REGNUM
)
29918 ? aarch64_local_sme_state::ACTIVE_LIVE
29919 : aarch64_local_sme_state::ACTIVE_DEAD
);
29921 return aarch64_local_sme_state::ANY
;
29924 /* Implement TARGET_MODE_NEEDED for ZA_SAVE_BUFFER. */
29926 static aarch64_tristate_mode
29927 aarch64_mode_needed_za_save_buffer (rtx_insn
*insn
, HARD_REG_SET live
)
29929 /* We need to set up a lazy save buffer no later than the first
29930 transition to INACTIVE_LOCAL (which involves setting up a lazy save). */
29931 if (aarch64_mode_needed_local_sme_state (insn
, live
)
29932 == aarch64_local_sme_state::INACTIVE_LOCAL
)
29933 return aarch64_tristate_mode::YES
;
29935 /* Also make sure that the lazy save buffer is set up before the first
29936 insn that throws internally. The exception handler will sometimes
29938 if (find_reg_note (insn
, REG_EH_REGION
, NULL_RTX
))
29939 return aarch64_tristate_mode::YES
;
29941 return aarch64_tristate_mode::MAYBE
;
29944 /* Implement TARGET_MODE_NEEDED. */
29947 aarch64_mode_needed (int entity
, rtx_insn
*insn
, HARD_REG_SET live
)
29949 switch (aarch64_mode_entity (entity
))
29951 case aarch64_mode_entity::HAVE_ZA_SAVE_BUFFER
:
29952 return int (aarch64_mode_needed_za_save_buffer (insn
, live
));
29954 case aarch64_mode_entity::LOCAL_SME_STATE
:
29955 return int (aarch64_mode_needed_local_sme_state (insn
, live
));
29957 gcc_unreachable ();
29960 /* Implement TARGET_MODE_AFTER for LOCAL_SME_STATE. */
29962 static aarch64_local_sme_state
29963 aarch64_mode_after_local_sme_state (aarch64_local_sme_state mode
,
29966 /* Note places where ZA dies, so that we can try to avoid saving and
29967 restoring state that isn't needed. */
29968 if (mode
== aarch64_local_sme_state::ACTIVE_LIVE
29969 && !TEST_HARD_REG_BIT (live
, ZA_REGNUM
))
29970 return aarch64_local_sme_state::ACTIVE_DEAD
;
29972 /* Note where ZA is born, e.g. when moving past an __arm_out("za")
29974 if (mode
== aarch64_local_sme_state::ACTIVE_DEAD
29975 && TEST_HARD_REG_BIT (live
, ZA_REGNUM
))
29976 return aarch64_local_sme_state::ACTIVE_LIVE
;
29981 /* Implement TARGET_MODE_AFTER. */
29984 aarch64_mode_after (int entity
, int mode
, rtx_insn
*, HARD_REG_SET live
)
29986 switch (aarch64_mode_entity (entity
))
29988 case aarch64_mode_entity::HAVE_ZA_SAVE_BUFFER
:
29991 case aarch64_mode_entity::LOCAL_SME_STATE
:
29992 return int (aarch64_mode_after_local_sme_state
29993 (aarch64_local_sme_state (mode
), live
));
29995 gcc_unreachable ();
29998 /* Implement TARGET_MODE_CONFLUENCE for LOCAL_SME_STATE. */
30000 static aarch64_local_sme_state
30001 aarch64_local_sme_confluence (aarch64_local_sme_state mode1
,
30002 aarch64_local_sme_state mode2
)
30004 /* Perform a symmetrical check for two values. */
30005 auto is_pair
= [&](aarch64_local_sme_state val1
,
30006 aarch64_local_sme_state val2
)
30008 return ((mode1
== val1
&& mode2
== val2
)
30009 || (mode1
== val2
&& mode2
== val1
));
30012 /* INACTIVE_CALLER means ZA is off or it has dormant contents belonging
30013 to a caller. OFF is one of the options. */
30014 if (is_pair (aarch64_local_sme_state::INACTIVE_CALLER
,
30015 aarch64_local_sme_state::OFF
))
30016 return aarch64_local_sme_state::INACTIVE_CALLER
;
30018 /* Similarly for dormant contents belonging to the current function. */
30019 if (is_pair (aarch64_local_sme_state::INACTIVE_LOCAL
,
30020 aarch64_local_sme_state::OFF
))
30021 return aarch64_local_sme_state::INACTIVE_LOCAL
;
30023 /* Treat a conditionally-initialized value as a fully-initialized value. */
30024 if (is_pair (aarch64_local_sme_state::ACTIVE_LIVE
,
30025 aarch64_local_sme_state::ACTIVE_DEAD
))
30026 return aarch64_local_sme_state::ACTIVE_LIVE
;
30028 return aarch64_local_sme_state::ANY
;
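/* Illustration only (not part of the selftests): the confluence above is
   symmetric and conservative.  Merging INACTIVE_LOCAL with OFF in either
   order gives INACTIVE_LOCAL, ACTIVE_LIVE merged with ACTIVE_DEAD gives
   ACTIVE_LIVE, and any other disagreement degrades to ANY.  The helper name
   below is invented for this sketch.  */
static void ATTRIBUTE_UNUSED
aarch64_sketch_local_sme_confluence ()
{
  using S = aarch64_local_sme_state;
  gcc_checking_assert (aarch64_local_sme_confluence (S::INACTIVE_LOCAL, S::OFF)
		       == S::INACTIVE_LOCAL);
  gcc_checking_assert (aarch64_local_sme_confluence (S::OFF, S::INACTIVE_LOCAL)
		       == S::INACTIVE_LOCAL);
  gcc_checking_assert (aarch64_local_sme_confluence (S::ACTIVE_LIVE,
						     S::ACTIVE_DEAD)
		       == S::ACTIVE_LIVE);
  gcc_checking_assert (aarch64_local_sme_confluence (S::ACTIVE_LIVE, S::OFF)
		       == S::ANY);
}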
30031 /* Implement TARGET_MODE_CONFLUENCE. */
30034 aarch64_mode_confluence (int entity
, int mode1
, int mode2
)
30036 gcc_assert (mode1
!= mode2
);
30037 switch (aarch64_mode_entity (entity
))
30039 case aarch64_mode_entity::HAVE_ZA_SAVE_BUFFER
:
30040 return int (aarch64_tristate_mode::MAYBE
);
30042 case aarch64_mode_entity::LOCAL_SME_STATE
:
30043 return int (aarch64_local_sme_confluence
30044 (aarch64_local_sme_state (mode1
),
30045 aarch64_local_sme_state (mode2
)));
30047 gcc_unreachable ();
/* Implement TARGET_MODE_BACKPROP for an entity that either stays
   NO throughout, or makes one transition from NO to YES.  */
30053 static aarch64_tristate_mode
30054 aarch64_one_shot_backprop (aarch64_tristate_mode mode1
,
30055 aarch64_tristate_mode mode2
)
30057 /* Keep bringing the transition forward until it starts from NO. */
30058 if (mode1
== aarch64_tristate_mode::MAYBE
30059 && mode2
== aarch64_tristate_mode::YES
)
30062 return aarch64_tristate_mode::MAYBE
;
30065 /* Implement TARGET_MODE_BACKPROP for LOCAL_SME_STATE. */
30067 static aarch64_local_sme_state
30068 aarch64_local_sme_backprop (aarch64_local_sme_state mode1
,
30069 aarch64_local_sme_state mode2
)
30071 /* We always need to know what the current state is when transitioning
30072 to a new state. Force any location with indeterminate starting state
30074 if (mode1
== aarch64_local_sme_state::ANY
)
30077 case aarch64_local_sme_state::INACTIVE_CALLER
:
30078 case aarch64_local_sme_state::OFF
:
30079 case aarch64_local_sme_state::ACTIVE_DEAD
:
30080 /* The current function's ZA state is not live. */
30081 return aarch64_local_sme_state::ACTIVE_DEAD
;
30083 case aarch64_local_sme_state::INACTIVE_LOCAL
:
30084 case aarch64_local_sme_state::ACTIVE_LIVE
:
30085 /* The current function's ZA state is live. */
30086 return aarch64_local_sme_state::ACTIVE_LIVE
;
30088 case aarch64_local_sme_state::SAVED_LOCAL
:
30089 /* This is a transition to an exception handler. Since we don't
30090 support non-call exceptions for SME functions, the source of
30091 the transition must be known. We'll assert later if that's
30093 return aarch64_local_sme_state::ANY
;
30095 case aarch64_local_sme_state::ANY
:
30096 return aarch64_local_sme_state::ANY
;
30099 return aarch64_local_sme_state::ANY
;
30102 /* Implement TARGET_MODE_BACKPROP. */
30105 aarch64_mode_backprop (int entity
, int mode1
, int mode2
)
30107 switch (aarch64_mode_entity (entity
))
30109 case aarch64_mode_entity::HAVE_ZA_SAVE_BUFFER
:
30110 return int (aarch64_one_shot_backprop (aarch64_tristate_mode (mode1
),
30111 aarch64_tristate_mode (mode2
)));
30113 case aarch64_mode_entity::LOCAL_SME_STATE
:
30114 return int (aarch64_local_sme_backprop
30115 (aarch64_local_sme_state (mode1
),
30116 aarch64_local_sme_state (mode2
)));
30118 gcc_unreachable ();
30121 /* Implement TARGET_MODE_ENTRY. */
30124 aarch64_mode_entry (int entity
)
30126 switch (aarch64_mode_entity (entity
))
30128 case aarch64_mode_entity::HAVE_ZA_SAVE_BUFFER
:
30129 return int (aarch64_tristate_mode::NO
);
30131 case aarch64_mode_entity::LOCAL_SME_STATE
:
30132 return int (aarch64_cfun_shared_flags ("za") != 0
30133 ? aarch64_local_sme_state::ACTIVE_LIVE
30134 : aarch64_cfun_incoming_pstate_za () != 0
30135 ? aarch64_local_sme_state::ACTIVE_DEAD
30136 : aarch64_local_sme_state::INACTIVE_CALLER
);
30138 gcc_unreachable ();
30141 /* Implement TARGET_MODE_EXIT. */
30144 aarch64_mode_exit (int entity
)
30146 switch (aarch64_mode_entity (entity
))
30148 case aarch64_mode_entity::HAVE_ZA_SAVE_BUFFER
:
30149 return int (aarch64_tristate_mode::MAYBE
);
30151 case aarch64_mode_entity::LOCAL_SME_STATE
:
30152 return int (aarch64_cfun_shared_flags ("za") != 0
30153 ? aarch64_local_sme_state::ACTIVE_LIVE
30154 : aarch64_cfun_incoming_pstate_za () != 0
30155 ? aarch64_local_sme_state::ACTIVE_DEAD
30156 : aarch64_local_sme_state::INACTIVE_CALLER
);
30158 gcc_unreachable ();
30161 /* Implement TARGET_MODE_EH_HANDLER. */
30164 aarch64_mode_eh_handler (int entity
)
30166 switch (aarch64_mode_entity (entity
))
30168 case aarch64_mode_entity::HAVE_ZA_SAVE_BUFFER
:
30169 /* Require a lazy save buffer to be allocated before the first
30170 insn that can throw. */
30171 return int (aarch64_tristate_mode::YES
);
30173 case aarch64_mode_entity::LOCAL_SME_STATE
:
30174 return int (aarch64_local_sme_state::SAVED_LOCAL
);
30176 gcc_unreachable ();
30179 /* Implement TARGET_MODE_PRIORITY. */
30182 aarch64_mode_priority (int, int n
)
30187 /* Implement TARGET_MD_ASM_ADJUST. */
30190 aarch64_md_asm_adjust (vec
<rtx
> &outputs
, vec
<rtx
> &inputs
,
30191 vec
<machine_mode
> &input_modes
,
30192 vec
<const char *> &constraints
,
30193 vec
<rtx
> &uses
, vec
<rtx
> &clobbers
,
30194 HARD_REG_SET
&clobbered_regs
, location_t loc
)
30196 rtx_insn
*seq
= arm_md_asm_adjust (outputs
, inputs
, input_modes
, constraints
,
30197 uses
, clobbers
, clobbered_regs
, loc
);
30199 /* "za" in the clobber list of a function with ZA state is defined to
30200 mean that the asm can read from and write to ZA. We can model the
30201 read using a USE, but unfortunately, it's not possible to model the
30202 write directly. Use a separate insn to model the effect.
30204 We must ensure that ZA is active on entry, which is enforced by using
30205 SME_STATE_REGNUM. The asm must ensure that ZA is active on return.
30207 The same thing applies to ZT0. */
30209 for (unsigned int i
= clobbers
.length (); i
-- > 0; )
30211 rtx x
= clobbers
[i
];
30213 && (REGNO (x
) == ZA_REGNUM
|| REGNO (x
) == ZT0_REGNUM
))
30215 auto id
= cfun
->machine
->next_asm_update_za_id
++;
30220 rtx id_rtx
= gen_int_mode (id
, SImode
);
30221 emit_insn (REGNO (x
) == ZA_REGNUM
30222 ? gen_aarch64_asm_update_za (id_rtx
)
30223 : gen_aarch64_asm_update_zt0 (id_rtx
));
30224 seq
= get_insns ();
30227 auto mode
= REGNO (x
) == ZA_REGNUM
? VNx16QImode
: V8DImode
;
30228 uses
.safe_push (gen_rtx_REG (mode
, REGNO (x
)));
30229 uses
.safe_push (gen_rtx_REG (DImode
, SME_STATE_REGNUM
));
30231 clobbers
.ordered_remove (i
);
30232 CLEAR_HARD_REG_BIT (clobbered_regs
, REGNO (x
));
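/* For reference, a hedged user-level example (not compiled here) of what the
   adjustment above handles: in a function declared with __arm_inout("za"),

     __asm__ volatile ("" ::: "za");

   lets the asm read and write ZA.  The "za" clobber is rewritten into USEs of
   ZA and SME_STATE_REGNUM plus a separate aarch64_asm_update_za insn, since a
   plain clobber cannot model the write.  */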
30238 /* BB is the target of an exception or nonlocal goto edge, which means
30239 that PSTATE.SM is known to be 0 on entry. Put it into the state that
30240 the current function requires. */
30243 aarch64_switch_pstate_sm_for_landing_pad (basic_block bb
)
30245 if (TARGET_NON_STREAMING
)
30249 rtx_insn
*guard_label
= nullptr;
30250 if (TARGET_STREAMING_COMPATIBLE
)
30251 guard_label
= aarch64_guard_switch_pstate_sm (IP0_REGNUM
,
30252 AARCH64_ISA_MODE_SM_OFF
);
30253 aarch64_sme_mode_switch_regs args_switch
;
30254 args_switch
.add_call_preserved_regs (df_get_live_in (bb
));
30255 args_switch
.emit_prologue ();
30256 aarch64_switch_pstate_sm (AARCH64_ISA_MODE_SM_OFF
, AARCH64_ISA_MODE_SM_ON
);
30257 args_switch
.emit_epilogue ();
30259 emit_label (guard_label
);
30260 auto seq
= get_insns ();
30263 emit_insn_after (seq
, bb_note (bb
));
30267 /* JUMP is a nonlocal goto. Its target requires PSTATE.SM to be 0 on entry,
30268 so arrange to make it so. */
30271 aarch64_switch_pstate_sm_for_jump (rtx_insn
*jump
)
30273 if (TARGET_NON_STREAMING
)
30277 rtx_insn
*guard_label
= nullptr;
30278 if (TARGET_STREAMING_COMPATIBLE
)
30279 guard_label
= aarch64_guard_switch_pstate_sm (IP0_REGNUM
,
30280 AARCH64_ISA_MODE_SM_OFF
);
30281 aarch64_switch_pstate_sm (AARCH64_ISA_MODE_SM_ON
, AARCH64_ISA_MODE_SM_OFF
);
30283 emit_label (guard_label
);
30284 auto seq
= get_insns ();
30287 emit_insn_before (seq
, jump
);
30291 /* If CALL involves a change in PSTATE.SM, emit the instructions needed
30292 to switch to the new mode and the instructions needed to restore the
30293 original mode. Return true if something changed. */
30295 aarch64_switch_pstate_sm_for_call (rtx_call_insn
*call
)
30297 /* Mode switches for sibling calls are handled via the epilogue. */
30298 if (SIBLING_CALL_P (call
))
30301 auto callee_isa_mode
= aarch64_insn_callee_isa_mode (call
);
30302 if (!aarch64_call_switches_pstate_sm (callee_isa_mode
))
30305 /* Switch mode before the call, preserving any argument registers
30306 across the switch. */
30308 rtx_insn
*args_guard_label
= nullptr;
30309 if (TARGET_STREAMING_COMPATIBLE
)
30310 args_guard_label
= aarch64_guard_switch_pstate_sm (IP0_REGNUM
,
30312 aarch64_sme_mode_switch_regs args_switch
;
30313 args_switch
.add_call_args (call
);
30314 args_switch
.emit_prologue ();
30315 aarch64_switch_pstate_sm (AARCH64_ISA_MODE
, callee_isa_mode
);
30316 args_switch
.emit_epilogue ();
30317 if (args_guard_label
)
30318 emit_label (args_guard_label
);
30319 auto args_seq
= get_insns ();
30321 emit_insn_before (args_seq
, call
);
30323 if (find_reg_note (call
, REG_NORETURN
, NULL_RTX
))
30326 /* Switch mode after the call, preserving any return registers across
30329 rtx_insn
*return_guard_label
= nullptr;
30330 if (TARGET_STREAMING_COMPATIBLE
)
30331 return_guard_label
= aarch64_guard_switch_pstate_sm (IP0_REGNUM
,
30333 aarch64_sme_mode_switch_regs return_switch
;
30334 return_switch
.add_call_result (call
);
30335 return_switch
.emit_prologue ();
30336 aarch64_switch_pstate_sm (callee_isa_mode
, AARCH64_ISA_MODE
);
30337 return_switch
.emit_epilogue ();
30338 if (return_guard_label
)
30339 emit_label (return_guard_label
);
30340 auto result_seq
= get_insns ();
30342 emit_insn_after (result_seq
, call
);
30348 const pass_data pass_data_switch_pstate_sm
=
30351 "smstarts", // name
30352 OPTGROUP_NONE
, // optinfo_flags
30354 0, // properties_required
30355 0, // properties_provided
30356 0, // properties_destroyed
30357 0, // todo_flags_start
30358 TODO_df_finish
, // todo_flags_finish
30361 class pass_switch_pstate_sm
: public rtl_opt_pass
30364 pass_switch_pstate_sm (gcc::context
*ctxt
)
30365 : rtl_opt_pass (pass_data_switch_pstate_sm
, ctxt
)
30368 // opt_pass methods:
30369 bool gate (function
*) override final
;
30370 unsigned int execute (function
*) override final
;
30374 pass_switch_pstate_sm::gate (function
*fn
)
30376 return (aarch64_fndecl_pstate_sm (fn
->decl
) != AARCH64_ISA_MODE_SM_OFF
30377 || cfun
->machine
->call_switches_pstate_sm
);
30380 /* Emit any instructions needed to switch PSTATE.SM. */
30382 pass_switch_pstate_sm::execute (function
*fn
)
30386 auto_sbitmap
blocks (last_basic_block_for_fn (cfun
));
30387 bitmap_clear (blocks
);
30388 FOR_EACH_BB_FN (bb
, fn
)
30390 if (has_abnormal_call_or_eh_pred_edge_p (bb
)
30391 && aarch64_switch_pstate_sm_for_landing_pad (bb
))
30392 bitmap_set_bit (blocks
, bb
->index
);
30394 if (cfun
->machine
->call_switches_pstate_sm
)
30397 FOR_BB_INSNS (bb
, insn
)
30398 if (auto *call
= dyn_cast
<rtx_call_insn
*> (insn
))
30399 if (aarch64_switch_pstate_sm_for_call (call
))
30400 bitmap_set_bit (blocks
, bb
->index
);
30403 auto end
= BB_END (bb
);
30405 && find_reg_note (end
, REG_NON_LOCAL_GOTO
, NULL_RTX
)
30406 && aarch64_switch_pstate_sm_for_jump (end
))
30407 bitmap_set_bit (blocks
, bb
->index
);
30409 find_many_sub_basic_blocks (blocks
);
30410 clear_aux_for_blocks ();
30417 make_pass_switch_pstate_sm (gcc::context
*ctxt
)
30419 return new pass_switch_pstate_sm (ctxt
);
30422 /* Parse an implementation-defined system register name of
30423 the form S[0-3]_[0-7]_C[0-15]_C[0-15]_[0-7].
30424 Return true if name matched against above pattern, false
30427 aarch64_is_implem_def_reg (const char *regname
)
30430 unsigned name_len
= strlen (regname
);
30431 if (name_len
< 12 || name_len
> 14)
30434 auto cterm_valid_p
= [&]()
30436 bool leading_zero_p
= false;
30440 if (regname
[pos
] != 'c')
30443 while (regname
[pos
] != '_')
30445 if (leading_zero_p
)
30447 if (i
== 0 && regname
[pos
] == '0')
30448 leading_zero_p
= true;
30451 if (!ISDIGIT (regname
[pos
]))
30453 n
[i
++] = regname
[pos
++];
30460 if (regname
[pos
] != 's')
30463 if (regname
[pos
] < '0' || regname
[pos
] > '3')
30466 if (regname
[pos
++] != '_')
30468 if (regname
[pos
] < '0' || regname
[pos
] > '7')
30471 if (regname
[pos
++] != '_')
30473 if (!cterm_valid_p ())
30475 if (regname
[pos
++] != '_')
30477 if (!cterm_valid_p ())
30479 if (regname
[pos
++] != '_')
30481 if (regname
[pos
] < '0' || regname
[pos
] > '7')
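/* Illustrative checks (not wired into the selftests) of the pattern accepted
   above; the helper name is invented for this sketch.  */
static void ATTRIBUTE_UNUSED
aarch64_sketch_implem_def_reg_examples ()
{
  /* Implementation-defined form s<op0>_<op1>_c<CRn>_c<CRm>_<op2>.  */
  gcc_checking_assert (aarch64_is_implem_def_reg ("s3_0_c15_c2_0"));
  /* op0 is restricted to 0-3.  */
  gcc_checking_assert (!aarch64_is_implem_def_reg ("s4_0_c15_c2_0"));
  /* Leading zeroes in the C<n> fields are rejected.  */
  gcc_checking_assert (!aarch64_is_implem_def_reg ("s3_0_c015_c2_0"));
}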
30486 /* Return true if REGNAME matches either a known permitted system
30487 register name, or a generic sysreg specification. For use in
30488 back-end predicate `aarch64_sysreg_string'. */
30490 aarch64_valid_sysreg_name_p (const char *regname
)
30492 const sysreg_t
*sysreg
= aarch64_lookup_sysreg_map (regname
);
30493 if (sysreg
== NULL
)
30494 return aarch64_is_implem_def_reg (regname
);
30495 if (sysreg
->arch_reqs
)
30496 return bool (aarch64_isa_flags
& sysreg
->arch_reqs
);
30500 /* Return the generic sysreg specification for a valid system register
30501 name, otherwise NULL. WRITE_P is true iff the register is being
30502 written to. IS128OP indicates the requested system register should
30503 be checked for a 128-bit implementation. */
30505 aarch64_retrieve_sysreg (const char *regname
, bool write_p
, bool is128op
)
30507 const sysreg_t
*sysreg
= aarch64_lookup_sysreg_map (regname
);
30508 if (sysreg
== NULL
)
30510 if (aarch64_is_implem_def_reg (regname
))
30515 if (is128op
&& !(sysreg
->properties
& F_REG_128
))
30517 if ((write_p
&& (sysreg
->properties
& F_REG_READ
))
30518 || (!write_p
&& (sysreg
->properties
& F_REG_WRITE
)))
30520 if ((~aarch64_isa_flags
& sysreg
->arch_reqs
) != 0)
30522 return sysreg
->encoding
;
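/* Hedged usage note (user-level, not compiled here): these routines back the
   ACLE system-register builtins, so with arm_acle.h one can write e.g.

     uint64_t v = __arm_rsr64 ("tpidr_el0");
     __arm_wsr64 ("s3_0_c15_c2_0", v);

   where the generic name in the second call is validated by
   aarch64_is_implem_def_reg.  */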
30525 /* Target-specific selftests. */
30529 namespace selftest
{
30531 /* Selftest for the RTL loader.
30532 Verify that the RTL loader copes with a dump from
30533 print_rtx_function. This is essentially just a test that class
30534 function_reader can handle a real dump, but it also verifies
30535 that lookup_reg_by_dump_name correctly handles hard regs.
30536 The presence of hard reg names in the dump means that the test is
30537 target-specific, hence it is in this file. */
30540 aarch64_test_loading_full_dump ()
30542 rtl_dump_test
t (SELFTEST_LOCATION
, locate_file ("aarch64/times-two.rtl"));
30544 ASSERT_STREQ ("times_two", IDENTIFIER_POINTER (DECL_NAME (cfun
->decl
)));
30546 rtx_insn
*insn_1
= get_insn_by_uid (1);
30547 ASSERT_EQ (NOTE
, GET_CODE (insn_1
));
30549 rtx_insn
*insn_15
= get_insn_by_uid (15);
30550 ASSERT_EQ (INSN
, GET_CODE (insn_15
));
30551 ASSERT_EQ (USE
, GET_CODE (PATTERN (insn_15
)));
30553 /* Verify crtl->return_rtx. */
30554 ASSERT_EQ (REG
, GET_CODE (crtl
->return_rtx
));
30555 ASSERT_EQ (0, REGNO (crtl
->return_rtx
));
30556 ASSERT_EQ (SImode
, GET_MODE (crtl
->return_rtx
));
30559 /* Test the fractional_cost class. */
30562 aarch64_test_fractional_cost ()
30564 using cf
= fractional_cost
;
30566 ASSERT_EQ (cf (0, 20), 0);
30568 ASSERT_EQ (cf (4, 2), 2);
30569 ASSERT_EQ (3, cf (9, 3));
30571 ASSERT_NE (cf (5, 2), 2);
30572 ASSERT_NE (3, cf (8, 3));
30574 ASSERT_EQ (cf (7, 11) + cf (15, 11), 2);
30575 ASSERT_EQ (cf (2, 3) + cf (3, 5), cf (19, 15));
30576 ASSERT_EQ (cf (2, 3) + cf (1, 6) + cf (1, 6), 1);
30578 ASSERT_EQ (cf (14, 15) - cf (4, 15), cf (2, 3));
30579 ASSERT_EQ (cf (1, 4) - cf (1, 2), 0);
30580 ASSERT_EQ (cf (3, 5) - cf (1, 10), cf (1, 2));
30581 ASSERT_EQ (cf (11, 3) - 3, cf (2, 3));
30582 ASSERT_EQ (3 - cf (7, 3), cf (2, 3));
30583 ASSERT_EQ (3 - cf (10, 3), 0);
30585 ASSERT_EQ (cf (2, 3) * 5, cf (10, 3));
30586 ASSERT_EQ (14 * cf (11, 21), cf (22, 3));
30588 ASSERT_TRUE (cf (4, 15) <= cf (5, 15));
30589 ASSERT_TRUE (cf (5, 15) <= cf (5, 15));
30590 ASSERT_FALSE (cf (6, 15) <= cf (5, 15));
30591 ASSERT_TRUE (cf (1, 3) <= cf (2, 5));
30592 ASSERT_TRUE (cf (1, 12) <= cf (1, 6));
30593 ASSERT_TRUE (cf (5, 3) <= cf (5, 3));
30594 ASSERT_TRUE (cf (239, 240) <= 1);
30595 ASSERT_TRUE (cf (240, 240) <= 1);
30596 ASSERT_FALSE (cf (241, 240) <= 1);
30597 ASSERT_FALSE (2 <= cf (207, 104));
30598 ASSERT_TRUE (2 <= cf (208, 104));
30599 ASSERT_TRUE (2 <= cf (209, 104));
30601 ASSERT_TRUE (cf (4, 15) < cf (5, 15));
30602 ASSERT_FALSE (cf (5, 15) < cf (5, 15));
30603 ASSERT_FALSE (cf (6, 15) < cf (5, 15));
30604 ASSERT_TRUE (cf (1, 3) < cf (2, 5));
30605 ASSERT_TRUE (cf (1, 12) < cf (1, 6));
30606 ASSERT_FALSE (cf (5, 3) < cf (5, 3));
30607 ASSERT_TRUE (cf (239, 240) < 1);
30608 ASSERT_FALSE (cf (240, 240) < 1);
30609 ASSERT_FALSE (cf (241, 240) < 1);
30610 ASSERT_FALSE (2 < cf (207, 104));
30611 ASSERT_FALSE (2 < cf (208, 104));
30612 ASSERT_TRUE (2 < cf (209, 104));
30614 ASSERT_FALSE (cf (4, 15) >= cf (5, 15));
30615 ASSERT_TRUE (cf (5, 15) >= cf (5, 15));
30616 ASSERT_TRUE (cf (6, 15) >= cf (5, 15));
30617 ASSERT_FALSE (cf (1, 3) >= cf (2, 5));
30618 ASSERT_FALSE (cf (1, 12) >= cf (1, 6));
30619 ASSERT_TRUE (cf (5, 3) >= cf (5, 3));
30620 ASSERT_FALSE (cf (239, 240) >= 1);
30621 ASSERT_TRUE (cf (240, 240) >= 1);
30622 ASSERT_TRUE (cf (241, 240) >= 1);
30623 ASSERT_TRUE (2 >= cf (207, 104));
30624 ASSERT_TRUE (2 >= cf (208, 104));
30625 ASSERT_FALSE (2 >= cf (209, 104));
30627 ASSERT_FALSE (cf (4, 15) > cf (5, 15));
30628 ASSERT_FALSE (cf (5, 15) > cf (5, 15));
30629 ASSERT_TRUE (cf (6, 15) > cf (5, 15));
30630 ASSERT_FALSE (cf (1, 3) > cf (2, 5));
30631 ASSERT_FALSE (cf (1, 12) > cf (1, 6));
30632 ASSERT_FALSE (cf (5, 3) > cf (5, 3));
30633 ASSERT_FALSE (cf (239, 240) > 1);
30634 ASSERT_FALSE (cf (240, 240) > 1);
30635 ASSERT_TRUE (cf (241, 240) > 1);
30636 ASSERT_TRUE (2 > cf (207, 104));
30637 ASSERT_FALSE (2 > cf (208, 104));
30638 ASSERT_FALSE (2 > cf (209, 104));
30640 ASSERT_EQ (cf (1, 2).ceil (), 1);
30641 ASSERT_EQ (cf (11, 7).ceil (), 2);
30642 ASSERT_EQ (cf (20, 1).ceil (), 20);
30643 ASSERT_EQ ((cf (0xfffffffd) + 1).ceil (), 0xfffffffe);
30644 ASSERT_EQ ((cf (0xfffffffd) + 2).ceil (), 0xffffffff);
30645 ASSERT_EQ ((cf (0xfffffffd) + 3).ceil (), 0xffffffff);
30646 ASSERT_EQ ((cf (0x7fffffff) * 2).ceil (), 0xfffffffe);
30647 ASSERT_EQ ((cf (0x80000000) * 2).ceil (), 0xffffffff);
30649 ASSERT_EQ (cf (1, 2).as_double (), 0.5);
/* Check whether our system register data, as imported from
   `aarch64-sys-reg.def', has any duplicate entries.  */
30655 aarch64_test_sysreg_encoding_clashes (void)
30657 using dup_instances_t
= hash_map
<nofree_string_hash
,
30658 std::vector
<const sysreg_t
*>>;
30660 dup_instances_t duplicate_instances
;
30662 /* Every time an encoding is established to come up more than once
30663 we add it to a "clash-analysis queue", which is then used to extract
30664 necessary information from our hash map when establishing whether
30665 repeated encodings are valid. */
30667 /* 1) Collect recurrence information. */
30668 for (unsigned i
= 0; i
< ARRAY_SIZE (aarch64_sysregs
); i
++)
30670 const sysreg_t
*reg
= aarch64_sysregs
+ i
;
30672 std::vector
<const sysreg_t
*> *tmp
30673 = &duplicate_instances
.get_or_insert (reg
->encoding
);
30675 tmp
->push_back (reg
);
30678 /* 2) Carry out analysis on collected data. */
30679 for (auto instance
: duplicate_instances
)
30681 unsigned nrep
= instance
.second
.size ();
30683 for (unsigned i
= 0; i
< nrep
; i
++)
30684 for (unsigned j
= i
+ 1; j
< nrep
; j
++)
30686 const sysreg_t
*a
= instance
.second
[i
];
30687 const sysreg_t
*b
= instance
.second
[j
];
30688 ASSERT_TRUE ((a
->properties
!= b
->properties
)
30689 || (a
->arch_reqs
!= b
->arch_reqs
));
30694 /* Run all target-specific selftests. */
30697 aarch64_run_selftests (void)
30699 aarch64_test_loading_full_dump ();
30700 aarch64_test_fractional_cost ();
30701 aarch64_test_sysreg_encoding_clashes ();
30704 } // namespace selftest
30706 #endif /* #if CHECKING_P */
30708 #undef TARGET_STACK_PROTECT_GUARD
30709 #define TARGET_STACK_PROTECT_GUARD aarch64_stack_protect_guard
30711 #undef TARGET_ADDRESS_COST
30712 #define TARGET_ADDRESS_COST aarch64_address_cost
/* This hook determines whether unnamed bitfields affect the alignment
   of the containing structure.  The hook returns true if the structure
   should inherit the alignment requirements of an unnamed bitfield's
   type.  */
30718 #undef TARGET_ALIGN_ANON_BITFIELD
30719 #define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true
30721 #undef TARGET_ASM_ALIGNED_DI_OP
30722 #define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"
30724 #undef TARGET_ASM_ALIGNED_HI_OP
30725 #define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"
30727 #undef TARGET_ASM_ALIGNED_SI_OP
30728 #define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"
30730 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
30731 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
30732 hook_bool_const_tree_hwi_hwi_const_tree_true
30734 #undef TARGET_ASM_FILE_START
30735 #define TARGET_ASM_FILE_START aarch64_start_file
30737 #undef TARGET_ASM_OUTPUT_MI_THUNK
30738 #define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk
30740 #undef TARGET_ASM_SELECT_RTX_SECTION
30741 #define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section
30743 #undef TARGET_ASM_TRAMPOLINE_TEMPLATE
30744 #define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template
30746 #undef TARGET_ASM_PRINT_PATCHABLE_FUNCTION_ENTRY
30747 #define TARGET_ASM_PRINT_PATCHABLE_FUNCTION_ENTRY aarch64_print_patchable_function_entry
30749 #undef TARGET_BUILD_BUILTIN_VA_LIST
30750 #define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list
30752 #undef TARGET_CALLEE_COPIES
30753 #define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_arg_info_false
30755 #undef TARGET_FRAME_POINTER_REQUIRED
30756 #define TARGET_FRAME_POINTER_REQUIRED aarch64_frame_pointer_required
30758 #undef TARGET_CAN_ELIMINATE
30759 #define TARGET_CAN_ELIMINATE aarch64_can_eliminate
30761 #undef TARGET_FUNCTION_ATTRIBUTE_INLINABLE_P
30762 #define TARGET_FUNCTION_ATTRIBUTE_INLINABLE_P \
30763 aarch64_function_attribute_inlinable_p
30765 #undef TARGET_NEED_IPA_FN_TARGET_INFO
30766 #define TARGET_NEED_IPA_FN_TARGET_INFO aarch64_need_ipa_fn_target_info
30768 #undef TARGET_UPDATE_IPA_FN_TARGET_INFO
30769 #define TARGET_UPDATE_IPA_FN_TARGET_INFO aarch64_update_ipa_fn_target_info
30771 #undef TARGET_CAN_INLINE_P
30772 #define TARGET_CAN_INLINE_P aarch64_can_inline_p
30774 #undef TARGET_CANNOT_FORCE_CONST_MEM
30775 #define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem
30777 #undef TARGET_CASE_VALUES_THRESHOLD
30778 #define TARGET_CASE_VALUES_THRESHOLD aarch64_case_values_threshold
30780 #undef TARGET_CONDITIONAL_REGISTER_USAGE
30781 #define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
30783 #undef TARGET_MEMBER_TYPE_FORCES_BLK
30784 #define TARGET_MEMBER_TYPE_FORCES_BLK aarch64_member_type_forces_blk
30786 /* Only the least significant bit is used for initialization guard
30788 #undef TARGET_CXX_GUARD_MASK_BIT
30789 #define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true
30791 #undef TARGET_C_MODE_FOR_SUFFIX
30792 #define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix
30794 #ifdef TARGET_BIG_ENDIAN_DEFAULT
30795 #undef TARGET_DEFAULT_TARGET_FLAGS
30796 #define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
30799 #undef TARGET_CLASS_MAX_NREGS
30800 #define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs
30802 #undef TARGET_BUILTIN_DECL
30803 #define TARGET_BUILTIN_DECL aarch64_builtin_decl
30805 #undef TARGET_BUILTIN_RECIPROCAL
30806 #define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal
30808 #undef TARGET_C_EXCESS_PRECISION
30809 #define TARGET_C_EXCESS_PRECISION aarch64_excess_precision
30811 #undef TARGET_C_BITINT_TYPE_INFO
30812 #define TARGET_C_BITINT_TYPE_INFO aarch64_bitint_type_info
30814 #undef TARGET_C_MODE_FOR_FLOATING_TYPE
30815 #define TARGET_C_MODE_FOR_FLOATING_TYPE aarch64_c_mode_for_floating_type
30817 #undef TARGET_EXPAND_BUILTIN
30818 #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
30820 #undef TARGET_EXPAND_BUILTIN_VA_START
30821 #define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start
30823 #undef TARGET_FOLD_BUILTIN
30824 #define TARGET_FOLD_BUILTIN aarch64_fold_builtin
30826 #undef TARGET_FUNCTION_ARG
30827 #define TARGET_FUNCTION_ARG aarch64_function_arg
30829 #undef TARGET_FUNCTION_ARG_ADVANCE
30830 #define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance
30832 #undef TARGET_FUNCTION_ARG_BOUNDARY
30833 #define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary
30835 #undef TARGET_FUNCTION_ARG_PADDING
30836 #define TARGET_FUNCTION_ARG_PADDING aarch64_function_arg_padding
30838 #undef TARGET_GET_RAW_RESULT_MODE
30839 #define TARGET_GET_RAW_RESULT_MODE aarch64_get_reg_raw_mode
30840 #undef TARGET_GET_RAW_ARG_MODE
30841 #define TARGET_GET_RAW_ARG_MODE aarch64_get_reg_raw_mode
30843 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
30844 #define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall
30846 #undef TARGET_FUNCTION_VALUE
30847 #define TARGET_FUNCTION_VALUE aarch64_function_value
30849 #undef TARGET_FUNCTION_VALUE_REGNO_P
30850 #define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p
30852 #undef TARGET_START_CALL_ARGS
30853 #define TARGET_START_CALL_ARGS aarch64_start_call_args
30855 #undef TARGET_END_CALL_ARGS
30856 #define TARGET_END_CALL_ARGS aarch64_end_call_args
30858 #undef TARGET_GIMPLE_FOLD_BUILTIN
30859 #define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin
30861 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
30862 #define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr
30864 #undef TARGET_INIT_BUILTINS
30865 #define TARGET_INIT_BUILTINS aarch64_init_builtins
30867 #undef TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
30868 #define TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS \
30869 aarch64_ira_change_pseudo_allocno_class
30871 #undef TARGET_LEGITIMATE_ADDRESS_P
30872 #define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p
30874 #undef TARGET_LEGITIMATE_CONSTANT_P
30875 #define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p
30877 #undef TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT
30878 #define TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT \
30879 aarch64_legitimize_address_displacement
30881 #undef TARGET_LIBGCC_CMP_RETURN_MODE
30882 #define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode
30884 #undef TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P
30885 #define TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P \
30886 aarch64_libgcc_floating_mode_supported_p
30888 #undef TARGET_MANGLE_TYPE
30889 #define TARGET_MANGLE_TYPE aarch64_mangle_type
30891 #undef TARGET_INVALID_CONVERSION
30892 #define TARGET_INVALID_CONVERSION aarch64_invalid_conversion
30894 #undef TARGET_INVALID_UNARY_OP
30895 #define TARGET_INVALID_UNARY_OP aarch64_invalid_unary_op
30897 #undef TARGET_INVALID_BINARY_OP
30898 #define TARGET_INVALID_BINARY_OP aarch64_invalid_binary_op
30900 #undef TARGET_VERIFY_TYPE_CONTEXT
30901 #define TARGET_VERIFY_TYPE_CONTEXT aarch64_verify_type_context
30903 #undef TARGET_MEMORY_MOVE_COST
30904 #define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
30906 #undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
30907 #define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul
30909 #undef TARGET_MUST_PASS_IN_STACK
30910 #define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size
30912 /* This target hook should return true if accesses to volatile bitfields
30913 should use the narrowest mode possible. It should return false if these
30914 accesses should use the bitfield container type. */
30915 #undef TARGET_NARROW_VOLATILE_BITFIELD
30916 #define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false
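/* A minimal illustration (not part of the port) of what the choice above
   means: with the hook returning false, a read of the volatile bit-field X
   in the hypothetical example below is performed with the width of its
   declared container type (a 32-bit access) rather than with the narrowest
   mode that covers the field (a byte access), which is consistent with the
   AAPCS64 treatment of volatile bit-fields:

     struct s { volatile unsigned int x : 8; };
     unsigned int get_x (struct s *p) { return p->x; }  */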
30918 #undef TARGET_OPTION_OVERRIDE
30919 #define TARGET_OPTION_OVERRIDE aarch64_override_options
30921 #undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
30922 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
30923 aarch64_override_options_after_change
30925 #undef TARGET_OFFLOAD_OPTIONS
30926 #define TARGET_OFFLOAD_OPTIONS aarch64_offload_options
30928 #undef TARGET_OPTION_RESTORE
30929 #define TARGET_OPTION_RESTORE aarch64_option_restore
30931 #undef TARGET_OPTION_PRINT
30932 #define TARGET_OPTION_PRINT aarch64_option_print
30934 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
30935 #define TARGET_OPTION_VALID_ATTRIBUTE_P aarch64_option_valid_attribute_p
30937 #undef TARGET_OPTION_VALID_VERSION_ATTRIBUTE_P
30938 #define TARGET_OPTION_VALID_VERSION_ATTRIBUTE_P \
30939 aarch64_option_valid_version_attribute_p
30941 #undef TARGET_SET_CURRENT_FUNCTION
30942 #define TARGET_SET_CURRENT_FUNCTION aarch64_set_current_function
30944 #undef TARGET_PASS_BY_REFERENCE
30945 #define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference
30947 #undef TARGET_PREFERRED_RELOAD_CLASS
30948 #define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class
30950 #undef TARGET_SCHED_REASSOCIATION_WIDTH
30951 #define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width
30953 #undef TARGET_DWARF_FRAME_REG_MODE
30954 #define TARGET_DWARF_FRAME_REG_MODE aarch64_dwarf_frame_reg_mode
30956 #undef TARGET_OUTPUT_CFI_DIRECTIVE
30957 #define TARGET_OUTPUT_CFI_DIRECTIVE aarch64_output_cfi_directive
30959 #undef TARGET_DW_CFI_OPRND1_DESC
30960 #define TARGET_DW_CFI_OPRND1_DESC aarch64_dw_cfi_oprnd1_desc
30962 #undef TARGET_PROMOTED_TYPE
30963 #define TARGET_PROMOTED_TYPE aarch64_promoted_type
30965 #undef TARGET_SECONDARY_RELOAD
30966 #define TARGET_SECONDARY_RELOAD aarch64_secondary_reload
30968 #undef TARGET_SECONDARY_MEMORY_NEEDED
30969 #define TARGET_SECONDARY_MEMORY_NEEDED aarch64_secondary_memory_needed
30971 #undef TARGET_SHIFT_TRUNCATION_MASK
30972 #define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask
30974 #undef TARGET_SETUP_INCOMING_VARARGS
30975 #define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs
30977 #undef TARGET_STRUCT_VALUE_RTX
30978 #define TARGET_STRUCT_VALUE_RTX aarch64_struct_value_rtx
30980 #undef TARGET_REGISTER_MOVE_COST
30981 #define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost
30983 #undef TARGET_RETURN_IN_MEMORY
30984 #define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory
30986 #undef TARGET_RETURN_IN_MSB
30987 #define TARGET_RETURN_IN_MSB aarch64_return_in_msb
30989 #undef TARGET_RTX_COSTS
30990 #define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper
30992 #undef TARGET_INSN_COST
30993 #define TARGET_INSN_COST aarch64_insn_cost
30995 #undef TARGET_SCALAR_MODE_SUPPORTED_P
30996 #define TARGET_SCALAR_MODE_SUPPORTED_P aarch64_scalar_mode_supported_p
30998 #undef TARGET_SCHED_ISSUE_RATE
30999 #define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate
31001 #undef TARGET_SCHED_VARIABLE_ISSUE
31002 #define TARGET_SCHED_VARIABLE_ISSUE aarch64_sched_variable_issue
31004 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
31005 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
31006 aarch64_sched_first_cycle_multipass_dfa_lookahead
31008 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD
31009 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD \
31010 aarch64_first_cycle_multipass_dfa_lookahead_guard
31012 #undef TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS
31013 #define TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS \
31014 aarch64_get_separate_components
31016 #undef TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB
31017 #define TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB \
31018 aarch64_components_for_bb
31020 #undef TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS
31021 #define TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS \
31022 aarch64_disqualify_components
31024 #undef TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS
31025 #define TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS \
31026 aarch64_emit_prologue_components
31028 #undef TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS
31029 #define TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS \
31030 aarch64_emit_epilogue_components
31032 #undef TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS
31033 #define TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS \
31034 aarch64_set_handled_components
31036 #undef TARGET_TRAMPOLINE_INIT
31037 #define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init
31039 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
31040 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p
31042 #undef TARGET_VECTOR_MODE_SUPPORTED_P
31043 #define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p
31045 #undef TARGET_VECTOR_MODE_SUPPORTED_ANY_TARGET_P
31046 #define TARGET_VECTOR_MODE_SUPPORTED_ANY_TARGET_P aarch64_vector_mode_supported_any_target_p
31048 #undef TARGET_COMPATIBLE_VECTOR_TYPES_P
31049 #define TARGET_COMPATIBLE_VECTOR_TYPES_P aarch64_compatible_vector_types_p
31051 #undef TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT
31052 #define TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT \
31053 aarch64_builtin_support_vector_misalignment
31055 #undef TARGET_ARRAY_MODE
31056 #define TARGET_ARRAY_MODE aarch64_array_mode
31058 #undef TARGET_ARRAY_MODE_SUPPORTED_P
31059 #define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p
31061 #undef TARGET_VECTORIZE_CREATE_COSTS
31062 #define TARGET_VECTORIZE_CREATE_COSTS aarch64_vectorize_create_costs
31064 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
31065 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
31066 aarch64_builtin_vectorization_cost
31068 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
31069 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode
31071 #undef TARGET_VECTORIZE_BUILTINS
31072 #define TARGET_VECTORIZE_BUILTINS
31074 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_MODES
31075 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_MODES \
31076 aarch64_autovectorize_vector_modes
31078 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
31079 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
31080 aarch64_atomic_assign_expand_fenv
31082 /* Section anchor support. */
31084 #undef TARGET_MIN_ANCHOR_OFFSET
31085 #define TARGET_MIN_ANCHOR_OFFSET -256
31087 /* Limit the maximum anchor offset to 4k-1, since that's the limit for a
31088 byte offset; we can do much more for larger data types, but have no way
31089 to determine the size of the access. We assume accesses are aligned. */
31090 #undef TARGET_MAX_ANCHOR_OFFSET
31091 #define TARGET_MAX_ANCHOR_OFFSET 4095
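/* A rough illustration (not part of the port) of where these limits come
   from: an unscaled signed 9-bit offset (LDUR/STUR) reaches -256..255, and
   the unsigned scaled 12-bit offset reaches 0..4095 when the access is a
   single byte (LDRB/STRB).  Wider accesses scale the 12-bit field by the
   access size and could therefore reach further, but since the anchor code
   does not know the access size it must assume the byte range:

     ldurb  w0, [x1, #-256]   // most negative anchor-relative offset
     ldrb   w0, [x1, #4095]   // largest anchor-relative byte offset  */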
31093 #undef TARGET_VECTORIZE_PREFERRED_DIV_AS_SHIFTS_OVER_MULT
31094 #define TARGET_VECTORIZE_PREFERRED_DIV_AS_SHIFTS_OVER_MULT \
31095 aarch64_vectorize_preferred_div_as_shifts_over_mult
31097 #undef TARGET_VECTOR_ALIGNMENT
31098 #define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment
31100 #undef TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT
31101 #define TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT \
31102 aarch64_vectorize_preferred_vector_alignment
31103 #undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
31104 #define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
31105 aarch64_simd_vector_alignment_reachable
31107 /* vec_perm support. */
31109 #undef TARGET_VECTORIZE_VEC_PERM_CONST
31110 #define TARGET_VECTORIZE_VEC_PERM_CONST \
31111 aarch64_vectorize_vec_perm_const
31113 #undef TARGET_VECTORIZE_RELATED_MODE
31114 #define TARGET_VECTORIZE_RELATED_MODE aarch64_vectorize_related_mode
31115 #undef TARGET_VECTORIZE_GET_MASK_MODE
31116 #define TARGET_VECTORIZE_GET_MASK_MODE aarch64_get_mask_mode
31117 #undef TARGET_VECTORIZE_CONDITIONAL_OPERATION_IS_EXPENSIVE
31118 #define TARGET_VECTORIZE_CONDITIONAL_OPERATION_IS_EXPENSIVE \
31119 aarch64_conditional_operation_is_expensive
31120 #undef TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE
31121 #define TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE \
31122 aarch64_empty_mask_is_expensive
31123 #undef TARGET_PREFERRED_ELSE_VALUE
31124 #define TARGET_PREFERRED_ELSE_VALUE \
31125 aarch64_preferred_else_value
31127 #undef TARGET_INIT_LIBFUNCS
31128 #define TARGET_INIT_LIBFUNCS aarch64_init_libfuncs
31130 #undef TARGET_FIXED_CONDITION_CODE_REGS
31131 #define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs
31133 #undef TARGET_FLAGS_REGNUM
31134 #define TARGET_FLAGS_REGNUM CC_REGNUM
31136 #undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
31137 #define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
31139 #undef TARGET_ASAN_SHADOW_OFFSET
31140 #define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset
31142 #undef TARGET_LEGITIMIZE_ADDRESS
31143 #define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address
31145 #undef TARGET_SCHED_CAN_SPECULATE_INSN
31146 #define TARGET_SCHED_CAN_SPECULATE_INSN aarch64_sched_can_speculate_insn
31148 #undef TARGET_CAN_USE_DOLOOP_P
31149 #define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost
31151 #undef TARGET_SCHED_ADJUST_PRIORITY
31152 #define TARGET_SCHED_ADJUST_PRIORITY aarch64_sched_adjust_priority
31154 #undef TARGET_SCHED_MACRO_FUSION_P
31155 #define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p
31157 #undef TARGET_SCHED_MACRO_FUSION_PAIR_P
31158 #define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p
31160 #undef TARGET_SCHED_FUSION_PRIORITY
31161 #define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority
31163 #undef TARGET_UNSPEC_MAY_TRAP_P
31164 #define TARGET_UNSPEC_MAY_TRAP_P aarch64_unspec_may_trap_p
31166 #undef TARGET_USE_PSEUDO_PIC_REG
31167 #define TARGET_USE_PSEUDO_PIC_REG aarch64_use_pseudo_pic_reg
31169 #undef TARGET_PRINT_OPERAND
31170 #define TARGET_PRINT_OPERAND aarch64_print_operand
31172 #undef TARGET_PRINT_OPERAND_ADDRESS
31173 #define TARGET_PRINT_OPERAND_ADDRESS aarch64_print_operand_address
31175 #undef TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA
31176 #define TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA aarch64_output_addr_const_extra
31178 #undef TARGET_OPTAB_SUPPORTED_P
31179 #define TARGET_OPTAB_SUPPORTED_P aarch64_optab_supported_p
31181 #undef TARGET_OMIT_STRUCT_RETURN_REG
31182 #define TARGET_OMIT_STRUCT_RETURN_REG true
31184 #undef TARGET_DWARF_POLY_INDETERMINATE_VALUE
31185 #define TARGET_DWARF_POLY_INDETERMINATE_VALUE \
31186 aarch64_dwarf_poly_indeterminate_value
31188 /* The architecture reserves bits 0 and 1 so use bit 2 for descriptors. */
31189 #undef TARGET_CUSTOM_FUNCTION_DESCRIPTORS
31190 #define TARGET_CUSTOM_FUNCTION_DESCRIPTORS 4
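/* A schematic sketch only (the exact lowering and descriptor layout are
   owned by the middle end, so the word order below is an assumption):
   because AArch64 code addresses are at least 4-byte aligned and bits 0
   and 1 are reserved, bit 2 can distinguish a pointer to a nested-function
   descriptor from a plain code address.  An indirect call then conceptually
   becomes:

     void
     call_maybe_descriptor (uintptr_t fn)
     {
       void *chain = NULL;
       if (fn & 4)                              // Descriptor, not code.
         {
           void **desc = (void **) (fn & ~(uintptr_t) 4);
           chain = desc[1];                     // Static chain value.
           fn = (uintptr_t) desc[0];            // Real entry point.
         }
       // ...branch to FN with CHAIN in the static chain register...
     }  */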
31192 #undef TARGET_HARD_REGNO_NREGS
31193 #define TARGET_HARD_REGNO_NREGS aarch64_hard_regno_nregs
31194 #undef TARGET_HARD_REGNO_MODE_OK
31195 #define TARGET_HARD_REGNO_MODE_OK aarch64_hard_regno_mode_ok
31197 #undef TARGET_MODES_TIEABLE_P
31198 #define TARGET_MODES_TIEABLE_P aarch64_modes_tieable_p
31200 #undef TARGET_HARD_REGNO_CALL_PART_CLOBBERED
31201 #define TARGET_HARD_REGNO_CALL_PART_CLOBBERED \
31202 aarch64_hard_regno_call_part_clobbered
31204 #undef TARGET_INSN_CALLEE_ABI
31205 #define TARGET_INSN_CALLEE_ABI aarch64_insn_callee_abi
31207 #undef TARGET_CONSTANT_ALIGNMENT
31208 #define TARGET_CONSTANT_ALIGNMENT aarch64_constant_alignment
31210 #undef TARGET_STACK_CLASH_PROTECTION_ALLOCA_PROBE_RANGE
31211 #define TARGET_STACK_CLASH_PROTECTION_ALLOCA_PROBE_RANGE \
31212 aarch64_stack_clash_protection_alloca_probe_range
31214 #undef TARGET_COMPUTE_PRESSURE_CLASSES
31215 #define TARGET_COMPUTE_PRESSURE_CLASSES aarch64_compute_pressure_classes
31217 #undef TARGET_CAN_CHANGE_MODE_CLASS
31218 #define TARGET_CAN_CHANGE_MODE_CLASS aarch64_can_change_mode_class
31220 #undef TARGET_SELECT_EARLY_REMAT_MODES
31221 #define TARGET_SELECT_EARLY_REMAT_MODES aarch64_select_early_remat_modes
31223 #undef TARGET_SPECULATION_SAFE_VALUE
31224 #define TARGET_SPECULATION_SAFE_VALUE aarch64_speculation_safe_value
31226 #undef TARGET_ESTIMATED_POLY_VALUE
31227 #define TARGET_ESTIMATED_POLY_VALUE aarch64_estimated_poly_value
31229 #undef TARGET_ATTRIBUTE_TABLE
31230 #define TARGET_ATTRIBUTE_TABLE aarch64_attribute_table
31232 #undef TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN
31233 #define TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN \
31234 aarch64_simd_clone_compute_vecsize_and_simdlen
31236 #undef TARGET_SIMD_CLONE_ADJUST
31237 #define TARGET_SIMD_CLONE_ADJUST aarch64_simd_clone_adjust
31239 #undef TARGET_SIMD_CLONE_USABLE
31240 #define TARGET_SIMD_CLONE_USABLE aarch64_simd_clone_usable
31242 #undef TARGET_COMP_TYPE_ATTRIBUTES
31243 #define TARGET_COMP_TYPE_ATTRIBUTES aarch64_comp_type_attributes
31245 #undef TARGET_MERGE_DECL_ATTRIBUTES
31246 #define TARGET_MERGE_DECL_ATTRIBUTES aarch64_merge_decl_attributes
31248 #undef TARGET_GET_MULTILIB_ABI_NAME
31249 #define TARGET_GET_MULTILIB_ABI_NAME aarch64_get_multilib_abi_name
31251 #undef TARGET_FNTYPE_ABI
31252 #define TARGET_FNTYPE_ABI aarch64_fntype_abi
31254 #undef TARGET_MEMTAG_CAN_TAG_ADDRESSES
31255 #define TARGET_MEMTAG_CAN_TAG_ADDRESSES aarch64_can_tag_addresses
31257 #if CHECKING_P
31258 #undef TARGET_RUN_TARGET_SELFTESTS
31259 #define TARGET_RUN_TARGET_SELFTESTS selftest::aarch64_run_selftests
31260 #endif /* #if CHECKING_P */
31262 #undef TARGET_ASM_POST_CFI_STARTPROC
31263 #define TARGET_ASM_POST_CFI_STARTPROC aarch64_post_cfi_startproc
31265 #undef TARGET_STRICT_ARGUMENT_NAMING
31266 #define TARGET_STRICT_ARGUMENT_NAMING hook_bool_CUMULATIVE_ARGS_true
31268 #undef TARGET_MODE_EMIT
31269 #define TARGET_MODE_EMIT aarch64_mode_emit
31271 #undef TARGET_MODE_NEEDED
31272 #define TARGET_MODE_NEEDED aarch64_mode_needed
31274 #undef TARGET_MODE_AFTER
31275 #define TARGET_MODE_AFTER aarch64_mode_after
31277 #undef TARGET_MODE_CONFLUENCE
31278 #define TARGET_MODE_CONFLUENCE aarch64_mode_confluence
31280 #undef TARGET_MODE_BACKPROP
31281 #define TARGET_MODE_BACKPROP aarch64_mode_backprop
31283 #undef TARGET_MODE_ENTRY
31284 #define TARGET_MODE_ENTRY aarch64_mode_entry
31286 #undef TARGET_MODE_EXIT
31287 #define TARGET_MODE_EXIT aarch64_mode_exit
31289 #undef TARGET_MODE_EH_HANDLER
31290 #define TARGET_MODE_EH_HANDLER aarch64_mode_eh_handler
31292 #undef TARGET_MODE_PRIORITY
31293 #define TARGET_MODE_PRIORITY aarch64_mode_priority
31295 #undef TARGET_MD_ASM_ADJUST
31296 #define TARGET_MD_ASM_ADJUST aarch64_md_asm_adjust
31298 #undef TARGET_ASM_FILE_END
31299 #define TARGET_ASM_FILE_END aarch64_asm_file_end
31301 #undef TARGET_ASM_FUNCTION_EPILOGUE
31302 #define TARGET_ASM_FUNCTION_EPILOGUE aarch64_sls_emit_blr_function_thunks
31304 #undef TARGET_HAVE_SHADOW_CALL_STACK
31305 #define TARGET_HAVE_SHADOW_CALL_STACK true
31307 #undef TARGET_CONST_ANCHOR
31308 #define TARGET_CONST_ANCHOR 0x1000000
31310 #undef TARGET_EXTRA_LIVE_ON_ENTRY
31311 #define TARGET_EXTRA_LIVE_ON_ENTRY aarch64_extra_live_on_entry
31313 #undef TARGET_USE_LATE_PROLOGUE_EPILOGUE
31314 #define TARGET_USE_LATE_PROLOGUE_EPILOGUE aarch64_use_late_prologue_epilogue
31316 #undef TARGET_EMIT_EPILOGUE_FOR_SIBCALL
31317 #define TARGET_EMIT_EPILOGUE_FOR_SIBCALL aarch64_expand_epilogue
31319 #undef TARGET_OPTION_FUNCTION_VERSIONS
31320 #define TARGET_OPTION_FUNCTION_VERSIONS aarch64_common_function_versions
31322 #undef TARGET_COMPARE_VERSION_PRIORITY
31323 #define TARGET_COMPARE_VERSION_PRIORITY aarch64_compare_version_priority
31325 #undef TARGET_GENERATE_VERSION_DISPATCHER_BODY
31326 #define TARGET_GENERATE_VERSION_DISPATCHER_BODY \
31327 aarch64_generate_version_dispatcher_body
31329 #undef TARGET_GET_FUNCTION_VERSIONS_DISPATCHER
31330 #define TARGET_GET_FUNCTION_VERSIONS_DISPATCHER \
31331 aarch64_get_function_versions_dispatcher
31333 #undef TARGET_MANGLE_DECL_ASSEMBLER_NAME
31334 #define TARGET_MANGLE_DECL_ASSEMBLER_NAME aarch64_mangle_decl_assembler_name
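/* TARGET_INITIALIZER (from target-def.h, included above) expands to an
   aggregate initializer that gathers every TARGET_* macro defined in this
   file and supplies the documented default for any hook the port leaves
   untouched.  Target-independent code reaches the back end only through
   this structure, along the lines of:

     machine_mode mode = targetm.libgcc_cmp_return_mode ();
     if (targetm.have_shadow_call_stack)
       ...  */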
31336 struct gcc_target targetm = TARGET_INITIALIZER;
31338 #include "gt-aarch64.h"