/* Machine description for AArch64 architecture.
   Copyright (C) 2009-2025 Free Software Foundation, Inc.
   Contributed by ARM Ltd.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify it
   under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   GCC is distributed in the hope that it will be useful, but
   WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with GCC; see the file COPYING3.  If not see
   <http://www.gnu.org/licenses/>.  */
#define IN_TARGET_CODE 1

#define INCLUDE_STRING
#define INCLUDE_ALGORITHM
#define INCLUDE_VECTOR
#include "coretypes.h"
#include "stringpool.h"
#include "diagnostic.h"
#include "insn-attr.h"
#include "fold-const.h"
#include "stor-layout.h"
#include "langhooks.h"
#include "dwarf2out.h"
#include "gimple-iterator.h"
#include "tree-vectorizer.h"
#include "aarch64-cost-tables.h"
#include "tm-constrs.h"
#include "sched-int.h"
#include "target-globals.h"
#include "common/common-target.h"
#include "selftest-rtl.h"
#include "rtx-vector-builder.h"
#include "function-abi.h"
#include "gimple-pretty-print.h"
#include "tree-ssa-loop-niter.h"
#include "fractional-cost.h"
#include "aarch64-feature-deps.h"
#include "config/arm/aarch-common.h"
#include "config/arm/aarch-common-protos.h"
#include "common/config/aarch64/cpuinfo.h"
#include "tree-pass.h"
#include "symbol-summary.h"
#include "ipa-fnsummary.h"

/* This file should be included last.  */
#include "target-def.h"
/* Defined for convenience.  */
#define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)

/* Maximum bytes set for an inline memset expansion.  With -Os use 3 STP
   and 1 MOVI/DUP (same size as a call).  */
#define MAX_SET_SIZE(speed) (speed ? 256 : 96)
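/* Illustrative arithmetic for the -Os limit above: each STP of two
   Q registers stores 32 bytes, so the 3 STPs mentioned in the comment
   cover 3 * 32 = 96 bytes, which is where the value 96 comes from.  */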
/* Flags that describe how a function shares certain architectural state
   with callers.

   - AARCH64_STATE_SHARED indicates that the function does share the state
     with callers.

   - AARCH64_STATE_IN indicates that the function reads (or might read) the
     incoming state.  The converse is that the function ignores the incoming
     state.

   - AARCH64_STATE_OUT indicates that the function returns new state.
     The converse is that the state on return is the same as it was on entry.

   A function that partially modifies the state treats it as both IN
   and OUT (because the value on return depends to some extent on the
   value on entry).  */
constexpr auto AARCH64_STATE_SHARED = 1U << 0;
constexpr auto AARCH64_STATE_IN = 1U << 1;
constexpr auto AARCH64_STATE_OUT = 1U << 2;
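/* For example (see aarch64_attribute_shared_state_flags below),
   arm::in("za") maps to AARCH64_STATE_SHARED | AARCH64_STATE_IN,
   arm::out("za") to AARCH64_STATE_SHARED | AARCH64_STATE_OUT,
   arm::inout("za") to all three flags, and arm::preserves("za")
   to AARCH64_STATE_SHARED alone.  */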
/* Enum to distinguish which type of check is to be done in
   aarch64_simd_valid_imm.  */
enum simd_immediate_check
{
/* Information about a legitimate vector immediate operand.  */
struct simd_immediate_info
{
  enum insn_type { MOV, MVN, INDEX, PTRUE, SVE_MOV };
  enum modifier_type { LSL, MSL };

  simd_immediate_info () {}
  simd_immediate_info (scalar_float_mode, rtx);
  simd_immediate_info (scalar_int_mode, unsigned HOST_WIDE_INT,
		       insn_type = MOV, modifier_type = LSL,
		       unsigned int = 0);
  simd_immediate_info (scalar_mode, rtx, rtx);
  simd_immediate_info (scalar_int_mode, aarch64_svpattern);

  /* The mode of the elements.  */
  scalar_mode elt_mode;

  /* The instruction to use to move the immediate into a vector.  */
  insn_type insn;

  union
  {
    /* For MOV and MVN.  */
    struct
    {
      /* The value of each element.  */
      rtx value;

      /* The kind of shift modifier to use, and the number of bits to shift.
	 This is (LSL, 0) if no shift is needed.  */
      modifier_type modifier;
      unsigned int shift;
    } mov;

    struct
    {
      /* The value of the first element and the step to be added for each
	 subsequent element.  */
      rtx base, step;
    } index;

    aarch64_svpattern pattern;
  } u;
};
/* Construct a floating-point immediate in which each element has mode
   ELT_MODE_IN and value VALUE_IN.  */
inline simd_immediate_info
::simd_immediate_info (scalar_float_mode elt_mode_in, rtx value_in)
  : elt_mode (elt_mode_in), insn (MOV)
{
  u.mov.value = value_in;
  u.mov.modifier = LSL;
}
/* Construct an integer immediate in which each element has mode ELT_MODE_IN
   and value VALUE_IN.  The other parameters are as for the structure
   fields.  */
inline simd_immediate_info
::simd_immediate_info (scalar_int_mode elt_mode_in,
		       unsigned HOST_WIDE_INT value_in,
		       insn_type insn_in, modifier_type modifier_in,
		       unsigned int shift_in)
  : elt_mode (elt_mode_in), insn (insn_in)
{
  u.mov.value = gen_int_mode (value_in, elt_mode_in);
  u.mov.modifier = modifier_in;
  u.mov.shift = shift_in;
}
/* Construct an integer immediate in which each element has mode ELT_MODE_IN
   and where element I is equal to BASE_IN + I * STEP_IN.  */
inline simd_immediate_info
::simd_immediate_info (scalar_mode elt_mode_in, rtx base_in, rtx step_in)
  : elt_mode (elt_mode_in), insn (INDEX)
{
  u.index.base = base_in;
  u.index.step = step_in;
}
/* Construct a predicate that controls elements of mode ELT_MODE_IN
   and has PTRUE pattern PATTERN_IN.  */
inline simd_immediate_info
::simd_immediate_info (scalar_int_mode elt_mode_in,
		       aarch64_svpattern pattern_in)
  : elt_mode (elt_mode_in), insn (PTRUE)
{
  u.pattern = pattern_in;
}
/* Describes types that map to Pure Scalable Types (PSTs) in the AAPCS64.  */
class pure_scalable_type_info
{
public:
  /* Represents the result of analyzing a type.  All values are nonzero,
     in the possibly forlorn hope that accidental conversions to bool
     trigger a warning.  */
  enum analysis_result
  {
    /* The type does not have an ABI identity; i.e. it doesn't contain
       at least one object whose type is a Fundamental Data Type.  */
    NO_ABI_IDENTITY = 1,

    /* The type is definitely a Pure Scalable Type.  */
    IS_PST,

    /* The type is definitely not a Pure Scalable Type.  */
    ISNT_PST,

    /* It doesn't matter for PCS purposes whether the type is a Pure
       Scalable Type or not, since the type will be handled the same
       way regardless.

       Specifically, this means that if the type is a Pure Scalable Type,
       there aren't enough argument registers to hold it, and so it will
       need to be passed or returned in memory.  If the type isn't a
       Pure Scalable Type, it's too big to be passed or returned in core
       or SIMD&FP registers, and so again will need to go in memory.  */
    DOESNT_MATTER
  };

  /* Aggregates of 17 bytes or more are normally passed and returned
     in memory, so aggregates of that size can safely be analyzed as
     DOESNT_MATTER.  We need to be able to collect enough pieces to
     represent a PST that is smaller than that.  Since predicates are
     2 bytes in size for -msve-vector-bits=128, that means we need to be
     able to store at least 8 pieces.

     We also need to be able to store enough pieces to represent
     a single vector in each vector argument register and a single
     predicate in each predicate argument register.  This means that
     we need at least 12 pieces.  */
  static const unsigned int MAX_PIECES = NUM_FP_ARG_REGS + NUM_PR_ARG_REGS;
  static_assert (MAX_PIECES >= 8, "Need to store at least 8 predicates");
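  /* Note: under the AAPCS64 there are 8 SIMD&FP argument registers
     (v0-v7) and 4 predicate argument registers (p0-p3), so MAX_PIECES
     works out to 8 + 4 = 12, matching the "at least 12 pieces"
     requirement described above.  */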
  /* Describes one piece of a PST.  Each piece is one of:

     - a single Scalable Vector Type (SVT)
     - a single Scalable Predicate Type (SPT)
     - a PST containing 2, 3 or 4 SVTs, with no padding

     It either represents a single built-in type or a PST formed from
     multiple homogeneous built-in types.  */
  struct piece
  {
    rtx get_rtx (unsigned int, unsigned int) const;

    /* The number of vector and predicate registers that the piece
       occupies.  One of the two is always zero.  */
    unsigned int num_zr;
    unsigned int num_pr;

    /* The mode of the registers described above.  */
    machine_mode mode;

    /* If this piece is formed from multiple homogeneous built-in types,
       this is the mode of the built-in types, otherwise it is MODE.  */
    machine_mode orig_mode;

    /* The offset in bytes of the piece from the start of the type.  */
    poly_uint64 offset;
  };

  /* Divides types analyzed as IS_PST into individual pieces.  The pieces
     are in memory order.  */
  auto_vec<piece, MAX_PIECES> pieces;

  unsigned int num_zr () const;
  unsigned int num_pr () const;

  rtx get_rtx (machine_mode mode, unsigned int, unsigned int) const;

  analysis_result analyze (const_tree);
  bool analyze_registers (const_tree);

private:
  analysis_result analyze_array (const_tree);
  analysis_result analyze_record (const_tree);
  void add_piece (const piece &);
};
/* The current code model.  */
enum aarch64_code_model aarch64_cmodel;

enum aarch64_tp_reg aarch64_tpidr_register;

/* The number of 64-bit elements in an SVE vector.  */
poly_uint16 aarch64_sve_vg;

#undef TARGET_HAVE_TLS
#define TARGET_HAVE_TLS 1
static bool aarch64_composite_type_p (const_tree, machine_mode);
static bool aarch64_return_in_memory_1 (const_tree);
static bool aarch64_vfp_is_call_or_return_candidate (machine_mode,
						     machine_mode *, int *,
static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED;
static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
static void aarch64_override_options_after_change (void);
static bool aarch64_vector_mode_supported_p (machine_mode);
static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool);
static bool aarch64_builtin_support_vector_misalignment (machine_mode mode,
static machine_mode aarch64_simd_container_mode (scalar_mode, poly_int64);
static bool aarch64_print_address_internal (FILE*, machine_mode, rtx,
					    aarch64_addr_query_type);
/* The processor for which instructions should be scheduled.  */
enum aarch64_cpu aarch64_tune = AARCH64_CPU_cortexa53;

/* Global flag for PC relative loads.  */
bool aarch64_pcrelative_literal_loads;

/* Global flag for whether frame pointer is enabled.  */
bool aarch64_use_frame_pointer;
/* Support for command line parsing of boolean flags in the tuning
   structures.  */
struct aarch64_flag_desc
{
  const char *name;
  unsigned int flag;
};

#define AARCH64_FUSION_PAIR(name, internal_name) \
  { name, AARCH64_FUSE_##internal_name },
static const struct aarch64_flag_desc aarch64_fusible_pairs[] =
{
  { "none", AARCH64_FUSE_NOTHING },
#include "aarch64-fusion-pairs.def"
  { "all", AARCH64_FUSE_ALL },
  { NULL, AARCH64_FUSE_NOTHING }
};

#define AARCH64_EXTRA_TUNING_OPTION(name, internal_name) \
  { name, AARCH64_EXTRA_TUNE_##internal_name },
static const struct aarch64_flag_desc aarch64_tuning_flags[] =
{
  { "none", AARCH64_EXTRA_TUNE_NONE },
#include "aarch64-tuning-flags.def"
  { "all", AARCH64_EXTRA_TUNE_ALL },
  { NULL, AARCH64_EXTRA_TUNE_NONE }
};
/* Tuning parameters.  */
#include "tuning_models/generic.h"
#include "tuning_models/generic_armv8_a.h"
#include "tuning_models/generic_armv9_a.h"
#include "tuning_models/cortexa35.h"
#include "tuning_models/cortexa53.h"
#include "tuning_models/cortexa57.h"
#include "tuning_models/cortexa72.h"
#include "tuning_models/cortexa73.h"
#include "tuning_models/cortexx925.h"
#include "tuning_models/exynosm1.h"
#include "tuning_models/thunderxt88.h"
#include "tuning_models/thunderx.h"
#include "tuning_models/tsv110.h"
#include "tuning_models/xgene1.h"
#include "tuning_models/emag.h"
#include "tuning_models/qdf24xx.h"
#include "tuning_models/saphira.h"
#include "tuning_models/thunderx2t99.h"
#include "tuning_models/thunderx3t110.h"
#include "tuning_models/neoversen1.h"
#include "tuning_models/ampere1.h"
#include "tuning_models/ampere1a.h"
#include "tuning_models/ampere1b.h"
#include "tuning_models/neoversev1.h"
#include "tuning_models/neoverse512tvb.h"
#include "tuning_models/neoversen2.h"
#include "tuning_models/neoversen3.h"
#include "tuning_models/neoversev2.h"
#include "tuning_models/neoversev3.h"
#include "tuning_models/neoversev3ae.h"
#include "tuning_models/a64fx.h"
#include "tuning_models/fujitsu_monaka.h"
/* Support for fine-grained override of the tuning structures.  */
struct aarch64_tuning_override_function
{
  const char *name;
  void (*parse_override)(const char*, struct tune_params *);
};

static void aarch64_parse_fuse_string (const char*, struct tune_params *);
static void aarch64_parse_tune_string (const char*, struct tune_params *);
static void aarch64_parse_sve_width_string (const char*, struct tune_params *);

static const struct aarch64_tuning_override_function
  aarch64_tuning_override_functions[] =
{
  { "fuse", aarch64_parse_fuse_string },
  { "tune", aarch64_parse_tune_string },
  { "sve_width", aarch64_parse_sve_width_string },
};
/* A processor implementing AArch64.  */
struct processor
{
  const char *name;
  aarch64_cpu ident;
  aarch64_cpu sched_core;
  aarch64_arch arch;
  aarch64_feature_flags flags;
  const tune_params *tune;
};

/* Architectures implementing AArch64.  */
static CONSTEXPR const processor all_architectures[] =
{
#define AARCH64_ARCH(NAME, CORE, ARCH_IDENT, D, E) \
  {NAME, AARCH64_CPU_##CORE, AARCH64_CPU_##CORE, AARCH64_ARCH_##ARCH_IDENT, \
   feature_deps::ARCH_IDENT ().enable, NULL},
#include "aarch64-arches.def"
  {NULL, aarch64_no_cpu, aarch64_no_cpu, aarch64_no_arch, 0, NULL}
};

/* Processor cores implementing AArch64.  */
static const struct processor all_cores[] =
{
#define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, E, COSTS, G, H, I) \
  {NAME, AARCH64_CPU_##IDENT, AARCH64_CPU_##SCHED, AARCH64_ARCH_##ARCH, \
   feature_deps::cpu_##IDENT, &COSTS##_tunings},
#include "aarch64-cores.def"
  {NULL, aarch64_no_cpu, aarch64_no_cpu, aarch64_no_arch, 0, NULL}
};
/* Internal representation of system registers.  */
typedef struct
{
  const char *name;
  /* Stringified sysreg encoding values, represented as
     s<sn>_<op1>_c<cn>_c<cm>_<op2>.  */
  const char *encoding;
  /* Flags affecting sysreg usage, such as read/write-only.  */
  unsigned properties;
  /* Architectural features implied by sysreg.  */
  aarch64_feature_flags arch_reqs;
} sysreg_t;
/* An aarch64_feature_set initializer for a single feature,
   AARCH64_FEATURE_<FEAT>.  */
#define AARCH64_FEATURE(FEAT) AARCH64_FL_##FEAT

/* Used by AARCH64_FEATURES.  */
#define AARCH64_OR_FEATURES_1(X, F1) \
  AARCH64_FEATURE (F1)
#define AARCH64_OR_FEATURES_2(X, F1, F2) \
  (AARCH64_FEATURE (F1) | AARCH64_OR_FEATURES_1 (X, F2))
#define AARCH64_OR_FEATURES_3(X, F1, ...) \
  (AARCH64_FEATURE (F1) | AARCH64_OR_FEATURES_2 (X, __VA_ARGS__))

/* An aarch64_feature_set initializer for the N features listed in "...".  */
#define AARCH64_FEATURES(N, ...) \
  AARCH64_OR_FEATURES_##N (0, __VA_ARGS__)

#define AARCH64_NO_FEATURES 0

/* Flags associated with the properties of system registers.  It mainly serves
   to mark particular registers as read or write only.  */
#define F_DEPRECATED (1 << 1)
#define F_REG_READ (1 << 2)
#define F_REG_WRITE (1 << 3)
#define F_ARCHEXT (1 << 4)
/* Flag indicating register name is alias for another system register.  */
#define F_REG_ALIAS (1 << 5)
/* Flag indicating registers which may be implemented with 128-bits.  */
#define F_REG_128 (1 << 6)
/* Database of system registers, their encodings and architectural
   requirements.  */
const sysreg_t aarch64_sysregs[] =
{
#define CPENC(SN, OP1, CN, CM, OP2) "s"#SN"_"#OP1"_c"#CN"_c"#CM"_"#OP2
#define SYSREG(NAME, ENC, FLAGS, ARCH) \
  { NAME, ENC, FLAGS, ARCH },
#include "aarch64-sys-regs.def"
};

#undef AARCH64_NO_FEATURES

using sysreg_map_t = hash_map<nofree_string_hash, const sysreg_t *>;
static sysreg_map_t *sysreg_map = nullptr;
/* Map system register names to their hardware metadata: encoding,
   feature flags and architectural feature requirements, all of which
   are encoded in a sysreg_t struct.  */
static void
aarch64_register_sysreg (const char *name, const sysreg_t *metadata)
{
  bool dup = sysreg_map->put (name, metadata);
  gcc_checking_assert (!dup);
}
/* Lazily initialize hash table for system register validation,
   checking the validity of supplied register name and returning
   register's associated metadata.  */
static void
aarch64_init_sysregs (void)
{
  gcc_assert (!sysreg_map);
  sysreg_map = new sysreg_map_t;

  for (unsigned i = 0; i < ARRAY_SIZE (aarch64_sysregs); i++)
    {
      const sysreg_t *reg = aarch64_sysregs + i;
      aarch64_register_sysreg (reg->name, reg);
    }
}
/* No direct access to the sysreg hash-map should be made.  Doing so
   risks trying to access an uninitialized hash-map and dereferencing the
   returned double pointer without due care risks dereferencing a
   null pointer.  */
const sysreg_t *
aarch64_lookup_sysreg_map (const char *regname)
{
  if (!sysreg_map)
    aarch64_init_sysregs ();

  const sysreg_t **sysreg_entry = sysreg_map->get (regname);
  if (sysreg_entry != NULL)
    return *sysreg_entry;
  return NULL;
}
/* The current tuning set.  */
struct tune_params aarch64_tune_params = generic_tunings;
/* If NAME is the name of an arm:: attribute that describes shared state,
   return its associated AARCH64_STATE_* flags, otherwise return 0.  */
static unsigned int
aarch64_attribute_shared_state_flags (const char *name)
{
  if (strcmp (name, "in") == 0)
    return AARCH64_STATE_SHARED | AARCH64_STATE_IN;
  if (strcmp (name, "inout") == 0)
    return AARCH64_STATE_SHARED | AARCH64_STATE_IN | AARCH64_STATE_OUT;
  if (strcmp (name, "out") == 0)
    return AARCH64_STATE_SHARED | AARCH64_STATE_OUT;
  if (strcmp (name, "preserves") == 0)
    return AARCH64_STATE_SHARED;
  return 0;
}
/* See whether attribute list ATTRS has any sharing information
   for state STATE_NAME.  Return the associated state flags if so,
   otherwise return 0.  */
static unsigned int
aarch64_lookup_shared_state_flags (tree attrs, const char *state_name)
{
  for (tree attr = attrs; attr; attr = TREE_CHAIN (attr))
    {
      if (!is_attribute_namespace_p ("arm", attr))
	continue;

      auto attr_name = IDENTIFIER_POINTER (get_attribute_name (attr));
      auto flags = aarch64_attribute_shared_state_flags (attr_name);

      for (tree arg = TREE_VALUE (attr); arg; arg = TREE_CHAIN (arg))
	{
	  tree value = TREE_VALUE (arg);
	  if (TREE_CODE (value) == STRING_CST
	      && strcmp (TREE_STRING_POINTER (value), state_name) == 0)
	    return flags;
	}
    }
  return 0;
}
/* Return true if DECL creates a new scope for state STATE_NAME.  */
static bool
aarch64_fndecl_has_new_state (const_tree decl, const char *state_name)
{
  if (tree attr = lookup_attribute ("arm", "new", DECL_ATTRIBUTES (decl)))
    for (tree arg = TREE_VALUE (attr); arg; arg = TREE_CHAIN (arg))
      {
	tree value = TREE_VALUE (arg);
	if (TREE_CODE (value) == STRING_CST
	    && strcmp (TREE_STRING_POINTER (value), state_name) == 0)
	  return true;
      }
  return false;
}
/* Return true if attribute argument VALUE is a recognized state string,
   otherwise report an error.  NAME is the name of the attribute to which
   VALUE is being passed.  */
static bool
aarch64_check_state_string (tree name, tree value)
{
  if (TREE_CODE (value) != STRING_CST)
    {
      error ("the arguments to %qE must be constant strings", name);
      return false;
    }

  const char *state_name = TREE_STRING_POINTER (value);
  if (strcmp (state_name, "za") != 0
      && strcmp (state_name, "zt0") != 0)
    {
      error ("unrecognized state string %qs", state_name);
      return false;
    }

  return true;
}
/* qsort callback to compare two STRING_CSTs.  */
static int
cmp_string_csts (const void *a, const void *b)
{
  return strcmp (TREE_STRING_POINTER (*(const_tree const *) a),
		 TREE_STRING_POINTER (*(const_tree const *) b));
}
/* Canonicalize a list of state strings.  ARGS contains the arguments to
   a new attribute while OLD_ATTR, if nonnull, contains a previous attribute
   of the same type.  If CAN_MERGE_IN_PLACE, it is safe to adjust OLD_ATTR's
   arguments and drop the new attribute.  Otherwise, the new attribute must
   be kept and ARGS must include the information in OLD_ATTR.

   In both cases, the new arguments must be a sorted list of state strings
   with duplicates removed.

   Return true if the new attribute should be kept, false if it should be
   dropped.  */
static bool
aarch64_merge_string_arguments (tree args, tree old_attr,
				bool can_merge_in_place)
{
  /* Get a sorted list of all state strings (including duplicates).  */
  auto add_args = [](vec<tree> &strings, const_tree args)
    {
      for (const_tree arg = args; arg; arg = TREE_CHAIN (arg))
	if (TREE_CODE (TREE_VALUE (arg)) == STRING_CST)
	  strings.safe_push (TREE_VALUE (arg));
    };
  auto_vec<tree, 16> strings;
  add_args (strings, args);
  if (old_attr)
    add_args (strings, TREE_VALUE (old_attr));
  strings.qsort (cmp_string_csts);

  /* The list can be empty if there was no previous attribute and if all
     the new arguments are erroneous.  Drop the attribute in that case.  */
  if (strings.is_empty ())
    return false;

  /* Destructively modify one of the argument lists, removing duplicates.  */
  bool use_old_attr = old_attr && can_merge_in_place;
  tree *end = use_old_attr ? &TREE_VALUE (old_attr) : &args;
  tree prev = NULL_TREE;
  for (tree arg : strings)
    {
      if (prev && simple_cst_equal (arg, prev))
	continue;
      prev = arg;
      *end = tree_cons (NULL_TREE, arg, NULL_TREE);
      TREE_VALUE (*end) = arg;
      end = &TREE_CHAIN (*end);
    }

  return !use_old_attr;
}
/* Check whether an 'aarch64_vector_pcs' attribute is valid.  */
static tree
handle_aarch64_vector_pcs_attribute (tree *node, tree name, tree,
				     int, bool *no_add_attrs)
{
  /* Since we set fn_type_req to true, the caller should have checked
     this for us.  */
  gcc_assert (FUNC_OR_METHOD_TYPE_P (*node));
  switch ((arm_pcs) fntype_abi (*node).id ())
    {
    case ARM_PCS_AAPCS64:
    case ARM_PCS_SIMD:
      return NULL_TREE;

    case ARM_PCS_SVE:
      error ("the %qE attribute cannot be applied to an SVE function type",
	     name);
      *no_add_attrs = true;
      return NULL_TREE;

    case ARM_PCS_TLSDESC:
    case ARM_PCS_UNKNOWN:
/* Return true if arm::new(ARGS) is compatible with the type of decl DECL,
   otherwise report an error.  */
static bool
aarch64_check_arm_new_against_type (tree args, tree decl)
{
  tree type_attrs = TYPE_ATTRIBUTES (TREE_TYPE (decl));
  for (tree arg = args; arg; arg = TREE_CHAIN (arg))
    {
      tree value = TREE_VALUE (arg);
      if (TREE_CODE (value) == STRING_CST)
	{
	  const char *state_name = TREE_STRING_POINTER (value);
	  if (aarch64_lookup_shared_state_flags (type_attrs, state_name))
	    {
	      error_at (DECL_SOURCE_LOCATION (decl),
			"cannot create a new %qs scope since %qs is shared"
			" with callers", state_name, state_name);
	      return false;
	    }
	}
    }
  return true;
}
/* Callback for arm::new attributes.  */
static tree
handle_arm_new (tree *node, tree name, tree args, int, bool *no_add_attrs)
{
  tree decl = *node;
  if (TREE_CODE (decl) != FUNCTION_DECL)
    {
      error ("%qE attribute applies only to function definitions", name);
      *no_add_attrs = true;
      return NULL_TREE;
    }
  if (TREE_TYPE (decl) == error_mark_node)
    {
      *no_add_attrs = true;
      return NULL_TREE;
    }

  for (tree arg = args; arg; arg = TREE_CHAIN (arg))
    aarch64_check_state_string (name, TREE_VALUE (arg));

  if (!aarch64_check_arm_new_against_type (args, decl))
    {
      *no_add_attrs = true;
      return NULL_TREE;
    }

  /* If there is an old attribute, we should try to update it in-place,
     so that there is only one (definitive) arm::new attribute on the decl.  */
  tree old_attr = lookup_attribute ("arm", "new", DECL_ATTRIBUTES (decl));
  if (!aarch64_merge_string_arguments (args, old_attr, true))
    *no_add_attrs = true;

  return NULL_TREE;
}
/* Callback for arm::{in,out,inout,preserves} attributes.  */
static tree
handle_arm_shared (tree *node, tree name, tree args,
		   int, bool *no_add_attrs)
{
  tree type = *node;
  tree old_attrs = TYPE_ATTRIBUTES (type);
  auto flags = aarch64_attribute_shared_state_flags (IDENTIFIER_POINTER (name));
  for (tree arg = args; arg; arg = TREE_CHAIN (arg))
    {
      tree value = TREE_VALUE (arg);
      if (aarch64_check_state_string (name, value))
	{
	  const char *state_name = TREE_STRING_POINTER (value);
	  auto old_flags = aarch64_lookup_shared_state_flags (old_attrs,
							      state_name);
	  if (old_flags && old_flags != flags)
	    {
	      error ("inconsistent attributes for state %qs", state_name);
	      *no_add_attrs = true;
	      return NULL_TREE;
	    }
	}
    }

  /* We can't update an old attribute in-place, since types are shared.
     Instead make sure that this new attribute contains all the
     information, so that the old attribute becomes redundant.  */
  tree old_attr = lookup_attribute ("arm", IDENTIFIER_POINTER (name),
				    old_attrs);
  if (!aarch64_merge_string_arguments (args, old_attr, false))
    *no_add_attrs = true;

  return NULL_TREE;
}
/* Mutually-exclusive function type attributes for controlling PSTATE.SM.  */
static const struct attribute_spec::exclusions attr_streaming_exclusions[] =
{
  /* Attribute name	exclusion applies to:
			function, type, variable  */
  { "streaming", false, true, false },
  { "streaming_compatible", false, true, false },
  { NULL, false, false, false }
};
/* Table of machine attributes.  */
static const attribute_spec aarch64_gnu_attributes[] =
{
  /* { name, min_len, max_len, decl_req, type_req, fn_type_req,
       affects_type_identity, handler, exclude } */
  { "aarch64_vector_pcs", 0, 0, false, true, true, true,
    handle_aarch64_vector_pcs_attribute, NULL },
  { "indirect_return", 0, 0, false, true, true, true, NULL, NULL },
  { "arm_sve_vector_bits", 1, 1, false, true, false, true,
    aarch64_sve::handle_arm_sve_vector_bits_attribute,
    NULL },
  { "Advanced SIMD type", 1, 1, false, true, false, true, NULL, NULL },
  { "SVE type", 3, 3, false, true, false, true, NULL, NULL },
  { "SVE sizeless type", 0, 0, false, true, false, true, NULL, NULL },
#if TARGET_DLLIMPORT_DECL_ATTRIBUTES
  { "dllimport", 0, 0, false, false, false, false, handle_dll_attribute, NULL },
  { "dllexport", 0, 0, false, false, false, false, handle_dll_attribute, NULL },
#endif
#ifdef SUBTARGET_ATTRIBUTE_TABLE
  SUBTARGET_ATTRIBUTE_TABLE
#endif
};

static const scoped_attribute_specs aarch64_gnu_attribute_table =
{
  "gnu", { aarch64_gnu_attributes }
};
static const attribute_spec aarch64_arm_attributes[] =
{
  { "streaming", 0, 0, false, true, true, true,
    NULL, attr_streaming_exclusions },
  { "streaming_compatible", 0, 0, false, true, true, true,
    NULL, attr_streaming_exclusions },
  { "locally_streaming", 0, 0, true, false, false, false, NULL, NULL },
  { "new", 1, -1, true, false, false, false,
    handle_arm_new, NULL },
  { "preserves", 1, -1, false, true, true, true,
    handle_arm_shared, NULL },
  { "in", 1, -1, false, true, true, true,
    handle_arm_shared, NULL },
  { "out", 1, -1, false, true, true, true,
    handle_arm_shared, NULL },
  { "inout", 1, -1, false, true, true, true,
    handle_arm_shared, NULL }
};

static const scoped_attribute_specs aarch64_arm_attribute_table =
{
  "arm", { aarch64_arm_attributes }
};

static const scoped_attribute_specs *const aarch64_attribute_table[] =
{
  &aarch64_gnu_attribute_table,
  &aarch64_arm_attribute_table
};
typedef enum aarch64_cond_code
{
  AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
  AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
  AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV
} aarch64_cc;

#define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
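/* For example, AARCH64_EQ (0) ^ 1 gives AARCH64_NE (1) and
   AARCH64_CS (2) ^ 1 gives AARCH64_CC (3): complementary conditions
   are adjacent in aarch64_cond_code, so flipping the low bit inverts
   the condition.  */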
/* The condition codes of the processor, and the inverse function.  */
static const char * const aarch64_condition_codes[] =
{
  "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
  "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
};

/* The preferred condition codes for SVE conditions.  */
static const char *const aarch64_sve_condition_codes[] =
{
  "none", "any", "nlast", "last", "first", "nfrst", "vs", "vc",
  "pmore", "plast", "tcont", "tstop", "gt", "le", "al", "nv"
};
/* Return the assembly token for svpattern value VALUE.  */
static const char *
svpattern_token (enum aarch64_svpattern pattern)
{
  switch (pattern)
    {
#define CASE(UPPER, LOWER, VALUE) case AARCH64_SV_##UPPER: return #LOWER;
    AARCH64_FOR_SVPATTERN (CASE)
#undef CASE
    case AARCH64_NUM_SVPATTERNS:
      break;
    }
  gcc_unreachable ();
}
/* Return the location of a piece that is known to be passed or returned
   in registers.  FIRST_ZR is the first unused vector argument register
   and FIRST_PR is the first unused predicate argument register.  */
rtx
pure_scalable_type_info::piece::get_rtx (unsigned int first_zr,
					 unsigned int first_pr) const
{
  gcc_assert (VECTOR_MODE_P (mode)
	      && first_zr + num_zr <= V0_REGNUM + NUM_FP_ARG_REGS
	      && first_pr + num_pr <= P0_REGNUM + NUM_PR_ARG_REGS);

  if (num_zr > 0 && num_pr == 0)
    return gen_rtx_REG (mode, first_zr);

  if (num_zr == 0 && num_pr > 0)
    return gen_rtx_REG (mode, first_pr);

  gcc_unreachable ();
}
/* Return the total number of vector registers required by the PST.  */
unsigned int
pure_scalable_type_info::num_zr () const
{
  unsigned int res = 0;
  for (unsigned int i = 0; i < pieces.length (); ++i)
    res += pieces[i].num_zr;
  return res;
}

/* Return the total number of predicate registers required by the PST.  */
unsigned int
pure_scalable_type_info::num_pr () const
{
  unsigned int res = 0;
  for (unsigned int i = 0; i < pieces.length (); ++i)
    res += pieces[i].num_pr;
  return res;
}
/* Return the location of a PST that is known to be passed or returned
   in registers.  FIRST_ZR is the first unused vector argument register
   and FIRST_PR is the first unused predicate argument register.  */
rtx
pure_scalable_type_info::get_rtx (machine_mode mode,
				  unsigned int first_zr,
				  unsigned int first_pr) const
{
  /* Try to return a single REG if possible.  This leads to better
     code generation; it isn't required for correctness.  */
  if (mode == pieces[0].mode)
    {
      gcc_assert (pieces.length () == 1);
      return pieces[0].get_rtx (first_zr, first_pr);
    }

  /* Build up a PARALLEL that contains the individual pieces.  */
  rtvec rtxes = rtvec_alloc (pieces.length ());
  for (unsigned int i = 0; i < pieces.length (); ++i)
    {
      rtx reg = pieces[i].get_rtx (first_zr, first_pr);
      rtx offset = gen_int_mode (pieces[i].offset, Pmode);
      RTVEC_ELT (rtxes, i) = gen_rtx_EXPR_LIST (VOIDmode, reg, offset);
      first_zr += pieces[i].num_zr;
      first_pr += pieces[i].num_pr;
    }
  return gen_rtx_PARALLEL (mode, rtxes);
}
/* Analyze whether TYPE is a Pure Scalable Type according to the rules
   in the AAPCS64.  */
pure_scalable_type_info::analysis_result
pure_scalable_type_info::analyze (const_tree type)
{
  /* Prevent accidental reuse.  */
  gcc_assert (pieces.is_empty ());

  /* No code will be generated for erroneous types, so we won't establish
     an ABI identity for them.  */
  if (type == error_mark_node)
    return NO_ABI_IDENTITY;

  /* Zero-sized types disappear in the language->ABI mapping.  */
  if (TYPE_SIZE (type) && integer_zerop (TYPE_SIZE (type)))
    return NO_ABI_IDENTITY;

  /* Check for SVTs, SPTs, and built-in tuple types that map to PSTs.  */
  piece p = {};
  if (aarch64_sve::builtin_type_p (type, &p.num_zr, &p.num_pr))
    {
      machine_mode mode = TYPE_MODE_RAW (type);
      gcc_assert (VECTOR_MODE_P (mode)
		  && (!TARGET_SVE || aarch64_sve_mode_p (mode)));
      p.mode = p.orig_mode = mode;
      add_piece (p);
      return IS_PST;
    }

  /* Check for user-defined PSTs.  */
  if (TREE_CODE (type) == ARRAY_TYPE)
    return analyze_array (type);
  if (TREE_CODE (type) == RECORD_TYPE)
    return analyze_record (type);

  return ISNT_PST;
}
/* Analyze a type that is known not to be passed or returned in memory.
   Return true if it has an ABI identity and is a Pure Scalable Type.  */
bool
pure_scalable_type_info::analyze_registers (const_tree type)
{
  analysis_result result = analyze (type);
  gcc_assert (result != DOESNT_MATTER);
  return result == IS_PST;
}
/* Subroutine of analyze for handling ARRAY_TYPEs.  */
pure_scalable_type_info::analysis_result
pure_scalable_type_info::analyze_array (const_tree type)
{
  /* Analyze the element type.  */
  pure_scalable_type_info element_info;
  analysis_result result = element_info.analyze (TREE_TYPE (type));
  if (result != IS_PST)
    return result;

  /* An array of unknown, flexible or variable length will be passed and
     returned by reference whatever we do.  */
  tree nelts_minus_one = array_type_nelts_minus_one (type);
  if (!tree_fits_uhwi_p (nelts_minus_one))
    return DOESNT_MATTER;

  /* Likewise if the array is constant-sized but too big to be interesting.
     The double checks against MAX_PIECES are to protect against overflow.  */
  unsigned HOST_WIDE_INT count = tree_to_uhwi (nelts_minus_one);
  if (count > MAX_PIECES)
    return DOESNT_MATTER;
  count += 1;
  if (count * element_info.pieces.length () > MAX_PIECES)
    return DOESNT_MATTER;

  /* The above checks should have weeded out elements of unknown size.  */
  poly_uint64 element_bytes;
  if (!poly_int_tree_p (TYPE_SIZE_UNIT (TREE_TYPE (type)), &element_bytes))
    gcc_unreachable ();

  /* Build up the list of individual vectors and predicates.  */
  gcc_assert (!element_info.pieces.is_empty ());
  for (unsigned int i = 0; i < count; ++i)
    for (unsigned int j = 0; j < element_info.pieces.length (); ++j)
      {
	piece p = element_info.pieces[j];
	p.offset += i * element_bytes;
	add_piece (p);
      }
  return IS_PST;
}
/* Subroutine of analyze for handling RECORD_TYPEs.  */
pure_scalable_type_info::analysis_result
pure_scalable_type_info::analyze_record (const_tree type)
{
  for (tree field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
    {
      if (TREE_CODE (field) != FIELD_DECL)
	continue;

      /* Zero-sized fields disappear in the language->ABI mapping.  */
      if (DECL_SIZE (field) && integer_zerop (DECL_SIZE (field)))
	continue;

      /* All fields with an ABI identity must be PSTs for the record as
	 a whole to be a PST.  If any individual field is too big to be
	 interesting then the record is too.  */
      pure_scalable_type_info field_info;
      analysis_result subresult = field_info.analyze (TREE_TYPE (field));
      if (subresult == NO_ABI_IDENTITY)
	continue;
      if (subresult != IS_PST)
	return subresult;

      /* Since all previous fields are PSTs, we ought to be able to track
	 the field offset using poly_ints.  */
      tree bitpos = bit_position (field);
      gcc_assert (poly_int_tree_p (bitpos));

      /* For the same reason, it shouldn't be possible to create a PST field
	 whose offset isn't byte-aligned.  */
      poly_widest_int wide_bytepos = exact_div (wi::to_poly_widest (bitpos),
						BITS_PER_UNIT);

      /* Punt if the record is too big to be interesting.  */
      poly_uint64 bytepos;
      if (!wide_bytepos.to_uhwi (&bytepos)
	  || pieces.length () + field_info.pieces.length () > MAX_PIECES)
	return DOESNT_MATTER;

      /* Add the individual vectors and predicates in the field to the
	 record's list.  */
      gcc_assert (!field_info.pieces.is_empty ());
      for (unsigned int i = 0; i < field_info.pieces.length (); ++i)
	{
	  piece p = field_info.pieces[i];
	  p.offset += bytepos;
	  add_piece (p);
	}
    }

  /* Empty structures disappear in the language->ABI mapping.  */
  return pieces.is_empty () ? NO_ABI_IDENTITY : IS_PST;
}
/* Add P to the list of pieces in the type.  */
void
pure_scalable_type_info::add_piece (const piece &p)
{
  /* Try to fold the new piece into the previous one to form a
     single-mode PST.  For example, if we see three consecutive vectors
     of the same mode, we can represent them using the corresponding
     array mode.

     This is purely an optimization.  */
  if (!pieces.is_empty ())
    {
      piece &prev = pieces.last ();
      gcc_assert (VECTOR_MODE_P (p.mode) && VECTOR_MODE_P (prev.mode));
      unsigned int nelems1, nelems2;
      if (prev.orig_mode == p.orig_mode
	  && GET_MODE_CLASS (p.orig_mode) != MODE_VECTOR_BOOL
	  && known_eq (prev.offset + GET_MODE_SIZE (prev.mode), p.offset)
	  && constant_multiple_p (GET_MODE_NUNITS (prev.mode),
				  GET_MODE_NUNITS (p.orig_mode), &nelems1)
	  && constant_multiple_p (GET_MODE_NUNITS (p.mode),
				  GET_MODE_NUNITS (p.orig_mode), &nelems2)
	  && targetm.array_mode (p.orig_mode,
				 nelems1 + nelems2).exists (&prev.mode))
	{
	  prev.num_zr += p.num_zr;
	  prev.num_pr += p.num_pr;
	  return;
	}
    }
  pieces.quick_push (p);
}
/* Return true if at least one possible value of type TYPE includes at
   least one object of Pure Scalable Type, in the sense of the AAPCS64.

   This is a relatively expensive test for some types, so it should
   generally be made as late as possible.  */
static bool
aarch64_some_values_include_pst_objects_p (const_tree type)
{
  if (TYPE_SIZE (type) && integer_zerop (TYPE_SIZE (type)))
    return false;

  if (aarch64_sve::builtin_type_p (type))
    return true;

  if (TREE_CODE (type) == ARRAY_TYPE || TREE_CODE (type) == COMPLEX_TYPE)
    return aarch64_some_values_include_pst_objects_p (TREE_TYPE (type));

  if (RECORD_OR_UNION_TYPE_P (type))
    for (tree field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
      if (TREE_CODE (field) == FIELD_DECL
	  && aarch64_some_values_include_pst_objects_p (TREE_TYPE (field)))
	return true;

  return false;
}
/* Return the descriptor of the SIMD ABI.  */
static const predefined_function_abi &
aarch64_simd_abi (void)
{
  predefined_function_abi &simd_abi = function_abis[ARM_PCS_SIMD];
  if (!simd_abi.initialized_p ())
    {
      HARD_REG_SET full_reg_clobbers
	= default_function_abi.full_reg_clobbers ();
      for (int regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
	if (FP_SIMD_SAVED_REGNUM_P (regno))
	  CLEAR_HARD_REG_BIT (full_reg_clobbers, regno);
      simd_abi.initialize (ARM_PCS_SIMD, full_reg_clobbers);
    }
  return simd_abi;
}
/* Return the descriptor of the SVE PCS.  */
static const predefined_function_abi &
aarch64_sve_abi (void)
{
  predefined_function_abi &sve_abi = function_abis[ARM_PCS_SVE];
  if (!sve_abi.initialized_p ())
    {
      HARD_REG_SET full_reg_clobbers
	= default_function_abi.full_reg_clobbers ();
      for (int regno = V8_REGNUM; regno <= V23_REGNUM; ++regno)
	CLEAR_HARD_REG_BIT (full_reg_clobbers, regno);
      for (int regno = P4_REGNUM; regno <= P15_REGNUM; ++regno)
	CLEAR_HARD_REG_BIT (full_reg_clobbers, regno);
      sve_abi.initialize (ARM_PCS_SVE, full_reg_clobbers);
    }
  return sve_abi;
}
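/* Summarizing the loops above (together with general AAPCS64 rules):
   under the SVE PCS z8-z23 and p4-p15 are treated as call-preserved,
   whereas the base PCS only preserves the low 64 bits of v8-v15.  */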
/* If X is an UNSPEC_SALT_ADDR expression, return the address that it
   wraps, otherwise return X itself.  */
static rtx
strip_salt (rtx x)
{
  rtx search = x;
  if (GET_CODE (search) == CONST)
    search = XEXP (search, 0);
  if (GET_CODE (search) == UNSPEC && XINT (search, 1) == UNSPEC_SALT_ADDR)
    x = XVECEXP (search, 0, 0);
  return x;
}

/* Like strip_offset, but also strip any UNSPEC_SALT_ADDR from the
   resulting address.  */
static rtx
strip_offset_and_salt (rtx addr, poly_int64 *offset)
{
  return strip_salt (strip_offset (addr, offset));
}
/* Generate code to enable conditional branches in functions over 1 MiB.  */
const char *
aarch64_gen_far_branch (rtx * operands, int pos_label, const char * dest,
			const char * branch_format)
{
  rtx_code_label * tmp_label = gen_label_rtx ();
  char label_buf[256];
  char buffer[256];
  ASM_GENERATE_INTERNAL_LABEL (label_buf, dest,
			       CODE_LABEL_NUMBER (tmp_label));
  const char *label_ptr = targetm.strip_name_encoding (label_buf);
  rtx dest_label = operands[pos_label];
  operands[pos_label] = tmp_label;

  snprintf (buffer, sizeof (buffer), "%s%s", branch_format, label_ptr);
  output_asm_insn (buffer, operands);

  snprintf (buffer, sizeof (buffer), "b\t%%l%d\n%s:", pos_label, label_ptr);
  operands[pos_label] = dest_label;
  output_asm_insn (buffer, operands);
  return "";
}
void
aarch64_err_no_fpadvsimd (machine_mode mode)
{
  if (TARGET_GENERAL_REGS_ONLY)
    if (FLOAT_MODE_P (mode))
      error ("%qs is incompatible with the use of floating-point types",
	     "-mgeneral-regs-only");
    else
      error ("%qs is incompatible with the use of vector types",
	     "-mgeneral-regs-only");
  else
    if (FLOAT_MODE_P (mode))
      error ("%qs feature modifier is incompatible with the use of"
	     " floating-point types", "+nofp");
    else
      error ("%qs feature modifier is incompatible with the use of"
	     " vector types", "+nofp");
}
/* Report when we try to do something that requires SVE when SVE is disabled.
   This is an error of last resort and isn't very high-quality.  It usually
   involves attempts to measure the vector length in some way.  */
void
aarch64_report_sve_required (void)
{
  static bool reported_p = false;

  /* Avoid reporting a slew of messages for a single oversight.  */
  if (reported_p)
    return;

  error ("this operation requires the SVE ISA extension");
  inform (input_location, "you can enable SVE using the command-line"
	  " option %<-march%>, or by using the %<target%>"
	  " attribute or pragma");
  reported_p = true;
}
/* Return true if REGNO is P0-P15 or one of the special FFR-related
   registers.  */
static bool
pr_or_ffr_regnum_p (unsigned int regno)
{
  return PR_REGNUM_P (regno) || regno == FFR_REGNUM || regno == FFRT_REGNUM;
}
/* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS.
   The register allocator chooses POINTER_AND_FP_REGS if FP_REGS and
   GENERAL_REGS have the same cost - even if POINTER_AND_FP_REGS has a much
   higher cost.  POINTER_AND_FP_REGS is also used if the cost of both FP_REGS
   and GENERAL_REGS is lower than the memory cost (in this case the best class
   is the lowest cost one).  Using POINTER_AND_FP_REGS irrespectively of its
   cost results in bad allocations with many redundant int<->FP moves which
   are expensive on various cores.
   To avoid this we don't allow POINTER_AND_FP_REGS as the allocno class, but
   force a decision between FP_REGS and GENERAL_REGS.  We use the allocno class
   if it isn't POINTER_AND_FP_REGS.  Similarly, use the best class if it isn't
   POINTER_AND_FP_REGS.  Otherwise set the allocno class depending on the mode.
   The result of this is that it is no longer inefficient to have a higher
   memory move cost than the register move cost.  */
static reg_class_t
aarch64_ira_change_pseudo_allocno_class (int regno, reg_class_t allocno_class,
					 reg_class_t best_class)
{
  machine_mode mode;

  if (!reg_class_subset_p (GENERAL_REGS, allocno_class)
      || !reg_class_subset_p (FP_REGS, allocno_class))
    return allocno_class;

  if (!reg_class_subset_p (GENERAL_REGS, best_class)
      || !reg_class_subset_p (FP_REGS, best_class))
    return best_class;

  mode = PSEUDO_REGNO_MODE (regno);
  return FLOAT_MODE_P (mode) || VECTOR_MODE_P (mode) ? FP_REGS : GENERAL_REGS;
}
static unsigned int
aarch64_min_divisions_for_recip_mul (machine_mode mode)
{
  if (GET_MODE_UNIT_SIZE (mode) == 4)
    return aarch64_tune_params.min_div_recip_mul_sf;
  return aarch64_tune_params.min_div_recip_mul_df;
}
/* Return the reassociation width of treeop OPC with mode MODE.  */
static int
aarch64_reassociation_width (unsigned opc, machine_mode mode)
{
  if (VECTOR_MODE_P (mode))
    return aarch64_tune_params.vec_reassoc_width;
  if (INTEGRAL_MODE_P (mode))
    return aarch64_tune_params.int_reassoc_width;
  /* Reassociation reduces the number of FMAs which may result in worse
     performance.  Use a per-CPU setting for FMA reassociation which allows
     narrow CPUs with few FP pipes to switch it off (value of 1), and wider
     CPUs with many FP pipes to enable reassociation.
     Since the reassociation pass doesn't understand FMA at all, assume
     that any FP addition might turn into FMA.  */
  if (FLOAT_MODE_P (mode))
    return opc == PLUS_EXPR ? aarch64_tune_params.fma_reassoc_width
			    : aarch64_tune_params.fp_reassoc_width;
  return 1;
}
/* Provide a mapping from gcc register numbers to dwarf register numbers.  */
unsigned
aarch64_debugger_regno (unsigned regno)
{
  if (GP_REGNUM_P (regno))
    return AARCH64_DWARF_R0 + regno - R0_REGNUM;
  else if (regno == SP_REGNUM)
    return AARCH64_DWARF_SP;
  else if (FP_REGNUM_P (regno))
    return AARCH64_DWARF_V0 + regno - V0_REGNUM;
  else if (PR_REGNUM_P (regno))
    return AARCH64_DWARF_P0 + regno - P0_REGNUM;
  else if (regno == VG_REGNUM)
    return AARCH64_DWARF_VG;

  /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
     equivalent DWARF register.  */
  return DWARF_FRAME_REGISTERS;
}
/* Implement TARGET_DWARF_FRAME_REG_MODE.  */
static machine_mode
aarch64_dwarf_frame_reg_mode (int regno)
{
  /* Predicate registers are call-clobbered in the EH ABI (which is
     ARM_PCS_AAPCS64), so they should not be described by CFI.
     Their size changes as VL changes, so any values computed by
     __builtin_init_dwarf_reg_size_table might not be valid for
     all frames.  */
  if (PR_REGNUM_P (regno))
    return VOIDmode;
  return default_dwarf_frame_reg_mode (regno);
}
/* Implement TARGET_OUTPUT_CFI_DIRECTIVE.  */
static bool
aarch64_output_cfi_directive (FILE *f, dw_cfi_ref cfi)
{
  if (cfi->dw_cfi_opc == DW_CFA_AARCH64_negate_ra_state)
    {
      fprintf (f, "\t.cfi_negate_ra_state\n");
      return true;
    }
  return false;
}
/* Implement TARGET_DW_CFI_OPRND1_DESC.  */
static bool
aarch64_dw_cfi_oprnd1_desc (dwarf_call_frame_info cfi_opc,
			    dw_cfi_oprnd_type &oprnd_type)
{
  if (cfi_opc == DW_CFA_AARCH64_negate_ra_state)
    {
      oprnd_type = dw_cfi_oprnd_unused;
      return true;
    }
  return false;
}
/* If X is a CONST_DOUBLE, return its bit representation as a constant
   integer, otherwise return X unmodified.  */
static rtx
aarch64_bit_representation (rtx x)
{
  if (CONST_DOUBLE_P (x))
    x = gen_lowpart (int_mode_for_mode (GET_MODE (x)).require (), x);
  return x;
}
/* Return an estimate for the number of quadwords in an SVE vector.  This is
   equivalent to the number of Advanced SIMD vectors in an SVE vector.  */
static unsigned int
aarch64_estimated_sve_vq ()
{
  return estimated_poly_value (BITS_PER_SVE_VECTOR) / 128;
}
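/* For example, when tuning assumes 256-bit SVE vectors,
   estimated_poly_value (BITS_PER_SVE_VECTOR) is 256 and the estimated
   VQ is 256 / 128 = 2.  */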
/* Return true if MODE is an SVE predicate mode.  */
static bool
aarch64_sve_pred_mode_p (machine_mode mode)
{
  return (TARGET_SVE
	  && (mode == VNx16BImode
	      || mode == VNx8BImode
	      || mode == VNx4BImode
	      || mode == VNx2BImode));
}
/* Three mutually-exclusive flags describing a vector or predicate type.  */
const unsigned int VEC_ADVSIMD  = 1;
const unsigned int VEC_SVE_DATA = 2;
const unsigned int VEC_SVE_PRED = 4;
/* Indicates a structure of 2, 3 or 4 vectors or predicates.  */
const unsigned int VEC_STRUCT   = 8;
/* Can be used in combination with VEC_SVE_DATA to indicate that the
   vector has fewer significant bytes than a full SVE vector.  */
const unsigned int VEC_PARTIAL  = 16;
/* Useful combinations of the above.  */
const unsigned int VEC_ANY_SVE  = VEC_SVE_DATA | VEC_SVE_PRED;
const unsigned int VEC_ANY_DATA = VEC_ADVSIMD | VEC_SVE_DATA;
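/* Illustrative classifications (assuming the relevant target features
   are enabled): V4SImode is VEC_ADVSIMD, VNx4SImode is VEC_SVE_DATA,
   VNx2SImode is VEC_SVE_DATA | VEC_PARTIAL (32-bit elements stored in
   64-bit containers) and VNx4BImode is VEC_SVE_PRED.  */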
/* Return a set of flags describing the vector properties of mode MODE.
   If ANY_TARGET_P is false (the default), ignore modes that are not supported
   by the current target.  Otherwise categorize the modes that can be used
   with the set of all targets supported by the port.  */
static unsigned int
aarch64_classify_vector_mode (machine_mode mode, bool any_target_p = false)
{
  if (aarch64_sve_pred_mode_p (mode))
    return VEC_SVE_PRED;

  /* Make the decision based on the mode's enum value rather than its
     properties, so that we keep the correct classification regardless
     of -msve-vector-bits.  */
  switch (mode)
    {
    /* Partial SVE QI vectors.  */
    /* Partial SVE HI vectors.  */
    /* Partial SVE SI vector.  */
    /* Partial SVE HF vectors.  */
    /* Partial SVE BF vectors.  */
    /* Partial SVE SF vector.  */
      return (TARGET_SVE || any_target_p) ? VEC_SVE_DATA | VEC_PARTIAL : 0;

      return (TARGET_SVE || any_target_p) ? VEC_SVE_DATA : 0;

    /* x2 SVE vectors.  */
    /* x3 SVE vectors.  */
    /* x4 SVE vectors.  */
      return (TARGET_SVE || any_target_p) ? VEC_SVE_DATA | VEC_STRUCT : 0;

      return (TARGET_FLOAT || any_target_p) ? VEC_ADVSIMD | VEC_STRUCT : 0;

    /* Structures of 64-bit Advanced SIMD vectors.  */
      return (TARGET_FLOAT || any_target_p)
	     ? VEC_ADVSIMD | VEC_STRUCT | VEC_PARTIAL : 0;

    /* Structures of 128-bit Advanced SIMD vectors.  */
      return (TARGET_FLOAT || any_target_p) ? VEC_ADVSIMD | VEC_STRUCT : 0;

    /* 64-bit Advanced SIMD vectors.  */
    /* 128-bit Advanced SIMD vectors.  */
      return (TARGET_FLOAT || any_target_p) ? VEC_ADVSIMD : 0;

      return TARGET_SVE ? VEC_SVE_PRED | VEC_STRUCT : 0;

    default:
      return 0;
    }
}
/* Like aarch64_classify_vector_mode, but also include modes that are used
   for memory operands but not register operands.  Such modes do not count
   as real vector modes; they are just an internal construct to make things
   easier to describe.  */
static unsigned int
aarch64_classify_vector_memory_mode (machine_mode mode)
{
  switch (mode)
    {
      return TARGET_SVE ? VEC_SVE_DATA | VEC_PARTIAL : 0;

      return TARGET_SVE ? VEC_SVE_DATA : 0;

      return TARGET_SVE ? VEC_SVE_DATA | VEC_STRUCT : 0;

    default:
      return aarch64_classify_vector_mode (mode);
    }
}
/* Return true if MODE is any of the Advanced SIMD structure modes.  */
bool
aarch64_advsimd_struct_mode_p (machine_mode mode)
{
  unsigned int vec_flags = aarch64_classify_vector_mode (mode);
  return (vec_flags & VEC_ADVSIMD) && (vec_flags & VEC_STRUCT);
}

/* Return true if MODE is an Advanced SIMD D-register structure mode.  */
bool
aarch64_advsimd_partial_struct_mode_p (machine_mode mode)
{
  return (aarch64_classify_vector_mode (mode)
	  == (VEC_ADVSIMD | VEC_STRUCT | VEC_PARTIAL));
}

/* Return true if MODE is an Advanced SIMD Q-register structure mode.  */
bool
aarch64_advsimd_full_struct_mode_p (machine_mode mode)
{
  return (aarch64_classify_vector_mode (mode) == (VEC_ADVSIMD | VEC_STRUCT));
}
/* Return true if MODE is any of the data vector modes, including
   structure modes.  */
static bool
aarch64_vector_data_mode_p (machine_mode mode)
{
  return aarch64_classify_vector_mode (mode) & VEC_ANY_DATA;
}

/* Return true if MODE is any form of SVE mode, including predicates,
   vectors and structures.  */
bool
aarch64_sve_mode_p (machine_mode mode)
{
  return aarch64_classify_vector_mode (mode) & VEC_ANY_SVE;
}

/* Return true if MODE is an SVE data vector mode; either a single vector
   or a structure of vectors.  */
static bool
aarch64_sve_data_mode_p (machine_mode mode)
{
  return aarch64_classify_vector_mode (mode) & VEC_SVE_DATA;
}
/* Return the number of defined bytes in one constituent vector of
   SVE mode MODE, which has vector flags VEC_FLAGS.  */
static poly_int64
aarch64_vl_bytes (machine_mode mode, unsigned int vec_flags)
{
  if (vec_flags & VEC_PARTIAL)
    /* A single partial vector.  */
    return GET_MODE_SIZE (mode);

  if (vec_flags & VEC_SVE_DATA)
    /* A single vector or a tuple.  */
    return BYTES_PER_SVE_VECTOR;

  /* A single predicate.  */
  gcc_assert (vec_flags & VEC_SVE_PRED);
  return BYTES_PER_SVE_PRED;
}
/* If MODE holds an array of vectors, return the number of vectors
   in the array, otherwise return 1.  */
static unsigned int
aarch64_ldn_stn_vectors (machine_mode mode)
{
  unsigned int vec_flags = aarch64_classify_vector_mode (mode);
  if (vec_flags == (VEC_ADVSIMD | VEC_PARTIAL | VEC_STRUCT))
    return exact_div (GET_MODE_SIZE (mode), 8).to_constant ();
  if (vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
    return exact_div (GET_MODE_SIZE (mode), 16).to_constant ();
  if (vec_flags == (VEC_SVE_DATA | VEC_STRUCT))
    return exact_div (GET_MODE_SIZE (mode),
		      BYTES_PER_SVE_VECTOR).to_constant ();
  return 1;
}
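/* For example, the 48-byte structure mode used by an LD3/ST3 of three
   128-bit Advanced SIMD vectors yields 48 / 16 = 3, and an SVE
   structure mode spanning two SVE vectors yields 2.  */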
/* Given an Advanced SIMD vector mode MODE and a tuple size NELEMS, return the
   corresponding vector structure mode.  */
static opt_machine_mode
aarch64_advsimd_vector_array_mode (machine_mode mode,
				   unsigned HOST_WIDE_INT nelems)
{
  unsigned int flags = VEC_ADVSIMD | VEC_STRUCT;
  if (known_eq (GET_MODE_SIZE (mode), 8))
    flags |= VEC_PARTIAL;

  machine_mode struct_mode;
  FOR_EACH_MODE_IN_CLASS (struct_mode, GET_MODE_CLASS (mode))
    if (aarch64_classify_vector_mode (struct_mode) == flags
	&& GET_MODE_INNER (struct_mode) == GET_MODE_INNER (mode)
	&& known_eq (GET_MODE_NUNITS (struct_mode),
		     GET_MODE_NUNITS (mode) * nelems))
      return struct_mode;
  return opt_machine_mode ();
}
/* Return the SVE vector mode that has NUNITS elements of mode INNER_MODE.  */
opt_machine_mode
aarch64_sve_data_mode (scalar_mode inner_mode, poly_uint64 nunits)
{
  enum mode_class mclass = (is_a<scalar_float_mode> (inner_mode)
			    ? MODE_VECTOR_FLOAT : MODE_VECTOR_INT);
  machine_mode mode;
  FOR_EACH_MODE_IN_CLASS (mode, mclass)
    if (inner_mode == GET_MODE_INNER (mode)
	&& known_eq (nunits, GET_MODE_NUNITS (mode))
	&& aarch64_sve_data_mode_p (mode))
      return mode;
  return opt_machine_mode ();
}
/* Implement target hook TARGET_ARRAY_MODE.  */
static opt_machine_mode
aarch64_array_mode (machine_mode mode, unsigned HOST_WIDE_INT nelems)
{
  if (TARGET_SVE && GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL)
    {
      /* Use VNx32BI and VNx64BI for tuples of predicates, but explicitly
	 reject giving a mode to other array sizes.  Using integer modes
	 requires a round trip through memory and generates terrible code.  */
      if (mode == VNx16BImode && nelems == 2)
	return VNx32BImode;
      if (mode == VNx16BImode && nelems == 4)
	return VNx64BImode;
      return opt_machine_mode ();
    }

  auto flags = aarch64_classify_vector_mode (mode);
  if (flags == VEC_SVE_DATA && IN_RANGE (nelems, 2, 4))
    return aarch64_sve_data_mode (GET_MODE_INNER (mode),
				  GET_MODE_NUNITS (mode) * nelems);

  if (flags == VEC_ADVSIMD && IN_RANGE (nelems, 2, 4))
    return aarch64_advsimd_vector_array_mode (mode, nelems);

  return opt_machine_mode ();
}
/* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P.  */
static bool
aarch64_array_mode_supported_p (machine_mode mode,
				unsigned HOST_WIDE_INT nelems)
{
  if (TARGET_BASE_SIMD
      && (AARCH64_VALID_SIMD_QREG_MODE (mode)
	  || AARCH64_VALID_SIMD_DREG_MODE (mode))
      && (nelems >= 2 && nelems <= 4))
    return true;

  return false;
}
/* MODE is some form of SVE vector mode.  For data modes, return the number
   of vector register bits that each element of MODE occupies, such as 64
   for both VNx2DImode and VNx2SImode (where each 32-bit value is stored
   in a 64-bit container).  For predicate modes, return the number of
   data bits controlled by each significant predicate bit.  */
static unsigned int
aarch64_sve_container_bits (machine_mode mode)
{
  unsigned int vec_flags = aarch64_classify_vector_mode (mode);
  poly_uint64 vector_bits = (vec_flags & (VEC_PARTIAL | VEC_SVE_PRED)
			     ? BITS_PER_SVE_VECTOR
			     : GET_MODE_BITSIZE (mode));
  return vector_element_size (vector_bits, GET_MODE_NUNITS (mode));
}
/* Return the SVE predicate mode to use for elements that have
   ELEM_NBYTES bytes, if such a mode exists.  */
opt_machine_mode
aarch64_sve_pred_mode (unsigned int elem_nbytes)
{
  if (elem_nbytes == 1)
    return VNx16BImode;
  if (elem_nbytes == 2)
    return VNx8BImode;
  if (elem_nbytes == 4)
    return VNx4BImode;
  if (elem_nbytes == 8)
    return VNx2BImode;
  return opt_machine_mode ();
}
/* Return the SVE predicate mode that should be used to control
   MODE.  */
machine_mode
aarch64_sve_pred_mode (machine_mode mode)
{
  unsigned int bits = aarch64_sve_container_bits (mode);
  return aarch64_sve_pred_mode (bits / BITS_PER_UNIT).require ();
}
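/* For example, VNx2DImode and VNx2SImode both map to VNx2BImode here,
   since each of their elements occupies a 64-bit container.  */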
/* Implement TARGET_VECTORIZE_GET_MASK_MODE.  */
static opt_machine_mode
aarch64_get_mask_mode (machine_mode mode)
{
  unsigned int vec_flags = aarch64_classify_vector_mode (mode);
  if (vec_flags & VEC_SVE_DATA)
    return aarch64_sve_pred_mode (mode);

  return default_get_mask_mode (mode);
}
/* Return the integer element mode associated with SVE mode MODE.  */
static scalar_int_mode
aarch64_sve_element_int_mode (machine_mode mode)
{
  poly_uint64 vector_bits = (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL
			     ? BITS_PER_SVE_VECTOR
			     : GET_MODE_BITSIZE (mode));
  unsigned int elt_bits = vector_element_size (vector_bits,
					       GET_MODE_NUNITS (mode));
  return int_mode_for_size (elt_bits, 0).require ();
}
/* Return an integer element mode that contains exactly
   aarch64_sve_container_bits (MODE) bits.  This is wider than
   aarch64_sve_element_int_mode if MODE is a partial vector,
   otherwise it's the same.  */
static scalar_int_mode
aarch64_sve_container_int_mode (machine_mode mode)
{
  return int_mode_for_size (aarch64_sve_container_bits (mode), 0).require ();
}
/* Return the integer vector mode associated with SVE mode MODE.
   Unlike related_int_vector_mode, this can handle the case in which
   MODE is a predicate (and thus has a different total size).  */
machine_mode
aarch64_sve_int_mode (machine_mode mode)
{
  scalar_int_mode int_mode = aarch64_sve_element_int_mode (mode);
  return aarch64_sve_data_mode (int_mode, GET_MODE_NUNITS (mode)).require ();
}
/* Look for a vector mode with the same classification as VEC_MODE,
   but with each group of FACTOR elements coalesced into a single element.
   In other words, look for a mode in which the elements are FACTOR times
   larger and in which the number of elements is FACTOR times smaller.

   Return the mode found, if one exists.  */
static opt_machine_mode
aarch64_coalesce_units (machine_mode vec_mode, unsigned int factor)
{
  auto elt_bits = vector_element_size (GET_MODE_BITSIZE (vec_mode),
				       GET_MODE_NUNITS (vec_mode));
  auto vec_flags = aarch64_classify_vector_mode (vec_mode);
  if (vec_flags & VEC_SVE_PRED)
    {
      if (known_eq (GET_MODE_SIZE (vec_mode), BYTES_PER_SVE_PRED))
	return aarch64_sve_pred_mode (elt_bits * factor);
      return opt_machine_mode ();
    }

  scalar_mode new_elt_mode;
  if (!int_mode_for_size (elt_bits * factor, false).exists (&new_elt_mode))
    return opt_machine_mode ();

  if (vec_flags == VEC_ADVSIMD)
    {
      auto mode = aarch64_simd_container_mode (new_elt_mode,
					       GET_MODE_BITSIZE (vec_mode));
      if (mode != word_mode)
	return mode;
    }
  else if (vec_flags & VEC_SVE_DATA)
    {
      poly_uint64 new_nunits;
      if (multiple_p (GET_MODE_NUNITS (vec_mode), factor, &new_nunits))
	return aarch64_sve_data_mode (new_elt_mode, new_nunits);
    }

  return opt_machine_mode ();
}

/* Implement TARGET_VECTORIZE_RELATED_MODE.  */

static opt_machine_mode
aarch64_vectorize_related_mode (machine_mode vector_mode,
                                scalar_mode element_mode,
                                poly_uint64 nunits)
{
  unsigned int vec_flags = aarch64_classify_vector_mode (vector_mode);

  /* If we're operating on SVE vectors, try to return an SVE mode.  */
  poly_uint64 sve_nunits;
  if ((vec_flags & VEC_SVE_DATA)
      && multiple_p (BYTES_PER_SVE_VECTOR,
                     GET_MODE_SIZE (element_mode), &sve_nunits))
    {
      machine_mode sve_mode;
      if (maybe_ne (nunits, 0U))
        {
          /* Try to find a full or partial SVE mode with exactly
             NUNITS units.  */
          if (multiple_p (sve_nunits, nunits)
              && aarch64_sve_data_mode (element_mode,
                                        nunits).exists (&sve_mode))
            return sve_mode;
        }
      else
        {
          /* Take the preferred number of units from the number of bytes
             that fit in VECTOR_MODE.  We always start by "autodetecting"
             a full vector mode with preferred_simd_mode, so vectors
             chosen here will also be full vector modes.  Then
             autovectorize_vector_modes tries smaller starting modes
             and thus smaller preferred numbers of units.  */
          sve_nunits = ordered_min (sve_nunits, GET_MODE_SIZE (vector_mode));
          if (aarch64_sve_data_mode (element_mode,
                                     sve_nunits).exists (&sve_mode))
            return sve_mode;
        }
    }

  /* Prefer to use 1 128-bit vector instead of 2 64-bit vectors.  */
  if (TARGET_SIMD
      && (vec_flags & VEC_ADVSIMD)
      && known_eq (nunits, 0U)
      && known_eq (GET_MODE_BITSIZE (vector_mode), 64U)
      && maybe_ge (GET_MODE_BITSIZE (element_mode)
                   * GET_MODE_NUNITS (vector_mode), 128U))
    {
      machine_mode res = aarch64_simd_container_mode (element_mode, 128);
      if (VECTOR_MODE_P (res))
        return res;
    }

  return default_vectorize_related_mode (vector_mode, element_mode, nunits);
}

/* Implement TARGET_VECTORIZE_PREFERRED_DIV_AS_SHIFTS_OVER_MULT.  */

static bool
aarch64_vectorize_preferred_div_as_shifts_over_mult (const_tree type)
{
  machine_mode mode = TYPE_MODE (type);
  unsigned int vec_flags = aarch64_classify_vector_mode (mode);
  bool sve_p = (vec_flags & VEC_ANY_SVE);
  bool simd_p = (vec_flags & VEC_ADVSIMD);

  return (sve_p && TARGET_SVE2) || (simd_p && TARGET_SIMD);
}

/* Implement TARGET_PREFERRED_ELSE_VALUE.  For binary operations,
   prefer to use the first arithmetic operand as the else value if
   the else value doesn't matter, since that exactly matches the SVE
   destructive merging form.  For ternary operations we could either
   pick the first operand and use FMAD-like instructions or the last
   operand and use FMLA-like instructions; the latter seems more
   natural.  */

static tree
aarch64_preferred_else_value (unsigned, tree, unsigned int nops, tree *ops)
{
  return nops == 3 ? ops[2] : ops[0];
}

/* Implement TARGET_HARD_REGNO_NREGS.  */

static unsigned int
aarch64_hard_regno_nregs (unsigned regno, machine_mode mode)
{
  /* ??? Logically we should only need to provide a value when
     HARD_REGNO_MODE_OK says that the combination is valid,
     but at the moment we need to handle all modes.  Just ignore
     any runtime parts for registers that can't store them.  */
  HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
  switch (aarch64_regno_regclass (regno))
    {
    case FP_REGS:
    case FP_LO_REGS:
    case FP_LO8_REGS:
      {
        unsigned int vec_flags = aarch64_classify_vector_mode (mode);
        if (vec_flags & VEC_SVE_DATA)
          return exact_div (GET_MODE_SIZE (mode),
                            aarch64_vl_bytes (mode, vec_flags)).to_constant ();
        if (vec_flags == (VEC_ADVSIMD | VEC_STRUCT | VEC_PARTIAL))
          return GET_MODE_SIZE (mode).to_constant () / 8;
        return CEIL (lowest_size, UNITS_PER_VREG);
      }

    case PR_REGS:
    case PR_LO_REGS:
    case PR_HI_REGS:
      return mode == VNx64BImode ? 4 : mode == VNx32BImode ? 2 : 1;

    case MOVEABLE_SYSREGS:
    case FFR_REGS:
    case PR_AND_FFR_REGS:
    case FAKE_REGS:
      return 1;

    default:
      return CEIL (lowest_size, UNITS_PER_WORD);
    }
  gcc_unreachable ();
}
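
/* Illustrative examples: an FP register holding the SVE tuple mode
   VNx8SImode (two vectors of 32-bit elements) needs
   GET_MODE_SIZE / aarch64_vl_bytes == 2 registers, and a general
   register holding TImode needs CEIL (16, UNITS_PER_WORD) == 2
   registers.  */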

/* Implement TARGET_HARD_REGNO_MODE_OK.  */

static bool
aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode)
{
  if (mode == V8DImode)
    return IN_RANGE (regno, R0_REGNUM, R23_REGNUM)
           && multiple_p (regno - R0_REGNUM, 2);

  if (GET_MODE_CLASS (mode) == MODE_CC)
    return regno == CC_REGNUM;

  if (regno == VG_REGNUM)
    /* This must have the same size as _Unwind_Word.  */
    return mode == DImode;

  if (regno == FPM_REGNUM)
    return mode == QImode || mode == HImode || mode == SImode || mode == DImode;

  unsigned int vec_flags = aarch64_classify_vector_mode (mode);
  if (vec_flags == VEC_SVE_PRED)
    return pr_or_ffr_regnum_p (regno);

  if (vec_flags == (VEC_SVE_PRED | VEC_STRUCT))
    return PR_REGNUM_P (regno);

  if (pr_or_ffr_regnum_p (regno))
    return false;

  /* These registers are abstract; their modes don't matter.  */
  if (FAKE_REGNUM_P (regno))
    return true;

  if (regno == SP_REGNUM)
    /* The purpose of comparing with ptr_mode is to support the
       global register variable associated with the stack pointer
       register via the syntax of asm ("wsp") in ILP32.  */
    return mode == Pmode || mode == ptr_mode;

  if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM)
    return mode == Pmode;

  if (GP_REGNUM_P (regno))
    {
      if (vec_flags & (VEC_ANY_SVE | VEC_STRUCT))
        return false;
      if (known_le (GET_MODE_SIZE (mode), 8))
        return true;
      if (known_le (GET_MODE_SIZE (mode), 16))
        return (regno & 1) == 0;
    }
  else if (FP_REGNUM_P (regno))
    {
      if (vec_flags & VEC_STRUCT)
        return end_hard_regno (mode, regno) - 1 <= V31_REGNUM;
      else
        return !VECTOR_MODE_P (mode) || vec_flags != 0;
    }

  return false;
}

/* Return true if a function with type FNTYPE returns its value in
   SVE vector or predicate registers.  */

static bool
aarch64_returns_value_in_sve_regs_p (const_tree fntype)
{
  tree return_type = TREE_TYPE (fntype);

  pure_scalable_type_info pst_info;
  switch (pst_info.analyze (return_type))
    {
    case pure_scalable_type_info::IS_PST:
      return (pst_info.num_zr () <= NUM_FP_ARG_REGS
              && pst_info.num_pr () <= NUM_PR_ARG_REGS);

    case pure_scalable_type_info::DOESNT_MATTER:
      gcc_assert (aarch64_return_in_memory_1 (return_type));
      return false;

    case pure_scalable_type_info::NO_ABI_IDENTITY:
    case pure_scalable_type_info::ISNT_PST:
      return false;
    }
  gcc_unreachable ();
}

/* Return true if a function with type FNTYPE takes arguments in
   SVE vector or predicate registers.  */

static bool
aarch64_takes_arguments_in_sve_regs_p (const_tree fntype)
{
  CUMULATIVE_ARGS args_so_far_v;
  aarch64_init_cumulative_args (&args_so_far_v, NULL_TREE, NULL_RTX,
                                NULL_TREE, 0, true);
  cumulative_args_t args_so_far = pack_cumulative_args (&args_so_far_v);

  for (tree chain = TYPE_ARG_TYPES (fntype);
       chain && chain != void_list_node;
       chain = TREE_CHAIN (chain))
    {
      tree arg_type = TREE_VALUE (chain);
      if (arg_type == error_mark_node)
        return false;

      function_arg_info arg (arg_type, /*named=*/true);
      apply_pass_by_reference_rules (&args_so_far_v, arg);
      pure_scalable_type_info pst_info;
      if (pst_info.analyze_registers (arg.type))
        {
          unsigned int end_zr = args_so_far_v.aapcs_nvrn + pst_info.num_zr ();
          unsigned int end_pr = args_so_far_v.aapcs_nprn + pst_info.num_pr ();
          gcc_assert (end_zr <= NUM_FP_ARG_REGS && end_pr <= NUM_PR_ARG_REGS);
          return true;
        }

      targetm.calls.function_arg_advance (args_so_far, arg);
    }
  return false;
}

/* Implement TARGET_FNTYPE_ABI.  */

static const predefined_function_abi &
aarch64_fntype_abi (const_tree fntype)
{
  if (lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (fntype)))
    return aarch64_simd_abi ();

  if (aarch64_returns_value_in_sve_regs_p (fntype)
      || aarch64_takes_arguments_in_sve_regs_p (fntype))
    return aarch64_sve_abi ();

  return default_function_abi;
}

/* Return the state of PSTATE.SM on entry to functions of type FNTYPE.  */

static aarch64_isa_mode
aarch64_fntype_pstate_sm (const_tree fntype)
{
  if (lookup_attribute ("arm", "streaming", TYPE_ATTRIBUTES (fntype)))
    return AARCH64_ISA_MODE_SM_ON;

  if (lookup_attribute ("arm", "streaming_compatible",
                        TYPE_ATTRIBUTES (fntype)))
    return 0;

  return AARCH64_ISA_MODE_SM_OFF;
}

/* Return state flags that describe whether and how functions of type
   FNTYPE share state STATE_NAME with their callers.  */

static unsigned int
aarch64_fntype_shared_flags (const_tree fntype, const char *state_name)
{
  return aarch64_lookup_shared_state_flags (TYPE_ATTRIBUTES (fntype),
                                            state_name);
}

/* Return the state of PSTATE.ZA on entry to functions of type FNTYPE.  */

static aarch64_isa_mode
aarch64_fntype_pstate_za (const_tree fntype)
{
  if (aarch64_fntype_shared_flags (fntype, "za")
      || aarch64_fntype_shared_flags (fntype, "zt0"))
    return AARCH64_ISA_MODE_ZA_ON;

  return 0;
}

/* Return the ISA mode on entry to functions of type FNTYPE.  */

static aarch64_isa_mode
aarch64_fntype_isa_mode (const_tree fntype)
{
  return (aarch64_fntype_pstate_sm (fntype)
          | aarch64_fntype_pstate_za (fntype));
}

/* Return true if FNDECL uses streaming mode internally, as an
   implementation choice.  */

static bool
aarch64_fndecl_is_locally_streaming (const_tree fndecl)
{
  return lookup_attribute ("arm", "locally_streaming",
                           DECL_ATTRIBUTES (fndecl));
}

/* Return the state of PSTATE.SM when compiling the body of
   function FNDECL.  This might be different from the state of
   PSTATE.SM on entry.  */

static aarch64_isa_mode
aarch64_fndecl_pstate_sm (const_tree fndecl)
{
  if (aarch64_fndecl_is_locally_streaming (fndecl))
    return AARCH64_ISA_MODE_SM_ON;

  return aarch64_fntype_pstate_sm (TREE_TYPE (fndecl));
}

/* Return true if function FNDECL has state STATE_NAME, either by creating
   new state itself or by sharing state with callers.  */

static bool
aarch64_fndecl_has_state (tree fndecl, const char *state_name)
{
  return (aarch64_fndecl_has_new_state (fndecl, state_name)
          || aarch64_fntype_shared_flags (TREE_TYPE (fndecl),
                                          state_name) != 0);
}

/* Return the state of PSTATE.ZA when compiling the body of function FNDECL.
   This might be different from the state of PSTATE.ZA on entry.  */

static aarch64_isa_mode
aarch64_fndecl_pstate_za (const_tree fndecl)
{
  if (aarch64_fndecl_has_new_state (fndecl, "za")
      || aarch64_fndecl_has_new_state (fndecl, "zt0"))
    return AARCH64_ISA_MODE_ZA_ON;

  return aarch64_fntype_pstate_za (TREE_TYPE (fndecl));
}

/* Return the ISA mode that should be used to compile the body of
   function FNDECL.  */

static aarch64_isa_mode
aarch64_fndecl_isa_mode (const_tree fndecl)
{
  return (aarch64_fndecl_pstate_sm (fndecl)
          | aarch64_fndecl_pstate_za (fndecl));
}

/* Return the state of PSTATE.SM on entry to the current function.
   This might be different from the state of PSTATE.SM in the function
   body.  */

static aarch64_isa_mode
aarch64_cfun_incoming_pstate_sm ()
{
  return aarch64_fntype_pstate_sm (TREE_TYPE (cfun->decl));
}

/* Return the state of PSTATE.ZA on entry to the current function.
   This might be different from the state of PSTATE.ZA in the function
   body.  */

static aarch64_isa_mode
aarch64_cfun_incoming_pstate_za ()
{
  return aarch64_fntype_pstate_za (TREE_TYPE (cfun->decl));
}

/* Return state flags that describe whether and how the current function shares
   state STATE_NAME with callers.  */

static unsigned int
aarch64_cfun_shared_flags (const char *state_name)
{
  return aarch64_fntype_shared_flags (TREE_TYPE (cfun->decl), state_name);
}

/* Return true if the current function creates new state of type STATE_NAME
   (as opposed to sharing the state with its callers or ignoring the state
   altogether).  */

static bool
aarch64_cfun_has_new_state (const char *state_name)
{
  return aarch64_fndecl_has_new_state (cfun->decl, state_name);
}

/* Return true if PSTATE.SM is 1 in the body of the current function,
   but is not guaranteed to be 1 on entry.  */

static bool
aarch64_cfun_enables_pstate_sm ()
{
  return (aarch64_fndecl_is_locally_streaming (cfun->decl)
          && aarch64_cfun_incoming_pstate_sm () != AARCH64_ISA_MODE_SM_ON);
}

/* Return true if the current function has state STATE_NAME, either by
   creating new state itself or by sharing state with callers.  */

static bool
aarch64_cfun_has_state (const char *state_name)
{
  return aarch64_fndecl_has_state (cfun->decl, state_name);
}

/* Return true if a call from the current function to a function with
   ISA mode CALLEE_MODE would involve a change to PSTATE.SM around
   the BL instruction.  */

static bool
aarch64_call_switches_pstate_sm (aarch64_isa_mode callee_mode)
{
  return (bool) (callee_mode & ~AARCH64_ISA_MODE & AARCH64_ISA_MODE_SM_STATE);
}

/* Implement TARGET_COMPATIBLE_VECTOR_TYPES_P.  */

static bool
aarch64_compatible_vector_types_p (const_tree type1, const_tree type2)
{
  return (aarch64_sve::builtin_type_p (type1)
          == aarch64_sve::builtin_type_p (type2));
}

/* Return true if we should emit CFI for register REGNO.  */

static bool
aarch64_emit_cfi_for_reg_p (unsigned int regno)
{
  return (GP_REGNUM_P (regno)
          || !default_function_abi.clobbers_full_reg_p (regno));
}

/* Return the mode we should use to save and restore register REGNO.  */

static machine_mode
aarch64_reg_save_mode (unsigned int regno)
{
  if (GP_REGNUM_P (regno) || regno == VG_REGNUM)
    return DImode;

  if (FP_REGNUM_P (regno))
    switch (crtl->abi->id ())
      {
      case ARM_PCS_AAPCS64:
        /* Only the low 64 bits are saved by the base PCS.  */
        return DFmode;

      case ARM_PCS_SIMD:
        /* The vector PCS saves the low 128 bits (which is the full
           register on non-SVE targets).  */
        return V16QImode;

      case ARM_PCS_SVE:
        /* Use vectors of DImode for registers that need frame
           information, so that the first 64 bits of the save slot
           are always the equivalent of what storing D<n> would give.  */
        if (aarch64_emit_cfi_for_reg_p (regno))
          return VNx2DImode;

        /* Use vectors of bytes otherwise, so that the layout is
           endian-agnostic, and so that we can use LDR and STR for
           big-endian targets.  */
        return VNx16QImode;

      case ARM_PCS_TLSDESC:
      case ARM_PCS_UNKNOWN:
        break;
      }

  if (PR_REGNUM_P (regno))
    /* Save the full predicate register.  */
    return VNx16BImode;

  gcc_unreachable ();
}

/* Return the CONST_INT that should be placed in an UNSPEC_CALLEE_ABI rtx.
   This value encodes the following information:
   - the ISA mode on entry to a callee (ISA_MODE)
   - the ABI of the callee (PCS_VARIANT)
   - whether the callee has an indirect_return
     attribute (INDIRECT_RETURN).  */

rtx
aarch64_gen_callee_cookie (aarch64_isa_mode isa_mode, arm_pcs pcs_variant,
                           bool indirect_return)
{
  unsigned int im = (unsigned int) isa_mode;
  unsigned int ir = (indirect_return ? 1 : 0) << AARCH64_NUM_ISA_MODES;
  unsigned int pv = (unsigned int) pcs_variant
                    << (AARCH64_NUM_ABI_ATTRIBUTES + AARCH64_NUM_ISA_MODES);
  return gen_int_mode (im | ir | pv, DImode);
}
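
/* Illustrative sketch of the packing above (expressed with the macro
   names rather than their numeric values): bits
   [0, AARCH64_NUM_ISA_MODES) hold the callee's ISA mode, the following
   AARCH64_NUM_ABI_ATTRIBUTES bits hold attribute flags such as
   indirect_return, and the remaining bits hold the arm_pcs value.
   The helpers below undo this packing with shifts and masks.  */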

/* COOKIE is a CONST_INT from an UNSPEC_CALLEE_ABI rtx.  Return the
   callee's required ABI.  */

static const predefined_function_abi &
aarch64_callee_abi (rtx cookie)
{
  return function_abis[UINTVAL (cookie)
                       >> (AARCH64_NUM_ABI_ATTRIBUTES + AARCH64_NUM_ISA_MODES)];
}

/* COOKIE is a CONST_INT from an UNSPEC_CALLEE_ABI rtx.  Return the
   required ISA mode on entry to the callee, which is also the ISA
   mode on return from the callee.  */

static aarch64_isa_mode
aarch64_callee_isa_mode (rtx cookie)
{
  return UINTVAL (cookie) & ((1 << AARCH64_NUM_ISA_MODES) - 1);
}

/* COOKIE is a CONST_INT from an UNSPEC_CALLEE_ABI rtx.  Return
   whether the function was marked with an indirect_return attribute.  */

static bool
aarch64_callee_indirect_return (rtx cookie)
{
  return ((UINTVAL (cookie) >> AARCH64_NUM_ISA_MODES) & 1) == 1;
}

/* INSN is a call instruction.  Return the CONST_INT stored in its
   UNSPEC_CALLEE_ABI rtx.  */

static rtx
aarch64_insn_callee_cookie (const rtx_insn *insn)
{
  rtx pat = PATTERN (insn);
  gcc_assert (GET_CODE (pat) == PARALLEL);
  rtx unspec = XVECEXP (pat, 0, 1);
  gcc_assert (GET_CODE (unspec) == UNSPEC
              && XINT (unspec, 1) == UNSPEC_CALLEE_ABI);
  return XVECEXP (unspec, 0, 0);
}

/* INSN is a call instruction.  Return true if the callee has an
   indirect_return attribute.  */

bool
aarch_fun_is_indirect_return (rtx_insn *insn)
{
  rtx cookie = aarch64_insn_callee_cookie (insn);
  return aarch64_callee_indirect_return (cookie);
}

/* Implement TARGET_INSN_CALLEE_ABI.  */

const predefined_function_abi &
aarch64_insn_callee_abi (const rtx_insn *insn)
{
  return aarch64_callee_abi (aarch64_insn_callee_cookie (insn));
}

/* INSN is a call instruction.  Return the required ISA mode on entry to
   the callee, which is also the ISA mode on return from the callee.  */

static aarch64_isa_mode
aarch64_insn_callee_isa_mode (const rtx_insn *insn)
{
  return aarch64_callee_isa_mode (aarch64_insn_callee_cookie (insn));
}

/* Implement TARGET_HARD_REGNO_CALL_PART_CLOBBERED.  The callee only saves
   the lower 64 bits of a 128-bit register.  Tell the compiler the callee
   clobbers the top 64 bits when restoring the bottom 64 bits.  */

static bool
aarch64_hard_regno_call_part_clobbered (unsigned int abi_id,
                                        unsigned int regno,
                                        machine_mode mode)
{
  if (FP_REGNUM_P (regno) && abi_id != ARM_PCS_SVE)
    {
      poly_int64 per_register_size = GET_MODE_SIZE (mode);
      unsigned int nregs = hard_regno_nregs (regno, mode);
      if (nregs > 1)
        per_register_size = exact_div (per_register_size, nregs);
      if (abi_id == ARM_PCS_SIMD || abi_id == ARM_PCS_TLSDESC)
        return maybe_gt (per_register_size, 16);
      return maybe_gt (per_register_size, 8);
    }
  return false;
}

/* Implement REGMODE_NATURAL_SIZE.  */
poly_uint64
aarch64_regmode_natural_size (machine_mode mode)
{
  /* The natural size for SVE data modes is one SVE data vector,
     and similarly for predicates.  We can't independently modify
     anything smaller than that.  */
  /* ??? For now, only do this for variable-width SVE registers.
     Doing it for constant-sized registers breaks lower-subreg.cc.  */
  /* ??? And once that's fixed, we should probably have similar
     code for Advanced SIMD.  */
  if (!aarch64_sve_vg.is_constant ())
    {
      /* REGMODE_NATURAL_SIZE influences general subreg validity rules,
         so we need to handle memory-only modes as well.  */
      unsigned int vec_flags = aarch64_classify_vector_memory_mode (mode);
      if (vec_flags & VEC_SVE_PRED)
        return BYTES_PER_SVE_PRED;
      if (vec_flags & VEC_SVE_DATA)
        return BYTES_PER_SVE_VECTOR;
    }
  return UNITS_PER_WORD;
}

/* Implement HARD_REGNO_CALLER_SAVE_MODE.  */
machine_mode
aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned,
                                     machine_mode mode)
{
  /* The predicate mode determines which bits are significant and
     which are "don't care".  Decreasing the number of lanes would
     lose data while increasing the number of lanes would make bits
     unnecessarily significant.  */
  if (PR_REGNUM_P (regno))
    return mode;
  if (known_lt (GET_MODE_SIZE (mode), 4)
      && REG_CAN_CHANGE_MODE_P (regno, mode, SImode)
      && REG_CAN_CHANGE_MODE_P (regno, SImode, mode))
    return SImode;
  return mode;
}

/* Return true if I's bits are consecutive ones from the MSB.  */
bool
aarch64_high_bits_all_ones_p (HOST_WIDE_INT i)
{
  return exact_log2 (-i) != HOST_WIDE_INT_M1;
}
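
/* Worked example (illustrative): for I == (HOST_WIDE_INT) -256
   (0xffff...ff00), -I == 256 and exact_log2 (256) == 8, so the
   function returns true; for I == 0xff00 the negation is not a power
   of two and the result is false.  */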

/* Implement TARGET_CONSTANT_ALIGNMENT.  Make strings word-aligned so
   that strcpy from constants will be faster.  */

static HOST_WIDE_INT
aarch64_constant_alignment (const_tree exp, HOST_WIDE_INT align)
{
  if (TREE_CODE (exp) == STRING_CST && !optimize_size)
    return MAX (align, BITS_PER_WORD);

  return align;
}

/* Align definitions of arrays, unions and structures so that
   initializations and copies can be made more efficient.  This is not
   ABI-changing, so it only affects places where we can see the
   definition.  Increasing the alignment tends to introduce padding,
   so don't do this when optimizing for size/conserving stack space.  */

unsigned
aarch64_data_alignment (const_tree type, unsigned align)
{
  if (optimize_size)
    return align;

  if (AGGREGATE_TYPE_P (type))
    {
      unsigned HOST_WIDE_INT size = 0;

      if (TYPE_SIZE (type) && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
          && tree_fits_uhwi_p (TYPE_SIZE (type)))
        size = tree_to_uhwi (TYPE_SIZE (type));

      /* Align small structs/arrays to 32 bits, or 64 bits if larger.  */
      if (align < 32 && size <= 32)
        align = 32;
      else if (align < 64)
        align = 64;
    }

  return align;
}

unsigned
aarch64_stack_alignment (const_tree type, unsigned align)
{
  if (flag_conserve_stack)
    return align;

  if (AGGREGATE_TYPE_P (type))
    {
      unsigned HOST_WIDE_INT size = 0;

      if (TYPE_SIZE (type) && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
          && tree_fits_uhwi_p (TYPE_SIZE (type)))
        size = tree_to_uhwi (TYPE_SIZE (type));

      /* Align small structs/arrays to 32 bits, or 64 bits if larger.  */
      if (align < 32 && size <= 32)
        align = 32;
      else if (align < 64)
        align = 64;
    }

  return align;
}
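
/* Worked example (illustrative): a 3-byte struct has a 24-bit size, so
   aarch64_data_alignment raises its alignment to 32 bits, while a
   16-byte array is raised to 64-bit alignment; aarch64_stack_alignment
   makes the same choice unless -fconserve-stack is in effect.  */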

/* Return true if calls to DECL should be treated as
   long-calls (ie called via a register).  */
static bool
aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED)
{
  return false;
}

/* Return true if calls to symbol-ref SYM should be treated as
   long-calls (ie called via a register).  */
bool
aarch64_is_long_call_p (rtx sym)
{
  return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym));
}

/* Return true if calls to symbol-ref SYM should not go through
   PLT stubs.  */

bool
aarch64_is_noplt_call_p (rtx sym)
{
  const_tree decl = SYMBOL_REF_DECL (sym);

  if (flag_pic
      && decl
      && (!flag_plt
          || lookup_attribute ("noplt", DECL_ATTRIBUTES (decl)))
      && !targetm.binds_local_p (decl))
    return true;

  return false;
}

/* Emit an insn that's a simple single-set.  Both the operands must be
   known to be valid.  */
inline static rtx_insn *
emit_set_insn (rtx x, rtx y)
{
  return emit_insn (gen_rtx_SET (x, y));
}

/* X and Y are two things to compare using CODE.  Emit the compare insn and
   return the rtx for register 0 in the proper mode.  */
rtx
aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
{
  machine_mode cmp_mode = GET_MODE (x);
  machine_mode cc_mode;
  rtx cc_reg;

  if (cmp_mode == TImode)
    {
      gcc_assert (code == NE);

      cc_mode = CC_NZmode;
      cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);

      rtx x_lo = operand_subword (x, 0, 0, TImode);
      rtx y_lo = operand_subword (y, 0, 0, TImode);
      emit_set_insn (cc_reg, gen_rtx_COMPARE (cc_mode, x_lo, y_lo));

      rtx x_hi = operand_subword (x, 1, 0, TImode);
      rtx y_hi = operand_subword (y, 1, 0, TImode);
      emit_insn (gen_ccmpccdi (cc_reg, cc_reg, x_hi, y_hi,
                               gen_rtx_EQ (cc_mode, cc_reg, const0_rtx),
                               GEN_INT (AARCH64_EQ)));
    }
  else
    {
      cc_mode = SELECT_CC_MODE (code, x, y);
      cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);
      emit_set_insn (cc_reg, gen_rtx_COMPARE (cc_mode, x, y));
    }
  return cc_reg;
}

/* Similarly, but maybe zero-extend Y if Y_MODE < SImode.  */
rtx
aarch64_gen_compare_reg_maybe_ze (RTX_CODE code, rtx x, rtx y,
                                  machine_mode y_mode)
{
  if (y_mode == E_QImode || y_mode == E_HImode)
    {
      if (CONST_INT_P (y))
        {
          y = GEN_INT (INTVAL (y) & GET_MODE_MASK (y_mode));
          y_mode = SImode;
        }
      else
        {
          rtx t, cc_reg;
          machine_mode cc_mode;

          t = gen_rtx_ZERO_EXTEND (SImode, y);
          t = gen_rtx_COMPARE (CC_SWPmode, t, x);
          cc_mode = CC_SWPmode;
          cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);
          emit_set_insn (cc_reg, t);
          return cc_reg;
        }
    }

  if (!aarch64_plus_operand (y, y_mode))
    y = force_reg (y_mode, y);

  return aarch64_gen_compare_reg (code, x, y);
}

/* Generate conditional branch to LABEL, comparing X to 0 using CODE.
   Return the jump instruction.  */

rtx
aarch64_gen_compare_zero_and_branch (rtx_code code, rtx x,
                                     rtx_code_label *label)
{
  if (aarch64_track_speculation)
    {
      /* Emit an explicit compare instruction, so that we can correctly
         track the condition codes.  */
      rtx cc_reg = aarch64_gen_compare_reg (code, x, const0_rtx);
      x = gen_rtx_fmt_ee (code, GET_MODE (cc_reg), cc_reg, const0_rtx);
    }
  else
    x = gen_rtx_fmt_ee (code, VOIDmode, x, const0_rtx);

  x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
                            gen_rtx_LABEL_REF (Pmode, label), pc_rtx);
  return gen_rtx_SET (pc_rtx, x);
}

/* Return an rtx that branches to LABEL based on the value of bit BITNUM of X.
   If CODE is NE, it branches to LABEL when the bit is set; if CODE is EQ,
   it branches to LABEL when the bit is clear.  */

rtx
aarch64_gen_test_and_branch (rtx_code code, rtx x, int bitnum,
                             rtx_code_label *label)
{
  auto mode = GET_MODE (x);
  if (aarch64_track_speculation)
    {
      auto mask = gen_int_mode (HOST_WIDE_INT_1U << bitnum, mode);
      emit_insn (gen_aarch64_and3nr_compare0 (mode, x, mask));
      rtx cc_reg = gen_rtx_REG (CC_NZVmode, CC_REGNUM);
      rtx x = gen_rtx_fmt_ee (code, CC_NZVmode, cc_reg, const0_rtx);
      return gen_condjump (x, cc_reg, label);
    }
  return gen_aarch64_tb (code, mode, mode,
                         x, gen_int_mode (bitnum, mode), label);
}

/* Consider the operation:

     OPERANDS[0] = CODE (OPERANDS[1], OPERANDS[2]) + OPERANDS[3]

   where:

   - CODE is [SU]MAX or [SU]MIN
   - OPERANDS[2] and OPERANDS[3] are constant integers
   - OPERANDS[3] is a positive or negative shifted 12-bit immediate
   - all operands have mode MODE

   Decide whether it is possible to implement the operation using:

     SUBS <tmp>, OPERANDS[1], -OPERANDS[3]
   or
     ADDS <tmp>, OPERANDS[1], OPERANDS[3]

   followed by:

     <insn> OPERANDS[0], <tmp>, [wx]zr, <cond>

   where <insn> is one of CSEL, CSINV or CSINC.  Return true if so.
   If GENERATE_P is true, also update OPERANDS as follows:

     OPERANDS[4] = -OPERANDS[3]
     OPERANDS[5] = the rtl condition representing <cond>
     OPERANDS[6] = <tmp>
     OPERANDS[7] = 0 for CSEL, -1 for CSINV or 1 for CSINC.  */
bool
aarch64_maxmin_plus_const (rtx_code code, rtx *operands, bool generate_p)
{
  signop sgn = (code == UMAX || code == UMIN ? UNSIGNED : SIGNED);
  rtx dst = operands[0];
  rtx maxmin_op = operands[2];
  rtx add_op = operands[3];
  machine_mode mode = GET_MODE (dst);

  /* max (x, y) - z == (x >= y + 1 ? x : y) - z
                    == (x >= y ? x : y) - z
                    == (x > y ? x : y) - z
                    == (x > y - 1 ? x : y) - z

     min (x, y) - z == (x <= y - 1 ? x : y) - z
                    == (x <= y ? x : y) - z
                    == (x < y ? x : y) - z
                    == (x < y + 1 ? x : y) - z

     Check whether z is in { y - 1, y, y + 1 } and pick the form(s) for
     which x is compared with z.  Set DIFF to y - z.  Thus the supported
     combinations are as follows, with DIFF being the value after the ":":

     max (x, y) - z == x >= y + 1 ? x - (y + 1) : -1   [z == y + 1]
                    == x >= y ? x - y : 0              [z == y]
                    == x > y ? x - y : 0               [z == y]
                    == x > y - 1 ? x - (y - 1) : 1     [z == y - 1]

     min (x, y) - z == x <= y - 1 ? x - (y - 1) : 1    [z == y - 1]
                    == x <= y ? x - y : 0              [z == y]
                    == x < y ? x - y : 0               [z == y]
                    == x < y + 1 ? x - (y + 1) : -1    [z == y + 1].  */
  auto maxmin_val = rtx_mode_t (maxmin_op, mode);
  auto add_val = rtx_mode_t (add_op, mode);
  auto sub_val = wi::neg (add_val);
  auto diff = wi::sub (maxmin_val, sub_val);
  if (!(diff == 0
        || (diff == 1 && wi::gt_p (maxmin_val, sub_val, sgn))
        || (diff == -1 && wi::lt_p (maxmin_val, sub_val, sgn))))
    return false;

  if (!generate_p)
    return true;

  rtx_code cmp;
  switch (code)
    {
    case SMAX:
      cmp = diff == 1 ? GT : GE;
      break;
    case UMAX:
      cmp = diff == 1 ? GTU : GEU;
      break;
    case SMIN:
      cmp = diff == -1 ? LT : LE;
      break;
    case UMIN:
      cmp = diff == -1 ? LTU : LEU;
      break;
    default:
      gcc_unreachable ();
    }
  rtx cc = gen_rtx_REG (CCmode, CC_REGNUM);

  operands[4] = immed_wide_int_const (sub_val, mode);
  operands[5] = gen_rtx_fmt_ee (cmp, VOIDmode, cc, const0_rtx);
  if (can_create_pseudo_p ())
    operands[6] = gen_reg_rtx (mode);
  else
    operands[6] = dst;
  operands[7] = immed_wide_int_const (diff, mode);

  return true;
}

/* Build the SYMBOL_REF for __tls_get_addr.  */

static GTY(()) rtx tls_get_addr_libfunc;

rtx
aarch64_tls_get_addr (void)
{
  if (!tls_get_addr_libfunc)
    tls_get_addr_libfunc = init_one_libfunc ("__tls_get_addr");
  return tls_get_addr_libfunc;
}

/* Return the TLS model to use for ADDR.  */

static enum tls_model
tls_symbolic_operand_type (rtx addr)
{
  enum tls_model tls_kind = TLS_MODEL_NONE;
  poly_int64 offset;
  addr = strip_offset_and_salt (addr, &offset);
  if (SYMBOL_REF_P (addr))
    tls_kind = SYMBOL_REF_TLS_MODEL (addr);

  return tls_kind;
}

/* We'll allow lo_sum's in addresses in our legitimate addresses
   so that combine would take care of combining addresses where
   necessary, but for generation purposes, we'll generate the address
   as:

   RTL                                Absolute
   tmp = hi (symbol_ref);             adrp  x1, foo
   dest = lo_sum (tmp, symbol_ref);   add   dest, x1, :lo_12:foo

   PIC                                TLS
   adrp x1, :got:foo                  adrp  tmp, :tlsgd:foo
   ldr  x1, [:got_lo12:foo]           add   dest, tmp, :tlsgd_lo12:foo
                                      bl    __tls_get_addr

   Load TLS symbol, depending on TLS mechanism and TLS access model.

   Global Dynamic - Traditional TLS:
   adrp tmp, :tlsgd:imm
   add  dest, tmp, #:tlsgd_lo12:imm
   bl   __tls_get_addr

   Global Dynamic - TLS Descriptors:
   adrp dest, :tlsdesc:imm
   ldr  tmp, [dest, #:tlsdesc_lo12:imm]
   add  dest, dest, #:tlsdesc_lo12:imm
   blr  tmp
   mrs  tp, tpidr_el0
   add  dest, dest, tp

   Initial Exec:
   mrs  tp, tpidr_el0
   adrp tmp, :gottprel:imm
   ldr  dest, [tmp, #:gottprel_lo12:imm]
   add  dest, dest, tp

   Local Exec:
   mrs  tp, tpidr_el0
   add  t0, tp, #:tprel_hi12:imm, lsl #12
   add  t0, t0, #:tprel_lo12_nc:imm
*/

static void
aarch64_load_symref_appropriately (rtx dest, rtx imm,
                                   enum aarch64_symbol_type type)
{
  /* If legitimize returns a value, copy it directly to the destination
     and return.  */
  rtx tmp = legitimize_pe_coff_symbol (imm, true);
  if (tmp)
    {
      emit_insn (gen_rtx_SET (dest, tmp));
      return;
    }

  switch (type)
    {
    case SYMBOL_SMALL_ABSOLUTE:
      {
        /* In ILP32, the mode of dest can be either SImode or DImode.  */
        rtx tmp_reg = dest;
        machine_mode mode = GET_MODE (dest);

        gcc_assert (mode == Pmode || mode == ptr_mode);

        if (can_create_pseudo_p ())
          tmp_reg = gen_reg_rtx (mode);

        HOST_WIDE_INT mid_const = 0;
        if (TARGET_PECOFF)
          {
            poly_int64 offset;
            strip_offset (imm, &offset);

            HOST_WIDE_INT const_offset;
            if (offset.is_constant (&const_offset))
              /* Written this way for the sake of negative offsets.  */
              mid_const = const_offset / (1 << 20) * (1 << 20);
          }
        imm = plus_constant (mode, imm, -mid_const);

        emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, copy_rtx (imm)));
        if (mid_const)
          emit_set_insn (tmp_reg, plus_constant (mode, tmp_reg, mid_const));
        emit_insn (gen_add_losym (dest, tmp_reg, imm));
        return;
      }

    case SYMBOL_TINY_ABSOLUTE:
      emit_insn (gen_rtx_SET (dest, imm));
      return;

    case SYMBOL_SMALL_GOT_28K:
      {
        machine_mode mode = GET_MODE (dest);
        rtx gp_rtx = pic_offset_table_rtx;
        rtx insn;
        rtx mem;

        /* NOTE: pic_offset_table_rtx can be NULL_RTX, because we can reach
           here before rtl expand.  Tree IVOPT will generate rtl pattern to
           decide rtx costs, in which case pic_offset_table_rtx is not
           initialized.  For that case no need to generate the first adrp
           instruction as the final cost for global variable access is
           one instruction.  */
        if (gp_rtx != NULL)
          {
            /* -fpic for -mcmodel=small allows a 32K GOT table size (but we
               are using the page base as GOT base, the first page may be
               wasted, in the worst scenario, there is only 28K space for
               the GOT).

               The generated instruction sequence for accessing a global
               variable is:

                 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym]

               Only one instruction is needed.  But we must initialize
               pic_offset_table_rtx properly.  We generate an initialization
               insn for every global access, and allow CSE to remove all
               redundant ones.

               The final instruction sequence will look like the following
               for multiple global variable accesses:

                 adrp pic_offset_table_rtx, _GLOBAL_OFFSET_TABLE_

                 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym1]
                 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym2]
                 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym3]
                 ...  */

            rtx s = gen_rtx_SYMBOL_REF (Pmode, "_GLOBAL_OFFSET_TABLE_");
            crtl->uses_pic_offset_table = 1;
            emit_move_insn (gp_rtx, gen_rtx_HIGH (Pmode, s));

            if (mode != GET_MODE (gp_rtx))
              gp_rtx = gen_lowpart (mode, gp_rtx);
          }

        if (mode == ptr_mode)
          {
            if (mode == DImode)
              insn = gen_ldr_got_small_28k_di (dest, gp_rtx, imm);
            else
              insn = gen_ldr_got_small_28k_si (dest, gp_rtx, imm);

            mem = XVECEXP (SET_SRC (insn), 0, 0);
          }
        else
          {
            gcc_assert (mode == Pmode);

            insn = gen_ldr_got_small_28k_sidi (dest, gp_rtx, imm);
            mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
          }

        /* The operand is expected to be a MEM.  Whenever the related insn
           pattern changes, the code above which calculates MEM should be
           updated.  */
        gcc_assert (MEM_P (mem));
        MEM_READONLY_P (mem) = 1;
        MEM_NOTRAP_P (mem) = 1;
        emit_insn (insn);
        return;
      }

    case SYMBOL_SMALL_GOT_4G:
      emit_insn (gen_rtx_SET (dest, imm));
      return;

    case SYMBOL_SMALL_TLSGD:
      {
        rtx_insn *insns;
        /* The return type of __tls_get_addr is the C pointer type
           so use ptr_mode.  */
        rtx result = gen_rtx_REG (ptr_mode, R0_REGNUM);
        rtx tmp_reg = dest;

        if (GET_MODE (dest) != ptr_mode)
          tmp_reg = can_create_pseudo_p () ? gen_reg_rtx (ptr_mode) : result;

        start_sequence ();
        if (ptr_mode == SImode)
          aarch64_emit_call_insn (gen_tlsgd_small_si (result, imm));
        else
          aarch64_emit_call_insn (gen_tlsgd_small_di (result, imm));
        insns = get_insns ();
        end_sequence ();

        RTL_CONST_CALL_P (insns) = 1;
        emit_libcall_block (insns, tmp_reg, result, imm);
        /* Convert back to the mode of the dest adding a zero_extend
           from SImode (ptr_mode) to DImode (Pmode).  */
        if (dest != tmp_reg)
          convert_move (dest, tmp_reg, true);
        return;
      }

    case SYMBOL_SMALL_TLSDESC:
      {
        machine_mode mode = GET_MODE (dest);
        rtx x0 = gen_rtx_REG (mode, R0_REGNUM);
        rtx tp;

        gcc_assert (mode == Pmode || mode == ptr_mode);

        /* In ILP32, the got entry is always of SImode size.  Unlike
           small GOT, the dest is fixed at reg 0.  */
        if (TARGET_ILP32)
          emit_insn (gen_tlsdesc_small_si (imm));
        else
          emit_insn (gen_tlsdesc_small_di (imm));
        tp = aarch64_load_tp (NULL);

        if (mode != Pmode)
          tp = gen_lowpart (mode, tp);

        emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, x0)));
        if (REG_P (dest))
          set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
        return;
      }

    case SYMBOL_SMALL_TLSIE:
      {
        /* In ILP32, the mode of dest can be either SImode or DImode,
           while the got entry is always of SImode size.  The mode of
           dest depends on how dest is used: if dest is assigned to a
           pointer (e.g. in the memory), it has SImode; it may have
           DImode if dest is dereferenced to access the memory.
           This is why we have to handle three different tlsie_small
           patterns here (two patterns for ILP32).  */
        machine_mode mode = GET_MODE (dest);
        rtx tmp_reg = gen_reg_rtx (mode);
        rtx tp = aarch64_load_tp (NULL);

        if (mode == ptr_mode)
          {
            if (mode == DImode)
              emit_insn (gen_tlsie_small_di (tmp_reg, imm));
            else
              {
                emit_insn (gen_tlsie_small_si (tmp_reg, imm));
                tp = gen_lowpart (mode, tp);
              }
          }
        else
          {
            gcc_assert (mode == Pmode);
            emit_insn (gen_tlsie_small_sidi (tmp_reg, imm));
          }

        emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, tmp_reg)));
        if (REG_P (dest))
          set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
        return;
      }

    case SYMBOL_TLSLE12:
    case SYMBOL_TLSLE24:
    case SYMBOL_TLSLE32:
    case SYMBOL_TLSLE48:
      {
        machine_mode mode = GET_MODE (dest);
        rtx tp = aarch64_load_tp (NULL);

        if (mode != Pmode)
          tp = gen_lowpart (mode, tp);

        switch (type)
          {
          case SYMBOL_TLSLE12:
            emit_insn ((mode == DImode ? gen_tlsle12_di : gen_tlsle12_si)
                       (dest, tp, imm));
            break;
          case SYMBOL_TLSLE24:
            emit_insn ((mode == DImode ? gen_tlsle24_di : gen_tlsle24_si)
                       (dest, tp, imm));
            break;
          case SYMBOL_TLSLE32:
            emit_insn ((mode == DImode ? gen_tlsle32_di : gen_tlsle32_si)
                       (dest, imm));
            emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
                       (dest, dest, tp));
            break;
          case SYMBOL_TLSLE48:
            emit_insn ((mode == DImode ? gen_tlsle48_di : gen_tlsle48_si)
                       (dest, imm));
            emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
                       (dest, dest, tp));
            break;
          default:
            gcc_unreachable ();
          }

        if (REG_P (dest))
          set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
        return;
      }

    case SYMBOL_TINY_GOT:
      {
        rtx insn;
        machine_mode mode = GET_MODE (dest);

        if (mode == ptr_mode)
          insn = gen_ldr_got_tiny (mode, dest, imm);
        else
          {
            gcc_assert (mode == Pmode);
            insn = gen_ldr_got_tiny_sidi (dest, imm);
          }

        emit_insn (insn);
        return;
      }

    case SYMBOL_TINY_TLSIE:
      {
        machine_mode mode = GET_MODE (dest);
        rtx tp = aarch64_load_tp (NULL);

        if (mode == ptr_mode)
          {
            if (mode == DImode)
              emit_insn (gen_tlsie_tiny_di (dest, imm, tp));
            else
              {
                tp = gen_lowpart (mode, tp);
                emit_insn (gen_tlsie_tiny_si (dest, imm, tp));
              }
          }
        else
          {
            gcc_assert (mode == Pmode);
            emit_insn (gen_tlsie_tiny_sidi (dest, imm, tp));
          }

        if (REG_P (dest))
          set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
        return;
      }

    default:
      gcc_unreachable ();
    }
}

/* Emit a move from SRC to DEST.  Assume that the move expanders can
   handle all moves if !can_create_pseudo_p ().  The distinction is
   important because, unlike emit_move_insn, the move expanders know
   how to force Pmode objects into the constant pool even when the
   constant pool address is not itself legitimate.  */
rtx_insn *
aarch64_emit_move (rtx dest, rtx src)
{
  return (can_create_pseudo_p ()
          ? emit_move_insn (dest, src)
          : emit_move_insn_1 (dest, src));
}

/* Apply UNOPTAB to OP and store the result in DEST.  */

static void
aarch64_emit_unop (rtx dest, optab unoptab, rtx op)
{
  rtx tmp = expand_unop (GET_MODE (dest), unoptab, op, dest, 0);
  if (dest != tmp)
    emit_move_insn (dest, tmp);
}

/* Apply BINOPTAB to OP0 and OP1 and store the result in DEST.  */

static void
aarch64_emit_binop (rtx dest, optab binoptab, rtx op0, rtx op1)
{
  rtx tmp = expand_binop (GET_MODE (dest), binoptab, op0, op1, dest, 0,
                          OPTAB_DIRECT);
  if (dest != tmp)
    emit_move_insn (dest, tmp);
}

/* Split a move from SRC to DST into multiple moves of mode SINGLE_MODE.  */

void
aarch64_split_move (rtx dst, rtx src, machine_mode single_mode)
{
  machine_mode mode = GET_MODE (dst);
  auto npieces = exact_div (GET_MODE_SIZE (mode),
                            GET_MODE_SIZE (single_mode)).to_constant ();
  auto_vec<rtx, 4> dst_pieces, src_pieces;

  for (unsigned int i = 0; i < npieces; ++i)
    {
      auto off = i * GET_MODE_SIZE (single_mode);
      dst_pieces.safe_push (simplify_gen_subreg (single_mode, dst, mode, off));
      src_pieces.safe_push (simplify_gen_subreg (single_mode, src, mode, off));
    }

  /* At most one pairing may overlap.  */
  if (reg_overlap_mentioned_p (dst_pieces[0], src))
    for (unsigned int i = npieces; i-- > 0;)
      aarch64_emit_move (dst_pieces[i], src_pieces[i]);
  else
    for (unsigned int i = 0; i < npieces; ++i)
      aarch64_emit_move (dst_pieces[i], src_pieces[i]);
}
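
/* Illustrative note: when moving a TImode value from x0:x1 into x1:x2,
   the first destination piece (x1) overlaps the source, so the loop
   above copies the pieces in reverse order (x2 <- x1, then x1 <- x0);
   otherwise a forward copy is safe.  */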

/* Split a 128-bit move operation into two 64-bit move operations,
   taking care to handle partial overlap of register to register
   copies.  Special cases are needed when moving between GP regs and
   FP regs.  SRC can be a register, constant or memory; DST a register
   or memory.  If either operand is memory it must not have any side
   effects.  */

void
aarch64_split_128bit_move (rtx dst, rtx src)
{
  machine_mode mode = GET_MODE (dst);

  gcc_assert (mode == TImode || mode == TFmode || mode == TDmode);
  gcc_assert (!(side_effects_p (src) || side_effects_p (dst)));
  gcc_assert (mode == GET_MODE (src) || GET_MODE (src) == VOIDmode);

  if (REG_P (dst) && REG_P (src))
    {
      int src_regno = REGNO (src);
      int dst_regno = REGNO (dst);

      /* Handle FP <-> GP regs.  */
      if (FP_REGNUM_P (dst_regno) && GP_REGNUM_P (src_regno))
        {
          rtx src_lo = gen_lowpart (word_mode, src);
          rtx src_hi = gen_highpart (word_mode, src);

          emit_insn (gen_aarch64_movlow_di (mode, dst, src_lo));
          emit_insn (gen_aarch64_movhigh_di (mode, dst, src_hi));
          return;
        }
      else if (GP_REGNUM_P (dst_regno) && FP_REGNUM_P (src_regno))
        {
          rtx dst_lo = gen_lowpart (word_mode, dst);
          rtx dst_hi = gen_highpart (word_mode, dst);

          emit_insn (gen_aarch64_movdi_low (mode, dst_lo, src));
          emit_insn (gen_aarch64_movdi_high (mode, dst_hi, src));
          return;
        }
    }

  aarch64_split_move (dst, src, word_mode);
}

/* Return true if we should split a move from 128-bit value SRC
   to 128-bit register DEST.  */

bool
aarch64_split_128bit_move_p (rtx dst, rtx src)
{
  if (FP_REGNUM_P (REGNO (dst)))
    return REG_P (src) && !FP_REGNUM_P (REGNO (src));
  /* All moves to GPRs need to be split.  */
  return true;
}

/* Split a complex SIMD move.  */

void
aarch64_split_simd_move (rtx dst, rtx src)
{
  machine_mode src_mode = GET_MODE (src);
  machine_mode dst_mode = GET_MODE (dst);

  gcc_assert (VECTOR_MODE_P (dst_mode));

  if (REG_P (dst) && REG_P (src))
    {
      gcc_assert (VECTOR_MODE_P (src_mode));
      emit_insn (gen_aarch64_split_simd_mov (src_mode, dst, src));
    }
}

/* Return a register that contains SVE value X reinterpreted as SVE mode MODE.
   The semantics are those of svreinterpret rather than those of subregs;
   see the comment at the head of aarch64-sve.md for details about the
   difference.  */

rtx
aarch64_sve_reinterpret (machine_mode mode, rtx x)
{
  if (GET_MODE (x) == mode)
    return x;

  /* can_change_mode_class must only return true if subregs and svreinterprets
     have the same semantics.  */
  if (targetm.can_change_mode_class (GET_MODE (x), mode, FP_REGS))
    return force_lowpart_subreg (mode, x, GET_MODE (x));

  rtx res = gen_reg_rtx (mode);
  x = force_reg (GET_MODE (x), x);
  emit_insn (gen_aarch64_sve_reinterpret (mode, res, x));
  return res;
}

bool
aarch64_zero_extend_const_eq (machine_mode xmode, rtx x,
                              machine_mode ymode, rtx y)
{
  rtx r = simplify_const_unary_operation (ZERO_EXTEND, xmode, y, ymode);
  gcc_assert (r != NULL);
  return rtx_equal_p (x, r);
}

/* Return TARGET if it is nonnull and a register of mode MODE.
   Otherwise, return a fresh register of mode MODE if we can,
   or TARGET reinterpreted as MODE if we can't.  */

static rtx
aarch64_target_reg (rtx target, machine_mode mode)
{
  if (target && REG_P (target) && GET_MODE (target) == mode)
    return target;
  if (!can_create_pseudo_p ())
    {
      gcc_assert (target);
      return gen_lowpart (mode, target);
    }
  return gen_reg_rtx (mode);
}

/* Return a register that contains the constant in BUILDER, given that
   the constant is a legitimate move operand.  Use TARGET as the register
   if it is nonnull and convenient.  */

static rtx
aarch64_emit_set_immediate (rtx target, rtx_vector_builder &builder)
{
  rtx src = builder.build ();
  target = aarch64_target_reg (target, GET_MODE (src));
  emit_insn (gen_rtx_SET (target, src));
  return target;
}

static rtx
aarch64_force_temporary (machine_mode mode, rtx x, rtx value)
{
  if (can_create_pseudo_p ())
    return force_reg (mode, value);
  else
    {
      gcc_assert (x);
      aarch64_emit_move (x, value);
      return x;
    }
}

/* Return true if predicate value X is a constant in which every element
   is a CONST_INT.  When returning true, describe X in BUILDER as a VNx16BI
   value, i.e. as a predicate in which all bits are significant.  */

static bool
aarch64_get_sve_pred_bits (rtx_vector_builder &builder, rtx x)
{
  if (!CONST_VECTOR_P (x))
    return false;

  unsigned int factor = vector_element_size (GET_MODE_NUNITS (VNx16BImode),
                                             GET_MODE_NUNITS (GET_MODE (x)));
  unsigned int npatterns = CONST_VECTOR_NPATTERNS (x) * factor;
  unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (x);
  builder.new_vector (VNx16BImode, npatterns, nelts_per_pattern);

  unsigned int nelts = const_vector_encoded_nelts (x);
  for (unsigned int i = 0; i < nelts; ++i)
    {
      rtx elt = CONST_VECTOR_ENCODED_ELT (x, i);
      if (!CONST_INT_P (elt))
        return false;

      builder.quick_push (elt);
      for (unsigned int j = 1; j < factor; ++j)
        builder.quick_push (const0_rtx);
    }
  builder.finalize ();
  return true;
}

/* BUILDER contains a predicate constant of mode VNx16BI.  Return the
   widest predicate element size it can have (that is, the largest size
   for which each element would still be 0 or 1).  */

unsigned int
aarch64_widest_sve_pred_elt_size (rtx_vector_builder &builder)
{
  /* Start with the most optimistic assumption: that we only need
     one bit per pattern.  This is what we will use if only the first
     bit in each pattern is ever set.  */
  unsigned int mask = GET_MODE_SIZE (DImode);
  mask |= builder.npatterns ();

  /* Look for set bits.  */
  unsigned int nelts = builder.encoded_nelts ();
  for (unsigned int i = 1; i < nelts; ++i)
    if (INTVAL (builder.elt (i)) != 0)
      {
        if (i & 1)
          return 1;
        mask |= i;
      }
  return mask & -mask;
}

/* If VNx16BImode rtx X is a canonical PTRUE for a predicate mode,
   return that predicate mode, otherwise return opt_machine_mode ().  */

opt_machine_mode
aarch64_ptrue_all_mode (rtx x)
{
  gcc_assert (GET_MODE (x) == VNx16BImode);
  if (!CONST_VECTOR_P (x)
      || !CONST_VECTOR_DUPLICATE_P (x)
      || !CONST_INT_P (CONST_VECTOR_ENCODED_ELT (x, 0))
      || INTVAL (CONST_VECTOR_ENCODED_ELT (x, 0)) == 0)
    return opt_machine_mode ();

  unsigned int nelts = const_vector_encoded_nelts (x);
  for (unsigned int i = 1; i < nelts; ++i)
    if (CONST_VECTOR_ENCODED_ELT (x, i) != const0_rtx)
      return opt_machine_mode ();

  return aarch64_sve_pred_mode (nelts);
}

/* BUILDER is a predicate constant of mode VNx16BI.  Consider the value
   that the constant would have with predicate element size ELT_SIZE
   (ignoring the upper bits in each element) and return:

   * -1 if all bits are set
   * N if the predicate has N leading set bits followed by all clear bits
   * 0 if the predicate does not have any of these forms.  */

int
aarch64_partial_ptrue_length (rtx_vector_builder &builder,
                              unsigned int elt_size)
{
  /* If nelts_per_pattern is 3, we have set bits followed by clear bits
     followed by set bits.  */
  if (builder.nelts_per_pattern () == 3)
    return 0;

  /* Skip over leading set bits.  */
  unsigned int nelts = builder.encoded_nelts ();
  unsigned int i = 0;
  for (; i < nelts; i += elt_size)
    if (INTVAL (builder.elt (i)) == 0)
      break;
  unsigned int vl = i / elt_size;

  /* Check for the all-true case.  */
  if (i == nelts)
    return -1;

  /* If nelts_per_pattern is 1, then either VL is zero, or we have a
     repeating pattern of set bits followed by clear bits.  */
  if (builder.nelts_per_pattern () != 2)
    return vl;

  /* We have a "foreground" value and a duplicated "background" value.
     If the background might repeat and the last set bit belongs to it,
     we might have set bits followed by clear bits followed by set bits.  */
  if (i > builder.npatterns () && maybe_ne (nelts, builder.full_nelts ()))
    return 0;

  /* Make sure that the rest are all clear.  */
  for (; i < nelts; i += elt_size)
    if (INTVAL (builder.elt (i)) != 0)
      return 0;

  return vl;
}

/* See if there is an svpattern that encodes an SVE predicate of mode
   PRED_MODE in which the first VL bits are set and the rest are clear.
   Return the pattern if so, otherwise return AARCH64_NUM_SVPATTERNS.
   A VL of -1 indicates an all-true vector.  */

aarch64_svpattern
aarch64_svpattern_for_vl (machine_mode pred_mode, int vl)
{
  if (vl < 0)
    return AARCH64_SV_ALL;

  if (maybe_gt (vl, GET_MODE_NUNITS (pred_mode)))
    return AARCH64_NUM_SVPATTERNS;

  if (vl >= 1 && vl <= 8)
    return aarch64_svpattern (AARCH64_SV_VL1 + (vl - 1));

  if (vl >= 16 && vl <= 256 && pow2p_hwi (vl))
    return aarch64_svpattern (AARCH64_SV_VL16 + (exact_log2 (vl) - 4));

  int max_vl;
  if (GET_MODE_NUNITS (pred_mode).is_constant (&max_vl))
    {
      if (vl == (max_vl / 3) * 3)
        return AARCH64_SV_MUL3;
      /* These would only trigger for non-power-of-2 lengths.  */
      if (vl == (max_vl & -4))
        return AARCH64_SV_MUL4;
      if (vl == (1 << floor_log2 (max_vl)))
        return AARCH64_SV_POW2;
      if (vl == max_vl)
        return AARCH64_SV_ALL;
    }
  return AARCH64_NUM_SVPATTERNS;
}
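
/* Worked examples (illustrative): VL 7 maps to AARCH64_SV_VL7 and VL 64
   to AARCH64_SV_VL64; for a constant 32-element predicate mode, VL 30
   maps to AARCH64_SV_MUL3, while VL 31 has no single-pattern encoding
   and yields AARCH64_NUM_SVPATTERNS.  */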

/* Return a VNx16BImode constant in which every sequence of ELT_SIZE
   bits has the lowest bit set and the upper bits clear.  This is the
   VNx16BImode equivalent of a PTRUE for controlling elements of
   ELT_SIZE bytes.  However, because the constant is VNx16BImode,
   all bits are significant, even the upper zeros.  */

rtx
aarch64_ptrue_all (unsigned int elt_size)
{
  rtx_vector_builder builder (VNx16BImode, elt_size, 1);
  builder.quick_push (const1_rtx);
  for (unsigned int i = 1; i < elt_size; ++i)
    builder.quick_push (const0_rtx);
  return builder.build ();
}
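
/* Illustrative example: aarch64_ptrue_all (4) builds the repeating
   VNx16BImode constant { 1, 0, 0, 0, ... }, i.e. one significant bit for
   each 4-byte element with the padding bits explicitly zero.  */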

/* Return an all-true predicate register of mode MODE.  */

rtx
aarch64_ptrue_reg (machine_mode mode)
{
  gcc_assert (aarch64_sve_pred_mode_p (mode));
  rtx reg = force_reg (VNx16BImode, CONSTM1_RTX (VNx16BImode));
  return gen_lowpart (mode, reg);
}

/* Return an all-true (restricted to the leading VL bits) predicate register of
   mode MODE.  */

rtx
aarch64_ptrue_reg (machine_mode mode, unsigned int vl)
{
  gcc_assert (aarch64_sve_pred_mode_p (mode));

  rtx_vector_builder builder (VNx16BImode, vl, 2);

  for (unsigned i = 0; i < vl; i++)
    builder.quick_push (CONST1_RTX (BImode));

  for (unsigned i = 0; i < vl; i++)
    builder.quick_push (CONST0_RTX (BImode));

  rtx const_vec = builder.build ();
  rtx reg = force_reg (VNx16BImode, const_vec);
  return gen_lowpart (mode, reg);
}

/* Return a register of mode PRED_MODE for controlling data of mode DATA_MODE.

   DATA_MODE can be a scalar, an Advanced SIMD vector, or an SVE vector.
   If it's an N-byte scalar or an Advanced SIMD vector, the first N bits
   of the predicate will be active and the rest will be inactive.
   If DATA_MODE is an SVE mode, every bit of the predicate will be active.  */
rtx
aarch64_ptrue_reg (machine_mode pred_mode, machine_mode data_mode)
{
  if (aarch64_sve_mode_p (data_mode))
    return aarch64_ptrue_reg (pred_mode);

  auto size = GET_MODE_SIZE (data_mode).to_constant ();
  return aarch64_ptrue_reg (pred_mode, size);
}

/* Return an all-false predicate register of mode MODE.  */

rtx
aarch64_pfalse_reg (machine_mode mode)
{
  gcc_assert (aarch64_sve_pred_mode_p (mode));
  rtx reg = force_reg (VNx16BImode, CONST0_RTX (VNx16BImode));
  return gen_lowpart (mode, reg);
}

/* PRED1[0] is a PTEST predicate and PRED1[1] is an aarch64_sve_ptrue_flag
   for it.  PRED2[0] is the predicate for the instruction whose result
   is tested by the PTEST and PRED2[1] is again an aarch64_sve_ptrue_flag
   for it.  Return true if we can prove that the two predicates are
   equivalent for PTEST purposes; that is, if we can replace PRED2[0]
   with PRED1[0] without changing behavior.  */

bool
aarch64_sve_same_pred_for_ptest_p (rtx *pred1, rtx *pred2)
{
  machine_mode mode = GET_MODE (pred1[0]);
  gcc_assert (aarch64_sve_pred_mode_p (mode)
              && mode == GET_MODE (pred2[0])
              && aarch64_sve_ptrue_flag (pred1[1], SImode)
              && aarch64_sve_ptrue_flag (pred2[1], SImode));

  bool ptrue1_p = (pred1[0] == CONSTM1_RTX (mode)
                   || INTVAL (pred1[1]) == SVE_KNOWN_PTRUE);
  bool ptrue2_p = (pred2[0] == CONSTM1_RTX (mode)
                   || INTVAL (pred2[1]) == SVE_KNOWN_PTRUE);
  return (ptrue1_p && ptrue2_p) || rtx_equal_p (pred1[0], pred2[0]);
}

/* Emit a comparison CMP between OP0 and OP1, both of which have mode
   DATA_MODE, and return the result in a predicate of mode PRED_MODE.
   Use TARGET as the target register if nonnull and convenient.  */

static rtx
aarch64_sve_emit_int_cmp (rtx target, machine_mode pred_mode, rtx_code cmp,
                          machine_mode data_mode, rtx op1, rtx op2)
{
  insn_code icode = code_for_aarch64_pred_cmp (cmp, data_mode);
  expand_operand ops[5];
  create_output_operand (&ops[0], target, pred_mode);
  create_input_operand (&ops[1], CONSTM1_RTX (pred_mode), pred_mode);
  create_integer_operand (&ops[2], SVE_KNOWN_PTRUE);
  create_input_operand (&ops[3], op1, data_mode);
  create_input_operand (&ops[4], op2, data_mode);
  expand_insn (icode, 5, ops);
  return ops[0].value;
}

/* Use a comparison to convert integer vector SRC into MODE, which is
   the corresponding SVE predicate mode.  Use TARGET for the result
   if it's nonnull and convenient.  */

rtx
aarch64_convert_sve_data_to_pred (rtx target, machine_mode mode, rtx src)
{
  machine_mode src_mode = GET_MODE (src);
  return aarch64_sve_emit_int_cmp (target, mode, NE, src_mode,
                                   src, CONST0_RTX (src_mode));
}

/* Return the assembly token for svprfop value PRFOP.  */

static const char *
svprfop_token (enum aarch64_svprfop prfop)
{
  switch (prfop)
    {
#define CASE(UPPER, LOWER, VALUE) case AARCH64_SV_##UPPER: return #LOWER;
    AARCH64_FOR_SVPRFOP (CASE)
#undef CASE
    case AARCH64_NUM_SVPRFOPS:
      break;
    }
  gcc_unreachable ();
}

/* Return the assembly string for an SVE prefetch operation with
   mnemonic MNEMONIC, given that PRFOP_RTX is the prefetch operation
   and that SUFFIX is the format for the remaining operands.  */

char *
aarch64_output_sve_prefetch (const char *mnemonic, rtx prfop_rtx,
                             const char *suffix)
{
  static char buffer[128];
  aarch64_svprfop prfop = (aarch64_svprfop) INTVAL (prfop_rtx);
  unsigned int written = snprintf (buffer, sizeof (buffer), "%s\t%s, %s",
                                   mnemonic, svprfop_token (prfop), suffix);
  gcc_assert (written < sizeof (buffer));
  return buffer;
}

/* Check whether we can calculate the number of elements in PATTERN
   at compile time, given that there are NELTS_PER_VQ elements per
   128-bit block.  Return the value if so, otherwise return -1.  */

HOST_WIDE_INT
aarch64_fold_sve_cnt_pat (aarch64_svpattern pattern, unsigned int nelts_per_vq)
{
  unsigned int vl, const_vg;
  if (pattern >= AARCH64_SV_VL1 && pattern <= AARCH64_SV_VL8)
    vl = 1 + (pattern - AARCH64_SV_VL1);
  else if (pattern >= AARCH64_SV_VL16 && pattern <= AARCH64_SV_VL256)
    vl = 16 << (pattern - AARCH64_SV_VL16);
  else if (aarch64_sve_vg.is_constant (&const_vg))
    {
      /* There are two vector granules per quadword.  */
      unsigned int nelts = (const_vg / 2) * nelts_per_vq;
      switch (pattern)
        {
        case AARCH64_SV_POW2: return 1 << floor_log2 (nelts);
        case AARCH64_SV_MUL4: return nelts & -4;
        case AARCH64_SV_MUL3: return (nelts / 3) * 3;
        case AARCH64_SV_ALL: return nelts;
        default: gcc_unreachable ();
        }
    }
  else
    return -1;

  /* There are two vector granules per quadword.  */
  poly_uint64 nelts_all = exact_div (aarch64_sve_vg, 2) * nelts_per_vq;
  if (known_le (vl, nelts_all))
    return vl;

  /* Requesting more elements than are available results in a PFALSE.  */
  if (known_gt (vl, nelts_all))
    return 0;

  return -1;
}

/* Return true if a single CNT[BHWD] instruction can multiply FACTOR
   by the number of 128-bit quadwords in an SVE vector.  */

static bool
aarch64_sve_cnt_factor_p (HOST_WIDE_INT factor)
{
  /* The coefficient must be [1, 16] * {2, 4, 8, 16}.  */
  return (IN_RANGE (factor, 2, 16 * 16)
          && (factor & 1) == 0
          && factor <= 16 * (factor & -factor));
}
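
/* Worked example (illustrative): FACTOR == 24 is accepted, since
   24 == 3 * 8 can be produced by CNTH with mul #3 (or CNTD with
   mul #12), whereas FACTOR == 34 == 17 * 2 is rejected because the
   required multiplier 17 is outside the range [1, 16].  */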

/* Return true if we can move VALUE into a register using a single
   CNT[BHWD] instruction.  */

bool
aarch64_sve_cnt_immediate_p (poly_int64 value)
{
  HOST_WIDE_INT factor = value.coeffs[0];
  return value.coeffs[1] == factor && aarch64_sve_cnt_factor_p (factor);
}

/* Likewise for rtx X.  */

bool
aarch64_sve_cnt_immediate_p (rtx x)
{
  poly_int64 value;
  return poly_int_rtx_p (x, &value) && aarch64_sve_cnt_immediate_p (value);
}

/* Return the asm string for an instruction with a CNT-like vector size
   operand (a vector pattern followed by a multiplier in the range [1, 16]).
   PREFIX is the mnemonic without the size suffix and OPERANDS is the
   first part of the operands template (the part that comes before the
   vector size itself).  PATTERN is the pattern to use.  FACTOR is the
   number of quadwords.  NELTS_PER_VQ, if nonzero, is the number of elements
   in each quadword.  If it is zero, we can use any element size.  */

static char *
aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
                                  aarch64_svpattern pattern,
                                  unsigned int factor,
                                  unsigned int nelts_per_vq)
{
  static char buffer[sizeof ("sqincd\t%x0, %w0, vl256, mul #16")];

  if (nelts_per_vq == 0)
    /* There is some overlap in the ranges of the four CNT instructions.
       Here we always use the smallest possible element size, so that the
       multiplier is 1 wherever possible.  */
    nelts_per_vq = factor & -factor;
  int shift = std::min (exact_log2 (nelts_per_vq), 4);
  gcc_assert (IN_RANGE (shift, 1, 4));
  char suffix = "dwhb"[shift - 1];

  factor >>= shift;
  unsigned int written;
  if (pattern == AARCH64_SV_ALL && factor == 1)
    written = snprintf (buffer, sizeof (buffer), "%s%c\t%s",
                        prefix, suffix, operands);
  else if (factor == 1)
    written = snprintf (buffer, sizeof (buffer), "%s%c\t%s, %s",
                        prefix, suffix, operands, svpattern_token (pattern));
  else
    written = snprintf (buffer, sizeof (buffer), "%s%c\t%s, %s, mul #%d",
                        prefix, suffix, operands, svpattern_token (pattern),
                        factor);
  gcc_assert (written < sizeof (buffer));
  return buffer;
}
/* Return the asm string for an instruction with a CNT-like vector size
   operand (a vector pattern followed by a multiplier in the range [1, 16]).
   PREFIX is the mnemonic without the size suffix and OPERANDS is the
   first part of the operands template (the part that comes before the
   vector size itself).  X is the value of the vector size operand,
   as a polynomial integer rtx; we need to convert this into an "all"
   pattern with a multiplier.  */

char *
aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
				  rtx x)
{
  poly_int64 value = rtx_to_poly_int64 (x);
  gcc_assert (aarch64_sve_cnt_immediate_p (value));
  return aarch64_output_sve_cnt_immediate (prefix, operands, AARCH64_SV_ALL,
					   value.coeffs[1], 0);
}
/* Return the asm string for an instruction with a CNT-like vector size
   operand (a vector pattern followed by a multiplier in the range [1, 16]).
   PREFIX is the mnemonic without the size suffix and OPERANDS is the
   first part of the operands template (the part that comes before the
   vector size itself).  CNT_PAT[0..2] are the operands of the
   UNSPEC_SVE_CNT_PAT; see aarch64_sve_cnt_pat for details.  */

char *
aarch64_output_sve_cnt_pat_immediate (const char *prefix,
				      const char *operands, rtx *cnt_pat)
{
  aarch64_svpattern pattern = (aarch64_svpattern) INTVAL (cnt_pat[0]);
  unsigned int nelts_per_vq = INTVAL (cnt_pat[1]);
  unsigned int factor = INTVAL (cnt_pat[2]) * nelts_per_vq;
  return aarch64_output_sve_cnt_immediate (prefix, operands, pattern,
					   factor, nelts_per_vq);
}
/* Return true if we can add X using a single SVE INC or DEC instruction.  */

bool
aarch64_sve_scalar_inc_dec_immediate_p (rtx x)
{
  poly_int64 value;
  return (poly_int_rtx_p (x, &value)
	  && (aarch64_sve_cnt_immediate_p (value)
	      || aarch64_sve_cnt_immediate_p (-value)));
}
/* Return the asm string for adding SVE INC/DEC immediate OFFSET to
   register operand 0.  */

char *
aarch64_output_sve_scalar_inc_dec (rtx offset)
{
  poly_int64 offset_value = rtx_to_poly_int64 (offset);
  gcc_assert (offset_value.coeffs[0] == offset_value.coeffs[1]);
  if (offset_value.coeffs[1] > 0)
    return aarch64_output_sve_cnt_immediate ("inc", "%x0", AARCH64_SV_ALL,
					     offset_value.coeffs[1], 0);
  else
    return aarch64_output_sve_cnt_immediate ("dec", "%x0", AARCH64_SV_ALL,
					     -offset_value.coeffs[1], 0);
}
/* Return true if a single RDVL instruction can multiply FACTOR by the
   number of 128-bit quadwords in an SVE vector.  This is also the
   range of ADDVL.  */

static bool
aarch64_sve_rdvl_addvl_factor_p (HOST_WIDE_INT factor)
{
  return (multiple_p (factor, 16)
	  && IN_RANGE (factor, -32 * 16, 31 * 16));
}

/* Return true if ADDPL can be used to add FACTOR multiplied by the number
   of quadwords in an SVE vector.  */

static bool
aarch64_sve_addpl_factor_p (HOST_WIDE_INT factor)
{
  return (multiple_p (factor, 2)
	  && IN_RANGE (factor, -32 * 2, 31 * 2));
}

/* Return true if we can move VALUE into a register using a single
   RDVL instruction.  */

static bool
aarch64_sve_rdvl_immediate_p (poly_int64 value)
{
  HOST_WIDE_INT factor = value.coeffs[0];
  return value.coeffs[1] == factor && aarch64_sve_rdvl_addvl_factor_p (factor);
}
/* Likewise for rtx X.  */

bool
aarch64_sve_rdvl_immediate_p (rtx x)
{
  poly_int64 value;
  return poly_int_rtx_p (x, &value) && aarch64_sve_rdvl_immediate_p (value);
}

/* Return the asm string for moving RDVL immediate OFFSET into register
   operand 0.  */

char *
aarch64_output_sve_rdvl (rtx offset)
{
  static char buffer[sizeof ("rdvl\t%x0, #-") + 3 * sizeof (int)];
  poly_int64 offset_value = rtx_to_poly_int64 (offset);
  gcc_assert (aarch64_sve_rdvl_immediate_p (offset_value));

  int factor = offset_value.coeffs[1];
  snprintf (buffer, sizeof (buffer), "rdvl\t%%x0, #%d", factor / 16);
  return buffer;
}
/* Return true if we can add VALUE to a register using a single ADDVL
   or ADDPL instruction.  */

static bool
aarch64_sve_addvl_addpl_immediate_p (poly_int64 value)
{
  HOST_WIDE_INT factor = value.coeffs[0];
  if (factor == 0 || value.coeffs[1] != factor)
    return false;
  return (aarch64_sve_rdvl_addvl_factor_p (factor)
	  || aarch64_sve_addpl_factor_p (factor));
}

/* Likewise for rtx X.  */

bool
aarch64_sve_addvl_addpl_immediate_p (rtx x)
{
  poly_int64 value;
  return (poly_int_rtx_p (x, &value)
	  && aarch64_sve_addvl_addpl_immediate_p (value));
}

/* Return the asm string for adding ADDVL or ADDPL immediate OFFSET
   to operand 1 and storing the result in operand 0.  */

char *
aarch64_output_sve_addvl_addpl (rtx offset)
{
  static char buffer[sizeof ("addpl\t%x0, %x1, #-") + 3 * sizeof (int)];
  poly_int64 offset_value = rtx_to_poly_int64 (offset);
  gcc_assert (aarch64_sve_addvl_addpl_immediate_p (offset_value));

  int factor = offset_value.coeffs[1];
  if ((factor & 15) == 0)
    snprintf (buffer, sizeof (buffer), "addvl\t%%x0, %%x1, #%d", factor / 16);
  else
    snprintf (buffer, sizeof (buffer), "addpl\t%%x0, %%x1, #%d", factor / 2);
  return buffer;
}
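
/* For example, an OFFSET of {96, 96} is a multiple of 16 and yields
   "addvl\t%x0, %x1, #6", whereas {40, 40} is not and instead yields
   "addpl\t%x0, %x1, #20".  */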
/* Return true if X is a valid immediate for an SVE vector INC or DEC
   instruction.  If it is, store the number of elements in each vector
   quadword in *NELTS_PER_VQ_OUT (if nonnull) and store the multiplication
   factor in *FACTOR_OUT (if nonnull).  */

bool
aarch64_sve_vector_inc_dec_immediate_p (rtx x, int *factor_out,
					unsigned int *nelts_per_vq_out)
{
  rtx elt;
  poly_int64 value;

  if (!const_vec_duplicate_p (x, &elt)
      || !poly_int_rtx_p (elt, &value))
    return false;

  unsigned int nelts_per_vq = 128 / GET_MODE_UNIT_BITSIZE (GET_MODE (x));
  if (nelts_per_vq != 8 && nelts_per_vq != 4 && nelts_per_vq != 2)
    /* There's no vector INCB.  */
    return false;

  HOST_WIDE_INT factor = value.coeffs[0];
  if (value.coeffs[1] != factor)
    return false;

  /* The coefficient must be [1, 16] * NELTS_PER_VQ.  */
  if ((factor % nelts_per_vq) != 0
      || !IN_RANGE (abs (factor), nelts_per_vq, 16 * nelts_per_vq))
    return false;

  if (factor_out)
    *factor_out = factor;
  if (nelts_per_vq_out)
    *nelts_per_vq_out = nelts_per_vq;
  return true;
}

/* Return true if X is a valid immediate for an SVE vector INC or DEC
   instruction.  */

bool
aarch64_sve_vector_inc_dec_immediate_p (rtx x)
{
  return aarch64_sve_vector_inc_dec_immediate_p (x, NULL, NULL);
}
/* Return the asm template for an SVE vector INC or DEC instruction.
   OPERANDS gives the operands before the vector count and X is the
   value of the vector count operand itself.  */

char *
aarch64_output_sve_vector_inc_dec (const char *operands, rtx x)
{
  int factor;
  unsigned int nelts_per_vq;
  if (!aarch64_sve_vector_inc_dec_immediate_p (x, &factor, &nelts_per_vq))
    gcc_unreachable ();
  if (factor < 0)
    return aarch64_output_sve_cnt_immediate ("dec", operands, AARCH64_SV_ALL,
					     -factor, nelts_per_vq);
  else
    return aarch64_output_sve_cnt_immediate ("inc", operands, AARCH64_SV_ALL,
					     factor, nelts_per_vq);
}
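
/* For example, a vector of 16-bit elements whose value is the duplicated
   poly_int64 {16, 16} has NELTS_PER_VQ == 8 and FACTOR == 16, and is
   printed as something like "inch\t<operands>, all, mul #2"; the negated
   constant would use "dech" instead.  */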
/* Return a constant that represents FACTOR multiplied by the
   number of 128-bit quadwords in an SME vector.  ISA_MODE is the
   ISA mode in which the calculation is being performed.  */

static rtx
aarch64_sme_vq_immediate (machine_mode mode, HOST_WIDE_INT factor,
			  aarch64_isa_mode isa_mode)
{
  gcc_assert (aarch64_sve_rdvl_addvl_factor_p (factor));
  if (isa_mode & AARCH64_ISA_MODE_SM_ON)
    /* We're in streaming mode, so we can use normal poly-int values.  */
    return gen_int_mode ({ factor, factor }, mode);

  rtvec vec = gen_rtvec (1, gen_int_mode (factor, SImode));
  rtx unspec = gen_rtx_UNSPEC (mode, vec, UNSPEC_SME_VQ);
  return gen_rtx_CONST (mode, unspec);
}
/* Return true if X is a constant that represents some number X
   multiplied by the number of quadwords in an SME vector.  Store this X
   in *FACTOR if so.  */

static bool
aarch64_sme_vq_unspec_p (const_rtx x, HOST_WIDE_INT *factor)
{
  if (!TARGET_SME || GET_CODE (x) != CONST)
    return false;

  x = XEXP (x, 0);
  if (GET_CODE (x) != UNSPEC
      || XINT (x, 1) != UNSPEC_SME_VQ
      || XVECLEN (x, 0) != 1)
    return false;

  x = XVECEXP (x, 0, 0);
  if (!CONST_INT_P (x))
    return false;

  *factor = INTVAL (x);
  return true;
}

/* Return true if X is a constant that represents some number Y
   multiplied by the number of quadwords in an SME vector, and if
   that Y is in the range of RDSVL.  */

bool
aarch64_rdsvl_immediate_p (const_rtx x)
{
  HOST_WIDE_INT factor;
  return (aarch64_sme_vq_unspec_p (x, &factor)
	  && aarch64_sve_rdvl_addvl_factor_p (factor));
}
/* Return the asm string for an RDSVL instruction that calculates X,
   which is a constant that satisfies aarch64_rdsvl_immediate_p.  */

char *
aarch64_output_rdsvl (const_rtx x)
{
  gcc_assert (aarch64_rdsvl_immediate_p (x));
  static char buffer[sizeof ("rdsvl\t%x0, #-") + 3 * sizeof (int)];
  x = XVECEXP (XEXP (x, 0), 0, 0);
  snprintf (buffer, sizeof (buffer), "rdsvl\t%%x0, #%d",
	    (int) INTVAL (x) / 16);
  return buffer;
}
/* Return true if X is a constant that can be added using ADDSVL or ADDSPL.  */

bool
aarch64_addsvl_addspl_immediate_p (const_rtx x)
{
  HOST_WIDE_INT factor;
  return (aarch64_sme_vq_unspec_p (x, &factor)
	  && (aarch64_sve_rdvl_addvl_factor_p (factor)
	      || aarch64_sve_addpl_factor_p (factor)));
}

/* X is a constant that satisfies aarch64_addsvl_addspl_immediate_p.
   Return the asm string for the associated instruction.  */

char *
aarch64_output_addsvl_addspl (rtx x)
{
  static char buffer[sizeof ("addspl\t%x0, %x1, #-") + 3 * sizeof (int)];
  HOST_WIDE_INT factor;
  if (!aarch64_sme_vq_unspec_p (x, &factor))
    gcc_unreachable ();
  if (aarch64_sve_rdvl_addvl_factor_p (factor))
    snprintf (buffer, sizeof (buffer), "addsvl\t%%x0, %%x1, #%d",
	      (int) factor / 16);
  else if (aarch64_sve_addpl_factor_p (factor))
    snprintf (buffer, sizeof (buffer), "addspl\t%%x0, %%x1, #%d",
	      (int) factor / 2);
  else
    gcc_unreachable ();
  return buffer;
}
/* Multipliers for repeating bitmasks of width 32, 16, 8, 4, and 2.  */

static const unsigned HOST_WIDE_INT bitmask_imm_mul[] =
  {
    0x0000000100000001ull,
    0x0001000100010001ull,
    0x0101010101010101ull,
    0x1111111111111111ull,
    0x5555555555555555ull,
  };
/* Return true if 64-bit VAL is a valid bitmask immediate.  */
static bool
aarch64_bitmask_imm (unsigned HOST_WIDE_INT val)
{
  unsigned HOST_WIDE_INT tmp, mask, first_one, next_one;
  int bits;

  /* Check for a single sequence of one bits and return quickly if so.
     The special cases of all ones and all zeroes return false.  */
  tmp = val + (val & -val);

  if (tmp == (tmp & -tmp))
    return (val + 1) > 1;

  /* Invert if the immediate doesn't start with a zero bit - this means we
     only need to search for sequences of one bits.  */
  if (val & 1)
    val = ~val;

  /* Find the first set bit and set tmp to val with the first sequence of one
     bits removed.  Return success if there is a single sequence of ones.  */
  first_one = val & -val;
  tmp = val & (val + first_one);

  if (tmp == 0)
    return true;

  /* Find the next set bit and compute the difference in bit position.  */
  next_one = tmp & -tmp;
  bits = clz_hwi (first_one) - clz_hwi (next_one);
  mask = val ^ tmp;

  /* Check the bit position difference is a power of 2, and that the first
     sequence of one bits fits within 'bits' bits.  */
  if ((mask >> bits) != 0 || bits != (bits & -bits))
    return false;

  /* Check the sequence of one bits is repeated 64/bits times.  */
  return val == mask * bitmask_imm_mul[__builtin_clz (bits) - 26];
}
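
/* For example, 0x0000000000ff0000 (a single run of ones),
   0x0f0f0f0f0f0f0f0f (a 4-bit run repeated every 8 bits) and
   0xfffffffffff00fff (the complement of a single run) are all bitmask
   immediates, whereas 0, ~0ull and 0x0000000000123456 are not.  The final
   multiplication replicates the first run across the whole 64 bits,
   e.g. 0x3c * 0x0101010101010101 == 0x3c3c3c3c3c3c3c3c.  */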
/* Return true if VAL is a valid bitmask immediate for MODE.  */
bool
aarch64_bitmask_imm (unsigned HOST_WIDE_INT val, machine_mode mode)
{
  if (mode == DImode)
    return aarch64_bitmask_imm (val);

  if (mode == SImode)
    return aarch64_bitmask_imm ((val & 0xffffffff) | (val << 32));

  /* Replicate small immediates to fit 64 bits.  */
  int size = GET_MODE_UNIT_PRECISION (mode);
  val &= (HOST_WIDE_INT_1U << size) - 1;
  val *= bitmask_imm_mul[__builtin_clz (size) - 26];

  return aarch64_bitmask_imm (val);
}
/* Return true if the immediate VAL can be a bitfield immediate
   by changing the given MASK bits in VAL to zeroes, ones or bits
   from the other half of VAL.  Return the new immediate in VAL2.  */
static bool
aarch64_check_bitmask (unsigned HOST_WIDE_INT val,
		       unsigned HOST_WIDE_INT &val2,
		       unsigned HOST_WIDE_INT mask)
{
  val2 = val & ~mask;
  if (val2 != val && aarch64_bitmask_imm (val2))
    return true;
  val2 = val | mask;
  if (val2 != val && aarch64_bitmask_imm (val2))
    return true;
  val = val & ~mask;
  val2 = val | (((val >> 32) | (val << 32)) & mask);
  if (val2 != val && aarch64_bitmask_imm (val2))
    return true;
  val2 = val | (((val >> 16) | (val << 48)) & mask);
  if (val2 != val && aarch64_bitmask_imm (val2))
    return true;
  return false;
}
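
/* For example, with VAL == 0xff00ff00ff001234 and MASK == 0xffff, copying
   the upper 32 bits over the masked bits gives VAL2 == 0xff00ff00ff00ff00,
   which is a valid bitmask immediate; VAL can then be built with a MOV of
   VAL2 followed by MOVK #0x1234.  */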
/* Return true if VAL is a valid MOVZ immediate.  */
static bool
aarch64_is_movz (unsigned HOST_WIDE_INT val)
{
  return (val >> (ctz_hwi (val) & 48)) < 65536;
}


/* Return true if immediate VAL can be created by a 64-bit MOVI/MOVN/MOVZ.  */
bool
aarch64_is_mov_xn_imm (unsigned HOST_WIDE_INT val)
{
  return aarch64_is_movz (val) || aarch64_is_movz (~val)
	 || aarch64_bitmask_imm (val);
}


/* Return true if VAL is an immediate that can be created by a single
   MOV instruction.  */
bool
aarch64_move_imm (unsigned HOST_WIDE_INT val, machine_mode mode)
{
  gcc_assert (mode == SImode || mode == DImode);

  unsigned HOST_WIDE_INT mask =
    (val >> 32) == 0 || mode == SImode ? 0xffffffff : HOST_WIDE_INT_M1U;

  if (aarch64_is_movz (val & mask) || aarch64_is_movz (~val & mask))
    return true;

  val = (val & mask) | ((val << 32) & ~mask);
  return aarch64_bitmask_imm (val);
}
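
/* For example, 0x00000000abcd0000 needs only a MOVZ, 0xffffffffedcbffff
   only a MOVN, and 0x0f0f0f0f0f0f0f0f only a bitmask MOV (ORR with XZR),
   so all three satisfy aarch64_move_imm for DImode.  */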
4467 aarch64_internal_mov_immediate (rtx dest
, rtx imm
, bool generate
,
4471 unsigned HOST_WIDE_INT val
, val2
, val3
, mask
;
4472 int one_match
, zero_match
;
4475 gcc_assert (mode
== SImode
|| mode
== DImode
);
4479 if (aarch64_move_imm (val
, mode
))
4482 emit_insn (gen_rtx_SET (dest
, imm
));
4486 if ((val
>> 32) == 0 || mode
== SImode
)
4490 emit_insn (gen_rtx_SET (dest
, GEN_INT (val
& 0xffff)));
4492 emit_insn (gen_insv_immsi (dest
, GEN_INT (16),
4493 GEN_INT ((val
>> 16) & 0xffff)));
4495 emit_insn (gen_insv_immdi (dest
, GEN_INT (16),
4496 GEN_INT ((val
>> 16) & 0xffff)));
4501 /* Remaining cases are all for DImode. */
4504 zero_match
= ((val
& mask
) == 0) + ((val
& (mask
<< 16)) == 0) +
4505 ((val
& (mask
<< 32)) == 0) + ((val
& (mask
<< 48)) == 0);
4506 one_match
= ((~val
& mask
) == 0) + ((~val
& (mask
<< 16)) == 0) +
4507 ((~val
& (mask
<< 32)) == 0) + ((~val
& (mask
<< 48)) == 0);
4509 /* Try a bitmask immediate and a movk to generate the immediate
4510 in 2 instructions. */
4512 if (zero_match
< 2 && one_match
< 2)
4514 for (i
= 0; i
< 64; i
+= 16)
4516 if (aarch64_check_bitmask (val
, val2
, mask
<< i
))
4519 val2
= val
& ~(mask
<< i
);
4520 if ((val2
>> 32) == 0 && aarch64_move_imm (val2
, DImode
))
4528 emit_insn (gen_rtx_SET (dest
, GEN_INT (val2
)));
4529 emit_insn (gen_insv_immdi (dest
, GEN_INT (i
),
4530 GEN_INT ((val
>> i
) & 0xffff)));
4535 /* Try 2 bitmask immediates which are xor'd together. */
4536 for (i
= 0; i
< 64; i
+= 16)
4538 val2
= (val
>> i
) & mask
;
4541 if (aarch64_bitmask_imm (val2
) && aarch64_bitmask_imm (val
^ val2
))
4549 emit_insn (gen_rtx_SET (dest
, GEN_INT (val2
)));
4550 emit_insn (gen_xordi3 (dest
, dest
, GEN_INT (val
^ val2
)));
4556 /* Try a bitmask plus 2 movk to generate the immediate in 3 instructions. */
4557 if (zero_match
+ one_match
== 0)
4559 for (i
= 0; i
< 48; i
+= 16)
4560 for (int j
= i
+ 16; j
< 64; j
+= 16)
4561 if (aarch64_check_bitmask (val
, val2
, (mask
<< i
) | (mask
<< j
)))
4565 emit_insn (gen_rtx_SET (dest
, GEN_INT (val2
)));
4566 emit_insn (gen_insv_immdi (dest
, GEN_INT (i
),
4567 GEN_INT ((val
>> i
) & 0xffff)));
4568 emit_insn (gen_insv_immdi (dest
, GEN_INT (j
),
4569 GEN_INT ((val
>> j
) & 0xffff)));
4574 /* Try shifting and inserting the bottom 32-bits into the top bits. */
4575 val2
= val
& 0xffffffff;
4577 val3
= val2
| (val3
<< 32);
4578 for (i
= 17; i
< 48; i
++)
4579 if ((val2
| (val2
<< i
)) == val
)
4583 emit_insn (gen_rtx_SET (dest
, GEN_INT (val2
& 0xffff)));
4584 emit_insn (gen_insv_immdi (dest
, GEN_INT (16),
4585 GEN_INT (val2
>> 16)));
4586 emit_insn (gen_ior_ashldi3 (dest
, dest
, GEN_INT (i
), dest
));
4590 else if ((val3
& ~(val3
<< i
)) == val
)
4594 emit_insn (gen_rtx_SET (dest
, GEN_INT (val3
| 0xffff0000)));
4595 emit_insn (gen_insv_immdi (dest
, GEN_INT (16),
4596 GEN_INT (val2
>> 16)));
4597 emit_insn (gen_and_one_cmpl_ashldi3 (dest
, dest
, GEN_INT (i
),
4604 /* Generate 2-4 instructions, skipping 16 bits of all zeroes or ones which
4605 are emitted by the initial mov. If one_match > zero_match, skip set bits,
4606 otherwise skip zero bits. */
4610 val2
= one_match
> zero_match
? ~val
: val
;
4611 i
= (val2
& mask
) != 0 ? 0 : (val2
& (mask
<< 16)) != 0 ? 16 : 32;
4614 emit_insn (gen_rtx_SET (dest
, GEN_INT (one_match
> zero_match
4615 ? (val
| ~(mask
<< i
))
4616 : (val
& (mask
<< i
)))));
4617 for (i
+= 16; i
< 64; i
+= 16)
4619 if ((val2
& (mask
<< i
)) == 0)
4622 emit_insn (gen_insv_immdi (dest
, GEN_INT (i
),
4623 GEN_INT ((val
>> i
) & 0xffff)));
/* Return whether imm is a 128-bit immediate which is simple enough to
   expand inline.  */
bool
aarch64_mov128_immediate (rtx imm)
{
  if (CONST_INT_P (imm))
    return true;

  gcc_assert (CONST_WIDE_INT_NUNITS (imm) == 2);

  rtx lo = GEN_INT (CONST_WIDE_INT_ELT (imm, 0));
  rtx hi = GEN_INT (CONST_WIDE_INT_ELT (imm, 1));

  return aarch64_internal_mov_immediate (NULL_RTX, lo, false, DImode)
	 + aarch64_internal_mov_immediate (NULL_RTX, hi, false, DImode) <= 4;
}
/* Return true if val can be encoded as a 12-bit unsigned immediate with
   a left shift of 0 or 12 bits.  */
bool
aarch64_uimm12_shift (unsigned HOST_WIDE_INT val)
{
  return val < 4096 || (val & 0xfff000) == val;
}
/* Returns the nearest value to VAL that will fit as a 12-bit unsigned immediate
   that can be created with a left shift of 0 or 12.  */
static HOST_WIDE_INT
aarch64_clamp_to_uimm12_shift (unsigned HOST_WIDE_INT val)
{
  /* Check to see if the value fits in 24 bits, as that is the maximum we can
     handle correctly.  */
  gcc_assert (val < 0x1000000);

  if (aarch64_uimm12_shift (val))
    return val;

  return val & 0xfff000;
}
/* Test whether:

     X = (X & AND_VAL) | IOR_VAL;

   can be implemented using:

     MOVK X, #(IOR_VAL >> shift), LSL #shift

   Return the shift if so, otherwise return -1.  */
int
aarch64_movk_shift (const wide_int_ref &and_val,
		    const wide_int_ref &ior_val)
{
  unsigned int precision = and_val.get_precision ();
  unsigned HOST_WIDE_INT mask = 0xffff;
  for (unsigned int shift = 0; shift < precision; shift += 16)
    {
      if (and_val == ~mask && (ior_val & mask) == ior_val)
	return shift;
      mask <<= 16;
    }
  return -1;
}
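
/* For example, AND_VAL == 0xffffffff0000ffff with IOR_VAL == 0x12340000
   matches at SHIFT == 16, so the combination can be done with a single
   "MOVK <reg>, #0x1234, LSL #16".  */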
/* Create mask of ones, covering the lowest to highest bits set in VAL_IN.
   Assumed precondition: VAL_IN is not zero.  */

unsigned HOST_WIDE_INT
aarch64_and_split_imm1 (HOST_WIDE_INT val_in)
{
  int lowest_bit_set = ctz_hwi (val_in);
  int highest_bit_set = floor_log2 (val_in);
  gcc_assert (val_in != 0);

  return ((HOST_WIDE_INT_UC (2) << highest_bit_set) -
	  (HOST_WIDE_INT_1U << lowest_bit_set));
}

/* Create constant where bits outside of lowest bit set to highest bit set
   are set to 1.  */

unsigned HOST_WIDE_INT
aarch64_and_split_imm2 (HOST_WIDE_INT val_in)
{
  return val_in | ~aarch64_and_split_imm1 (val_in);
}
/* Return true if VAL_IN is a valid 'and' bitmask immediate.  */

bool
aarch64_and_bitmask_imm (unsigned HOST_WIDE_INT val_in, machine_mode mode)
{
  scalar_int_mode int_mode;
  if (!is_a <scalar_int_mode> (mode, &int_mode))
    return false;

  if (aarch64_bitmask_imm (val_in, int_mode))
    return false;

  if (aarch64_move_imm (val_in, int_mode))
    return false;

  unsigned HOST_WIDE_INT imm2 = aarch64_and_split_imm2 (val_in);

  return aarch64_bitmask_imm (imm2, int_mode);
}
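
/* For example, x & 0x00f00ff0 cannot be done with one AND or one MOV
   (0x00f00ff0 is neither a bitmask immediate nor a single-MOV constant),
   but aarch64_and_split_imm1 gives 0x00fffff0 and aarch64_and_split_imm2
   gives 0xfffffffffff00fff, both bitmask immediates, so the operation can
   be split into two ANDs.  */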
/* Return the number of temporary registers that aarch64_add_offset_1
   would need to add OFFSET to a register.  */

static unsigned int
aarch64_add_offset_1_temporaries (HOST_WIDE_INT offset)
{
  return absu_hwi (offset) < 0x1000000 ? 0 : 1;
}
4748 /* A subroutine of aarch64_add_offset. Set DEST to SRC + OFFSET for
4749 a non-polynomial OFFSET. MODE is the mode of the addition.
4750 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
4751 be set and CFA adjustments added to the generated instructions.
4753 TEMP1, if nonnull, is a register of mode MODE that can be used as a
4754 temporary if register allocation is already complete. This temporary
4755 register may overlap DEST but must not overlap SRC. If TEMP1 is known
4756 to hold abs (OFFSET), EMIT_MOVE_IMM can be set to false to avoid emitting
4757 the immediate again.
4759 Since this function may be used to adjust the stack pointer, we must
4760 ensure that it cannot cause transient stack deallocation (for example
4761 by first incrementing SP and then decrementing when adjusting by a
4762 large immediate). */
4765 aarch64_add_offset_1 (scalar_int_mode mode
, rtx dest
,
4766 rtx src
, HOST_WIDE_INT offset
, rtx temp1
,
4767 bool frame_related_p
, bool emit_move_imm
)
4769 gcc_assert (emit_move_imm
|| temp1
!= NULL_RTX
);
4770 gcc_assert (temp1
== NULL_RTX
|| !reg_overlap_mentioned_p (temp1
, src
));
4772 unsigned HOST_WIDE_INT moffset
= absu_hwi (offset
);
4777 if (!rtx_equal_p (dest
, src
))
4779 insn
= emit_insn (gen_rtx_SET (dest
, src
));
4780 RTX_FRAME_RELATED_P (insn
) = frame_related_p
;
4785 /* Single instruction adjustment. */
4786 if (aarch64_uimm12_shift (moffset
))
4788 insn
= emit_insn (gen_add3_insn (dest
, src
, GEN_INT (offset
)));
4789 RTX_FRAME_RELATED_P (insn
) = frame_related_p
;
4793 /* Emit 2 additions/subtractions if the adjustment is less than 24 bits
4796 a) the offset cannot be loaded by a 16-bit move or
4797 b) there is no spare register into which we can move it. */
4798 if (moffset
< 0x1000000
4799 && ((!temp1
&& !can_create_pseudo_p ())
4800 || !aarch64_move_imm (moffset
, mode
)))
4802 HOST_WIDE_INT low_off
= moffset
& 0xfff;
4804 low_off
= offset
< 0 ? -low_off
: low_off
;
4805 insn
= emit_insn (gen_add3_insn (dest
, src
, GEN_INT (low_off
)));
4806 RTX_FRAME_RELATED_P (insn
) = frame_related_p
;
4807 insn
= emit_insn (gen_add2_insn (dest
, GEN_INT (offset
- low_off
)));
4808 RTX_FRAME_RELATED_P (insn
) = frame_related_p
;
4812 /* Emit a move immediate if required and an addition/subtraction. */
4815 gcc_assert (temp1
!= NULL_RTX
|| can_create_pseudo_p ());
4816 temp1
= aarch64_force_temporary (mode
, temp1
,
4817 gen_int_mode (moffset
, mode
));
4819 insn
= emit_insn (offset
< 0
4820 ? gen_sub3_insn (dest
, src
, temp1
)
4821 : gen_add3_insn (dest
, src
, temp1
));
4822 if (frame_related_p
)
4824 RTX_FRAME_RELATED_P (insn
) = frame_related_p
;
4825 rtx adj
= plus_constant (mode
, src
, offset
);
4826 add_reg_note (insn
, REG_CFA_ADJUST_CFA
, gen_rtx_SET (dest
, adj
));
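
/* For example, if OFFSET is 0x123456 and the constant cannot be built with
   a single 16-bit move, the two-addition path above produces
   "add\tdest, src, #0x456" followed by "add\tdest, dest, #0x123000", both
   of which are valid 12-bit (optionally shifted) immediates.  */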
4830 /* Return the number of temporary registers that aarch64_add_offset
4831 would need to move OFFSET into a register or add OFFSET to a register;
4832 ADD_P is true if we want the latter rather than the former. */
4835 aarch64_offset_temporaries (bool add_p
, poly_int64 offset
)
4837 /* This follows the same structure as aarch64_add_offset. */
4838 if (add_p
&& aarch64_sve_addvl_addpl_immediate_p (offset
))
4841 unsigned int count
= 0;
4842 HOST_WIDE_INT factor
= offset
.coeffs
[1];
4843 HOST_WIDE_INT constant
= offset
.coeffs
[0] - factor
;
4844 poly_int64
poly_offset (factor
, factor
);
4845 if (add_p
&& aarch64_sve_addvl_addpl_immediate_p (poly_offset
))
4846 /* Need one register for the ADDVL/ADDPL result. */
4848 else if (factor
!= 0)
4850 factor
/= (HOST_WIDE_INT
) least_bit_hwi (factor
);
4851 if (!IN_RANGE (factor
, -32, 31))
4852 /* Need one register for the CNT or RDVL result and one for the
4853 multiplication factor. If necessary, the second temporary
4854 can be reused for the constant part of the offset. */
4856 /* Need one register for the CNT or RDVL result (which might then
4860 return count
+ aarch64_add_offset_1_temporaries (constant
);
4863 /* If X can be represented as a poly_int64, return the number
4864 of temporaries that are required to add it to a register.
4865 Return -1 otherwise. */
4868 aarch64_add_offset_temporaries (rtx x
)
4871 if (!poly_int_rtx_p (x
, &offset
))
4873 return aarch64_offset_temporaries (true, offset
);
4876 /* Set DEST to SRC + OFFSET. MODE is the mode of the addition.
4877 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
4878 be set and CFA adjustments added to the generated instructions.
4880 TEMP1, if nonnull, is a register of mode MODE that can be used as a
4881 temporary if register allocation is already complete. This temporary
4882 register may overlap DEST if !FRAME_RELATED_P but must not overlap SRC.
4883 If TEMP1 is known to hold abs (OFFSET), EMIT_MOVE_IMM can be set to
4884 false to avoid emitting the immediate again.
4886 TEMP2, if nonnull, is a second temporary register that doesn't
4887 overlap either DEST or REG.
4889 FORCE_ISA_MODE is AARCH64_ISA_MODE_SM_ON if any variable component of OFFSET
4890 is measured relative to the SME vector length instead of the current
4891 prevailing vector length. It is 0 otherwise.
4893 Since this function may be used to adjust the stack pointer, we must
4894 ensure that it cannot cause transient stack deallocation (for example
4895 by first incrementing SP and then decrementing when adjusting by a
4896 large immediate). */
4899 aarch64_add_offset (scalar_int_mode mode
, rtx dest
, rtx src
,
4900 poly_int64 offset
, rtx temp1
, rtx temp2
,
4901 aarch64_isa_mode force_isa_mode
,
4902 bool frame_related_p
, bool emit_move_imm
= true)
4904 gcc_assert (emit_move_imm
|| temp1
!= NULL_RTX
);
4905 gcc_assert (temp1
== NULL_RTX
|| !reg_overlap_mentioned_p (temp1
, src
));
4906 gcc_assert (temp1
== NULL_RTX
4908 || !reg_overlap_mentioned_p (temp1
, dest
));
4909 gcc_assert (temp2
== NULL_RTX
|| !reg_overlap_mentioned_p (dest
, temp2
));
4911 /* Try using ADDVL or ADDPL to add the whole value. */
4912 if (src
!= const0_rtx
&& aarch64_sve_addvl_addpl_immediate_p (offset
))
4914 gcc_assert (offset
.coeffs
[0] == offset
.coeffs
[1]);
4916 if (force_isa_mode
== 0)
4917 offset_rtx
= gen_int_mode (offset
, mode
);
4919 offset_rtx
= aarch64_sme_vq_immediate (mode
, offset
.coeffs
[0], 0);
4920 rtx_insn
*insn
= emit_insn (gen_add3_insn (dest
, src
, offset_rtx
));
4921 RTX_FRAME_RELATED_P (insn
) = frame_related_p
;
4922 if (frame_related_p
&& (force_isa_mode
& AARCH64_ISA_MODE_SM_ON
))
4923 add_reg_note (insn
, REG_CFA_ADJUST_CFA
,
4924 gen_rtx_SET (dest
, plus_constant (Pmode
, src
,
4929 /* Coefficient 1 is multiplied by the number of 128-bit blocks in an
4930 SVE vector register, over and above the minimum size of 128 bits.
4931 This is equivalent to half the value returned by CNTD with a
4932 vector shape of ALL. */
4933 HOST_WIDE_INT factor
= offset
.coeffs
[1];
4934 HOST_WIDE_INT constant
= offset
.coeffs
[0] - factor
;
4936 /* Try using ADDVL or ADDPL to add the VG-based part. */
4937 poly_int64
poly_offset (factor
, factor
);
4938 if (src
!= const0_rtx
4939 && aarch64_sve_addvl_addpl_immediate_p (poly_offset
))
4942 if (force_isa_mode
== 0)
4943 offset_rtx
= gen_int_mode (poly_offset
, mode
);
4945 offset_rtx
= aarch64_sme_vq_immediate (mode
, factor
, 0);
4946 if (frame_related_p
)
4948 rtx_insn
*insn
= emit_insn (gen_add3_insn (dest
, src
, offset_rtx
));
4949 RTX_FRAME_RELATED_P (insn
) = true;
4950 if (force_isa_mode
& AARCH64_ISA_MODE_SM_ON
)
4951 add_reg_note (insn
, REG_CFA_ADJUST_CFA
,
4952 gen_rtx_SET (dest
, plus_constant (Pmode
, src
,
4958 rtx addr
= gen_rtx_PLUS (mode
, src
, offset_rtx
);
4959 src
= aarch64_force_temporary (mode
, temp1
, addr
);
4964 /* Otherwise use a CNT-based sequence. */
4965 else if (factor
!= 0)
4967 /* Calculate CNTB * FACTOR / 16 as CNTB * REL_FACTOR * 2**SHIFT,
4968 with negative shifts indicating a shift right. */
4969 HOST_WIDE_INT low_bit
= least_bit_hwi (factor
);
4970 HOST_WIDE_INT rel_factor
= factor
/ low_bit
;
4971 int shift
= exact_log2 (low_bit
) - 4;
4972 gcc_assert (shift
>= -4 && (rel_factor
& 1) != 0);
4974 /* Set CODE, VAL and SHIFT so that [+-] VAL * 2**SHIFT is
4975 equal to CNTB * FACTOR / 16, with CODE being the [+-].
4977 We can avoid a multiplication if REL_FACTOR is in the range
4978 of RDVL, although there are then various optimizations that
4979 we can try on top. */
4980 rtx_code code
= PLUS
;
4982 if (IN_RANGE (rel_factor
, -32, 31))
4984 if (force_isa_mode
& AARCH64_ISA_MODE_SM_ON
)
4986 /* Try to use an unshifted RDSVL, otherwise fall back on
4987 a shifted RDSVL #1. */
4988 if (aarch64_sve_rdvl_addvl_factor_p (factor
))
4991 factor
= rel_factor
* 16;
4992 val
= aarch64_sme_vq_immediate (mode
, factor
, 0);
4994 /* Try to use an unshifted CNT[BHWD] or RDVL. */
4995 else if (aarch64_sve_cnt_factor_p (factor
)
4996 || aarch64_sve_rdvl_addvl_factor_p (factor
))
4998 val
= gen_int_mode (poly_int64 (factor
, factor
), mode
);
5001 /* Try to subtract an unshifted CNT[BHWD]. */
5002 else if (aarch64_sve_cnt_factor_p (-factor
))
5005 val
= gen_int_mode (poly_int64 (-factor
, -factor
), mode
);
5008 /* If subtraction is free, prefer to load a positive constant.
5009 In the best case this will fit a shifted CNTB. */
5010 else if (src
!= const0_rtx
&& rel_factor
< 0)
5013 val
= gen_int_mode (-rel_factor
* BYTES_PER_SVE_VECTOR
, mode
);
5015 /* Otherwise use a shifted RDVL or CNT[BHWD]. */
5017 val
= gen_int_mode (rel_factor
* BYTES_PER_SVE_VECTOR
, mode
);
5021 /* If we can calculate CNTB << SHIFT directly, prefer to do that,
5022 since it should increase the chances of being able to use
5023 a shift and add sequence for the multiplication.
5024 If CNTB << SHIFT is out of range, stick with the current
5026 if (force_isa_mode
== 0
5027 && IN_RANGE (low_bit
, 2, 16 * 16))
5029 val
= gen_int_mode (poly_int64 (low_bit
, low_bit
), mode
);
5032 else if ((force_isa_mode
& AARCH64_ISA_MODE_SM_ON
)
5033 && aarch64_sve_rdvl_addvl_factor_p (low_bit
))
5035 val
= aarch64_sme_vq_immediate (mode
, low_bit
, 0);
5039 val
= gen_int_mode (BYTES_PER_SVE_VECTOR
, mode
);
5041 val
= aarch64_force_temporary (mode
, temp1
, val
);
5043 /* Prefer to multiply by a positive factor and subtract rather
5044 than multiply by a negative factor and add, since positive
5045 values are usually easier to move. */
5046 if (rel_factor
< 0 && src
!= const0_rtx
)
5048 rel_factor
= -rel_factor
;
5052 if (can_create_pseudo_p ())
5054 rtx coeff1
= gen_int_mode (rel_factor
, mode
);
5055 val
= expand_mult (mode
, val
, coeff1
, NULL_RTX
, true, true);
5059 rtx coeff1
= gen_int_mode (rel_factor
, mode
);
5060 coeff1
= aarch64_force_temporary (mode
, temp2
, coeff1
);
5061 val
= gen_rtx_MULT (mode
, val
, coeff1
);
5065 /* Multiply by 2 ** SHIFT. */
5068 val
= aarch64_force_temporary (mode
, temp1
, val
);
5069 val
= gen_rtx_ASHIFT (mode
, val
, GEN_INT (shift
));
5073 val
= aarch64_force_temporary (mode
, temp1
, val
);
5074 val
= gen_rtx_ASHIFTRT (mode
, val
, GEN_INT (-shift
));
5077 /* Add the result to SRC or subtract the result from SRC. */
5078 if (src
!= const0_rtx
)
5080 val
= aarch64_force_temporary (mode
, temp1
, val
);
5081 val
= gen_rtx_fmt_ee (code
, mode
, src
, val
);
5083 else if (code
== MINUS
)
5085 val
= aarch64_force_temporary (mode
, temp1
, val
);
5086 val
= gen_rtx_NEG (mode
, val
);
5089 if (constant
== 0 || frame_related_p
)
5091 rtx_insn
*insn
= emit_insn (gen_rtx_SET (dest
, val
));
5092 if (frame_related_p
)
5094 RTX_FRAME_RELATED_P (insn
) = true;
5095 add_reg_note (insn
, REG_CFA_ADJUST_CFA
,
5096 gen_rtx_SET (dest
, plus_constant (Pmode
, src
,
5105 src
= aarch64_force_temporary (mode
, temp1
, val
);
5110 emit_move_imm
= true;
5113 aarch64_add_offset_1 (mode
, dest
, src
, constant
, temp1
,
5114 frame_related_p
, emit_move_imm
);
5117 /* Like aarch64_add_offset, but the offset is given as an rtx rather
5118 than a poly_int64. */
5121 aarch64_split_add_offset (scalar_int_mode mode
, rtx dest
, rtx src
,
5122 rtx offset_rtx
, rtx temp1
, rtx temp2
)
5124 aarch64_add_offset (mode
, dest
, src
, rtx_to_poly_int64 (offset_rtx
),
5125 temp1
, temp2
, 0, false);
5128 /* Add DELTA to the stack pointer, marking the instructions frame-related.
5129 TEMP1 is available as a temporary if nonnull. FORCE_ISA_MODE is as
5130 for aarch64_add_offset. EMIT_MOVE_IMM is false if TEMP1 already
5131 contains abs (DELTA). */
5134 aarch64_add_sp (rtx temp1
, rtx temp2
, poly_int64 delta
,
5135 aarch64_isa_mode force_isa_mode
, bool emit_move_imm
)
5137 aarch64_add_offset (Pmode
, stack_pointer_rtx
, stack_pointer_rtx
, delta
,
5138 temp1
, temp2
, force_isa_mode
, true, emit_move_imm
);
5141 /* Subtract DELTA from the stack pointer, marking the instructions
5142 frame-related if FRAME_RELATED_P. FORCE_ISA_MODE is as for
5143 aarch64_add_offset. TEMP1 is available as a temporary if nonnull. */
5146 aarch64_sub_sp (rtx temp1
, rtx temp2
, poly_int64 delta
,
5147 aarch64_isa_mode force_isa_mode
,
5148 bool frame_related_p
, bool emit_move_imm
= true)
5150 aarch64_add_offset (Pmode
, stack_pointer_rtx
, stack_pointer_rtx
, -delta
,
5151 temp1
, temp2
, force_isa_mode
, frame_related_p
,
5155 /* A streaming-compatible function needs to switch temporarily to the known
5156 PSTATE.SM mode described by LOCAL_MODE. The low bit of OLD_SVCR contains
5157 the runtime state of PSTATE.SM in the streaming-compatible code, before
5158 the start of the switch to LOCAL_MODE.
5160 Emit instructions to branch around the mode switch if PSTATE.SM already
5161 matches LOCAL_MODE. Return the label that the branch jumps to. */
5164 aarch64_guard_switch_pstate_sm (rtx old_svcr
, aarch64_isa_mode local_mode
)
5166 local_mode
&= AARCH64_ISA_MODE_SM_STATE
;
5167 gcc_assert (local_mode
!= 0);
5168 auto already_ok_cond
= (local_mode
& AARCH64_ISA_MODE_SM_ON
? NE
: EQ
);
5169 auto *label
= gen_label_rtx ();
5170 auto branch
= aarch64_gen_test_and_branch (already_ok_cond
, old_svcr
, 0,
5172 auto *jump
= emit_jump_insn (branch
);
5173 JUMP_LABEL (jump
) = label
;
5177 /* Emit code to switch from the PSTATE.SM state in OLD_MODE to the PSTATE.SM
5178 state in NEW_MODE. This is known to involve either an SMSTART SM or
5182 aarch64_switch_pstate_sm (aarch64_isa_mode old_mode
, aarch64_isa_mode new_mode
)
5184 old_mode
&= AARCH64_ISA_MODE_SM_STATE
;
5185 new_mode
&= AARCH64_ISA_MODE_SM_STATE
;
5186 gcc_assert (old_mode
!= new_mode
);
5188 if ((new_mode
& AARCH64_ISA_MODE_SM_ON
)
5189 || (!new_mode
&& (old_mode
& AARCH64_ISA_MODE_SM_OFF
)))
5190 emit_insn (gen_aarch64_smstart_sm ());
5192 emit_insn (gen_aarch64_smstop_sm ());
5195 /* As a side-effect, SMSTART SM and SMSTOP SM clobber the contents of all
5196 FP and predicate registers. This class emits code to preserve any
5197 necessary registers around the mode switch.
5199 The class uses four approaches to saving and restoring contents, enumerated
5202 - GPR: save and restore the contents of FP registers using GPRs.
5203 This is used if the FP register contains no more than 64 significant
5204 bits. The registers used are FIRST_GPR onwards.
5206 - MEM_128: save and restore 128-bit SIMD registers using memory.
5208 - MEM_SVE_PRED: save and restore full SVE predicate registers using memory.
5210 - MEM_SVE_DATA: save and restore full SVE vector registers using memory.
5212 The save slots within each memory group are consecutive, with the
5213 MEM_SVE_PRED slots occupying a region below the MEM_SVE_DATA slots.
5215 There will only be two mode switches for each use of SME, so they should
5216 not be particularly performance-sensitive. It's also rare for SIMD, SVE
5217 or predicate registers to be live across mode switches. We therefore
5218 don't preallocate the save slots but instead allocate them locally on
5219 demand. This makes the code emitted by the class self-contained. */
5221 class aarch64_sme_mode_switch_regs
5224 static const unsigned int FIRST_GPR
= R10_REGNUM
;
5226 void add_reg (machine_mode
, unsigned int);
5227 void add_call_args (rtx_call_insn
*);
5228 void add_call_result (rtx_call_insn
*);
5229 void add_call_preserved_reg (unsigned int);
5230 void add_call_preserved_regs (bitmap
);
5232 void emit_prologue ();
5233 void emit_epilogue ();
5235 /* The number of GPRs needed to save FP registers, starting from
5237 unsigned int num_gprs () { return m_group_count
[GPR
]; }
5240 enum sequence
{ PROLOGUE
, EPILOGUE
};
5241 enum group_type
{ GPR
, MEM_128
, MEM_SVE_PRED
, MEM_SVE_DATA
, NUM_GROUPS
};
5243 /* Information about the save location for one FP, SIMD, SVE data, or
5244 SVE predicate register. */
5245 struct save_location
{
5246 /* The register to be saved. */
5249 /* Which group the save location belongs to. */
5252 /* A zero-based index of the register within the group. */
5256 unsigned int sve_data_headroom ();
5257 rtx
get_slot_mem (machine_mode
, poly_int64
);
5258 void emit_stack_adjust (sequence
, poly_int64
);
5259 void emit_mem_move (sequence
, const save_location
&, poly_int64
);
5261 void emit_gpr_moves (sequence
);
5262 void emit_mem_128_moves (sequence
);
5263 void emit_sve_sp_adjust (sequence
);
5264 void emit_sve_pred_moves (sequence
);
5265 void emit_sve_data_moves (sequence
);
5267 /* All save locations, in no particular order. */
5268 auto_vec
<save_location
, 12> m_save_locations
;
5270 /* The number of registers in each group. */
5271 unsigned int m_group_count
[NUM_GROUPS
] = {};
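
/* For example: a DFmode value in V8 uses the GPR group (saved in X10
   onwards), a V4SImode value uses MEM_128, an SVE data register such as a
   VNx4SImode value uses MEM_SVE_DATA, and a predicate register uses
   MEM_SVE_PRED.  */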
5274 /* Record that (reg:MODE REGNO) needs to be preserved around the mode
5278 aarch64_sme_mode_switch_regs::add_reg (machine_mode mode
, unsigned int regno
)
5280 if (!FP_REGNUM_P (regno
) && !PR_REGNUM_P (regno
))
5283 unsigned int end_regno
= end_hard_regno (mode
, regno
);
5284 unsigned int vec_flags
= aarch64_classify_vector_mode (mode
);
5285 gcc_assert ((vec_flags
& VEC_STRUCT
) || end_regno
== regno
+ 1);
5286 for (; regno
< end_regno
; regno
++)
5288 /* Force the mode of SVE saves and restores even for single registers.
5289 This is necessary because big-endian targets only allow LDR Z and
5290 STR Z to be used with byte modes. */
5291 machine_mode submode
= mode
;
5292 if (vec_flags
& VEC_SVE_PRED
)
5293 submode
= VNx16BImode
;
5294 else if (vec_flags
& VEC_SVE_DATA
)
5295 submode
= SVE_BYTE_MODE
;
5296 else if (vec_flags
& VEC_STRUCT
)
5298 if (vec_flags
& VEC_PARTIAL
)
5301 submode
= V16QImode
;
5304 loc
.reg
= gen_rtx_REG (submode
, regno
);
5305 if (vec_flags
& VEC_SVE_PRED
)
5307 gcc_assert (PR_REGNUM_P (regno
));
5308 loc
.group
= MEM_SVE_PRED
;
5312 gcc_assert (FP_REGNUM_P (regno
));
5313 if (known_le (GET_MODE_SIZE (submode
), 8))
5315 else if (known_eq (GET_MODE_SIZE (submode
), 16))
5316 loc
.group
= MEM_128
;
5318 loc
.group
= MEM_SVE_DATA
;
5320 loc
.index
= m_group_count
[loc
.group
]++;
5321 m_save_locations
.quick_push (loc
);
5325 /* Record that the arguments to CALL_INSN need to be preserved around
5329 aarch64_sme_mode_switch_regs::add_call_args (rtx_call_insn
*call_insn
)
5331 for (rtx node
= CALL_INSN_FUNCTION_USAGE (call_insn
);
5332 node
; node
= XEXP (node
, 1))
5334 rtx item
= XEXP (node
, 0);
5335 if (GET_CODE (item
) != USE
)
5337 item
= XEXP (item
, 0);
5340 add_reg (GET_MODE (item
), REGNO (item
));
5344 /* Record that the return value from CALL_INSN (if any) needs to be
5345 preserved around the mode switch. */
5348 aarch64_sme_mode_switch_regs::add_call_result (rtx_call_insn
*call_insn
)
5350 rtx pat
= PATTERN (call_insn
);
5351 gcc_assert (GET_CODE (pat
) == PARALLEL
);
5352 pat
= XVECEXP (pat
, 0, 0);
5353 if (GET_CODE (pat
) == CALL
)
5355 rtx dest
= SET_DEST (pat
);
5356 if (GET_CODE (dest
) == PARALLEL
)
5357 for (int i
= 0; i
< XVECLEN (dest
, 0); ++i
)
5359 rtx x
= XVECEXP (dest
, 0, i
);
5360 gcc_assert (GET_CODE (x
) == EXPR_LIST
);
5361 rtx reg
= XEXP (x
, 0);
5362 add_reg (GET_MODE (reg
), REGNO (reg
));
5365 add_reg (GET_MODE (dest
), REGNO (dest
));
5368 /* REGNO is a register that is call-preserved under the current function's ABI.
5369 Record that it must be preserved around the mode switch. */
5372 aarch64_sme_mode_switch_regs::add_call_preserved_reg (unsigned int regno
)
5374 if (FP_REGNUM_P (regno
))
5375 switch (crtl
->abi
->id ())
5378 add_reg (VNx16QImode
, regno
);
5381 add_reg (V16QImode
, regno
);
5383 case ARM_PCS_AAPCS64
:
5384 add_reg (DImode
, regno
);
5389 else if (PR_REGNUM_P (regno
))
5390 add_reg (VNx16BImode
, regno
);
5393 /* The hard registers in REGS are call-preserved under the current function's
5394 ABI. Record that they must be preserved around the mode switch. */
5397 aarch64_sme_mode_switch_regs::add_call_preserved_regs (bitmap regs
)
5401 EXECUTE_IF_SET_IN_BITMAP (regs
, 0, regno
, bi
)
5402 if (HARD_REGISTER_NUM_P (regno
))
5403 add_call_preserved_reg (regno
);
5408 /* Emit code to save registers before the mode switch. */
5411 aarch64_sme_mode_switch_regs::emit_prologue ()
5413 emit_sve_sp_adjust (PROLOGUE
);
5414 emit_sve_pred_moves (PROLOGUE
);
5415 emit_sve_data_moves (PROLOGUE
);
5416 emit_mem_128_moves (PROLOGUE
);
5417 emit_gpr_moves (PROLOGUE
);
5420 /* Emit code to restore registers after the mode switch. */
5423 aarch64_sme_mode_switch_regs::emit_epilogue ()
5425 emit_gpr_moves (EPILOGUE
);
5426 emit_mem_128_moves (EPILOGUE
);
5427 emit_sve_pred_moves (EPILOGUE
);
5428 emit_sve_data_moves (EPILOGUE
);
5429 emit_sve_sp_adjust (EPILOGUE
);
5432 /* The SVE predicate registers are stored below the SVE data registers,
5433 with the predicate save area being padded to a data-register-sized
5434 boundary. Return the size of this padded area as a whole number
5435 of data register slots. */
5438 aarch64_sme_mode_switch_regs::sve_data_headroom ()
5440 return CEIL (m_group_count
[MEM_SVE_PRED
], 8);
5443 /* Return a memory reference of mode MODE to OFFSET bytes from the
5447 aarch64_sme_mode_switch_regs::get_slot_mem (machine_mode mode
,
5450 rtx addr
= plus_constant (Pmode
, stack_pointer_rtx
, offset
);
5451 return gen_rtx_MEM (mode
, addr
);
5454 /* Allocate or deallocate SIZE bytes of stack space: SEQ decides which. */
5457 aarch64_sme_mode_switch_regs::emit_stack_adjust (sequence seq
,
5460 if (seq
== PROLOGUE
)
5462 emit_insn (gen_rtx_SET (stack_pointer_rtx
,
5463 plus_constant (Pmode
, stack_pointer_rtx
, size
)));
5466 /* Save or restore the register in LOC, whose slot is OFFSET bytes from
5467 the stack pointer. SEQ chooses between saving and restoring. */
5470 aarch64_sme_mode_switch_regs::emit_mem_move (sequence seq
,
5471 const save_location
&loc
,
5474 rtx mem
= get_slot_mem (GET_MODE (loc
.reg
), offset
);
5475 if (seq
== PROLOGUE
)
5476 emit_move_insn (mem
, loc
.reg
);
5478 emit_move_insn (loc
.reg
, mem
);
5481 /* Emit instructions to save or restore the GPR group. SEQ chooses between
5482 saving and restoring. */
5485 aarch64_sme_mode_switch_regs::emit_gpr_moves (sequence seq
)
5487 for (auto &loc
: m_save_locations
)
5488 if (loc
.group
== GPR
)
5490 gcc_assert (loc
.index
< 8);
5491 rtx gpr
= gen_rtx_REG (GET_MODE (loc
.reg
), FIRST_GPR
+ loc
.index
);
5492 if (seq
== PROLOGUE
)
5493 emit_move_insn (gpr
, loc
.reg
);
5495 emit_move_insn (loc
.reg
, gpr
);
5499 /* Emit instructions to save or restore the MEM_128 group. SEQ chooses
5500 between saving and restoring. */
5503 aarch64_sme_mode_switch_regs::emit_mem_128_moves (sequence seq
)
5505 HOST_WIDE_INT count
= m_group_count
[MEM_128
];
5509 auto sp
= stack_pointer_rtx
;
5510 auto sp_adjust
= (seq
== PROLOGUE
? -count
: count
) * 16;
5512 /* Pick a common mode that supports LDR & STR with pre/post-modification
5513 and LDP & STP with pre/post-modification. */
5516 /* An instruction pattern that should be emitted at the end. */
5517 rtx last_pat
= NULL_RTX
;
5519 /* A previous MEM_128 location that hasn't been handled yet. */
5520 save_location
*prev_loc
= nullptr;
5522 /* Look for LDP/STPs and record any leftover LDR/STR in PREV_LOC. */
5523 for (auto &loc
: m_save_locations
)
5524 if (loc
.group
== MEM_128
)
5531 gcc_assert (loc
.index
== prev_loc
->index
+ 1);
5533 /* The offset of the base of the save area from the current
5535 HOST_WIDE_INT bias
= 0;
5536 if (prev_loc
->index
== 0 && seq
== PROLOGUE
)
5539 /* Get the two sets in the LDP/STP. */
5541 gen_rtx_REG (mode
, REGNO (prev_loc
->reg
)),
5542 get_slot_mem (mode
, prev_loc
->index
* 16 + bias
),
5543 gen_rtx_REG (mode
, REGNO (loc
.reg
)),
5544 get_slot_mem (mode
, loc
.index
* 16 + bias
)
5546 unsigned int lhs
= (seq
== PROLOGUE
);
5547 rtx set1
= gen_rtx_SET (ops
[lhs
], ops
[1 - lhs
]);
5548 rtx set2
= gen_rtx_SET (ops
[lhs
+ 2], ops
[3 - lhs
]);
5550 /* Combine the sets with any stack allocation/deallocation. */
5552 if (prev_loc
->index
== 0)
5554 rtx plus_sp
= plus_constant (Pmode
, sp
, sp_adjust
);
5555 rtvec vec
= gen_rtvec (3, gen_rtx_SET (sp
, plus_sp
), set1
, set2
);
5556 pat
= gen_rtx_PARALLEL (VOIDmode
, vec
);
5558 else if (seq
== PROLOGUE
)
5559 pat
= aarch64_gen_store_pair (ops
[1], ops
[0], ops
[2]);
5561 pat
= aarch64_gen_load_pair (ops
[0], ops
[2], ops
[1]);
5563 /* Queue a deallocation to the end, otherwise emit the
5565 if (seq
== EPILOGUE
&& prev_loc
->index
== 0)
5572 /* Handle any leftover LDR/STR. */
5575 rtx reg
= gen_rtx_REG (mode
, REGNO (prev_loc
->reg
));
5577 if (prev_loc
->index
!= 0)
5578 addr
= plus_constant (Pmode
, sp
, prev_loc
->index
* 16);
5579 else if (seq
== PROLOGUE
)
5581 rtx allocate
= plus_constant (Pmode
, sp
, -count
* 16);
5582 addr
= gen_rtx_PRE_MODIFY (Pmode
, sp
, allocate
);
5586 rtx deallocate
= plus_constant (Pmode
, sp
, count
* 16);
5587 addr
= gen_rtx_POST_MODIFY (Pmode
, sp
, deallocate
);
5589 rtx mem
= gen_rtx_MEM (mode
, addr
);
5590 if (seq
== PROLOGUE
)
5591 emit_move_insn (mem
, reg
);
5593 emit_move_insn (reg
, mem
);
5597 emit_insn (last_pat
);
5600 /* Allocate or deallocate the stack space needed by the SVE groups.
5601 SEQ chooses between allocating and deallocating. */
5604 aarch64_sme_mode_switch_regs::emit_sve_sp_adjust (sequence seq
)
5606 if (unsigned int count
= m_group_count
[MEM_SVE_DATA
] + sve_data_headroom ())
5607 emit_stack_adjust (seq
, count
* BYTES_PER_SVE_VECTOR
);
5610 /* Save or restore the MEM_SVE_DATA group. SEQ chooses between saving
5614 aarch64_sme_mode_switch_regs::emit_sve_data_moves (sequence seq
)
5616 for (auto &loc
: m_save_locations
)
5617 if (loc
.group
== MEM_SVE_DATA
)
5619 auto index
= loc
.index
+ sve_data_headroom ();
5620 emit_mem_move (seq
, loc
, index
* BYTES_PER_SVE_VECTOR
);
5624 /* Save or restore the MEM_SVE_PRED group. SEQ chooses between saving
5628 aarch64_sme_mode_switch_regs::emit_sve_pred_moves (sequence seq
)
5630 for (auto &loc
: m_save_locations
)
5631 if (loc
.group
== MEM_SVE_PRED
)
5632 emit_mem_move (seq
, loc
, loc
.index
* BYTES_PER_SVE_PRED
);
/* Set DEST to (vec_series BASE STEP).  */

static void
aarch64_expand_vec_series (rtx dest, rtx base, rtx step)
{
  machine_mode mode = GET_MODE (dest);
  scalar_mode inner = GET_MODE_INNER (mode);

  /* Each operand can be a register or an immediate in the range [-16, 15].  */
  if (!aarch64_sve_index_immediate_p (base))
    base = force_reg (inner, base);
  if (!aarch64_sve_index_immediate_p (step))
    step = force_reg (inner, step);

  emit_set_insn (dest, gen_rtx_VEC_SERIES (mode, base, step));
}
5652 /* Duplicate 128-bit Advanced SIMD vector SRC so that it fills an SVE
5653 register of mode MODE. Use TARGET for the result if it's nonnull
5656 The two vector modes must have the same element mode. The behavior
5657 is to duplicate architectural lane N of SRC into architectural lanes
5658 N + I * STEP of the result. On big-endian targets, architectural
5659 lane 0 of an Advanced SIMD vector is the last element of the vector
5660 in memory layout, so for big-endian targets this operation has the
5661 effect of reversing SRC before duplicating it. Callers need to
5662 account for this. */
5665 aarch64_expand_sve_dupq (rtx target
, machine_mode mode
, rtx src
)
5667 machine_mode src_mode
= GET_MODE (src
);
5668 gcc_assert (GET_MODE_INNER (mode
) == GET_MODE_INNER (src_mode
));
5669 insn_code icode
= (BYTES_BIG_ENDIAN
5670 ? code_for_aarch64_vec_duplicate_vq_be (mode
)
5671 : code_for_aarch64_vec_duplicate_vq_le (mode
));
5674 expand_operand ops
[3];
5675 create_output_operand (&ops
[i
++], target
, mode
);
5676 create_output_operand (&ops
[i
++], src
, src_mode
);
5677 if (BYTES_BIG_ENDIAN
)
5679 /* Create a PARALLEL describing the reversal of SRC. */
5680 unsigned int nelts_per_vq
= 128 / GET_MODE_UNIT_BITSIZE (mode
);
5681 rtx sel
= aarch64_gen_stepped_int_parallel (nelts_per_vq
,
5682 nelts_per_vq
- 1, -1);
5683 create_fixed_operand (&ops
[i
++], sel
);
5685 expand_insn (icode
, i
, ops
);
5686 return ops
[0].value
;
/* Try to force 128-bit vector value SRC into memory and use LD1RQ to fetch
   the memory image into DEST.  Return true on success.  */

static bool
aarch64_expand_sve_ld1rq (rtx dest, rtx src)
{
  src = force_const_mem (GET_MODE (src), src);
  if (!src)
    return false;

  /* Make sure that the address is legitimate.  */
  if (!aarch64_sve_ld1rq_operand_p (src))
    {
      rtx addr = force_reg (Pmode, XEXP (src, 0));
      src = replace_equiv_address (src, addr);
    }

  machine_mode mode = GET_MODE (dest);
  machine_mode pred_mode = aarch64_sve_pred_mode (mode);
  rtx ptrue = aarch64_ptrue_reg (pred_mode);
  emit_insn (gen_aarch64_sve_ld1rq (mode, dest, src, ptrue));
  return true;
}
5713 /* SRC is an SVE CONST_VECTOR that contains N "foreground" values followed
5714 by N "background" values. Try to move it into TARGET using:
5716 PTRUE PRED.<T>, VL<N>
5717 MOV TRUE.<T>, #<foreground>
5718 MOV FALSE.<T>, #<background>
5719 SEL TARGET.<T>, PRED.<T>, TRUE.<T>, FALSE.<T>
5721 The PTRUE is always a single instruction but the MOVs might need a
5722 longer sequence. If the background value is zero (as it often is),
5723 the sequence can sometimes collapse to a PTRUE followed by a
5724 zero-predicated move.
5726 Return the target on success, otherwise return null. */
5729 aarch64_expand_sve_const_vector_sel (rtx target
, rtx src
)
5731 gcc_assert (CONST_VECTOR_NELTS_PER_PATTERN (src
) == 2);
5733 /* Make sure that the PTRUE is valid. */
5734 machine_mode mode
= GET_MODE (src
);
5735 machine_mode pred_mode
= aarch64_sve_pred_mode (mode
);
5736 unsigned int npatterns
= CONST_VECTOR_NPATTERNS (src
);
5737 if (aarch64_svpattern_for_vl (pred_mode
, npatterns
)
5738 == AARCH64_NUM_SVPATTERNS
)
5741 rtx_vector_builder
pred_builder (pred_mode
, npatterns
, 2);
5742 rtx_vector_builder
true_builder (mode
, npatterns
, 1);
5743 rtx_vector_builder
false_builder (mode
, npatterns
, 1);
5744 for (unsigned int i
= 0; i
< npatterns
; ++i
)
5746 true_builder
.quick_push (CONST_VECTOR_ENCODED_ELT (src
, i
));
5747 pred_builder
.quick_push (CONST1_RTX (BImode
));
5749 for (unsigned int i
= 0; i
< npatterns
; ++i
)
5751 false_builder
.quick_push (CONST_VECTOR_ENCODED_ELT (src
, i
+ npatterns
));
5752 pred_builder
.quick_push (CONST0_RTX (BImode
));
5754 expand_operand ops
[4];
5755 create_output_operand (&ops
[0], target
, mode
);
5756 create_input_operand (&ops
[1], true_builder
.build (), mode
);
5757 create_input_operand (&ops
[2], false_builder
.build (), mode
);
5758 create_input_operand (&ops
[3], pred_builder
.build (), pred_mode
);
5759 expand_insn (code_for_vcond_mask (mode
, mode
), 4, ops
);
5763 /* Return a register containing CONST_VECTOR SRC, given that SRC has an
5764 SVE data mode and isn't a legitimate constant. Use TARGET for the
5765 result if convenient.
5767 The returned register can have whatever mode seems most natural
5768 given the contents of SRC. */
5771 aarch64_expand_sve_const_vector (rtx target
, rtx src
)
5773 machine_mode mode
= GET_MODE (src
);
5774 unsigned int npatterns
= CONST_VECTOR_NPATTERNS (src
);
5775 unsigned int nelts_per_pattern
= CONST_VECTOR_NELTS_PER_PATTERN (src
);
5776 scalar_mode elt_mode
= GET_MODE_INNER (mode
);
5777 unsigned int elt_bits
= GET_MODE_BITSIZE (elt_mode
);
5778 unsigned int container_bits
= aarch64_sve_container_bits (mode
);
5779 unsigned int encoded_bits
= npatterns
* nelts_per_pattern
* container_bits
;
5781 if (nelts_per_pattern
== 1
5782 && encoded_bits
<= 128
5783 && container_bits
!= elt_bits
)
5785 /* We have a partial vector mode and a constant whose full-vector
5786 equivalent would occupy a repeating 128-bit sequence. Build that
5787 full-vector equivalent instead, so that we have the option of
5788 using LD1RQ and Advanced SIMD operations. */
5789 unsigned int repeat
= container_bits
/ elt_bits
;
5790 machine_mode full_mode
= aarch64_full_sve_mode (elt_mode
).require ();
5791 rtx_vector_builder
builder (full_mode
, npatterns
* repeat
, 1);
5792 for (unsigned int i
= 0; i
< npatterns
; ++i
)
5793 for (unsigned int j
= 0; j
< repeat
; ++j
)
5794 builder
.quick_push (CONST_VECTOR_ENCODED_ELT (src
, i
));
5795 target
= aarch64_target_reg (target
, full_mode
);
5796 return aarch64_expand_sve_const_vector (target
, builder
.build ());
5799 if (nelts_per_pattern
== 1 && encoded_bits
== 128)
5801 /* The constant is a duplicated quadword but can't be narrowed
5802 beyond a quadword. Get the memory image of the first quadword
5803 as a 128-bit vector and try using LD1RQ to load it from memory.
5805 The effect for both endiannesses is to load memory lane N into
5806 architectural lanes N + I * STEP of the result. On big-endian
5807 targets, the layout of the 128-bit vector in an Advanced SIMD
5808 register would be different from its layout in an SVE register,
5809 but this 128-bit vector is a memory value only. */
5810 machine_mode vq_mode
= aarch64_v128_mode (elt_mode
).require ();
5811 rtx vq_value
= simplify_gen_subreg (vq_mode
, src
, mode
, 0);
5812 if (vq_value
&& aarch64_expand_sve_ld1rq (target
, vq_value
))
5816 if (nelts_per_pattern
== 1 && encoded_bits
< 128)
5818 /* The vector is a repeating sequence of 64 bits or fewer.
5819 See if we can load them using an Advanced SIMD move and then
5820 duplicate it to fill a vector. This is better than using a GPR
5821 move because it keeps everything in the same register file. */
5822 machine_mode vq_mode
= aarch64_v128_mode (elt_mode
).require ();
5823 rtx_vector_builder
builder (vq_mode
, npatterns
, 1);
5824 for (unsigned int i
= 0; i
< npatterns
; ++i
)
5826 /* We want memory lane N to go into architectural lane N,
5827 so reverse for big-endian targets. The DUP .Q pattern
5828 has a compensating reverse built-in. */
5829 unsigned int srci
= BYTES_BIG_ENDIAN
? npatterns
- i
- 1 : i
;
5830 builder
.quick_push (CONST_VECTOR_ENCODED_ELT (src
, srci
));
5832 rtx vq_src
= builder
.build ();
5833 if (aarch64_simd_valid_mov_imm (vq_src
))
5835 vq_src
= force_reg (vq_mode
, vq_src
);
5836 return aarch64_expand_sve_dupq (target
, mode
, vq_src
);
5839 /* Get an integer representation of the repeating part of Advanced
5840 SIMD vector VQ_SRC. This preserves the endianness of VQ_SRC,
5841 which for big-endian targets is lane-swapped wrt a normal
5842 Advanced SIMD vector. This means that for both endiannesses,
5843 memory lane N of SVE vector SRC corresponds to architectural
5844 lane N of a register holding VQ_SRC. This in turn means that
5845 memory lane 0 of SVE vector SRC is in the lsb of VQ_SRC (viewed
5846 as a single 128-bit value) and thus that memory lane 0 of SRC is
5847 in the lsb of the integer. Duplicating the integer therefore
5848 ensures that memory lane N of SRC goes into architectural lane
5849 N + I * INDEX of the SVE register. */
5850 scalar_mode int_mode
= int_mode_for_size (encoded_bits
, 0).require ();
5851 rtx elt_value
= simplify_gen_subreg (int_mode
, vq_src
, vq_mode
, 0);
5854 /* Pretend that we had a vector of INT_MODE to start with. */
5855 elt_mode
= int_mode
;
5856 mode
= aarch64_full_sve_mode (int_mode
).require ();
5858 /* If the integer can be moved into a general register by a
5859 single instruction, do that and duplicate the result. */
5860 if (CONST_INT_P (elt_value
)
5861 && aarch64_move_imm (INTVAL (elt_value
),
5862 encoded_bits
<= 32 ? SImode
: DImode
))
5864 elt_value
= force_reg (elt_mode
, elt_value
);
5865 return expand_vector_broadcast (mode
, elt_value
);
5868 else if (npatterns
== 1)
5869 /* We're duplicating a single value, but can't do better than
5870 force it to memory and load from there. This handles things
5871 like symbolic constants. */
5872 elt_value
= CONST_VECTOR_ENCODED_ELT (src
, 0);
5876 /* Load the element from memory if we can, otherwise move it into
5877 a register and use a DUP. */
5878 rtx op
= force_const_mem (elt_mode
, elt_value
);
5880 op
= force_reg (elt_mode
, elt_value
);
5881 return expand_vector_broadcast (mode
, op
);
5885 /* Try using INDEX. */
5887 if (const_vec_series_p (src
, &base
, &step
))
5889 aarch64_expand_vec_series (target
, base
, step
);
5893 /* From here on, it's better to force the whole constant to memory
5895 if (GET_MODE_NUNITS (mode
).is_constant ())
5898 if (nelts_per_pattern
== 2)
5899 if (rtx res
= aarch64_expand_sve_const_vector_sel (target
, src
))
5902 /* Expand each pattern individually. */
5903 gcc_assert (npatterns
> 1);
5904 rtx_vector_builder builder
;
5905 auto_vec
<rtx
, 16> vectors (npatterns
);
5906 for (unsigned int i
= 0; i
< npatterns
; ++i
)
5908 builder
.new_vector (mode
, 1, nelts_per_pattern
);
5909 for (unsigned int j
= 0; j
< nelts_per_pattern
; ++j
)
5910 builder
.quick_push (CONST_VECTOR_ELT (src
, i
+ j
* npatterns
));
5911 vectors
.quick_push (force_reg (mode
, builder
.build ()));
5914 /* Use permutes to interleave the separate vectors. */
5915 while (npatterns
> 1)
5918 for (unsigned int i
= 0; i
< npatterns
; ++i
)
5920 rtx tmp
= (npatterns
== 1 ? target
: gen_reg_rtx (mode
));
5921 rtvec v
= gen_rtvec (2, vectors
[i
], vectors
[i
+ npatterns
]);
5922 emit_set_insn (tmp
, gen_rtx_UNSPEC (mode
, v
, UNSPEC_ZIP1
));
5926 gcc_assert (vectors
[0] == target
);
/* Use WHILE to set a predicate register of mode MODE in which the first
   VL bits are set and the rest are clear.  Use TARGET for the register
   if it's nonnull and convenient.  */

static rtx
aarch64_sve_move_pred_via_while (rtx target, machine_mode mode,
				 unsigned int vl)
{
  rtx limit = force_reg (DImode, gen_int_mode (vl, DImode));
  target = aarch64_target_reg (target, mode);
  emit_insn (gen_while (UNSPEC_WHILELO, DImode, mode,
			target, const0_rtx, limit));
  return target;
}
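
/* For example, with MODE == VNx4BImode and VL == 3, the WHILELO above
   activates lane I for each I in [0, 3), giving a .S predicate of
   { 1, 1, 1, 0, 0, ... } (assuming, of course, that the runtime vector
   has at least three .S lanes).  */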
static rtx
aarch64_expand_sve_const_pred_1 (rtx, rtx_vector_builder &, bool);
/* BUILDER is a constant predicate in which the index of every set bit
   is a multiple of ELT_SIZE (which is <= 8).  Try to load the constant
   by inverting every element at a multiple of ELT_SIZE and EORing the
   result with an ELT_SIZE PTRUE.

   Return a register that contains the constant on success, otherwise
   return null.  Use TARGET as the register if it is nonnull and
   convenient.  */

static rtx
aarch64_expand_sve_const_pred_eor (rtx target, rtx_vector_builder &builder,
				   unsigned int elt_size)
{
  /* Invert every element at a multiple of ELT_SIZE, keeping the
     other bits zero.  */
  rtx_vector_builder inv_builder (VNx16BImode, builder.npatterns (),
				  builder.nelts_per_pattern ());
  for (unsigned int i = 0; i < builder.encoded_nelts (); ++i)
    if ((i & (elt_size - 1)) == 0 && INTVAL (builder.elt (i)) == 0)
      inv_builder.quick_push (const1_rtx);
    else
      inv_builder.quick_push (const0_rtx);
  inv_builder.finalize ();

  /* See if we can load the constant cheaply.  */
  rtx inv = aarch64_expand_sve_const_pred_1 (NULL_RTX, inv_builder, false);
  if (!inv)
    return NULL_RTX;

  /* EOR the result with an ELT_SIZE PTRUE.  */
  rtx mask = aarch64_ptrue_all (elt_size);
  mask = force_reg (VNx16BImode, mask);
  inv = gen_lowpart (VNx16BImode, inv);
  target = aarch64_target_reg (target, VNx16BImode);
  emit_insn (gen_aarch64_pred_z (XOR, VNx16BImode, target, mask, inv, mask));
  return target;
}
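
/* A worked example of the EOR approach, assuming ELT_SIZE == 2 and the
   VNx16BI constant { 1, 0, 0, 0, 1, 0, 1, 0, ... }:

     inverted constant: { 0, 0, 1, 0, 0, 0, 0, 0, ... }
     PTRUE .H mask:     { 1, 0, 1, 0, 1, 0, 1, 0, ... }
     predicated EOR:    { 1, 0, 0, 0, 1, 0, 1, 0, ... }

   The inverted constant is often loadable by a single PTRUE or WHILE
   even when the original constant is not.  */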
/* BUILDER is a constant predicate in which the index of every set bit
   is a multiple of ELT_SIZE (which is <= 8).  Try to load the constant
   using a TRN1 of size PERMUTE_SIZE, which is >= ELT_SIZE.  Return the
   register on success, otherwise return null.  Use TARGET as the register
   if nonnull and convenient.  */

static rtx
aarch64_expand_sve_const_pred_trn (rtx target, rtx_vector_builder &builder,
				   unsigned int elt_size,
				   unsigned int permute_size)
{
  /* We're going to split the constant into two new constants A and B,
     with element I of BUILDER going into A if (I & PERMUTE_SIZE) == 0
     and into B otherwise.  E.g. for PERMUTE_SIZE == 4 && ELT_SIZE == 1:

     A: { 0, 1, 2, 3, _, _, _, _, 8, 9, 10, 11, _, _, _, _ }
     B: { 4, 5, 6, 7, _, _, _, _, 12, 13, 14, 15, _, _, _, _ }

     where _ indicates elements that will be discarded by the permute.

     First calculate the ELT_SIZEs for A and B.  */
  unsigned int a_elt_size = GET_MODE_SIZE (DImode);
  unsigned int b_elt_size = GET_MODE_SIZE (DImode);
  for (unsigned int i = 0; i < builder.encoded_nelts (); i += elt_size)
    if (INTVAL (builder.elt (i)) != 0)
      {
	if (i & permute_size)
	  b_elt_size |= i - permute_size;
	else
	  a_elt_size |= i;
      }
  a_elt_size &= -a_elt_size;
  b_elt_size &= -b_elt_size;

  /* Now construct the vectors themselves.  */
  rtx_vector_builder a_builder (VNx16BImode, builder.npatterns (),
				builder.nelts_per_pattern ());
  rtx_vector_builder b_builder (VNx16BImode, builder.npatterns (),
				builder.nelts_per_pattern ());
  unsigned int nelts = builder.encoded_nelts ();
  for (unsigned int i = 0; i < nelts; ++i)
    if (i & (elt_size - 1))
      {
	a_builder.quick_push (const0_rtx);
	b_builder.quick_push (const0_rtx);
      }
    else if ((i & permute_size) == 0)
      {
	/* The A and B elements are significant.  */
	a_builder.quick_push (builder.elt (i));
	b_builder.quick_push (builder.elt (i + permute_size));
      }
    else
      {
	/* The A and B elements are going to be discarded, so pick whatever
	   is likely to give a nice constant.  We are targeting element
	   sizes A_ELT_SIZE and B_ELT_SIZE for A and B respectively,
	   with the aim of each being a sequence of ones followed by
	   a sequence of zeros.  So:

	   * if X_ELT_SIZE <= PERMUTE_SIZE, the best approach is to
	     duplicate the last X_ELT_SIZE element, to extend the
	     current sequence of ones or zeros.

	   * if X_ELT_SIZE > PERMUTE_SIZE, the best approach is to add a
	     zero, so that the constant really does have X_ELT_SIZE and
	     not a smaller size.  */
	if (a_elt_size > permute_size)
	  a_builder.quick_push (const0_rtx);
	else
	  a_builder.quick_push (a_builder.elt (i - a_elt_size));
	if (b_elt_size > permute_size)
	  b_builder.quick_push (const0_rtx);
	else
	  b_builder.quick_push (b_builder.elt (i - b_elt_size));
      }
  a_builder.finalize ();
  b_builder.finalize ();

  /* Try loading A into a register.  */
  rtx_insn *last = get_last_insn ();
  rtx a = aarch64_expand_sve_const_pred_1 (NULL_RTX, a_builder, false);
  if (!a)
    return NULL_RTX;

  /* Try loading B into a register.  */
  rtx b = a;
  if (a_builder != b_builder)
    {
      b = aarch64_expand_sve_const_pred_1 (NULL_RTX, b_builder, false);
      if (!b)
	{
	  delete_insns_since (last);
	  return NULL_RTX;
	}
    }

  /* Emit the TRN1 itself.  We emit a TRN that operates on VNx16BI
     operands but permutes them as though they had mode MODE.  */
  machine_mode mode = aarch64_sve_pred_mode (permute_size).require ();
  target = aarch64_target_reg (target, GET_MODE (a));
  rtx type_reg = CONST0_RTX (mode);
  emit_insn (gen_aarch64_sve_trn1_conv (mode, target, a, b, type_reg));
  return target;
}
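
/* Illustration of the TRN1 recombination, assuming ELT_SIZE == 1 and
   PERMUTE_SIZE == 4: a TRN1 with .S-sized elements writes the even .S
   lanes of A and the even .S lanes of B alternately, so with

     A: { 0, 1, 2, 3, _, _, _, _,  8,  9, 10, 11, _, _, _, _ }
     B: { 4, 5, 6, 7, _, _, _, _, 12, 13, 14, 15, _, _, _, _ }

   the result reassembles bytes 0..15 of the original constant, while
   the discarded lanes (the "_" entries) never reach the result.  */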
/* Subroutine of aarch64_expand_sve_const_pred.  Try to load the VNx16BI
   constant in BUILDER into an SVE predicate register.  Return the register
   on success, otherwise return null.  Use TARGET for the register if
   nonnull and convenient.

   ALLOW_RECURSE_P is true if we can use methods that would call this
   function recursively.  */

static rtx
aarch64_expand_sve_const_pred_1 (rtx target, rtx_vector_builder &builder,
				 bool allow_recurse_p)
{
  if (builder.encoded_nelts () == 1)
    /* A PFALSE or a PTRUE .B ALL.  */
    return aarch64_emit_set_immediate (target, builder);

  unsigned int elt_size = aarch64_widest_sve_pred_elt_size (builder);
  if (int vl = aarch64_partial_ptrue_length (builder, elt_size))
    {
      /* If we can load the constant using PTRUE, use it as-is.  */
      machine_mode mode = aarch64_sve_pred_mode (elt_size).require ();
      if (aarch64_svpattern_for_vl (mode, vl) != AARCH64_NUM_SVPATTERNS)
	return aarch64_emit_set_immediate (target, builder);

      /* Otherwise use WHILE to set the first VL bits.  */
      return aarch64_sve_move_pred_via_while (target, mode, vl);
    }

  if (!allow_recurse_p)
    return NULL_RTX;

  /* Try inverting the vector in element size ELT_SIZE and then EORing
     the result with an ELT_SIZE PTRUE.  */
  if (INTVAL (builder.elt (0)) == 0)
    if (rtx res = aarch64_expand_sve_const_pred_eor (target, builder,
						     elt_size))
      return res;

  /* Try using TRN1 to permute two simpler constants.  */
  for (unsigned int i = elt_size; i <= 8; i *= 2)
    if (rtx res = aarch64_expand_sve_const_pred_trn (target, builder,
						     elt_size, i))
      return res;

  return NULL_RTX;
}
/* Return an SVE predicate register that contains the VNx16BImode
   constant in BUILDER, without going through the move expanders.

   The returned register can have whatever mode seems most natural
   given the contents of BUILDER.  Use TARGET for the result if
   convenient.  */

static rtx
aarch64_expand_sve_const_pred (rtx target, rtx_vector_builder &builder)
{
  /* Try loading the constant using pure predicate operations.  */
  if (rtx res = aarch64_expand_sve_const_pred_1 (target, builder, true))
    return res;

  /* Try forcing the constant to memory.  */
  if (builder.full_nelts ().is_constant ())
    if (rtx mem = force_const_mem (VNx16BImode, builder.build ()))
      {
	target = aarch64_target_reg (target, VNx16BImode);
	emit_move_insn (target, mem);
	return target;
      }

  /* The last resort is to load the constant as an integer and then
     compare it against zero.  Use -1 for set bits in order to increase
     the chances of using SVE DUPM or an Advanced SIMD byte mask.  */
  rtx_vector_builder int_builder (VNx16QImode, builder.npatterns (),
				  builder.nelts_per_pattern ());
  for (unsigned int i = 0; i < builder.encoded_nelts (); ++i)
    int_builder.quick_push (INTVAL (builder.elt (i))
			    ? constm1_rtx : const0_rtx);
  return aarch64_convert_sve_data_to_pred (target, VNx16BImode,
					   int_builder.build ());
}
/* Set DEST to immediate IMM.  */

void
aarch64_expand_mov_immediate (rtx dest, rtx imm)
{
  machine_mode mode = GET_MODE (dest);

  /* Check on what type of symbol it is.  */
  scalar_int_mode int_mode;
  if ((SYMBOL_REF_P (imm)
       || LABEL_REF_P (imm)
       || GET_CODE (imm) == CONST
       || GET_CODE (imm) == CONST_POLY_INT)
      && is_a <scalar_int_mode> (mode, &int_mode))
    {
      rtx mem;
      poly_int64 offset;
      HOST_WIDE_INT const_offset;
      enum aarch64_symbol_type sty;

      /* If we have (const (plus symbol offset)), separate out the offset
	 before we start classifying the symbol.  */
      rtx base = strip_offset (imm, &offset);

      /* We must always add an offset involving VL separately, rather than
	 folding it into the relocation.  */
      if (!offset.is_constant (&const_offset))
	{
	  if (!TARGET_SVE)
	    {
	      aarch64_report_sve_required ();
	      return;
	    }
	  if (base == const0_rtx
	      && (aarch64_sve_cnt_immediate_p (offset)
		  || aarch64_sve_rdvl_immediate_p (offset)))
	    emit_insn (gen_rtx_SET (dest, imm));
	  else
	    {
	      /* Do arithmetic on 32-bit values if the result is smaller
		 than that.  */
	      if (partial_subreg_p (int_mode, SImode))
		{
		  /* It is invalid to do symbol calculations in modes
		     narrower than SImode.  */
		  gcc_assert (base == const0_rtx);
		  dest = gen_lowpart (SImode, dest);
		  int_mode = SImode;
		}
	      if (base != const0_rtx)
		{
		  base = aarch64_force_temporary (int_mode, dest, base);
		  aarch64_add_offset (int_mode, dest, base, offset,
				      NULL_RTX, NULL_RTX, 0, false);
		}
	      else
		aarch64_add_offset (int_mode, dest, base, offset,
				    dest, NULL_RTX, 0, false);
	    }
	  return;
	}

      if (aarch64_rdsvl_immediate_p (base))
	{
	  /* We could handle non-constant offsets if they are ever
	     generated.  */
	  gcc_assert (const_offset == 0);
	  emit_insn (gen_rtx_SET (dest, imm));
	  return;
	}

      sty = aarch64_classify_symbol (base, const_offset);
      switch (sty)
	{
	case SYMBOL_FORCE_TO_MEM:
	  if (int_mode != ptr_mode)
	    imm = convert_memory_address (ptr_mode, imm);

	  if (const_offset != 0
	      && targetm.cannot_force_const_mem (ptr_mode, imm))
	    {
	      gcc_assert (can_create_pseudo_p ());
	      base = aarch64_force_temporary (int_mode, dest, base);
	      aarch64_add_offset (int_mode, dest, base, const_offset,
				  NULL_RTX, NULL_RTX, 0, false);
	      return;
	    }

	  mem = force_const_mem (ptr_mode, imm);
	  gcc_assert (mem);

	  /* If we aren't generating PC relative literals, then
	     we need to expand the literal pool access carefully.
	     This is something that needs to be done in a number
	     of places, so could well live as a separate function.  */
	  if (!aarch64_pcrelative_literal_loads)
	    {
	      gcc_assert (can_create_pseudo_p ());
	      base = gen_reg_rtx (ptr_mode);
	      aarch64_expand_mov_immediate (base, XEXP (mem, 0));
	      if (ptr_mode != Pmode)
		base = convert_memory_address (Pmode, base);
	      mem = gen_rtx_MEM (ptr_mode, base);
	    }

	  if (int_mode != ptr_mode)
	    mem = gen_rtx_ZERO_EXTEND (int_mode, mem);

	  emit_insn (gen_rtx_SET (dest, mem));

	  return;

	case SYMBOL_SMALL_TLSGD:
	case SYMBOL_SMALL_TLSDESC:
	case SYMBOL_SMALL_TLSIE:
	case SYMBOL_SMALL_GOT_28K:
	case SYMBOL_SMALL_GOT_4G:
	case SYMBOL_TINY_GOT:
	case SYMBOL_TINY_TLSIE:
	  if (const_offset != 0)
	    {
	      gcc_assert (can_create_pseudo_p ());
	      base = aarch64_force_temporary (int_mode, dest, base);
	      aarch64_add_offset (int_mode, dest, base, const_offset,
				  NULL_RTX, NULL_RTX, 0, false);
	      return;
	    }
	  /* FALLTHRU */

	case SYMBOL_SMALL_ABSOLUTE:
	case SYMBOL_TINY_ABSOLUTE:
	case SYMBOL_TLSLE12:
	case SYMBOL_TLSLE24:
	case SYMBOL_TLSLE32:
	case SYMBOL_TLSLE48:
	  aarch64_load_symref_appropriately (dest, imm, sty);
	  return;

	default:
	  gcc_unreachable ();
	}
    }

  if (!CONST_INT_P (imm))
    {
      if (aarch64_sve_pred_mode_p (mode))
	{
	  /* Only the low bit of each .H, .S and .D element is defined,
	     so we can set the upper bits to whatever we like.  If the
	     predicate is all-true in MODE, prefer to set all the undefined
	     bits as well, so that we can share a single .B predicate for
	     all modes.  */
	  if (imm == CONSTM1_RTX (mode))
	    imm = CONSTM1_RTX (VNx16BImode);

	  /* All methods for constructing predicate modes wider than VNx16BI
	     will set the upper bits of each element to zero.  Expose this
	     by moving such constants as a VNx16BI, so that all bits are
	     significant and so that constants for different modes can be
	     shared.  The wider constant will still be available as a
	     REG_EQUAL note.  */
	  rtx_vector_builder builder;
	  if (aarch64_get_sve_pred_bits (builder, imm))
	    {
	      rtx res = aarch64_expand_sve_const_pred (dest, builder);
	      if (dest != res)
		emit_move_insn (dest, gen_lowpart (mode, res));
	      return;
	    }
	}

      if (GET_CODE (imm) == HIGH || aarch64_simd_valid_mov_imm (imm))
	{
	  emit_insn (gen_rtx_SET (dest, imm));
	  return;
	}

      if (CONST_VECTOR_P (imm) && aarch64_sve_data_mode_p (mode))
	if (rtx res = aarch64_expand_sve_const_vector (dest, imm))
	  {
	    if (dest != res)
	      emit_insn (gen_aarch64_sve_reinterpret (mode, dest, res));
	    return;
	  }

      rtx mem = force_const_mem (mode, imm);
      gcc_assert (mem);
      emit_move_insn (dest, mem);
      return;
    }

  aarch64_internal_mov_immediate (dest, imm, true, mode);
}
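
/* For example, a move of the form:

     (set (reg:DI x0) (const:DI (plus:DI (symbol_ref:DI "x") (const_int 12))))

   is handled above by splitting the constant into the symbol "x" and the
   offset 12, classifying the symbol (SYMBOL_SMALL_ABSOLUTE for a global
   under the default small code model), and then emitting an ADRP/ADD
   pair, with the offset folded into the relocation when it is in range.  */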
/* Return the MEM rtx that provides the canary value that should be used
   for stack-smashing protection.  MODE is the mode of the memory.
   For SSP_GLOBAL, DECL_RTL is the MEM rtx for the canary variable
   (__stack_chk_guard), otherwise it has no useful value.  SALT_TYPE
   indicates whether the caller is performing a SET or a TEST operation.  */

rtx
aarch64_stack_protect_canary_mem (machine_mode mode, rtx decl_rtl,
				  aarch64_salt_type salt_type)
{
  rtx addr;
  if (aarch64_stack_protector_guard == SSP_GLOBAL)
    {
      gcc_assert (MEM_P (decl_rtl));
      addr = XEXP (decl_rtl, 0);
      poly_int64 offset;
      rtx base = strip_offset_and_salt (addr, &offset);
      if (!SYMBOL_REF_P (base))
	return decl_rtl;

      rtvec v = gen_rtvec (2, base, GEN_INT (salt_type));
      addr = gen_rtx_UNSPEC (Pmode, v, UNSPEC_SALT_ADDR);
      addr = gen_rtx_CONST (Pmode, addr);
      addr = plus_constant (Pmode, addr, offset);
    }
  else
    {
      /* Calculate the address from the system register.  */
      rtx salt = GEN_INT (salt_type);
      addr = gen_reg_rtx (mode);
      if (mode == DImode)
	emit_insn (gen_reg_stack_protect_address_di (addr, salt));
      else
	{
	  emit_insn (gen_reg_stack_protect_address_si (addr, salt));
	  addr = convert_memory_address (Pmode, addr);
	}
      addr = plus_constant (Pmode, addr, aarch64_stack_protector_guard_offset);
    }
  return gen_rtx_MEM (mode, force_reg (Pmode, addr));
}
/* Emit an SVE predicated move from SRC to DEST.  PRED is a predicate
   that is known to contain PTRUE.  */

void
aarch64_emit_sve_pred_move (rtx dest, rtx pred, rtx src)
{
  expand_operand ops[3];
  machine_mode mode = GET_MODE (dest);
  create_output_operand (&ops[0], dest, mode);
  create_input_operand (&ops[1], pred, GET_MODE (pred));
  create_input_operand (&ops[2], src, mode);
  temporary_volatile_ok v (true);
  expand_insn (code_for_aarch64_pred_mov (mode), 3, ops);
}
/* Expand a pre-RA SVE data move from SRC to DEST in which at least one
   operand is in memory.  In this case we need to use the predicated LD1
   and ST1 instead of LDR and STR, both for correctness on big-endian
   targets and because LD1 and ST1 support a wider range of addressing modes.
   PRED_MODE is the mode of the predicate.

   See the comment at the head of aarch64-sve.md for details about the
   big-endian handling.  */

void
aarch64_expand_sve_mem_move (rtx dest, rtx src, machine_mode pred_mode)
{
  machine_mode mode = GET_MODE (dest);
  rtx ptrue = aarch64_ptrue_reg (pred_mode);
  if (!register_operand (src, mode)
      && !register_operand (dest, mode))
    {
      rtx tmp = gen_reg_rtx (mode);
      if (MEM_P (src))
	aarch64_emit_sve_pred_move (tmp, ptrue, src);
      else
	emit_move_insn (tmp, src);
      src = tmp;
    }

  aarch64_emit_sve_pred_move (dest, ptrue, src);
}
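
/* For example, a pre-RA copy of a VNx4SI value from one memory location
   to another is expanded above into a predicated LD1W into a fresh
   register followed by a predicated ST1W from it, both governed by an
   all-true predicate, rather than an LDR/STR pair that would use the
   wrong lane order on big-endian targets.  */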
/* Called only on big-endian targets.  See whether an SVE vector move
   from SRC to DEST is effectively a REV[BHW] instruction, because at
   least one operand is a subreg of an SVE vector that has wider or
   narrower elements.  Return true and emit the instruction if so.

   For example:

     (set (reg:VNx8HI R1) (subreg:VNx8HI (reg:VNx16QI R2) 0))

   represents a VIEW_CONVERT between the following vectors, viewed
   in memory order:

     R2: { [0].high, [0].low,  [1].high, [1].low, ... }
     R1: { [0],      [1],      [2],      [3],     ... }

   The high part of lane X in R2 should therefore correspond to lane X*2
   of R1, but the register representations are:

	 msb                                      lsb
     R2: ...... [1].high [1].low   [0].high [0].low
     R1: ...... [3]      [2]       [1]      [0]

   where the low part of lane X in R2 corresponds to lane X*2 in R1.
   We therefore need a reverse operation to swap the high and low values
   around.

   This is purely an optimization.  Without it we would spill the
   subreg operand to the stack in one mode and reload it in the
   other mode, which has the same effect as the REV.  */

bool
aarch64_maybe_expand_sve_subreg_move (rtx dest, rtx src)
{
  gcc_assert (BYTES_BIG_ENDIAN);

  /* Do not try to optimize subregs that LRA has created for matched
     reloads.  These subregs only exist as a temporary measure to make
     the RTL well-formed, but they are exempt from the usual
     TARGET_CAN_CHANGE_MODE_CLASS rules.

     For example, if we have:

       (set (reg:VNx8HI R1) (foo:VNx8HI (reg:VNx4SI R2)))

     and the constraints require R1 and R2 to be in the same register,
     LRA may need to create RTL such as:

       (set (subreg:VNx4SI (reg:VNx8HI TMP) 0) (reg:VNx4SI R2))
       (set (reg:VNx8HI TMP) (foo:VNx8HI (subreg:VNx4SI (reg:VNx8HI TMP) 0)))
       (set (reg:VNx8HI R1) (reg:VNx8HI TMP))

     which forces both the input and output of the original instruction
     to use the same hard register.  But for this to work, the normal
     rules have to be suppressed on the subreg input, otherwise LRA
     would need to reload that input too, meaning that the process
     would never terminate.  To compensate for this, the normal rules
     are also suppressed for the subreg output of the first move.
     Ignoring the special case and handling the first move normally
     would therefore generate wrong code: we would reverse the elements
     for the first subreg but not reverse them back for the second subreg.  */
  if (SUBREG_P (dest) && !LRA_SUBREG_P (dest))
    dest = SUBREG_REG (dest);
  if (SUBREG_P (src) && !LRA_SUBREG_P (src))
    src = SUBREG_REG (src);

  /* The optimization handles two single SVE REGs with different element
     sizes.  */
  if (!REG_P (dest)
      || !REG_P (src)
      || aarch64_classify_vector_mode (GET_MODE (dest)) != VEC_SVE_DATA
      || aarch64_classify_vector_mode (GET_MODE (src)) != VEC_SVE_DATA
      || (GET_MODE_UNIT_SIZE (GET_MODE (dest))
	  == GET_MODE_UNIT_SIZE (GET_MODE (src))))
    return false;

  /* Generate *aarch64_sve_mov<mode>_subreg_be.  */
  rtx ptrue = aarch64_ptrue_reg (VNx16BImode);
  rtx unspec = gen_rtx_UNSPEC (GET_MODE (dest), gen_rtvec (2, ptrue, src),
			       UNSPEC_REV_SUBREG);
  emit_insn (gen_rtx_SET (dest, unspec));
  return true;
}
/* Return a copy of X with mode MODE, without changing its other
   attributes.  Unlike gen_lowpart, this doesn't care whether the
   mode change is valid.  */

rtx
aarch64_replace_reg_mode (rtx x, machine_mode mode)
{
  if (GET_MODE (x) == mode)
    return x;

  x = shallow_copy_rtx (x);
  set_mode_and_regno (x, mode, REGNO (x));
  return x;
}
/* Return the SVE REV[BHW] unspec for reversing quantities of mode MODE
   stored in wider integer containers.  */

static unsigned int
aarch64_sve_rev_unspec (machine_mode mode)
{
  switch (GET_MODE_UNIT_SIZE (mode))
    {
    case 1: return UNSPEC_REVB;
    case 2: return UNSPEC_REVH;
    case 4: return UNSPEC_REVW;
    }
  gcc_unreachable ();
}
/* Split a *aarch64_sve_mov<mode>_subreg_be pattern with the given
   operands.  */

void
aarch64_split_sve_subreg_move (rtx dest, rtx ptrue, rtx src)
{
  /* Decide which REV operation we need.  The mode with wider elements
     determines the mode of the operands and the mode with the narrower
     elements determines the reverse width.  */
  machine_mode mode_with_wider_elts = aarch64_sve_int_mode (GET_MODE (dest));
  machine_mode mode_with_narrower_elts = aarch64_sve_int_mode (GET_MODE (src));
  if (GET_MODE_UNIT_SIZE (mode_with_wider_elts)
      < GET_MODE_UNIT_SIZE (mode_with_narrower_elts))
    std::swap (mode_with_wider_elts, mode_with_narrower_elts);

  unsigned int unspec = aarch64_sve_rev_unspec (mode_with_narrower_elts);
  machine_mode pred_mode = aarch64_sve_pred_mode (mode_with_wider_elts);

  /* Get the operands in the appropriate modes and emit the instruction.  */
  ptrue = gen_lowpart (pred_mode, ptrue);
  dest = aarch64_replace_reg_mode (dest, mode_with_wider_elts);
  src = aarch64_replace_reg_mode (src, mode_with_wider_elts);
  emit_insn (gen_aarch64_pred (unspec, mode_with_wider_elts,
			       dest, ptrue, src));
}
/* Implement TARGET_FUNCTION_OK_FOR_SIBCALL.  */

static bool
aarch64_function_ok_for_sibcall (tree, tree exp)
{
  auto from_abi = crtl->abi->id ();
  auto to_abi = expr_callee_abi (exp).id ();

  /* ARM_PCS_SVE preserves strictly more than ARM_PCS_SIMD, which in
     turn preserves strictly more than the base PCS.  The callee must
     preserve everything that the caller is required to preserve.  */
  if (from_abi != to_abi && to_abi == ARM_PCS_SVE)
    to_abi = ARM_PCS_SIMD;
  if (from_abi != to_abi && to_abi == ARM_PCS_SIMD)
    to_abi = ARM_PCS_AAPCS64;
  if (from_abi != to_abi)
    return false;

  tree fntype = TREE_TYPE (TREE_TYPE (CALL_EXPR_FN (exp)));
  if (aarch64_fntype_pstate_sm (fntype) & ~aarch64_cfun_incoming_pstate_sm ())
    return false;
  for (auto state : { "za", "zt0" })
    if (bool (aarch64_cfun_shared_flags (state))
	!= bool (aarch64_fntype_shared_flags (fntype, state)))
      return false;

  /* BTI J is needed where indirect_return functions may return
     if bti is enabled there.  */
  if (lookup_attribute ("indirect_return", TYPE_ATTRIBUTES (fntype))
      && !lookup_attribute ("indirect_return",
			    TYPE_ATTRIBUTES (TREE_TYPE (cfun->decl))))
    return false;

  return true;
}
/* Subroutine of aarch64_pass_by_reference for arguments that are not
   passed in SVE registers.  */

static bool
aarch64_pass_by_reference_1 (CUMULATIVE_ARGS *pcum,
			     const function_arg_info &arg)
{
  HOST_WIDE_INT size;
  machine_mode dummymode;
  int nregs;

  /* GET_MODE_SIZE (BLKmode) is useless since it is 0.  */
  if (arg.mode == BLKmode && arg.type)
    size = int_size_in_bytes (arg.type);
  else
    /* No frontends can create types with variable-sized modes, so we
       shouldn't be asked to pass or return them.  */
    size = GET_MODE_SIZE (arg.mode).to_constant ();

  /* Aggregates are passed by reference based on their size.  */
  if (arg.aggregate_type_p ())
    size = int_size_in_bytes (arg.type);

  /* Variable sized arguments are always returned by reference.  */
  if (size < 0)
    return true;

  /* Can this be a candidate to be passed in fp/simd register(s)?  */
  if (aarch64_vfp_is_call_or_return_candidate (arg.mode, arg.type,
					       &dummymode, &nregs, NULL,
					       !pcum || pcum->silent_p))
    return false;

  /* Arguments which are variable sized or larger than 2 registers are
     passed by reference unless they are a homogenous floating point
     aggregate.  */
  return size > 2 * UNITS_PER_WORD;
}
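
/* For example, a plain structure of three uint64_t fields (24 bytes)
   fails the 2-register test above and is passed by reference, whereas an
   HFA of four doubles (32 bytes) is accepted earlier as an fp/simd
   candidate and is still passed by value in vector registers.  */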
/* Implement TARGET_PASS_BY_REFERENCE.  */

static bool
aarch64_pass_by_reference (cumulative_args_t pcum_v,
			   const function_arg_info &arg)
{
  CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);

  if (!arg.type)
    return aarch64_pass_by_reference_1 (pcum, arg);

  pure_scalable_type_info pst_info;
  switch (pst_info.analyze (arg.type))
    {
    case pure_scalable_type_info::IS_PST:
      if (pcum && !pcum->silent_p && !TARGET_SVE)
	/* We can't gracefully recover at this point, so make this a
	   fatal error.  */
	fatal_error (input_location, "arguments of type %qT require"
		     " the SVE ISA extension", arg.type);

      /* Variadic SVE types are passed by reference.  Normal non-variadic
	 arguments are too if we've run out of registers.  */
      return (!arg.named
	      || pcum->aapcs_nvrn + pst_info.num_zr () > NUM_FP_ARG_REGS
	      || pcum->aapcs_nprn + pst_info.num_pr () > NUM_PR_ARG_REGS);

    case pure_scalable_type_info::DOESNT_MATTER:
      gcc_assert (aarch64_pass_by_reference_1 (pcum, arg));
      return true;

    case pure_scalable_type_info::NO_ABI_IDENTITY:
    case pure_scalable_type_info::ISNT_PST:
      return aarch64_pass_by_reference_1 (pcum, arg);
    }
  gcc_unreachable ();
}
/* Return TRUE if VALTYPE is padded to its least significant bits.  */

static bool
aarch64_return_in_msb (const_tree valtype)
{
  machine_mode dummy_mode;
  int dummy_int;

  /* Never happens in little-endian mode.  */
  if (!BYTES_BIG_ENDIAN)
    return false;

  /* Only composite types smaller than or equal to 16 bytes can
     be potentially returned in registers.  */
  if (!aarch64_composite_type_p (valtype, TYPE_MODE (valtype))
      || int_size_in_bytes (valtype) <= 0
      || int_size_in_bytes (valtype) > 16)
    return false;

  /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
     or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
     is always passed/returned in the least significant bits of fp/simd
     register(s).  */
  if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype), valtype,
					       &dummy_mode, &dummy_int, NULL,
					       false))
    return false;

  /* Likewise pure scalable types for SVE vector and predicate registers.  */
  pure_scalable_type_info pst_info;
  if (pst_info.analyze_registers (valtype))
    return false;

  return true;
}
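
/* The effect of the hook above is that, on big-endian targets, a small
   composite return value such as struct { short a, b; } is placed in the
   most significant bits of x0 (as if the structure had been stored to
   memory and loaded with a 64-bit load), while little-endian targets
   return it in the least significant bits.  */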
/* Implement TARGET_FUNCTION_VALUE.
   Define how to find the value returned by a function.  */

static rtx
aarch64_function_value (const_tree type, const_tree func,
			bool outgoing ATTRIBUTE_UNUSED)
{
  machine_mode mode;
  int unsignedp;

  mode = TYPE_MODE (type);
  if (INTEGRAL_TYPE_P (type))
    mode = promote_function_mode (type, mode, &unsignedp, func, 1);

  pure_scalable_type_info pst_info;
  if (type && pst_info.analyze_registers (type))
    return pst_info.get_rtx (mode, V0_REGNUM, P0_REGNUM);

  /* Generic vectors that map to full SVE modes with -msve-vector-bits=N
     are returned in memory, not by value.  */
  unsigned int vec_flags = aarch64_classify_vector_mode (mode);
  bool sve_p = (vec_flags & VEC_ANY_SVE);

  if (aarch64_return_in_msb (type))
    {
      HOST_WIDE_INT size = int_size_in_bytes (type);

      if (size % UNITS_PER_WORD != 0)
	{
	  size += UNITS_PER_WORD - size % UNITS_PER_WORD;
	  mode = int_mode_for_size (size * BITS_PER_UNIT, 0).require ();
	}
    }

  int count;
  machine_mode ag_mode;
  if (aarch64_vfp_is_call_or_return_candidate (mode, type, &ag_mode, &count,
					       NULL, false))
    {
      gcc_assert (!sve_p);
      if (!aarch64_composite_type_p (type, mode))
	{
	  gcc_assert (count == 1 && mode == ag_mode);
	  return gen_rtx_REG (mode, V0_REGNUM);
	}
      else if (aarch64_advsimd_full_struct_mode_p (mode)
	       && known_eq (GET_MODE_SIZE (ag_mode), 16))
	return gen_rtx_REG (mode, V0_REGNUM);
      else if (aarch64_advsimd_partial_struct_mode_p (mode)
	       && known_eq (GET_MODE_SIZE (ag_mode), 8))
	return gen_rtx_REG (mode, V0_REGNUM);
      else
	{
	  int i;
	  rtx par;

	  par = gen_rtx_PARALLEL (mode, rtvec_alloc (count));
	  for (i = 0; i < count; i++)
	    {
	      rtx tmp = gen_rtx_REG (ag_mode, V0_REGNUM + i);
	      rtx offset = gen_int_mode (i * GET_MODE_SIZE (ag_mode), Pmode);
	      tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
	      XVECEXP (par, 0, i) = tmp;
	    }
	  return par;
	}
    }
  else
    {
      if (sve_p)
	{
	  /* Vector types can acquire a partial SVE mode using things like
	     __attribute__((vector_size(N))), and this is potentially useful.
	     However, the choice of mode doesn't affect the type's ABI
	     identity, so we should treat the types as though they had
	     the associated integer mode, just like they did before SVE
	     was introduced.

	     We know that the vector must be 128 bits or smaller,
	     otherwise we'd have returned it in memory instead.  */
	  gcc_assert (type
		      && (aarch64_some_values_include_pst_objects_p (type)
			  || (vec_flags & VEC_PARTIAL)));

	  scalar_int_mode int_mode = int_mode_for_mode (mode).require ();
	  rtx reg = gen_rtx_REG (int_mode, R0_REGNUM);
	  rtx pair = gen_rtx_EXPR_LIST (VOIDmode, reg, const0_rtx);
	  return gen_rtx_PARALLEL (mode, gen_rtvec (1, pair));
	}
      return gen_rtx_REG (mode, R0_REGNUM);
    }
}
/* Implements TARGET_FUNCTION_VALUE_REGNO_P.
   Return true if REGNO is the number of a hard register in which the values
   of called function may come back.  */

static bool
aarch64_function_value_regno_p (const unsigned int regno)
{
  /* Maximum of 16 bytes can be returned in the general registers.  Examples
     of 16-byte return values are: 128-bit integers and 16-byte small
     structures (excluding homogeneous floating-point aggregates).  */
  if (regno == R0_REGNUM || regno == R1_REGNUM)
    return true;

  /* Up to four fp/simd registers can return a function value, e.g. a
     homogeneous floating-point aggregate having four members.  */
  if (regno >= V0_REGNUM && regno < V0_REGNUM + HA_MAX_NUM_FLDS)
    return TARGET_FLOAT;

  if (regno >= P0_REGNUM && regno < P0_REGNUM + HA_MAX_NUM_FLDS)
    return TARGET_SVE;

  return false;
}
/* Subroutine for aarch64_return_in_memory for types that are not returned
   in SVE registers.  */

static bool
aarch64_return_in_memory_1 (const_tree type)
{
  HOST_WIDE_INT size;
  machine_mode ag_mode;
  int count;

  if (!AGGREGATE_TYPE_P (type)
      && TREE_CODE (type) != BITINT_TYPE
      && TREE_CODE (type) != COMPLEX_TYPE
      && TREE_CODE (type) != VECTOR_TYPE)
    /* Simple scalar types always returned in registers.  */
    return false;

  if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type,
					       &ag_mode, &count, NULL, false))
    return false;

  /* Types larger than 2 registers returned in memory.  */
  size = int_size_in_bytes (type);
  return (size < 0 || size > 2 * UNITS_PER_WORD);
}
/* Implement TARGET_RETURN_IN_MEMORY.

   If the type T of the result of a function is such that
     void func (T arg)
   would require that arg be passed as a value in a register (or set of
   registers) according to the parameter passing rules, then the result
   is returned in the same registers as would be used for such an
   argument.  */

static bool
aarch64_return_in_memory (const_tree type, const_tree fndecl ATTRIBUTE_UNUSED)
{
  pure_scalable_type_info pst_info;
  switch (pst_info.analyze (type))
    {
    case pure_scalable_type_info::IS_PST:
      return (pst_info.num_zr () > NUM_FP_ARG_REGS
	      || pst_info.num_pr () > NUM_PR_ARG_REGS);

    case pure_scalable_type_info::DOESNT_MATTER:
      gcc_assert (aarch64_return_in_memory_1 (type));
      return true;

    case pure_scalable_type_info::NO_ABI_IDENTITY:
    case pure_scalable_type_info::ISNT_PST:
      return aarch64_return_in_memory_1 (type);
    }
  gcc_unreachable ();
}
static bool
aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v, machine_mode mode,
			       const_tree type, int *nregs)
{
  CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
  return aarch64_vfp_is_call_or_return_candidate (mode, type,
						  &pcum->aapcs_vfp_rmode,
						  nregs, NULL, pcum->silent_p);
}
/* Given MODE and TYPE of a function argument, return the alignment in
   bits.  The idea is to suppress any stronger alignment requested by
   the user and opt for the natural alignment (specified in AAPCS64 \S
   4.1).  ABI_BREAK_GCC_9 is set to the old alignment if the alignment
   was incorrectly calculated in versions of GCC prior to GCC 9.
   ABI_BREAK_GCC_13 is set to the old alignment if it was incorrectly
   calculated in versions between GCC 9 and GCC 13.  If the alignment
   might have changed between GCC 13 and GCC 14, ABI_BREAK_GCC_14
   is the old GCC 13 alignment, otherwise it is zero.

   This is a helper function for local use only.  */

static unsigned int
aarch64_function_arg_alignment (machine_mode mode, const_tree type,
				unsigned int *abi_break_gcc_9,
				unsigned int *abi_break_gcc_13,
				unsigned int *abi_break_gcc_14)
{
  *abi_break_gcc_9 = 0;
  *abi_break_gcc_13 = 0;
  *abi_break_gcc_14 = 0;
  if (!type)
    return GET_MODE_ALIGNMENT (mode);

  if (integer_zerop (TYPE_SIZE (type)))
    return 0;

  gcc_assert (TYPE_MODE (type) == mode);

  if (!AGGREGATE_TYPE_P (type))
    {
      /* The ABI alignment is the natural alignment of the type, without
	 any attributes applied.  Normally this is the alignment of the
	 TYPE_MAIN_VARIANT, but not always; see PR108910 for a counterexample.
	 For now we just handle the known exceptions explicitly.  */
      type = TYPE_MAIN_VARIANT (type);
      if (POINTER_TYPE_P (type))
	{
	  gcc_assert (known_eq (POINTER_SIZE, GET_MODE_BITSIZE (mode)));
	  return POINTER_SIZE;
	}
      if (TREE_CODE (type) == ENUMERAL_TYPE && TREE_TYPE (type))
	{
	  *abi_break_gcc_14 = TYPE_ALIGN (type);
	  type = TYPE_MAIN_VARIANT (TREE_TYPE (type));
	}
      gcc_assert (!TYPE_USER_ALIGN (type));
      return TYPE_ALIGN (type);
    }

  if (TREE_CODE (type) == ARRAY_TYPE)
    return TYPE_ALIGN (TREE_TYPE (type));

  unsigned int alignment = 0;
  unsigned int bitfield_alignment_with_packed = 0;
  unsigned int bitfield_alignment = 0;
  for (tree field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
    if (TREE_CODE (field) == FIELD_DECL)
      {
	/* Note that we explicitly consider zero-sized fields here,
	   even though they don't map to AAPCS64 machine types.
	   For example:

	   struct __attribute__((aligned(8))) empty {};

	   struct s {
	     [[no_unique_address]] empty e;
	     int x;
	   };

	   "s" contains only one Fundamental Data Type (the int field)
	   but gains 8-byte alignment and size thanks to "e".  */
	alignment = std::max (alignment, DECL_ALIGN (field));
	if (DECL_BIT_FIELD_TYPE (field))
	  {
	    /* Take the bit-field type's alignment into account only
	       if the user didn't reduce this field's alignment with
	       the packed attribute.  */
	    if (!DECL_PACKED (field))
	      bitfield_alignment
		= std::max (bitfield_alignment,
			    TYPE_ALIGN (DECL_BIT_FIELD_TYPE (field)));

	    /* Compute the alignment even if the bit-field is
	       packed, so that we can emit a warning in case the
	       alignment changed between GCC versions.  */
	    bitfield_alignment_with_packed
	      = std::max (bitfield_alignment_with_packed,
			  TYPE_ALIGN (DECL_BIT_FIELD_TYPE (field)));
	  }
      }

  /* Emit a warning if the alignment is different when taking the
     'packed' attribute into account.  */
  if (bitfield_alignment != bitfield_alignment_with_packed
      && bitfield_alignment_with_packed > alignment)
    *abi_break_gcc_13 = bitfield_alignment_with_packed;

  if (bitfield_alignment > alignment)
    {
      *abi_break_gcc_9 = alignment;
      return bitfield_alignment;
    }

  return alignment;
}
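
/* As an illustration of the bit-field handling above: for a structure
   whose only 16-byte-aligned member is a bit-field declared with an
   __int128 underlying type, the fields themselves only require byte
   alignment but BITFIELD_ALIGNMENT becomes 128 bits, so the function
   returns 128 and records the smaller pre-GCC-9 value in
   *ABI_BREAK_GCC_9.  If that bit-field also carried the packed
   attribute, only BITFIELD_ALIGNMENT_WITH_PACKED would see the 128-bit
   value and *ABI_BREAK_GCC_13 would be set instead.  */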
/* Return true if TYPE describes a _BitInt(N) or an aggregate that uses the
   _BitInt(N) type.  These include ARRAY_TYPE's with an element that is a
   _BitInt(N) or an aggregate that uses it, and a RECORD_TYPE or a UNION_TYPE
   with a field member that is a _BitInt(N) or an aggregate that uses it.
   Return false otherwise.  */

static bool
bitint_or_aggr_of_bitint_p (tree type)
{
  if (!type)
    return false;

  if (TREE_CODE (type) == BITINT_TYPE)
    return true;

  /* If ARRAY_TYPE, check its element type.  */
  if (TREE_CODE (type) == ARRAY_TYPE)
    return bitint_or_aggr_of_bitint_p (TREE_TYPE (type));

  /* If RECORD_TYPE or UNION_TYPE, check the fields' types.  */
  if (RECORD_OR_UNION_TYPE_P (type))
    for (tree field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
      {
	if (TREE_CODE (field) != FIELD_DECL)
	  continue;
	if (bitint_or_aggr_of_bitint_p (TREE_TYPE (field)))
	  return true;
      }
  return false;
}
/* Layout a function argument according to the AAPCS64 rules.  The rule
   numbers refer to the rule numbers in the AAPCS64.  ORIG_MODE is the
   mode that was originally given to us by the target hook, whereas the
   mode in ARG might be the result of replacing partial SVE modes with
   the equivalent integer mode.  */

static void
aarch64_layout_arg (cumulative_args_t pcum_v, const function_arg_info &arg)
{
  CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
  tree type = arg.type;
  machine_mode mode = arg.mode;
  int ncrn, nvrn, nregs;
  bool allocate_ncrn, allocate_nvrn;
  HOST_WIDE_INT size;
  unsigned int abi_break_gcc_9;
  unsigned int abi_break_gcc_13;
  unsigned int abi_break_gcc_14;

  /* We need to do this once per argument.  */
  if (pcum->aapcs_arg_processed)
    return;

  bool warn_pcs_change
    = (warn_psabi
       && !pcum->silent_p
       && (currently_expanding_function_start
	   || currently_expanding_gimple_stmt));

  /* HFAs and HVAs can have an alignment greater than 16 bytes.  For example:

       typedef struct foo {
	 __Int8x16_t foo[2] __attribute__((aligned(32)));
       } foo;

     is still a HVA despite its larger-than-normal alignment.
     However, such over-aligned HFAs and HVAs are guaranteed to have
     no padding.

     If we exclude HFAs and HVAs from the discussion below, then there
     are several things to note:

     - Both the C and AAPCS64 interpretations of a type's alignment should
       give a value that is no greater than the type's size.

     - Types bigger than 16 bytes are passed indirectly.

     - If an argument of type T is passed indirectly, TYPE and MODE describe
       a pointer to T rather than T itself.

     It follows that the AAPCS64 alignment of TYPE must be no greater
     than 16 bytes.

     Versions prior to GCC 9.1 ignored a bitfield's underlying type
     and so could calculate an alignment that was too small.  If this
     happened for TYPE then ABI_BREAK_GCC_9 is this older, too-small alignment.

     Although GCC 9.1 fixed that bug, it introduced a different one:
     it would consider the alignment of a bitfield's underlying type even
     if the field was packed (which should have the effect of overriding
     the alignment of the underlying type).  This was fixed in GCC 13.1.

     As a result of this bug, GCC 9 to GCC 12 could calculate an alignment
     that was too big.  If this happened for TYPE, ABI_BREAK_GCC_13 is
     this older, too-big alignment.

     Also, the fact that GCC 9 to GCC 12 considered irrelevant
     alignments meant they could calculate type alignments that were
     bigger than the type's size, contrary to the assumption above.
     The handling of register arguments was nevertheless (and justifiably)
     written to follow the assumption that the alignment can never be
     greater than the size.  The same was not true for stack arguments;
     their alignment was instead handled by MIN bounds in
     aarch64_function_arg_boundary.

     The net effect is that, if GCC 9 to GCC 12 incorrectly calculated
     an alignment of more than 16 bytes for TYPE then:

     - If the argument was passed in registers, these GCC versions
       would treat the alignment as though it was *less than* 16 bytes.

     - If the argument was passed on the stack, these GCC versions
       would treat the alignment as though it was *equal to* 16 bytes.

     Both behaviors were wrong, but in different cases.  */

  pcum->aapcs_arg_processed = true;

  pure_scalable_type_info pst_info;
  if (type && pst_info.analyze_registers (type))
    {
      /* aarch64_function_arg_alignment has never had an effect on
	 this case.  */

      /* The PCS says that it is invalid to pass an SVE value to an
	 unprototyped function.  There is no ABI-defined location we
	 can return in this case, so we have no real choice but to raise
	 an error immediately, even though this is only a query function.  */
      if (arg.named && pcum->pcs_variant != ARM_PCS_SVE)
	{
	  gcc_assert (!pcum->silent_p);
	  error ("SVE type %qT cannot be passed to an unprototyped function",
		 arg.type);
	  /* Avoid repeating the message, and avoid tripping the assert
	     below.  */
	  pcum->pcs_variant = ARM_PCS_SVE;
	}

      /* We would have converted the argument into pass-by-reference
	 form if it didn't fit in registers.  */
      pcum->aapcs_nextnvrn = pcum->aapcs_nvrn + pst_info.num_zr ();
      pcum->aapcs_nextnprn = pcum->aapcs_nprn + pst_info.num_pr ();
      gcc_assert (arg.named
		  && pcum->pcs_variant == ARM_PCS_SVE
		  && pcum->aapcs_nextnvrn <= NUM_FP_ARG_REGS
		  && pcum->aapcs_nextnprn <= NUM_PR_ARG_REGS);
      pcum->aapcs_reg = pst_info.get_rtx (mode, V0_REGNUM + pcum->aapcs_nvrn,
					  P0_REGNUM + pcum->aapcs_nprn);
      return;
    }

  /* Generic vectors that map to full SVE modes with -msve-vector-bits=N
     are passed by reference, not by value.  */
  unsigned int vec_flags = aarch64_classify_vector_mode (mode);
  bool sve_p = (vec_flags & VEC_ANY_SVE);
  if (sve_p)
    /* Vector types can acquire a partial SVE mode using things like
       __attribute__((vector_size(N))), and this is potentially useful.
       However, the choice of mode doesn't affect the type's ABI
       identity, so we should treat the types as though they had
       the associated integer mode, just like they did before SVE
       was introduced.

       We know that the vector must be 128 bits or smaller,
       otherwise we'd have passed it in memory instead.  */
    gcc_assert (type
		&& (aarch64_some_values_include_pst_objects_p (type)
		    || (vec_flags & VEC_PARTIAL)));

  /* Size in bytes, rounded to the nearest multiple of 8 bytes.  */
  if (type)
    size = int_size_in_bytes (type);
  else
    /* No frontends can create types with variable-sized modes, so we
       shouldn't be asked to pass or return them.  */
    size = GET_MODE_SIZE (mode).to_constant ();
  size = ROUND_UP (size, UNITS_PER_WORD);

  allocate_ncrn = (type) ? !(FLOAT_TYPE_P (type)) : !FLOAT_MODE_P (mode);
  allocate_nvrn = aarch64_vfp_is_call_candidate (pcum_v,
						 mode,
						 type,
						 &nregs);
  gcc_assert (!sve_p || !allocate_nvrn);

  unsigned int alignment
    = aarch64_function_arg_alignment (mode, type, &abi_break_gcc_9,
				      &abi_break_gcc_13, &abi_break_gcc_14);

  gcc_assert ((allocate_nvrn || alignment <= 16 * BITS_PER_UNIT)
	      && (!alignment || abi_break_gcc_9 < alignment)
	      && (!abi_break_gcc_13 || alignment < abi_break_gcc_13));

  /* _BitInt(N) was only added in GCC 14.  */
  bool warn_pcs_change_le_gcc14
    = warn_pcs_change && !bitint_or_aggr_of_bitint_p (type);

  /* allocate_ncrn may be false-positive, but allocate_nvrn is quite reliable.
     The following code thus handles passing by SIMD/FP registers first.  */

  nvrn = pcum->aapcs_nvrn;

  /* C1 - C5 for floating point, homogenous floating point aggregates (HFA)
     and homogenous short-vector aggregates (HVA).  */
  if (allocate_nvrn)
    {
      /* aarch64_function_arg_alignment has never had an effect on
	 this case.  */
      if (!pcum->silent_p && !TARGET_FLOAT)
	aarch64_err_no_fpadvsimd (mode);

      if (nvrn + nregs <= NUM_FP_ARG_REGS)
	{
	  pcum->aapcs_nextnvrn = nvrn + nregs;
	  if (!aarch64_composite_type_p (type, mode))
	    {
	      gcc_assert (nregs == 1);
	      pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
	    }
	  else if (aarch64_advsimd_full_struct_mode_p (mode)
		   && known_eq (GET_MODE_SIZE (pcum->aapcs_vfp_rmode), 16))
	    pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
	  else if (aarch64_advsimd_partial_struct_mode_p (mode)
		   && known_eq (GET_MODE_SIZE (pcum->aapcs_vfp_rmode), 8))
	    pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
	  else
	    {
	      rtx par;
	      int i;

	      par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
	      for (i = 0; i < nregs; i++)
		{
		  rtx tmp = gen_rtx_REG (pcum->aapcs_vfp_rmode,
					 V0_REGNUM + nvrn + i);
		  rtx offset = gen_int_mode
		    (i * GET_MODE_SIZE (pcum->aapcs_vfp_rmode), Pmode);
		  tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
		  XVECEXP (par, 0, i) = tmp;
		}
	      pcum->aapcs_reg = par;
	    }
	  return;
	}
      else
	{
	  /* C.3 NSRN is set to 8.  */
	  pcum->aapcs_nextnvrn = NUM_FP_ARG_REGS;
	  goto on_stack;
	}
    }

  ncrn = pcum->aapcs_ncrn;
  nregs = size / UNITS_PER_WORD;

  /* C6 - C9.  though the sign and zero extension semantics are
     handled elsewhere.  This is the case where the argument fits
     entirely in general registers.  */
  if (allocate_ncrn && (ncrn + nregs <= NUM_ARG_REGS))
    {
      gcc_assert (nregs == 0 || nregs == 1 || nregs == 2);

      /* C.8 if the argument has an alignment of 16 then the NGRN is
	 rounded up to the next even number.  */
      if (nregs == 2
	  && ncrn % 2)
	{
	  /* Emit a warning if the alignment changed when taking the
	     'packed' attribute into account.  */
	  if (warn_pcs_change_le_gcc14
	      && abi_break_gcc_13
	      && ((abi_break_gcc_13 == 16 * BITS_PER_UNIT)
		  != (alignment == 16 * BITS_PER_UNIT)))
	    inform (input_location, "parameter passing for argument of type "
		    "%qT changed in GCC 13.1", type);

	  if (warn_pcs_change_le_gcc14
	      && abi_break_gcc_14
	      && ((abi_break_gcc_14 == 16 * BITS_PER_UNIT)
		  != (alignment == 16 * BITS_PER_UNIT)))
	    inform (input_location, "parameter passing for argument of type "
		    "%qT changed in GCC 14.1", type);

	  /* The == 16 * BITS_PER_UNIT instead of >= 16 * BITS_PER_UNIT
	     comparison is there because for > 16 * BITS_PER_UNIT
	     alignment nregs should be > 2 and therefore it should be
	     passed by reference rather than value.  */
	  if (alignment == 16 * BITS_PER_UNIT)
	    {
	      if (warn_pcs_change_le_gcc14
		  && abi_break_gcc_9)
		inform (input_location, "parameter passing for argument of type "
			"%qT changed in GCC 9.1", type);
	      ++ncrn;
	      gcc_assert (ncrn + nregs <= NUM_ARG_REGS);
	    }
	}

      /* If an argument with an SVE mode needs to be shifted up to the
	 high part of the register, treat it as though it had an integer mode.
	 Using the normal (parallel [...]) would suppress the shifting.  */
      if (sve_p
	  && BYTES_BIG_ENDIAN
	  && maybe_ne (GET_MODE_SIZE (mode), nregs * UNITS_PER_WORD)
	  && aarch64_pad_reg_upward (mode, type, false))
	{
	  mode = int_mode_for_mode (mode).require ();
	  sve_p = false;
	}

      /* NREGS can be 0 when e.g. an empty structure is to be passed.
	 A reg is still generated for it, but the caller should be smart
	 enough not to use it.  */
      if (nregs == 0
	  || (nregs == 1 && !sve_p)
	  || GET_MODE_CLASS (mode) == MODE_INT)
	pcum->aapcs_reg = gen_rtx_REG (mode, R0_REGNUM + ncrn);
      else
	{
	  rtx par;
	  int i;

	  par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
	  for (i = 0; i < nregs; i++)
	    {
	      scalar_int_mode reg_mode = word_mode;
	      if (nregs == 1)
		reg_mode = int_mode_for_mode (mode).require ();
	      rtx tmp = gen_rtx_REG (reg_mode, R0_REGNUM + ncrn + i);
	      tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
				       GEN_INT (i * UNITS_PER_WORD));
	      XVECEXP (par, 0, i) = tmp;
	    }
	  pcum->aapcs_reg = par;
	}

      pcum->aapcs_nextncrn = ncrn + nregs;
      return;
    }

  /* C.11  */
  pcum->aapcs_nextncrn = NUM_ARG_REGS;

  /* The argument is passed on stack; record the needed number of words for
     this argument and align the total size if necessary.  */
on_stack:
  pcum->aapcs_stack_words = size / UNITS_PER_WORD;

  if (warn_pcs_change_le_gcc14
      && abi_break_gcc_13
      && ((abi_break_gcc_13 >= 16 * BITS_PER_UNIT)
	  != (alignment >= 16 * BITS_PER_UNIT)))
    inform (input_location, "parameter passing for argument of type "
	    "%qT changed in GCC 13.1", type);

  if (warn_pcs_change_le_gcc14
      && abi_break_gcc_14
      && ((abi_break_gcc_14 >= 16 * BITS_PER_UNIT)
	  != (alignment >= 16 * BITS_PER_UNIT)))
    inform (input_location, "parameter passing for argument of type "
	    "%qT changed in GCC 14.1", type);

  if (alignment == 16 * BITS_PER_UNIT)
    {
      int new_size = ROUND_UP (pcum->aapcs_stack_size, 16 / UNITS_PER_WORD);
      if (pcum->aapcs_stack_size != new_size)
	{
	  if (warn_pcs_change_le_gcc14
	      && abi_break_gcc_9)
	    inform (input_location, "parameter passing for argument of type "
		    "%qT changed in GCC 9.1", type);
	  pcum->aapcs_stack_size = new_size;
	}
    }
  return;
}
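
/* As a concrete example of rule C.8 above: for

     void f (int x, __int128 y);

   X occupies w0 and leaves NCRN == 1.  Y needs two registers and has
   16-byte alignment, so NCRN is rounded up to 2 and Y is passed in the
   even-numbered pair x2/x3, leaving x1 unused.  */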
/* Add the current argument register to the set of those that need
   to be saved and restored around a change to PSTATE.SM.  */

static void
aarch64_record_sme_mode_switch_args (CUMULATIVE_ARGS *pcum)
{
  subrtx_var_iterator::array_type array;
  FOR_EACH_SUBRTX_VAR (iter, array, pcum->aapcs_reg, NONCONST)
    {
      rtx x = *iter;
      if (REG_P (x) && (FP_REGNUM_P (REGNO (x)) || PR_REGNUM_P (REGNO (x))))
	{
	  unsigned int i = pcum->num_sme_mode_switch_args++;
	  gcc_assert (i < ARRAY_SIZE (pcum->sme_mode_switch_args));
	  pcum->sme_mode_switch_args[i] = x;
	}
    }
}
/* Return a parallel that contains all the registers that need to be
   saved around a change to PSTATE.SM.  Return const0_rtx if there is
   no such mode switch, or if no registers need to be saved.  */

static rtx
aarch64_finish_sme_mode_switch_args (CUMULATIVE_ARGS *pcum)
{
  if (!pcum->num_sme_mode_switch_args)
    return const0_rtx;

  auto argvec = gen_rtvec_v (pcum->num_sme_mode_switch_args,
			     pcum->sme_mode_switch_args);
  return gen_rtx_PARALLEL (VOIDmode, argvec);
}
/* Implement TARGET_FUNCTION_ARG.  */

static rtx
aarch64_function_arg (cumulative_args_t pcum_v, const function_arg_info &arg)
{
  CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
  gcc_assert (pcum->pcs_variant == ARM_PCS_AAPCS64
	      || pcum->pcs_variant == ARM_PCS_SIMD
	      || pcum->pcs_variant == ARM_PCS_SVE);

  if (arg.end_marker_p ())
    {
      rtx abi_cookie = aarch64_gen_callee_cookie (pcum->isa_mode,
						  pcum->pcs_variant,
						  pcum->indirect_return);
      rtx sme_mode_switch_args = aarch64_finish_sme_mode_switch_args (pcum);
      rtx shared_za_flags = gen_int_mode (pcum->shared_za_flags, SImode);
      rtx shared_zt0_flags = gen_int_mode (pcum->shared_zt0_flags, SImode);
      return gen_rtx_PARALLEL (VOIDmode, gen_rtvec (4, abi_cookie,
						    sme_mode_switch_args,
						    shared_za_flags,
						    shared_zt0_flags));
    }

  aarch64_layout_arg (pcum_v, arg);
  return pcum->aapcs_reg;
}
void
aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum,
			      const_tree fntype,
			      rtx libname ATTRIBUTE_UNUSED,
			      const_tree fndecl,
			      unsigned n_named ATTRIBUTE_UNUSED,
			      bool silent_p)
{
  pcum->aapcs_ncrn = 0;
  pcum->aapcs_nvrn = 0;
  pcum->aapcs_nprn = 0;
  pcum->aapcs_nextncrn = 0;
  pcum->aapcs_nextnvrn = 0;
  pcum->aapcs_nextnprn = 0;
  if (fntype)
    {
      pcum->pcs_variant = (arm_pcs) fntype_abi (fntype).id ();
      pcum->isa_mode = aarch64_fntype_isa_mode (fntype);
      pcum->indirect_return = lookup_attribute ("indirect_return",
						TYPE_ATTRIBUTES (fntype));
    }
  else
    {
      pcum->pcs_variant = ARM_PCS_AAPCS64;
      pcum->isa_mode = AARCH64_DEFAULT_ISA_MODE;
      pcum->indirect_return = false;
    }
  pcum->aapcs_reg = NULL_RTX;
  pcum->aapcs_arg_processed = false;
  pcum->aapcs_stack_words = 0;
  pcum->aapcs_stack_size = 0;
  pcum->silent_p = silent_p;
  pcum->shared_za_flags
    = (fntype ? aarch64_fntype_shared_flags (fntype, "za") : 0U);
  pcum->shared_zt0_flags
    = (fntype ? aarch64_fntype_shared_flags (fntype, "zt0") : 0U);
  pcum->num_sme_mode_switch_args = 0;

  if (!silent_p
      && !TARGET_FLOAT
      && fntype && fntype != error_mark_node)
    {
      const_tree type = TREE_TYPE (fntype);
      machine_mode mode ATTRIBUTE_UNUSED; /* To pass pointer as argument.  */
      int nregs ATTRIBUTE_UNUSED; /* Likewise.  */
      if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type,
						   &mode, &nregs, NULL, false))
	aarch64_err_no_fpadvsimd (TYPE_MODE (type));
    }

  if (!silent_p
      && !TARGET_SVE
      && pcum->pcs_variant == ARM_PCS_SVE)
    {
      /* We can't gracefully recover at this point, so make this a
	 fatal error.  */
      if (fndecl)
	fatal_error (input_location, "%qE requires the SVE ISA extension",
		     fndecl);
      else
	fatal_error (input_location, "calls to functions of type %qT require"
		     " the SVE ISA extension", fntype);
    }
}
static void
aarch64_function_arg_advance (cumulative_args_t pcum_v,
			      const function_arg_info &arg)
{
  CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
  if (pcum->pcs_variant == ARM_PCS_AAPCS64
      || pcum->pcs_variant == ARM_PCS_SIMD
      || pcum->pcs_variant == ARM_PCS_SVE)
    {
      aarch64_layout_arg (pcum_v, arg);
      gcc_assert ((pcum->aapcs_reg != NULL_RTX)
		  != (pcum->aapcs_stack_words != 0));
      if (pcum->aapcs_reg
	  && aarch64_call_switches_pstate_sm (pcum->isa_mode))
	aarch64_record_sme_mode_switch_args (pcum);

      pcum->aapcs_arg_processed = false;
      pcum->aapcs_ncrn = pcum->aapcs_nextncrn;
      pcum->aapcs_nvrn = pcum->aapcs_nextnvrn;
      pcum->aapcs_nprn = pcum->aapcs_nextnprn;
      pcum->aapcs_stack_size += pcum->aapcs_stack_words;
      pcum->aapcs_stack_words = 0;
      pcum->aapcs_reg = NULL_RTX;
    }
}
bool
aarch64_function_arg_regno_p (unsigned regno)
{
  return ((GP_REGNUM_P (regno) && regno < R0_REGNUM + NUM_ARG_REGS)
	  || (FP_REGNUM_P (regno) && regno < V0_REGNUM + NUM_FP_ARG_REGS)
	  || (PR_REGNUM_P (regno) && regno < P0_REGNUM + NUM_PR_ARG_REGS));
}
/* Implement FUNCTION_ARG_BOUNDARY.  Every parameter gets at least
   PARM_BOUNDARY bits of alignment, but will be given anything up
   to STACK_BOUNDARY bits if the type requires it.  This makes sure
   that both before and after the layout of each argument, the Next
   Stacked Argument Address (NSAA) will have a minimum alignment of
   8 bytes.  */

static unsigned int
aarch64_function_arg_boundary (machine_mode mode, const_tree type)
{
  unsigned int abi_break_gcc_9;
  unsigned int abi_break_gcc_13;
  unsigned int abi_break_gcc_14;
  unsigned int alignment = aarch64_function_arg_alignment (mode, type,
							    &abi_break_gcc_9,
							    &abi_break_gcc_13,
							    &abi_break_gcc_14);
  /* We rely on aarch64_layout_arg and aarch64_gimplify_va_arg_expr
     to emit warnings about ABI incompatibility.  */
  alignment = MIN (MAX (alignment, PARM_BOUNDARY), STACK_BOUNDARY);
  return alignment;
}
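
/* For example, a char argument still gets PARM_BOUNDARY (64 bits) of
   stack alignment here, a 16-byte-aligned type such as __int128 gets
   STACK_BOUNDARY (128 bits), and anything requesting more than that is
   clamped to STACK_BOUNDARY by the MIN above.  */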
/* Implement TARGET_GET_RAW_RESULT_MODE and TARGET_GET_RAW_ARG_MODE.  */

static fixed_size_mode
aarch64_get_reg_raw_mode (int regno)
{
  /* Don't use any non GP registers for __builtin_apply and
     __builtin_return if general registers only mode is requested.  */
  if (TARGET_GENERAL_REGS_ONLY && !GP_REGNUM_P (regno))
    return as_a <fixed_size_mode> (VOIDmode);
  if (TARGET_SVE && FP_REGNUM_P (regno))
    /* Don't use the SVE part of the register for __builtin_apply and
       __builtin_return.  The SVE registers aren't used by the normal PCS,
       so using them there would be a waste of time.  The PCS extensions
       for SVE types are fundamentally incompatible with the
       __builtin_return/__builtin_apply interface.  */
    return as_a <fixed_size_mode> (V16QImode);
  if (PR_REGNUM_P (regno))
    /* For SVE PR regs, indicate that they should be ignored for
       __builtin_apply/__builtin_return.  */
    return as_a <fixed_size_mode> (VOIDmode);
  return default_get_reg_raw_mode (regno);
}
/* Implement TARGET_FUNCTION_ARG_PADDING.

   Small aggregate types are placed in the lowest memory address.

   The related parameter passing rules are B.4, C.3, C.5 and C.14.  */

static pad_direction
aarch64_function_arg_padding (machine_mode mode, const_tree type)
{
  /* On little-endian targets, the least significant byte of every stack
     argument is passed at the lowest byte address of the stack slot.  */
  if (!BYTES_BIG_ENDIAN)
    return PAD_UPWARD;

  /* Otherwise, integral, floating-point and pointer types are padded downward:
     the least significant byte of a stack argument is passed at the highest
     byte address of the stack slot.  */
  if (type
      ? (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type)
	 || POINTER_TYPE_P (type))
      : (SCALAR_INT_MODE_P (mode) || SCALAR_FLOAT_MODE_P (mode)))
    return PAD_DOWNWARD;

  /* Everything else padded upward, i.e. data in first byte of stack slot.  */
  return PAD_UPWARD;
}
/* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).

   It specifies padding for the last (may also be the only)
   element of a block move between registers and memory.  If
   assuming the block is in the memory, padding upward means that
   the last element is padded after its highest significant byte,
   while in downward padding, the last element is padded at
   its least significant byte side.

   Small aggregates and small complex types are always padded
   upwards.

   We don't need to worry about homogeneous floating-point or
   short-vector aggregates; their move is not affected by the
   padding direction determined here.  Regardless of endianness,
   each element of such an aggregate is put in the least
   significant bits of a fp/simd register.

   Return !BYTES_BIG_ENDIAN if the least significant byte of the
   register has useful data, and return the opposite if the most
   significant byte does.  */

bool
aarch64_pad_reg_upward (machine_mode mode, const_tree type,
			bool first ATTRIBUTE_UNUSED)
{
  /* Aside from pure scalable types, small composite types are always
     padded upward.  */
  if (BYTES_BIG_ENDIAN && aarch64_composite_type_p (type, mode))
    {
      HOST_WIDE_INT size;
      if (type)
	size = int_size_in_bytes (type);
      else
	/* No frontends can create types with variable-sized modes, so we
	   shouldn't be asked to pass or return them.  */
	size = GET_MODE_SIZE (mode).to_constant ();
      if (size < 2 * UNITS_PER_WORD)
	{
	  pure_scalable_type_info pst_info;
	  if (pst_info.analyze_registers (type))
	    return false;
	  return true;
	}
    }

  /* Otherwise, use the default padding.  */
  return !BYTES_BIG_ENDIAN;
}
static scalar_int_mode
aarch64_libgcc_cmp_return_mode (void)
{
  return SImode;
}

#define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)

/* We use the 12-bit shifted immediate arithmetic instructions so values
   must be multiple of (1 << 12), i.e. 4096.  */
#define ARITH_FACTOR 4096

#if (PROBE_INTERVAL % ARITH_FACTOR) != 0
#error Cannot use simple address calculation for stack probing
#endif
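
/* With the default STACK_CHECK_PROBE_INTERVAL_EXP of 12, PROBE_INTERVAL
   is 4096 and therefore equal to ARITH_FACTOR, so every probe offset can
   be formed with a single 12-bit shifted immediate add or subtract.  */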
/* Emit code to probe a range of stack addresses from FIRST to FIRST+POLY_SIZE,
   inclusive.  These are offsets from the current stack pointer.  */

static void
aarch64_emit_probe_stack_range (HOST_WIDE_INT first, poly_int64 poly_size)
{
  HOST_WIDE_INT size;
  if (!poly_size.is_constant (&size))
    {
      sorry ("stack probes for SVE frames");
      return;
    }

  rtx reg1 = gen_rtx_REG (Pmode, PROBE_STACK_FIRST_REGNUM);

  /* See the same assertion on PROBE_INTERVAL above.  */
  gcc_assert ((first % ARITH_FACTOR) == 0);

  /* See if we have a constant small number of probes to generate.  If so,
     that's the easy case.  */
  if (size <= PROBE_INTERVAL)
    {
      const HOST_WIDE_INT base = ROUND_UP (size, ARITH_FACTOR);

      emit_set_insn (reg1,
		     plus_constant (Pmode, stack_pointer_rtx, -(first + base)));
      emit_stack_probe (plus_constant (Pmode, reg1, base - size));
    }

  /* The run-time loop is made up of 8 insns in the generic case while the
     compile-time loop is made up of 4+2*(n-2) insns for n # of intervals.  */
  else if (size <= 4 * PROBE_INTERVAL)
    {
      HOST_WIDE_INT i, rem;

      emit_set_insn (reg1,
		     plus_constant (Pmode, stack_pointer_rtx,
				    -(first + PROBE_INTERVAL)));
      emit_stack_probe (reg1);

      /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 2 until
	 it exceeds SIZE.  If only two probes are needed, this will not
	 generate any code.  Then probe at FIRST + SIZE.  */
      for (i = 2 * PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
	{
	  emit_set_insn (reg1,
			 plus_constant (Pmode, reg1, -PROBE_INTERVAL));
	  emit_stack_probe (reg1);
	}

      rem = size - (i - PROBE_INTERVAL);
      if (rem > 256)
	{
	  const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);

	  emit_set_insn (reg1, plus_constant (Pmode, reg1, -base));
	  emit_stack_probe (plus_constant (Pmode, reg1, base - rem));
	}
      else
	emit_stack_probe (plus_constant (Pmode, reg1, -rem));
    }

  /* Otherwise, do the same as above, but in a loop.  Note that we must be
     extra careful with variables wrapping around because we might be at
     the very top (or the very bottom) of the address space and we have
     to be able to handle this case properly; in particular, we use an
     equality test for the loop condition.  */
  else
    {
      rtx reg2 = gen_rtx_REG (Pmode, PROBE_STACK_SECOND_REGNUM);

      /* Step 1: round SIZE to the previous multiple of the interval.  */

      HOST_WIDE_INT rounded_size = size & -PROBE_INTERVAL;


      /* Step 2: compute initial and final value of the loop counter.  */

      /* TEST_ADDR = SP + FIRST.  */
      emit_set_insn (reg1,
		     plus_constant (Pmode, stack_pointer_rtx, -first));

      /* LAST_ADDR = SP + FIRST + ROUNDED_SIZE.  */
      HOST_WIDE_INT adjustment = - (first + rounded_size);
      if (! aarch64_uimm12_shift (adjustment))
	{
	  aarch64_internal_mov_immediate (reg2, GEN_INT (adjustment),
					  true, Pmode);
	  emit_set_insn (reg2, gen_rtx_PLUS (Pmode, stack_pointer_rtx, reg2));
	}
      else
	emit_set_insn (reg2,
		       plus_constant (Pmode, stack_pointer_rtx, adjustment));

      /* Step 3: the loop

	 do
	   {
	     TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
	     probe at TEST_ADDR
	   }
	 while (TEST_ADDR != LAST_ADDR)

	 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
	 until it is equal to ROUNDED_SIZE.  */

      emit_insn (gen_probe_stack_range (reg1, reg1, reg2));


      /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
	 that SIZE is equal to ROUNDED_SIZE.  */

      if (size != rounded_size)
	{
	  HOST_WIDE_INT rem = size - rounded_size;

	  if (rem > 256)
	    {
	      const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);

	      emit_set_insn (reg2, plus_constant (Pmode, reg2, -base));
	      emit_stack_probe (plus_constant (Pmode, reg2, base - rem));
	    }
	  else
	    emit_stack_probe (plus_constant (Pmode, reg2, -rem));
	}
    }

  /* Make sure nothing is scheduled before we are done.  */
  emit_insn (gen_blockage ());
}
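/* Illustrative example (added for exposition, not part of the original code):
   with PROBE_INTERVAL == 4096, FIRST == 0 and SIZE == 10000, the second
   branch above emits roughly

	sub	x9, sp, #4096		// probe at FIRST + PROBE_INTERVAL
	str	xzr, [x9]
	sub	x9, x9, #4096		// probe at FIRST + 2 * PROBE_INTERVAL
	str	xzr, [x9]
	sub	x9, x9, #4096		// drop by the ARITH_FACTOR-rounded residual
	str	xzr, [x9, #2288]	// final probe lands at FIRST + SIZE

   The scratch register depends on PROBE_STACK_FIRST_REGNUM, so treat this
   only as a sketch of the shape of the sequence.  */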
/* Probe a range of stack addresses from REG1 to REG2 inclusive.  These are
   absolute addresses.  */

const char *
aarch64_output_probe_stack_range (rtx reg1, rtx reg2)
{
  static int labelno = 0;
  char loop_lab[32];
  rtx xops[2];

  ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);

  /* Loop.  */
  ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);

  HOST_WIDE_INT stack_clash_probe_interval
    = 1 << param_stack_clash_protection_guard_size;

  /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL.  */
  xops[0] = reg1;
  HOST_WIDE_INT interval;
  if (flag_stack_clash_protection)
    interval = stack_clash_probe_interval;
  else
    interval = PROBE_INTERVAL;

  gcc_assert (aarch64_uimm12_shift (interval));
  xops[1] = GEN_INT (interval);

  output_asm_insn ("sub\t%0, %0, %1", xops);

  /* If doing stack clash protection then we probe up by the ABI specified
     amount.  We do this because we're dropping full pages at a time in the
     loop.  But if we're doing non-stack clash probing, probe at SP 0.  */
  if (flag_stack_clash_protection)
    xops[1] = GEN_INT (STACK_CLASH_CALLER_GUARD);
  else
    xops[1] = CONST0_RTX (GET_MODE (xops[1]));

  /* Probe at TEST_ADDR.  If we're inside the loop it is always safe to probe
     by this amount for each iteration.  */
  output_asm_insn ("str\txzr, [%0, %1]", xops);

  /* Test if TEST_ADDR == LAST_ADDR.  */
  xops[1] = reg2;
  output_asm_insn ("cmp\t%0, %1", xops);

  /* Branch.  */
  fputs ("\tb.ne\t", asm_out_file);
  assemble_name_raw (asm_out_file, loop_lab);
  fputc ('\n', asm_out_file);

  return "";
}
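/* For illustration only (added commentary, not original code): without stack
   clash protection the loop printed above looks roughly like

     .LPSRL0:
	sub	x9, x9, #4096
	str	xzr, [x9, 0]
	cmp	x9, x10
	b.ne	.LPSRL0

   i.e. TEST_ADDR is dropped one interval at a time and probed until it
   reaches LAST_ADDR; with stack clash protection the probe instead uses the
   STACK_CLASH_CALLER_GUARD offset.  Register numbers are arbitrary here.  */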
/* Emit the probe loop for doing stack clash probes and stack adjustments for
   SVE.  This emits probes from BASE to BASE - ADJUSTMENT based on a guard size
   of GUARD_SIZE.  When a probe is emitted it is done at most
   MIN_PROBE_THRESHOLD bytes from the current BASE at an interval of
   at most MIN_PROBE_THRESHOLD.  By the end of this function
   BASE = BASE - ADJUSTMENT.  */

const char *
aarch64_output_probe_sve_stack_clash (rtx base, rtx adjustment,
				      rtx min_probe_threshold, rtx guard_size)
{
  /* This function is not allowed to use any instruction generation function
     like gen_ and friends.  If you do you'll likely ICE during CFG validation,
     so instead emit the code you want using output_asm_insn.  */
  gcc_assert (flag_stack_clash_protection);
  gcc_assert (CONST_INT_P (min_probe_threshold) && CONST_INT_P (guard_size));
  gcc_assert (INTVAL (guard_size) > INTVAL (min_probe_threshold));

  /* The minimum required allocation before the residual requires probing.  */
  HOST_WIDE_INT residual_probe_guard = INTVAL (min_probe_threshold);

  /* Clamp the value down to the nearest value that can be used with a cmp.  */
  residual_probe_guard = aarch64_clamp_to_uimm12_shift (residual_probe_guard);
  rtx probe_offset_value_rtx = gen_int_mode (residual_probe_guard, Pmode);

  gcc_assert (INTVAL (min_probe_threshold) >= residual_probe_guard);
  gcc_assert (aarch64_uimm12_shift (residual_probe_guard));

  static int labelno = 0;
  char loop_start_lab[32];
  char loop_end_lab[32];
  rtx xops[2];

  ASM_GENERATE_INTERNAL_LABEL (loop_start_lab, "SVLPSPL", labelno);
  ASM_GENERATE_INTERNAL_LABEL (loop_end_lab, "SVLPEND", labelno++);

  /* Emit loop start label.  */
  ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_start_lab);

  /* ADJUSTMENT < RESIDUAL_PROBE_GUARD.  */
  xops[0] = adjustment;
  xops[1] = probe_offset_value_rtx;
  output_asm_insn ("cmp\t%0, %1", xops);

  /* Branch to end if not enough adjustment to probe.  */
  fputs ("\tb.lt\t", asm_out_file);
  assemble_name_raw (asm_out_file, loop_end_lab);
  fputc ('\n', asm_out_file);

  /* BASE = BASE - RESIDUAL_PROBE_GUARD.  */
  xops[0] = base;
  xops[1] = probe_offset_value_rtx;
  output_asm_insn ("sub\t%0, %0, %1", xops);

  /* Probe at BASE.  */
  xops[1] = const0_rtx;
  output_asm_insn ("str\txzr, [%0, %1]", xops);

  /* ADJUSTMENT = ADJUSTMENT - RESIDUAL_PROBE_GUARD.  */
  xops[0] = adjustment;
  xops[1] = probe_offset_value_rtx;
  output_asm_insn ("sub\t%0, %0, %1", xops);

  /* Branch to start if still more bytes to allocate.  */
  fputs ("\tb\t", asm_out_file);
  assemble_name_raw (asm_out_file, loop_start_lab);
  fputc ('\n', asm_out_file);

  /* No probe leave.  */
  ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_end_lab);

  /* BASE = BASE - ADJUSTMENT.  */
  xops[0] = base;
  xops[1] = adjustment;
  output_asm_insn ("sub\t%0, %0, %1", xops);
  return "";
}
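/* For illustration (added commentary, not part of the original source): with
   a 64KiB guard and the default 1KiB caller guard, the residual probe guard
   clamps to 61440 (the largest cmp-compatible value below 64512), so the
   emitted loop is roughly

     .SVLPSPL0:
	cmp	<adjustment>, #61440
	b.lt	.SVLPEND0
	sub	<base>, <base>, #61440
	str	xzr, [<base>, 0]
	sub	<adjustment>, <adjustment>, #61440
	b	.SVLPSPL0
     .SVLPEND0:
	sub	<base>, <base>, <adjustment>

   The actual registers are whatever operands the caller passed in.  */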
/* Determine whether a frame chain needs to be generated.  */
static bool
aarch64_needs_frame_chain (void)
{
  if (frame_pointer_needed)
    return true;

  /* A leaf function cannot have calls or write LR.  */
  bool is_leaf = crtl->is_leaf && !df_regs_ever_live_p (LR_REGNUM);

  /* Don't use a frame chain in leaf functions if leaf frame pointers
     are disabled.  */
  if (flag_omit_leaf_frame_pointer && is_leaf)
    return false;

  return aarch64_use_frame_pointer;
}
/* Return true if the current function should save registers above
   the locals area, rather than below it.  */

static bool
aarch64_save_regs_above_locals_p ()
{
  /* When using stack smash protection, make sure that the canary slot
     comes between the locals and the saved registers.  Otherwise,
     it would be possible for a carefully sized smash attack to change
     the saved registers (particularly LR and FP) without reaching the
     canary.  */
  return crtl->stack_protect_guard;
}
/* Return true if the current function needs to record the incoming
   value of PSTATE.SM.  */
static bool
aarch64_need_old_pstate_sm ()
{
  /* Exit early if the incoming value of PSTATE.SM is known at
     compile time.  */
  if (aarch64_cfun_incoming_pstate_sm () != 0)
    return false;

  if (aarch64_cfun_enables_pstate_sm ())
    return true;

  /* Non-local goto receivers are entered with PSTATE.SM equal to 0,
     but the function needs to return with PSTATE.SM unchanged.  */
  if (nonlocal_goto_handler_labels)
    return true;

  /* Likewise for exception handlers.  */
  eh_landing_pad lp;
  for (unsigned int i = 1; vec_safe_iterate (cfun->eh->lp_array, i, &lp); ++i)
    if (lp && lp->post_landing_pad)
      return true;

  /* Non-local gotos need to set PSTATE.SM to zero.  It's possible to call
     streaming-compatible functions without SME being available, so PSTATE.SM
     should only be changed if it is currently set to one.  */
  if (crtl->has_nonlocal_goto)
    return true;

  if (cfun->machine->call_switches_pstate_sm)
    for (auto insn = get_insns (); insn; insn = NEXT_INSN (insn))
      if (auto *call = dyn_cast<rtx_call_insn *> (insn))
	if (!SIBLING_CALL_P (call))
	  {
	    /* Return true if there is a call to a non-streaming-compatible
	       function.  */
	    auto callee_isa_mode = aarch64_insn_callee_isa_mode (call);
	    if (aarch64_call_switches_pstate_sm (callee_isa_mode))
	      return true;
	  }

  return false;
}
/* Mark the registers that need to be saved by the callee and calculate
   the size of the callee-saved registers area and frame record (both FP
   and LR may be omitted).  */
static void
aarch64_layout_frame (void)
{
  unsigned regno, last_fp_reg = INVALID_REGNUM;
  machine_mode vector_save_mode = aarch64_reg_save_mode (V8_REGNUM);
  poly_int64 vector_save_size = GET_MODE_SIZE (vector_save_mode);
  bool frame_related_fp_reg_p = false;
  aarch64_frame &frame = cfun->machine->frame;
  poly_int64 top_of_locals = -1;
  bool enables_pstate_sm = aarch64_cfun_enables_pstate_sm ();

  vec_safe_truncate (frame.saved_gprs, 0);
  vec_safe_truncate (frame.saved_fprs, 0);
  vec_safe_truncate (frame.saved_prs, 0);

  frame.emit_frame_chain = aarch64_needs_frame_chain ();

  /* Adjust the outgoing arguments size if required.  Keep it in sync with what
     the mid-end is doing.  */
  crtl->outgoing_args_size = STACK_DYNAMIC_OFFSET (cfun);

#define SLOT_NOT_REQUIRED (-2)
#define SLOT_REQUIRED     (-1)

  frame.wb_push_candidate1 = INVALID_REGNUM;
  frame.wb_push_candidate2 = INVALID_REGNUM;
  frame.spare_pred_reg = INVALID_REGNUM;

  /* First mark all the registers that really need to be saved...  */
  for (regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
    frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
  frame.old_svcr_offset = SLOT_NOT_REQUIRED;

  /* ... that includes the eh data registers (if needed)...  */
  if (crtl->calls_eh_return)
    for (regno = 0; EH_RETURN_DATA_REGNO (regno) != INVALID_REGNUM; regno++)
      frame.reg_offset[EH_RETURN_DATA_REGNO (regno)] = SLOT_REQUIRED;

  /* ... and any callee saved register that dataflow says is live.  */
  for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
    if (df_regs_ever_live_p (regno)
	&& !fixed_regs[regno]
	&& (regno == R30_REGNUM
	    || !crtl->abi->clobbers_full_reg_p (regno)))
      frame.reg_offset[regno] = SLOT_REQUIRED;

  for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
    if ((enables_pstate_sm || df_regs_ever_live_p (regno))
	&& !fixed_regs[regno]
	&& !crtl->abi->clobbers_full_reg_p (regno))
      {
	frame.reg_offset[regno] = SLOT_REQUIRED;
	last_fp_reg = regno;
	if (aarch64_emit_cfi_for_reg_p (regno))
	  frame_related_fp_reg_p = true;
      }

  /* Big-endian SVE frames need a spare predicate register in order
     to save Z8-Z15.  Decide which register they should use.  Prefer
     an unused argument register if possible, so that we don't force P4
     to be saved unnecessarily.  */
  if (frame_related_fp_reg_p
      && crtl->abi->id () == ARM_PCS_SVE
      && BYTES_BIG_ENDIAN)
    {
      bitmap live1 = df_get_live_out (ENTRY_BLOCK_PTR_FOR_FN (cfun));
      bitmap live2 = df_get_live_in (EXIT_BLOCK_PTR_FOR_FN (cfun));
      for (regno = P0_REGNUM; regno <= P7_REGNUM; regno++)
	if (!bitmap_bit_p (live1, regno) && !bitmap_bit_p (live2, regno))
	  break;
      gcc_assert (regno <= P7_REGNUM);
      frame.spare_pred_reg = regno;
      df_set_regs_ever_live (regno, true);
    }

  for (regno = P0_REGNUM; regno <= P15_REGNUM; regno++)
    if ((enables_pstate_sm || df_regs_ever_live_p (regno))
	&& !fixed_regs[regno]
	&& !crtl->abi->clobbers_full_reg_p (regno))
      frame.reg_offset[regno] = SLOT_REQUIRED;

  bool regs_at_top_p = aarch64_save_regs_above_locals_p ();

  poly_int64 offset = crtl->outgoing_args_size;
  gcc_assert (multiple_p (offset, STACK_BOUNDARY / BITS_PER_UNIT));
  if (regs_at_top_p)
    {
      offset += get_frame_size ();
      offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT);
      top_of_locals = offset;
    }
  frame.bytes_below_saved_regs = offset;
  frame.sve_save_and_probe = INVALID_REGNUM;

  /* Now assign stack slots for the registers.  Start with the predicate
     registers, since predicate LDR and STR have a relatively small
     offset range.  These saves happen below the hard frame pointer.  */
  for (regno = P0_REGNUM; regno <= P15_REGNUM; regno++)
    if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED))
      {
	vec_safe_push (frame.saved_prs, regno);
	if (frame.sve_save_and_probe == INVALID_REGNUM)
	  frame.sve_save_and_probe = regno;
	frame.reg_offset[regno] = offset;
	offset += BYTES_PER_SVE_PRED;
      }

  poly_int64 saved_prs_size = offset - frame.bytes_below_saved_regs;
  if (maybe_ne (saved_prs_size, 0))
    {
      /* If we have any vector registers to save above the predicate registers,
	 the offset of the vector register save slots need to be a multiple
	 of the vector size.  This lets us use the immediate forms of LDR/STR
	 (or LD1/ST1 for big-endian).

	 A vector register is 8 times the size of a predicate register,
	 and we need to save a maximum of 12 predicate registers, so the
	 first vector register will be at either #1, MUL VL or #2, MUL VL.

	 If we don't have any vector registers to save, and we know how
	 big the predicate save area is, we can just round it up to the
	 next 16-byte boundary.  */
      if (last_fp_reg == INVALID_REGNUM && offset.is_constant ())
	offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT);
      else
	{
	  if (known_le (saved_prs_size, vector_save_size))
	    offset = frame.bytes_below_saved_regs + vector_save_size;
	  else if (known_le (saved_prs_size, vector_save_size * 2))
	    offset = frame.bytes_below_saved_regs + vector_save_size * 2;
	  else
	    gcc_unreachable ();
	}
    }

  /* If we need to save any SVE vector registers, add them next.  */
  if (last_fp_reg != INVALID_REGNUM && crtl->abi->id () == ARM_PCS_SVE)
    for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
      if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED))
	{
	  vec_safe_push (frame.saved_fprs, regno);
	  if (frame.sve_save_and_probe == INVALID_REGNUM)
	    frame.sve_save_and_probe = regno;
	  frame.reg_offset[regno] = offset;
	  offset += vector_save_size;
	}

  /* OFFSET is now the offset of the hard frame pointer from the bottom
     of the callee save area.  */
  auto below_hard_fp_saved_regs_size = offset - frame.bytes_below_saved_regs;
  bool saves_below_hard_fp_p = maybe_ne (below_hard_fp_saved_regs_size, 0);
  gcc_assert (!saves_below_hard_fp_p
	      || (frame.sve_save_and_probe != INVALID_REGNUM
		  && known_eq (frame.reg_offset[frame.sve_save_and_probe],
			       frame.bytes_below_saved_regs)));

  frame.bytes_below_hard_fp = offset;
  frame.hard_fp_save_and_probe = INVALID_REGNUM;

  auto allocate_gpr_slot = [&](unsigned int regno)
    {
      vec_safe_push (frame.saved_gprs, regno);
      frame.reg_offset[regno] = offset;
      offset += UNITS_PER_WORD;
    };

  if (frame.emit_frame_chain)
    {
      /* FP and LR are placed in the linkage record.  */
      allocate_gpr_slot (R29_REGNUM);
      allocate_gpr_slot (R30_REGNUM);
    }
  else if ((flag_stack_clash_protection || !frame.is_scs_enabled)
	   && known_eq (frame.reg_offset[R30_REGNUM], SLOT_REQUIRED))
    /* Put the LR save slot first, since it makes a good choice of probe
       for stack clash purposes.  The idea is that the link register usually
       has to be saved before a call anyway, and so we lose little by
       stopping it from being individually shrink-wrapped.  */
    allocate_gpr_slot (R30_REGNUM);

  for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
    if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED))
      allocate_gpr_slot (regno);

  if (aarch64_need_old_pstate_sm ())
    {
      frame.old_svcr_offset = offset;
      offset += UNITS_PER_WORD;
    }

  /* If the current function changes the SVE vector length, ensure that the
     old value of the DWARF VG register is saved and available in the CFI,
     so that outer frames with VL-sized offsets can be processed correctly.  */
  if (cfun->machine->call_switches_pstate_sm
      || aarch64_cfun_enables_pstate_sm ())
    {
      frame.reg_offset[VG_REGNUM] = offset;
      offset += UNITS_PER_WORD;
    }

  poly_int64 max_int_offset = offset;
  offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT);
  bool has_align_gap = maybe_ne (offset, max_int_offset);

  for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
    if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED))
      {
	vec_safe_push (frame.saved_fprs, regno);
	/* If there is an alignment gap between integer and fp callee-saves,
	   allocate the last fp register to it if possible.  */
	if (regno == last_fp_reg
	    && has_align_gap
	    && known_eq (vector_save_size, 8)
	    && multiple_p (offset, 16))
	  {
	    frame.reg_offset[regno] = max_int_offset;
	    break;
	  }

	frame.reg_offset[regno] = offset;
	offset += vector_save_size;
      }

  offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT);
  auto saved_regs_size = offset - frame.bytes_below_saved_regs;

  array_slice<unsigned int> push_regs = (!vec_safe_is_empty (frame.saved_gprs)
					 ? frame.saved_gprs
					 : frame.saved_fprs);
  if (!push_regs.empty ()
      && known_eq (frame.reg_offset[push_regs[0]], frame.bytes_below_hard_fp))
    {
      frame.hard_fp_save_and_probe = push_regs[0];
      frame.wb_push_candidate1 = push_regs[0];
      if (push_regs.size () > 1)
	frame.wb_push_candidate2 = push_regs[1];
    }

  /* With stack-clash, a register must be saved in non-leaf functions.
     The saving of the bottommost register counts as an implicit probe,
     which allows us to maintain the invariant described in the comment
     at expand_prologue.  */
  gcc_assert (crtl->is_leaf || maybe_ne (saved_regs_size, 0));

  if (!regs_at_top_p)
    {
      offset += get_frame_size ();
      offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT);
      top_of_locals = offset;
    }
  offset += frame.saved_varargs_size;
  gcc_assert (multiple_p (offset, STACK_BOUNDARY / BITS_PER_UNIT));
  frame.frame_size = offset;

  frame.bytes_above_hard_fp = frame.frame_size - frame.bytes_below_hard_fp;
  gcc_assert (known_ge (top_of_locals, 0));
  frame.bytes_above_locals = frame.frame_size - top_of_locals;

  frame.initial_adjust = 0;
  frame.final_adjust = 0;
  frame.callee_adjust = 0;
  frame.sve_callee_adjust = 0;

  frame.wb_pop_candidate1 = frame.wb_push_candidate1;
  frame.wb_pop_candidate2 = frame.wb_push_candidate2;

  /* Shadow call stack only deals with functions where the LR is pushed
     onto the stack and without specifying the "no_sanitize" attribute
     with the argument "shadow-call-stack".  */
  frame.is_scs_enabled
    = (!crtl->calls_eh_return
       && sanitize_flags_p (SANITIZE_SHADOW_CALL_STACK)
       && known_ge (frame.reg_offset[LR_REGNUM], 0));

  /* When shadow call stack is enabled, the scs_pop in the epilogue will
     restore x30, and we don't need to pop x30 again in the traditional
     way.  Pop candidates record the registers that need to be popped
     eventually.  */
  if (frame.is_scs_enabled)
    {
      if (frame.wb_pop_candidate2 == R30_REGNUM)
	frame.wb_pop_candidate2 = INVALID_REGNUM;
      else if (frame.wb_pop_candidate1 == R30_REGNUM)
	frame.wb_pop_candidate1 = INVALID_REGNUM;
    }

  /* If candidate2 is INVALID_REGNUM, we need to adjust max_push_offset to
     256 to ensure that the offset meets the requirements of emit_move_insn.
     Similarly, if candidate1 is INVALID_REGNUM, we need to set
     max_push_offset to 0, because no registers are popped at this time,
     so callee_adjust cannot be adjusted.  */
  HOST_WIDE_INT max_push_offset = 0;
  if (frame.wb_pop_candidate1 != INVALID_REGNUM)
    {
      if (frame.wb_pop_candidate2 != INVALID_REGNUM)
	max_push_offset = 512;
      else
	max_push_offset = 256;
    }

  HOST_WIDE_INT const_size, const_below_saved_regs, const_above_fp;
  HOST_WIDE_INT const_saved_regs_size;
  if (known_eq (saved_regs_size, 0))
    frame.initial_adjust = frame.frame_size;
  else if (frame.frame_size.is_constant (&const_size)
	   && const_size < max_push_offset
	   && known_eq (frame.bytes_above_hard_fp, const_size))
    {
      /* Simple, small frame with no data below the saved registers.

	 stp reg1, reg2, [sp, -frame_size]!
	 stp reg3, reg4, [sp, 16]  */
      frame.callee_adjust = const_size;
    }
  else if (frame.bytes_below_saved_regs.is_constant (&const_below_saved_regs)
	   && saved_regs_size.is_constant (&const_saved_regs_size)
	   && const_below_saved_regs + const_saved_regs_size < 512
	   /* We could handle this case even with data below the saved
	      registers, provided that that data left us with valid offsets
	      for all predicate and vector save slots.  It's such a rare
	      case that it hardly seems worth the effort though.  */
	   && (!saves_below_hard_fp_p || const_below_saved_regs == 0)
	   && !(cfun->calls_alloca
		&& frame.bytes_above_hard_fp.is_constant (&const_above_fp)
		&& const_above_fp < max_push_offset))
    {
      /* Frame with small area below the saved registers:

	 sub sp, sp, frame_size
	 stp reg1, reg2, [sp, bytes_below_saved_regs]
	 stp reg3, reg4, [sp, bytes_below_saved_regs + 16]  */
      frame.initial_adjust = frame.frame_size;
    }
  else if (saves_below_hard_fp_p
	   && known_eq (saved_regs_size, below_hard_fp_saved_regs_size))
    {
      /* Frame in which all saves are SVE saves:

	 sub sp, sp, frame_size - bytes_below_saved_regs
	 save SVE registers relative to SP
	 sub sp, sp, bytes_below_saved_regs  */
      frame.initial_adjust = frame.frame_size - frame.bytes_below_saved_regs;
      frame.final_adjust = frame.bytes_below_saved_regs;
    }
  else if (frame.wb_push_candidate1 != INVALID_REGNUM
	   && frame.bytes_above_hard_fp.is_constant (&const_above_fp)
	   && const_above_fp < max_push_offset)
    {
      /* Frame with large area below the saved registers, or with SVE saves,
	 but with a small area above:

	 stp reg1, reg2, [sp, -hard_fp_offset]!
	 stp reg3, reg4, [sp, 16]
	 [sub sp, sp, below_hard_fp_saved_regs_size]
	 [save SVE registers relative to SP]
	 sub sp, sp, bytes_below_saved_regs  */
      frame.callee_adjust = const_above_fp;
      frame.sve_callee_adjust = below_hard_fp_saved_regs_size;
      frame.final_adjust = frame.bytes_below_saved_regs;
    }
  else
    {
      /* General case:

	 sub sp, sp, hard_fp_offset
	 stp x29, x30, [sp, 0]
	 add x29, sp, 0
	 stp reg3, reg4, [sp, 16]
	 [sub sp, sp, below_hard_fp_saved_regs_size]
	 [save SVE registers relative to SP]
	 sub sp, sp, bytes_below_saved_regs  */
      frame.initial_adjust = frame.bytes_above_hard_fp;
      frame.sve_callee_adjust = below_hard_fp_saved_regs_size;
      frame.final_adjust = frame.bytes_below_saved_regs;
    }

  /* The frame is allocated in pieces, with each non-final piece
     including a register save at offset 0 that acts as a probe for
     the following piece.  In addition, the save of the bottommost register
     acts as a probe for callees and allocas.  Roll back any probes that
     aren't needed.

     A probe isn't needed if it is associated with the final allocation
     (including callees and allocas) that happens before the epilogue is
     executed.  */
  if (crtl->is_leaf
      && !cfun->calls_alloca
      && known_eq (frame.final_adjust, 0))
    {
      if (maybe_ne (frame.sve_callee_adjust, 0))
	frame.sve_save_and_probe = INVALID_REGNUM;
      else
	frame.hard_fp_save_and_probe = INVALID_REGNUM;
    }

  /* Make sure the individual adjustments add up to the full frame size.  */
  gcc_assert (known_eq (frame.initial_adjust
			+ frame.callee_adjust
			+ frame.sve_callee_adjust
			+ frame.final_adjust, frame.frame_size));

  if (frame.callee_adjust == 0)
    {
      /* We've decided not to do a "real" push and pop.  However,
	 setting up the frame chain is treated as being essentially
	 a multi-instruction push.  */
      frame.wb_pop_candidate1 = frame.wb_pop_candidate2 = INVALID_REGNUM;
      if (!frame.emit_frame_chain)
	frame.wb_push_candidate1 = frame.wb_push_candidate2 = INVALID_REGNUM;
    }

  frame.laid_out = true;
}
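/* Worked example (explanatory comment added here, not from the original
   source): consider a non-leaf function that needs a frame chain, saves only
   x29 and x30, has 16 bytes of locals, no outgoing arguments and no SVE
   state.  The code above then computes bytes_below_saved_regs == 0,
   saved_regs_size == 16, frame_size == 32 and bytes_above_hard_fp == 32,
   which is a constant below max_push_offset, so the first case of the
   if-chain applies and callee_adjust == 32:

	stp	x29, x30, [sp, -32]!
	...locals live at [sp, 16]..[sp, 31]...

   with initial_adjust, sve_callee_adjust and final_adjust all zero.  */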
/* Return true if the register REGNO is saved on entry to
   the current function.  */

static bool
aarch64_register_saved_on_entry (int regno)
{
  return known_ge (cfun->machine->frame.reg_offset[regno], 0);
}
/* Push the register number REGNO of mode MODE to the stack with write-back
   adjusting the stack by ADJUSTMENT.  */

static void
aarch64_pushwb_single_reg (machine_mode mode, unsigned regno,
			   HOST_WIDE_INT adjustment)
{
  rtx base_rtx = stack_pointer_rtx;
  rtx insn, reg, mem;

  reg = gen_rtx_REG (mode, regno);
  mem = gen_rtx_PRE_MODIFY (Pmode, base_rtx,
			    plus_constant (Pmode, base_rtx, -adjustment));
  mem = gen_frame_mem (mode, mem);

  insn = emit_move_insn (mem, reg);
  RTX_FRAME_RELATED_P (insn) = 1;
}
/* Generate and return an instruction to store the pair of registers
   REG and REG2 of mode MODE to location BASE with write-back adjusting
   the stack location BASE by ADJUSTMENT.  */

static rtx
aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
			  HOST_WIDE_INT adjustment)
{
  rtx new_base = plus_constant (Pmode, base, -adjustment);
  rtx mem = gen_frame_mem (mode, new_base);
  rtx mem2 = adjust_address_nv (mem, mode, GET_MODE_SIZE (mode));

  return gen_rtx_PARALLEL (VOIDmode,
			   gen_rtvec (3,
				      gen_rtx_SET (base, new_base),
				      gen_rtx_SET (mem, reg),
				      gen_rtx_SET (mem2, reg2)));
}
/* Push registers numbered REGNO1 and REGNO2 to the stack, adjusting the
   stack pointer by ADJUSTMENT.  */

static void
aarch64_push_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment)
{
  rtx_insn *insn;
  machine_mode mode = aarch64_reg_save_mode (regno1);

  if (regno2 == INVALID_REGNUM)
    return aarch64_pushwb_single_reg (mode, regno1, adjustment);

  rtx reg1 = gen_rtx_REG (mode, regno1);
  rtx reg2 = gen_rtx_REG (mode, regno2);

  insn = emit_insn (aarch64_gen_storewb_pair (mode, stack_pointer_rtx, reg1,
					      reg2, adjustment));
  RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1;
  RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
  RTX_FRAME_RELATED_P (insn) = 1;
}
/* Load the pair of register REG, REG2 of mode MODE from stack location BASE,
   adjusting it by ADJUSTMENT afterwards.  */

static rtx
aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
			 HOST_WIDE_INT adjustment)
{
  rtx mem = gen_frame_mem (mode, base);
  rtx mem2 = adjust_address_nv (mem, mode, GET_MODE_SIZE (mode));
  rtx new_base = plus_constant (Pmode, base, adjustment);

  return gen_rtx_PARALLEL (VOIDmode,
			   gen_rtvec (3,
				      gen_rtx_SET (base, new_base),
				      gen_rtx_SET (reg, mem),
				      gen_rtx_SET (reg2, mem2)));
}
/* Pop the two registers numbered REGNO1, REGNO2 from the stack, adjusting it
   afterwards by ADJUSTMENT and writing the appropriate REG_CFA_RESTORE notes
   into CFI_OPS.  */

static void
aarch64_pop_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment,
		  rtx *cfi_ops)
{
  machine_mode mode = aarch64_reg_save_mode (regno1);
  rtx reg1 = gen_rtx_REG (mode, regno1);

  *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg1, *cfi_ops);

  if (regno2 == INVALID_REGNUM)
    {
      rtx mem = plus_constant (Pmode, stack_pointer_rtx, adjustment);
      mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem);
      emit_move_insn (reg1, gen_frame_mem (mode, mem));
    }
  else
    {
      rtx reg2 = gen_rtx_REG (mode, regno2);
      *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
      emit_insn (aarch64_gen_loadwb_pair (mode, stack_pointer_rtx, reg1,
					  reg2, adjustment));
    }
}
/* Given an ldp/stp register operand mode MODE, return a suitable mode to use
   for a mem rtx representing the entire pair.  */

static machine_mode
aarch64_pair_mode_for_mode (machine_mode mode)
{
  if (known_eq (GET_MODE_SIZE (mode), 4))
    return V2x4QImode;
  else if (known_eq (GET_MODE_SIZE (mode), 8))
    return V2x8QImode;
  else if (known_eq (GET_MODE_SIZE (mode), 16))
    return V2x16QImode;
  else
    gcc_unreachable ();
}

/* Given a base mem MEM with mode and address suitable for a single ldp/stp
   operand, return an rtx like MEM which instead represents the entire pair.  */

static rtx
aarch64_pair_mem_from_base (rtx mem)
{
  auto pair_mode = aarch64_pair_mode_for_mode (GET_MODE (mem));
  mem = adjust_bitfield_address_nv (mem, pair_mode, 0);
  gcc_assert (aarch64_mem_pair_lanes_operand (mem, pair_mode));
  return mem;
}
/* Generate and return a store pair instruction to store REG1 and REG2
   into memory starting at BASE_MEM.  All three rtxes should have modes of the
   same size.  */

rtx
aarch64_gen_store_pair (rtx base_mem, rtx reg1, rtx reg2)
{
  rtx pair_mem = aarch64_pair_mem_from_base (base_mem);

  return gen_rtx_SET (pair_mem,
		      gen_rtx_UNSPEC (GET_MODE (pair_mem),
				      gen_rtvec (2, reg1, reg2),
				      UNSPEC_STP));
}
/* Generate and return a load pair instruction to load a pair of
   registers starting at BASE_MEM into REG1 and REG2.  If CODE is
   UNKNOWN, all three rtxes should have modes of the same size.
   Otherwise, CODE is {SIGN,ZERO}_EXTEND, base_mem should be in SImode,
   and REG{1,2} should be in DImode.  */

rtx
aarch64_gen_load_pair (rtx reg1, rtx reg2, rtx base_mem, enum rtx_code code)
{
  rtx pair_mem = aarch64_pair_mem_from_base (base_mem);

  const bool any_extend_p = (code == ZERO_EXTEND || code == SIGN_EXTEND);
  if (any_extend_p)
    gcc_checking_assert (GET_MODE (base_mem) == SImode
			 && GET_MODE (reg1) == DImode
			 && GET_MODE (reg2) == DImode);
  else
    gcc_assert (code == UNKNOWN);

  rtx unspecs[2] = {
    gen_rtx_UNSPEC (any_extend_p ? SImode : GET_MODE (reg1),
		    gen_rtvec (1, pair_mem),
		    UNSPEC_LDP_FST),
    gen_rtx_UNSPEC (any_extend_p ? SImode : GET_MODE (reg2),
		    gen_rtvec (1, copy_rtx (pair_mem)),
		    UNSPEC_LDP_SND)
  };

  if (any_extend_p)
    for (int i = 0; i < 2; i++)
      unspecs[i] = gen_rtx_fmt_e (code, DImode, unspecs[i]);

  return gen_rtx_PARALLEL (VOIDmode,
			   gen_rtvec (2,
				      gen_rtx_SET (reg1, unspecs[0]),
				      gen_rtx_SET (reg2, unspecs[1])));
}
/* Return TRUE if return address signing should be enabled for the current
   function, otherwise return FALSE.  */

bool
aarch64_return_address_signing_enabled (void)
{
  /* This function should only be called after frame laid out.  */
  gcc_assert (cfun->machine->frame.laid_out);

  /* If signing scope is AARCH_FUNCTION_NON_LEAF, we only sign a leaf function
     if its LR is pushed onto stack.  */
  return (aarch_ra_sign_scope == AARCH_FUNCTION_ALL
	  || (aarch_ra_sign_scope == AARCH_FUNCTION_NON_LEAF
	      && known_ge (cfun->machine->frame.reg_offset[LR_REGNUM], 0)));
}
/* Only used by the arm backend.  */
void aarch_bti_arch_check (void)
{}

/* Return TRUE if Branch Target Identification Mechanism is enabled.  */
bool
aarch_bti_enabled (void)
{
  return (aarch_enable_bti == 1);
}

/* Check if INSN is a BTI J insn.  */
bool
aarch_bti_j_insn_p (rtx_insn *insn)
{
  if (!insn || !INSN_P (insn))
    return false;

  rtx pat = PATTERN (insn);
  return GET_CODE (pat) == UNSPEC_VOLATILE && XINT (pat, 1) == UNSPECV_BTI_J;
}

/* Return TRUE if Guarded Control Stack is enabled.  */
bool
aarch64_gcs_enabled (void)
{
  return (aarch64_enable_gcs == 1);
}
/* Check if X (or any sub-rtx of X) is a PACIASP/PACIBSP instruction.  */
bool
aarch_pac_insn_p (rtx x)
{
  if (!INSN_P (x))
    return false;

  subrtx_var_iterator::array_type array;
  FOR_EACH_SUBRTX_VAR (iter, array, PATTERN (x), ALL)
    {
      rtx sub = *iter;
      if (sub && GET_CODE (sub) == UNSPEC)
	{
	  int unspec_val = XINT (sub, 1);
	  switch (unspec_val)
	    {
	    case UNSPEC_PACIASP:
	    case UNSPEC_PACIBSP:
	      return true;

	    default:
	      return false;
	    }
	  iter.skip_subrtxes ();
	}
    }
  return false;
}

rtx aarch_gen_bti_c (void)
{
  return gen_bti_c ();
}

rtx aarch_gen_bti_j (void)
{
  return gen_bti_j ();
}
/* The caller is going to use ST1D or LD1D to save or restore an SVE
   register in mode MODE at BASE_RTX + OFFSET, where OFFSET is in
   the range [1, 16] * GET_MODE_SIZE (MODE).  Prepare for this by:

     (1) updating BASE_RTX + OFFSET so that it is a legitimate ST1D
	 or LD1D address

     (2) setting PRED to a valid predicate register for the ST1D or LD1D,
	 if the variable isn't already nonnull

   (1) is needed when OFFSET is in the range [8, 16] * GET_MODE_SIZE (MODE).
   Handle this case using a temporary base register that is suitable for
   all offsets in that range.  Use ANCHOR_REG as this base register if it
   is nonnull, otherwise create a new register and store it in ANCHOR_REG.  */

static inline void
aarch64_adjust_sve_callee_save_base (machine_mode mode, rtx &base_rtx,
				     rtx &anchor_reg, poly_int64 &offset,
				     rtx &ptrue)
{
  if (maybe_ge (offset, 8 * GET_MODE_SIZE (mode)))
    {
      /* This is the maximum valid offset of the anchor from the base.
	 Lower values would be valid too.  */
      poly_int64 anchor_offset = 16 * GET_MODE_SIZE (mode);
      if (!anchor_reg)
	{
	  anchor_reg = gen_rtx_REG (Pmode, STACK_CLASH_SVE_CFA_REGNUM);
	  emit_insn (gen_add3_insn (anchor_reg, base_rtx,
				    gen_int_mode (anchor_offset, Pmode)));
	}
      base_rtx = anchor_reg;
      offset -= anchor_offset;
    }
  if (!ptrue)
    {
      int pred_reg = cfun->machine->frame.spare_pred_reg;
      emit_move_insn (gen_rtx_REG (VNx16BImode, pred_reg),
		      CONSTM1_RTX (VNx16BImode));
      ptrue = gen_rtx_REG (VNx2BImode, pred_reg);
    }
}
/* Add a REG_CFA_EXPRESSION note to INSN to say that register REG
   is saved at BASE + OFFSET.  */

static void
aarch64_add_cfa_expression (rtx_insn *insn, rtx reg,
			    rtx base, poly_int64 offset)
{
  rtx mem = gen_frame_mem (GET_MODE (reg),
			   plus_constant (Pmode, base, offset));
  add_reg_note (insn, REG_CFA_EXPRESSION, gen_rtx_SET (mem, reg));
}
/* Emit code to save the callee-saved registers in REGS.  Skip any
   write-back candidates if SKIP_WB is true, otherwise consider only
   write-back candidates.

   The stack pointer is currently BYTES_BELOW_SP bytes above the bottom
   of the static frame.  HARD_FP_VALID_P is true if the hard frame pointer
   has been set up.  */

static void
aarch64_save_callee_saves (poly_int64 bytes_below_sp,
			   array_slice<unsigned int> regs, bool skip_wb,
			   bool hard_fp_valid_p)
{
  aarch64_frame &frame = cfun->machine->frame;
  rtx_insn *insn;
  rtx anchor_reg = NULL_RTX, ptrue = NULL_RTX;

  auto skip_save_p = [&](unsigned int regno)
    {
      if (cfun->machine->reg_is_wrapped_separately[regno])
	return true;

      if (skip_wb == (regno == frame.wb_push_candidate1
		      || regno == frame.wb_push_candidate2))
	return true;

      return false;
    };

  for (unsigned int i = 0; i < regs.size (); ++i)
    {
      unsigned int regno = regs[i];
      poly_int64 offset;
      bool frame_related_p = aarch64_emit_cfi_for_reg_p (regno);

      if (skip_save_p (regno))
	continue;

      machine_mode mode = aarch64_reg_save_mode (regno);
      rtx reg = gen_rtx_REG (mode, regno);
      rtx move_src = reg;
      offset = frame.reg_offset[regno] - bytes_below_sp;
      if (regno == VG_REGNUM)
	{
	  move_src = gen_rtx_REG (DImode, IP0_REGNUM);
	  emit_move_insn (move_src, gen_int_mode (aarch64_sve_vg, DImode));
	}
      rtx base_rtx = stack_pointer_rtx;
      poly_int64 sp_offset = offset;

      HOST_WIDE_INT const_offset;
      if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
	aarch64_adjust_sve_callee_save_base (mode, base_rtx, anchor_reg,
					     offset, ptrue);
      else if (GP_REGNUM_P (REGNO (reg))
	       && (!offset.is_constant (&const_offset) || const_offset >= 512))
	{
	  poly_int64 fp_offset = frame.bytes_below_hard_fp - bytes_below_sp;
	  if (hard_fp_valid_p)
	    base_rtx = hard_frame_pointer_rtx;
	  else
	    {
	      if (!anchor_reg)
		{
		  anchor_reg = gen_rtx_REG (Pmode, STACK_CLASH_SVE_CFA_REGNUM);
		  emit_insn (gen_add3_insn (anchor_reg, base_rtx,
					    gen_int_mode (fp_offset, Pmode)));
		}
	      base_rtx = anchor_reg;
	    }
	  offset -= fp_offset;
	}
      rtx mem = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
      rtx cfi_mem = gen_frame_mem (mode, plus_constant (Pmode,
							stack_pointer_rtx,
							sp_offset));
      rtx cfi_set = gen_rtx_SET (cfi_mem, reg);
      bool need_cfi_note_p = (base_rtx != stack_pointer_rtx);

      unsigned int regno2;
      if (!aarch64_sve_mode_p (mode)
	  && reg == move_src
	  && i + 1 < regs.size ()
	  && (regno2 = regs[i + 1], !skip_save_p (regno2))
	  && known_eq (GET_MODE_SIZE (mode),
		       frame.reg_offset[regno2] - frame.reg_offset[regno]))
	{
	  rtx reg2 = gen_rtx_REG (mode, regno2);

	  offset += GET_MODE_SIZE (mode);
	  insn = emit_insn (aarch64_gen_store_pair (mem, reg, reg2));

	  rtx cfi_mem2
	    = gen_frame_mem (mode,
			     plus_constant (Pmode,
					    stack_pointer_rtx,
					    sp_offset + GET_MODE_SIZE (mode)));
	  rtx cfi_set2 = gen_rtx_SET (cfi_mem2, reg2);

	  /* The first part of a frame-related parallel insn is always
	     assumed to be relevant to the frame calculations;
	     subsequent parts, are only frame-related if
	     explicitly marked.  */
	  if (aarch64_emit_cfi_for_reg_p (regno2))
	    RTX_FRAME_RELATED_P (cfi_set2) = 1;

	  /* Add a REG_FRAME_RELATED_EXPR note since the unspec
	     representation of stp cannot be understood directly by
	     dwarf2cfi.  */
	  rtx par = gen_rtx_PARALLEL (VOIDmode,
				      gen_rtvec (2, cfi_set, cfi_set2));
	  add_reg_note (insn, REG_FRAME_RELATED_EXPR, par);

	  /* Skip the second register of the pair.  */
	  i += 1;
	}
      else
	{
	  if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
	    {
	      insn = emit_insn (gen_aarch64_pred_mov (mode, mem,
						      ptrue, move_src));
	      need_cfi_note_p = true;
	    }
	  else if (aarch64_sve_mode_p (mode))
	    insn = emit_insn (gen_rtx_SET (mem, move_src));
	  else
	    insn = emit_move_insn (mem, move_src);

	  if (frame_related_p && (need_cfi_note_p || move_src != reg))
	    add_reg_note (insn, REG_FRAME_RELATED_EXPR, cfi_set);
	}

      RTX_FRAME_RELATED_P (insn) = frame_related_p;

      /* Emit a fake instruction to indicate that the VG save slot has
	 been initialized.  */
      if (regno == VG_REGNUM)
	emit_insn (gen_aarch64_old_vg_saved (move_src, mem));
    }
}
/* Emit code to restore the callee registers in REGS, ignoring pop candidates
   and any other registers that are handled separately.  Write the appropriate
   REG_CFA_RESTORE notes into CFI_OPS.

   The stack pointer is currently BYTES_BELOW_SP bytes above the bottom
   of the static frame.  */

static void
aarch64_restore_callee_saves (poly_int64 bytes_below_sp,
			      array_slice<unsigned int> regs, rtx *cfi_ops)
{
  aarch64_frame &frame = cfun->machine->frame;
  poly_int64 offset;
  rtx anchor_reg = NULL_RTX, ptrue = NULL_RTX;

  auto skip_restore_p = [&](unsigned int regno)
    {
      if (cfun->machine->reg_is_wrapped_separately[regno])
	return true;

      if (regno == frame.wb_pop_candidate1
	  || regno == frame.wb_pop_candidate2)
	return true;

      /* The shadow call stack code restores LR separately.  */
      if (frame.is_scs_enabled && regno == LR_REGNUM)
	return true;

      return false;
    };

  for (unsigned int i = 0; i < regs.size (); ++i)
    {
      unsigned int regno = regs[i];
      bool frame_related_p = aarch64_emit_cfi_for_reg_p (regno);
      if (skip_restore_p (regno))
	continue;

      machine_mode mode = aarch64_reg_save_mode (regno);
      rtx reg = gen_rtx_REG (mode, regno);
      offset = frame.reg_offset[regno] - bytes_below_sp;
      rtx base_rtx = stack_pointer_rtx;
      if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
	aarch64_adjust_sve_callee_save_base (mode, base_rtx, anchor_reg,
					     offset, ptrue);
      rtx mem = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));

      unsigned int regno2;
      if (!aarch64_sve_mode_p (mode)
	  && i + 1 < regs.size ()
	  && (regno2 = regs[i + 1], !skip_restore_p (regno2))
	  && known_eq (GET_MODE_SIZE (mode),
		       frame.reg_offset[regno2] - frame.reg_offset[regno]))
	{
	  rtx reg2 = gen_rtx_REG (mode, regno2);

	  offset += GET_MODE_SIZE (mode);
	  emit_insn (aarch64_gen_load_pair (reg, reg2, mem));

	  *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
	  /* Skip the second register of the pair.  */
	  i += 1;
	}
      else if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
	emit_insn (gen_aarch64_pred_mov (mode, reg, ptrue, mem));
      else if (aarch64_sve_mode_p (mode))
	emit_insn (gen_rtx_SET (reg, mem));
      else
	emit_move_insn (reg, mem);
      if (frame_related_p)
	*cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg, *cfi_ops);
    }
}
/* Return true if OFFSET is a signed 4-bit value multiplied by the size
   of MODE.  */

static inline bool
offset_4bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
{
  HOST_WIDE_INT multiple;
  return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
	  && IN_RANGE (multiple, -8, 7));
}

/* Return true if OFFSET is a signed 6-bit value multiplied by the size
   of MODE.  */

static inline bool
offset_6bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
{
  HOST_WIDE_INT multiple;
  return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
	  && IN_RANGE (multiple, -32, 31));
}

/* Return true if OFFSET is an unsigned 6-bit value multiplied by the size
   of MODE.  */

static inline bool
offset_6bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
{
  HOST_WIDE_INT multiple;
  return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
	  && IN_RANGE (multiple, 0, 63));
}

/* Return true if OFFSET is a signed 7-bit value multiplied by the size
   of MODE.  */

bool
aarch64_offset_7bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
{
  HOST_WIDE_INT multiple;
  return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
	  && IN_RANGE (multiple, -64, 63));
}

/* Return true if OFFSET is a signed 9-bit value.  */

bool
aarch64_offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
				       poly_int64 offset)
{
  HOST_WIDE_INT const_offset;
  return (offset.is_constant (&const_offset)
	  && IN_RANGE (const_offset, -256, 255));
}

/* Return true if OFFSET is a signed 9-bit value multiplied by the size
   of MODE.  */

static inline bool
offset_9bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
{
  HOST_WIDE_INT multiple;
  return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
	  && IN_RANGE (multiple, -256, 255));
}

/* Return true if OFFSET is an unsigned 12-bit value multiplied by the size
   of MODE.  */

static inline bool
offset_12bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
{
  HOST_WIDE_INT multiple;
  return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
	  && IN_RANGE (multiple, 0, 4095));
}
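/* Example (explanatory comment, not part of the original source): for DImode
   (8-byte) accesses, aarch64_offset_7bit_signed_scaled_p accepts offsets that
   are multiples of 8 in [-512, 504], matching the LDP/STP immediate range,
   while offset_12bit_unsigned_scaled_p accepts multiples of 8 in [0, 32760],
   matching the unsigned LDR/STR immediate range.  */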
/* Implement TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS.  */

static sbitmap
aarch64_get_separate_components (void)
{
  aarch64_frame &frame = cfun->machine->frame;
  sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
  bitmap_clear (components);

  /* The registers we need saved to the frame.  */
  bool enables_pstate_sm = aarch64_cfun_enables_pstate_sm ();
  for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
    if (aarch64_register_saved_on_entry (regno))
      {
	/* Disallow shrink wrapping for registers that will be clobbered
	   by an SMSTART SM in the prologue.  */
	if (enables_pstate_sm
	    && (FP_REGNUM_P (regno) || PR_REGNUM_P (regno)))
	  continue;

	/* Punt on saves and restores that use ST1D and LD1D.  We could
	   try to be smarter, but it would involve making sure that the
	   spare predicate register itself is safe to use at the save
	   and restore points.  Also, when a frame pointer is being used,
	   the slots are often out of reach of ST1D and LD1D anyway.  */
	machine_mode mode = aarch64_reg_save_mode (regno);
	if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
	  continue;

	poly_int64 offset = frame.reg_offset[regno];

	/* Get the offset relative to the register we'll use.  */
	if (frame_pointer_needed)
	  offset -= frame.bytes_below_hard_fp;

	/* Check that we can access the stack slot of the register with one
	   direct load with no adjustments needed.  */
	if (aarch64_sve_mode_p (mode)
	    ? offset_9bit_signed_scaled_p (mode, offset)
	    : offset_12bit_unsigned_scaled_p (mode, offset))
	  bitmap_set_bit (components, regno);
      }

  /* Don't mess with the hard frame pointer.  */
  if (frame_pointer_needed)
    bitmap_clear_bit (components, HARD_FRAME_POINTER_REGNUM);

  /* If the spare predicate register used by big-endian SVE code
     is call-preserved, it must be saved in the main prologue
     before any saves that use it.  */
  if (frame.spare_pred_reg != INVALID_REGNUM)
    bitmap_clear_bit (components, frame.spare_pred_reg);

  unsigned reg1 = frame.wb_push_candidate1;
  unsigned reg2 = frame.wb_push_candidate2;
  /* If registers have been chosen to be stored/restored with
     writeback don't interfere with them to avoid having to output explicit
     stack adjustment instructions.  */
  if (reg2 != INVALID_REGNUM)
    bitmap_clear_bit (components, reg2);
  if (reg1 != INVALID_REGNUM)
    bitmap_clear_bit (components, reg1);

  bitmap_clear_bit (components, LR_REGNUM);
  bitmap_clear_bit (components, SP_REGNUM);
  if (flag_stack_clash_protection)
    {
      if (frame.sve_save_and_probe != INVALID_REGNUM)
	bitmap_clear_bit (components, frame.sve_save_and_probe);
      if (frame.hard_fp_save_and_probe != INVALID_REGNUM)
	bitmap_clear_bit (components, frame.hard_fp_save_and_probe);
    }

  /* The VG save sequence needs a temporary GPR.  Punt for now on trying
     to use it.  */
  bitmap_clear_bit (components, VG_REGNUM);

  return components;
}
/* Implement TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB.  */

static sbitmap
aarch64_components_for_bb (basic_block bb)
{
  bitmap in = DF_LIVE_IN (bb);
  bitmap gen = &DF_LIVE_BB_INFO (bb)->gen;
  bitmap kill = &DF_LIVE_BB_INFO (bb)->kill;

  sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
  bitmap_clear (components);

  /* Clobbered registers don't generate values in any meaningful sense,
     since nothing after the clobber can rely on their value.  And we can't
     say that partially-clobbered registers are unconditionally killed,
     because whether they're killed or not depends on the mode of the
     value they're holding.  Thus partially call-clobbered registers
     appear in neither the kill set nor the gen set.

     Check manually for any calls that clobber more of a register than the
     current function can.  */
  function_abi_aggregator callee_abis;
  rtx_insn *insn;
  FOR_BB_INSNS (bb, insn)
    if (CALL_P (insn))
      callee_abis.note_callee_abi (insn_callee_abi (insn));
  HARD_REG_SET extra_caller_saves = callee_abis.caller_save_regs (*crtl->abi);

  /* GPRs are used in a bb if they are in the IN, GEN, or KILL sets.  */
  for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
    if (!fixed_regs[regno]
	&& !crtl->abi->clobbers_full_reg_p (regno)
	&& (TEST_HARD_REG_BIT (extra_caller_saves, regno)
	    || bitmap_bit_p (in, regno)
	    || bitmap_bit_p (gen, regno)
	    || bitmap_bit_p (kill, regno)))
      {
	bitmap_set_bit (components, regno);

	/* If there is a callee-save at an adjacent offset, add it too
	   to increase the use of LDP/STP.  */
	poly_int64 offset = cfun->machine->frame.reg_offset[regno];
	unsigned regno2 = multiple_p (offset, 16) ? regno + 1 : regno - 1;

	if (regno2 <= LAST_SAVED_REGNUM)
	  {
	    poly_int64 offset2 = cfun->machine->frame.reg_offset[regno2];
	    if (multiple_p (offset, 16)
		? known_eq (offset + 8, offset2)
		: multiple_p (offset2, 16) && known_eq (offset2 + 8, offset))
	      bitmap_set_bit (components, regno2);
	  }
      }

  return components;
}
/* Implement TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS.
   Nothing to do for aarch64.  */

static void
aarch64_disqualify_components (sbitmap, edge, sbitmap, bool)
{
}

/* Return the next set bit in BMP from START onwards.  Return the total number
   of bits in BMP if no set bit is found at or after START.  */

static unsigned int
aarch64_get_next_set_bit (sbitmap bmp, unsigned int start)
{
  unsigned int nbits = SBITMAP_SIZE (bmp);
  if (start == nbits)
    return start;

  gcc_assert (start < nbits);
  for (unsigned int i = start; i < nbits; i++)
    if (bitmap_bit_p (bmp, i))
      return i;

  return nbits;
}
/* Do the work for aarch64_emit_prologue_components and
   aarch64_emit_epilogue_components.  COMPONENTS is the bitmap of registers
   to save/restore, PROLOGUE_P indicates whether to emit the prologue sequence
   for these components or the epilogue sequence.  That is, it determines
   whether we should emit stores or loads and what kind of CFA notes to attach
   to the insns.  Otherwise the logic for the two sequences is very
   similar.  */

static void
aarch64_process_components (sbitmap components, bool prologue_p)
{
  aarch64_frame &frame = cfun->machine->frame;
  rtx ptr_reg = gen_rtx_REG (Pmode, frame_pointer_needed
			     ? HARD_FRAME_POINTER_REGNUM
			     : STACK_POINTER_REGNUM);

  unsigned last_regno = SBITMAP_SIZE (components);
  unsigned regno = aarch64_get_next_set_bit (components, R0_REGNUM);
  rtx_insn *insn = NULL;

  while (regno != last_regno)
    {
      bool frame_related_p = aarch64_emit_cfi_for_reg_p (regno);
      machine_mode mode = aarch64_reg_save_mode (regno);

      rtx reg = gen_rtx_REG (mode, regno);
      poly_int64 offset = frame.reg_offset[regno];
      if (frame_pointer_needed)
	offset -= frame.bytes_below_hard_fp;

      rtx addr = plus_constant (Pmode, ptr_reg, offset);
      rtx mem = gen_frame_mem (mode, addr);

      rtx set = prologue_p ? gen_rtx_SET (mem, reg) : gen_rtx_SET (reg, mem);
      unsigned regno2 = aarch64_get_next_set_bit (components, regno + 1);
      /* No more registers to handle after REGNO.
	 Emit a single save/restore and exit.  */
      if (regno2 == last_regno)
	{
	  insn = emit_insn (set);
	  if (frame_related_p)
	    {
	      RTX_FRAME_RELATED_P (insn) = 1;
	      if (prologue_p)
		add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
	      else
		add_reg_note (insn, REG_CFA_RESTORE, reg);
	    }
	  break;
	}

      poly_int64 offset2 = frame.reg_offset[regno2];
      /* The next register is not of the same class or its offset is not
	 mergeable with the current one into a pair.  */
      if (aarch64_sve_mode_p (mode)
	  || !satisfies_constraint_Ump (mem)
	  || GP_REGNUM_P (regno) != GP_REGNUM_P (regno2)
	  || (crtl->abi->id () == ARM_PCS_SIMD && FP_REGNUM_P (regno))
	  || maybe_ne ((offset2 - frame.reg_offset[regno]),
		       GET_MODE_SIZE (mode)))
	{
	  insn = emit_insn (set);
	  if (frame_related_p)
	    {
	      RTX_FRAME_RELATED_P (insn) = 1;
	      if (prologue_p)
		add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
	      else
		add_reg_note (insn, REG_CFA_RESTORE, reg);
	    }

	  regno = regno2;
	  continue;
	}

      bool frame_related2_p = aarch64_emit_cfi_for_reg_p (regno2);

      /* REGNO2 can be saved/restored in a pair with REGNO.  */
      rtx reg2 = gen_rtx_REG (mode, regno2);
      if (frame_pointer_needed)
	offset2 -= frame.bytes_below_hard_fp;
      rtx addr2 = plus_constant (Pmode, ptr_reg, offset2);
      rtx mem2 = gen_frame_mem (mode, addr2);
      rtx set2 = prologue_p ? gen_rtx_SET (mem2, reg2)
			    : gen_rtx_SET (reg2, mem2);

      if (prologue_p)
	insn = emit_insn (aarch64_gen_store_pair (mem, reg, reg2));
      else
	insn = emit_insn (aarch64_gen_load_pair (reg, reg2, mem));

      if (frame_related_p || frame_related2_p)
	{
	  RTX_FRAME_RELATED_P (insn) = 1;
	  if (prologue_p)
	    {
	      if (frame_related_p)
		add_reg_note (insn, REG_CFA_OFFSET, set);
	      if (frame_related2_p)
		add_reg_note (insn, REG_CFA_OFFSET, set2);
	    }
	  else
	    {
	      if (frame_related_p)
		add_reg_note (insn, REG_CFA_RESTORE, reg);
	      if (frame_related2_p)
		add_reg_note (insn, REG_CFA_RESTORE, reg2);
	    }
	}

      regno = aarch64_get_next_set_bit (components, regno2 + 1);
    }
}
/* Implement TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS.  */

static void
aarch64_emit_prologue_components (sbitmap components)
{
  aarch64_process_components (components, true);
}

/* Implement TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS.  */

static void
aarch64_emit_epilogue_components (sbitmap components)
{
  aarch64_process_components (components, false);
}

/* Implement TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS.  */

static void
aarch64_set_handled_components (sbitmap components)
{
  for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
    if (bitmap_bit_p (components, regno))
      cfun->machine->reg_is_wrapped_separately[regno] = true;
}
/* On AArch64 we have an ABI defined safe buffer.  This constant is used to
   determine the probe offset for alloca.  */

static HOST_WIDE_INT
aarch64_stack_clash_protection_alloca_probe_range (void)
{
  return STACK_CLASH_CALLER_GUARD;
}

/* Emit a stack tie that acts as a scheduling barrier for all previous and
   subsequent memory accesses and that requires the stack pointer and REG
   to have their current values.  REG can be stack_pointer_rtx if no
   other register's value needs to be fixed.  */

static void
aarch64_emit_stack_tie (rtx reg)
{
  emit_insn (gen_stack_tie (reg, gen_int_mode (REGNO (reg), DImode)));
}
/* Allocate POLY_SIZE bytes of stack space using TEMP1 and TEMP2 as scratch
   registers.  If POLY_SIZE is not large enough to require a probe this function
   will only adjust the stack.  When allocating the stack space
   FRAME_RELATED_P is then used to indicate if the allocation is frame related.
   FINAL_ADJUSTMENT_P indicates whether we are allocating the area below
   the saved registers.  If we are then we ensure that any allocation
   larger than the ABI defined buffer needs a probe so that the
   invariant of having a 1KB buffer is maintained.

   We emit barriers after each stack adjustment to prevent optimizations from
   breaking the invariant that we never drop the stack more than a page.  This
   invariant is needed to make it easier to correctly handle asynchronous
   events, e.g. if we were to allow the stack to be dropped by more than a page
   and then have multiple probes up and we take a signal somewhere in between
   then the signal handler doesn't know the state of the stack and can make no
   assumptions about which pages have been probed.

   FORCE_ISA_MODE is AARCH64_ISA_MODE_SM_ON if any variable component of
   POLY_SIZE is measured relative to the SME vector length instead of the
   current prevailing vector length.  It is 0 otherwise.  */

static void
aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
					poly_int64 poly_size,
					aarch64_isa_mode force_isa_mode,
					bool frame_related_p,
					bool final_adjustment_p)
{
  aarch64_frame &frame = cfun->machine->frame;
  HOST_WIDE_INT guard_size
    = 1 << param_stack_clash_protection_guard_size;
  HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD;
  HOST_WIDE_INT byte_sp_alignment = STACK_BOUNDARY / BITS_PER_UNIT;
  gcc_assert (multiple_p (poly_size, byte_sp_alignment));
  HOST_WIDE_INT min_probe_threshold
    = (final_adjustment_p
       ? guard_used_by_caller + byte_sp_alignment
       : guard_size - guard_used_by_caller);
  poly_int64 frame_size = frame.frame_size;

  /* We should always have a positive probe threshold.  */
  gcc_assert (min_probe_threshold > 0);

  if (flag_stack_clash_protection && !final_adjustment_p)
    {
      poly_int64 initial_adjust = frame.initial_adjust;
      poly_int64 sve_callee_adjust = frame.sve_callee_adjust;
      poly_int64 final_adjust = frame.final_adjust;

      if (known_eq (frame_size, 0))
	dump_stack_clash_frame_info (NO_PROBE_NO_FRAME, false);
      else if (known_lt (initial_adjust + sve_callee_adjust,
			 guard_size - guard_used_by_caller)
	       && known_lt (final_adjust, guard_used_by_caller))
	dump_stack_clash_frame_info (NO_PROBE_SMALL_FRAME, true);
    }

  /* If SIZE is not large enough to require probing, just adjust the stack and
     exit.  */
  if (known_lt (poly_size, min_probe_threshold)
      || !flag_stack_clash_protection)
    {
      aarch64_sub_sp (temp1, temp2, poly_size, force_isa_mode,
		      frame_related_p);
      return;
    }

  HOST_WIDE_INT size;
  /* Handle the SVE non-constant case first.  */
  if (!poly_size.is_constant (&size))
    {
      if (dump_file)
	{
	  fprintf (dump_file, "Stack clash SVE prologue: ");
	  print_dec (poly_size, dump_file);
	  fprintf (dump_file, " bytes, dynamic probing will be required.\n");
	}

      /* First calculate the amount of bytes we're actually spilling.  */
      aarch64_add_offset (Pmode, temp1, CONST0_RTX (Pmode),
			  poly_size, temp1, temp2, force_isa_mode,
			  false, true);

      rtx_insn *insn = get_last_insn ();

      if (frame_related_p)
	{
	  /* This is done to provide unwinding information for the stack
	     adjustments we're about to do, however to prevent the optimizers
	     from removing the R11 move and leaving the CFA note (which would be
	     very wrong) we tie the old and new stack pointer together.
	     The tie will expand to nothing but the optimizers will not touch
	     the instruction.  */
	  rtx stack_ptr_copy = gen_rtx_REG (Pmode, STACK_CLASH_SVE_CFA_REGNUM);
	  emit_move_insn (stack_ptr_copy, stack_pointer_rtx);
	  aarch64_emit_stack_tie (stack_ptr_copy);

	  /* We want the CFA independent of the stack pointer for the
	     duration of the loop.  */
	  add_reg_note (insn, REG_CFA_DEF_CFA, stack_ptr_copy);
	  RTX_FRAME_RELATED_P (insn) = 1;
	}

      rtx probe_const = gen_int_mode (min_probe_threshold, Pmode);
      rtx guard_const = gen_int_mode (guard_size, Pmode);

      insn = emit_insn (gen_probe_sve_stack_clash (Pmode, stack_pointer_rtx,
						   stack_pointer_rtx, temp1,
						   probe_const, guard_const));

      /* Now reset the CFA register if needed.  */
      if (frame_related_p)
	{
	  add_reg_note (insn, REG_CFA_DEF_CFA,
			gen_rtx_PLUS (Pmode, stack_pointer_rtx,
				      gen_int_mode (poly_size, Pmode)));
	  RTX_FRAME_RELATED_P (insn) = 1;
	}

      return;
    }

  if (dump_file)
    fprintf (dump_file,
	     "Stack clash AArch64 prologue: " HOST_WIDE_INT_PRINT_DEC
	     " bytes, probing will be required.\n", size);

  /* Round size to the nearest multiple of guard_size, and calculate the
     residual as the difference between the original size and the rounded
     size.  */
  HOST_WIDE_INT rounded_size = ROUND_DOWN (size, guard_size);
  HOST_WIDE_INT residual = size - rounded_size;

  /* We can handle a small number of allocations/probes inline.  Otherwise
     punt to a loop.  */
  if (rounded_size <= STACK_CLASH_MAX_UNROLL_PAGES * guard_size)
    {
      for (HOST_WIDE_INT i = 0; i < rounded_size; i += guard_size)
	{
	  aarch64_sub_sp (NULL, temp2, guard_size, force_isa_mode, true);
	  emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
					   guard_used_by_caller));
	  emit_insn (gen_blockage ());
	}
      dump_stack_clash_frame_info (PROBE_INLINE, size != rounded_size);
    }
  else
    {
      /* Compute the ending address.  */
      aarch64_add_offset (Pmode, temp1, stack_pointer_rtx, -rounded_size,
			  temp1, NULL, force_isa_mode, false, true);
      rtx_insn *insn = get_last_insn ();

      /* For the initial allocation, we don't have a frame pointer
	 set up, so we always need CFI notes.  If we're doing the
	 final allocation, then we may have a frame pointer, in which
	 case it is the CFA, otherwise we need CFI notes.

	 We can determine which allocation we are doing by looking at
	 the value of FRAME_RELATED_P since the final allocations are not
	 frame related.  */
      if (frame_related_p
)
9583 /* We want the CFA independent of the stack pointer for the
9584 duration of the loop. */
9585 add_reg_note (insn
, REG_CFA_DEF_CFA
,
9586 plus_constant (Pmode
, temp1
, rounded_size
));
9587 RTX_FRAME_RELATED_P (insn
) = 1;
9590 /* This allocates and probes the stack. Note that this re-uses some of
9591 the existing Ada stack protection code. However we are guaranteed not
9592 to enter the non loop or residual branches of that code.
9594 The non-loop part won't be entered because if our allocation amount
9595 doesn't require a loop, the case above would handle it.
9597 The residual amount won't be entered because TEMP1 is a mutliple of
9598 the allocation size. The residual will always be 0. As such, the only
9599 part we are actually using from that code is the loop setup. The
9600 actual probing is done in aarch64_output_probe_stack_range. */
9601 insn
= emit_insn (gen_probe_stack_range (stack_pointer_rtx
,
9602 stack_pointer_rtx
, temp1
));
9604 /* Now reset the CFA register if needed. */
9605 if (frame_related_p
)
9607 add_reg_note (insn
, REG_CFA_DEF_CFA
,
9608 plus_constant (Pmode
, stack_pointer_rtx
, rounded_size
));
9609 RTX_FRAME_RELATED_P (insn
) = 1;
9612 emit_insn (gen_blockage ());
9613 dump_stack_clash_frame_info (PROBE_LOOP
, size
!= rounded_size
);
9616 /* Handle any residuals. Residuals of at least MIN_PROBE_THRESHOLD have to
9617 be probed. This maintains the requirement that each page is probed at
9618 least once. For initial probing we probe only if the allocation is
9619 more than GUARD_SIZE - buffer, and below the saved registers we probe
9620 if the amount is larger than buffer. GUARD_SIZE - buffer + buffer ==
9621 GUARD_SIZE. This works that for any allocation that is large enough to
9622 trigger a probe here, we'll have at least one, and if they're not large
9623 enough for this code to emit anything for them, The page would have been
9624 probed by the saving of FP/LR either by this function or any callees. If
9625 we don't have any callees then we won't have more stack adjustments and so
9629 gcc_assert (guard_used_by_caller
+ byte_sp_alignment
<= size
);
9631 /* If we're doing final adjustments, and we've done any full page
9632 allocations then any residual needs to be probed. */
9633 if (final_adjustment_p
&& rounded_size
!= 0)
9634 min_probe_threshold
= 0;
9636 aarch64_sub_sp (temp1
, temp2
, residual
, force_isa_mode
, frame_related_p
);
9637 if (residual
>= min_probe_threshold
)
9641 "Stack clash AArch64 prologue residuals: "
9642 HOST_WIDE_INT_PRINT_DEC
" bytes, probing will be required."
9645 emit_stack_probe (plus_constant (Pmode
, stack_pointer_rtx
,
9646 guard_used_by_caller
));
9647 emit_insn (gen_blockage ());
9652 /* Implement TARGET_EXTRA_LIVE_ON_ENTRY. */
9655 aarch64_extra_live_on_entry (bitmap regs
)
9659 bitmap_set_bit (regs
, LOWERING_REGNUM
);
9660 bitmap_set_bit (regs
, SME_STATE_REGNUM
);
9661 bitmap_set_bit (regs
, TPIDR2_SETUP_REGNUM
);
9662 bitmap_set_bit (regs
, ZA_FREE_REGNUM
);
9663 bitmap_set_bit (regs
, ZA_SAVED_REGNUM
);
9665 /* The only time ZA can't have live contents on entry is when
9666 the function explicitly treats it as a pure output. */
9667 auto za_flags
= aarch64_cfun_shared_flags ("za");
9668 if (za_flags
!= (AARCH64_STATE_SHARED
| AARCH64_STATE_OUT
))
9669 bitmap_set_bit (regs
, ZA_REGNUM
);
9671 /* Since ZT0 is call-clobbered, it is only live on input if
9672 it is explicitly shared, and is not a pure output. */
9673 auto zt0_flags
= aarch64_cfun_shared_flags ("zt0");
9675 && zt0_flags
!= (AARCH64_STATE_SHARED
| AARCH64_STATE_OUT
))
9676 bitmap_set_bit (regs
, ZT0_REGNUM
);
9680 /* Return 1 if the register is used by the epilogue. We need to say the
9681 return register is used, but only after epilogue generation is complete.
9682 Note that in the case of sibcalls, the values "used by the epilogue" are
9683 considered live at the start of the called function. */
9686 aarch64_epilogue_uses (int regno
)
9688 if (epilogue_completed
)
9690 if (regno
== LR_REGNUM
)
9693 if (regno
== LOWERING_REGNUM
&& TARGET_ZA
)
9695 if (regno
== SME_STATE_REGNUM
&& TARGET_ZA
)
9697 if (regno
== TPIDR2_SETUP_REGNUM
&& TARGET_ZA
)
9699 /* If the function shares SME state with its caller, ensure that that
9700 data is not in the lazy save buffer on exit. */
9701 if (regno
== ZA_SAVED_REGNUM
&& aarch64_cfun_incoming_pstate_za () != 0)
9703 if (regno
== ZA_REGNUM
&& aarch64_cfun_shared_flags ("za") != 0)
9705 if (regno
== ZT0_REGNUM
&& aarch64_cfun_shared_flags ("zt0") != 0)
9710 /* Implement TARGET_USE_LATE_PROLOGUE_EPILOGUE. */
9713 aarch64_use_late_prologue_epilogue ()
9715 return aarch64_cfun_enables_pstate_sm ();
9718 /* The current function's frame has a save slot for the incoming state
9719 of SVCR. Return a legitimate memory for the slot, based on the hard
9723 aarch64_old_svcr_mem ()
9725 gcc_assert (frame_pointer_needed
9726 && known_ge (cfun
->machine
->frame
.old_svcr_offset
, 0));
9727 rtx base
= hard_frame_pointer_rtx
;
9728 poly_int64 offset
= (0
9729 /* hard fp -> bottom of frame. */
9730 - cfun
->machine
->frame
.bytes_below_hard_fp
9731 /* bottom of frame -> save slot. */
9732 + cfun
->machine
->frame
.old_svcr_offset
);
9733 return gen_frame_mem (DImode
, plus_constant (Pmode
, base
, offset
));
9736 /* The current function's frame has a save slot for the incoming state
9737 of SVCR. Load the slot into register REGNO and return the register. */
9740 aarch64_read_old_svcr (unsigned int regno
)
9742 rtx svcr
= gen_rtx_REG (DImode
, regno
);
9743 emit_move_insn (svcr
, aarch64_old_svcr_mem ());
9747 /* Like the rtx version of aarch64_guard_switch_pstate_sm, but first
9748 load the incoming value of SVCR from its save slot into temporary
9752 aarch64_guard_switch_pstate_sm (unsigned int regno
,
9753 aarch64_isa_mode local_mode
)
9755 rtx old_svcr
= aarch64_read_old_svcr (regno
);
9756 return aarch64_guard_switch_pstate_sm (old_svcr
, local_mode
);
9759 /* AArch64 stack frames generated by this compiler look like:
9761 +-------------------------------+
9763 | incoming stack arguments |
9765 +-------------------------------+
9766 | | <-- incoming stack pointer (aligned)
9767 | callee-allocated save area |
9768 | for register varargs |
9770 +-------------------------------+
9771 | local variables (1) | <-- frame_pointer_rtx
9773 +-------------------------------+
9775 +-------------------------------+
9776 | callee-saved registers |
9777 +-------------------------------+
9779 +-------------------------------+
9781 +-------------------------------+ <-- hard_frame_pointer_rtx (aligned)
9782 | SVE vector registers |
9783 +-------------------------------+
9784 | SVE predicate registers |
9785 +-------------------------------+
9786 | local variables (2) |
9787 +-------------------------------+
9789 +-------------------------------+
9790 | dynamic allocation |
9791 +-------------------------------+
9793 +-------------------------------+
9794 | outgoing stack arguments | <-- arg_pointer
9796 +-------------------------------+
9797 | | <-- stack_pointer_rtx (aligned)
9799 The regions marked (1) and (2) are mutually exclusive. (2) is used
9800 when aarch64_save_regs_above_locals_p is true.
9802 Dynamic stack allocations via alloca() decrease stack_pointer_rtx
9803 but leave frame_pointer_rtx and hard_frame_pointer_rtx
9806 By default for stack-clash we assume the guard is at least 64KB, but this
9807 value is configurable to either 4KB or 64KB. We also force the guard size to
9808 be the same as the probing interval and both values are kept in sync.
9810 With those assumptions the callee can allocate up to 63KB (or 3KB depending
9811 on the guard size) of stack space without probing.
9813 When probing is needed, we emit a probe at the start of the prologue
9814 and every PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE bytes thereafter.
9816 We can also use register saves as probes. These are stored in
9817 sve_save_and_probe and hard_fp_save_and_probe.
9819 For outgoing arguments we probe if the size is larger than 1KB, such that
9820 the ABI specified buffer is maintained for the next callee.
9822 The following registers are reserved during frame layout and should not be
9823 used for any other purpose:
9825 - r11: Used by stack clash protection when SVE is enabled, and also
9826 as an anchor register when saving and restoring registers
9827 - r12(EP0) and r13(EP1): Used as temporaries for stack adjustment.
9828 - r14 and r15: Used for speculation tracking.
9829 - r16(IP0), r17(IP1): Used by indirect tailcalls.
9830 - r30(LR), r29(FP): Used by standard frame layout.
9832 These registers must be avoided in frame layout related code unless the
9833 explicit intention is to interact with one of the features listed above. */
9835 /* Generate the prologue instructions for entry into a function.
9836 Establish the stack frame by decreasing the stack pointer with a
9837 properly calculated size and, if necessary, create a frame record
9838 filled with the values of LR and previous frame pointer. The
9839 current FP is also set up if it is in use. */
9842 aarch64_expand_prologue (void)
9844 aarch64_frame
&frame
= cfun
->machine
->frame
;
9845 poly_int64 frame_size
= frame
.frame_size
;
9846 poly_int64 initial_adjust
= frame
.initial_adjust
;
9847 HOST_WIDE_INT callee_adjust
= frame
.callee_adjust
;
9848 poly_int64 final_adjust
= frame
.final_adjust
;
9849 poly_int64 sve_callee_adjust
= frame
.sve_callee_adjust
;
9850 unsigned reg1
= frame
.wb_push_candidate1
;
9851 unsigned reg2
= frame
.wb_push_candidate2
;
9852 bool emit_frame_chain
= frame
.emit_frame_chain
;
9854 aarch64_isa_mode force_isa_mode
= 0;
9855 if (aarch64_cfun_enables_pstate_sm ())
9856 force_isa_mode
= AARCH64_ISA_MODE_SM_ON
;
9858 if (flag_stack_clash_protection
9859 && known_eq (callee_adjust
, 0)
9860 && known_lt (frame
.reg_offset
[VG_REGNUM
], 0))
9862 /* Fold the SVE allocation into the initial allocation.
9863 We don't do this in aarch64_layout_arg to avoid pessimizing
9864 the epilogue code. */
9865 initial_adjust
+= sve_callee_adjust
;
9866 sve_callee_adjust
= 0;
9869 /* Sign return address for functions. */
9870 if (aarch64_return_address_signing_enabled ())
9872 switch (aarch64_ra_sign_key
)
9875 insn
= emit_insn (gen_paciasp ());
9878 insn
= emit_insn (gen_pacibsp ());
9883 add_reg_note (insn
, REG_CFA_NEGATE_RA_STATE
, const0_rtx
);
9884 RTX_FRAME_RELATED_P (insn
) = 1;
9887 /* Push return address to shadow call stack. */
9888 if (frame
.is_scs_enabled
)
9889 emit_insn (gen_scs_push ());
9891 if (flag_stack_usage_info
)
9892 current_function_static_stack_size
= constant_lower_bound (frame_size
);
9894 if (flag_stack_check
== STATIC_BUILTIN_STACK_CHECK
)
9896 if (crtl
->is_leaf
&& !cfun
->calls_alloca
)
9898 if (maybe_gt (frame_size
, PROBE_INTERVAL
)
9899 && maybe_gt (frame_size
, get_stack_check_protect ()))
9900 aarch64_emit_probe_stack_range (get_stack_check_protect (),
9902 - get_stack_check_protect ()));
9904 else if (maybe_gt (frame_size
, 0))
9905 aarch64_emit_probe_stack_range (get_stack_check_protect (), frame_size
);
9908 rtx tmp0_rtx
= gen_rtx_REG (Pmode
, EP0_REGNUM
);
9909 rtx tmp1_rtx
= gen_rtx_REG (Pmode
, EP1_REGNUM
);
9911 /* In theory we should never have both an initial adjustment
9912 and a callee save adjustment. Verify that is the case since the
9913 code below does not handle it for -fstack-clash-protection. */
9914 gcc_assert (known_eq (initial_adjust
, 0) || callee_adjust
== 0);
9916 /* Will only probe if the initial adjustment is larger than the guard
9917 less the amount of the guard reserved for use by the caller's
9919 aarch64_allocate_and_probe_stack_space (tmp0_rtx
, tmp1_rtx
, initial_adjust
,
9920 force_isa_mode
, true, false);
9922 if (callee_adjust
!= 0)
9923 aarch64_push_regs (reg1
, reg2
, callee_adjust
);
9925 /* The offset of the current SP from the bottom of the static frame. */
9926 poly_int64 bytes_below_sp
= frame_size
- initial_adjust
- callee_adjust
;
9928 if (emit_frame_chain
)
9930 /* The offset of the frame chain record (if any) from the current SP. */
9931 poly_int64 chain_offset
= (initial_adjust
+ callee_adjust
9932 - frame
.bytes_above_hard_fp
);
9933 gcc_assert (known_ge (chain_offset
, 0));
9935 gcc_assert (reg1
== R29_REGNUM
&& reg2
== R30_REGNUM
);
9936 if (callee_adjust
== 0)
9937 aarch64_save_callee_saves (bytes_below_sp
, frame
.saved_gprs
,
9940 gcc_assert (known_eq (chain_offset
, 0));
9941 aarch64_add_offset (Pmode
, hard_frame_pointer_rtx
,
9942 stack_pointer_rtx
, chain_offset
,
9943 tmp1_rtx
, tmp0_rtx
, force_isa_mode
,
9944 frame_pointer_needed
);
9945 if (frame_pointer_needed
&& !frame_size
.is_constant ())
9947 /* Variable-sized frames need to describe the save slot
9948 address using DW_CFA_expression rather than DW_CFA_offset.
9949 This means that, without taking further action, the
9950 locations of the registers that we've already saved would
9951 remain based on the stack pointer even after we redefine
9952 the CFA based on the frame pointer. We therefore need new
9953 DW_CFA_expressions to re-express the save slots with addresses
9954 based on the frame pointer. */
9955 rtx_insn
*insn
= get_last_insn ();
9956 gcc_assert (RTX_FRAME_RELATED_P (insn
));
9958 /* Add an explicit CFA definition if this was previously
9960 if (!find_reg_note (insn
, REG_CFA_ADJUST_CFA
, NULL_RTX
))
9962 rtx src
= plus_constant (Pmode
, stack_pointer_rtx
, chain_offset
);
9963 add_reg_note (insn
, REG_CFA_ADJUST_CFA
,
9964 gen_rtx_SET (hard_frame_pointer_rtx
, src
));
9967 /* Change the save slot expressions for the registers that
9968 we've already saved. */
9969 aarch64_add_cfa_expression (insn
, regno_reg_rtx
[reg2
],
9970 hard_frame_pointer_rtx
, UNITS_PER_WORD
);
9971 aarch64_add_cfa_expression (insn
, regno_reg_rtx
[reg1
],
9972 hard_frame_pointer_rtx
, 0);
9974 aarch64_emit_stack_tie (hard_frame_pointer_rtx
);
9977 aarch64_save_callee_saves (bytes_below_sp
, frame
.saved_gprs
, true,
9979 if (maybe_ge (frame
.reg_offset
[VG_REGNUM
], 0))
9981 unsigned int saved_regs
[] = { VG_REGNUM
};
9982 aarch64_save_callee_saves (bytes_below_sp
, saved_regs
, true,
9985 if (maybe_ne (sve_callee_adjust
, 0))
9987 gcc_assert (!flag_stack_clash_protection
9988 || known_eq (initial_adjust
, 0)
9989 /* The VG save isn't shrink-wrapped and so serves as
9990 a probe of the initial allocation. */
9991 || known_eq (frame
.reg_offset
[VG_REGNUM
], bytes_below_sp
));
9992 aarch64_allocate_and_probe_stack_space (tmp1_rtx
, tmp0_rtx
,
9995 !frame_pointer_needed
, false);
9996 bytes_below_sp
-= sve_callee_adjust
;
9998 aarch64_save_callee_saves (bytes_below_sp
, frame
.saved_prs
, true,
10000 aarch64_save_callee_saves (bytes_below_sp
, frame
.saved_fprs
, true,
10003 /* We may need to probe the final adjustment if it is larger than the guard
10004 that is assumed by the called. */
10005 gcc_assert (known_eq (bytes_below_sp
, final_adjust
));
10006 aarch64_allocate_and_probe_stack_space (tmp1_rtx
, tmp0_rtx
, final_adjust
,
10008 !frame_pointer_needed
, true);
10009 if (emit_frame_chain
&& maybe_ne (final_adjust
, 0))
10010 aarch64_emit_stack_tie (hard_frame_pointer_rtx
);
10012 /* Save the incoming value of PSTATE.SM, if required. Code further
10013 down does this for locally-streaming functions. */
10014 if (known_ge (frame
.old_svcr_offset
, 0)
10015 && !aarch64_cfun_enables_pstate_sm ())
10017 rtx mem
= aarch64_old_svcr_mem ();
10018 MEM_VOLATILE_P (mem
) = 1;
10021 rtx reg
= gen_rtx_REG (DImode
, IP0_REGNUM
);
10022 emit_insn (gen_aarch64_read_svcr (reg
));
10023 emit_move_insn (mem
, reg
);
10027 rtx old_r0
= NULL_RTX
, old_r1
= NULL_RTX
;
10028 auto &args
= crtl
->args
.info
;
10029 if (args
.aapcs_ncrn
> 0)
10031 old_r0
= gen_rtx_REG (DImode
, PROBE_STACK_FIRST_REGNUM
);
10032 emit_move_insn (old_r0
, gen_rtx_REG (DImode
, R0_REGNUM
));
10034 if (args
.aapcs_ncrn
> 1)
10036 old_r1
= gen_rtx_REG (DImode
, PROBE_STACK_SECOND_REGNUM
);
10037 emit_move_insn (old_r1
, gen_rtx_REG (DImode
, R1_REGNUM
));
10039 emit_insn (gen_aarch64_get_sme_state ());
10040 emit_move_insn (mem
, gen_rtx_REG (DImode
, R0_REGNUM
));
10042 emit_move_insn (gen_rtx_REG (DImode
, R0_REGNUM
), old_r0
);
10044 emit_move_insn (gen_rtx_REG (DImode
, R1_REGNUM
), old_r1
);
10048 /* Enable PSTATE.SM, if required. */
10049 if (aarch64_cfun_enables_pstate_sm ())
10051 rtx_insn
*guard_label
= nullptr;
10052 if (known_ge (cfun
->machine
->frame
.old_svcr_offset
, 0))
10054 /* The current function is streaming-compatible. Save the
10055 original state of PSTATE.SM. */
10056 rtx svcr
= gen_rtx_REG (DImode
, IP0_REGNUM
);
10057 emit_insn (gen_aarch64_read_svcr (svcr
));
10058 emit_move_insn (aarch64_old_svcr_mem (), svcr
);
10059 guard_label
= aarch64_guard_switch_pstate_sm (svcr
,
10062 aarch64_sme_mode_switch_regs args_switch
;
10063 auto &args
= crtl
->args
.info
;
10064 for (unsigned int i
= 0; i
< args
.num_sme_mode_switch_args
; ++i
)
10066 rtx x
= args
.sme_mode_switch_args
[i
];
10067 args_switch
.add_reg (GET_MODE (x
), REGNO (x
));
10069 args_switch
.emit_prologue ();
10070 emit_insn (gen_aarch64_smstart_sm ());
10071 args_switch
.emit_epilogue ();
10073 emit_label (guard_label
);
10077 /* Return TRUE if we can use a simple_return insn.
10079 This function checks whether the callee saved stack is empty, which
10080 means no restore actions are need. The pro_and_epilogue will use
10081 this to check whether shrink-wrapping opt is feasible. */
10084 aarch64_use_return_insn_p (void)
10086 if (!reload_completed
)
10092 return known_eq (cfun
->machine
->frame
.frame_size
, 0);
10095 /* Generate the epilogue instructions for returning from a function.
10096 This is almost exactly the reverse of the prolog sequence, except
10097 that we need to insert barriers to avoid scheduling loads that read
10098 from a deallocated stack, and we optimize the unwind records by
10099 emitting them all together if possible. */
10101 aarch64_expand_epilogue (rtx_call_insn
*sibcall
)
10103 aarch64_frame
&frame
= cfun
->machine
->frame
;
10104 poly_int64 initial_adjust
= frame
.initial_adjust
;
10105 HOST_WIDE_INT callee_adjust
= frame
.callee_adjust
;
10106 poly_int64 final_adjust
= frame
.final_adjust
;
10107 poly_int64 sve_callee_adjust
= frame
.sve_callee_adjust
;
10108 poly_int64 bytes_below_hard_fp
= frame
.bytes_below_hard_fp
;
10109 unsigned reg1
= frame
.wb_pop_candidate1
;
10110 unsigned reg2
= frame
.wb_pop_candidate2
;
10111 rtx cfi_ops
= NULL
;
10113 /* A stack clash protection prologue may not have left EP0_REGNUM or
10114 EP1_REGNUM in a usable state. The same is true for allocations
10115 with an SVE component, since we then need both temporary registers
10116 for each allocation. For stack clash we are in a usable state if
10117 the adjustment is less than GUARD_SIZE - GUARD_USED_BY_CALLER. */
10118 HOST_WIDE_INT guard_size
10119 = 1 << param_stack_clash_protection_guard_size
;
10120 HOST_WIDE_INT guard_used_by_caller
= STACK_CLASH_CALLER_GUARD
;
10121 aarch64_isa_mode force_isa_mode
= 0;
10122 if (aarch64_cfun_enables_pstate_sm ())
10123 force_isa_mode
= AARCH64_ISA_MODE_SM_ON
;
10125 /* We can re-use the registers when:
10127 (a) the deallocation amount is the same as the corresponding
10128 allocation amount (which is false if we combine the initial
10129 and SVE callee save allocations in the prologue); and
10131 (b) the allocation amount doesn't need a probe (which is false
10132 if the amount is guard_size - guard_used_by_caller or greater).
10134 In such situations the register should remain live with the correct
10136 bool can_inherit_p
= (initial_adjust
.is_constant ()
10137 && final_adjust
.is_constant ()
10138 && (!flag_stack_clash_protection
10139 || (known_lt (initial_adjust
,
10140 guard_size
- guard_used_by_caller
)
10141 && known_eq (sve_callee_adjust
, 0))));
10143 /* We need to add memory barrier to prevent read from deallocated stack. */
10144 bool need_barrier_p
10145 = maybe_ne (get_frame_size ()
10146 + frame
.saved_varargs_size
, 0);
10148 /* Reset PSTATE.SM, if required. */
10149 if (aarch64_cfun_enables_pstate_sm ())
10151 rtx_insn
*guard_label
= nullptr;
10152 if (known_ge (cfun
->machine
->frame
.old_svcr_offset
, 0))
10153 guard_label
= aarch64_guard_switch_pstate_sm (IP0_REGNUM
,
10155 aarch64_sme_mode_switch_regs return_switch
;
10157 return_switch
.add_call_args (sibcall
);
10158 else if (crtl
->return_rtx
&& REG_P (crtl
->return_rtx
))
10159 return_switch
.add_reg (GET_MODE (crtl
->return_rtx
),
10160 REGNO (crtl
->return_rtx
));
10161 return_switch
.emit_prologue ();
10162 emit_insn (gen_aarch64_smstop_sm ());
10163 return_switch
.emit_epilogue ();
10165 emit_label (guard_label
);
10168 /* Emit a barrier to prevent loads from a deallocated stack. */
10169 if (maybe_gt (final_adjust
, crtl
->outgoing_args_size
)
10170 || cfun
->calls_alloca
10171 || crtl
->calls_eh_return
)
10173 aarch64_emit_stack_tie (stack_pointer_rtx
);
10174 need_barrier_p
= false;
10177 /* Restore the stack pointer from the frame pointer if it may not
10178 be the same as the stack pointer. */
10179 rtx tmp0_rtx
= gen_rtx_REG (Pmode
, EP0_REGNUM
);
10180 rtx tmp1_rtx
= gen_rtx_REG (Pmode
, EP1_REGNUM
);
10181 if (frame_pointer_needed
10182 && (maybe_ne (final_adjust
, 0) || cfun
->calls_alloca
))
10183 /* If writeback is used when restoring callee-saves, the CFA
10184 is restored on the instruction doing the writeback. */
10185 aarch64_add_offset (Pmode
, stack_pointer_rtx
,
10186 hard_frame_pointer_rtx
,
10187 -bytes_below_hard_fp
+ final_adjust
,
10188 tmp1_rtx
, tmp0_rtx
, force_isa_mode
,
10189 callee_adjust
== 0);
10191 /* The case where we need to re-use the register here is very rare, so
10192 avoid the complicated condition and just always emit a move if the
10193 immediate doesn't fit. */
10194 aarch64_add_sp (tmp1_rtx
, tmp0_rtx
, final_adjust
, force_isa_mode
, true);
10196 /* Restore the vector registers before the predicate registers,
10197 so that we can use P4 as a temporary for big-endian SVE frames. */
10198 aarch64_restore_callee_saves (final_adjust
, frame
.saved_fprs
, &cfi_ops
);
10199 aarch64_restore_callee_saves (final_adjust
, frame
.saved_prs
, &cfi_ops
);
10200 if (maybe_ne (sve_callee_adjust
, 0))
10201 aarch64_add_sp (NULL_RTX
, NULL_RTX
, sve_callee_adjust
,
10202 force_isa_mode
, true);
10204 /* When shadow call stack is enabled, the scs_pop in the epilogue will
10205 restore x30, we don't need to restore x30 again in the traditional
10207 aarch64_restore_callee_saves (final_adjust
+ sve_callee_adjust
,
10208 frame
.saved_gprs
, &cfi_ops
);
10210 if (need_barrier_p
)
10211 aarch64_emit_stack_tie (stack_pointer_rtx
);
10213 if (callee_adjust
!= 0)
10214 aarch64_pop_regs (reg1
, reg2
, callee_adjust
, &cfi_ops
);
10216 /* If we have no register restore information, the CFA must have been
10217 defined in terms of the stack pointer since the end of the prologue. */
10218 gcc_assert (cfi_ops
|| !frame_pointer_needed
);
10220 if (cfi_ops
&& (callee_adjust
!= 0 || maybe_gt (initial_adjust
, 65536)))
10222 /* Emit delayed restores and set the CFA to be SP + initial_adjust. */
10223 insn
= get_last_insn ();
10224 rtx new_cfa
= plus_constant (Pmode
, stack_pointer_rtx
, initial_adjust
);
10225 REG_NOTES (insn
) = alloc_reg_note (REG_CFA_DEF_CFA
, new_cfa
, cfi_ops
);
10226 RTX_FRAME_RELATED_P (insn
) = 1;
10230 /* Liveness of EP0_REGNUM can not be trusted across function calls either, so
10231 add restriction on emit_move optimization to leaf functions. */
10232 aarch64_add_sp (tmp0_rtx
, tmp1_rtx
, initial_adjust
, force_isa_mode
,
10233 (!can_inherit_p
|| !crtl
->is_leaf
10234 || df_regs_ever_live_p (EP0_REGNUM
)));
10238 /* Emit delayed restores and reset the CFA to be SP. */
10239 insn
= get_last_insn ();
10240 cfi_ops
= alloc_reg_note (REG_CFA_DEF_CFA
, stack_pointer_rtx
, cfi_ops
);
10241 REG_NOTES (insn
) = cfi_ops
;
10242 RTX_FRAME_RELATED_P (insn
) = 1;
10245 /* Pop return address from shadow call stack. */
10246 if (frame
.is_scs_enabled
)
10248 machine_mode mode
= aarch64_reg_save_mode (R30_REGNUM
);
10249 rtx reg
= gen_rtx_REG (mode
, R30_REGNUM
);
10251 insn
= emit_insn (gen_scs_pop ());
10252 add_reg_note (insn
, REG_CFA_RESTORE
, reg
);
10253 RTX_FRAME_RELATED_P (insn
) = 1;
10256 /* Stack adjustment for exception handler. */
10257 if (crtl
->calls_eh_return
&& !sibcall
)
10259 /* If the EH_RETURN_TAKEN_RTX flag is set then we need
10260 to unwind the stack and jump to the handler, otherwise
10261 skip this eh_return logic and continue with normal
10262 return after the label. We have already reset the CFA
10263 to be SP; letting the CFA move during this adjustment
10264 is just as correct as retaining the CFA from the body
10265 of the function. Therefore, do nothing special. */
10266 rtx_code_label
*label
= gen_label_rtx ();
10267 rtx x
= aarch64_gen_compare_zero_and_branch (EQ
, EH_RETURN_TAKEN_RTX
,
10269 rtx jump
= emit_jump_insn (x
);
10270 JUMP_LABEL (jump
) = label
;
10271 LABEL_NUSES (label
)++;
10272 emit_insn (gen_add2_insn (stack_pointer_rtx
,
10273 EH_RETURN_STACKADJ_RTX
));
10274 emit_jump_insn (gen_indirect_jump (EH_RETURN_HANDLER_RTX
));
10276 emit_label (label
);
10279 /* We prefer to emit the combined return/authenticate instruction RETAA,
10280 however there are three cases in which we must instead emit an explicit
10281 authentication instruction.
10283 1) Sibcalls don't return in a normal way, so if we're about to call one
10284 we must authenticate.
10286 2) The RETAA instruction is not available without FEAT_PAuth, so if we
10287 are generating code for !TARGET_PAUTH we can't use it and must
10288 explicitly authenticate.
10290 if (aarch64_return_address_signing_enabled ()
10291 && (sibcall
|| !TARGET_PAUTH
))
10293 switch (aarch64_ra_sign_key
)
10295 case AARCH64_KEY_A
:
10296 insn
= emit_insn (gen_autiasp ());
10298 case AARCH64_KEY_B
:
10299 insn
= emit_insn (gen_autibsp ());
10302 gcc_unreachable ();
10304 add_reg_note (insn
, REG_CFA_NEGATE_RA_STATE
, const0_rtx
);
10305 RTX_FRAME_RELATED_P (insn
) = 1;
10308 emit_use (gen_rtx_REG (DImode
, LR_REGNUM
));
10310 emit_jump_insn (ret_rtx
);
10313 /* Output code to add DELTA to the first argument, and then jump
10314 to FUNCTION. Used for C++ multiple inheritance. */
10316 aarch64_output_mi_thunk (FILE *file
, tree thunk ATTRIBUTE_UNUSED
,
10317 HOST_WIDE_INT delta
,
10318 HOST_WIDE_INT vcall_offset
,
10321 /* The this pointer is always in x0. Note that this differs from
10322 Arm where the this pointer maybe bumped to r1 if r0 is required
10323 to return a pointer to an aggregate. On AArch64 a result value
10324 pointer will be in x8. */
10325 int this_regno
= R0_REGNUM
;
10326 rtx this_rtx
, temp0
, temp1
, addr
, funexp
;
10328 const char *fnname
= IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (thunk
));
10330 if (aarch_bti_enabled ())
10331 emit_insn (gen_bti_c());
10333 reload_completed
= 1;
10334 emit_note (NOTE_INSN_PROLOGUE_END
);
10336 this_rtx
= gen_rtx_REG (Pmode
, this_regno
);
10337 temp0
= gen_rtx_REG (Pmode
, EP0_REGNUM
);
10338 temp1
= gen_rtx_REG (Pmode
, EP1_REGNUM
);
10340 if (vcall_offset
== 0)
10341 aarch64_add_offset (Pmode
, this_rtx
, this_rtx
, delta
, temp1
, temp0
,
10345 gcc_assert ((vcall_offset
& (POINTER_BYTES
- 1)) == 0);
10350 if (delta
>= -256 && delta
< 256)
10351 addr
= gen_rtx_PRE_MODIFY (Pmode
, this_rtx
,
10352 plus_constant (Pmode
, this_rtx
, delta
));
10354 aarch64_add_offset (Pmode
, this_rtx
, this_rtx
, delta
,
10355 temp1
, temp0
, 0, false);
10358 if (Pmode
== ptr_mode
)
10359 aarch64_emit_move (temp0
, gen_rtx_MEM (ptr_mode
, addr
));
10361 aarch64_emit_move (temp0
,
10362 gen_rtx_ZERO_EXTEND (Pmode
,
10363 gen_rtx_MEM (ptr_mode
, addr
)));
10365 if (vcall_offset
>= -256 && vcall_offset
< 4096 * POINTER_BYTES
)
10366 addr
= plus_constant (Pmode
, temp0
, vcall_offset
);
10369 aarch64_internal_mov_immediate (temp1
, GEN_INT (vcall_offset
), true,
10371 addr
= gen_rtx_PLUS (Pmode
, temp0
, temp1
);
10374 if (Pmode
== ptr_mode
)
10375 aarch64_emit_move (temp1
, gen_rtx_MEM (ptr_mode
,addr
));
10377 aarch64_emit_move (temp1
,
10378 gen_rtx_SIGN_EXTEND (Pmode
,
10379 gen_rtx_MEM (ptr_mode
, addr
)));
10381 emit_insn (gen_add2_insn (this_rtx
, temp1
));
10384 /* Generate a tail call to the target function. */
10385 if (!TREE_USED (function
))
10387 assemble_external (function
);
10388 TREE_USED (function
) = 1;
10390 funexp
= XEXP (DECL_RTL (function
), 0);
10391 funexp
= gen_rtx_MEM (FUNCTION_MODE
, funexp
);
10392 auto isa_mode
= aarch64_fntype_isa_mode (TREE_TYPE (function
));
10393 auto pcs_variant
= arm_pcs (fndecl_abi (function
).id ());
10394 bool ir
= lookup_attribute ("indirect_return",
10395 TYPE_ATTRIBUTES (TREE_TYPE (function
)));
10396 rtx callee_abi
= aarch64_gen_callee_cookie (isa_mode
, pcs_variant
, ir
);
10397 insn
= emit_call_insn (gen_sibcall (funexp
, const0_rtx
, callee_abi
));
10398 SIBLING_CALL_P (insn
) = 1;
10400 insn
= get_insns ();
10401 shorten_branches (insn
);
10403 assemble_start_function (thunk
, fnname
);
10404 final_start_function (insn
, file
, 1);
10405 final (insn
, file
, 1);
10406 final_end_function ();
10407 assemble_end_function (thunk
, fnname
);
10409 /* Stop pretending to be a post-reload pass. */
10410 reload_completed
= 0;
10414 aarch64_tls_referenced_p (rtx x
)
10416 if (!TARGET_HAVE_TLS
)
10418 subrtx_iterator::array_type array
;
10419 FOR_EACH_SUBRTX (iter
, array
, x
, ALL
)
10421 const_rtx x
= *iter
;
10422 if (SYMBOL_REF_P (x
) && SYMBOL_REF_TLS_MODEL (x
) != 0)
10424 /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
10425 TLS offsets, not real symbol references. */
10426 if (GET_CODE (x
) == UNSPEC
&& XINT (x
, 1) == UNSPEC_TLS
)
10427 iter
.skip_subrtxes ();
10434 aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED
, rtx x
)
10436 if (GET_CODE (x
) == HIGH
)
10439 /* There's no way to calculate VL-based values using relocations. */
10440 subrtx_iterator::array_type array
;
10441 HOST_WIDE_INT factor
;
10442 FOR_EACH_SUBRTX (iter
, array
, x
, ALL
)
10443 if (GET_CODE (*iter
) == CONST_POLY_INT
10444 || aarch64_sme_vq_unspec_p (x
, &factor
))
10448 rtx base
= strip_offset_and_salt (x
, &offset
);
10449 if (SYMBOL_REF_P (base
) || LABEL_REF_P (base
))
10451 /* We checked for POLY_INT_CST offsets above. */
10452 if (aarch64_classify_symbol (base
, offset
.to_constant ())
10453 != SYMBOL_FORCE_TO_MEM
)
10456 /* Avoid generating a 64-bit relocation in ILP32; leave
10457 to aarch64_expand_mov_immediate to handle it properly. */
10458 return mode
!= ptr_mode
;
10461 return aarch64_tls_referenced_p (x
);
10464 /* Implement TARGET_CASE_VALUES_THRESHOLD.
10465 The expansion for a table switch is quite expensive due to the number
10466 of instructions, the table lookup and hard to predict indirect jump.
10467 When optimizing for speed, and -O3 enabled, use the per-core tuning if
10468 set, otherwise use tables for >= 11 cases as a tradeoff between size and
10469 performance. When optimizing for size, use 8 for smallest codesize. */
10471 static unsigned int
10472 aarch64_case_values_threshold (void)
10474 /* Use the specified limit for the number of cases before using jump
10475 tables at higher optimization levels. */
10477 && aarch64_tune_params
.max_case_values
!= 0)
10478 return aarch64_tune_params
.max_case_values
;
10480 return optimize_size
? 8 : 11;
10483 /* Return true if register REGNO is a valid index register.
10484 STRICT_P is true if REG_OK_STRICT is in effect. */
10487 aarch64_regno_ok_for_index_p (int regno
, bool strict_p
)
10489 if (!HARD_REGISTER_NUM_P (regno
))
10497 regno
= reg_renumber
[regno
];
10499 return GP_REGNUM_P (regno
);
10502 /* Return true if register REGNO is a valid base register for mode MODE.
10503 STRICT_P is true if REG_OK_STRICT is in effect. */
10506 aarch64_regno_ok_for_base_p (int regno
, bool strict_p
)
10508 if (!HARD_REGISTER_NUM_P (regno
))
10516 regno
= reg_renumber
[regno
];
10519 /* The fake registers will be eliminated to either the stack or
10520 hard frame pointer, both of which are usually valid base registers.
10521 Reload deals with the cases where the eliminated form isn't valid. */
10522 return (GP_REGNUM_P (regno
)
10523 || regno
== SP_REGNUM
10524 || regno
== FRAME_POINTER_REGNUM
10525 || regno
== ARG_POINTER_REGNUM
);
10528 /* Return true if X is a valid base register for mode MODE.
10529 STRICT_P is true if REG_OK_STRICT is in effect. */
10532 aarch64_base_register_rtx_p (rtx x
, bool strict_p
)
10536 && contains_reg_of_mode
[GENERAL_REGS
][GET_MODE (SUBREG_REG (x
))])
10537 x
= SUBREG_REG (x
);
10539 return (REG_P (x
) && aarch64_regno_ok_for_base_p (REGNO (x
), strict_p
));
10542 /* Return true if address offset is a valid index. If it is, fill in INFO
10543 appropriately. STRICT_P is true if REG_OK_STRICT is in effect. */
10546 aarch64_classify_index (struct aarch64_address_info
*info
, rtx x
,
10547 machine_mode mode
, bool strict_p
)
10549 enum aarch64_address_type type
;
10554 if ((REG_P (x
) || SUBREG_P (x
))
10555 && GET_MODE (x
) == Pmode
)
10557 type
= ADDRESS_REG_REG
;
10561 /* (sign_extend:DI (reg:SI)) */
10562 else if ((GET_CODE (x
) == SIGN_EXTEND
10563 || GET_CODE (x
) == ZERO_EXTEND
)
10564 && GET_MODE (x
) == DImode
10565 && GET_MODE (XEXP (x
, 0)) == SImode
)
10567 type
= (GET_CODE (x
) == SIGN_EXTEND
)
10568 ? ADDRESS_REG_SXTW
: ADDRESS_REG_UXTW
;
10569 index
= XEXP (x
, 0);
10572 /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
10573 else if (GET_CODE (x
) == MULT
10574 && (GET_CODE (XEXP (x
, 0)) == SIGN_EXTEND
10575 || GET_CODE (XEXP (x
, 0)) == ZERO_EXTEND
)
10576 && GET_MODE (XEXP (x
, 0)) == DImode
10577 && GET_MODE (XEXP (XEXP (x
, 0), 0)) == SImode
10578 && CONST_INT_P (XEXP (x
, 1)))
10580 type
= (GET_CODE (XEXP (x
, 0)) == SIGN_EXTEND
)
10581 ? ADDRESS_REG_SXTW
: ADDRESS_REG_UXTW
;
10582 index
= XEXP (XEXP (x
, 0), 0);
10583 shift
= exact_log2 (INTVAL (XEXP (x
, 1)));
10585 /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
10586 else if (GET_CODE (x
) == ASHIFT
10587 && (GET_CODE (XEXP (x
, 0)) == SIGN_EXTEND
10588 || GET_CODE (XEXP (x
, 0)) == ZERO_EXTEND
)
10589 && GET_MODE (XEXP (x
, 0)) == DImode
10590 && GET_MODE (XEXP (XEXP (x
, 0), 0)) == SImode
10591 && CONST_INT_P (XEXP (x
, 1)))
10593 type
= (GET_CODE (XEXP (x
, 0)) == SIGN_EXTEND
)
10594 ? ADDRESS_REG_SXTW
: ADDRESS_REG_UXTW
;
10595 index
= XEXP (XEXP (x
, 0), 0);
10596 shift
= INTVAL (XEXP (x
, 1));
10598 /* (and:DI (mult:DI (reg:DI) (const_int scale))
10599 (const_int 0xffffffff<<shift)) */
10600 else if (GET_CODE (x
) == AND
10601 && GET_MODE (x
) == DImode
10602 && GET_CODE (XEXP (x
, 0)) == MULT
10603 && GET_MODE (XEXP (XEXP (x
, 0), 0)) == DImode
10604 && CONST_INT_P (XEXP (XEXP (x
, 0), 1))
10605 && CONST_INT_P (XEXP (x
, 1)))
10607 type
= ADDRESS_REG_UXTW
;
10608 index
= XEXP (XEXP (x
, 0), 0);
10609 shift
= exact_log2 (INTVAL (XEXP (XEXP (x
, 0), 1)));
10610 /* Avoid undefined code dealing with shift being -1. */
10612 && INTVAL (XEXP (x
, 1)) != (HOST_WIDE_INT
)0xffffffff << shift
)
10615 /* (and:DI (ashift:DI (reg:DI) (const_int shift))
10616 (const_int 0xffffffff<<shift)) */
10617 else if (GET_CODE (x
) == AND
10618 && GET_MODE (x
) == DImode
10619 && GET_CODE (XEXP (x
, 0)) == ASHIFT
10620 && GET_MODE (XEXP (XEXP (x
, 0), 0)) == DImode
10621 && CONST_INT_P (XEXP (XEXP (x
, 0), 1))
10622 && CONST_INT_P (XEXP (x
, 1)))
10624 type
= ADDRESS_REG_UXTW
;
10625 index
= XEXP (XEXP (x
, 0), 0);
10626 shift
= INTVAL (XEXP (XEXP (x
, 0), 1));
10627 if (INTVAL (XEXP (x
, 1)) != (HOST_WIDE_INT
)0xffffffff << shift
)
10630 /* (mult:P (reg:P) (const_int scale)) */
10631 else if (GET_CODE (x
) == MULT
10632 && GET_MODE (x
) == Pmode
10633 && GET_MODE (XEXP (x
, 0)) == Pmode
10634 && CONST_INT_P (XEXP (x
, 1)))
10636 type
= ADDRESS_REG_REG
;
10637 index
= XEXP (x
, 0);
10638 shift
= exact_log2 (INTVAL (XEXP (x
, 1)));
10640 /* (ashift:P (reg:P) (const_int shift)) */
10641 else if (GET_CODE (x
) == ASHIFT
10642 && GET_MODE (x
) == Pmode
10643 && GET_MODE (XEXP (x
, 0)) == Pmode
10644 && CONST_INT_P (XEXP (x
, 1)))
10646 type
= ADDRESS_REG_REG
;
10647 index
= XEXP (x
, 0);
10648 shift
= INTVAL (XEXP (x
, 1));
10654 && SUBREG_P (index
)
10655 && contains_reg_of_mode
[GENERAL_REGS
][GET_MODE (SUBREG_REG (index
))])
10656 index
= SUBREG_REG (index
);
10658 auto vec_flags
= aarch64_classify_vector_memory_mode (mode
);
10659 if (vec_flags
& VEC_SVE_DATA
)
10661 if (type
!= ADDRESS_REG_REG
10662 || (1 << shift
) != GET_MODE_UNIT_SIZE (mode
))
10668 && !(IN_RANGE (shift
, 1, 3)
10669 && known_eq (1 << shift
, GET_MODE_SIZE (mode
))))
10674 && aarch64_regno_ok_for_index_p (REGNO (index
), strict_p
))
10677 info
->offset
= index
;
10678 info
->shift
= shift
;
10685 /* Return true if MODE is one of the modes for which we
10686 support LDP/STP operations. */
10689 aarch64_mode_valid_for_sched_fusion_p (machine_mode mode
)
10691 return mode
== SImode
|| mode
== DImode
10692 || mode
== SFmode
|| mode
== DFmode
10693 || mode
== SDmode
|| mode
== DDmode
10694 || (aarch64_vector_mode_supported_p (mode
)
10695 && (known_eq (GET_MODE_SIZE (mode
), 8)
10696 || known_eq (GET_MODE_SIZE (mode
), 16)));
10699 /* Return true if REGNO is a virtual pointer register, or an eliminable
10700 "soft" frame register. Like REGNO_PTR_FRAME_P except that we don't
10701 include stack_pointer or hard_frame_pointer. */
10703 virt_or_elim_regno_p (unsigned regno
)
10705 return ((regno
>= FIRST_VIRTUAL_REGISTER
10706 && regno
<= LAST_VIRTUAL_POINTER_REGISTER
)
10707 || regno
== FRAME_POINTER_REGNUM
10708 || regno
== ARG_POINTER_REGNUM
);
10711 /* Return true if X is a valid address of type TYPE for machine mode MODE.
10712 If it is, fill in INFO appropriately. STRICT_P is true if
10713 REG_OK_STRICT is in effect. */
10716 aarch64_classify_address (struct aarch64_address_info
*info
,
10717 rtx x
, machine_mode mode
, bool strict_p
,
10718 aarch64_addr_query_type type
)
10720 enum rtx_code code
= GET_CODE (x
);
10724 HOST_WIDE_INT const_size
;
10726 /* Whether a vector mode is partial doesn't affect address legitimacy.
10727 Partial vectors like VNx8QImode allow the same indexed addressing
10728 mode and MUL VL addressing mode as full vectors like VNx16QImode;
10729 in both cases, MUL VL counts multiples of GET_MODE_SIZE. */
10730 unsigned int vec_flags
= aarch64_classify_vector_memory_mode (mode
);
10731 vec_flags
&= ~VEC_PARTIAL
;
10733 /* We use load/store pair for all large int mode load/stores.
10734 TI/TF/TDmode may also use a load/store pair. */
10735 bool advsimd_struct_p
= (vec_flags
== (VEC_ADVSIMD
| VEC_STRUCT
));
10736 bool load_store_pair_p
= (type
== ADDR_QUERY_LDP_STP
10737 || type
== ADDR_QUERY_LDP_STP_N
10741 || advsimd_struct_p
);
10742 /* If we are dealing with ADDR_QUERY_LDP_STP_N that means the incoming mode
10743 corresponds to the actual size of the memory being loaded/stored and the
10744 mode of the corresponding addressing mode is half of that. */
10745 if (type
== ADDR_QUERY_LDP_STP_N
)
10747 if (known_eq (GET_MODE_SIZE (mode
), 32))
10749 else if (known_eq (GET_MODE_SIZE (mode
), 16))
10751 else if (known_eq (GET_MODE_SIZE (mode
), 8))
10756 /* This isn't really an Advanced SIMD struct mode, but a mode
10757 used to represent the complete mem in a load/store pair. */
10758 advsimd_struct_p
= false;
10761 bool allow_reg_index_p
= (!load_store_pair_p
10762 && ((vec_flags
== 0
10763 && known_lt (GET_MODE_SIZE (mode
), 16))
10764 || vec_flags
== VEC_ADVSIMD
10765 || vec_flags
& VEC_SVE_DATA
));
10767 /* For SVE, only accept [Rn], [Rn, #offset, MUL VL] and [Rn, Rm, LSL #shift].
10768 The latter is not valid for SVE predicates, and that's rejected through
10769 allow_reg_index_p above. */
10770 if ((vec_flags
& (VEC_SVE_DATA
| VEC_SVE_PRED
)) != 0
10771 && (code
!= REG
&& code
!= PLUS
))
10774 gcc_checking_assert (GET_MODE (x
) == VOIDmode
10775 || SCALAR_INT_MODE_P (GET_MODE (x
)));
10781 info
->type
= ADDRESS_REG_IMM
;
10783 info
->offset
= const0_rtx
;
10784 info
->const_offset
= 0;
10785 return aarch64_base_register_rtx_p (x
, strict_p
);
10793 && virt_or_elim_regno_p (REGNO (op0
))
10794 && poly_int_rtx_p (op1
, &offset
))
10796 info
->type
= ADDRESS_REG_IMM
;
10798 info
->offset
= op1
;
10799 info
->const_offset
= offset
;
10804 if (maybe_ne (GET_MODE_SIZE (mode
), 0)
10805 && aarch64_base_register_rtx_p (op0
, strict_p
)
10806 && poly_int_rtx_p (op1
, &offset
))
10808 info
->type
= ADDRESS_REG_IMM
;
10810 info
->offset
= op1
;
10811 info
->const_offset
= offset
;
10813 /* TImode, TFmode and TDmode values are allowed in both pairs of X
10814 registers and individual Q registers. The available
10816 X,X: 7-bit signed scaled offset
10817 Q: 9-bit signed offset
10818 We conservatively require an offset representable in either mode.
10819 When performing the check for pairs of X registers i.e. LDP/STP
10820 pass down DImode since that is the natural size of the LDP/STP
10821 instruction memory accesses. */
10822 if (mode
== TImode
|| mode
== TFmode
|| mode
== TDmode
)
10823 return (aarch64_offset_7bit_signed_scaled_p (DImode
, offset
)
10824 && (aarch64_offset_9bit_signed_unscaled_p (mode
, offset
)
10825 || offset_12bit_unsigned_scaled_p (mode
, offset
)));
10827 if (mode
== V8DImode
)
10828 return (aarch64_offset_7bit_signed_scaled_p (DImode
, offset
)
10829 && aarch64_offset_7bit_signed_scaled_p (DImode
, offset
+ 48));
10831 /* A 7bit offset check because OImode will emit a ldp/stp
10832 instruction (only !TARGET_SIMD or big endian will get here).
10833 For ldp/stp instructions, the offset is scaled for the size of a
10834 single element of the pair. */
10835 if (aarch64_advsimd_partial_struct_mode_p (mode
)
10836 && known_eq (GET_MODE_SIZE (mode
), 16))
10837 return aarch64_offset_7bit_signed_scaled_p (DImode
, offset
);
10838 if (aarch64_advsimd_full_struct_mode_p (mode
)
10839 && known_eq (GET_MODE_SIZE (mode
), 32))
10840 return aarch64_offset_7bit_signed_scaled_p (TImode
, offset
);
10842 /* Three 9/12 bit offsets checks because CImode will emit three
10843 ldr/str instructions (only !TARGET_SIMD or big endian will
10845 if (aarch64_advsimd_partial_struct_mode_p (mode
)
10846 && known_eq (GET_MODE_SIZE (mode
), 24))
10847 return (aarch64_offset_7bit_signed_scaled_p (DImode
, offset
)
10848 && (aarch64_offset_9bit_signed_unscaled_p (DImode
,
10850 || offset_12bit_unsigned_scaled_p (DImode
,
10852 if (aarch64_advsimd_full_struct_mode_p (mode
)
10853 && known_eq (GET_MODE_SIZE (mode
), 48))
10854 return (aarch64_offset_7bit_signed_scaled_p (TImode
, offset
)
10855 && (aarch64_offset_9bit_signed_unscaled_p (TImode
,
10857 || offset_12bit_unsigned_scaled_p (TImode
,
10860 /* Two 7bit offsets checks because XImode will emit two ldp/stp
10861 instructions (only big endian will get here). */
10862 if (aarch64_advsimd_partial_struct_mode_p (mode
)
10863 && known_eq (GET_MODE_SIZE (mode
), 32))
10864 return (aarch64_offset_7bit_signed_scaled_p (DImode
, offset
)
10865 && aarch64_offset_7bit_signed_scaled_p (DImode
,
10867 if (aarch64_advsimd_full_struct_mode_p (mode
)
10868 && known_eq (GET_MODE_SIZE (mode
), 64))
10869 return (aarch64_offset_7bit_signed_scaled_p (TImode
, offset
)
10870 && aarch64_offset_7bit_signed_scaled_p (TImode
,
10873 /* Make "m" use the LD1 offset range for SVE data modes, so
10874 that pre-RTL optimizers like ivopts will work to that
10875 instead of the wider LDR/STR range. */
10876 if (vec_flags
== VEC_SVE_DATA
)
10877 return (type
== ADDR_QUERY_M
10878 ? offset_4bit_signed_scaled_p (mode
, offset
)
10879 : offset_9bit_signed_scaled_p (mode
, offset
));
10881 if (vec_flags
== (VEC_SVE_DATA
| VEC_STRUCT
))
10883 poly_int64 end_offset
= (offset
10884 + GET_MODE_SIZE (mode
)
10885 - BYTES_PER_SVE_VECTOR
);
10886 return (type
== ADDR_QUERY_M
10887 ? offset_4bit_signed_scaled_p (mode
, offset
)
10888 : (offset_9bit_signed_scaled_p (SVE_BYTE_MODE
, offset
)
10889 && offset_9bit_signed_scaled_p (SVE_BYTE_MODE
,
10893 if (vec_flags
== VEC_SVE_PRED
)
10894 return offset_9bit_signed_scaled_p (mode
, offset
);
10896 if (vec_flags
== (VEC_SVE_PRED
| VEC_STRUCT
))
10898 poly_int64 end_offset
= (offset
10899 + GET_MODE_SIZE (mode
)
10900 - BYTES_PER_SVE_PRED
);
10901 return (offset_9bit_signed_scaled_p (VNx16BImode
, end_offset
)
10902 && offset_9bit_signed_scaled_p (VNx16BImode
, offset
));
10905 if (load_store_pair_p
)
10906 return ((known_eq (GET_MODE_SIZE (mode
), 4)
10907 || known_eq (GET_MODE_SIZE (mode
), 8)
10908 || known_eq (GET_MODE_SIZE (mode
), 16))
10909 && aarch64_offset_7bit_signed_scaled_p (mode
, offset
));
10911 return (aarch64_offset_9bit_signed_unscaled_p (mode
, offset
)
10912 || offset_12bit_unsigned_scaled_p (mode
, offset
));
10915 if (allow_reg_index_p
)
10917 /* Look for base + (scaled/extended) index register. */
10918 if (aarch64_base_register_rtx_p (op0
, strict_p
)
10919 && aarch64_classify_index (info
, op1
, mode
, strict_p
))
10924 if (aarch64_base_register_rtx_p (op1
, strict_p
)
10925 && aarch64_classify_index (info
, op0
, mode
, strict_p
))
10938 info
->type
= ADDRESS_REG_WB
;
10939 info
->base
= XEXP (x
, 0);
10940 info
->offset
= NULL_RTX
;
10941 return aarch64_base_register_rtx_p (info
->base
, strict_p
);
10945 info
->type
= ADDRESS_REG_WB
;
10946 info
->base
= XEXP (x
, 0);
10947 if (GET_CODE (XEXP (x
, 1)) == PLUS
10948 && poly_int_rtx_p (XEXP (XEXP (x
, 1), 1), &offset
)
10949 && rtx_equal_p (XEXP (XEXP (x
, 1), 0), info
->base
)
10950 && aarch64_base_register_rtx_p (info
->base
, strict_p
))
10952 info
->offset
= XEXP (XEXP (x
, 1), 1);
10953 info
->const_offset
= offset
;
10955 /* TImode, TFmode and TDmode values are allowed in both pairs of X
10956 registers and individual Q registers. The available
10958 X,X: 7-bit signed scaled offset
10959 Q: 9-bit signed offset
10960 We conservatively require an offset representable in either mode.
10962 if (mode
== TImode
|| mode
== TFmode
|| mode
== TDmode
)
10963 return (aarch64_offset_7bit_signed_scaled_p (mode
, offset
)
10964 && aarch64_offset_9bit_signed_unscaled_p (mode
, offset
));
10966 if (load_store_pair_p
)
10967 return ((known_eq (GET_MODE_SIZE (mode
), 4)
10968 || known_eq (GET_MODE_SIZE (mode
), 8)
10969 || known_eq (GET_MODE_SIZE (mode
), 16))
10970 && aarch64_offset_7bit_signed_scaled_p (mode
, offset
));
10972 return aarch64_offset_9bit_signed_unscaled_p (mode
, offset
);
10979 /* load literal: pc-relative constant pool entry. Only supported
10980 for SI mode or larger. */
10981 info
->type
= ADDRESS_SYMBOLIC
;
10983 if (!load_store_pair_p
10984 && GET_MODE_SIZE (mode
).is_constant (&const_size
)
10985 && const_size
>= 4)
10988 rtx sym
= strip_offset_and_salt (x
, &offset
);
10989 return ((LABEL_REF_P (sym
)
10990 || (SYMBOL_REF_P (sym
)
10991 && CONSTANT_POOL_ADDRESS_P (sym
)
10992 && aarch64_pcrelative_literal_loads
)));
10997 info
->type
= ADDRESS_LO_SUM
;
10998 info
->base
= XEXP (x
, 0);
10999 info
->offset
= XEXP (x
, 1);
11000 if (allow_reg_index_p
11001 && aarch64_base_register_rtx_p (info
->base
, strict_p
))
11004 HOST_WIDE_INT const_offset
;
11005 rtx sym
= strip_offset_and_salt (info
->offset
, &offset
);
11006 if (SYMBOL_REF_P (sym
)
11007 && offset
.is_constant (&const_offset
)
11008 && (aarch64_classify_symbol (sym
, const_offset
)
11009 == SYMBOL_SMALL_ABSOLUTE
))
11011 /* The symbol and offset must be aligned to the access size. */
11012 unsigned int align
;
11014 if (CONSTANT_POOL_ADDRESS_P (sym
))
11015 align
= GET_MODE_ALIGNMENT (get_pool_mode (sym
));
11016 else if (TREE_CONSTANT_POOL_ADDRESS_P (sym
))
11018 tree exp
= SYMBOL_REF_DECL (sym
);
11019 align
= TYPE_ALIGN (TREE_TYPE (exp
));
11020 align
= aarch64_constant_alignment (exp
, align
);
11022 else if (SYMBOL_REF_DECL (sym
))
11023 align
= DECL_ALIGN (SYMBOL_REF_DECL (sym
));
11024 else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym
)
11025 && SYMBOL_REF_BLOCK (sym
) != NULL
)
11026 align
= SYMBOL_REF_BLOCK (sym
)->alignment
;
11028 align
= BITS_PER_UNIT
;
11030 poly_int64 ref_size
= GET_MODE_SIZE (mode
);
11031 if (known_eq (ref_size
, 0))
11032 ref_size
= GET_MODE_SIZE (DImode
);
11034 return (multiple_p (const_offset
, ref_size
)
11035 && multiple_p (align
/ BITS_PER_UNIT
, ref_size
));
11045 /* Return true if the address X is valid for a PRFM instruction.
11046 STRICT_P is true if we should do strict checking with
11047 aarch64_classify_address. */
11050 aarch64_address_valid_for_prefetch_p (rtx x
, bool strict_p
)
11052 struct aarch64_address_info addr
;
11054 /* PRFM accepts the same addresses as DImode... */
11055 bool res
= aarch64_classify_address (&addr
, x
, DImode
, strict_p
);
11059 /* ... except writeback forms. */
11060 return addr
.type
!= ADDRESS_REG_WB
;
11064 aarch64_symbolic_address_p (rtx x
)
11067 x
= strip_offset_and_salt (x
, &offset
);
11068 return SYMBOL_REF_P (x
) || LABEL_REF_P (x
);
11071 /* Classify the base of symbolic expression X. */
11073 enum aarch64_symbol_type
11074 aarch64_classify_symbolic_expression (rtx x
)
11078 split_const (x
, &x
, &offset
);
11079 return aarch64_classify_symbol (x
, INTVAL (offset
));
11083 /* Return TRUE if X is a legitimate address for accessing memory in
11086 aarch64_legitimate_address_hook_p (machine_mode mode
, rtx x
, bool strict_p
,
11087 code_helper
= ERROR_MARK
)
11089 struct aarch64_address_info addr
;
11091 return aarch64_classify_address (&addr
, x
, mode
, strict_p
);
11094 /* Return TRUE if X is a legitimate address of type TYPE for accessing
11095 memory in mode MODE. STRICT_P is true if REG_OK_STRICT is in effect. */
11097 aarch64_legitimate_address_p (machine_mode mode
, rtx x
, bool strict_p
,
11098 aarch64_addr_query_type type
)
11100 struct aarch64_address_info addr
;
11102 return aarch64_classify_address (&addr
, x
, mode
, strict_p
, type
);
11105 /* Implement TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT. */
11108 aarch64_legitimize_address_displacement (rtx
*offset1
, rtx
*offset2
,
11109 poly_int64 orig_offset
,
11112 HOST_WIDE_INT size
;
11113 if (GET_MODE_SIZE (mode
).is_constant (&size
))
11115 HOST_WIDE_INT const_offset
, second_offset
;
11117 /* A general SVE offset is A * VQ + B. Remove the A component from
11118 coefficient 0 in order to get the constant B. */
11119 const_offset
= orig_offset
.coeffs
[0] - orig_offset
.coeffs
[1];
11121 /* Split an out-of-range address displacement into a base and
11122 offset. Use 4KB range for 1- and 2-byte accesses and a 16KB
11123 range otherwise to increase opportunities for sharing the base
11124 address of different sizes. Unaligned accesses use the signed
11125 9-bit range, TImode/TFmode/TDmode use the intersection of signed
11126 scaled 7-bit and signed 9-bit offset. */
11127 if (mode
== TImode
|| mode
== TFmode
|| mode
== TDmode
)
11128 second_offset
= ((const_offset
+ 0x100) & 0x1f8) - 0x100;
11129 else if ((const_offset
& (size
- 1)) != 0)
11130 second_offset
= ((const_offset
+ 0x100) & 0x1ff) - 0x100;
11132 second_offset
= const_offset
& (size
< 4 ? 0xfff : 0x3ffc);
11134 if (second_offset
== 0 || known_eq (orig_offset
, second_offset
))
11137 /* Split the offset into second_offset and the rest. */
11138 *offset1
= gen_int_mode (orig_offset
- second_offset
, Pmode
);
11139 *offset2
= gen_int_mode (second_offset
, Pmode
);
11144 /* Get the mode we should use as the basis of the range. For structure
11145 modes this is the mode of one vector. */
11146 unsigned int vec_flags
= aarch64_classify_vector_mode (mode
);
11147 machine_mode step_mode
11148 = (vec_flags
& VEC_STRUCT
) != 0 ? SVE_BYTE_MODE
: mode
;
11150 /* Get the "mul vl" multiplier we'd like to use. */
11151 HOST_WIDE_INT factor
= GET_MODE_SIZE (step_mode
).coeffs
[1];
11152 HOST_WIDE_INT vnum
= orig_offset
.coeffs
[1] / factor
;
11153 if (vec_flags
& VEC_SVE_DATA
)
11154 /* LDR supports a 9-bit range, but the move patterns for
11155 structure modes require all vectors to be in range of the
11156 same base. The simplest way of accomodating that while still
11157 promoting reuse of anchor points between different modes is
11158 to use an 8-bit range unconditionally. */
11159 vnum
= ((vnum
+ 128) & 255) - 128;
11161 /* Predicates are only handled singly, so we might as well use
11163 vnum
= ((vnum
+ 256) & 511) - 256;
11167 /* Convert the "mul vl" multiplier into a byte offset. */
11168 poly_int64 second_offset
= GET_MODE_SIZE (step_mode
) * vnum
;
11169 if (known_eq (second_offset
, orig_offset
))
11172 /* Split the offset into second_offset and the rest. */
11173 *offset1
= gen_int_mode (orig_offset
- second_offset
, Pmode
);
11174 *offset2
= gen_int_mode (second_offset
, Pmode
);
11179 /* Return the binary representation of floating point constant VALUE in INTVAL.
11180 If the value cannot be converted, return false without setting INTVAL.
11181 The conversion is done in the given MODE. */
11183 aarch64_reinterpret_float_as_int (rtx value
, unsigned HOST_WIDE_INT
*intval
)
11186 /* We make a general exception for 0. */
11187 if (aarch64_float_const_zero_rtx_p (value
))
11193 scalar_float_mode mode
;
11194 if (!CONST_DOUBLE_P (value
)
11195 || !is_a
<scalar_float_mode
> (GET_MODE (value
), &mode
)
11196 || GET_MODE_BITSIZE (mode
) > HOST_BITS_PER_WIDE_INT
11197 /* Only support up to DF mode. */
11198 || GET_MODE_BITSIZE (mode
) > GET_MODE_BITSIZE (DFmode
))
11201 unsigned HOST_WIDE_INT ival
= 0;
11204 real_to_target (res
,
11205 CONST_DOUBLE_REAL_VALUE (value
),
11206 REAL_MODE_FORMAT (mode
));
11208 if (mode
== DFmode
|| mode
== DDmode
)
11210 int order
= BYTES_BIG_ENDIAN
? 1 : 0;
11211 ival
= zext_hwi (res
[order
], 32);
11212 ival
|= (zext_hwi (res
[1 - order
], 32) << 32);
11215 ival
= zext_hwi (res
[0], 32);
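11215 /* For example, the SFmode constant 1.0 has the single-word image
11215 0x3f800000, while the DFmode constant 1.0 yields 0x3ff0000000000000,
11215 assembled from the two 32-bit halves in the order chosen by
11215 BYTES_BIG_ENDIAN above. */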
11221 /* Return TRUE if rtx X is an immediate constant that can be moved using a
11222 single MOV(+MOVK) followed by an FMOV. */
11224 aarch64_float_const_rtx_p (rtx x
)
11226 machine_mode mode
= GET_MODE (x
);
11227 if (mode
== VOIDmode
)
11230 /* Determine whether it's cheaper to write float constants as
11231 mov/movk pairs over ldr/adrp pairs. */
11232 unsigned HOST_WIDE_INT ival
;
11234 if (CONST_DOUBLE_P (x
)
11235 && SCALAR_FLOAT_MODE_P (mode
)
11236 && aarch64_reinterpret_float_as_int (x
, &ival
))
11238 machine_mode imode
= known_eq (GET_MODE_SIZE (mode
), 8) ? DImode
: SImode
;
11239 int num_instr
= aarch64_internal_mov_immediate
11240 (NULL_RTX
, gen_int_mode (ival
, imode
), false, imode
);
11241 return num_instr
< 3;
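11241 /* For example, 1.0 in DFmode has the integer image 0x3ff0000000000000,
11241 which a single MOVZ (#0x3ff0, LSL #48) can materialize, so it is
11241 considered cheaper than an ADRP/LDR literal load; images that would
11241 need three or more integer moves are rejected here. */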
11247 /* Return TRUE if rtx X is immediate constant 0.0 (but not in Decimal
11248 Floating Point). */
11250 aarch64_float_const_zero_rtx_p (rtx x
)
11252 /* 0.0 in Decimal Floating Point cannot be represented by #0 or
11253 zr as our callers expect, so no need to check the actual
11254 value if X is of Decimal Floating Point type. */
11255 if (GET_MODE_CLASS (GET_MODE (x
)) == MODE_DECIMAL_FLOAT
)
11258 if (REAL_VALUE_MINUS_ZERO (*CONST_DOUBLE_REAL_VALUE (x
)))
11259 return !HONOR_SIGNED_ZEROS (GET_MODE (x
));
11260 return real_equal (CONST_DOUBLE_REAL_VALUE (x
), &dconst0
);
11263 /* Return true if X is any kind of constant zero rtx. */
11266 aarch64_const_zero_rtx_p (rtx x
)
11268 return (x
== CONST0_RTX (GET_MODE (x
))
11269 || (CONST_DOUBLE_P (x
) && aarch64_float_const_zero_rtx_p (x
)));
11272 /* Return TRUE if rtx X is an immediate constant that fits in a single
11273 MOVI immediate operation. */
11275 aarch64_can_const_movi_rtx_p (rtx x
, machine_mode mode
)
11280 machine_mode vmode
;
11281 scalar_int_mode imode
;
11282 unsigned HOST_WIDE_INT ival
;
11284 if (CONST_DOUBLE_P (x
)
11285 && SCALAR_FLOAT_MODE_P (mode
))
11287 if (!aarch64_reinterpret_float_as_int (x
, &ival
))
11290 /* We make a general exception for 0. */
11291 if (aarch64_float_const_zero_rtx_p (x
))
11294 imode
= int_mode_for_mode (mode
).require ();
11296 else if (CONST_INT_P (x
)
11297 && is_a
<scalar_int_mode
> (mode
, &imode
))
11302 /* Use a 64-bit vector mode for everything except DImode/DFmode/DDmode,
11303 where we use a 128-bit vector mode. */
11304 int width
= GET_MODE_BITSIZE (imode
) == 64 ? 128 : 64;
11306 vmode
= aarch64_simd_container_mode (imode
, width
);
11307 rtx v_op
= aarch64_simd_gen_const_vector_dup (vmode
, ival
);
11309 return aarch64_simd_valid_mov_imm (v_op
);
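11309 /* Illustrative example: the SFmode constant -0.0 has the integer image
11309 0x80000000; replicated into a 64-bit vector it matches the MOVI
11309 encoding imm8 = 0x80, LSL #24, so a single MOVI can materialize it. */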
11312 /* Return TRUE if moving SRC to DST, both with mode MODE, is a valid FP move. */
11314 aarch64_valid_fp_move (rtx dst
, rtx src
, machine_mode mode
)
11319 if (aarch64_reg_or_fp_zero (src
, mode
))
11322 if (!register_operand (dst
, mode
))
11328 if (!DECIMAL_FLOAT_MODE_P (mode
))
11330 if (aarch64_can_const_movi_rtx_p (src
, mode
)
11331 || aarch64_float_const_representable_p (src
)
11332 || aarch64_float_const_zero_rtx_p (src
))
11335 /* Block FP immediates which are split during expand. */
11336 if (aarch64_float_const_rtx_p (src
))
11340 return can_create_pseudo_p ();
11343 /* Return the fixed registers used for condition codes. */
11346 aarch64_fixed_condition_code_regs (unsigned int *p1
, unsigned int *p2
)
11349 *p2
= INVALID_REGNUM
;
11353 /* Return a fresh memory reference to the current function's TPIDR2 block,
11354 creating a block if necessary. */
11357 aarch64_get_tpidr2_block ()
11359 if (!cfun
->machine
->tpidr2_block
)
11360 /* The TPIDR2 block is 16 bytes in size and must be aligned to a 128-bit
11362 cfun
->machine
->tpidr2_block
= assign_stack_local (V16QImode
, 16, 128);
11363 return copy_rtx (cfun
->machine
->tpidr2_block
);
11366 /* Return a fresh register that points to the current function's
11367 TPIDR2 block, creating a block if necessary. */
11370 aarch64_get_tpidr2_ptr ()
11372 rtx block
= aarch64_get_tpidr2_block ();
11373 return force_reg (Pmode
, XEXP (block
, 0));
11376 /* Emit instructions to allocate a ZA lazy save buffer and initialize the
11377 current function's TPIDR2 block. */
11380 aarch64_init_tpidr2_block ()
11382 rtx block
= aarch64_get_tpidr2_block ();
11384 /* The ZA save buffer is SVL.B*SVL.B bytes in size. */
11385 rtx svl_bytes
= aarch64_sme_vq_immediate (Pmode
, 16, AARCH64_ISA_MODE
);
11386 rtx svl_bytes_reg
= force_reg (DImode
, svl_bytes
);
11387 rtx za_size
= expand_simple_binop (Pmode
, MULT
, svl_bytes_reg
,
11388 svl_bytes_reg
, NULL
, 0, OPTAB_LIB_WIDEN
);
11389 rtx za_save_buffer
= allocate_dynamic_stack_space (za_size
, 128,
11390 BITS_PER_UNIT
, -1, true);
11391 za_save_buffer
= force_reg (Pmode
, za_save_buffer
);
11392 cfun
->machine
->za_save_buffer
= za_save_buffer
;
11394 /* The first word of the block points to the save buffer and the second
11395 word is the number of ZA slices to save. */
11396 rtx block_0
= adjust_address (block
, DImode
, 0);
11397 emit_insn (aarch64_gen_store_pair (block_0
, za_save_buffer
, svl_bytes_reg
));
11399 if (!memory_operand (block
, V16QImode
))
11400 block
= replace_equiv_address (block
, force_reg (Pmode
, XEXP (block
, 0)));
11401 emit_insn (gen_aarch64_setup_local_tpidr2 (block
));
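11401 /* A sketch of the block initialized above (for illustration only):
11401
11401 struct tpidr2_block
11401 {
11401 void *za_save_buffer;          -- doubleword 0, set above
11401 uint64_t num_za_save_slices;   -- doubleword 1, SVL.B here
11401 };  */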
11404 /* Restore the contents of ZA from the lazy save buffer, given that
11405 register TPIDR2_BLOCK points to the current function's TPIDR2 block.
11406 PSTATE.ZA is known to be 0 and TPIDR2_EL0 is known to be null. */
11409 aarch64_restore_za (rtx tpidr2_block
)
11411 emit_insn (gen_aarch64_smstart_za ());
11412 if (REGNO (tpidr2_block
) != R0_REGNUM
)
11413 emit_move_insn (gen_rtx_REG (Pmode
, R0_REGNUM
), tpidr2_block
);
11414 emit_insn (gen_aarch64_tpidr2_restore ());
11417 /* Return the ZT0 save buffer, creating one if necessary. */
11420 aarch64_get_zt0_save_buffer ()
11422 if (!cfun
->machine
->zt0_save_buffer
)
11423 cfun
->machine
->zt0_save_buffer
= assign_stack_local (V8DImode
, 64, 128);
11424 return cfun
->machine
->zt0_save_buffer
;
11427 /* Save ZT0 to the current function's save buffer. */
11430 aarch64_save_zt0 ()
11432 rtx mem
= aarch64_get_zt0_save_buffer ();
11433 mem
= replace_equiv_address (mem
, force_reg (Pmode
, XEXP (mem
, 0)));
11434 emit_insn (gen_aarch64_sme_str_zt0 (mem
));
11437 /* Restore ZT0 from the current function's save buffer. FROM_LAZY_SAVE_P
11438 is true if the load is happening after a call to a private-ZA function,
11439 false if it can be treated as a normal load. */
11442 aarch64_restore_zt0 (bool from_lazy_save_p
)
11444 rtx mem
= aarch64_get_zt0_save_buffer ();
11445 mem
= replace_equiv_address (mem
, force_reg (Pmode
, XEXP (mem
, 0)));
11446 emit_insn (from_lazy_save_p
11447 ? gen_aarch64_restore_zt0 (mem
)
11448 : gen_aarch64_sme_ldr_zt0 (mem
));
11451 /* Implement TARGET_START_CALL_ARGS. */
11454 aarch64_start_call_args (cumulative_args_t ca_v
)
11456 CUMULATIVE_ARGS
*ca
= get_cumulative_args (ca_v
);
11458 if (!TARGET_SME
&& (ca
->isa_mode
& AARCH64_ISA_MODE_SM_ON
))
11460 error ("calling a streaming function requires the ISA extension %qs",
11462 inform (input_location
, "you can enable %qs using the command-line"
11463 " option %<-march%>, or by using the %<target%>"
11464 " attribute or pragma", "sme");
11467 if ((ca
->shared_za_flags
& (AARCH64_STATE_IN
| AARCH64_STATE_OUT
))
11468 && !aarch64_cfun_has_state ("za"))
11469 error ("call to a function that shares %qs state from a function"
11470 " that has no %qs state", "za", "za");
11471 else if ((ca
->shared_zt0_flags
& (AARCH64_STATE_IN
| AARCH64_STATE_OUT
))
11472 && !aarch64_cfun_has_state ("zt0"))
11473 error ("call to a function that shares %qs state from a function"
11474 " that has no %qs state", "zt0", "zt0");
11475 else if (!TARGET_ZA
&& (ca
->isa_mode
& AARCH64_ISA_MODE_ZA_ON
))
11476 error ("call to a function that shares SME state from a function"
11477 " that has no SME state");
11479 /* If this is a call to a private ZA function, emit a marker to
11480 indicate where any necessary set-up code could be inserted.
11481 The code itself is inserted by the mode-switching pass. */
11482 if (TARGET_ZA
&& !(ca
->isa_mode
& AARCH64_ISA_MODE_ZA_ON
))
11483 emit_insn (gen_aarch64_start_private_za_call ());
11485 /* If this is a call to a shared-ZA function that doesn't share ZT0,
11486 save and restore ZT0 around the call. */
11487 if (aarch64_cfun_has_state ("zt0")
11488 && (ca
->isa_mode
& AARCH64_ISA_MODE_ZA_ON
)
11489 && ca
->shared_zt0_flags
== 0)
11490 aarch64_save_zt0 ();
11493 /* This function is used by the call expanders of the machine description.
11494 RESULT is the register in which the result is returned. It's NULL for
11495 "call" and "sibcall".
11496 MEM is the location of the function call.
11498 - a const_int that gives the argument to the call's UNSPEC_CALLEE_ABI.
11499 - a PARALLEL that contains such a const_int as its first element.
11500 The second element is a PARALLEL that lists all the argument
11501 registers that need to be saved and restored around a change
11502 in PSTATE.SM, or const0_rtx if no such switch is needed.
11503 The third and fourth elements are const_ints that contain the
11504 sharing flags for ZA and ZT0 respectively.
11505 SIBCALL indicates whether this function call is a normal call or a sibling
11506 call; a different call pattern is generated accordingly. */
11509 aarch64_expand_call (rtx result
, rtx mem
, rtx cookie
, bool sibcall
)
11511 rtx call
, callee
, tmp
;
11515 rtx callee_abi
= cookie
;
11516 rtx sme_mode_switch_args
= const0_rtx
;
11517 unsigned int shared_za_flags
= 0;
11518 unsigned int shared_zt0_flags
= 0;
11519 if (GET_CODE (cookie
) == PARALLEL
)
11521 callee_abi
= XVECEXP (cookie
, 0, 0);
11522 sme_mode_switch_args
= XVECEXP (cookie
, 0, 1);
11523 shared_za_flags
= INTVAL (XVECEXP (cookie
, 0, 2));
11524 shared_zt0_flags
= INTVAL (XVECEXP (cookie
, 0, 3));
11527 gcc_assert (CONST_INT_P (callee_abi
));
11528 auto callee_isa_mode
= aarch64_callee_isa_mode (callee_abi
);
11530 if (aarch64_cfun_has_state ("za")
11531 && (callee_isa_mode
& AARCH64_ISA_MODE_ZA_ON
)
11532 && !shared_za_flags
)
11534 sorry ("call to a function that shares state other than %qs"
11535 " from a function that has %qs state", "za", "za");
11536 inform (input_location
, "use %<__arm_preserves(\"za\")%> if the"
11537 " callee preserves ZA");
11540 gcc_assert (MEM_P (mem
));
11541 callee
= XEXP (mem
, 0);
11544 tmp
= legitimize_pe_coff_symbol (callee
, false);
11549 mode
= GET_MODE (callee
);
11550 gcc_assert (mode
== Pmode
);
11552 /* Decide if we should generate indirect calls by loading the
11553 address of the callee into a register before performing
11554 the branch-and-link. */
11555 if (SYMBOL_REF_P (callee
)
11556 ? (aarch64_is_long_call_p (callee
)
11557 || aarch64_is_noplt_call_p (callee
))
11559 XEXP (mem
, 0) = force_reg (mode
, callee
);
11561 /* Accumulate the return values, including state that is shared via
11563 auto_vec
<rtx
, 8> return_values
;
11566 if (GET_CODE (result
) == PARALLEL
)
11567 for (int i
= 0; i
< XVECLEN (result
, 0); ++i
)
11568 return_values
.safe_push (XVECEXP (result
, 0, i
));
11570 return_values
.safe_push (result
);
11572 unsigned int orig_num_return_values
= return_values
.length ();
11573 if (shared_za_flags
& AARCH64_STATE_OUT
)
11574 return_values
.safe_push (gen_rtx_REG (VNx16BImode
, ZA_REGNUM
));
11575 /* When calling private-ZA functions from functions with ZA state,
11576 we want to know whether the call committed a lazy save. */
11577 if (TARGET_ZA
&& !shared_za_flags
)
11578 return_values
.safe_push (gen_rtx_REG (VNx16BImode
, ZA_SAVED_REGNUM
));
11579 if (shared_zt0_flags
& AARCH64_STATE_OUT
)
11580 return_values
.safe_push (gen_rtx_REG (V8DImode
, ZT0_REGNUM
));
11582 /* Create the new return value, if necessary. */
11583 if (orig_num_return_values
!= return_values
.length ())
11585 if (return_values
.length () == 1)
11586 result
= return_values
[0];
11589 for (rtx
&x
: return_values
)
11590 if (GET_CODE (x
) != EXPR_LIST
)
11591 x
= gen_rtx_EXPR_LIST (VOIDmode
, x
, const0_rtx
);
11592 rtvec v
= gen_rtvec_v (return_values
.length (),
11593 return_values
.address ());
11594 result
= gen_rtx_PARALLEL (VOIDmode
, v
);
11598 call
= gen_rtx_CALL (VOIDmode
, mem
, const0_rtx
);
11600 if (result
!= NULL_RTX
)
11601 call
= gen_rtx_SET (result
, call
);
11606 tmp
= gen_rtx_CLOBBER (VOIDmode
, gen_rtx_REG (Pmode
, LR_REGNUM
));
11608 callee_abi
= gen_rtx_UNSPEC (DImode
, gen_rtvec (1, callee_abi
),
11609 UNSPEC_CALLEE_ABI
);
11611 vec
= gen_rtvec (3, call
, callee_abi
, tmp
);
11612 call
= gen_rtx_PARALLEL (VOIDmode
, vec
);
11614 auto call_insn
= aarch64_emit_call_insn (call
);
11616 /* Check whether the call requires a change to PSTATE.SM. We can't
11617 emit the instructions to change PSTATE.SM yet, since they involve
11618 a change in vector length and a change in instruction set, which
11619 cannot be represented in RTL.
11621 For now, just record which registers will be clobbered and used
11622 by the changes to PSTATE.SM. */
11623 if (!sibcall
&& aarch64_call_switches_pstate_sm (callee_isa_mode
))
11625 aarch64_sme_mode_switch_regs args_switch
;
11626 if (sme_mode_switch_args
!= const0_rtx
)
11628 unsigned int num_args
= XVECLEN (sme_mode_switch_args
, 0);
11629 for (unsigned int i
= 0; i
< num_args
; ++i
)
11631 rtx x
= XVECEXP (sme_mode_switch_args
, 0, i
);
11632 args_switch
.add_reg (GET_MODE (x
), REGNO (x
));
11636 aarch64_sme_mode_switch_regs result_switch
;
11638 result_switch
.add_call_result (call_insn
);
11640 unsigned int num_gprs
= MAX (args_switch
.num_gprs (),
11641 result_switch
.num_gprs ());
11642 for (unsigned int i
= 0; i
< num_gprs
; ++i
)
11643 clobber_reg (&CALL_INSN_FUNCTION_USAGE (call_insn
),
11644 gen_rtx_REG (DImode
, args_switch
.FIRST_GPR
+ i
));
11646 for (int regno
= V0_REGNUM
; regno
< V0_REGNUM
+ 32; regno
+= 4)
11647 clobber_reg (&CALL_INSN_FUNCTION_USAGE (call_insn
),
11648 gen_rtx_REG (V4x16QImode
, regno
));
11650 for (int regno
= P0_REGNUM
; regno
< P0_REGNUM
+ 16; regno
+= 1)
11651 clobber_reg (&CALL_INSN_FUNCTION_USAGE (call_insn
),
11652 gen_rtx_REG (VNx16BImode
, regno
));
11654 /* Ensure that the VG save slot has been initialized. Also emit
11655 an instruction to model the effect of the temporary clobber
11656 of VG, so that the prologue/epilogue pass sees the need to
11657 save the old value. */
11658 use_reg (&CALL_INSN_FUNCTION_USAGE (call_insn
),
11659 gen_rtx_REG (DImode
, VG_REGNUM
));
11660 emit_insn_before (gen_aarch64_update_vg (), call_insn
);
11662 cfun
->machine
->call_switches_pstate_sm
= true;
11665 /* Add any ZA-related information.
11667 ZA_REGNUM represents the current function's ZA state, rather than
11668 the contents of the ZA register itself. We ensure that the function's
11669 ZA state is preserved by private-ZA call sequences, so the call itself
11670 does not use or clobber ZA_REGNUM. The same thing applies to
11674 /* The callee requires ZA to be active if the callee is shared-ZA,
11675 otherwise it requires ZA to be dormant or off. The state of ZA is
11676 captured by a combination of SME_STATE_REGNUM, TPIDR2_SETUP_REGNUM,
11677 and ZA_SAVED_REGNUM. */
11678 use_reg (&CALL_INSN_FUNCTION_USAGE (call_insn
),
11679 gen_rtx_REG (DImode
, SME_STATE_REGNUM
));
11680 use_reg (&CALL_INSN_FUNCTION_USAGE (call_insn
),
11681 gen_rtx_REG (DImode
, TPIDR2_SETUP_REGNUM
));
11682 use_reg (&CALL_INSN_FUNCTION_USAGE (call_insn
),
11683 gen_rtx_REG (VNx16BImode
, ZA_SAVED_REGNUM
));
11685 /* Keep the aarch64_start/end_private_za_call markers live. */
11686 if (!(callee_isa_mode
& AARCH64_ISA_MODE_ZA_ON
))
11687 use_reg (&CALL_INSN_FUNCTION_USAGE (call_insn
),
11688 gen_rtx_REG (VNx16BImode
, LOWERING_REGNUM
));
11690 /* If the callee is a shared-ZA function, record whether it uses the
11691 current value of ZA and ZT0. */
11692 if (shared_za_flags
& AARCH64_STATE_IN
)
11693 use_reg (&CALL_INSN_FUNCTION_USAGE (call_insn
),
11694 gen_rtx_REG (VNx16BImode
, ZA_REGNUM
));
11696 if (shared_zt0_flags
& AARCH64_STATE_IN
)
11697 use_reg (&CALL_INSN_FUNCTION_USAGE (call_insn
),
11698 gen_rtx_REG (V8DImode
, ZT0_REGNUM
));
11702 /* Implement TARGET_END_CALL_ARGS. */
11705 aarch64_end_call_args (cumulative_args_t ca_v
)
11707 CUMULATIVE_ARGS
*ca
= get_cumulative_args (ca_v
);
11709 /* If this is a call to a private ZA function, emit a marker to
11710 indicate where any necessary restoration code could be inserted.
11711 The code itself is inserted by the mode-switching pass. */
11712 if (TARGET_ZA
&& !(ca
->isa_mode
& AARCH64_ISA_MODE_ZA_ON
))
11713 emit_insn (gen_aarch64_end_private_za_call ());
11715 /* If this is a call to a shared-ZA function that doesn't share ZT0,
11716 save and restore ZT0 around the call. */
11717 if (aarch64_cfun_has_state ("zt0")
11718 && (ca
->isa_mode
& AARCH64_ISA_MODE_ZA_ON
)
11719 && ca
->shared_zt0_flags
== 0)
11720 aarch64_restore_zt0 (false);
11723 /* Emit call insn with PAT and do aarch64-specific handling. */
11726 aarch64_emit_call_insn (rtx pat
)
11728 auto insn
= emit_call_insn (pat
);
11730 rtx
*fusage
= &CALL_INSN_FUNCTION_USAGE (insn
);
11731 clobber_reg (fusage
, gen_rtx_REG (word_mode
, IP0_REGNUM
));
11732 clobber_reg (fusage
, gen_rtx_REG (word_mode
, IP1_REGNUM
));
11733 return as_a
<rtx_call_insn
*> (insn
);
11737 aarch64_select_cc_mode (RTX_CODE code
, rtx x
, rtx y
)
11739 machine_mode mode_x
= GET_MODE (x
);
11740 rtx_code code_x
= GET_CODE (x
);
11742 /* All floating point compares return CCFP if it is an equality
11743 comparison, and CCFPE otherwise. */
11744 if (GET_MODE_CLASS (mode_x
) == MODE_FLOAT
)
11767 gcc_unreachable ();
11771 /* Equality comparisons of short modes against zero can be performed
11772 using the TST instruction with the appropriate bitmask. */
11773 if (y
== const0_rtx
&& (REG_P (x
) || SUBREG_P (x
))
11774 && (code
== EQ
|| code
== NE
)
11775 && (mode_x
== HImode
|| mode_x
== QImode
))
11778 /* Similarly, comparisons of zero_extends from shorter modes can
11779 be performed using an ANDS with an immediate mask. */
11780 if (y
== const0_rtx
&& code_x
== ZERO_EXTEND
11781 && (mode_x
== SImode
|| mode_x
== DImode
)
11782 && (GET_MODE (XEXP (x
, 0)) == HImode
|| GET_MODE (XEXP (x
, 0)) == QImode
)
11783 && (code
== EQ
|| code
== NE
))
11786 /* Zero extracts support equality comparisons. */
11787 if ((mode_x
== SImode
|| mode_x
== DImode
)
11789 && (code_x
== ZERO_EXTRACT
&& CONST_INT_P (XEXP (x
, 1))
11790 && CONST_INT_P (XEXP (x
, 2)))
11791 && (code
== EQ
|| code
== NE
))
11794 /* ANDS/BICS/TST support equality and all signed comparisons. */
11795 if ((mode_x
== SImode
|| mode_x
== DImode
)
11798 && (code
== EQ
|| code
== NE
|| code
== LT
|| code
== GE
11799 || code
== GT
|| code
== LE
))
11802 /* ADDS/SUBS correctly set N and Z flags. */
11803 if ((mode_x
== SImode
|| mode_x
== DImode
)
11805 && (code
== EQ
|| code
== NE
|| code
== LT
|| code
== GE
)
11806 && (code_x
== PLUS
|| code_x
== MINUS
|| code_x
== NEG
))
11809 /* A compare with a shifted operand. Because of canonicalization,
11810 the comparison will have to be swapped when we emit the assembly
11812 if ((mode_x
== SImode
|| mode_x
== DImode
)
11813 && (REG_P (y
) || SUBREG_P (y
) || y
== const0_rtx
)
11814 && (code_x
== ASHIFT
|| code_x
== ASHIFTRT
11815 || code_x
== LSHIFTRT
11816 || code_x
== ZERO_EXTEND
|| code_x
== SIGN_EXTEND
))
11819 /* Similarly for a negated operand, but we can only do this for
11821 if ((mode_x
== SImode
|| mode_x
== DImode
)
11822 && (REG_P (y
) || SUBREG_P (y
))
11823 && (code
== EQ
|| code
== NE
)
11827 /* A test for unsigned overflow from an addition. */
11828 if ((mode_x
== DImode
|| mode_x
== TImode
)
11829 && (code
== LTU
|| code
== GEU
)
11831 && rtx_equal_p (XEXP (x
, 0), y
))
11834 /* A test for unsigned overflow from an add with carry. */
11835 if ((mode_x
== DImode
|| mode_x
== TImode
)
11836 && (code
== LTU
|| code
== GEU
)
11838 && CONST_SCALAR_INT_P (y
)
11839 && (rtx_mode_t (y
, mode_x
)
11840 == (wi::shwi (1, mode_x
)
11841 << (GET_MODE_BITSIZE (mode_x
).to_constant () / 2))))
11844 /* A test for signed overflow. */
11845 if ((mode_x
== DImode
|| mode_x
== TImode
)
11848 && GET_CODE (y
) == SIGN_EXTEND
)
11851 /* For everything else, return CCmode. */
11856 aarch64_get_condition_code_1 (machine_mode, enum rtx_code);
11859 aarch64_get_condition_code (rtx x
)
11861 machine_mode mode
= GET_MODE (XEXP (x
, 0));
11862 enum rtx_code comp_code
= GET_CODE (x
);
11864 if (GET_MODE_CLASS (mode
) != MODE_CC
)
11865 mode
= SELECT_CC_MODE (comp_code
, XEXP (x
, 0), XEXP (x
, 1));
11866 return aarch64_get_condition_code_1 (mode
, comp_code
);
11870 aarch64_get_condition_code_1 (machine_mode mode
, enum rtx_code comp_code
)
11878 case GE
: return AARCH64_GE
;
11879 case GT
: return AARCH64_GT
;
11880 case LE
: return AARCH64_LS
;
11881 case LT
: return AARCH64_MI
;
11882 case NE
: return AARCH64_NE
;
11883 case EQ
: return AARCH64_EQ
;
11884 case ORDERED
: return AARCH64_VC
;
11885 case UNORDERED
: return AARCH64_VS
;
11886 case UNLT
: return AARCH64_LT
;
11887 case UNLE
: return AARCH64_LE
;
11888 case UNGT
: return AARCH64_HI
;
11889 case UNGE
: return AARCH64_PL
;
11890 default: return -1;
11897 case NE
: return AARCH64_NE
;
11898 case EQ
: return AARCH64_EQ
;
11899 case GE
: return AARCH64_GE
;
11900 case GT
: return AARCH64_GT
;
11901 case LE
: return AARCH64_LE
;
11902 case LT
: return AARCH64_LT
;
11903 case GEU
: return AARCH64_CS
;
11904 case GTU
: return AARCH64_HI
;
11905 case LEU
: return AARCH64_LS
;
11906 case LTU
: return AARCH64_CC
;
11907 default: return -1;
11914 case NE
: return AARCH64_NE
;
11915 case EQ
: return AARCH64_EQ
;
11916 case GE
: return AARCH64_LE
;
11917 case GT
: return AARCH64_LT
;
11918 case LE
: return AARCH64_GE
;
11919 case LT
: return AARCH64_GT
;
11920 case GEU
: return AARCH64_LS
;
11921 case GTU
: return AARCH64_CC
;
11922 case LEU
: return AARCH64_CS
;
11923 case LTU
: return AARCH64_HI
;
11924 default: return -1;
11931 case NE
: return AARCH64_NE
; /* = any */
11932 case EQ
: return AARCH64_EQ
; /* = none */
11933 case GE
: return AARCH64_PL
; /* = nfrst */
11934 case LT
: return AARCH64_MI
; /* = first */
11935 case GEU
: return AARCH64_CS
; /* = nlast */
11936 case GTU
: return AARCH64_HI
; /* = pmore */
11937 case LEU
: return AARCH64_LS
; /* = plast */
11938 case LTU
: return AARCH64_CC
; /* = last */
11939 default: return -1;
11946 case NE
: return AARCH64_NE
;
11947 case EQ
: return AARCH64_EQ
;
11948 case GE
: return AARCH64_PL
;
11949 case LT
: return AARCH64_MI
;
11950 case GT
: return AARCH64_GT
;
11951 case LE
: return AARCH64_LE
;
11952 default: return -1;
11959 case NE
: return AARCH64_NE
;
11960 case EQ
: return AARCH64_EQ
;
11961 case GE
: return AARCH64_PL
;
11962 case LT
: return AARCH64_MI
;
11963 default: return -1;
11970 case NE
: return AARCH64_NE
;
11971 case EQ
: return AARCH64_EQ
;
11972 default: return -1;
11979 case LTU
: return AARCH64_CS
;
11980 case GEU
: return AARCH64_CC
;
11981 default: return -1;
11988 case GEU
: return AARCH64_CS
;
11989 case LTU
: return AARCH64_CC
;
11990 default: return -1;
11997 case NE
: return AARCH64_VS
;
11998 case EQ
: return AARCH64_VC
;
11999 default: return -1;
12010 /* Return true if X is a CONST_INT, CONST_WIDE_INT or a constant vector
12011 duplicate of such constants. If so, store in RET_WI the wide_int
12012 representation of the constant paired with the inner mode of the vector mode
12013 or MODE for scalar X constants. If MODE is not provided then TImode is
12017 aarch64_extract_vec_duplicate_wide_int (rtx x
, wide_int
*ret_wi
,
12018 scalar_mode mode
= TImode
)
12020 rtx elt
= unwrap_const_vec_duplicate (x
);
12021 if (!CONST_SCALAR_INT_P (elt
))
12024 = CONST_SCALAR_INT_P (x
) ? mode
: GET_MODE_INNER (GET_MODE (x
));
12025 *ret_wi
= rtx_mode_t (elt
, smode
);
12029 /* Return true if X is a scalar or a constant vector of integer
12030 immediates that represent the rounding constant used in the fixed-point
12031 arithmetic instructions.
12032 The accepted form of the constant is (1 << (C - 1)) where C is in the range
12033 [1, MODE_WIDTH/2]. */
12036 aarch64_rnd_imm_p (rtx x
)
12039 if (!aarch64_extract_vec_duplicate_wide_int (x
, &rnd_cst
))
12041 int log2
= wi::exact_log2 (rnd_cst
);
12044 return IN_RANGE (log2
, 0, rnd_cst
.get_precision () / 2 - 1);
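12044 /* For example, with 32-bit elements the accepted rounding constants are
12044 1 << 0, 1 << 1, ..., 1 << 15, i.e. C in [1, 16], matching right shifts
12044 by 1 to 16 bits. */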
12047 /* Return true if RND is a constant vector of integer rounding constants
12048 corresponding to a constant vector of shifts, SHIFT.
12049 The relationship should be RND == (1 << (SHIFT - 1)). */
12052 aarch64_const_vec_rnd_cst_p (rtx rnd
, rtx shift
)
12054 wide_int rnd_cst
, shft_cst
;
12055 if (!aarch64_extract_vec_duplicate_wide_int (rnd
, &rnd_cst
)
12056 || !aarch64_extract_vec_duplicate_wide_int (shift
, &shft_cst
))
12059 return rnd_cst
== (wi::shwi (1, rnd_cst
.get_precision ()) << (shft_cst
- 1));
12063 aarch64_const_vec_all_same_in_range_p (rtx x
,
12064 HOST_WIDE_INT minval
,
12065 HOST_WIDE_INT maxval
)
12068 return (const_vec_duplicate_p (x
, &elt
)
12069 && CONST_INT_P (elt
)
12070 && IN_RANGE (INTVAL (elt
), minval
, maxval
));
12073 /* Some constants can't be made using normal mov instructions in Advanced SIMD
12074 but we can still create them in various ways. If the constant in VAL can be
12075 created using such alternative methods, return true and additionally
12076 set TARGET to the rtx for the sequence if TARGET is not NULL.
12077 Otherwise return false. */
12080 aarch64_maybe_generate_simd_constant (rtx target
, rtx val
, machine_mode mode
)
12083 auto smode
= GET_MODE_INNER (mode
);
12084 if (!aarch64_extract_vec_duplicate_wide_int (val
, &wval
, smode
))
12087 /* For Advanced SIMD we can create an integer with only the top bit set
12088 using fneg (0.0f). */
12092 && wi::only_sign_bit_p (wval
))
12097 /* Use the same base type as aarch64_gen_shareable_zero. */
12098 rtx zero
= CONST0_RTX (V4SImode
);
12099 emit_move_insn (lowpart_subreg (V4SImode
, target
, mode
), zero
);
12100 rtx neg
= lowpart_subreg (V2DImode
, target
, mode
);
12101 emit_insn (gen_aarch64_fnegv2di2 (neg
, copy_rtx (neg
)));
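12101 /* The trick above relies on FNEG flipping only the sign bit: negating an
12101 all-zero vector as two double-precision lanes leaves 0x8000000000000000
12101 in each 64-bit lane, which is the DImode sign-bit-only constant this
12101 path handles. */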
12108 /* Check if the value in VAL with mode MODE can be created using special
12109 instruction sequences. */
12111 bool aarch64_simd_special_constant_p (rtx val
, machine_mode mode
)
12113 return aarch64_maybe_generate_simd_constant (NULL_RTX
, val
, mode
);
12117 aarch64_const_vec_all_same_int_p (rtx x
, HOST_WIDE_INT val
)
12119 return aarch64_const_vec_all_same_in_range_p (x
, val
, val
);
12122 /* Return true if VEC is a constant in which every element is in the range
12123 [MINVAL, MAXVAL]. The elements do not need to have the same value. */
12126 aarch64_const_vec_all_in_range_p (rtx vec
,
12127 HOST_WIDE_INT minval
,
12128 HOST_WIDE_INT maxval
)
12130 if (!CONST_VECTOR_P (vec
)
12131 || GET_MODE_CLASS (GET_MODE (vec
)) != MODE_VECTOR_INT
)
12135 if (!CONST_VECTOR_STEPPED_P (vec
))
12136 nunits
= const_vector_encoded_nelts (vec
);
12137 else if (!CONST_VECTOR_NUNITS (vec
).is_constant (&nunits
))
12140 for (int i
= 0; i
< nunits
; i
++)
12142 rtx vec_elem
= CONST_VECTOR_ELT (vec
, i
);
12143 if (!CONST_INT_P (vec_elem
)
12144 || !IN_RANGE (INTVAL (vec_elem
), minval
, maxval
))
12151 #define AARCH64_CC_V 1
12152 #define AARCH64_CC_C (1 << 1)
12153 #define AARCH64_CC_Z (1 << 2)
12154 #define AARCH64_CC_N (1 << 3)
12156 /* N Z C V flags for ccmp. Indexed by AARCH64_COND_CODE. */
12157 static const int aarch64_nzcv_codes[] =
12159 0,            /* EQ, Z == 1. */
12160 AARCH64_CC_Z, /* NE, Z == 0. */
12161 0,            /* CS, C == 1. */
12162 AARCH64_CC_C, /* CC, C == 0. */
12163 0,            /* MI, N == 1. */
12164 AARCH64_CC_N, /* PL, N == 0. */
12165 0,            /* VS, V == 1. */
12166 AARCH64_CC_V, /* VC, V == 0. */
12167 0,            /* HI, C == 1 && Z == 0. */
12168 AARCH64_CC_C, /* LS, !(C == 1 && Z == 0). */
12169 AARCH64_CC_V, /* GE, N == V. */
12170 0,            /* LT, N != V. */
12171 AARCH64_CC_Z, /* GT, Z == 0 && N == V. */
12172 0,            /* LE, !(Z == 0 && N == V). */
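12172 /* Each entry above leaves the flags in a state under which the condition
12172 it is indexed by evaluates to false; for example the GE entry
12172 (AARCH64_CC_V) gives N == 0 and V == 1, so N != V and a subsequent
12172 "ge" test fails. */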
12177 /* Print floating-point vector immediate operand X to F, negating it
12178 first if NEGATE is true. Return true on success, false if it isn't
12179 a constant we can handle. */
12182 aarch64_print_vector_float_operand (FILE *f
, rtx x
, bool negate
)
12186 if (!const_vec_duplicate_p (x
, &elt
))
12189 REAL_VALUE_TYPE r
= *CONST_DOUBLE_REAL_VALUE (elt
);
12191 r
= real_value_negate (&r
);
12193 /* Handle the SVE single-bit immediates specially, since they have a
12194 fixed form in the assembly syntax. */
12195 if (real_equal (&r
, &dconst0
))
12196 asm_fprintf (f
, "0.0");
12197 else if (real_equal (&r
, &dconst2
))
12198 asm_fprintf (f
, "2.0");
12199 else if (real_equal (&r
, &dconst1
))
12200 asm_fprintf (f
, "1.0");
12201 else if (real_equal (&r
, &dconsthalf
))
12202 asm_fprintf (f
, "0.5");
12205 const int buf_size
= 20;
12206 char float_buf
[buf_size
] = {'\0'};
12207 real_to_decimal_for_mode (float_buf
, &r
, buf_size
, buf_size
,
12208 1, GET_MODE (elt
));
12209 asm_fprintf (f
, "%s", float_buf
);
12215 /* Return the equivalent letter for size. */
12217 sizetochar (int size
)
12221 case 64: return 'd';
12222 case 32: return 's';
12223 case 16: return 'h';
12224 case 8: return 'b';
12225 default: gcc_unreachable ();
12229 /* Print operand X to file F in a target specific manner according to CODE.
12230 The acceptable formatting commands given by CODE are:
12231 'c': An integer or symbol address without a preceding #
12233 'C': Take the duplicated element in a vector constant
12234 and print it in hex.
12235 'D': Take the duplicated element in a vector constant
12236 and print it as an unsigned integer, in decimal.
12237 'e': Print the sign/zero-extend size as a character 8->b,
12238 16->h, 32->w. Can also be used for masks:
12239 0xff->b, 0xffff->h, 0xffffffff->w.
12240 'I': If the operand is a duplicated vector constant,
12241 replace it with the duplicated scalar. If the
12242 operand is then a floating-point constant, replace
12243 it with the integer bit representation. Print the
12244 transformed constant as a signed decimal number.
12245 'p': Prints N such that 2^N == X (X must be power of 2 and
12247 'P': Print the number of non-zero bits in X (a const_int).
12248 'H': Print the higher numbered register of a pair (TImode)
12250 'm': Print a condition (eq, ne, etc).
12251 'M': Same as 'm', but invert condition.
12252 'N': Take the duplicated element in a vector constant
12253 and print the negative of it in decimal.
12254 'b/h/s/d/q': Print a scalar FP/SIMD register name.
12255 'Z': Same for SVE registers. ('z' was already taken.)
12256 Note that it is not necessary to use %Z for operands
12257 that have SVE modes. The convention is to use %Z
12258 only for non-SVE (or potentially non-SVE) modes.
12259 'S/T/U/V': Print a FP/SIMD register name for a register list.
12260 The register printed is the FP/SIMD register name
12261 of X + 0/1/2/3 for S/T/U/V.
12262 'R': Print a scalar Integer/FP/SIMD register name + 1.
12263 'X': Print bottom 16 bits of integer constant in hex.
12264 'w/x': Print a general register name or the zero register
12265 (32-bit or 64-bit).
12266 '0': Print a normal operand, if it's a general register,
12267 then we assume DImode.
12268 'k': Print NZCV for conditional compare instructions.
12269 'K': Print a predicate register as pn<N> rather than p<N>
12270 'A': Output address constant representing the first
12271 argument of X, specifying a relocation offset
12273 'L': Output constant address specified by X
12274 with a relocation offset if appropriate.
12275 'G': Prints address of X, specifying a PC relative
12276 relocation mode if appropriate.
12277 'y': Output address of LDP or STP - this is used for
12278 some LDP/STPs which don't use a PARALLEL in their
12279 pattern (so the mode needs to be adjusted).
12280 'z': Output address of a typical LDP or STP. */
12283 aarch64_print_operand (FILE *f
, rtx x
, int code
)
12289 if (CONST_INT_P (x
))
12290 fprintf (f
, HOST_WIDE_INT_PRINT_DEC
, INTVAL (x
));
12294 rtx base
= strip_offset_and_salt (x
, &offset
);
12295 if (SYMBOL_REF_P (base
))
12296 output_addr_const (f
, x
);
12298 output_operand_lossage ("unsupported operand for code '%c'", code
);
12304 x
= unwrap_const_vec_duplicate (x
);
12305 if (!CONST_INT_P (x
))
12307 output_operand_lossage ("invalid operand for '%%%c'", code
);
12311 HOST_WIDE_INT val
= INTVAL (x
);
12312 if ((val
& ~7) == 8 || val
== 0xff)
12314 else if ((val
& ~7) == 16 || val
== 0xffff)
12316 else if ((val
& ~7) == 32 || val
== 0xffffffff)
12320 output_operand_lossage ("invalid operand for '%%%c'", code
);
12330 if (!CONST_INT_P (x
) || (n
= exact_log2 (INTVAL (x
))) < 0)
12332 output_operand_lossage ("invalid operand for '%%%c'", code
);
12336 asm_fprintf (f
, "%d", n
);
12341 if (!CONST_INT_P (x
))
12343 output_operand_lossage ("invalid operand for '%%%c'", code
);
12347 asm_fprintf (f
, "%u", popcount_hwi (INTVAL (x
)));
12351 if (x
== const0_rtx
)
12353 asm_fprintf (f
, "xzr");
12357 if (!REG_P (x
) || !GP_REGNUM_P (REGNO (x
) + 1))
12359 output_operand_lossage ("invalid operand for '%%%c'", code
);
12363 asm_fprintf (f
, "%s", reg_names
[REGNO (x
) + 1]);
12368 x
= aarch64_bit_representation (unwrap_const_vec_duplicate (x
));
12369 if (CONST_INT_P (x
))
12370 asm_fprintf (f
, "%wd", INTVAL (x
));
12373 output_operand_lossage ("invalid operand for '%%%c'", code
);
12383 /* CONST_TRUE_RTX means al/nv (al is the default, don't print it). */
12384 if (x
== const_true_rtx
)
12391 if (!COMPARISON_P (x
))
12393 output_operand_lossage ("invalid operand for '%%%c'", code
);
12397 cond_code
= aarch64_get_condition_code (x
);
12398 gcc_assert (cond_code
>= 0);
12400 cond_code
= AARCH64_INVERSE_CONDITION_CODE (cond_code
);
12401 if (GET_MODE (XEXP (x
, 0)) == CC_NZCmode
)
12402 fputs (aarch64_sve_condition_codes
[cond_code
], f
);
12404 fputs (aarch64_condition_codes
[cond_code
], f
);
12409 if (!const_vec_duplicate_p (x
, &elt
))
12411 output_operand_lossage ("invalid vector constant");
12415 if (GET_MODE_CLASS (GET_MODE (x
)) == MODE_VECTOR_INT
)
12416 asm_fprintf (f
, "%wd", (HOST_WIDE_INT
) -UINTVAL (elt
));
12417 else if (GET_MODE_CLASS (GET_MODE (x
)) == MODE_VECTOR_FLOAT
12418 && aarch64_print_vector_float_operand (f
, x
, true))
12422 output_operand_lossage ("invalid vector constant");
12433 code
= TOLOWER (code
);
12434 if (!REG_P (x
) || !FP_REGNUM_P (REGNO (x
)))
12436 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code
);
12439 asm_fprintf (f
, "%c%d", code
, REGNO (x
) - V0_REGNUM
);
12446 if (!REG_P (x
) || (!FP_REGNUM_P (REGNO (x
)) && !PR_REGNUM_P (REGNO (x
))))
12448 output_operand_lossage ("incompatible operand for '%%%c'", code
);
12451 if (PR_REGNUM_P (REGNO (x
)))
12452 asm_fprintf (f
, "p%d", REGNO (x
) - P0_REGNUM
+ (code
- 'S'));
12454 asm_fprintf (f
, "%c%d",
12455 aarch64_sve_data_mode_p (GET_MODE (x
)) ? 'z' : 'v',
12456 REGNO (x
) - V0_REGNUM
+ (code
- 'S'));
12460 if (REG_P (x
) && FP_REGNUM_P (REGNO (x
))
12461 && (aarch64_advsimd_partial_struct_mode_p (GET_MODE (x
))))
12462 asm_fprintf (f
, "d%d", REGNO (x
) - V0_REGNUM
+ 1);
12463 else if (REG_P (x
) && FP_REGNUM_P (REGNO (x
)))
12464 asm_fprintf (f
, "q%d", REGNO (x
) - V0_REGNUM
+ 1);
12465 else if (REG_P (x
) && GP_REGNUM_P (REGNO (x
)))
12466 asm_fprintf (f
, "x%d", REGNO (x
) - R0_REGNUM
+ 1);
12468 output_operand_lossage ("incompatible register operand for '%%%c'",
12473 if (!CONST_INT_P (x
))
12475 output_operand_lossage ("invalid operand for '%%%c'", code
);
12478 asm_fprintf (f
, "0x%wx", UINTVAL (x
) & 0xffff);
12483 /* Print a replicated constant in hex. */
12484 if (!const_vec_duplicate_p (x
, &elt
) || !CONST_INT_P (elt
))
12486 output_operand_lossage ("invalid operand for '%%%c'", code
);
12489 scalar_mode inner_mode
= GET_MODE_INNER (GET_MODE (x
));
12490 asm_fprintf (f
, "0x%wx", UINTVAL (elt
) & GET_MODE_MASK (inner_mode
));
12496 /* Print a replicated constant in decimal, treating it as
12498 if (!const_vec_duplicate_p (x
, &elt
) || !CONST_INT_P (elt
))
12500 output_operand_lossage ("invalid operand for '%%%c'", code
);
12503 scalar_mode inner_mode
= GET_MODE_INNER (GET_MODE (x
));
12504 asm_fprintf (f
, "%wd", UINTVAL (elt
) & GET_MODE_MASK (inner_mode
));
12510 if (aarch64_const_zero_rtx_p (x
))
12512 asm_fprintf (f
, "%czr", code
);
12516 if (REG_P (x
) && GP_REGNUM_P (REGNO (x
)))
12518 asm_fprintf (f
, "%c%d", code
, REGNO (x
) - R0_REGNUM
);
12522 if (REG_P (x
) && REGNO (x
) == SP_REGNUM
)
12524 asm_fprintf (f
, "%ssp", code
== 'w' ? "w" : "");
12533 output_operand_lossage ("missing operand");
12537 switch (GET_CODE (x
))
12541 asm_fprintf (f
, "%s", XSTR (x
, 0));
12545 if (aarch64_sve_data_mode_p (GET_MODE (x
)))
12547 if (REG_NREGS (x
) == 1)
12548 asm_fprintf (f
, "z%d", REGNO (x
) - V0_REGNUM
);
12552 = sizetochar (GET_MODE_UNIT_BITSIZE (GET_MODE (x
)));
12553 asm_fprintf (f
, "{z%d.%c - z%d.%c}",
12554 REGNO (x
) - V0_REGNUM
, suffix
,
12555 END_REGNO (x
) - V0_REGNUM
- 1, suffix
);
12559 asm_fprintf (f
, "%s", reg_names
[REGNO (x
)]);
12563 output_address (GET_MODE (x
), XEXP (x
, 0));
12568 output_addr_const (asm_out_file
, x
);
12572 asm_fprintf (f
, "%wd", INTVAL (x
));
12576 if (!VECTOR_MODE_P (GET_MODE (x
)))
12578 output_addr_const (asm_out_file
, x
);
12584 if (!const_vec_duplicate_p (x
, &elt
))
12586 output_operand_lossage ("invalid vector constant");
12590 if (GET_MODE_CLASS (GET_MODE (x
)) == MODE_VECTOR_INT
)
12591 asm_fprintf (f
, "%wd", INTVAL (elt
));
12592 else if (GET_MODE_CLASS (GET_MODE (x
)) == MODE_VECTOR_FLOAT
12593 && aarch64_print_vector_float_operand (f
, x
, false))
12597 output_operand_lossage ("invalid vector constant");
12603 /* Since we define TARGET_SUPPORTS_WIDE_INT we shouldn't ever
12604 be getting CONST_DOUBLEs holding integers. */
12605 gcc_assert (GET_MODE (x
) != VOIDmode
);
12606 if (aarch64_float_const_zero_rtx_p (x
))
12611 else if (aarch64_float_const_representable_p (x
))
12613 #define buf_size 20
12614 char float_buf
[buf_size
] = {'\0'};
12615 real_to_decimal_for_mode (float_buf
,
12616 CONST_DOUBLE_REAL_VALUE (x
),
12617 buf_size
, buf_size
,
12619 asm_fprintf (asm_out_file
, "%s", float_buf
);
12623 output_operand_lossage ("invalid constant");
12626 output_operand_lossage ("invalid operand");
12632 if (GET_CODE (x
) == HIGH
)
12635 switch (aarch64_classify_symbolic_expression (x
))
12637 case SYMBOL_SMALL_GOT_4G
:
12638 asm_fprintf (asm_out_file
, ":got:");
12641 case SYMBOL_SMALL_TLSGD
:
12642 asm_fprintf (asm_out_file
, ":tlsgd:");
12645 case SYMBOL_SMALL_TLSDESC
:
12646 asm_fprintf (asm_out_file
, ":tlsdesc:");
12649 case SYMBOL_SMALL_TLSIE
:
12650 asm_fprintf (asm_out_file
, ":gottprel:");
12653 case SYMBOL_TLSLE24
:
12654 asm_fprintf (asm_out_file
, ":tprel:");
12657 case SYMBOL_TINY_GOT
:
12658 gcc_unreachable ();
12664 output_addr_const (asm_out_file
, x
);
12668 switch (aarch64_classify_symbolic_expression (x
))
12670 case SYMBOL_SMALL_GOT_4G
:
12671 asm_fprintf (asm_out_file
, ":got_lo12:");
12674 case SYMBOL_SMALL_TLSGD
:
12675 asm_fprintf (asm_out_file
, ":tlsgd_lo12:");
12678 case SYMBOL_SMALL_TLSDESC
:
12679 asm_fprintf (asm_out_file
, ":tlsdesc_lo12:");
12682 case SYMBOL_SMALL_TLSIE
:
12683 asm_fprintf (asm_out_file
, ":gottprel_lo12:");
12686 case SYMBOL_TLSLE12
:
12687 asm_fprintf (asm_out_file
, ":tprel_lo12:");
12690 case SYMBOL_TLSLE24
:
12691 asm_fprintf (asm_out_file
, ":tprel_lo12_nc:");
12694 case SYMBOL_TINY_GOT
:
12695 asm_fprintf (asm_out_file
, ":got:");
12698 case SYMBOL_TINY_TLSIE
:
12699 asm_fprintf (asm_out_file
, ":gottprel:");
12705 output_addr_const (asm_out_file
, x
);
12709 switch (aarch64_classify_symbolic_expression (x
))
12711 case SYMBOL_TLSLE24
:
12712 asm_fprintf (asm_out_file
, ":tprel_hi12:");
12717 output_addr_const (asm_out_file
, x
);
12722 HOST_WIDE_INT cond_code
;
12724 if (!CONST_INT_P (x
))
12726 output_operand_lossage ("invalid operand for '%%%c'", code
);
12730 cond_code
= INTVAL (x
);
12731 gcc_assert (cond_code
>= 0 && cond_code
<= AARCH64_NV
);
12732 asm_fprintf (f
, "%d", aarch64_nzcv_codes
[cond_code
]);
12737 if (!REG_P (x
) || !PR_REGNUM_P (REGNO (x
)))
12739 output_operand_lossage ("invalid operand for '%%%c'", code
);
12742 asm_fprintf (f
, "pn%d", REGNO (x
) - P0_REGNUM
);
12748 machine_mode mode
= GET_MODE (x
);
12752 && maybe_ne (GET_MODE_SIZE (mode
), 8)
12753 && maybe_ne (GET_MODE_SIZE (mode
), 16)
12754 && maybe_ne (GET_MODE_SIZE (mode
), 32)))
12756 output_operand_lossage ("invalid operand for '%%%c'", code
);
12760 if (!aarch64_print_address_internal (f
, mode
, XEXP (x
, 0),
12762 ? ADDR_QUERY_LDP_STP_N
12763 : ADDR_QUERY_LDP_STP
))
12764 output_operand_lossage ("invalid operand prefix '%%%c'", code
);
12769 output_operand_lossage ("invalid operand prefix '%%%c'", code
);
12774 /* Print address 'x' of a memory access with mode 'mode'.
12775 'type' is the query type passed to aarch64_classify_address; it selects
12776 between normal memory accesses and the LDP/STP-specific forms. */
12778 aarch64_print_address_internal (FILE *f
, machine_mode mode
, rtx x
,
12779 aarch64_addr_query_type type
)
12781 struct aarch64_address_info addr
;
12782 unsigned int size
, vec_flags
;
12784 /* Check all addresses are Pmode - including ILP32. */
12785 if (GET_MODE (x
) != Pmode
12786 && (!CONST_INT_P (x
)
12787 || trunc_int_for_mode (INTVAL (x
), Pmode
) != INTVAL (x
)))
12789 output_operand_lossage ("invalid address mode");
12793 const bool load_store_pair_p
= (type
== ADDR_QUERY_LDP_STP
12794 || type
== ADDR_QUERY_LDP_STP_N
);
12796 if (aarch64_classify_address (&addr
, x
, mode
, true, type
))
12799 case ADDRESS_REG_IMM
:
12800 if (known_eq (addr
.const_offset
, 0))
12802 asm_fprintf (f
, "[%s]", reg_names
[REGNO (addr
.base
)]);
12806 vec_flags
= aarch64_classify_vector_memory_mode (mode
);
12807 if ((vec_flags
& VEC_ANY_SVE
) && !load_store_pair_p
)
12810 = exact_div (addr
.const_offset
,
12811 aarch64_vl_bytes (mode
, vec_flags
)).to_constant ();
12812 asm_fprintf (f
, "[%s, #%wd, mul vl]",
12813 reg_names
[REGNO (addr
.base
)], vnum
);
12817 if (!CONST_INT_P (addr
.offset
))
12820 asm_fprintf (f
, "[%s, %wd]", reg_names
[REGNO (addr
.base
)],
12821 INTVAL (addr
.offset
));
12824 case ADDRESS_REG_REG
:
12825 if (addr
.shift
== 0)
12826 asm_fprintf (f
, "[%s, %s]", reg_names
[REGNO (addr
.base
)],
12827 reg_names
[REGNO (addr
.offset
)]);
12829 asm_fprintf (f
, "[%s, %s, lsl %u]", reg_names
[REGNO (addr
.base
)],
12830 reg_names
[REGNO (addr
.offset
)], addr
.shift
);
12833 case ADDRESS_REG_UXTW
:
12834 if (addr
.shift
== 0)
12835 asm_fprintf (f
, "[%s, w%d, uxtw]", reg_names
[REGNO (addr
.base
)],
12836 REGNO (addr
.offset
) - R0_REGNUM
);
12838 asm_fprintf (f
, "[%s, w%d, uxtw %u]", reg_names
[REGNO (addr
.base
)],
12839 REGNO (addr
.offset
) - R0_REGNUM
, addr
.shift
);
12842 case ADDRESS_REG_SXTW
:
12843 if (addr
.shift
== 0)
12844 asm_fprintf (f
, "[%s, w%d, sxtw]", reg_names
[REGNO (addr
.base
)],
12845 REGNO (addr
.offset
) - R0_REGNUM
);
12847 asm_fprintf (f
, "[%s, w%d, sxtw %u]", reg_names
[REGNO (addr
.base
)],
12848 REGNO (addr
.offset
) - R0_REGNUM
, addr
.shift
);
12851 case ADDRESS_REG_WB
:
12852 /* Writeback is only supported for fixed-width modes. */
12853 size
= GET_MODE_SIZE (mode
).to_constant ();
12854 switch (GET_CODE (x
))
12857 asm_fprintf (f
, "[%s, %d]!", reg_names
[REGNO (addr
.base
)], size
);
12860 asm_fprintf (f
, "[%s], %d", reg_names
[REGNO (addr
.base
)], size
);
12863 asm_fprintf (f
, "[%s, -%d]!", reg_names
[REGNO (addr
.base
)], size
);
12866 asm_fprintf (f
, "[%s], -%d", reg_names
[REGNO (addr
.base
)], size
);
12869 asm_fprintf (f
, "[%s, %wd]!", reg_names
[REGNO (addr
.base
)],
12870 INTVAL (addr
.offset
));
12873 asm_fprintf (f
, "[%s], %wd", reg_names
[REGNO (addr
.base
)],
12874 INTVAL (addr
.offset
));
12881 case ADDRESS_LO_SUM
:
12882 asm_fprintf (f
, "[%s, #:lo12:", reg_names
[REGNO (addr
.base
)]);
12883 output_addr_const (f
, addr
.offset
);
12884 asm_fprintf (f
, "]");
12887 case ADDRESS_SYMBOLIC
:
12888 output_addr_const (f
, x
);
12895 /* Print address 'x' of a memory access with mode 'mode'. */
12897 aarch64_print_operand_address (FILE *f
, machine_mode mode
, rtx x
)
12899 if (!aarch64_print_address_internal (f
, mode
, x
, ADDR_QUERY_ANY
))
12900 output_addr_const (f
, x
);
12903 /* Implement TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA. */
12906 aarch64_output_addr_const_extra (FILE *file
, rtx x
)
12908 if (GET_CODE (x
) == UNSPEC
&& XINT (x
, 1) == UNSPEC_SALT_ADDR
)
12910 output_addr_const (file
, XVECEXP (x
, 0, 0));
12917 aarch64_label_mentioned_p (rtx x
)
12922 if (LABEL_REF_P (x
))
12925 /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
12926 referencing instruction, but they are constant offsets, not
12928 if (GET_CODE (x
) == UNSPEC
&& XINT (x
, 1) == UNSPEC_TLS
)
12931 fmt
= GET_RTX_FORMAT (GET_CODE (x
));
12932 for (i
= GET_RTX_LENGTH (GET_CODE (x
)) - 1; i
>= 0; i
--)
12938 for (j
= XVECLEN (x
, i
) - 1; j
>= 0; j
--)
12939 if (aarch64_label_mentioned_p (XVECEXP (x
, i
, j
)))
12942 else if (fmt
[i
] == 'e' && aarch64_label_mentioned_p (XEXP (x
, i
)))
12949 /* Implement REGNO_REG_CLASS. */
12952 aarch64_regno_regclass (unsigned regno
)
12954 if (W8_W11_REGNUM_P (regno
))
12955 return W8_W11_REGS
;
12957 if (W12_W15_REGNUM_P (regno
))
12958 return W12_W15_REGS
;
12960 if (STUB_REGNUM_P (regno
))
12963 if (GP_REGNUM_P (regno
))
12964 return GENERAL_REGS
;
12966 if (regno
== SP_REGNUM
)
12969 if (regno
== FRAME_POINTER_REGNUM
12970 || regno
== ARG_POINTER_REGNUM
)
12971 return POINTER_REGS
;
12973 if (FP_REGNUM_P (regno
))
12974 return (FP_LO8_REGNUM_P (regno
) ? FP_LO8_REGS
12975 : FP_LO_REGNUM_P (regno
) ? FP_LO_REGS
: FP_REGS
);
12977 if (PR_REGNUM_P (regno
))
12978 return PR_LO_REGNUM_P (regno
) ? PR_LO_REGS
: PR_HI_REGS
;
12980 if (regno
== FPM_REGNUM
)
12981 return MOVEABLE_SYSREGS
;
12983 if (regno
== FFR_REGNUM
|| regno
== FFRT_REGNUM
)
12986 if (FAKE_REGNUM_P (regno
))
12992 /* OFFSET is an address offset for mode MODE, which has SIZE bytes.
12993 If OFFSET is out of range, return an offset of an anchor point
12994 that is in range. Return 0 otherwise. */
12996 static HOST_WIDE_INT
12997 aarch64_anchor_offset (HOST_WIDE_INT offset
, HOST_WIDE_INT size
,
13000 /* Does it look like we'll need a 16-byte load/store-pair operation? */
13002 return (offset
+ 0x400) & ~0x7f0;
13004 /* For offsets that aren't a multiple of the access size, the limit is
13006 if (offset
& (size
- 1))
13008 /* BLKmode typically uses LDP of X-registers. */
13009 if (mode
== BLKmode
)
13010 return (offset
+ 512) & ~0x3ff;
13011 return (offset
+ 0x100) & ~0x1ff;
13014 /* Small negative offsets are supported. */
13015 if (IN_RANGE (offset
, -256, 0))
13018 if (mode
== TImode
|| mode
== TFmode
|| mode
== TDmode
)
13019 return (offset
+ 0x100) & ~0x1ff;
13021 /* Use 12-bit offset by access size. */
13022 return offset
& (~0xfff * size
);
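13022 /* Worked example with made-up numbers: an SImode access at offset 0x12344
13022 reaches the final case and gets the anchor 0x12344 & ~0x3fff = 0x10000;
13022 the remaining 0x2344 then fits the scaled 12-bit LDR/STR offset range
13022 for 4-byte accesses. */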
13026 aarch64_legitimize_address (rtx x
, rtx
/* orig_x */, machine_mode mode
)
13029 rtx tmp
= legitimize_pe_coff_symbol (x
, true);
13034 /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
13035 where mask is selected by alignment and size of the offset.
13036 We try to pick as large a range for the offset as possible to
13037 maximize the chance of a CSE. However, for aligned addresses
13038 we limit the range to 4k so that structures with different sized
13039 elements are likely to use the same base. We need to be careful
13040 not to split a CONST for some forms of address expression, otherwise
13041 it will generate sub-optimal code. */
13043 /* First split X + CONST (base, offset) into (base + X) + offset. */
13044 if (GET_CODE (x
) == PLUS
&& GET_CODE (XEXP (x
, 1)) == CONST
)
13047 rtx base
= strip_offset (XEXP (x
, 1), &offset
);
13049 base
= expand_binop (Pmode
, add_optab
, base
, XEXP (x
, 0),
13050 NULL_RTX
, true, OPTAB_DIRECT
);
13051 x
= plus_constant (Pmode
, base
, offset
);
13054 if (GET_CODE (x
) == PLUS
&& CONST_INT_P (XEXP (x
, 1)))
13056 rtx base
= XEXP (x
, 0);
13057 rtx offset_rtx
= XEXP (x
, 1);
13058 HOST_WIDE_INT offset
= INTVAL (offset_rtx
);
13060 if (GET_CODE (base
) == PLUS
)
13062 rtx op0
= XEXP (base
, 0);
13063 rtx op1
= XEXP (base
, 1);
13065 /* Force any scaling into a temp for CSE. */
13066 op0
= force_reg (Pmode
, op0
);
13067 op1
= force_reg (Pmode
, op1
);
13069 /* Let the pointer register be in op0. */
13070 if (REG_POINTER (op1
))
13071 std::swap (op0
, op1
);
13073 /* If the pointer is virtual or frame related, then we know that
13074 virtual register instantiation or register elimination is going
13075 to apply a second constant. We want the two constants folded
13076 together easily. Therefore, emit as (OP0 + CONST) + OP1. */
13077 if (virt_or_elim_regno_p (REGNO (op0
)))
13079 base
= expand_binop (Pmode
, add_optab
, op0
, offset_rtx
,
13080 NULL_RTX
, true, OPTAB_DIRECT
);
13081 return gen_rtx_PLUS (Pmode
, base
, op1
);
13084 /* Otherwise, in order to encourage CSE (and thence loop strength
13085 reduce) scaled addresses, emit as (OP0 + OP1) + CONST. */
13086 base
= expand_binop (Pmode
, add_optab
, op0
, op1
,
13087 NULL_RTX
, true, OPTAB_DIRECT
);
13088 x
= gen_rtx_PLUS (Pmode
, base
, offset_rtx
);
13091 HOST_WIDE_INT size
;
13092 if (GET_MODE_SIZE (mode
).is_constant (&size
))
13094 HOST_WIDE_INT base_offset
= aarch64_anchor_offset (offset
, size
,
13096 if (base_offset
!= 0)
13098 base
= plus_constant (Pmode
, base
, base_offset
);
13099 base
= force_operand (base
, NULL_RTX
);
13100 return plus_constant (Pmode
, base
, offset
- base_offset
);
13109 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED
, rtx x
,
13110 reg_class_t rclass
,
13112 secondary_reload_info
*sri
)
13114 /* Use aarch64_sve_reload_mem for SVE memory reloads that cannot use
13115 LDR and STR. See the comment at the head of aarch64-sve.md for
13116 more details about the big-endian handling. */
13117 unsigned int vec_flags
= aarch64_classify_vector_mode (mode
);
13118 if (reg_class_subset_p (rclass
, FP_REGS
)
13119 && !((REG_P (x
) && HARD_REGISTER_P (x
))
13120 || aarch64_simd_valid_mov_imm (x
))
13121 && mode
!= VNx16QImode
13122 && (vec_flags
& VEC_SVE_DATA
)
13123 && ((vec_flags
& VEC_PARTIAL
) || BYTES_BIG_ENDIAN
))
13125 sri
->icode
= CODE_FOR_aarch64_sve_reload_mem
;
13129 /* If we have to disable direct literal pool loads and stores because the
13130 function is too big, then we need a scratch register. */
13131 if (MEM_P (x
) && SYMBOL_REF_P (x
) && CONSTANT_POOL_ADDRESS_P (x
)
13132 && (SCALAR_FLOAT_MODE_P (GET_MODE (x
))
13133 || targetm
.vector_mode_supported_p (GET_MODE (x
)))
13134 && !aarch64_pcrelative_literal_loads
)
13136 sri
->icode
= code_for_aarch64_reload_movcp (mode
, DImode
);
13140 /* Without the TARGET_SIMD or TARGET_SVE instructions we cannot move a
13141 Q register to a Q register directly. We need a scratch. */
13146 || (vec_flags
== VEC_ADVSIMD
&& known_eq (GET_MODE_SIZE (mode
), 16)))
13147 && mode
== GET_MODE (x
)
13149 && FP_REGNUM_P (REGNO (x
))
13150 && reg_class_subset_p (rclass
, FP_REGS
))
13152 sri
->icode
= code_for_aarch64_reload_mov (mode
);
13156 /* A TFmode, TImode or TDmode memory access should be handled via an FP register
13157 because AArch64 has richer addressing modes for LDR/STR instructions
13158 than LDP/STP instructions. */
13159 if (TARGET_FLOAT
&& rclass
== GENERAL_REGS
13160 && known_eq (GET_MODE_SIZE (mode
), 16) && MEM_P (x
))
13163 if (rclass
== FP_REGS
13164 && (mode
== TImode
|| mode
== TFmode
|| mode
== TDmode
)
13166 return GENERAL_REGS
;
13171 /* Implement TARGET_SECONDARY_MEMORY_NEEDED. */
13174 aarch64_secondary_memory_needed (machine_mode mode
, reg_class_t class1
,
13175 reg_class_t class2
)
13178 && reg_classes_intersect_p (class1
, FP_REGS
)
13179 && reg_classes_intersect_p (class2
, FP_REGS
))
13181 /* We can't do a 128-bit FPR-to-FPR move without TARGET_SIMD,
13182 so we can't easily split a move involving tuples of 128-bit
13183 vectors. Force the copy through memory instead.
13185 (Tuples of 64-bit vectors are fine.) */
13186 unsigned int vec_flags
= aarch64_classify_vector_mode (mode
);
13187 if (vec_flags
== (VEC_ADVSIMD
| VEC_STRUCT
))
13193 /* Implement TARGET_FRAME_POINTER_REQUIRED. */
13196 aarch64_frame_pointer_required ()
13198 /* If the function needs to record the incoming value of PSTATE.SM,
13199 make sure that the slot is accessible from the frame pointer. */
13200 return aarch64_need_old_pstate_sm ();
13204 aarch64_can_eliminate (const int from ATTRIBUTE_UNUSED
, const int to
)
13206 gcc_assert (from
== ARG_POINTER_REGNUM
|| from
== FRAME_POINTER_REGNUM
);
13208 /* If we need a frame pointer, ARG_POINTER_REGNUM and FRAME_POINTER_REGNUM
13209 can only eliminate to HARD_FRAME_POINTER_REGNUM. */
13210 if (frame_pointer_needed
)
13211 return to
== HARD_FRAME_POINTER_REGNUM
;
13216 aarch64_initial_elimination_offset (unsigned from
, unsigned to
)
13218 aarch64_frame
&frame
= cfun
->machine
->frame
;
13220 if (to
== HARD_FRAME_POINTER_REGNUM
)
13222 if (from
== ARG_POINTER_REGNUM
)
13223 return frame
.bytes_above_hard_fp
;
13225 if (from
== FRAME_POINTER_REGNUM
)
13226 return frame
.bytes_above_hard_fp
- frame
.bytes_above_locals
;
13229 if (to
== STACK_POINTER_REGNUM
)
13231 if (from
== FRAME_POINTER_REGNUM
)
13232 return frame
.frame_size
- frame
.bytes_above_locals
;
13235 return frame
.frame_size
;
13239 /* Get return address without mangling. */
13242 aarch64_return_addr_rtx (void)
13244 rtx val
= get_hard_reg_initial_val (Pmode
, LR_REGNUM
);
13245 /* Note: aarch64_return_address_signing_enabled only
13246 works after cfun->machine->frame.laid_out is set,
13247 so here we don't know if the return address will
13248 be signed or not. */
13249 rtx lr
= gen_rtx_REG (Pmode
, LR_REGNUM
);
13250 emit_move_insn (lr
, val
);
13251 emit_insn (GEN_FCN (CODE_FOR_xpaclri
) ());
13256 /* Implement RETURN_ADDR_RTX. We do not support moving back to a
13260 aarch64_return_addr (int count
, rtx frame ATTRIBUTE_UNUSED
)
13264 return aarch64_return_addr_rtx ();
13268 aarch64_asm_trampoline_template (FILE *f
)
13270 /* Even if the current function doesn't have branch protection, some
13271 later function might, so since this template is only generated once
13272 we have to add a BTI just in case. */
13273 asm_fprintf (f
, "\thint\t34 // bti c\n");
13277 asm_fprintf (f
, "\tldr\tw%d, .+20\n", IP1_REGNUM
- R0_REGNUM
);
13278 asm_fprintf (f
, "\tldr\tw%d, .+20\n", STATIC_CHAIN_REGNUM
- R0_REGNUM
);
13282 asm_fprintf (f
, "\tldr\t%s, .+20\n", reg_names
[IP1_REGNUM
]);
13283 asm_fprintf (f
, "\tldr\t%s, .+24\n", reg_names
[STATIC_CHAIN_REGNUM
]);
13285 asm_fprintf (f
, "\tbr\t%s\n", reg_names
[IP1_REGNUM
]);
13287 /* We always emit a speculation barrier.
13288 This is because the same trampoline template is used for every nested
13289 function. Since nested functions are not particularly common or
13290 performant we don't worry too much about the extra instructions to copy
13292 This is not yet a problem, since we have not yet implemented function
13293 specific attributes to choose between hardening against straight line
13294 speculation or not, but such function specific attributes are likely to
13295 happen in the future. */
13296 asm_fprintf (f
, "\tdsb\tsy\n\tisb\n");
13298 assemble_aligned_integer (POINTER_BYTES
, const0_rtx
);
13299 assemble_aligned_integer (POINTER_BYTES
, const0_rtx
);
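13299 /* Resulting trampoline layout in the LP64 case (sketch for illustration;
13299 the data offsets match tramp_code_sz == 24 and POINTER_BYTES == 8 used
13299 by aarch64_trampoline_init below):
13299
13299 0:  hint 34 (bti c)
13299 4:  ldr  IP1, .+20            loads the word placed at offset 24
13299 8:  ldr  STATIC_CHAIN, .+24   loads the word placed at offset 32
13299 12: br   IP1
13299 16: dsb  sy
13299 20: isb
13299 24: <address of the nested function>
13299 32: <static chain value>  */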
13303 aarch64_trampoline_init (rtx m_tramp
, tree fndecl
, rtx chain_value
)
13305 rtx fnaddr
, mem
, a_tramp
;
13306 const int tramp_code_sz
= 24;
13308 /* Don't need to copy the trailing D-words; we fill those in below. */
13309 /* We create our own memory address in Pmode so that `emit_block_move` can
13310 use parts of the backend which expect Pmode addresses. */
13311 rtx temp
= convert_memory_address (Pmode
, XEXP (m_tramp
, 0));
13312 emit_block_move (gen_rtx_MEM (BLKmode
, temp
),
13313 assemble_trampoline_template (),
13314 GEN_INT (tramp_code_sz
), BLOCK_OP_NORMAL
);
13315 mem
= adjust_address (m_tramp
, ptr_mode
, tramp_code_sz
);
13316 fnaddr
= XEXP (DECL_RTL (fndecl
), 0);
13317 if (GET_MODE (fnaddr
) != ptr_mode
)
13318 fnaddr
= convert_memory_address (ptr_mode
, fnaddr
);
13319 emit_move_insn (mem
, fnaddr
);
13321 mem
= adjust_address (m_tramp
, ptr_mode
, tramp_code_sz
+ POINTER_BYTES
);
13322 emit_move_insn (mem
, chain_value
);
13324 /* XXX We should really define a "clear_cache" pattern and use
13325 gen_clear_cache(). */
13326 a_tramp
= XEXP (m_tramp
, 0);
13327 maybe_emit_call_builtin___clear_cache (a_tramp
,
13328 plus_constant (ptr_mode
,
static unsigned char
aarch64_class_max_nregs (reg_class_t regclass, machine_mode mode)
{
  /* ??? Logically we should only need to provide a value when
     HARD_REGNO_MODE_OK says that at least one register in REGCLASS
     can hold MODE, but at the moment we need to handle all modes.
     Just ignore any runtime parts for registers that can't store them.  */
  HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
  unsigned int nregs, vec_flags;
  switch (regclass)
    {
    case TAILCALL_ADDR_REGS:
    case POINTER_AND_FP_REGS:
      vec_flags = aarch64_classify_vector_mode (mode);
      if ((vec_flags & VEC_SVE_DATA)
	  && constant_multiple_p (GET_MODE_SIZE (mode),
				  aarch64_vl_bytes (mode, vec_flags), &nregs))
	return nregs;
      if (vec_flags == (VEC_ADVSIMD | VEC_STRUCT | VEC_PARTIAL))
	return GET_MODE_SIZE (mode).to_constant () / 8;
      return (vec_flags & VEC_ADVSIMD
	      ? CEIL (lowest_size, UNITS_PER_VREG)
	      : CEIL (lowest_size, UNITS_PER_WORD));

      return mode == VNx64BImode ? 4 : mode == VNx32BImode ? 2 : 1;

    case MOVEABLE_SYSREGS:
    case PR_AND_FFR_REGS:
      return 1;
    }
  gcc_unreachable ();
}
static reg_class_t
aarch64_preferred_reload_class (rtx x, reg_class_t regclass)
{
  if (regclass == POINTER_REGS)
    return GENERAL_REGS;

  if (regclass == STACK_REG)
    {
      if (REG_P (x)
	  && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x)), POINTER_REGS))
	return regclass;

      return NO_REGS;
    }

  /* Register elimination can result in a request for
     SP+constant->FP_REGS.  We cannot support such operations which
     use SP as source and an FP_REG as destination, so reject out
     right now.  */
  if (! reg_class_subset_p (regclass, GENERAL_REGS) && GET_CODE (x) == PLUS)
    {
      rtx lhs = XEXP (x, 0);

      /* Look through a possible SUBREG introduced by ILP32.  */
      if (SUBREG_P (lhs))
	lhs = SUBREG_REG (lhs);

      gcc_assert (REG_P (lhs));
      gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs)),
				      POINTER_REGS));
      return NO_REGS;
    }

  return regclass;
}
void
aarch64_asm_output_labelref (FILE* f, const char *name)
{
  asm_fprintf (f, "%U%s", name);
}
static void
aarch64_elf_asm_constructor (rtx symbol, int priority)
{
  if (priority == DEFAULT_INIT_PRIORITY)
    default_ctor_section_asm_out_constructor (symbol, priority);
  else
    {
      /* While priority is known to be in range [0, 65535], so 18 bytes
	 would be enough, the compiler might not know that.  To avoid
	 -Wformat-truncation false positive, use a larger size.  */
      snprintf (buf, sizeof (buf), ".init_array.%.5u", priority);
      s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
      switch_to_section (s);
      assemble_align (POINTER_SIZE);
      assemble_aligned_integer (POINTER_BYTES, symbol);
    }
}
static void
aarch64_elf_asm_destructor (rtx symbol, int priority)
{
  if (priority == DEFAULT_INIT_PRIORITY)
    default_dtor_section_asm_out_destructor (symbol, priority);
  else
    {
      /* While priority is known to be in range [0, 65535], so 18 bytes
	 would be enough, the compiler might not know that.  To avoid
	 -Wformat-truncation false positive, use a larger size.  */
      snprintf (buf, sizeof (buf), ".fini_array.%.5u", priority);
      s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
      switch_to_section (s);
      assemble_align (POINTER_SIZE);
      assemble_aligned_integer (POINTER_BYTES, symbol);
    }
}
const char *
aarch64_output_casesi (rtx *operands)
{
  rtx diff_vec = PATTERN (NEXT_INSN (as_a <rtx_insn *> (operands[2])));

  static const char *const patterns[4][2] =
  {
    {
      "ldrb\t%w3, [%0,%w1,uxtw]",
      "add\t%3, %4, %w3, sxtb #2"
    },
    {
      "ldrh\t%w3, [%0,%w1,uxtw #1]",
      "add\t%3, %4, %w3, sxth #2"
    },
    {
      "ldr\t%w3, [%0,%w1,uxtw #2]",
      "add\t%3, %4, %w3, sxtw #2"
    },
    /* We assume that DImode is only generated when not optimizing and
       that we don't really need 64-bit address offsets.  That would
       imply an object file with 8GB of code in a single function!  */
    {
      "ldr\t%w3, [%0,%w1,uxtw #2]",
      "add\t%3, %4, %w3, sxtw #2"
    }
  };

  gcc_assert (GET_CODE (diff_vec) == ADDR_DIFF_VEC);

  scalar_int_mode mode = as_a <scalar_int_mode> (GET_MODE (diff_vec));
  index = exact_log2 (GET_MODE_SIZE (mode));

  gcc_assert (index >= 0 && index <= 3);

  /* Need to implement table size reduction, by changing the code below.  */
  output_asm_insn (patterns[index][0], operands);
  ASM_GENERATE_INTERNAL_LABEL (label, "Lrtx", CODE_LABEL_NUMBER (operands[2]));
  snprintf (buf, sizeof (buf),
	    "adr\t%%4, %s", targetm.strip_name_encoding (label));
  output_asm_insn (buf, operands);
  output_asm_insn (patterns[index][1], operands);
  output_asm_insn ("br\t%3", operands);
  output_asm_insn (aarch64_sls_barrier (aarch64_harden_sls_retbr_p ()),
		   operands);
  assemble_label (asm_out_file, label);
  return "";
}
/* Return the asm string for an SME ZERO instruction whose 8-bit mask
   operand is MASK.  */

const char *
aarch64_output_sme_zero_za (rtx mask)
{
  auto mask_val = UINTVAL (mask);

  if (mask_val == 0xff)
    return "zero\t{ za }";

  static constexpr struct { unsigned char mask; char letter; } tiles[] = {
    { 0xff, 'b' },
    { 0x55, 'h' },
    { 0x11, 's' },
    { 0x01, 'd' }
  };
  /* The last entry in the list has the form "za7.d }", but that's the
     same length as "za7.d, ".  */
  static char buffer[sizeof("zero\t{ ") + sizeof ("za7.d, ") * 8 + 1];
  for (auto &tile : tiles)
    {
      unsigned int tile_mask = tile.mask;
      unsigned int tile_index = 0;
      unsigned int i = snprintf (buffer, sizeof (buffer), "zero\t");
      const char *prefix = "{ ";
      auto remaining_mask = mask_val;
      while (tile_mask < 0x100)
	{
	  if ((remaining_mask & tile_mask) == tile_mask)
	    {
	      i += snprintf (buffer + i, sizeof (buffer) - i, "%sza%d.%c",
			     prefix, tile_index, tile.letter);
	      prefix = ", ";
	      remaining_mask &= ~tile_mask;
	    }
	  tile_mask <<= 1;
	  tile_index += 1;
	}
      if (remaining_mask == 0)
	{
	  gcc_assert (i + 3 <= sizeof (buffer));
	  snprintf (buffer + i, sizeof (buffer) - i, " }");
	  return buffer;
	}
    }
  gcc_unreachable ();
}
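/* For example, assuming the usual ZA tile decomposition above (byte, half,
   word and doubleword tiles tried in that order), a MASK of 0x55 is exactly
   the za0.h tile and the routine prints "zero\t{ za0.h }", while 0xff is
   caught early and printed as the whole-array form "zero\t{ za }".  */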
/* Return size in bits of an arithmetic operand which is shifted/scaled and
   masked such that it is suitable for a UXTB, UXTH, or UXTW extend
   operator.  */

int
aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
{
  if (shift >= 0 && shift <= 4)
    {
      int size;
      for (size = 8; size <= 32; size *= 2)
	{
	  HOST_WIDE_INT bits = ((HOST_WIDE_INT)1U << size) - 1;
	  if (mask == bits << shift)
	    return size;
	}
    }
  return 0;
}
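/* For example, aarch64_uxt_size (2, 0x3fc) returns 8, since 0x3fc is 0xff
   shifted left by 2 and therefore describes a UXTB-style operand, whereas a
   mask that is not a contiguous 8/16/32-bit field at the given shift makes
   the function return 0.  */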
/* Constant pools are per function only when PC relative
   literal loads are true or we are in the large memory
   model.  */

static inline bool
aarch64_can_use_per_function_literal_pools_p (void)
{
  return (aarch64_pcrelative_literal_loads
	  || aarch64_cmodel == AARCH64_CMODEL_LARGE);
}

static bool
aarch64_use_blocks_for_constant_p (machine_mode, const_rtx)
{
  /* We can't use blocks for constants when we're using a per-function
     literal pool.  */
  return !aarch64_can_use_per_function_literal_pools_p ();
}
/* Select appropriate section for constants depending
   on where we place literal pools.  */

static section *
aarch64_select_rtx_section (machine_mode mode,
			    rtx x,
			    unsigned HOST_WIDE_INT align)
{
  if (aarch64_can_use_per_function_literal_pools_p ())
    return function_section (current_function_decl);

  return default_elf_select_rtx_section (mode, x, align);
}
/* Implement ASM_OUTPUT_POOL_EPILOGUE.  */
void
aarch64_asm_output_pool_epilogue (FILE *f, const char *, tree,
				  HOST_WIDE_INT offset)
{
  /* When using per-function literal pools, we must ensure that any code
     section is aligned to the minimal instruction length, lest we get
     errors from the assembler re "unaligned instructions".  */
  if ((offset & 3) && aarch64_can_use_per_function_literal_pools_p ())
    ASM_OUTPUT_ALIGN (f, 2);
}
/* Helper function for rtx cost calculation.  Strip a shift expression
   from X.  Returns the inner operand if successful, or the original
   expression on failure.  */
static rtx
aarch64_strip_shift (rtx x)
{
  rtx op = x;

  /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
     we can convert both to ROR during final output.  */
  if ((GET_CODE (op) == ASHIFT
       || GET_CODE (op) == ASHIFTRT
       || GET_CODE (op) == LSHIFTRT
       || GET_CODE (op) == ROTATERT
       || GET_CODE (op) == ROTATE)
      && CONST_INT_P (XEXP (op, 1)))
    return XEXP (op, 0);

  if (GET_CODE (op) == MULT
      && CONST_INT_P (XEXP (op, 1))
      && ((unsigned) exact_log2 (INTVAL (XEXP (op, 1)))) < 64)
    return XEXP (op, 0);

  return x;
}
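/* For example, given (ashift (reg:DI x0) (const_int 3)) this returns
   (reg:DI x0), and likewise for a MULT by a power of two such as
   (mult (reg:DI x0) (const_int 8)); anything else is returned unchanged.  */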
/* Helper function for rtx cost calculation.  Strip an extend
   expression from X.  Returns the inner operand if successful, or the
   original expression on failure.  We deal with a number of possible
   canonicalization variations here.  If STRIP_SHIFT is true, then
   we can strip off a shift also.  */
static rtx
aarch64_strip_extend (rtx x, bool strip_shift)
{
  scalar_int_mode mode;
  rtx op = x;

  if (!is_a <scalar_int_mode> (GET_MODE (op), &mode))
    return op;

  if (GET_CODE (op) == AND
      && GET_CODE (XEXP (op, 0)) == MULT
      && CONST_INT_P (XEXP (XEXP (op, 0), 1))
      && CONST_INT_P (XEXP (op, 1))
      && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op, 0), 1))),
			   INTVAL (XEXP (op, 1))) != 0)
    return XEXP (XEXP (op, 0), 0);

  /* Now handle extended register, as this may also have an optional
     left shift by 1..4.  */
  if (strip_shift
      && GET_CODE (op) == ASHIFT
      && CONST_INT_P (XEXP (op, 1))
      && ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) <= 4)
    op = XEXP (op, 0);

  if (GET_CODE (op) == ZERO_EXTEND
      || GET_CODE (op) == SIGN_EXTEND)
    op = XEXP (op, 0);

  if (op != x)
    return op;

  return x;
}
/* Helper function for rtx cost calculation.  Strip extension as well as any
   inner VEC_SELECT high-half from X.  Returns the inner vector operand if
   successful, or the original expression on failure.  */
static rtx
aarch64_strip_extend_vec_half (rtx x)
{
  if (GET_CODE (x) == ZERO_EXTEND || GET_CODE (x) == SIGN_EXTEND)
    x = XEXP (x, 0);
  if (GET_CODE (x) == VEC_SELECT
      && vec_series_highpart_p (GET_MODE (x), GET_MODE (XEXP (x, 0)),
				XEXP (x, 1)))
    x = XEXP (x, 0);
  return x;
}
/* Helper function for rtx cost calculation.  Strip VEC_DUPLICATE as well as
   any subsequent extend and VEC_SELECT from X.  Returns the inner scalar
   operand if successful, or the original expression on failure.  */
static rtx
aarch64_strip_duplicate_vec_elt (rtx x)
{
  if (GET_CODE (x) == VEC_DUPLICATE
      && is_a<scalar_mode> (GET_MODE (XEXP (x, 0))))
    {
      x = XEXP (x, 0);
      if (GET_CODE (x) == VEC_SELECT)
	x = XEXP (x, 0);
      else if ((GET_CODE (x) == ZERO_EXTEND || GET_CODE (x) == SIGN_EXTEND)
	       && GET_CODE (XEXP (x, 0)) == VEC_SELECT)
	x = XEXP (XEXP (x, 0), 0);
    }
  return x;
}
/* Return true iff CODE is a shift supported in combination
   with arithmetic instructions.  */

static bool
aarch64_shift_p (enum rtx_code code)
{
  return code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT;
}
/* Return true iff X is a cheap shift without a sign extend.  */

static bool
aarch64_cheap_mult_shift_p (rtx x)
{
  rtx op0 = XEXP (x, 0);
  rtx op1 = XEXP (x, 1);

  if (!(aarch64_tune_params.extra_tuning_flags
	& AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND))
    return false;

  if (GET_CODE (op0) == SIGN_EXTEND)
    return false;

  if (GET_CODE (x) == ASHIFT && CONST_INT_P (op1)
      && UINTVAL (op1) <= 4)
    return true;

  if (GET_CODE (x) != MULT || !CONST_INT_P (op1))
    return false;

  HOST_WIDE_INT l2 = exact_log2 (INTVAL (op1));

  if (l2 > 0 && l2 <= 4)
    return true;

  return false;
}
/* Helper function for rtx cost calculation.  Calculate the cost of
   a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
   Return the calculated cost of the expression, recursing manually in to
   operands where needed.  */

static int
aarch64_rtx_mult_cost (rtx x, enum rtx_code code, int outer, bool speed)
{
  rtx op0, op1;
  const struct cpu_cost_table *extra_cost
    = aarch64_tune_params.insn_extra_cost;
  int cost = 0;
  bool compound_p = (outer == PLUS || outer == MINUS);
  machine_mode mode = GET_MODE (x);

  gcc_checking_assert (code == MULT);

  op0 = XEXP (x, 0);
  op1 = XEXP (x, 1);

  if (VECTOR_MODE_P (mode))
    {
      unsigned int vec_flags = aarch64_classify_vector_mode (mode);
      if (TARGET_SIMD && (vec_flags & VEC_ADVSIMD))
	{
	  /* The select-operand-high-half versions of the instruction have the
	     same cost as the three vector version - don't add the costs of the
	     extension or selection into the costs of the multiply.  */
	  op0 = aarch64_strip_extend_vec_half (op0);
	  op1 = aarch64_strip_extend_vec_half (op1);
	  /* The by-element versions of the instruction have the same costs as
	     the normal 3-vector version.  We make an assumption that the input
	     to the VEC_DUPLICATE is already on the FP & SIMD side.  This means
	     costing of a MUL by element pre RA is a bit optimistic.  */
	  op0 = aarch64_strip_duplicate_vec_elt (op0);
	  op1 = aarch64_strip_duplicate_vec_elt (op1);
	}
      cost += rtx_cost (op0, mode, MULT, 0, speed);
      cost += rtx_cost (op1, mode, MULT, 1, speed);
      if (speed)
	{
	  if (GET_CODE (x) == MULT)
	    cost += extra_cost->vect.mult;
	  /* This is to catch the SSRA costing currently flowing here.  */
	  else
	    cost += extra_cost->vect.alu;
	}
      return cost;
    }

  /* Integer multiply/fma.  */
  if (GET_MODE_CLASS (mode) == MODE_INT)
    {
      /* The multiply will be canonicalized as a shift, cost it as such.  */
      if (aarch64_shift_p (GET_CODE (x))
	  || (CONST_INT_P (op1)
	      && exact_log2 (INTVAL (op1)) > 0))
	{
	  bool is_extend = GET_CODE (op0) == ZERO_EXTEND
			   || GET_CODE (op0) == SIGN_EXTEND;
	  if (speed)
	    {
	      if (compound_p)
		{
		  /* If the shift is considered cheap,
		     then don't add any cost. */
		  if (aarch64_cheap_mult_shift_p (x))
		    ;
		  else if (REG_P (op1))
		    /* ARITH + shift-by-register.  */
		    cost += extra_cost->alu.arith_shift_reg;
		  else if (is_extend)
		    /* ARITH + extended register.  We don't have a cost field
		       for ARITH+EXTEND+SHIFT, so use extend_arith here.  */
		    cost += extra_cost->alu.extend_arith;
		  else
		    /* ARITH + shift-by-immediate.  */
		    cost += extra_cost->alu.arith_shift;
		}
	      else
		/* LSL (immediate).  */
		cost += extra_cost->alu.shift;
	    }
	  /* Strip extends as we will have costed them in the case above.  */
	  op0 = aarch64_strip_extend (op0, true);

	  cost += rtx_cost (op0, VOIDmode, code, 0, speed);
	  return cost;
	}

      /* MNEG or [US]MNEGL.  Extract the NEG operand and indicate that it's a
	 compound and let the below cases handle it.  After all, MNEG is a
	 special-case alias of MSUB.  */
      if (GET_CODE (op0) == NEG)
	{
	  op0 = XEXP (op0, 0);
	  compound_p = true;
	}

      /* Integer multiplies or FMAs have zero/sign extending variants.  */
      if ((GET_CODE (op0) == ZERO_EXTEND
	   && GET_CODE (op1) == ZERO_EXTEND)
	  || (GET_CODE (op0) == SIGN_EXTEND
	      && GET_CODE (op1) == SIGN_EXTEND))
	{
	  cost += rtx_cost (XEXP (op0, 0), VOIDmode, MULT, 0, speed);
	  cost += rtx_cost (XEXP (op1, 0), VOIDmode, MULT, 1, speed);

	  if (speed)
	    {
	      if (compound_p)
		/* SMADDL/UMADDL/UMSUBL/SMSUBL.  */
		cost += extra_cost->mult[0].extend_add;
	      else
		/* MUL/SMULL/UMULL.  */
		cost += extra_cost->mult[0].extend;
	    }
	  return cost;
	}

      /* This is either an integer multiply or a MADD.  In both cases
	 we want to recurse and cost the operands.  */
      cost += rtx_cost (op0, mode, MULT, 0, speed);
      cost += rtx_cost (op1, mode, MULT, 1, speed);

      if (speed)
	{
	  if (compound_p)
	    cost += extra_cost->mult[mode == DImode].add;
	  else
	    cost += extra_cost->mult[mode == DImode].simple;
	}
      return cost;
    }
  else
    {
      if (speed)
	{
	  /* Floating-point FMA/FMUL can also support negations of the
	     operands, unless the rounding mode is upward or downward in
	     which case FNMUL is different than FMUL with operand negation.  */
	  bool neg0 = GET_CODE (op0) == NEG;
	  bool neg1 = GET_CODE (op1) == NEG;
	  if (compound_p || !flag_rounding_math || (neg0 && neg1))
	    {
	      if (neg0)
		op0 = XEXP (op0, 0);
	      if (neg1)
		op1 = XEXP (op1, 0);
	    }

	  if (compound_p)
	    /* FMADD/FNMADD/FNMSUB/FMSUB.  */
	    cost += extra_cost->fp[mode == DFmode].fma;
	  else
	    cost += extra_cost->fp[mode == DFmode].mult;
	}

      cost += rtx_cost (op0, mode, MULT, 0, speed);
      cost += rtx_cost (op1, mode, MULT, 1, speed);
      return cost;
    }
}
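/* For example, (plus (mult (reg x1) (const_int 4)) (reg x2)) reaches this
   function with OUTER == PLUS; the multiply is treated as a shift by 2 and,
   when the shift is not considered cheap, is costed as an ADD with a shifted
   operand via extra_cost->alu.arith_shift.  */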
static int
aarch64_address_cost (rtx x,
		      machine_mode mode,
		      addr_space_t as ATTRIBUTE_UNUSED,
		      bool speed)
{
  enum rtx_code c = GET_CODE (x);
  const struct cpu_addrcost_table *addr_cost = aarch64_tune_params.addr_cost;
  struct aarch64_address_info info;
  int cost = 0;

  if (!aarch64_classify_address (&info, x, mode, false))
    {
      if (GET_CODE (x) == CONST || SYMBOL_REF_P (x))
	{
	  /* This is a CONST or SYMBOL ref which will be split
	     in a different way depending on the code model in use.
	     Cost it through the generic infrastructure.  */
	  int cost_symbol_ref = rtx_cost (x, Pmode, MEM, 1, speed);
	  /* Divide through by the cost of one instruction to
	     bring it to the same units as the address costs.  */
	  cost_symbol_ref /= COSTS_N_INSNS (1);
	  /* The cost is then the cost of preparing the address,
	     followed by an immediate (possibly 0) offset.  */
	  return cost_symbol_ref + addr_cost->imm_offset;
	}
      else
	{
	  /* This is most likely a jump table from a case
	     statement.  */
	  return addr_cost->register_offset;
	}
    }

  switch (info.type)
    {
      case ADDRESS_LO_SUM:
      case ADDRESS_SYMBOLIC:
      case ADDRESS_REG_IMM:
	cost += addr_cost->imm_offset;
	break;

      case ADDRESS_REG_WB:
	if (c == PRE_INC || c == PRE_DEC || c == PRE_MODIFY)
	  cost += addr_cost->pre_modify;
	else if (c == POST_INC || c == POST_DEC || c == POST_MODIFY)
	  {
	    unsigned int nvectors = aarch64_ldn_stn_vectors (mode);
	    if (nvectors == 3)
	      cost += addr_cost->post_modify_ld3_st3;
	    else if (nvectors == 4)
	      cost += addr_cost->post_modify_ld4_st4;
	    else
	      cost += addr_cost->post_modify;
	  }
	else
	  gcc_unreachable ();
	break;

      case ADDRESS_REG_REG:
	cost += addr_cost->register_offset;
	break;

      case ADDRESS_REG_SXTW:
	cost += addr_cost->register_sextend;
	break;

      case ADDRESS_REG_UXTW:
	cost += addr_cost->register_zextend;
	break;

      default:
	gcc_unreachable ();
    }

  if (info.shift > 0)
    {
      /* For the sake of calculating the cost of the shifted register
	 component, we can treat same sized modes in the same way.  */
      if (known_eq (GET_MODE_BITSIZE (mode), 16))
	cost += addr_cost->addr_scale_costs.hi;
      else if (known_eq (GET_MODE_BITSIZE (mode), 32))
	cost += addr_cost->addr_scale_costs.si;
      else if (known_eq (GET_MODE_BITSIZE (mode), 64))
	cost += addr_cost->addr_scale_costs.di;
      else
	/* We can't tell, or this is a 128-bit vector.  */
	cost += addr_cost->addr_scale_costs.ti;
    }

  return cost;
}
/* Return the cost of a branch.  If SPEED_P is true then the compiler is
   optimizing for speed.  If PREDICTABLE_P is true then the branch is
   predicted to be taken.  */

int
aarch64_branch_cost (bool speed_p, bool predictable_p)
{
  /* When optimizing for speed, use the cost of unpredictable branches.  */
  const struct cpu_branch_cost *branch_costs =
    aarch64_tune_params.branch_costs;

  if (!speed_p || predictable_p)
    return branch_costs->predictable;
  else
    return branch_costs->unpredictable;
}
/* Return true if X is a zero or sign extract
   usable in an ADD or SUB (extended register) instruction.  */
static bool
aarch64_rtx_arith_op_extract_p (rtx x)
{
  /* The simple case <ARITH>, XD, XN, XM, [us]xt.
     No shift.  */
  if (GET_CODE (x) == SIGN_EXTEND
      || GET_CODE (x) == ZERO_EXTEND)
    return REG_P (XEXP (x, 0));

  return false;
}
static bool
aarch64_frint_unspec_p (unsigned int u)
{
  switch (u)
    {
      case UNSPEC_FRINTZ:
      case UNSPEC_FRINTP:
      case UNSPEC_FRINTM:
      case UNSPEC_FRINTA:
      case UNSPEC_FRINTN:
      case UNSPEC_FRINTX:
      case UNSPEC_FRINTI:
	return true;

      default:
	return false;
    }
}
/* Return true iff X is an rtx that will match an extr instruction
   i.e. as described in the *extr<mode>5_insn family of patterns.
   OP0 and OP1 will be set to the operands of the shifts involved
   on success and will be NULL_RTX otherwise.  */

static bool
aarch64_extr_rtx_p (rtx x, rtx *res_op0, rtx *res_op1)
{
  rtx op0, op1;
  scalar_int_mode mode;
  if (!is_a <scalar_int_mode> (GET_MODE (x), &mode))
    return false;

  *res_op0 = NULL_RTX;
  *res_op1 = NULL_RTX;

  if (GET_CODE (x) != IOR)
    return false;

  op0 = XEXP (x, 0);
  op1 = XEXP (x, 1);

  if ((GET_CODE (op0) == ASHIFT && GET_CODE (op1) == LSHIFTRT)
      || (GET_CODE (op1) == ASHIFT && GET_CODE (op0) == LSHIFTRT))
    {
      /* Canonicalise locally to ashift in op0, lshiftrt in op1.  */
      if (GET_CODE (op1) == ASHIFT)
	std::swap (op0, op1);

      if (!CONST_INT_P (XEXP (op0, 1)) || !CONST_INT_P (XEXP (op1, 1)))
	return false;

      unsigned HOST_WIDE_INT shft_amnt_0 = UINTVAL (XEXP (op0, 1));
      unsigned HOST_WIDE_INT shft_amnt_1 = UINTVAL (XEXP (op1, 1));

      if (shft_amnt_0 < GET_MODE_BITSIZE (mode)
	  && shft_amnt_0 + shft_amnt_1 == GET_MODE_BITSIZE (mode))
	{
	  *res_op0 = XEXP (op0, 0);
	  *res_op1 = XEXP (op1, 0);
	  return true;
	}
    }

  return false;
}
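/* For example, (ior:DI (ashift:DI (reg x0) (const_int 48))
   (lshiftrt:DI (reg x1) (const_int 16))) matches: the shift amounts sum to
   64, so *RES_OP0 is set to x0, *RES_OP1 to x1, and the expression can be
   emitted as a single EXTR.  */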
14146 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
14147 storing it in *COST. Result is true if the total cost of the operation
14148 has now been calculated. */
14150 aarch64_if_then_else_costs (rtx op0
, rtx op1
, rtx op2
, int *cost
, bool speed
)
14154 enum rtx_code cmpcode
;
14155 const struct cpu_cost_table
*extra_cost
14156 = aarch64_tune_params
.insn_extra_cost
;
14158 if (COMPARISON_P (op0
))
14160 inner
= XEXP (op0
, 0);
14161 comparator
= XEXP (op0
, 1);
14162 cmpcode
= GET_CODE (op0
);
14167 comparator
= const0_rtx
;
14171 if (GET_CODE (op1
) == PC
|| GET_CODE (op2
) == PC
)
14173 /* Conditional branch. */
14174 if (GET_MODE_CLASS (GET_MODE (inner
)) == MODE_CC
)
14178 if (cmpcode
== NE
|| cmpcode
== EQ
)
14180 if (comparator
== const0_rtx
)
14182 /* TBZ/TBNZ/CBZ/CBNZ. */
14183 if (GET_CODE (inner
) == ZERO_EXTRACT
)
14185 *cost
+= rtx_cost (XEXP (inner
, 0), VOIDmode
,
14186 ZERO_EXTRACT
, 0, speed
);
14189 *cost
+= rtx_cost (inner
, VOIDmode
, cmpcode
, 0, speed
);
14193 if (register_operand (inner
, VOIDmode
)
14194 && aarch64_imm24 (comparator
, VOIDmode
))
14196 /* SUB and SUBS. */
14197 *cost
+= COSTS_N_INSNS (2);
14199 *cost
+= extra_cost
->alu
.arith
* 2;
14203 else if (cmpcode
== LT
|| cmpcode
== GE
)
14206 if (comparator
== const0_rtx
)
14211 else if (GET_MODE_CLASS (GET_MODE (inner
)) == MODE_CC
)
14214 if (GET_CODE (op1
) == COMPARE
)
14216 /* Increase cost of CCMP reg, 0, imm, CC to prefer CMP reg, 0. */
14217 if (XEXP (op1
, 1) == const0_rtx
)
14221 machine_mode mode
= GET_MODE (XEXP (op1
, 0));
14223 if (GET_MODE_CLASS (mode
) == MODE_INT
)
14224 *cost
+= extra_cost
->alu
.arith
;
14226 *cost
+= extra_cost
->fp
[mode
== DFmode
].compare
;
14231 /* It's a conditional operation based on the status flags,
14232 so it must be some flavor of CSEL. */
14234 /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL. */
14235 if (GET_CODE (op1
) == NEG
14236 || GET_CODE (op1
) == NOT
14237 || (GET_CODE (op1
) == PLUS
&& XEXP (op1
, 1) == const1_rtx
))
14238 op1
= XEXP (op1
, 0);
14239 else if (GET_CODE (op1
) == ZERO_EXTEND
&& GET_CODE (op2
) == ZERO_EXTEND
)
14241 /* CSEL with zero-extension (*cmovdi_insn_uxtw). */
14242 op1
= XEXP (op1
, 0);
14243 op2
= XEXP (op2
, 0);
14245 else if (GET_CODE (op1
) == ZERO_EXTEND
&& op2
== const0_rtx
)
14247 inner
= XEXP (op1
, 0);
14248 if (GET_CODE (inner
) == NEG
|| GET_CODE (inner
) == NOT
)
14249 /* CSINV/NEG with zero extend + const 0 (*csinv3_uxtw_insn3). */
14250 op1
= XEXP (inner
, 0);
14252 else if (op1
== constm1_rtx
|| op1
== const1_rtx
)
14254 /* Use CSINV or CSINC. */
14255 *cost
+= rtx_cost (op2
, VOIDmode
, IF_THEN_ELSE
, 2, speed
);
14258 else if (op2
== constm1_rtx
|| op2
== const1_rtx
)
14260 /* Use CSINV or CSINC. */
14261 *cost
+= rtx_cost (op1
, VOIDmode
, IF_THEN_ELSE
, 1, speed
);
14265 *cost
+= rtx_cost (op1
, VOIDmode
, IF_THEN_ELSE
, 1, speed
);
14266 *cost
+= rtx_cost (op2
, VOIDmode
, IF_THEN_ELSE
, 2, speed
);
14270 /* We don't know what this is, cost all operands. */
/* Check whether X is a bitfield operation of the form shift + extend that
   maps down to a UBFIZ/SBFIZ/UBFX/SBFX instruction.  If so, return the
   operand to which the bitfield operation is applied.  Otherwise return
   NULL_RTX.  */

static rtx
aarch64_extend_bitfield_pattern_p (rtx x)
{
  rtx_code outer_code = GET_CODE (x);
  machine_mode outer_mode = GET_MODE (x);

  if (outer_code != ZERO_EXTEND && outer_code != SIGN_EXTEND
      && outer_mode != SImode && outer_mode != DImode)
    return NULL_RTX;

  rtx inner = XEXP (x, 0);
  rtx_code inner_code = GET_CODE (inner);
  machine_mode inner_mode = GET_MODE (inner);
  rtx op = NULL_RTX;

  switch (inner_code)
    {
      case ASHIFT:
	if (CONST_INT_P (XEXP (inner, 1))
	    && (inner_mode == QImode || inner_mode == HImode))
	  op = XEXP (inner, 0);
	break;
      case LSHIFTRT:
	if (outer_code == ZERO_EXTEND && CONST_INT_P (XEXP (inner, 1))
	    && (inner_mode == QImode || inner_mode == HImode))
	  op = XEXP (inner, 0);
	break;
      case ASHIFTRT:
	if (outer_code == SIGN_EXTEND && CONST_INT_P (XEXP (inner, 1))
	    && (inner_mode == QImode || inner_mode == HImode))
	  op = XEXP (inner, 0);
	break;
      default:
	break;
    }

  return op;
}
/* Return true if the mask and a shift amount from an RTX of the form
   (x << SHFT_AMNT) & MASK are valid to combine into a UBFIZ instruction of
   mode MODE.  See the *andim_ashift<mode>_bfiz pattern.  */

bool
aarch64_mask_and_shift_for_ubfiz_p (scalar_int_mode mode, rtx mask,
				    rtx shft_amnt)
{
  return CONST_INT_P (mask) && CONST_INT_P (shft_amnt)
	 && INTVAL (mask) > 0
	 && UINTVAL (shft_amnt) < GET_MODE_BITSIZE (mode)
	 && exact_log2 ((UINTVAL (mask) >> UINTVAL (shft_amnt)) + 1) >= 0
	 && (INTVAL (mask)
	     & ((HOST_WIDE_INT_1U << UINTVAL (shft_amnt)) - 1)) == 0;
}
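/* For example, in SImode a MASK of 0xf0 with SHFT_AMNT 4 is accepted:
   0xf0 >> 4 is 0xf, a contiguous field, and no mask bit lies below the
   shift amount, so (x << 4) & 0xf0 can become UBFIZ w0, w0, 4, 4.  */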
/* Return true if the masks and a shift amount from an RTX of the form
   ((x & MASK1) | ((y << SHIFT_AMNT) & MASK2)) are valid to combine into
   a BFI instruction of mode MODE.  See *arch64_bfi patterns.  */

bool
aarch64_masks_and_shift_for_bfi_p (scalar_int_mode mode,
				   unsigned HOST_WIDE_INT mask1,
				   unsigned HOST_WIDE_INT shft_amnt,
				   unsigned HOST_WIDE_INT mask2)
{
  unsigned HOST_WIDE_INT t;

  /* Verify that there is no overlap in what bits are set in the two masks.  */
  if (mask1 != ~mask2)
    return false;

  /* Verify that mask2 is not all zeros or ones.  */
  if (mask2 == 0 || mask2 == HOST_WIDE_INT_M1U)
    return false;

  /* The shift amount should always be less than the mode size.  */
  gcc_assert (shft_amnt < GET_MODE_BITSIZE (mode));

  /* Verify that the mask being shifted is contiguous and would be in the
     least significant bits after shifting by shft_amnt.  */
  t = mask2 + (HOST_WIDE_INT_1U << shft_amnt);
  return (t == (t & -t));
}
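/* For example, with MASK2 == 0xf0, SHFT_AMNT == 4 and MASK1 == ~0xf0 the
   test succeeds: the masks do not overlap and 0xf0 + (1 << 4) == 0x100 is a
   power of two, so the shifted field is contiguous and starts at bit
   SHFT_AMNT, which is exactly the shape BFI expects.  */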
/* Return true if X is an RTX representing an operation in the ABD family
   of instructions.  */

static bool
aarch64_abd_rtx_p (rtx x)
{
  if (GET_CODE (x) != MINUS)
    return false;
  rtx max_arm = XEXP (x, 0);
  rtx min_arm = XEXP (x, 1);
  if (GET_CODE (max_arm) != SMAX && GET_CODE (max_arm) != UMAX)
    return false;
  bool signed_p = GET_CODE (max_arm) == SMAX;
  if (signed_p && GET_CODE (min_arm) != SMIN)
    return false;
  else if (!signed_p && GET_CODE (min_arm) != UMIN)
    return false;

  rtx maxop0 = XEXP (max_arm, 0);
  rtx maxop1 = XEXP (max_arm, 1);
  rtx minop0 = XEXP (min_arm, 0);
  rtx minop1 = XEXP (min_arm, 1);
  return rtx_equal_p (maxop0, minop0) && rtx_equal_p (maxop1, minop1);
}
14388 /* Calculate the cost of calculating X, storing it in *COST. Result
14389 is true if the total cost of the operation has now been calculated. */
14391 aarch64_rtx_costs (rtx x
, machine_mode mode
, int outer ATTRIBUTE_UNUSED
,
14392 int param ATTRIBUTE_UNUSED
, int *cost
, bool speed
)
14395 const struct cpu_cost_table
*extra_cost
14396 = aarch64_tune_params
.insn_extra_cost
;
14397 rtx_code code
= GET_CODE (x
);
14398 scalar_int_mode int_mode
;
14400 /* By default, assume that everything has equivalent cost to the
14401 cheapest instruction. Any additional costs are applied as a delta
14402 above this default. */
14403 *cost
= COSTS_N_INSNS (1);
14408 /* The cost depends entirely on the operands to SET. */
14410 op0
= SET_DEST (x
);
14413 switch (GET_CODE (op0
))
14418 rtx address
= XEXP (op0
, 0);
14419 if (VECTOR_MODE_P (mode
))
14420 *cost
+= extra_cost
->ldst
.storev
;
14421 else if (GET_MODE_CLASS (mode
) == MODE_INT
)
14422 *cost
+= extra_cost
->ldst
.store
;
14423 else if (mode
== SFmode
|| mode
== SDmode
)
14424 *cost
+= extra_cost
->ldst
.storef
;
14425 else if (mode
== DFmode
|| mode
== DDmode
)
14426 *cost
+= extra_cost
->ldst
.stored
;
14429 COSTS_N_INSNS (aarch64_address_cost (address
, mode
,
14433 *cost
+= rtx_cost (op1
, mode
, SET
, 1, speed
);
14437 if (! REG_P (SUBREG_REG (op0
)))
14438 *cost
+= rtx_cost (SUBREG_REG (op0
), VOIDmode
, SET
, 0, speed
);
14440 /* Fall through. */
14442 /* The cost is one per vector-register copied. */
14443 if (VECTOR_MODE_P (GET_MODE (op0
)) && REG_P (op1
))
14445 int nregs
= aarch64_hard_regno_nregs (V0_REGNUM
, GET_MODE (op0
));
14446 *cost
= COSTS_N_INSNS (nregs
);
14448 /* const0_rtx is in general free, but we will use an
14449 instruction to set a register to 0. */
14450 else if (REG_P (op1
) || op1
== const0_rtx
)
14452 /* The cost is 1 per register copied. */
14453 int nregs
= aarch64_hard_regno_nregs (R0_REGNUM
, GET_MODE (op0
));
14454 *cost
= COSTS_N_INSNS (nregs
);
14457 /* Cost is just the cost of the RHS of the set. */
14458 *cost
+= rtx_cost (op1
, mode
, SET
, 1, speed
);
14463 /* Bit-field insertion. Strip any redundant widening of
14464 the RHS to meet the width of the target. */
14465 if (SUBREG_P (op1
))
14466 op1
= SUBREG_REG (op1
);
14467 if ((GET_CODE (op1
) == ZERO_EXTEND
14468 || GET_CODE (op1
) == SIGN_EXTEND
)
14469 && CONST_INT_P (XEXP (op0
, 1))
14470 && is_a
<scalar_int_mode
> (GET_MODE (XEXP (op1
, 0)), &int_mode
)
14471 && GET_MODE_BITSIZE (int_mode
) >= INTVAL (XEXP (op0
, 1)))
14472 op1
= XEXP (op1
, 0);
14474 if (CONST_INT_P (op1
))
14476 /* MOV immediate is assumed to always be cheap. */
14477 *cost
= COSTS_N_INSNS (1);
14483 *cost
+= extra_cost
->alu
.bfi
;
14484 *cost
+= rtx_cost (op1
, VOIDmode
, code
, 1, speed
);
14490 /* We can't make sense of this, assume default cost. */
14491 *cost
= COSTS_N_INSNS (1);
14497 /* If an instruction can incorporate a constant within the
14498 instruction, the instruction's expression avoids calling
14499 rtx_cost() on the constant. If rtx_cost() is called on a
14500 constant, then it is usually because the constant must be
14501 moved into a register by one or more instructions.
14503 The exception is constant 0, which can be expressed
14504 as XZR/WZR and is therefore free. The exception to this is
14505 if we have (set (reg) (const0_rtx)) in which case we must cost
14506 the move. However, we can catch that when we cost the SET, so
14507 we don't need to consider that here. */
14508 if (x
== const0_rtx
)
14512 /* To an approximation, building any other constant is
14513 proportionally expensive to the number of instructions
14514 required to build that constant. This is true whether we
14515 are compiling for SPEED or otherwise. */
14516 machine_mode imode
= known_le (GET_MODE_SIZE (mode
), 4)
14518 *cost
= COSTS_N_INSNS (aarch64_internal_mov_immediate
14519 (NULL_RTX
, x
, false, imode
));
14525 /* First determine number of instructions to do the move
14526 as an integer constant. */
14527 if (!aarch64_float_const_representable_p (x
)
14528 && !aarch64_can_const_movi_rtx_p (x
, mode
)
14529 && aarch64_float_const_rtx_p (x
))
14531 unsigned HOST_WIDE_INT ival
;
14532 bool succeed
= aarch64_reinterpret_float_as_int (x
, &ival
);
14533 gcc_assert (succeed
);
14535 machine_mode imode
= known_eq (GET_MODE_SIZE (mode
), 8)
14537 int ncost
= aarch64_internal_mov_immediate
14538 (NULL_RTX
, gen_int_mode (ival
, imode
), false, imode
);
14539 *cost
+= COSTS_N_INSNS (ncost
);
14545 /* mov[df,sf]_aarch64. */
14546 if (aarch64_float_const_representable_p (x
))
14547 /* FMOV (scalar immediate). */
14548 *cost
+= extra_cost
->fp
[mode
== DFmode
|| mode
== DDmode
].fpconst
;
14549 else if (!aarch64_float_const_zero_rtx_p (x
))
14551 /* This will be a load from memory. */
14552 if (mode
== DFmode
|| mode
== DDmode
)
14553 *cost
+= extra_cost
->ldst
.loadd
;
14555 *cost
+= extra_cost
->ldst
.loadf
;
14558 /* Otherwise this is +0.0. We get this using MOVI d0, #0
14559 or MOV v0.s[0], wzr - neither of which are modeled by the
14560 cost tables. Just use the default cost. */
14570 /* For loads we want the base cost of a load, plus an
14571 approximation for the additional cost of the addressing
14573 rtx address
= XEXP (x
, 0);
14574 if (VECTOR_MODE_P (mode
))
14575 *cost
+= extra_cost
->ldst
.loadv
;
14576 else if (GET_MODE_CLASS (mode
) == MODE_INT
)
14577 *cost
+= extra_cost
->ldst
.load
;
14578 else if (mode
== SFmode
|| mode
== SDmode
)
14579 *cost
+= extra_cost
->ldst
.loadf
;
14580 else if (mode
== DFmode
|| mode
== DDmode
)
14581 *cost
+= extra_cost
->ldst
.loadd
;
14584 COSTS_N_INSNS (aarch64_address_cost (address
, mode
,
14593 if (VECTOR_MODE_P (mode
))
14595 /* Many vector comparison operations are represented as NEG
14596 of a comparison. */
14597 if (COMPARISON_P (op0
))
14599 rtx op00
= XEXP (op0
, 0);
14600 rtx op01
= XEXP (op0
, 1);
14601 machine_mode inner_mode
= GET_MODE (op00
);
14603 if (GET_MODE_CLASS (inner_mode
) == MODE_VECTOR_FLOAT
14604 && GET_CODE (op00
) == ABS
14605 && GET_CODE (op01
) == ABS
)
14607 op00
= XEXP (op00
, 0);
14608 op01
= XEXP (op01
, 0);
14610 *cost
+= rtx_cost (op00
, inner_mode
, GET_CODE (op0
), 0, speed
);
14611 *cost
+= rtx_cost (op01
, inner_mode
, GET_CODE (op0
), 1, speed
);
14613 *cost
+= extra_cost
->vect
.alu
;
14619 *cost
+= extra_cost
->vect
.alu
;
14624 if (GET_MODE_CLASS (mode
) == MODE_INT
)
14626 if (GET_RTX_CLASS (GET_CODE (op0
)) == RTX_COMPARE
14627 || GET_RTX_CLASS (GET_CODE (op0
)) == RTX_COMM_COMPARE
)
14630 *cost
+= rtx_cost (XEXP (op0
, 0), VOIDmode
, NEG
, 0, speed
);
14634 /* Cost this as SUB wzr, X. */
14635 op0
= CONST0_RTX (mode
);
14640 if (GET_MODE_CLASS (mode
) == MODE_FLOAT
)
14642 /* Support (neg(fma...)) as a single instruction only if
14643 sign of zeros is unimportant. This matches the decision
14644 making in aarch64.md. */
14645 if (GET_CODE (op0
) == FMA
&& !HONOR_SIGNED_ZEROS (GET_MODE (op0
)))
14648 *cost
= rtx_cost (op0
, mode
, NEG
, 0, speed
);
14651 if (GET_CODE (op0
) == MULT
)
14654 *cost
= rtx_cost (op0
, mode
, NEG
, 0, speed
);
14659 *cost
+= extra_cost
->fp
[mode
== DFmode
].neg
;
14669 if (VECTOR_MODE_P (mode
))
14670 *cost
+= extra_cost
->vect
.alu
;
14672 *cost
+= extra_cost
->alu
.clz
;
14678 if (VECTOR_MODE_P (mode
))
14680 *cost
= COSTS_N_INSNS (3);
14682 *cost
+= extra_cost
->vect
.alu
* 3;
14684 else if (TARGET_CSSC
)
14686 *cost
= COSTS_N_INSNS (1);
14688 *cost
+= extra_cost
->alu
.clz
;
14692 *cost
= COSTS_N_INSNS (2);
14694 *cost
+= extra_cost
->alu
.clz
+ extra_cost
->alu
.rev
;
14702 if (op1
== const0_rtx
14703 && GET_CODE (op0
) == AND
)
14706 mode
= GET_MODE (op0
);
14710 if (GET_MODE_CLASS (GET_MODE (op0
)) == MODE_INT
)
14712 /* TODO: A write to the CC flags possibly costs extra, this
14713 needs encoding in the cost tables. */
14715 mode
= GET_MODE (op0
);
14717 if (GET_CODE (op0
) == AND
)
14723 if (GET_CODE (op0
) == PLUS
)
14725 /* ADDS (and CMN alias). */
14730 if (GET_CODE (op0
) == MINUS
)
14737 if (GET_CODE (op0
) == ZERO_EXTRACT
&& op1
== const0_rtx
14738 && GET_MODE (x
) == CC_NZmode
&& CONST_INT_P (XEXP (op0
, 1))
14739 && CONST_INT_P (XEXP (op0
, 2)))
14741 /* COMPARE of ZERO_EXTRACT form of TST-immediate.
14742 Handle it here directly rather than going to cost_logic
14743 since we know the immediate generated for the TST is valid
14744 so we can avoid creating an intermediate rtx for it only
14745 for costing purposes. */
14747 *cost
+= extra_cost
->alu
.logical
;
14749 *cost
+= rtx_cost (XEXP (op0
, 0), GET_MODE (op0
),
14750 ZERO_EXTRACT
, 0, speed
);
14754 if (GET_CODE (op1
) == NEG
)
14758 *cost
+= extra_cost
->alu
.arith
;
14760 *cost
+= rtx_cost (op0
, mode
, COMPARE
, 0, speed
);
14761 *cost
+= rtx_cost (XEXP (op1
, 0), mode
, NEG
, 1, speed
);
14767 Compare can freely swap the order of operands, and
14768 canonicalization puts the more complex operation first.
14769 But the integer MINUS logic expects the shift/extend
14770 operation in op1. */
14772 || (SUBREG_P (op0
) && REG_P (SUBREG_REG (op0
)))))
14780 if (GET_MODE_CLASS (GET_MODE (op0
)) == MODE_FLOAT
)
14784 *cost
+= extra_cost
->fp
[mode
== DFmode
].compare
;
14786 if (CONST_DOUBLE_P (op1
) && aarch64_float_const_zero_rtx_p (op1
))
14788 *cost
+= rtx_cost (op0
, VOIDmode
, COMPARE
, 0, speed
);
14789 /* FCMP supports constant 0.0 for no extra cost. */
14795 if (VECTOR_MODE_P (mode
))
14797 /* Vector compare. */
14799 *cost
+= extra_cost
->vect
.alu
;
14801 if (aarch64_float_const_zero_rtx_p (op1
))
14803 /* Vector cm (eq|ge|gt|lt|le) supports constant 0.0 for no extra
14817 if (VECTOR_MODE_P (mode
))
14819 unsigned int vec_flags
= aarch64_classify_vector_mode (mode
);
14820 if (TARGET_SIMD
&& (vec_flags
& VEC_ADVSIMD
))
14822 /* Recognise the SABD and UABD operation here.
14823 Recursion from the PLUS case will catch the accumulating
14825 if (aarch64_abd_rtx_p (x
))
14828 *cost
+= extra_cost
->vect
.alu
;
14831 /* SUBL2 and SUBW2.
14832 The select-operand-high-half versions of the sub instruction
14833 have the same cost as the regular three vector version -
14834 don't add the costs of the select into the costs of the sub.
14836 op0
= aarch64_strip_extend_vec_half (op0
);
14837 op1
= aarch64_strip_extend_vec_half (op1
);
14841 *cost
+= rtx_cost (op0
, mode
, MINUS
, 0, speed
);
14843 /* Detect valid immediates. */
14844 if ((GET_MODE_CLASS (mode
) == MODE_INT
14845 || (GET_MODE_CLASS (mode
) == MODE_CC
14846 && GET_MODE_CLASS (GET_MODE (op0
)) == MODE_INT
))
14847 && CONST_INT_P (op1
)
14848 && aarch64_uimm12_shift (INTVAL (op1
)))
14851 /* SUB(S) (immediate). */
14852 *cost
+= extra_cost
->alu
.arith
;
14856 /* Look for SUB (extended register). */
14857 if (is_a
<scalar_int_mode
> (mode
)
14858 && aarch64_rtx_arith_op_extract_p (op1
))
14861 *cost
+= extra_cost
->alu
.extend_arith
;
14863 op1
= aarch64_strip_extend (op1
, true);
14864 *cost
+= rtx_cost (op1
, VOIDmode
, GET_CODE (op1
), 0, speed
);
14868 rtx new_op1
= aarch64_strip_extend (op1
, false);
14870 /* Cost this as an FMA-alike operation. */
14871 if ((GET_CODE (new_op1
) == MULT
14872 || aarch64_shift_p (GET_CODE (new_op1
)))
14873 && code
!= COMPARE
)
14875 *cost
+= aarch64_rtx_mult_cost (new_op1
, MULT
, code
, speed
);
14879 *cost
+= rtx_cost (new_op1
, VOIDmode
, MINUS
, 1, speed
);
14883 if (VECTOR_MODE_P (mode
))
14886 *cost
+= extra_cost
->vect
.alu
;
14888 else if (GET_MODE_CLASS (mode
) == MODE_INT
)
14891 *cost
+= extra_cost
->alu
.arith
;
14893 else if (GET_MODE_CLASS (mode
) == MODE_FLOAT
)
14896 *cost
+= extra_cost
->fp
[mode
== DFmode
].addsub
;
14910 if (VECTOR_MODE_P (mode
))
14912 /* ADDL2 and ADDW2. */
14913 unsigned int vec_flags
= aarch64_classify_vector_mode (mode
);
14914 if (TARGET_SIMD
&& (vec_flags
& VEC_ADVSIMD
))
14916 /* The select-operand-high-half versions of the add instruction
14917 have the same cost as the regular three vector version -
14918 don't add the costs of the select into the costs of the add.
14920 op0
= aarch64_strip_extend_vec_half (op0
);
14921 op1
= aarch64_strip_extend_vec_half (op1
);
14925 if (GET_RTX_CLASS (GET_CODE (op0
)) == RTX_COMPARE
14926 || GET_RTX_CLASS (GET_CODE (op0
)) == RTX_COMM_COMPARE
)
14929 *cost
+= rtx_cost (XEXP (op0
, 0), mode
, PLUS
, 0, speed
);
14930 *cost
+= rtx_cost (op1
, mode
, PLUS
, 1, speed
);
14934 if (GET_MODE_CLASS (mode
) == MODE_INT
14935 && (aarch64_plus_immediate (op1
, mode
)
14936 || aarch64_sve_addvl_addpl_immediate (op1
, mode
)))
14938 *cost
+= rtx_cost (op0
, mode
, PLUS
, 0, speed
);
14942 /* ADD (immediate). */
14943 *cost
+= extra_cost
->alu
.arith
;
14945 /* Some tunings prefer to not use the VL-based scalar ops.
14946 Increase the cost of the poly immediate to prevent their
14948 if (GET_CODE (op1
) == CONST_POLY_INT
14949 && (aarch64_tune_params
.extra_tuning_flags
14950 & AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS
))
14951 *cost
+= COSTS_N_INSNS (1);
14956 if (aarch64_pluslong_immediate (op1
, mode
))
14958 /* 24-bit add in 2 instructions or 12-bit shifted add. */
14959 if ((INTVAL (op1
) & 0xfff) != 0)
14960 *cost
+= COSTS_N_INSNS (1);
14962 *cost
+= rtx_cost (op0
, mode
, PLUS
, 0, speed
);
14966 *cost
+= rtx_cost (op1
, mode
, PLUS
, 1, speed
);
14968 /* Look for ADD (extended register). */
14969 if (is_a
<scalar_int_mode
> (mode
)
14970 && aarch64_rtx_arith_op_extract_p (op0
))
14973 *cost
+= extra_cost
->alu
.extend_arith
;
14975 op0
= aarch64_strip_extend (op0
, true);
14976 *cost
+= rtx_cost (op0
, VOIDmode
, GET_CODE (op0
), 0, speed
);
14980 /* Strip any extend, leave shifts behind as we will
14981 cost them through mult_cost. */
14982 new_op0
= aarch64_strip_extend (op0
, false);
14984 if (GET_CODE (new_op0
) == MULT
14985 || aarch64_shift_p (GET_CODE (new_op0
)))
14987 *cost
+= aarch64_rtx_mult_cost (new_op0
, MULT
, PLUS
,
14992 *cost
+= rtx_cost (new_op0
, VOIDmode
, PLUS
, 0, speed
);
14996 if (VECTOR_MODE_P (mode
))
14999 *cost
+= extra_cost
->vect
.alu
;
15001 else if (GET_MODE_CLASS (mode
) == MODE_INT
)
15004 *cost
+= extra_cost
->alu
.arith
;
15006 else if (GET_MODE_CLASS (mode
) == MODE_FLOAT
)
15009 *cost
+= extra_cost
->fp
[mode
== DFmode
].addsub
;
15017 *cost
= COSTS_N_INSNS (1);
15021 if (VECTOR_MODE_P (mode
))
15022 *cost
+= extra_cost
->vect
.alu
;
15024 *cost
+= extra_cost
->alu
.rev
;
15029 if (aarch_rev16_p (x
))
15031 *cost
= COSTS_N_INSNS (1);
15035 if (VECTOR_MODE_P (mode
))
15036 *cost
+= extra_cost
->vect
.alu
;
15038 *cost
+= extra_cost
->alu
.rev
;
15043 if (aarch64_extr_rtx_p (x
, &op0
, &op1
))
15045 *cost
+= rtx_cost (op0
, mode
, IOR
, 0, speed
);
15046 *cost
+= rtx_cost (op1
, mode
, IOR
, 1, speed
);
15048 *cost
+= extra_cost
->alu
.shift
;
15052 /* Fall through. */
15059 if (VECTOR_MODE_P (mode
))
15062 *cost
+= extra_cost
->vect
.alu
;
15067 && GET_CODE (op0
) == MULT
15068 && CONST_INT_P (XEXP (op0
, 1))
15069 && CONST_INT_P (op1
)
15070 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0
, 1))),
15071 INTVAL (op1
)) != 0)
15073 /* This is a UBFM/SBFM. */
15074 *cost
+= rtx_cost (XEXP (op0
, 0), mode
, ZERO_EXTRACT
, 0, speed
);
15076 *cost
+= extra_cost
->alu
.bfx
;
15080 if (is_int_mode (mode
, &int_mode
))
15082 if (CONST_INT_P (op1
))
15084 /* We have a mask + shift version of a UBFIZ
15085 i.e. the *andim_ashift<mode>_bfiz pattern. */
15086 if (GET_CODE (op0
) == ASHIFT
15087 && aarch64_mask_and_shift_for_ubfiz_p (int_mode
, op1
,
15090 *cost
+= rtx_cost (XEXP (op0
, 0), int_mode
, code
, 0, speed
);
15092 *cost
+= extra_cost
->alu
.bfx
;
15096 else if (aarch64_bitmask_imm (INTVAL (op1
), int_mode
))
15098 /* We possibly get the immediate for free, this is not
15100 *cost
+= rtx_cost (op0
, int_mode
, code
, 0, speed
);
15102 *cost
+= extra_cost
->alu
.logical
;
15111 /* Handle ORN, EON, or BIC. */
15112 if (GET_CODE (op0
) == NOT
)
15113 op0
= XEXP (op0
, 0);
15115 new_op0
= aarch64_strip_shift (op0
);
15117 /* If we had a shift on op0 then this is a logical-shift-
15118 by-register/immediate operation. Otherwise, this is just
15119 a logical operation. */
15122 if (new_op0
!= op0
)
15124 /* Shift by immediate. */
15125 if (CONST_INT_P (XEXP (op0
, 1)))
15126 *cost
+= extra_cost
->alu
.log_shift
;
15128 *cost
+= extra_cost
->alu
.log_shift_reg
;
15131 *cost
+= extra_cost
->alu
.logical
;
15134 /* In both cases we want to cost both operands. */
15135 *cost
+= rtx_cost (new_op0
, int_mode
, code
, 0, speed
);
15136 *cost
+= rtx_cost (op1
, int_mode
, code
, 1, speed
);
15145 op0
= aarch64_strip_shift (x
);
15147 if (VECTOR_MODE_P (mode
))
15150 *cost
+= extra_cost
->vect
.alu
;
15154 /* MVN-shifted-reg. */
15157 *cost
+= rtx_cost (op0
, mode
, code
, 0, speed
);
15160 *cost
+= extra_cost
->alu
.log_shift
;
15164 /* EON can have two forms: (xor (not a) b) but also (not (xor a b)).
15165 Handle the second form here taking care that 'a' in the above can
15167 else if (GET_CODE (op0
) == XOR
)
15169 rtx newop0
= XEXP (op0
, 0);
15170 rtx newop1
= XEXP (op0
, 1);
15171 rtx op0_stripped
= aarch64_strip_shift (newop0
);
15173 *cost
+= rtx_cost (newop1
, mode
, code
, 1, speed
);
15174 *cost
+= rtx_cost (op0_stripped
, mode
, XOR
, 0, speed
);
15178 if (op0_stripped
!= newop0
)
15179 *cost
+= extra_cost
->alu
.log_shift
;
15181 *cost
+= extra_cost
->alu
.logical
;
15188 *cost
+= extra_cost
->alu
.logical
;
15195 /* If a value is written in SI mode, then zero extended to DI
15196 mode, the operation will in general be free as a write to
15197 a 'w' register implicitly zeroes the upper bits of an 'x'
15198 register. However, if this is
15200 (set (reg) (zero_extend (reg)))
15202 we must cost the explicit register move. */
15204 && GET_MODE (op0
) == SImode
)
15206 int op_cost
= rtx_cost (op0
, VOIDmode
, ZERO_EXTEND
, 0, speed
);
15208 /* If OP_COST is non-zero, then the cost of the zero extend
15209 is effectively the cost of the inner operation. Otherwise
15210 we have a MOV instruction and we take the cost from the MOV
15211 itself. This is true independently of whether we are
15212 optimizing for space or time. */
15218 else if (MEM_P (op0
))
15220 /* All loads can zero extend to any size for free. */
15221 *cost
= rtx_cost (op0
, VOIDmode
, ZERO_EXTEND
, param
, speed
);
15225 op0
= aarch64_extend_bitfield_pattern_p (x
);
15228 *cost
+= rtx_cost (op0
, mode
, ZERO_EXTEND
, 0, speed
);
15230 *cost
+= extra_cost
->alu
.bfx
;
15236 if (VECTOR_MODE_P (mode
))
15239 *cost
+= extra_cost
->vect
.alu
;
15243 /* We generate an AND instead of UXTB/UXTH. */
15244 *cost
+= extra_cost
->alu
.logical
;
15250 if (MEM_P (XEXP (x
, 0)))
15255 rtx address
= XEXP (XEXP (x
, 0), 0);
15256 *cost
+= extra_cost
->ldst
.load_sign_extend
;
15259 COSTS_N_INSNS (aarch64_address_cost (address
, mode
,
15265 op0
= aarch64_extend_bitfield_pattern_p (x
);
15268 *cost
+= rtx_cost (op0
, mode
, SIGN_EXTEND
, 0, speed
);
15270 *cost
+= extra_cost
->alu
.bfx
;
15276 if (VECTOR_MODE_P (mode
))
15277 *cost
+= extra_cost
->vect
.alu
;
15279 *cost
+= extra_cost
->alu
.extend
;
15291 if (CONST_INT_P (op1
))
15295 if (VECTOR_MODE_P (mode
))
15297 /* Vector shift (immediate). */
15298 *cost
+= extra_cost
->vect
.alu
;
15302 /* LSL (immediate), ASR (immediate), UBMF, UBFIZ and friends.
15303 These are all aliases. */
15304 *cost
+= extra_cost
->alu
.shift
;
15308 /* We can incorporate zero/sign extend for free. */
15309 if (GET_CODE (op0
) == ZERO_EXTEND
15310 || GET_CODE (op0
) == SIGN_EXTEND
)
15311 op0
= XEXP (op0
, 0);
15313 *cost
+= rtx_cost (op0
, VOIDmode
, ASHIFT
, 0, speed
);
15318 if (VECTOR_MODE_P (mode
))
15321 /* Vector shift (register). */
15322 *cost
+= extra_cost
->vect
.alu
;
15328 *cost
+= extra_cost
->alu
.shift_reg
;
15330 /* The register shift amount may be in a shorter mode expressed
15331 as a lowpart SUBREG. For costing purposes just look inside. */
15332 if (SUBREG_P (op1
) && subreg_lowpart_p (op1
))
15333 op1
= SUBREG_REG (op1
);
15334 if (GET_CODE (op1
) == AND
&& REG_P (XEXP (op1
, 0))
15335 && CONST_INT_P (XEXP (op1
, 1))
15336 && known_eq (INTVAL (XEXP (op1
, 1)),
15337 GET_MODE_BITSIZE (mode
) - 1))
15339 *cost
+= rtx_cost (op0
, mode
, code
, 0, speed
);
15340 /* We already demanded XEXP (op1, 0) to be REG_P, so
15341 don't recurse into it. */
15345 return false; /* All arguments need to be in registers. */
15350 if (aarch64_cmodel
== AARCH64_CMODEL_LARGE
15351 || aarch64_cmodel
== AARCH64_CMODEL_SMALL_SPIC
)
15355 *cost
+= extra_cost
->ldst
.load
;
15357 else if (aarch64_cmodel
== AARCH64_CMODEL_SMALL
15358 || aarch64_cmodel
== AARCH64_CMODEL_SMALL_PIC
)
15360 /* ADRP, followed by ADD. */
15361 *cost
+= COSTS_N_INSNS (1);
15363 *cost
+= 2 * extra_cost
->alu
.arith
;
15365 else if (aarch64_cmodel
== AARCH64_CMODEL_TINY
15366 || aarch64_cmodel
== AARCH64_CMODEL_TINY_PIC
)
15370 *cost
+= extra_cost
->alu
.arith
;
15375 /* One extra load instruction, after accessing the GOT. */
15376 *cost
+= COSTS_N_INSNS (1);
15378 *cost
+= extra_cost
->ldst
.load
;
15384 /* ADRP/ADD (immediate). */
15386 *cost
+= extra_cost
->alu
.arith
;
15394 if (VECTOR_MODE_P (mode
))
15395 *cost
+= extra_cost
->vect
.alu
;
15397 *cost
+= extra_cost
->alu
.bfx
;
15400 /* We can trust that the immediates used will be correct (there
15401 are no by-register forms), so we need only cost op0. */
15402 *cost
+= rtx_cost (XEXP (x
, 0), VOIDmode
, code
, 0, speed
);
15406 *cost
+= aarch64_rtx_mult_cost (x
, MULT
, 0, speed
);
15407 /* aarch64_rtx_mult_cost always handles recursion to its
15412 /* We can expand signed mod by power of 2 using a NEGS, two parallel
15413 ANDs and a CSNEG. Assume here that CSNEG is the same as the cost of
15414 an unconditional negate. This case should only ever be reached through
15415 the set_smod_pow2_cheap check in expmed.cc. */
15416 if (CONST_INT_P (XEXP (x
, 1))
15417 && exact_log2 (INTVAL (XEXP (x
, 1))) > 0
15418 && (mode
== SImode
|| mode
== DImode
))
15420 /* We expand to 4 instructions. Reset the baseline. */
15421 *cost
= COSTS_N_INSNS (4);
15424 *cost
+= 2 * extra_cost
->alu
.logical
15425 + 2 * extra_cost
->alu
.arith
;
15430 /* Fall-through. */
15434 /* Slightly prefer UMOD over SMOD. */
15435 if (VECTOR_MODE_P (mode
))
15436 *cost
+= extra_cost
->vect
.alu
;
15437 else if (GET_MODE_CLASS (mode
) == MODE_INT
)
15438 *cost
+= (extra_cost
->mult
[mode
== DImode
].add
15439 + extra_cost
->mult
[mode
== DImode
].idiv
15440 + (code
== MOD
? 1 : 0));
15442 return false; /* All arguments need to be in registers. */
15449 if (VECTOR_MODE_P (mode
))
15450 *cost
+= extra_cost
->vect
.alu
;
15451 else if (GET_MODE_CLASS (mode
) == MODE_INT
)
15452 /* There is no integer SQRT, so only DIV and UDIV can get
15454 *cost
+= (extra_cost
->mult
[mode
== DImode
].idiv
15455 /* Slightly prefer UDIV over SDIV. */
15456 + (code
== DIV
? 1 : 0));
15458 *cost
+= extra_cost
->fp
[mode
== DFmode
].div
;
15460 return false; /* All arguments need to be in registers. */
15463 return aarch64_if_then_else_costs (XEXP (x
, 0), XEXP (x
, 1),
15464 XEXP (x
, 2), cost
, speed
);
15477 return false; /* All arguments must be in registers. */
15486 if (VECTOR_MODE_P (mode
))
15487 *cost
+= extra_cost
->vect
.alu
;
15489 *cost
+= extra_cost
->fp
[mode
== DFmode
].fma
;
15492 /* FMSUB, FNMADD, and FNMSUB are free. */
15493 if (GET_CODE (op0
) == NEG
)
15494 op0
= XEXP (op0
, 0);
15496 if (GET_CODE (op2
) == NEG
)
15497 op2
= XEXP (op2
, 0);
15499 /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
15500 and the by-element operand as operand 0. */
15501 if (GET_CODE (op1
) == NEG
)
15502 op1
= XEXP (op1
, 0);
15504 /* Catch vector-by-element operations. The by-element operand can
15505 either be (vec_duplicate (vec_select (x))) or just
15506 (vec_select (x)), depending on whether we are multiplying by
15507 a vector or a scalar.
15509 Canonicalization is not very good in these cases, FMA4 will put the
15510 by-element operand as operand 0, FNMA4 will have it as operand 1. */
15511 if (GET_CODE (op0
) == VEC_DUPLICATE
)
15512 op0
= XEXP (op0
, 0);
15513 else if (GET_CODE (op1
) == VEC_DUPLICATE
)
15514 op1
= XEXP (op1
, 0);
15516 if (GET_CODE (op0
) == VEC_SELECT
)
15517 op0
= XEXP (op0
, 0);
15518 else if (GET_CODE (op1
) == VEC_SELECT
)
15519 op1
= XEXP (op1
, 0);
15521 /* If the remaining parameters are not registers,
15522 get the cost to put them into registers. */
15523 *cost
+= rtx_cost (op0
, mode
, FMA
, 0, speed
);
15524 *cost
+= rtx_cost (op1
, mode
, FMA
, 1, speed
);
15525 *cost
+= rtx_cost (op2
, mode
, FMA
, 2, speed
);
15529 case UNSIGNED_FLOAT
:
15531 *cost
+= extra_cost
->fp
[mode
== DFmode
].fromint
;
15537 if (VECTOR_MODE_P (mode
))
15539 /*Vector truncate. */
15540 *cost
+= extra_cost
->vect
.alu
;
15543 *cost
+= extra_cost
->fp
[mode
== DFmode
].widen
;
15547 case FLOAT_TRUNCATE
:
15550 if (VECTOR_MODE_P (mode
))
15552 /*Vector conversion. */
15553 *cost
+= extra_cost
->vect
.alu
;
15556 *cost
+= extra_cost
->fp
[mode
== DFmode
].narrow
;
15563 /* Strip the rounding part. They will all be implemented
15564 by the fcvt* family of instructions anyway. */
15565 if (GET_CODE (x
) == UNSPEC
)
15567 unsigned int uns_code
= XINT (x
, 1);
15569 if (uns_code
== UNSPEC_FRINTA
15570 || uns_code
== UNSPEC_FRINTM
15571 || uns_code
== UNSPEC_FRINTN
15572 || uns_code
== UNSPEC_FRINTP
15573 || uns_code
== UNSPEC_FRINTZ
)
15574 x
= XVECEXP (x
, 0, 0);
15579 if (VECTOR_MODE_P (mode
))
15580 *cost
+= extra_cost
->vect
.alu
;
15582 *cost
+= extra_cost
->fp
[GET_MODE (x
) == DFmode
].toint
;
15585 /* We can combine fmul by a power of 2 followed by a fcvt into a single
15586 fixed-point fcvt. */
15587 if (GET_CODE (x
) == MULT
15588 && ((VECTOR_MODE_P (mode
)
15589 && aarch64_vec_fpconst_pow_of_2 (XEXP (x
, 1)) > 0)
15590 || aarch64_fpconst_pow_of_2 (XEXP (x
, 1)) > 0))
15592 *cost
+= rtx_cost (XEXP (x
, 0), VOIDmode
, code
, 0, speed
);
15596 *cost
+= rtx_cost (x
, VOIDmode
, code
, 0, speed
);
15600 if (VECTOR_MODE_P (mode
))
15602 /* ABS (vector). */
15604 *cost
+= extra_cost
->vect
.alu
;
15606 else if (GET_MODE_CLASS (mode
) == MODE_FLOAT
)
15610 /* FABD, which is analogous to FADD. */
15611 if (GET_CODE (op0
) == MINUS
)
15613 *cost
+= rtx_cost (XEXP (op0
, 0), mode
, MINUS
, 0, speed
);
15614 *cost
+= rtx_cost (XEXP (op0
, 1), mode
, MINUS
, 1, speed
);
15616 *cost
+= extra_cost
->fp
[mode
== DFmode
].addsub
;
15620 /* Simple FABS is analogous to FNEG. */
15622 *cost
+= extra_cost
->fp
[mode
== DFmode
].neg
;
15626 /* Integer ABS will either be split to
15627 two arithmetic instructions, or will be an ABS
15628 (scalar), which we don't model. */
15629 *cost
= COSTS_N_INSNS (2);
15631 *cost
+= 2 * extra_cost
->alu
.arith
;
15639 if (VECTOR_MODE_P (mode
))
15640 *cost
+= extra_cost
->vect
.alu
;
15643 /* FMAXNM/FMINNM/FMAX/FMIN.
15644 TODO: This may not be accurate for all implementations, but
15645 we do not model this in the cost tables. */
15646 *cost
+= extra_cost
->fp
[mode
== DFmode
].addsub
;
15652 /* The floating point round to integer frint* instructions. */
15653 if (aarch64_frint_unspec_p (XINT (x
, 1)))
15656 *cost
+= extra_cost
->fp
[mode
== DFmode
].roundint
;
15664 /* Decompose <su>muldi3_highpart. */
15665 if (/* (truncate:DI */
15668 && GET_MODE (XEXP (x
, 0)) == TImode
15669 && GET_CODE (XEXP (x
, 0)) == LSHIFTRT
15671 && GET_CODE (XEXP (XEXP (x
, 0), 0)) == MULT
15672 /* (ANY_EXTEND:TI (reg:DI))
15673 (ANY_EXTEND:TI (reg:DI))) */
15674 && ((GET_CODE (XEXP (XEXP (XEXP (x
, 0), 0), 0)) == ZERO_EXTEND
15675 && GET_CODE (XEXP (XEXP (XEXP (x
, 0), 0), 1)) == ZERO_EXTEND
)
15676 || (GET_CODE (XEXP (XEXP (XEXP (x
, 0), 0), 0)) == SIGN_EXTEND
15677 && GET_CODE (XEXP (XEXP (XEXP (x
, 0), 0), 1)) == SIGN_EXTEND
))
15678 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x
, 0), 0), 0), 0)) == DImode
15679 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x
, 0), 0), 1), 0)) == DImode
15680 /* (const_int 64) */
15681 && CONST_INT_P (XEXP (XEXP (x
, 0), 1))
15682 && UINTVAL (XEXP (XEXP (x
, 0), 1)) == 64)
15686 *cost
+= extra_cost
->mult
[mode
== DImode
].extend
;
15687 *cost
+= rtx_cost (XEXP (XEXP (XEXP (XEXP (x
, 0), 0), 0), 0),
15688 mode
, MULT
, 0, speed
);
15689 *cost
+= rtx_cost (XEXP (XEXP (XEXP (XEXP (x
, 0), 0), 1), 0),
15690 mode
, MULT
, 1, speed
);
15696 /* Load using MOVI/MVNI. */
15697 if (aarch64_simd_valid_mov_imm (x
))
15698 *cost
= extra_cost
->vect
.movi
;
15699 else /* Load using constant pool. */
15700 *cost
= extra_cost
->ldst
.load
;
15704 /* depending on the operation, either DUP or INS.
15705 For now, keep default costing. */
15707 case VEC_DUPLICATE
:
15708 /* Load using a DUP. */
15709 *cost
= extra_cost
->vect
.dup
;
15713 rtx op0
= XEXP (x
, 0);
15714 *cost
= rtx_cost (op0
, GET_MODE (op0
), VEC_SELECT
, 0, speed
);
15716 /* cost subreg of 0 as free, otherwise as DUP */
15717 rtx op1
= XEXP (x
, 1);
15718 if (vec_series_lowpart_p (mode
, GET_MODE (op1
), op1
))
15720 else if (vec_series_highpart_p (mode
, GET_MODE (op1
), op1
))
15721 *cost
= extra_cost
->vect
.dup
;
15723 *cost
= extra_cost
->vect
.extract
;
15731 && flag_aarch64_verbose_cost
)
15732 fprintf (dump_file
,
15733 "\nFailed to cost RTX. Assuming default cost.\n");
/* Wrapper around aarch64_rtx_costs, dumps the partial, or total cost
   calculated for X.  This cost is stored in *COST.  Returns true
   if the total cost of X was calculated.  */
static bool
aarch64_rtx_costs_wrapper (rtx x, machine_mode mode, int outer,
			   int param, int *cost, bool speed)
{
  bool result = aarch64_rtx_costs (x, mode, outer, param, cost, speed);

  if (dump_file
      && flag_aarch64_verbose_cost)
    {
      print_rtl_single (dump_file, x);
      fprintf (dump_file, "\n%s cost: %d (%s)\n",
	       speed ? "Hot" : "Cold",
	       *cost, result ? "final" : "partial");
    }

  return result;
}
static int
aarch64_register_move_cost (machine_mode mode,
			    reg_class_t from_i, reg_class_t to_i)
{
  enum reg_class from = (enum reg_class) from_i;
  enum reg_class to = (enum reg_class) to_i;
  const struct cpu_regmove_cost *regmove_cost
    = aarch64_tune_params.regmove_cost;

  /* Treat any subset of POINTER_REGS as though it were GENERAL_REGS.  */
  if (reg_class_subset_p (to, POINTER_REGS))
    to = GENERAL_REGS;

  if (reg_class_subset_p (from, POINTER_REGS))
    from = GENERAL_REGS;

  /* Make RDFFR very expensive.  In particular, if we know that the FFR
     contains a PTRUE (e.g. after a SETFFR), we must never use RDFFR
     as a way of obtaining a PTRUE.  */
  if (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL
      && hard_reg_set_subset_p (reg_class_contents[from_i],
				reg_class_contents[FFR_REGS]))
    return 80;

  /* Moves to/from sysregs are expensive, and must go via GPR.  */
  if (from == MOVEABLE_SYSREGS)
    return 80 + aarch64_register_move_cost (mode, GENERAL_REGS, to);
  if (to == MOVEABLE_SYSREGS)
    return 80 + aarch64_register_move_cost (mode, from, GENERAL_REGS);

  /* Moving between GPR and stack cost is the same as GP2GP.  */
  if ((from == GENERAL_REGS && to == STACK_REG)
      || (to == GENERAL_REGS && from == STACK_REG))
    return regmove_cost->GP2GP;

  /* To/From the stack register, we move via the gprs.  */
  if (to == STACK_REG || from == STACK_REG)
    return aarch64_register_move_cost (mode, from, GENERAL_REGS)
	   + aarch64_register_move_cost (mode, GENERAL_REGS, to);

  unsigned int vec_flags = aarch64_classify_vector_mode (mode);
  if (vec_flags != (VEC_ADVSIMD | VEC_STRUCT | VEC_PARTIAL)
      && known_eq (GET_MODE_SIZE (mode), 16))
    {
      /* 128-bit operations on general registers require 2 instructions.  */
      if (from == GENERAL_REGS && to == GENERAL_REGS)
	return regmove_cost->GP2GP * 2;
      else if (from == GENERAL_REGS)
	return regmove_cost->GP2FP * 2;
      else if (to == GENERAL_REGS)
	return regmove_cost->FP2GP * 2;

      /* When AdvSIMD instructions are disabled it is not possible to move
	 a 128-bit value directly between Q registers.  This is handled in
	 secondary reload.  A general register is used as a scratch to move
	 the upper DI value and the lower DI value is moved directly,
	 hence the cost is the sum of three moves.  */
      if (!TARGET_SIMD && !TARGET_SVE)
	return regmove_cost->GP2FP + regmove_cost->FP2GP
	       + regmove_cost->FP2FP;

      return regmove_cost->FP2FP;
    }

  if (from == GENERAL_REGS && to == GENERAL_REGS)
    return regmove_cost->GP2GP;
  else if (from == GENERAL_REGS)
    return regmove_cost->GP2FP;
  else if (to == GENERAL_REGS)
    return regmove_cost->FP2GP;

  if (!TARGET_SIMD && vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
    {
      /* Needs a round-trip through memory, which can use LDP/STP for pairs.
	 The cost must be greater than 2 units to indicate that direct
	 moves aren't possible.  */
      auto per_vector = (aarch64_tune_params.memmov_cost.load_fp
			 + aarch64_tune_params.memmov_cost.store_fp);
      return MIN (CEIL (per_vector, 2), 4);
    }

  return regmove_cost->FP2FP;
}
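
/* Worked example of the rules above (tuning numbers invented for
   illustration): with GP2FP = 5, FP2GP = 6 and FP2FP = 2, a 128-bit
   FPR-to-FPR move when !TARGET_SIMD && !TARGET_SVE is costed as
   5 + 6 + 2 = 13, matching the three-move secondary-reload sequence,
   while the same move with SIMD enabled costs just FP2FP = 2.  */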
/* Implements TARGET_MEMORY_MOVE_COST.  */
static int
aarch64_memory_move_cost (machine_mode mode, reg_class_t rclass_i, bool in)
{
  enum reg_class rclass = (enum reg_class) rclass_i;
  if (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL
      ? reg_classes_intersect_p (rclass, PR_REGS)
      : reg_class_subset_p (rclass, PR_REGS))
    return (in
	    ? aarch64_tune_params.memmov_cost.load_pred
	    : aarch64_tune_params.memmov_cost.store_pred);

  if (VECTOR_MODE_P (mode) || FLOAT_MODE_P (mode)
      ? reg_classes_intersect_p (rclass, FP_REGS)
      : reg_class_subset_p (rclass, FP_REGS))
    return (in
	    ? aarch64_tune_params.memmov_cost.load_fp
	    : aarch64_tune_params.memmov_cost.store_fp);

  /* If the move needs to go through GPRs, add the cost of doing that.  */
  int base = 0;
  if (rclass_i == MOVEABLE_SYSREGS)
    base += (in
	     ? aarch64_register_move_cost (DImode, GENERAL_REGS, rclass_i)
	     : aarch64_register_move_cost (DImode, rclass_i, GENERAL_REGS));

  return (in
	  ? base + aarch64_tune_params.memmov_cost.load_int
	  : base + aarch64_tune_params.memmov_cost.store_int);
}
/* Implement TARGET_INSN_COST.  We have the opportunity to do something
   much more productive here, such as using insn attributes to cost things.
   But we don't, not yet.

   The main point of this current definition is to make calling insn_cost
   on one instruction equivalent to calling seq_cost on a sequence that
   contains only that instruction.  The default definition would instead
   only look at SET_SRCs, ignoring SET_DESTs.

   This ensures that, for example, storing a 128-bit zero vector is more
   expensive than storing a 128-bit vector register.  A move of zero
   into a 128-bit vector register followed by multiple stores of that
   register is then cheaper than multiple stores of zero (which would
   use STP of XZR).  This in turn allows STP Qs to be formed.  */
static int
aarch64_insn_cost (rtx_insn *insn, bool speed)
{
  if (rtx set = single_set (insn))
    return set_rtx_cost (set, speed);
  return pattern_cost (PATTERN (insn), speed);
}
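
/* Illustrative consequence of the definition above (hypothetical RTL, not
   taken from any particular dump): for

     (set (mem:V4SI ...) (const_vector:V4SI [0 0 0 0]))

   set_rtx_cost charges the SET_DEST as well as the SET_SRC, so storing an
   immediate zero vector costs more than

     (set (mem:V4SI ...) (reg:V4SI 32 v0))

   which is what encourages a single zeroing move plus register stores,
   and hence STP Q formation.  */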
/* Implement TARGET_INIT_BUILTINS.  */
static void
aarch64_init_builtins ()
{
  aarch64_general_init_builtins ();
  aarch64_sve::init_builtins ();
#ifdef SUBTARGET_INIT_BUILTINS
  SUBTARGET_INIT_BUILTINS;
#endif
}
/* Implement TARGET_FOLD_BUILTIN.  */
static tree
aarch64_fold_builtin (tree fndecl, int nargs, tree *args, bool)
{
  unsigned int code = DECL_MD_FUNCTION_CODE (fndecl);
  unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
  tree type = TREE_TYPE (TREE_TYPE (fndecl));
  switch (code & AARCH64_BUILTIN_CLASS)
    {
    case AARCH64_BUILTIN_GENERAL:
      return aarch64_general_fold_builtin (subcode, type, nargs, args);

    case AARCH64_BUILTIN_SVE:
      return NULL_TREE;
    }
  gcc_unreachable ();
}
/* Implement TARGET_GIMPLE_FOLD_BUILTIN.  */
static bool
aarch64_gimple_fold_builtin (gimple_stmt_iterator *gsi)
{
  gcall *stmt = as_a <gcall *> (gsi_stmt (*gsi));
  tree fndecl = gimple_call_fndecl (stmt);
  unsigned int code = DECL_MD_FUNCTION_CODE (fndecl);
  unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
  gimple *new_stmt = NULL;
  switch (code & AARCH64_BUILTIN_CLASS)
    {
    case AARCH64_BUILTIN_GENERAL:
      new_stmt = aarch64_general_gimple_fold_builtin (subcode, stmt, gsi);
      break;

    case AARCH64_BUILTIN_SVE:
      new_stmt = aarch64_sve::gimple_fold_builtin (subcode, gsi, stmt);
      break;
    }

  if (!new_stmt)
    return false;

  gsi_replace (gsi, new_stmt, false);
  return true;
}
/* Implement TARGET_EXPAND_BUILTIN.  */
static rtx
aarch64_expand_builtin (tree exp, rtx target, rtx, machine_mode, int ignore)
{
  tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
  unsigned int code = DECL_MD_FUNCTION_CODE (fndecl);
  unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
  switch (code & AARCH64_BUILTIN_CLASS)
    {
    case AARCH64_BUILTIN_GENERAL:
      return aarch64_general_expand_builtin (subcode, exp, target, ignore);

    case AARCH64_BUILTIN_SVE:
      return aarch64_sve::expand_builtin (subcode, exp, target);
    }
  gcc_unreachable ();
}
/* Implement TARGET_BUILTIN_DECL.  */
static tree
aarch64_builtin_decl (unsigned int code, bool initialize_p)
{
  unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
  switch (code & AARCH64_BUILTIN_CLASS)
    {
    case AARCH64_BUILTIN_GENERAL:
      return aarch64_general_builtin_decl (subcode, initialize_p);

    case AARCH64_BUILTIN_SVE:
      return aarch64_sve::builtin_decl (subcode, initialize_p);
    }
  gcc_unreachable ();
}
/* Return true if it is safe and beneficial to use the approximate rsqrt optabs
   to optimize 1.0/sqrt.  */

static bool
use_rsqrt_p (machine_mode mode)
{
  return (!flag_trapping_math
	  && flag_unsafe_math_optimizations
	  && ((aarch64_tune_params.approx_modes->recip_sqrt
	       & AARCH64_APPROX_MODE (mode))
	      || flag_mrecip_low_precision_sqrt));
}

/* Function to decide when to use the approximate reciprocal square root
   builtin.  */

static tree
aarch64_builtin_reciprocal (tree fndecl)
{
  machine_mode mode = TYPE_MODE (TREE_TYPE (fndecl));

  if (!use_rsqrt_p (mode))
    return NULL_TREE;
  unsigned int code = DECL_MD_FUNCTION_CODE (fndecl);
  unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
  switch (code & AARCH64_BUILTIN_CLASS)
    {
    case AARCH64_BUILTIN_GENERAL:
      return aarch64_general_builtin_rsqrt (subcode);

    case AARCH64_BUILTIN_SVE:
      return NULL_TREE;
    }
  gcc_unreachable ();
}
/* Emit code to perform the floating-point operation:

     DST = SRC1 * SRC2

   where all three operands are already known to be registers.
   If the operation is an SVE one, PTRUE is a suitable all-true
   predicate.  */

static void
aarch64_emit_mult (rtx dst, rtx ptrue, rtx src1, rtx src2)
{
  if (ptrue)
    emit_insn (gen_aarch64_pred (UNSPEC_COND_FMUL, GET_MODE (dst),
				 dst, ptrue, src1, src2,
				 gen_int_mode (SVE_RELAXED_GP, SImode)));
  else
    emit_set_insn (dst, gen_rtx_MULT (GET_MODE (dst), src1, src2));
}
/* Emit instruction sequence to compute either the approximate square root
   or its approximate reciprocal, depending on the flag RECP, and return
   whether the sequence was emitted or not.  */

bool
aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp)
{
  machine_mode mode = GET_MODE (dst);

  if (GET_MODE_INNER (mode) == HFmode)
    {
      gcc_assert (!recp);
      return false;
    }

  if (!recp)
    {
      if (!(flag_mlow_precision_sqrt
	    || (aarch64_tune_params.approx_modes->sqrt
		& AARCH64_APPROX_MODE (mode))))
	return false;

      if (!flag_finite_math_only
	  || flag_trapping_math
	  || !flag_unsafe_math_optimizations
	  || optimize_function_for_size_p (cfun))
	return false;
    }
  else
    /* Caller assumes we cannot fail.  */
    gcc_assert (use_rsqrt_p (mode));

  rtx pg = NULL_RTX;
  if (aarch64_sve_mode_p (mode))
    pg = aarch64_ptrue_reg (aarch64_sve_pred_mode (mode));
  machine_mode mmsk = (VECTOR_MODE_P (mode)
		       ? related_int_vector_mode (mode).require ()
		       : int_mode_for_mode (mode).require ());
  rtx xmsk = NULL_RTX;
  if (!recp)
    {
      /* When calculating the approximate square root, compare the
	 argument with 0.0 and create a mask.  */
      rtx zero = CONST0_RTX (mode);
      if (pg)
	{
	  xmsk = gen_reg_rtx (GET_MODE (pg));
	  rtx hint = gen_int_mode (SVE_KNOWN_PTRUE, SImode);
	  emit_insn (gen_aarch64_pred_fcm (UNSPEC_COND_FCMNE, mode,
					   xmsk, pg, hint, src, zero));
	}
      else
	{
	  xmsk = gen_reg_rtx (mmsk);
	  emit_insn (gen_rtx_SET (xmsk,
				  gen_rtx_NEG (mmsk,
					       gen_rtx_EQ (mmsk, src, zero))));
	}
    }

  /* Estimate the approximate reciprocal square root.  */
  rtx xdst = gen_reg_rtx (mode);
  emit_insn (gen_aarch64_rsqrte (mode, xdst, src));

  /* Iterate over the series twice for SF and thrice for DF.  */
  int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;

  /* Optionally iterate over the series once less for faster performance
     while sacrificing the accuracy.  */
  if ((recp && flag_mrecip_low_precision_sqrt)
      || (!recp && flag_mlow_precision_sqrt))
    iterations--;

  /* Iterate over the series to calculate the approximate reciprocal square
     root.  */
  rtx x1 = gen_reg_rtx (mode);
  while (iterations--)
    {
      rtx x2 = gen_reg_rtx (mode);
      aarch64_emit_mult (x2, pg, xdst, xdst);

      emit_insn (gen_aarch64_rsqrts (mode, x1, src, x2));

      if (iterations > 0)
	aarch64_emit_mult (xdst, pg, xdst, x1);
    }

  if (!recp)
    {
      if (pg)
	/* Multiply nonzero source values by the corresponding intermediate
	   result elements, so that the final calculation is the approximate
	   square root rather than its reciprocal.  Select a zero result for
	   zero source values, to avoid the Inf * 0 -> NaN that we'd get
	   otherwise.  */
	emit_insn (gen_cond (UNSPEC_COND_FMUL, mode,
			     xdst, xmsk, xdst, src, CONST0_RTX (mode)));
      else
	{
	  /* Qualify the approximate reciprocal square root when the
	     argument is 0.0 by squashing the intermediary result to 0.0.  */
	  rtx xtmp = gen_reg_rtx (mmsk);
	  emit_set_insn (xtmp, gen_rtx_AND (mmsk, gen_rtx_NOT (mmsk, xmsk),
					    gen_rtx_SUBREG (mmsk, xdst, 0)));
	  emit_move_insn (xdst, gen_rtx_SUBREG (mode, xtmp, 0));

	  /* Calculate the approximate square root.  */
	  aarch64_emit_mult (xdst, pg, xdst, src);
	}
    }

  /* Finalize the approximation.  */
  aarch64_emit_mult (dst, pg, xdst, x1);

  return true;
}
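
/* Note on the refinement loop above: with the initial estimate produced by
   FRSQRTE, each FRSQRTS step computes e = (3 - src * x * x) / 2 and the
   next estimate is x * e, i.e. the standard Newton-Raphson iteration for
   1/sqrt(src).  Two iterations are used for SFmode and three for DFmode,
   one fewer with -mlow-precision-sqrt or -mrecip-low-precision-sqrt; the
   achievable accuracy depends on the FRSQRTE/FRSQRTS implementations.  */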
/* Emit the instruction sequence to compute the approximation for the division
   of NUM by DEN in QUO and return whether the sequence was emitted or not.  */

bool
aarch64_emit_approx_div (rtx quo, rtx num, rtx den)
{
  machine_mode mode = GET_MODE (quo);

  if (GET_MODE_INNER (mode) == HFmode)
    return false;

  bool use_approx_division_p = (flag_mlow_precision_div
				|| (aarch64_tune_params.approx_modes->division
				    & AARCH64_APPROX_MODE (mode)));

  if (!flag_finite_math_only
      || flag_trapping_math
      || !flag_unsafe_math_optimizations
      || optimize_function_for_size_p (cfun)
      || !use_approx_division_p)
    return false;

  if (!TARGET_SIMD && VECTOR_MODE_P (mode))
    return false;

  rtx pg = NULL_RTX;
  if (aarch64_sve_mode_p (mode))
    pg = aarch64_ptrue_reg (aarch64_sve_pred_mode (mode));

  /* Estimate the approximate reciprocal.  */
  rtx xrcp = gen_reg_rtx (mode);
  emit_insn (gen_aarch64_frecpe (mode, xrcp, den));

  /* Iterate over the series twice for SF and thrice for DF.  */
  int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;

  /* Optionally iterate over the series less for faster performance,
     while sacrificing the accuracy.  The default is 2 for DF and 1 for SF.  */
  if (flag_mlow_precision_div)
    iterations = (GET_MODE_INNER (mode) == DFmode
		  ? aarch64_double_recp_precision
		  : aarch64_float_recp_precision);

  /* Iterate over the series to calculate the approximate reciprocal.  */
  rtx xtmp = gen_reg_rtx (mode);
  while (iterations--)
    {
      emit_insn (gen_aarch64_frecps (mode, xtmp, xrcp, den));

      if (iterations > 0)
	aarch64_emit_mult (xrcp, pg, xrcp, xtmp);
    }

  if (num != CONST1_RTX (mode))
    {
      /* As the approximate reciprocal of DEN is already calculated, only
	 calculate the approximate division when NUM is not 1.0.  */
      rtx xnum = force_reg (mode, num);
      aarch64_emit_mult (xrcp, pg, xrcp, xnum);
    }

  /* Finalize the approximation.  */
  aarch64_emit_mult (quo, pg, xrcp, xtmp);
  return true;
}
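
/* Likewise for the division approximation: with the initial estimate
   produced by FRECPE, each FRECPS step computes e = 2 - den * r and the
   next estimate is r * e (Newton-Raphson for 1/den); the quotient is then
   obtained as num * r.  The default iteration counts (2 for SFmode, 3 for
   DFmode) mirror aarch64_emit_approx_sqrt.  This note is explanatory
   only.  */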
/* Emit an optimized sequence to perform a vector rotate
   of REG by the vector constant amount AMNT_VEC and place the result
   in DST.  Return true iff successful.  */

bool
aarch64_emit_opt_vec_rotate (rtx dst, rtx reg, rtx amnt_vec)
{
  rtx amnt = unwrap_const_vec_duplicate (amnt_vec);
  gcc_assert (CONST_INT_P (amnt));
  HOST_WIDE_INT rotamnt = UINTVAL (amnt);
  machine_mode mode = GET_MODE (reg);
  /* Don't end up here after reload.  */
  gcc_assert (can_create_pseudo_p ());
  /* Rotates by half the element width map down to REV* instructions and should
     always be preferred when possible.  */
  if (rotamnt == GET_MODE_UNIT_BITSIZE (mode) / 2
      && expand_rotate_as_vec_perm (mode, dst, reg, amnt))
    return true;
  /* 64 and 128-bit vector modes can use the XAR instruction
     when available.  */
  else if ((TARGET_SHA3 && mode == V2DImode)
	   || (TARGET_SVE2
	       && (known_eq (GET_MODE_SIZE (mode), 8)
		   || known_eq (GET_MODE_SIZE (mode), 16))))
    {
      rtx zeroes = aarch64_gen_shareable_zero (mode);
      rtx xar_op
	= gen_rtx_ROTATE (mode, gen_rtx_XOR (mode, reg, zeroes),
			  amnt_vec);
      emit_set_insn (dst, xar_op);
      return true;
    }
  /* If none of the above, try to expand rotates by any byte amount as
     permutes.  */
  else if (expand_rotate_as_vec_perm (mode, dst, reg, amnt))
    return true;
  return false;
}
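
/* Example of the XAR form generated above (illustrative RTL): a rotate of
   a V2DI value REG by 32 with TARGET_SHA3 becomes

     (set (reg:V2DI DST)
	  (rotate:V2DI (xor:V2DI (reg:V2DI REG) (const_vector:V2DI [0 0]))
		       (const_vector:V2DI [32 32])))

   i.e. an XAR with a zero XOR operand, whereas a rotate by half the
   element width is expanded as a REV-style permute instead.  */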
/* Return the number of instructions that can be issued per cycle.  */
static int
aarch64_sched_issue_rate (void)
{
  return aarch64_tune_params.issue_rate;
}

/* Implement TARGET_SCHED_VARIABLE_ISSUE.  */
static int
aarch64_sched_variable_issue (FILE *, int, rtx_insn *insn, int more)
{
  if (DEBUG_INSN_P (insn))
    return more;

  rtx_code code = GET_CODE (PATTERN (insn));
  if (code == USE || code == CLOBBER)
    return more;

  if (get_attr_type (insn) == TYPE_NO_INSN)
    return more;

  return more - 1;
}

static int
aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
{
  int issue_rate = aarch64_sched_issue_rate ();

  return issue_rate > 1 && !sched_fusion ? issue_rate : 0;
}


/* Implement TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD as
   autopref_multipass_dfa_lookahead_guard from haifa-sched.cc.  It only
   has an effect if PARAM_SCHED_AUTOPREF_QUEUE_DEPTH > 0.  */

static int
aarch64_first_cycle_multipass_dfa_lookahead_guard (rtx_insn *insn,
						   int ready_index)
{
  return autopref_multipass_dfa_lookahead_guard (insn, ready_index);
}
/* Vectorizer cost model target hooks.  */

/* If a vld1 from address ADDR should be recorded in vector_load_decls,
   return the decl that should be recorded.  Return null otherwise.  */
tree
aarch64_vector_load_decl (tree addr)
{
  if (TREE_CODE (addr) != ADDR_EXPR)
    return NULL_TREE;
  tree base = get_base_address (TREE_OPERAND (addr, 0));
  if (TREE_CODE (base) != VAR_DECL)
    return NULL_TREE;

  return base;
}

/* Return true if STMT_INFO accesses a decl that is known to be the
   argument to a vld1 in the same function.  */
static bool
aarch64_accesses_vector_load_decl_p (stmt_vec_info stmt_info)
{
  if (!cfun->machine->vector_load_decls)
    return false;
  auto dr = STMT_VINFO_DATA_REF (stmt_info);
  if (!dr)
    return false;
  tree decl = aarch64_vector_load_decl (DR_BASE_ADDRESS (dr));
  return decl && cfun->machine->vector_load_decls->contains (decl);
}
/* Information about how the CPU would issue the scalar, Advanced SIMD
   or SVE version of a vector loop, using the scheme defined by the
   aarch64_base_vec_issue_info hierarchy of structures.  */
class aarch64_vec_op_count
{
public:
  aarch64_vec_op_count () = default;
  aarch64_vec_op_count (const aarch64_vec_issue_info *, unsigned int,
			unsigned int = 1);

  unsigned int vec_flags () const { return m_vec_flags; }
  unsigned int vf_factor () const { return m_vf_factor; }

  const aarch64_base_vec_issue_info *base_issue_info () const;
  const aarch64_simd_vec_issue_info *simd_issue_info () const;
  const aarch64_sve_vec_issue_info *sve_issue_info () const;

  fractional_cost rename_cycles_per_iter () const;
  fractional_cost min_nonpred_cycles_per_iter () const;
  fractional_cost min_pred_cycles_per_iter () const;
  fractional_cost min_cycles_per_iter () const;

  void dump () const;

  /* The number of individual "general" operations.  See the comments
     in aarch64_base_vec_issue_info for details.  */
  unsigned int general_ops = 0;

  /* The number of load and store operations, under the same scheme
     as above.  */
  unsigned int loads = 0;
  unsigned int stores = 0;

  /* The minimum number of cycles needed to execute all loop-carried
     operations, which in the vector code become associated with
     reductions.  */
  unsigned int reduction_latency = 0;

  /* The number of individual predicate operations.  See the comments
     in aarch64_sve_vec_issue_info for details.  */
  unsigned int pred_ops = 0;

private:
  /* The issue information for the core.  */
  const aarch64_vec_issue_info *m_issue_info = nullptr;

  /* - If M_VEC_FLAGS is zero then this structure describes scalar code
     - If M_VEC_FLAGS & VEC_ADVSIMD is nonzero then this structure describes
       Advanced SIMD code.
     - If M_VEC_FLAGS & VEC_ANY_SVE is nonzero then this structure describes
       SVE code.  */
  unsigned int m_vec_flags = 0;

  /* Assume that, when the code is executing on the core described
     by M_ISSUE_INFO, one iteration of the loop will handle M_VF_FACTOR
     times more data than the vectorizer anticipates.

     This is only ever different from 1 for SVE.  It allows us to consider
     what would happen on a 256-bit SVE target even when the -mtune
     parameters say that the “likely” SVE length is 128 bits.  */
  unsigned int m_vf_factor = 1;
};

aarch64_vec_op_count::
aarch64_vec_op_count (const aarch64_vec_issue_info *issue_info,
		      unsigned int vec_flags, unsigned int vf_factor)
  : m_issue_info (issue_info),
    m_vec_flags (vec_flags),
    m_vf_factor (vf_factor)
{
}
/* Return the base issue information (i.e. the parts that make sense
   for both scalar and vector code).  Return null if we have no issue
   information.  */
const aarch64_base_vec_issue_info *
aarch64_vec_op_count::base_issue_info () const
{
  if (auto *ret = simd_issue_info ())
    return ret;
  return m_issue_info->scalar;
}

/* If the structure describes vector code and we have associated issue
   information, return that issue information, otherwise return null.  */
const aarch64_simd_vec_issue_info *
aarch64_vec_op_count::simd_issue_info () const
{
  if (auto *ret = sve_issue_info ())
    return ret;
  if (m_vec_flags)
    return m_issue_info->advsimd;
  return nullptr;
}

/* If the structure describes SVE code and we have associated issue
   information, return that issue information, otherwise return null.  */
const aarch64_sve_vec_issue_info *
aarch64_vec_op_count::sve_issue_info () const
{
  if (m_vec_flags & VEC_ANY_SVE)
    return m_issue_info->sve;
  return nullptr;
}
/* Estimate the minimum number of cycles per iteration needed to rename
   the instructions.

   ??? For now this is done inline rather than via cost tables, since it
   isn't clear how it should be parameterized for the general case.  */
fractional_cost
aarch64_vec_op_count::rename_cycles_per_iter () const
{
  if (sve_issue_info () == &neoverse512tvb_sve_issue_info
      || sve_issue_info () == &neoversen2_sve_issue_info
      || sve_issue_info () == &neoversev2_sve_issue_info)
    /* + 1 for an addition.  We've already counted a general op for each
       store, so we don't need to account for stores separately.  The branch
       reads no registers and so does not need to be counted either.

       ??? This value is very much on the pessimistic side, but seems to work
       pretty well in practice.  */
    return { general_ops + loads + pred_ops + 1, 5 };

  return 0;
}

/* Like min_cycles_per_iter, but excluding predicate operations.  */
fractional_cost
aarch64_vec_op_count::min_nonpred_cycles_per_iter () const
{
  auto *issue_info = base_issue_info ();

  fractional_cost cycles = MAX (reduction_latency, 1);
  cycles = std::max (cycles, { stores, issue_info->stores_per_cycle });
  cycles = std::max (cycles, { loads + stores,
			       issue_info->loads_stores_per_cycle });
  cycles = std::max (cycles, { general_ops,
			       issue_info->general_ops_per_cycle });
  cycles = std::max (cycles, rename_cycles_per_iter ());
  return cycles;
}

/* Like min_cycles_per_iter, but including only the predicate operations.  */
fractional_cost
aarch64_vec_op_count::min_pred_cycles_per_iter () const
{
  if (auto *issue_info = sve_issue_info ())
    return { pred_ops, issue_info->pred_ops_per_cycle };
  return 0;
}

/* Estimate the minimum number of cycles needed to issue the operations.
   This is a very simplistic model!  */
fractional_cost
aarch64_vec_op_count::min_cycles_per_iter () const
{
  return std::max (min_nonpred_cycles_per_iter (),
		   min_pred_cycles_per_iter ());
}
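
/* Illustrative use of the model above, with invented numbers: a loop body
   with loads = 2, stores = 1, general_ops = 6 and no reduction, on a core
   that can issue 2 loads/stores per cycle, 2 stores per cycle and 4
   general ops per cycle, gives

     min_nonpred_cycles_per_iter
       = max (1, 1/2, 3/2, 6/4, rename_cycles_per_iter ()) = 3/2

   and min_cycles_per_iter additionally applies the predicate-pipe limit
   for SVE code.  */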
/* Dump information about the structure.  */
void
aarch64_vec_op_count::dump () const
{
  dump_printf_loc (MSG_NOTE, vect_location,
		   "  load operations = %d\n", loads);
  dump_printf_loc (MSG_NOTE, vect_location,
		   "  store operations = %d\n", stores);
  dump_printf_loc (MSG_NOTE, vect_location,
		   "  general operations = %d\n", general_ops);
  if (sve_issue_info ())
    dump_printf_loc (MSG_NOTE, vect_location,
		     "  predicate operations = %d\n", pred_ops);
  dump_printf_loc (MSG_NOTE, vect_location,
		   "  reduction latency = %d\n", reduction_latency);
  if (auto rcpi = rename_cycles_per_iter ())
    dump_printf_loc (MSG_NOTE, vect_location,
		     "  estimated cycles per iteration to rename = %f\n",
		     rcpi.as_double ());
  if (auto pred_cpi = min_pred_cycles_per_iter ())
    {
      dump_printf_loc (MSG_NOTE, vect_location,
		       "  estimated min cycles per iteration"
		       " without predication = %f\n",
		       min_nonpred_cycles_per_iter ().as_double ());
      dump_printf_loc (MSG_NOTE, vect_location,
		       "  estimated min cycles per iteration"
		       " for predication = %f\n", pred_cpi.as_double ());
    }
  if (auto cpi = min_cycles_per_iter ())
    dump_printf_loc (MSG_NOTE, vect_location,
		     "  estimated min cycles per iteration = %f\n",
		     cpi.as_double ());
}
/* Information about vector code that we're in the process of costing.  */
class aarch64_vector_costs : public vector_costs
{
public:
  aarch64_vector_costs (vec_info *, bool);

  unsigned int add_stmt_cost (int count, vect_cost_for_stmt kind,
			      stmt_vec_info stmt_info, slp_tree, tree vectype,
			      int misalign,
			      vect_cost_model_location where) override;
  void finish_cost (const vector_costs *) override;
  bool better_main_loop_than_p (const vector_costs *other) const override;

private:
  void record_potential_advsimd_unrolling (loop_vec_info);
  void analyze_loop_vinfo (loop_vec_info);
  void count_ops (unsigned int, vect_cost_for_stmt, stmt_vec_info, slp_tree,
		  aarch64_vec_op_count *);
  fractional_cost adjust_body_cost_sve (const aarch64_vec_op_count *,
					fractional_cost, unsigned int,
					unsigned int *, bool *);
  unsigned int adjust_body_cost (loop_vec_info, const aarch64_vector_costs *,
				 unsigned int);
  bool prefer_unrolled_loop () const;
  unsigned int determine_suggested_unroll_factor ();

  /* True if we have performed one-time initialization based on the
     vec_info.  */
  bool m_analyzed_vinfo = false;

  /* This loop uses an average operation that is not supported by SVE, but is
     supported by Advanced SIMD and SVE2.  */
  bool m_has_avg = false;

  /* Additional initialization costs for using gather or scatter operation in
     the current loop.  */
  unsigned int m_sve_gather_scatter_init_cost = 0;

  /* True if the vector body contains a store to a decl and if the
     function is known to have a vld1 from the same decl.

     In the Advanced SIMD ACLE, the recommended endian-agnostic way of
     initializing a vector is:

       float f[4] = { elts };
       float32x4_t x = vld1q_f32(f);

     We should strongly prefer vectorization of the initialization of f,
     so that the store to f and the load back can be optimized away,
     leaving a vectorization of { elts }.  */
  bool m_stores_to_vector_load_decl = false;

  /* Non-zero if the last operation we costed is a vector promotion or demotion.
     In this case the value is the number of insns in the last operation.

     On AArch64 vector promotion and demotions require us to first widen or
     narrow the input and only after that emit conversion instructions.  For
     costing this means we need to emit the cost of the final conversions as
     well.  */
  unsigned int m_num_last_promote_demote = 0;

  /* - If M_VEC_FLAGS is zero then we're costing the original scalar code.
     - If M_VEC_FLAGS & VEC_ADVSIMD is nonzero then we're costing Advanced
       SIMD code.
     - If M_VEC_FLAGS & VEC_ANY_SVE is nonzero then we're costing SVE code.  */
  unsigned int m_vec_flags = 0;

  /* At the moment, we do not model LDP and STP in the vector and scalar costs.
     This means that code such as:

	a[0] = x;
	a[1] = x;

     will be costed as two scalar instructions and two vector instructions
     (a scalar_to_vec and an unaligned_store).  For SLP, the vector form
     wins if the costs are equal, because of the fact that the vector costs
     include constant initializations whereas the scalar costs don't.
     We would therefore tend to vectorize the code above, even though
     the scalar version can use a single STP.

     We should eventually fix this and model LDP and STP in the main costs;
     see the comment in aarch64_sve_adjust_stmt_cost for some of the problems.
     Until then, we look specifically for code that does nothing more than
     STP-like operations.  We cost them on that basis in addition to the
     normal latency-based costs.

     If the scalar or vector code could be a sequence of STPs +
     initialization, this variable counts the cost of the sequence,
     with 2 units per instruction.  The variable is ~0U for other
     kinds of code.  */
  unsigned int m_stp_sequence_cost = 0;

  /* On some CPUs, SVE and Advanced SIMD provide the same theoretical vector
     throughput, such as 4x128 Advanced SIMD vs. 2x256 SVE.  In those
     situations, we try to predict whether an Advanced SIMD implementation
     of the loop could be completely unrolled and become straight-line code.
     If so, it is generally better to use the Advanced SIMD version rather
     than length-agnostic SVE, since the SVE loop would execute an unknown
     number of times and so could not be completely unrolled in the same way.

     If we're applying this heuristic, M_UNROLLED_ADVSIMD_NITERS is the
     number of Advanced SIMD loop iterations that would be unrolled and
     M_UNROLLED_ADVSIMD_STMTS estimates the total number of statements
     in the unrolled loop.  Both values are zero if we're not applying
     the heuristic.  */
  unsigned HOST_WIDE_INT m_unrolled_advsimd_niters = 0;
  unsigned HOST_WIDE_INT m_unrolled_advsimd_stmts = 0;

  /* If we're vectorizing a loop that executes a constant number of times,
     this variable gives the number of times that the vector loop would
     iterate, otherwise it is zero.  */
  uint64_t m_num_vector_iterations = 0;

  /* Used only when vectorizing loops.  Estimates the number and kind of
     operations that would be needed by one iteration of the scalar
     or vector loop.  There is one entry for each tuning option of
     interest.  */
  auto_vec<aarch64_vec_op_count, 2> m_ops;
};
aarch64_vector_costs::aarch64_vector_costs (vec_info *vinfo,
					    bool costing_for_scalar)
  : vector_costs (vinfo, costing_for_scalar),
    m_vec_flags (costing_for_scalar ? 0
		 : aarch64_classify_vector_mode (vinfo->vector_mode))
{
  if (auto *issue_info = aarch64_tune_params.vec_costs->issue_info)
    {
      m_ops.quick_push ({ issue_info, m_vec_flags });
      if (aarch64_tune_params.vec_costs == &neoverse512tvb_vector_cost)
	{
	  unsigned int vf_factor = (m_vec_flags & VEC_ANY_SVE) ? 2 : 1;
	  m_ops.quick_push ({ &neoversev1_vec_issue_info, m_vec_flags,
			      vf_factor });
	}
    }
}

/* Implement TARGET_VECTORIZE_CREATE_COSTS.  */
vector_costs *
aarch64_vectorize_create_costs (vec_info *vinfo, bool costing_for_scalar)
{
  return new aarch64_vector_costs (vinfo, costing_for_scalar);
}
/* Return the appropriate SIMD costs for vectors of type VECTYPE.  */
static const simd_vec_cost *
aarch64_simd_vec_costs (tree vectype)
{
  const cpu_vector_cost *costs = aarch64_tune_params.vec_costs;
  if (vectype != NULL
      && aarch64_sve_mode_p (TYPE_MODE (vectype))
      && costs->sve != NULL)
    return costs->sve;
  return costs->advsimd;
}

/* Return the appropriate SIMD costs for vectors with VEC_* flags FLAGS.  */
static const simd_vec_cost *
aarch64_simd_vec_costs_for_flags (unsigned int flags)
{
  const cpu_vector_cost *costs = aarch64_tune_params.vec_costs;
  if ((flags & VEC_ANY_SVE) && costs->sve)
    return costs->sve;
  return costs->advsimd;
}

/* If STMT_INFO is a memory reference, return the scalar memory type,
   otherwise return null.  */
static tree
aarch64_dr_type (stmt_vec_info stmt_info)
{
  if (auto dr = STMT_VINFO_DATA_REF (stmt_info))
    return TREE_TYPE (DR_REF (dr));
  return NULL_TREE;
}
/* Decide whether to use the unrolling heuristic described above
   m_unrolled_advsimd_niters, updating that field if so.  LOOP_VINFO
   describes the loop that we're vectorizing.  */
void
aarch64_vector_costs::
record_potential_advsimd_unrolling (loop_vec_info loop_vinfo)
{
  /* The heuristic only makes sense on targets that have the same
     vector throughput for SVE and Advanced SIMD.  */
  if (!(aarch64_tune_params.extra_tuning_flags
	& AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT))
    return;

  /* We only want to apply the heuristic if LOOP_VINFO is being
     vectorized for SVE.  */
  if (!(m_vec_flags & VEC_ANY_SVE))
    return;

  /* Check whether it is possible in principle to use Advanced SIMD
     instead.  */
  if (aarch64_autovec_preference == AARCH64_AUTOVEC_SVE_ONLY)
    return;

  /* We don't want to apply the heuristic to outer loops, since it's
     harder to track two levels of unrolling.  */
  if (LOOP_VINFO_LOOP (loop_vinfo)->inner)
    return;

  /* Only handle cases in which the number of Advanced SIMD iterations
     would be known at compile time but the number of SVE iterations
     would not.  */
  if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
      || aarch64_sve_vg.is_constant ())
    return;

  /* Guess how many times the Advanced SIMD loop would iterate and make
     sure that it is within the complete unrolling limit.  Even if the
     number of iterations is small enough, the number of statements might
     not be, which is why we need to estimate the number of statements too.  */
  unsigned int estimated_vq = aarch64_estimated_sve_vq ();
  unsigned int advsimd_vf = CEIL (vect_vf_for_cost (loop_vinfo), estimated_vq);
  unsigned HOST_WIDE_INT unrolled_advsimd_niters
    = LOOP_VINFO_INT_NITERS (loop_vinfo) / advsimd_vf;
  if (unrolled_advsimd_niters > (unsigned int) param_max_completely_peel_times)
    return;

  /* Record that we're applying the heuristic and should try to estimate
     the number of statements in the Advanced SIMD loop.  */
  m_unrolled_advsimd_niters = unrolled_advsimd_niters;
}

/* Do one-time initialization of the aarch64_vector_costs given that we're
   costing the loop vectorization described by LOOP_VINFO.  */
void
aarch64_vector_costs::analyze_loop_vinfo (loop_vec_info loop_vinfo)
{
  /* Record the number of times that the vector loop would execute,
     if known.  */
  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  auto scalar_niters = max_stmt_executions_int (loop);
  if (scalar_niters >= 0)
    {
      unsigned int vf = vect_vf_for_cost (loop_vinfo);
      if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
	m_num_vector_iterations = scalar_niters / vf;
      else
	m_num_vector_iterations = CEIL (scalar_niters, vf);
    }

  /* Detect whether we're vectorizing for SVE and should apply the unrolling
     heuristic described above m_unrolled_advsimd_niters.  */
  record_potential_advsimd_unrolling (loop_vinfo);
}
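
/* Worked example of the heuristic above (numbers invented for
   illustration): vectorizing for SVE with vect_vf_for_cost == 8 on a
   tuning target whose estimated SVE vector quotient is 2 gives
   advsimd_vf = CEIL (8, 2) = 4; a scalar loop known to run 60 times then
   gives unrolled_advsimd_niters = 60 / 4 = 15, which must not exceed
   param_max_completely_peel_times for the heuristic to apply.  */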
/* Implement targetm.vectorize.builtin_vectorization_cost.  */
static int
aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
				    tree vectype,
				    int misalign ATTRIBUTE_UNUSED)
{
  unsigned elements;
  const cpu_vector_cost *costs = aarch64_tune_params.vec_costs;
  bool fp = false;

  if (vectype != NULL)
    fp = FLOAT_TYPE_P (vectype);

  const simd_vec_cost *simd_costs = aarch64_simd_vec_costs (vectype);

  switch (type_of_cost)
    {
    case scalar_stmt:
      return fp ? costs->scalar_fp_stmt_cost : costs->scalar_int_stmt_cost;

    case scalar_load:
      return costs->scalar_load_cost;

    case scalar_store:
      return costs->scalar_store_cost;

    case vector_stmt:
      return fp ? simd_costs->fp_stmt_cost
		: simd_costs->int_stmt_cost;

    case vector_load:
      return simd_costs->align_load_cost;

    case vector_store:
      return simd_costs->store_cost;

    case vec_to_scalar:
      return simd_costs->vec_to_scalar_cost;

    case scalar_to_vec:
      return simd_costs->scalar_to_vec_cost;

    case unaligned_load:
    case vector_gather_load:
      return simd_costs->unalign_load_cost;

    case unaligned_store:
    case vector_scatter_store:
      return simd_costs->unalign_store_cost;

    case cond_branch_taken:
      return costs->cond_taken_branch_cost;

    case cond_branch_not_taken:
      return costs->cond_not_taken_branch_cost;

    case vec_perm:
      return simd_costs->permute_cost;

    case vec_promote_demote:
      return fp ? simd_costs->fp_stmt_cost
		: simd_costs->int_stmt_cost;

    case vec_construct:
      elements = estimated_poly_value (TYPE_VECTOR_SUBPARTS (vectype));
      return elements / 2 + 1;

    default:
      gcc_unreachable ();
    }
}
/* Return true if an access of kind KIND for STMT_INFO (or NODE if SLP)
   represents one vector of an LD[234] or ST[234] operation.  Return the total
   number of vectors (2, 3 or 4) if so, otherwise return a value outside that
   range.  */
static int
aarch64_ld234_st234_vectors (vect_cost_for_stmt kind, stmt_vec_info stmt_info,
			     slp_tree node)
{
  if ((kind == vector_load
       || kind == unaligned_load
       || kind == vector_store
       || kind == unaligned_store)
      && STMT_VINFO_DATA_REF (stmt_info))
    {
      stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
      if (stmt_info
	  && vect_mem_access_type (stmt_info, node) == VMAT_LOAD_STORE_LANES)
	return DR_GROUP_SIZE (stmt_info);
    }
  return 0;
}

/* Return true if creating multiple copies of STMT_INFO for Advanced SIMD
   vectors would produce a series of LDP or STP operations.  KIND is the
   kind of statement that STMT_INFO represents.  */
static bool
aarch64_advsimd_ldp_stp_p (enum vect_cost_for_stmt kind,
			   stmt_vec_info stmt_info)
{
  switch (kind)
    {
    case vector_load:
    case vector_store:
    case unaligned_load:
    case unaligned_store:
      break;

    default:
      return false;
    }

  return is_gimple_assign (stmt_info->stmt);
}
/* Return true if STMT_INFO is the second part of a two-statement multiply-add
   or multiply-subtract sequence that might be suitable for fusing into a
   single instruction.  If VEC_FLAGS is zero, analyze the operation as
   a scalar one, otherwise analyze it as an operation on vectors with those
   VEC_* flags.  */
static bool
aarch64_multiply_add_p (vec_info *vinfo, stmt_vec_info stmt_info,
			unsigned int vec_flags)
{
  gassign *assign = dyn_cast<gassign *> (stmt_info->stmt);
  if (!assign)
    return false;
  tree_code code = gimple_assign_rhs_code (assign);
  if (code != PLUS_EXPR && code != MINUS_EXPR)
    return false;

  auto is_mul_result = [&](int i)
    {
      tree rhs = gimple_op (assign, i);
      /* ??? Should we try to check for a single use as well?  */
      if (TREE_CODE (rhs) != SSA_NAME)
	return false;

      stmt_vec_info def_stmt_info = vinfo->lookup_def (rhs);
      if (!def_stmt_info
	  || STMT_VINFO_DEF_TYPE (def_stmt_info) != vect_internal_def)
	return false;
      gassign *rhs_assign = dyn_cast<gassign *> (def_stmt_info->stmt);
      if (!rhs_assign || gimple_assign_rhs_code (rhs_assign) != MULT_EXPR)
	return false;

      if (vec_flags & VEC_ADVSIMD)
	{
	  /* Scalar and SVE code can tie the result to any FMLA input (or none,
	     although that requires a MOVPRFX for SVE).  However, Advanced SIMD
	     only supports MLA forms, so will require a move if the result
	     cannot be tied to the accumulator.  The most important case in
	     which this is true is when the accumulator input is invariant.  */
	  rhs = gimple_op (assign, 3 - i);
	  if (TREE_CODE (rhs) != SSA_NAME)
	    return false;
	  def_stmt_info = vinfo->lookup_def (rhs);
	  if (!def_stmt_info
	      || STMT_VINFO_DEF_TYPE (def_stmt_info) == vect_external_def
	      || STMT_VINFO_DEF_TYPE (def_stmt_info) == vect_constant_def)
	    return false;
	}

      return true;
    };

  if (code == MINUS_EXPR && (vec_flags & VEC_ADVSIMD))
    /* Advanced SIMD doesn't have FNMADD/FNMSUB/FNMLA/FNMLS, so the
       multiplication must be on the second operand (to form an FMLS).
       But if both operands are multiplications and the second operand
       is used more than once, we'll instead negate the second operand
       and use it as an accumulator for the first operand.  */
    return (is_mul_result (2)
	    && (has_single_use (gimple_assign_rhs2 (assign))
		|| !is_mul_result (1)));

  return is_mul_result (1) || is_mul_result (2);
}
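
/* Example of the pattern detected above (GIMPLE invented for
   illustration):

     t_1 = a_2 * b_3;
     r_4 = t_1 + acc_5;

   Here the statement defining r_4 is the "second part" and can fuse with
   the multiplication into FMLA/MLA (or FMLS for the MINUS_EXPR case),
   subject to the Advanced SIMD accumulator restriction checked above.  */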
/* Return true if STMT_INFO is the second part of a two-statement boolean AND
   expression sequence that might be suitable for fusing into a
   single instruction.  If VEC_FLAGS is zero, analyze the operation as
   a scalar one, otherwise analyze it as an operation on vectors with those
   VEC_* flags.  */
static bool
aarch64_bool_compound_p (vec_info *vinfo, stmt_vec_info stmt_info,
			 unsigned int vec_flags)
{
  gassign *assign = dyn_cast<gassign *> (stmt_info->stmt);
  if (!assign
      || gimple_assign_rhs_code (assign) != BIT_AND_EXPR
      || !STMT_VINFO_VECTYPE (stmt_info)
      || !VECTOR_BOOLEAN_TYPE_P (STMT_VINFO_VECTYPE (stmt_info)))
    return false;

  for (int i = 1; i < 3; ++i)
    {
      tree rhs = gimple_op (assign, i);

      if (TREE_CODE (rhs) != SSA_NAME)
	continue;

      stmt_vec_info def_stmt_info = vinfo->lookup_def (rhs);
      if (!def_stmt_info
	  || STMT_VINFO_DEF_TYPE (def_stmt_info) != vect_internal_def)
	continue;

      gassign *rhs_assign = dyn_cast<gassign *> (def_stmt_info->stmt);
      if (!rhs_assign
	  || TREE_CODE_CLASS (gimple_assign_rhs_code (rhs_assign))
	     != tcc_comparison)
	continue;

      if (vec_flags & VEC_ADVSIMD)
	return false;

      return true;
    }
  return false;
}
/* We are considering implementing STMT_INFO using SVE.  If STMT_INFO is an
   in-loop reduction that SVE supports directly, return its latency in cycles,
   otherwise return zero.  SVE_COSTS specifies the latencies of the relevant
   instructions.  */
static unsigned int
aarch64_sve_in_loop_reduction_latency (vec_info *vinfo,
				       stmt_vec_info stmt_info,
				       const sve_vec_cost *sve_costs)
{
  switch (vect_reduc_type (vinfo, stmt_info))
    {
    case EXTRACT_LAST_REDUCTION:
      return sve_costs->clast_cost;

    case FOLD_LEFT_REDUCTION:
      switch (TYPE_MODE (TREE_TYPE (gimple_get_lhs (stmt_info->stmt))))
	{
	case E_HFmode:
	case E_BFmode:
	  return sve_costs->fadda_f16_cost;

	case E_SFmode:
	  return sve_costs->fadda_f32_cost;

	case E_DFmode:
	  return sve_costs->fadda_f64_cost;

	default:
	  break;
	}
      break;
    }

  return 0;
}
/* STMT_INFO describes a loop-carried operation in the original scalar code
   that we are considering implementing as a reduction.  Return one of the
   following values, depending on VEC_FLAGS:

   - If VEC_FLAGS is zero, return the loop carry latency of the original
     scalar operation.

   - If VEC_FLAGS & VEC_ADVSIMD, return the loop carry latency of the
     Advanced SIMD implementation.

   - If VEC_FLAGS & VEC_ANY_SVE, return the loop carry latency of the
     SVE implementation.  */
static unsigned int
aarch64_in_loop_reduction_latency (vec_info *vinfo, stmt_vec_info stmt_info,
				   unsigned int vec_flags)
{
  const cpu_vector_cost *vec_costs = aarch64_tune_params.vec_costs;
  const sve_vec_cost *sve_costs = nullptr;
  if (vec_flags & VEC_ANY_SVE)
    sve_costs = aarch64_tune_params.vec_costs->sve;

  /* If the caller is asking for the SVE latency, check for forms of reduction
     that only SVE can handle directly.  */
  if (sve_costs)
    {
      unsigned int latency
	= aarch64_sve_in_loop_reduction_latency (vinfo, stmt_info, sve_costs);
      if (latency)
	return latency;
    }

  /* Handle scalar costs.  */
  bool is_float = FLOAT_TYPE_P (TREE_TYPE (gimple_get_lhs (stmt_info->stmt)));
  if (vec_flags == 0)
    {
      if (is_float)
	return vec_costs->scalar_fp_stmt_cost;
      return vec_costs->scalar_int_stmt_cost;
    }

  /* Otherwise, the loop body just contains normal integer or FP operations,
     with a vector reduction outside the loop.  */
  const simd_vec_cost *simd_costs
    = aarch64_simd_vec_costs_for_flags (vec_flags);
  if (is_float)
    return simd_costs->fp_stmt_cost;
  return simd_costs->int_stmt_cost;
}
/* STMT_COST is the cost calculated by aarch64_builtin_vectorization_cost
   for STMT_INFO, which has cost kind KIND.  If this is a scalar operation,
   try to subdivide the target-independent categorization provided by KIND
   to get a more accurate cost.  */
static fractional_cost
aarch64_detect_scalar_stmt_subtype (vec_info *vinfo, vect_cost_for_stmt kind,
				    stmt_vec_info stmt_info,
				    fractional_cost stmt_cost)
{
  /* Detect an extension of a loaded value.  In general, we'll be able to fuse
     the extension with the load.  */
  if (kind == scalar_stmt && vect_is_extending_load (vinfo, stmt_info))
    return 0;

  return stmt_cost;
}
/* STMT_COST is the cost calculated by aarch64_builtin_vectorization_cost
   for the vectorized form of STMT_INFO possibly using SLP node NODE, which has
   cost kind KIND and which when vectorized would operate on vector type
   VECTYPE.  Try to subdivide the target-independent categorization provided by
   KIND to get a more accurate cost.  WHERE specifies where the cost associated
   with KIND occurs.  */
static fractional_cost
aarch64_detect_vector_stmt_subtype (vec_info *vinfo, vect_cost_for_stmt kind,
				    stmt_vec_info stmt_info, slp_tree node,
				    tree vectype,
				    enum vect_cost_model_location where,
				    fractional_cost stmt_cost)
{
  const simd_vec_cost *simd_costs = aarch64_simd_vec_costs (vectype);
  const sve_vec_cost *sve_costs = nullptr;
  if (aarch64_sve_mode_p (TYPE_MODE (vectype)))
    sve_costs = aarch64_tune_params.vec_costs->sve;

  /* It's generally better to avoid costing inductions, since the induction
     will usually be hidden by other operations.  This is particularly true
     for things like COND_REDUCTIONS.  */
  if (is_a<gphi *> (stmt_info->stmt))
    return 0;

  /* Detect cases in which vec_to_scalar is describing the extraction of a
     vector element in preparation for a scalar store.  The store itself is
     costed separately.  */
  if (vect_is_store_elt_extraction (kind, stmt_info))
    return simd_costs->store_elt_extra_cost;

  /* Detect SVE gather loads, which are costed as a single scalar_load
     for each element.  We therefore need to divide the full-instruction
     cost by the number of elements in the vector.  */
  if (kind == scalar_load
      && sve_costs
      && vect_mem_access_type (stmt_info, node) == VMAT_GATHER_SCATTER)
    {
      unsigned int nunits = vect_nunits_for_cost (vectype);
      /* Test for VNx2 modes, which have 64-bit containers.  */
      if (known_eq (GET_MODE_NUNITS (TYPE_MODE (vectype)), aarch64_sve_vg))
	return { sve_costs->gather_load_x64_cost, nunits };
      return { sve_costs->gather_load_x32_cost, nunits };
    }

  /* Detect cases in which a scalar_store is really storing one element
     in a scatter operation.  */
  if (kind == scalar_store
      && sve_costs
      && vect_mem_access_type (stmt_info, node) == VMAT_GATHER_SCATTER)
    return sve_costs->scatter_store_elt_cost;

  /* Detect cases in which vec_to_scalar represents an in-loop reduction.  */
  if (kind == vec_to_scalar
      && where == vect_body
      && sve_costs)
    {
      unsigned int latency
	= aarch64_sve_in_loop_reduction_latency (vinfo, stmt_info, sve_costs);
      if (latency)
	return latency;
    }

  /* Detect cases in which vec_to_scalar represents a single reduction
     instruction like FADDP or MAXV.  */
  if (kind == vec_to_scalar
      && where == vect_epilogue
      && vect_is_reduction (stmt_info))
    switch (GET_MODE_INNER (TYPE_MODE (vectype)))
      {
      case E_QImode:
	return simd_costs->reduc_i8_cost;

      case E_HImode:
	return simd_costs->reduc_i16_cost;

      case E_SImode:
	return simd_costs->reduc_i32_cost;

      case E_DImode:
	return simd_costs->reduc_i64_cost;

      case E_HFmode:
      case E_BFmode:
	return simd_costs->reduc_f16_cost;

      case E_SFmode:
	return simd_costs->reduc_f32_cost;

      case E_DFmode:
	return simd_costs->reduc_f64_cost;

      default:
	break;
      }

  /* Otherwise stick with the original categorization.  */
  return stmt_cost;
}

/* STMT_COST is the cost calculated by aarch64_builtin_vectorization_cost
   for STMT_INFO, which has cost kind KIND and which when vectorized would
   operate on vector type VECTYPE.  Adjust the cost as necessary for SVE
   targets.  */
static fractional_cost
aarch64_sve_adjust_stmt_cost (class vec_info *vinfo, vect_cost_for_stmt kind,
			      stmt_vec_info stmt_info, tree vectype,
			      fractional_cost stmt_cost)
{
  /* Unlike vec_promote_demote, vector_stmt conversions do not change the
     vector register size or number of units.  Integer promotions of this
     type therefore map to SXT[BHW] or UXT[BHW].

     Most loads have extending forms that can do the sign or zero extension
     on the fly.  Optimistically assume that a load followed by an extension
     will fold to this form during combine, and that the extension therefore
     comes for free.  */
  if (kind == vector_stmt && vect_is_extending_load (vinfo, stmt_info))
    stmt_cost = 0;

  /* For similar reasons, vector_stmt integer truncations are a no-op,
     because we can just ignore the unused upper bits of the source.  */
  if (kind == vector_stmt && vect_is_integer_truncation (stmt_info))
    stmt_cost = 0;

  /* Advanced SIMD can load and store pairs of registers using LDP and STP,
     but there are no equivalent instructions for SVE.  This means that
     (all other things being equal) 128-bit SVE needs twice as many load
     and store instructions as Advanced SIMD in order to process vector pairs.

     Also, scalar code can often use LDP and STP to access pairs of values,
     so it is too simplistic to say that one SVE load or store replaces
     VF scalar loads and stores.

     Ideally we would account for this in the scalar and Advanced SIMD
     costs by making suitable load/store pairs as cheap as a single
     load/store.  However, that would be a very invasive change and in
     practice it tends to stress other parts of the cost model too much.
     E.g. stores of scalar constants currently count just a store,
     whereas stores of vector constants count a store and a vec_init.
     This is an artificial distinction for AArch64, where stores of
     nonzero scalar constants need the same kind of register invariant
     as vector stores.

     An alternative would be to double the cost of any SVE loads and stores
     that could be paired in Advanced SIMD (and possibly also paired in
     scalar code).  But this tends to stress other parts of the cost model
     in the same way.  It also means that we can fall back to Advanced SIMD
     even if full-loop predication would have been useful.

     Here we go for a more conservative version: double the costs of SVE
     loads and stores if one iteration of the scalar loop processes enough
     elements for it to use a whole number of Advanced SIMD LDP or STP
     instructions.  This makes it very likely that the VF would be 1 for
     Advanced SIMD, and so no epilogue should be needed.  */
  if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
    {
      stmt_vec_info first = DR_GROUP_FIRST_ELEMENT (stmt_info);
      unsigned int count = DR_GROUP_SIZE (first) - DR_GROUP_GAP (first);
      unsigned int elt_bits = GET_MODE_UNIT_BITSIZE (TYPE_MODE (vectype));
      if (multiple_p (count * elt_bits, 256)
	  && aarch64_advsimd_ldp_stp_p (kind, stmt_info))
	stmt_cost *= 2;
    }

  return stmt_cost;
}
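
/* Example of the doubling rule above (illustrative): a grouped access of
   four 64-bit elements covers 256 bits per scalar iteration, i.e. exactly
   two Advanced SIMD LDP/STP-able 128-bit vectors, so the SVE load/store
   cost is doubled.  Four 32-bit elements (128 bits) are not a whole
   number of LDP/STP pairs and are left alone.  */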
/* STMT_COST is the cost calculated for STMT_INFO, which has cost kind KIND
   and which when vectorized would operate on vector type VECTYPE.  Add the
   cost of any embedded operations.  */
static fractional_cost
aarch64_adjust_stmt_cost (vec_info *vinfo, vect_cost_for_stmt kind,
			  stmt_vec_info stmt_info, slp_tree node, tree vectype,
			  unsigned vec_flags, fractional_cost stmt_cost)
{
  if (vectype)
    {
      const simd_vec_cost *simd_costs = aarch64_simd_vec_costs (vectype);

      /* Detect cases in which a vector load or store represents an
	 LD[234] or ST[234] instruction.  */
      switch (aarch64_ld234_st234_vectors (kind, stmt_info, node))
	{
	case 2:
	  stmt_cost += simd_costs->ld2_st2_permute_cost;
	  break;

	case 3:
	  stmt_cost += simd_costs->ld3_st3_permute_cost;
	  break;

	case 4:
	  stmt_cost += simd_costs->ld4_st4_permute_cost;
	  break;
	}

      gassign *assign = dyn_cast<gassign *> (STMT_VINFO_STMT (stmt_info));
      if ((kind == scalar_stmt || kind == vector_stmt) && assign)
	{
	  /* For MLA we need to reduce the cost since MLA is 1 instruction.  */
	  if (!vect_is_reduction (stmt_info)
	      && aarch64_multiply_add_p (vinfo, stmt_info, vec_flags))
	    return 0;

	  /* For vector boolean ANDs with a compare operand we just need
	     one insn.  */
	  if (aarch64_bool_compound_p (vinfo, stmt_info, vec_flags))
	    return 0;
	}

      if (kind == vector_stmt || kind == vec_to_scalar)
	if (tree cmp_type = vect_embedded_comparison_type (stmt_info))
	  {
	    if (FLOAT_TYPE_P (cmp_type))
	      stmt_cost += simd_costs->fp_stmt_cost;
	    else
	      stmt_cost += simd_costs->int_stmt_cost;
	  }
    }

  if (kind == scalar_stmt)
    if (tree cmp_type = vect_embedded_comparison_type (stmt_info))
      {
	if (FLOAT_TYPE_P (cmp_type))
	  stmt_cost += aarch64_tune_params.vec_costs->scalar_fp_stmt_cost;
	else
	  stmt_cost += aarch64_tune_params.vec_costs->scalar_int_stmt_cost;
      }

  return stmt_cost;
}
/* Return true if STMT_INFO is part of a reduction that has the form:

      r = r op ...;
      r = r op ...;

   with the single accumulator being read and written multiple times.  */
static bool
aarch64_force_single_cycle (vec_info *vinfo, stmt_vec_info stmt_info)
{
  if (!STMT_VINFO_REDUC_DEF (stmt_info))
    return false;

  auto reduc_info = info_for_reduction (vinfo, stmt_info);
  return STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info);
}
17356 /* COUNT, KIND and STMT_INFO are the same as for vector_costs::add_stmt_cost
17357 and they describe an operation in the body of a vector loop. Record issue
17358 information relating to the vector operation in OPS. */
17360 aarch64_vector_costs::count_ops (unsigned int count
, vect_cost_for_stmt kind
,
17361 stmt_vec_info stmt_info
, slp_tree node
,
17362 aarch64_vec_op_count
*ops
)
17364 const aarch64_base_vec_issue_info
*base_issue
= ops
->base_issue_info ();
17367 const aarch64_simd_vec_issue_info
*simd_issue
= ops
->simd_issue_info ();
17368 const aarch64_sve_vec_issue_info
*sve_issue
= ops
->sve_issue_info ();
17370 /* Calculate the minimum cycles per iteration imposed by a reduction
17372 if ((kind
== scalar_stmt
|| kind
== vector_stmt
|| kind
== vec_to_scalar
)
17373 && vect_is_reduction (stmt_info
))
17376 = aarch64_in_loop_reduction_latency (m_vinfo
, stmt_info
, m_vec_flags
);
17377 if (aarch64_force_single_cycle (m_vinfo
, stmt_info
))
17378 /* ??? Ideally we'd use a tree to reduce the copies down to 1 vector,
17379 and then accumulate that, but at the moment the loop-carried
17380 dependency includes all copies. */
17381 ops
->reduction_latency
= MAX (ops
->reduction_latency
, base
* count
);
17383 ops
->reduction_latency
= MAX (ops
->reduction_latency
, base
);
17386 if (stmt_info
&& (kind
== scalar_stmt
|| kind
== vector_stmt
))
17388 /* Assume that multiply-adds will become a single operation. */
17389 if (aarch64_multiply_add_p (m_vinfo
, stmt_info
, m_vec_flags
))
17392 /* Assume that bool AND with compare operands will become a single
17394 if (aarch64_bool_compound_p (m_vinfo
, stmt_info
, m_vec_flags
))
17398 /* Detect the case where we are using an emulated gather/scatter. When a
17399 target does not support gathers and scatters directly the vectorizer
17400 emulates these by constructing an index vector and then issuing an
17401 extraction for every lane in the vector. If the index vector is loaded
17402 from memory, the vector load and extractions are subsequently lowered by
17403 veclower into a series of scalar index loads. After the final loads are
17404 done it issues a vec_construct to recreate the vector from the scalar. For
17405 costing when we see a vec_to_scalar on a stmt with VMAT_GATHER_SCATTER we
17406 are dealing with an emulated instruction and should adjust costing
17408 if (kind
== vec_to_scalar
17409 && (m_vec_flags
& VEC_ADVSIMD
)
17410 && vect_mem_access_type (stmt_info
, node
) == VMAT_GATHER_SCATTER
)
17412 auto dr
= STMT_VINFO_DATA_REF (stmt_info
);
17413 tree dr_ref
= DR_REF (dr
);
17414 while (handled_component_p (dr_ref
))
17416 if (TREE_CODE (dr_ref
) == ARRAY_REF
)
17418 tree offset
= TREE_OPERAND (dr_ref
, 1);
17419 if (SSA_VAR_P (offset
))
17421 if (gimple_vuse (SSA_NAME_DEF_STMT (offset
)))
17423 if (STMT_VINFO_TYPE (stmt_info
) == load_vec_info_type
)
17424 ops
->loads
+= count
- 1;
17426 /* Stores want to count both the index to array and data to
17427 array using vec_to_scalar. However we have index stores
17428 in Adv.SIMD and so we only want to adjust the index
17430 ops
->loads
+= count
/ 2;
17436 dr_ref
= TREE_OPERAND (dr_ref
, 0);
  /* Count the basic operation cost associated with KIND.  */
  switch (kind)
    {
    case cond_branch_taken:
    case cond_branch_not_taken:
    case vector_gather_load:
    case vector_scatter_store:
      /* We currently don't expect these to be used in a loop body.  */
      break;

    case vec_perm:
    case vec_promote_demote:
    case vec_construct:
    case vec_to_scalar:
    case scalar_to_vec:
    case vector_stmt:
    case scalar_stmt:
      ops->general_ops += count;
      break;

    case scalar_load:
    case vector_load:
    case unaligned_load:
      ops->loads += count;
      if (m_vec_flags || FLOAT_TYPE_P (aarch64_dr_type (stmt_info)))
        ops->general_ops += base_issue->fp_simd_load_general_ops * count;
      break;

    case vector_store:
    case unaligned_store:
    case scalar_store:
      ops->stores += count;
      if (m_vec_flags || FLOAT_TYPE_P (aarch64_dr_type (stmt_info)))
        ops->general_ops += base_issue->fp_simd_store_general_ops * count;
      break;
    }

  /* Add any embedded comparison operations.  */
  if ((kind == scalar_stmt || kind == vector_stmt || kind == vec_to_scalar)
      && vect_embedded_comparison_type (stmt_info))
    ops->general_ops += count;

  /* COND_REDUCTIONS need two sets of VEC_COND_EXPRs, whereas so far we
     have only accounted for one.  */
  if ((kind == vector_stmt || kind == vec_to_scalar)
      && vect_reduc_type (m_vinfo, stmt_info) == COND_REDUCTION)
    ops->general_ops += count;

  /* Count the predicate operations needed by an SVE comparison.  */
  if (sve_issue && (kind == vector_stmt || kind == vec_to_scalar))
    if (tree type = vect_comparison_type (stmt_info))
      {
        unsigned int base = (FLOAT_TYPE_P (type)
                             ? sve_issue->fp_cmp_pred_ops
                             : sve_issue->int_cmp_pred_ops);
        ops->pred_ops += base * count;
      }

  /* Add any extra overhead associated with LD[234] and ST[234] operations.  */
  if (simd_issue)
    switch (aarch64_ld234_st234_vectors (kind, stmt_info, node))
      {
      case 2:
        ops->general_ops += simd_issue->ld2_st2_general_ops * count;
        break;

      case 3:
        ops->general_ops += simd_issue->ld3_st3_general_ops * count;
        break;

      case 4:
        ops->general_ops += simd_issue->ld4_st4_general_ops * count;
        break;
      }

  /* Add any overhead associated with gather loads and scatter stores.  */
  if (sve_issue
      && (kind == scalar_load || kind == scalar_store)
      && vect_mem_access_type (stmt_info, node) == VMAT_GATHER_SCATTER)
    {
      unsigned int pairs = CEIL (count, 2);
      ops->pred_ops += sve_issue->gather_scatter_pair_pred_ops * pairs;
      ops->general_ops += sve_issue->gather_scatter_pair_general_ops * pairs;
    }
}
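/* For example (illustrative numbers only): an SVE gather that the
   vectorizer costs as 5 scalar_loads gives pairs = CEIL (5, 2) = 3,
   so 3 * gather_scatter_pair_pred_ops predicate operations and
   3 * gather_scatter_pair_general_ops general operations are added on
   top of the 5 loads counted by the switch above.  */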
/* Return true if STMT_INFO contains a memory access and if the constant
   component of the memory address is aligned to SIZE bytes.  */
static bool
aarch64_aligned_constant_offset_p (stmt_vec_info stmt_info,
                                   poly_uint64 size)
{
  if (!STMT_VINFO_DATA_REF (stmt_info))
    return false;

  if (auto first_stmt = DR_GROUP_FIRST_ELEMENT (stmt_info))
    stmt_info = first_stmt;
  tree constant_offset = DR_INIT (STMT_VINFO_DATA_REF (stmt_info));
  /* Needed for gathers & scatters, for example.  */
  if (!constant_offset)
    return false;

  return multiple_p (wi::to_poly_offset (constant_offset), size);
}
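/* For example, an access group whose DR_INIT is 16 counts as aligned for
   SIZE 4 or 8, whereas a DR_INIT of 12 is aligned for SIZE 4 but not for
   SIZE 8.  */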
/* Check if a scalar or vector stmt could be part of a region of code
   that does nothing more than store values to memory, in the scalar
   case using STP.  Return the cost of the stmt if so, counting 2 for
   one instruction.  Return ~0U otherwise.

   The arguments are a subset of those passed to add_stmt_cost.  */
static unsigned int
aarch64_stp_sequence_cost (unsigned int count, vect_cost_for_stmt kind,
                           stmt_vec_info stmt_info, tree vectype)
{
  /* Code that stores vector constants uses a vector_load to create
     the constant.  We don't apply the heuristic to that case for two
     reasons:

     - At the moment, STPs are only formed via peephole2, and the
       constant scalar moves would often come between STRs and so
       prevent STP formation.

     - The scalar code also has to load the constant somehow, and that
       isn't costed.  */
  switch (kind)
    {
    case scalar_to_vec:
      /* Count 2 insns for a GPR->SIMD dup and 1 insn for a FPR->SIMD dup.  */
      return (FLOAT_TYPE_P (vectype) ? 2 : 4) * count;

    case vec_construct:
      if (FLOAT_TYPE_P (vectype))
        /* Count 1 insn for the maximum number of FP->SIMD INS
           instructions.  */
        return (vect_nunits_for_cost (vectype) - 1) * 2 * count;

      /* Count 2 insns for a GPR->SIMD move and 2 insns for the
         maximum number of GPR->SIMD INS instructions.  */
      return vect_nunits_for_cost (vectype) * 4 * count;

    case vector_store:
    case unaligned_store:
      {
        /* Count 1 insn per vector if we can't form STP Q pairs.  */
        if (aarch64_sve_mode_p (TYPE_MODE (vectype)))
          return count * 2;

        /* Assume we won't be able to use STP if the constant offset
           component of the address is misaligned.  ??? This could be
           removed if we formed STP pairs earlier, rather than relying
           on peephole2.  */
        auto size = GET_MODE_SIZE (TYPE_MODE (vectype));
        if (!aarch64_aligned_constant_offset_p (stmt_info, size))
          return count * 2;

        return CEIL (count, 2) * 2;
      }

    case scalar_store:
      if (stmt_info && STMT_VINFO_DATA_REF (stmt_info))
        {
          /* Check for a mode in which STP pairs can be formed.  */
          auto size = GET_MODE_SIZE (TYPE_MODE (aarch64_dr_type (stmt_info)));
          if (maybe_ne (size, 4) && maybe_ne (size, 8))
            return ~0U;

          /* Assume we won't be able to use STP if the constant offset
             component of the address is misaligned.  ??? This could be
             removed if we formed STP pairs earlier, rather than relying
             on peephole2.  */
          if (!aarch64_aligned_constant_offset_p (stmt_info, size))
            return ~0U;
        }
      return count;

    default:
      return ~0U;
    }
}
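/* A worked example of the costing above, with illustrative numbers:
   6 DImode scalar stores with an aligned constant offset cost 6 units
   (one STP per pair, 2 units per instruction), while 3 Advanced SIMD
   vector stores that can use STP Q pairs cost CEIL (3, 2) * 2 = 4
   units.  */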
unsigned
aarch64_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
                                     stmt_vec_info stmt_info, slp_tree node,
                                     tree vectype, int misalign,
                                     vect_cost_model_location where)
{
  fractional_cost stmt_cost
    = aarch64_builtin_vectorization_cost (kind, vectype, misalign);

  bool in_inner_loop_p = (where == vect_body
                          && stmt_info
                          && stmt_in_inner_loop_p (m_vinfo, stmt_info));

  /* Do one-time initialization based on the vinfo.  */
  loop_vec_info loop_vinfo = dyn_cast<loop_vec_info> (m_vinfo);
  if (!m_analyzed_vinfo)
    {
      if (loop_vinfo)
        analyze_loop_vinfo (loop_vinfo);

      m_analyzed_vinfo = true;
    }

  /* Apply the heuristic described above m_stp_sequence_cost.  */
  if (m_stp_sequence_cost != ~0U)
    {
      uint64_t cost = aarch64_stp_sequence_cost (count, kind,
                                                 stmt_info, vectype);
      m_stp_sequence_cost = MIN (m_stp_sequence_cost + cost, ~0U);
    }

  /* Try to get a more accurate cost by looking at STMT_INFO instead
     of just looking at KIND.  */
  if (stmt_info)
    {
      /* If we scalarize a strided store, the vectorizer costs one
         vec_to_scalar for each element.  However, we can store the first
         element using an FP store without a separate extract step.  */
      if (vect_is_store_elt_extraction (kind, stmt_info))
        count -= 1;

      stmt_cost = aarch64_detect_scalar_stmt_subtype (m_vinfo, kind,
                                                      stmt_info, stmt_cost);

      if (vectype && m_vec_flags)
        stmt_cost = aarch64_detect_vector_stmt_subtype (m_vinfo, kind,
                                                        stmt_info, node,
                                                        vectype, where,
                                                        stmt_cost);

      /* Check if we've seen an SVE gather/scatter operation and which size.  */
      if (kind == scalar_load
          && aarch64_sve_mode_p (TYPE_MODE (vectype))
          && vect_mem_access_type (stmt_info, node) == VMAT_GATHER_SCATTER)
        {
          const sve_vec_cost *sve_costs = aarch64_tune_params.vec_costs->sve;
          if (sve_costs)
            {
              /* Test for VNx2 modes, which have 64-bit containers.  */
              if (known_eq (GET_MODE_NUNITS (TYPE_MODE (vectype)),
                            aarch64_sve_vg))
                m_sve_gather_scatter_init_cost
                  += sve_costs->gather_load_x64_init_cost;
              else
                m_sve_gather_scatter_init_cost
                  += sve_costs->gather_load_x32_init_cost;
            }
        }
    }

  /* Do any SVE-specific adjustments to the cost.  */
  if (stmt_info && vectype && aarch64_sve_mode_p (TYPE_MODE (vectype)))
    stmt_cost = aarch64_sve_adjust_stmt_cost (m_vinfo, kind, stmt_info,
                                              vectype, stmt_cost);

  /* Vector promotion and demotion requires us to widen the operation first
     and only after that perform the conversion.  Unfortunately the mid-end
     expects this to be doable as a single operation and doesn't pass on
     enough context here for us to tell which operation is happening.  To
     account for this we count every promote-demote operation twice and if
     the previously costed operation was also a promote-demote we reduce
     the cost of the currently being costed operation to simulate the final
     conversion cost.  Note that for SVE we can do better here if the converted
     value comes from a load since the widening load would consume the widening
     operations.  However since we're in stage 3 we can't change the helper
     vect_is_extending_load and duplicating the code seems not useful.  */
  gassign *assign = NULL;
  if (kind == vec_promote_demote
      && (assign = dyn_cast <gassign *> (STMT_VINFO_STMT (stmt_info)))
      && gimple_assign_rhs_code (assign) == FLOAT_EXPR)
    {
      auto new_count = count * 2 - m_num_last_promote_demote;
      m_num_last_promote_demote = count;
      count = new_count;
    }
  else
    m_num_last_promote_demote = 0;

  if (stmt_info)
    {
      /* Account for any extra "embedded" costs that apply additively
         to the base cost calculated above.  */
      stmt_cost = aarch64_adjust_stmt_cost (m_vinfo, kind, stmt_info, node,
                                            vectype, m_vec_flags, stmt_cost);

      /* If we're recording a nonzero vector loop body cost for the
         innermost loop, also estimate the operations that would need
         to be issued by all relevant implementations of the loop.  */
      if (loop_vinfo
          && (m_costing_for_scalar || where == vect_body)
          && (!LOOP_VINFO_LOOP (loop_vinfo)->inner || in_inner_loop_p)
          && stmt_cost != 0)
        for (auto &ops : m_ops)
          count_ops (count, kind, stmt_info, node, &ops);

      /* If we're applying the SVE vs. Advanced SIMD unrolling heuristic,
         estimate the number of statements in the unrolled Advanced SIMD
         loop.  For simplicity, we assume that one iteration of the
         Advanced SIMD loop would need the same number of statements
         as one iteration of the SVE loop.  */
      if (where == vect_body && m_unrolled_advsimd_niters)
        m_unrolled_advsimd_stmts += count * m_unrolled_advsimd_niters;

      /* Detect the use of an averaging operation.  */
      gimple *stmt = stmt_info->stmt;
      if (is_gimple_call (stmt)
          && gimple_call_internal_p (stmt))
        {
          switch (gimple_call_internal_fn (stmt))
            {
            case IFN_AVG_FLOOR:
            case IFN_AVG_CEIL:
              m_has_avg = true;
            default:
              break;
            }
        }
    }

  /* If the statement stores to a decl that is known to be the argument
     to a vld1 in the same function, ignore the store for costing purposes.
     See the comment above m_stores_to_vector_load_decl for more details.  */
  if (stmt_info
      && (kind == vector_store || kind == unaligned_store)
      && aarch64_accesses_vector_load_decl_p (stmt_info))
    {
      stmt_cost = 0;
      m_stores_to_vector_load_decl = true;
    }

  return record_stmt_cost (stmt_info, where, (count * stmt_cost).ceil ());
}
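/* Illustration of the promote/demote adjustment above (hypothetical
   counts): costing two consecutive vec_promote_demote FLOAT_EXPRs with
   count = 1 gives 2 * 1 - 0 = 2 for the first and 2 * 1 - 1 = 1 for the
   second, so the pair is costed as three operations rather than four,
   approximating the final conversion step that they share.  */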
/* Return true if (a) we're applying the Advanced SIMD vs. SVE unrolling
   heuristic described above m_unrolled_advsimd_niters and (b) the heuristic
   says that we should prefer the Advanced SIMD loop.  */
bool
aarch64_vector_costs::prefer_unrolled_loop () const
{
  if (!m_unrolled_advsimd_stmts)
    return false;

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location, "Number of insns in"
                     " unrolled Advanced SIMD loop = "
                     HOST_WIDE_INT_PRINT_UNSIGNED "\n",
                     m_unrolled_advsimd_stmts);

  /* The balance here is tricky.  On the one hand, we can't be sure whether
     the code is vectorizable with Advanced SIMD or not.  However, even if
     it isn't vectorizable with Advanced SIMD, there's a possibility that
     the scalar code could also be unrolled.  Some of the code might then
     benefit from SLP, or from using LDP and STP.  We therefore apply
     the heuristic regardless of can_use_advsimd_p.  */
  return (m_unrolled_advsimd_stmts
          && (m_unrolled_advsimd_stmts
              <= (unsigned int) param_max_completely_peeled_insns));
}
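/* In other words, the unrolled Advanced SIMD body is only preferred while
   its estimated statement count stays within
   param_max_completely_peeled_insns; e.g. with a hypothetical limit of
   200, an estimate of 180 statements keeps the heuristic active, whereas
   220 does not.  */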
/* Subroutine of adjust_body_cost for handling SVE.  Use OPS to work out
   how fast the SVE code can be issued and compare it to the equivalent value
   for scalar code (SCALAR_CYCLES_PER_ITER).

   ORIG_BODY_COST is the cost originally passed to adjust_body_cost and
   *BODY_COST is the current value of the adjusted cost.  *SHOULD_DISPARAGE
   is true if we think the loop body is too expensive.  */

fractional_cost
aarch64_vector_costs::
adjust_body_cost_sve (const aarch64_vec_op_count *ops,
                      fractional_cost scalar_cycles_per_iter,
                      unsigned int orig_body_cost, unsigned int *body_cost,
                      bool *should_disparage)
{
  if (dump_enabled_p ())
    ops->dump ();

  fractional_cost sve_pred_cycles_per_iter = ops->min_pred_cycles_per_iter ();
  fractional_cost sve_cycles_per_iter = ops->min_cycles_per_iter ();

  /* If the scalar version of the loop could issue at least as
     quickly as the predicate parts of the SVE loop, make the SVE loop
     prohibitively expensive.  In this case vectorization is adding an
     overhead that the original scalar code didn't have.

     This is mostly intended to detect cases in which WHILELOs dominate
     for very tight loops, which is something that normal latency-based
     costs would not model.  Adding this kind of cliffedge would be
     too drastic for scalar_cycles_per_iter vs. sve_cycles_per_iter;
     code in the caller handles that case in a more conservative way.  */
  fractional_cost sve_estimate = sve_pred_cycles_per_iter + 1;
  if (scalar_cycles_per_iter < sve_estimate)
    {
      unsigned int min_cost
        = orig_body_cost * estimated_poly_value (BYTES_PER_SVE_VECTOR);
      if (*body_cost < min_cost)
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_NOTE, vect_location,
                             "Increasing body cost to %d because the"
                             " scalar code could issue within the limit"
                             " imposed by predicate operations\n",
                             min_cost);
          *body_cost = min_cost;
          *should_disparage = true;
        }
    }

  return sve_cycles_per_iter;
}
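/* Example with made-up issue rates: if the predicate operations alone need
   2 cycles per SVE iteration (so sve_estimate = 3) but the scalar loop
   could issue an equivalent iteration in 2 cycles, the body cost is raised
   to at least orig_body_cost times the estimated number of bytes per SVE
   vector, which effectively disparages the SVE loop.  */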
unsigned int
aarch64_vector_costs::determine_suggested_unroll_factor ()
{
  bool sve = m_vec_flags & VEC_ANY_SVE;
  /* If we are trying to unroll an Advanced SIMD main loop that contains
     an averaging operation that we do not support with SVE and we might use a
     predicated epilogue, we need to be conservative and block unrolling as
     this might lead to a less optimal loop for the first and only epilogue
     using the original loop's vectorization factor.
     TODO: Remove this constraint when we add support for multiple epilogue
     vectorization.  */
  if (!sve && !TARGET_SVE2 && m_has_avg)
    return 1;

  unsigned int max_unroll_factor = 1;
  for (auto vec_ops : m_ops)
    {
      aarch64_simd_vec_issue_info const *vec_issue
        = vec_ops.simd_issue_info ();
      if (!vec_issue)
        return 1;
      /* Limit unroll factor to a value adjustable by the user, the default
         value is 4.  */
      unsigned int unroll_factor = aarch64_vect_unroll_limit;
      unsigned int factor
        = vec_ops.reduction_latency > 1 ? vec_ops.reduction_latency : 1;
      unsigned int temp;

      /* Sanity check, this should never happen.  */
      if ((vec_ops.stores + vec_ops.loads + vec_ops.general_ops) == 0)
        return 1;

      /* Check stores.  */
      if (vec_ops.stores > 0)
        {
          temp = CEIL (factor * vec_issue->stores_per_cycle,
                       vec_ops.stores);
          unroll_factor = MIN (unroll_factor, temp);
        }

      /* Check loads + stores.  */
      if (vec_ops.loads > 0)
        {
          temp = CEIL (factor * vec_issue->loads_stores_per_cycle,
                       vec_ops.loads + vec_ops.stores);
          unroll_factor = MIN (unroll_factor, temp);
        }

      /* Check general ops.  */
      if (vec_ops.general_ops > 0)
        {
          temp = CEIL (factor * vec_issue->general_ops_per_cycle,
                       vec_ops.general_ops);
          unroll_factor = MIN (unroll_factor, temp);
        }
      max_unroll_factor = MAX (max_unroll_factor, unroll_factor);
    }

  /* Make sure unroll factor is power of 2.  */
  return 1 << ceil_log2 (max_unroll_factor);
}
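/* A worked example with hypothetical issue info: for a loop with
   reduction_latency = 2, 1 store, 2 loads and 4 general ops on a core that
   can issue 2 stores, 4 loads/stores and 6 general ops per cycle, the
   candidate factors are CEIL (2 * 2, 1) = 4, CEIL (2 * 4, 3) = 3 and
   CEIL (2 * 6, 4) = 3, so the loop suggests MIN (limit, 3), which the
   final power-of-two adjustment rounds up to 4.  */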
/* BODY_COST is the cost of a vector loop body.  Adjust the cost as necessary
   and return the new cost.  */
unsigned int
aarch64_vector_costs::
adjust_body_cost (loop_vec_info loop_vinfo,
                  const aarch64_vector_costs *scalar_costs,
                  unsigned int body_cost)
{
  if (scalar_costs->m_ops.is_empty () || m_ops.is_empty ())
    return body_cost;

  const auto &scalar_ops = scalar_costs->m_ops[0];
  const auto &vector_ops = m_ops[0];
  unsigned int estimated_vf = vect_vf_for_cost (loop_vinfo);
  unsigned int orig_body_cost = body_cost;
  bool should_disparage = false;

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "Original vector body cost = %d\n", body_cost);

  /* If we know we have a single partial vector iteration, cap the VF
     to the number of scalar iterations for costing purposes.  */
  if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
    {
      auto niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
      if (niters < estimated_vf && dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location,
                         "Scalar loop iterates at most %wd times.  Capping VF"
                         " from %d to %wd\n", niters, estimated_vf, niters);

      estimated_vf = MIN (estimated_vf, niters);
    }

  fractional_cost scalar_cycles_per_iter
    = scalar_ops.min_cycles_per_iter () * estimated_vf;

  fractional_cost vector_cycles_per_iter = vector_ops.min_cycles_per_iter ();

  if (dump_enabled_p ())
    {
      if (IN_RANGE (m_num_vector_iterations, 0, 65536))
        dump_printf_loc (MSG_NOTE, vect_location,
                         "Vector loop iterates at most %wd times\n",
                         m_num_vector_iterations);
      dump_printf_loc (MSG_NOTE, vect_location, "Scalar issue estimate:\n");
      scalar_ops.dump ();
      dump_printf_loc (MSG_NOTE, vect_location,
                       "  estimated cycles per vector iteration"
                       " (for VF %d) = %f\n",
                       estimated_vf, scalar_cycles_per_iter.as_double ());
    }

  if (vector_ops.sve_issue_info ())
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location, "SVE issue estimate:\n");
      vector_cycles_per_iter
        = adjust_body_cost_sve (&vector_ops, scalar_cycles_per_iter,
                                orig_body_cost, &body_cost, &should_disparage);

      if (aarch64_tune_params.vec_costs == &neoverse512tvb_vector_cost)
        {
          /* Also take Neoverse V1 tuning into account, doubling the
             scalar and Advanced SIMD estimates to account for the
             doubling in SVE vector length.  */
          if (dump_enabled_p ())
            dump_printf_loc (MSG_NOTE, vect_location,
                             "Neoverse V1 estimate:\n");
          auto vf_factor = m_ops[1].vf_factor ();
          adjust_body_cost_sve (&m_ops[1], scalar_cycles_per_iter * vf_factor,
                                orig_body_cost, &body_cost, &should_disparage);
        }
    }
  else
    {
      if (dump_enabled_p ())
        {
          dump_printf_loc (MSG_NOTE, vect_location,
                           "Vector issue estimate:\n");
          vector_ops.dump ();
        }
    }

  /* Decide whether to stick to latency-based costs or whether to try to
     take issue rates into account.  */
  unsigned int threshold = aarch64_loop_vect_issue_rate_niters;
  if (m_vec_flags & VEC_ANY_SVE)
    threshold = CEIL (threshold, aarch64_estimated_sve_vq ());

  if (m_num_vector_iterations >= 1
      && m_num_vector_iterations < threshold)
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location,
                         "Low iteration count, so using pure latency"
                         " costs\n");
    }
  /* Increase the cost of the vector code if it looks like the scalar code
     could issue more quickly.  These values are only rough estimates,
     so minor differences should only result in minor changes.  */
  else if (scalar_cycles_per_iter < vector_cycles_per_iter)
    {
      body_cost = fractional_cost::scale (body_cost, vector_cycles_per_iter,
                                          scalar_cycles_per_iter);
      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location,
                         "Increasing body cost to %d because scalar code"
                         " would issue more quickly\n", body_cost);
    }
  /* In general, it's expected that the proposed vector code would be able
     to issue more quickly than the original scalar code.  This should
     already be reflected to some extent in the latency-based costs.

     However, the latency-based costs effectively assume that the scalar
     code and the vector code execute serially, which tends to underplay
     one important case: if the real (non-serialized) execution time of
     a scalar iteration is dominated by loop-carried dependencies,
     and if the vector code is able to reduce both the length of
     the loop-carried dependencies *and* the number of cycles needed
     to issue the code in general, we can be more confident that the
     vector code is an improvement, even if adding the other (non-loop-carried)
     latencies tends to hide this saving.  We therefore reduce the cost of the
     vector loop body in proportion to the saving.  */
  else if (scalar_ops.reduction_latency > vector_ops.reduction_latency
           && scalar_ops.reduction_latency == scalar_cycles_per_iter
           && scalar_cycles_per_iter > vector_cycles_per_iter
           && !should_disparage)
    {
      body_cost = fractional_cost::scale (body_cost, vector_cycles_per_iter,
                                          scalar_cycles_per_iter);
      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location,
                         "Decreasing body cost to %d to account for smaller"
                         " reduction latency\n", body_cost);
    }

  return body_cost;
}
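/* The scaling above is linear in the issue-rate ratio: with a body cost of
   100, a vector estimate of 6 cycles per iteration and a scalar estimate
   of 4, the cost becomes 100 * 6 / 4 = 150; the reduction-latency case
   applies the same formula in the other direction, lowering the cost when
   the vector code issues faster (the numbers here are illustrative).  */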
void
aarch64_vector_costs::finish_cost (const vector_costs *uncast_scalar_costs)
{
  /* Record the issue information for any SVE WHILE instructions that the
     loop needs.  */
  loop_vec_info loop_vinfo = dyn_cast<loop_vec_info> (m_vinfo);
  if (!m_ops.is_empty ()
      && loop_vinfo
      && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
    {
      unsigned int num_masks = 0;
      rgroup_controls *rgm;
      unsigned int num_vectors_m1;
      FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo).rgc_vec,
                        num_vectors_m1, rgm)
        if (rgm->type)
          num_masks += num_vectors_m1 + 1;
      for (auto &ops : m_ops)
        if (auto *issue = ops.sve_issue_info ())
          ops.pred_ops += num_masks * issue->while_pred_ops;
    }

  auto *scalar_costs
    = static_cast<const aarch64_vector_costs *> (uncast_scalar_costs);
  if (loop_vinfo && m_vec_flags)
    {
      m_costs[vect_body] = adjust_body_cost (loop_vinfo, scalar_costs,
                                             m_costs[vect_body]);
      m_suggested_unroll_factor = determine_suggested_unroll_factor ();

      /* For gather and scatters there's an additional overhead for the first
         iteration.  For low count loops they're not beneficial so model the
         overhead as loop prologue costs.  */
      m_costs[vect_prologue] += m_sve_gather_scatter_init_cost;
    }

  /* Apply the heuristic described above m_stp_sequence_cost.  Prefer
     the scalar code in the event of a tie, since there is more chance
     of scalar code being optimized with surrounding operations.

     In addition, if the vector body is a simple store to a decl that
     is elsewhere loaded using vld1, strongly prefer the vector form,
     to the extent of giving the prologue a zero cost.  See the comment
     above m_stores_to_vector_load_decl for details.  */
  if (!loop_vinfo
      && scalar_costs
      && m_stp_sequence_cost != ~0U)
    {
      if (m_stores_to_vector_load_decl)
        m_costs[vect_prologue] = 0;
      else if (m_stp_sequence_cost >= scalar_costs->m_stp_sequence_cost)
        m_costs[vect_body] = 2 * scalar_costs->total_cost ();
    }

  vector_costs::finish_cost (scalar_costs);
}
bool
aarch64_vector_costs::
better_main_loop_than_p (const vector_costs *uncast_other) const
{
  auto other = static_cast<const aarch64_vector_costs *> (uncast_other);

  auto this_loop_vinfo = as_a<loop_vec_info> (this->m_vinfo);
  auto other_loop_vinfo = as_a<loop_vec_info> (other->m_vinfo);

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "Comparing two main loops (%s at VF %d vs %s at VF %d)\n",
                     GET_MODE_NAME (this_loop_vinfo->vector_mode),
                     vect_vf_for_cost (this_loop_vinfo),
                     GET_MODE_NAME (other_loop_vinfo->vector_mode),
                     vect_vf_for_cost (other_loop_vinfo));

  /* Apply the unrolling heuristic described above
     m_unrolled_advsimd_niters.  */
  if (bool (m_unrolled_advsimd_stmts)
      != bool (other->m_unrolled_advsimd_stmts))
    {
      bool this_prefer_unrolled = this->prefer_unrolled_loop ();
      bool other_prefer_unrolled = other->prefer_unrolled_loop ();
      if (this_prefer_unrolled != other_prefer_unrolled)
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_NOTE, vect_location,
                             "Preferring Advanced SIMD loop because"
                             " it can be unrolled\n");
          return other_prefer_unrolled;
        }
    }

  for (unsigned int i = 0; i < m_ops.length (); ++i)
    {
      if (dump_enabled_p ())
        {
          if (i)
            dump_printf_loc (MSG_NOTE, vect_location,
                             "Reconsidering with subtuning %d\n", i);
          dump_printf_loc (MSG_NOTE, vect_location,
                           "Issue info for %s loop:\n",
                           GET_MODE_NAME (this_loop_vinfo->vector_mode));
          this->m_ops[i].dump ();
          dump_printf_loc (MSG_NOTE, vect_location,
                           "Issue info for %s loop:\n",
                           GET_MODE_NAME (other_loop_vinfo->vector_mode));
          other->m_ops[i].dump ();
        }

      auto this_estimated_vf = (vect_vf_for_cost (this_loop_vinfo)
                                * this->m_ops[i].vf_factor ());
      auto other_estimated_vf = (vect_vf_for_cost (other_loop_vinfo)
                                 * other->m_ops[i].vf_factor ());

      /* If it appears that one loop could process the same amount of data
         in fewer cycles, prefer that loop over the other one.  */
      fractional_cost this_cost
        = this->m_ops[i].min_cycles_per_iter () * other_estimated_vf;
      fractional_cost other_cost
        = other->m_ops[i].min_cycles_per_iter () * this_estimated_vf;
      if (dump_enabled_p ())
        {
          dump_printf_loc (MSG_NOTE, vect_location,
                           "Weighted cycles per iteration of %s loop ~= %f\n",
                           GET_MODE_NAME (this_loop_vinfo->vector_mode),
                           this_cost.as_double ());
          dump_printf_loc (MSG_NOTE, vect_location,
                           "Weighted cycles per iteration of %s loop ~= %f\n",
                           GET_MODE_NAME (other_loop_vinfo->vector_mode),
                           other_cost.as_double ());
        }
      if (this_cost != other_cost)
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_NOTE, vect_location,
                             "Preferring loop with lower cycles"
                             " per iteration\n");
          return this_cost < other_cost;
        }

      /* If the issue rate of SVE code is limited by predicate operations
         (i.e. if sve_pred_cycles_per_iter > sve_nonpred_cycles_per_iter),
         and if Advanced SIMD code could issue within the limit imposed
         by the predicate operations, the predicate operations are adding an
         overhead that the original code didn't have and so we should prefer
         the Advanced SIMD version.  */
      auto better_pred_limit_p = [](const aarch64_vec_op_count &a,
                                    const aarch64_vec_op_count &b) -> bool
        {
          if (a.pred_ops == 0
              && (b.min_pred_cycles_per_iter ()
                  > b.min_nonpred_cycles_per_iter ()))
            {
              if (dump_enabled_p ())
                dump_printf_loc (MSG_NOTE, vect_location,
                                 "Preferring Advanced SIMD loop since"
                                 " SVE loop is predicate-limited\n");
              return true;
            }
          return false;
        };
      if (better_pred_limit_p (this->m_ops[i], other->m_ops[i]))
        return true;
      if (better_pred_limit_p (other->m_ops[i], this->m_ops[i]))
        return false;
    }

  return vector_costs::better_main_loop_than_p (other);
}
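/* The cross-multiplication above compares cycles per element rather than
   cycles per iteration: a loop needing 4 cycles at VF 8 beats one needing
   3 cycles at VF 4, since 4 * 4 = 16 is less than 3 * 8 = 24 once each
   cost is weighted by the other loop's estimated VF (illustrative numbers
   only).  */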
static void initialize_aarch64_code_model (struct gcc_options *);

/* Parse TOKEN, which has length LENGTH, to see if it is an option
   described in FLAG.  If it is, return the index bit for that fusion type.
   If not, error (printing OPTION_NAME) and return zero.  */

static unsigned int
aarch64_parse_one_option_token (const char *token,
                                unsigned int length,
                                const struct aarch64_flag_desc *flag,
                                const char *option_name)
{
  for (; flag->name != NULL; flag++)
    {
      if (length == strlen (flag->name)
          && !strncmp (flag->name, token, length))
        return flag->flag;
    }

  error ("unknown flag passed in %<-moverride=%s%> (%s)", option_name, token);
  return 0;
}

/* Parse OPTION which is a '.'-separated list of flags to enable.
   FLAGS gives the list of flags we understand, INITIAL_STATE gives any
   default state we inherit from the CPU tuning structures.  OPTION_NAME
   gives the top-level option we are parsing in the -moverride string,
   for use in error messages.  */

static unsigned int
aarch64_parse_boolean_options (const char *option,
                               const struct aarch64_flag_desc *flags,
                               unsigned int initial_state,
                               const char *option_name)
{
  const char separator = '.';
  const char* specs = option;
  const char* ntoken = option;
  unsigned int found_flags = initial_state;

  while ((ntoken = strchr (specs, separator)))
    {
      size_t token_length = ntoken - specs;
      unsigned token_ops = aarch64_parse_one_option_token (specs,
                                                           token_length,
                                                           flags,
                                                           option_name);
      /* If we find "none" (or, for simplicity's sake, an error) anywhere
         in the token stream, reset the supported operations.  So:

           adrp+add.cmp+branch.none.adrp+add

         would have the result of turning on only adrp+add fusion.  */
      if (!token_ops)
        found_flags = 0;

      found_flags |= token_ops;
      specs = ++ntoken;
    }

  /* We ended with a trailing separator, print something.  */
  if (!(*specs))
    {
      error ("%qs string ill-formed", option_name);
      return 0;
    }

  /* We still have one more token to parse.  */
  size_t token_length = strlen (specs);
  unsigned token_ops = aarch64_parse_one_option_token (specs,
                                                       token_length,
                                                       flags,
                                                       option_name);
  if (!token_ops)
    found_flags = 0;

  found_flags |= token_ops;
  return found_flags;
}
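/* For example, parsing "cmp+branch.none.adrp+add" against the fusion flag
   table first enables cmp+branch, then resets everything when "none" is
   seen, and finally leaves only adrp+add enabled, which is the value
   returned to the caller.  */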
/* Support for overriding instruction fusion.  */

static void
aarch64_parse_fuse_string (const char *fuse_string,
                           struct tune_params *tune)
{
  tune->fusible_ops = aarch64_parse_boolean_options (fuse_string,
                                                     aarch64_fusible_pairs,
                                                     tune->fusible_ops,
                                                     "fuse=");
}

/* Support for overriding other tuning flags.  */

static void
aarch64_parse_tune_string (const char *tune_string,
                           struct tune_params *tune)
{
  tune->extra_tuning_flags
    = aarch64_parse_boolean_options (tune_string,
                                     aarch64_tuning_flags,
                                     tune->extra_tuning_flags,
                                     "tune=");
}

/* Parse the sve_width tuning moverride string in TUNE_STRING.
   Accept the valid SVE vector widths allowed by
   aarch64_sve_vector_bits_enum and use it to override sve_width
   in TUNE.  */

static void
aarch64_parse_sve_width_string (const char *tune_string,
                                struct tune_params *tune)
{
  int width = -1;

  int n = sscanf (tune_string, "%d", &width);
  if (n == EOF)
    {
      error ("invalid format for %<sve_width%>");
      return;
    }
  switch (width)
    {
    case SVE_128:
    case SVE_256:
    case SVE_512:
    case SVE_1024:
    case SVE_2048:
      break;
    default:
      error ("invalid %<sve_width%> value: %d", width);
    }
  tune->sve_width = (enum aarch64_sve_vector_bits_enum) width;
}
/* Parse TOKEN, which has length LENGTH, to see if it is a tuning option
   we understand.  If it is, extract the option string and hand off to
   the appropriate function.  */

static void
aarch64_parse_one_override_token (const char* token,
                                  size_t length,
                                  struct tune_params *tune)
{
  const struct aarch64_tuning_override_function *fn
    = aarch64_tuning_override_functions;

  const char *option_part = strchr (token, '=');
  if (!option_part)
    {
      error ("tuning string missing in option (%s)", token);
      return;
    }

  /* Get the length of the option name.  */
  length = option_part - token;
  /* Skip the '=' to get to the option string.  */
  option_part++;

  for (; fn->name != NULL; fn++)
    {
      if (!strncmp (fn->name, token, length))
        {
          fn->parse_override (option_part, tune);
          return;
        }
    }

  error ("unknown tuning option (%s)", token);
  return;
}
/* A checking mechanism for the implementation of the tls size.  */

static void
initialize_aarch64_tls_size (struct gcc_options *opts)
{
  if (aarch64_tls_size == 0)
    aarch64_tls_size = 24;

  switch (opts->x_aarch64_cmodel_var)
    {
    case AARCH64_CMODEL_TINY:
      /* Both the default and maximum TLS size allowed under tiny is 1M which
         needs two instructions to address, so we clamp the size to 24.  */
      if (aarch64_tls_size > 24)
        aarch64_tls_size = 24;
      break;
    case AARCH64_CMODEL_SMALL:
      /* The maximum TLS size allowed under small is 4G.  */
      if (aarch64_tls_size > 32)
        aarch64_tls_size = 32;
      break;
    case AARCH64_CMODEL_LARGE:
      /* The maximum TLS size allowed under large is 16E.
         FIXME: 16E should be 64bit, we only support 48bit offset now.  */
      if (aarch64_tls_size > 48)
        aarch64_tls_size = 48;
      break;
    default:
      gcc_unreachable ();
    }
}
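/* For instance, combining the tiny code model with a requested TLS size of
   32 silently clamps the size back to 24 above, since tiny addresses at
   most 1M of TLS data with its two-instruction sequence.  */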
/* Return the CPU corresponding to the enum CPU.  */

static const struct processor *
aarch64_get_tune_cpu (enum aarch64_cpu cpu)
{
  gcc_assert (cpu != aarch64_no_cpu);

  return &all_cores[cpu];
}

/* Return the architecture corresponding to the enum ARCH.  */

static const struct processor *
aarch64_get_arch (enum aarch64_arch arch)
{
  gcc_assert (arch != aarch64_no_arch);

  return &all_architectures[arch];
}
/* Parse STRING looking for options in the format:
     string     :: option:string
     option     :: name=substring
     name       :: {a-z}+
     substring  :: defined by option.  */

static void
aarch64_parse_override_string (const char* input_string,
                               struct tune_params* tune)
{
  const char separator = ':';
  size_t string_length = strlen (input_string) + 1;
  char *string_root = (char *) xmalloc (sizeof (*string_root) * string_length);
  char *string = string_root;
  strncpy (string, input_string, string_length);
  string[string_length - 1] = '\0';

  char* ntoken = string;

  while ((ntoken = strchr (string, separator)))
    {
      size_t token_length = ntoken - string;
      /* Make this substring look like a string.  */
      *ntoken = '\0';
      aarch64_parse_one_override_token (string, token_length, tune);
      string = ++ntoken;
    }

  /* One last option to parse.  */
  aarch64_parse_one_override_token (string, strlen (string), tune);
  free (string_root);
}
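/* So an option such as -moverride=fuse=adrp+add:sve_width=256 is split at
   each ':' and each name=value token is handed to
   aarch64_parse_one_override_token above.  */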
/* Adjust CURRENT_TUNE (a generic tuning struct) with settings that
   are best for a generic target with the currently-enabled architecture
   extensions.  */
static void
aarch64_adjust_generic_arch_tuning (struct tune_params &current_tune)
{
  /* Neoverse V1 is the only core that is known to benefit from
     AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS.  There is therefore no
     point enabling it for SVE2 and above.  */
  if (TARGET_SVE2)
    current_tune.extra_tuning_flags
      &= ~AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS;
}
static void
aarch64_override_options_after_change_1 (struct gcc_options *opts)
{
  /* PR 70044: We have to be careful about being called multiple times for the
     same function.  This means all changes should be repeatable.  */

  /* Set aarch64_use_frame_pointer based on -fno-omit-frame-pointer.
     Disable the frame pointer flag so the mid-end will not use a frame
     pointer in leaf functions in order to support -fomit-leaf-frame-pointer.
     Set x_flag_omit_frame_pointer to the special value 2 to differentiate
     between -fomit-frame-pointer (1) and -fno-omit-frame-pointer (2).  */
  aarch64_use_frame_pointer = opts->x_flag_omit_frame_pointer != 1;
  if (opts->x_flag_omit_frame_pointer == 0)
    opts->x_flag_omit_frame_pointer = 2;

  /* If not optimizing for size, set the default
     alignment to what the target wants.  */
  if (!opts->x_optimize_size)
    {
      if (opts->x_flag_align_loops && !opts->x_str_align_loops)
        opts->x_str_align_loops = aarch64_tune_params.loop_align;
      if (opts->x_flag_align_jumps && !opts->x_str_align_jumps)
        opts->x_str_align_jumps = aarch64_tune_params.jump_align;
      if (opts->x_flag_align_functions && !opts->x_str_align_functions)
        opts->x_str_align_functions = aarch64_tune_params.function_align;
    }

  /* We default to no pc-relative literal loads.  */

  aarch64_pcrelative_literal_loads = false;

  /* If -mpc-relative-literal-loads is set on the command line, this
     implies that the user asked for PC relative literal loads.  */
  if (opts->x_pcrelative_literal_loads == 1)
    aarch64_pcrelative_literal_loads = true;

  /* In the tiny memory model it makes no sense to disallow PC relative
     literal pool loads.  */
  if (aarch64_cmodel == AARCH64_CMODEL_TINY
      || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
    aarch64_pcrelative_literal_loads = true;

  /* When enabling the lower precision Newton series for the square root, also
     enable it for the reciprocal square root, since the latter is an
     intermediary step for the former.  */
  if (flag_mlow_precision_sqrt)
    flag_mrecip_low_precision_sqrt = true;
}
/* 'Unpack' the internal tuning structs and update the options
   in OPTS.  The caller must have set up selected_tune and selected_arch
   as all the other target-specific codegen decisions are
   derived from them.  */

void
aarch64_override_options_internal (struct gcc_options *opts)
{
  const struct processor *tune = aarch64_get_tune_cpu (opts->x_selected_tune);
  aarch64_tune = tune->sched_core;
  /* Make a copy of the tuning parameters attached to the core, which
     we may later overwrite.  */
  aarch64_tune_params = *(tune->tune);
  if (tune->tune == &generic_tunings)
    aarch64_adjust_generic_arch_tuning (aarch64_tune_params);

  if (opts->x_aarch64_override_tune_string)
    aarch64_parse_override_string (opts->x_aarch64_override_tune_string,
                                   &aarch64_tune_params);

  if (opts->x_aarch64_ldp_policy_param)
    aarch64_tune_params.ldp_policy_model = opts->x_aarch64_ldp_policy_param;

  if (opts->x_aarch64_stp_policy_param)
    aarch64_tune_params.stp_policy_model = opts->x_aarch64_stp_policy_param;

  /* This target defaults to strict volatile bitfields.  */
  if (opts->x_flag_strict_volatile_bitfields < 0 && abi_version_at_least (2))
    opts->x_flag_strict_volatile_bitfields = 1;

  if (aarch64_stack_protector_guard == SSP_GLOBAL
      && opts->x_aarch64_stack_protector_guard_offset_str)
    {
      error ("incompatible options %<-mstack-protector-guard=global%> and "
             "%<-mstack-protector-guard-offset=%s%>",
             aarch64_stack_protector_guard_offset_str);
    }

  if (aarch64_stack_protector_guard == SSP_SYSREG
      && !(opts->x_aarch64_stack_protector_guard_offset_str
           && opts->x_aarch64_stack_protector_guard_reg_str))
    {
      error ("both %<-mstack-protector-guard-offset%> and "
             "%<-mstack-protector-guard-reg%> must be used "
             "with %<-mstack-protector-guard=sysreg%>");
    }

  if (opts->x_aarch64_stack_protector_guard_reg_str)
    {
      if (strlen (opts->x_aarch64_stack_protector_guard_reg_str) > 100)
        error ("specify a system register with a small string length");
    }

  if (opts->x_aarch64_stack_protector_guard_offset_str)
    {
      char *end;
      const char *str = aarch64_stack_protector_guard_offset_str;
      errno = 0;
      long offs = strtol (aarch64_stack_protector_guard_offset_str, &end, 0);
      if (!*str || *end || errno)
        error ("%qs is not a valid offset in %qs", str,
               "-mstack-protector-guard-offset=");
      aarch64_stack_protector_guard_offset = offs;
    }

  if ((flag_sanitize & SANITIZE_SHADOW_CALL_STACK)
      && !fixed_regs[R18_REGNUM])
    error ("%<-fsanitize=shadow-call-stack%> requires %<-ffixed-x18%>");

  aarch64_feature_flags isa_flags = aarch64_get_isa_flags (opts);
  if ((isa_flags & (AARCH64_FL_SM_ON | AARCH64_FL_ZA_ON))
      && !(isa_flags & AARCH64_FL_SME))
    {
      if (isa_flags & AARCH64_FL_SM_ON)
        error ("streaming functions require the ISA extension %qs", "sme");
      else
        error ("functions with SME state require the ISA extension %qs",
               "sme");
      inform (input_location, "you can enable %qs using the command-line"
              " option %<-march%>, or by using the %<target%>"
              " attribute or pragma", "sme");
      opts->x_target_flags &= ~MASK_GENERAL_REGS_ONLY;
      auto new_flags = isa_flags | feature_deps::SME ().enable;
      aarch64_set_asm_isa_flags (opts, new_flags);
    }

  initialize_aarch64_code_model (opts);
  initialize_aarch64_tls_size (opts);
  aarch64_tpidr_register = opts->x_aarch64_tpidr_reg;

  int queue_depth = 0;
  switch (aarch64_tune_params.autoprefetcher_model)
    {
    case tune_params::AUTOPREFETCHER_OFF:
      queue_depth = -1;
      break;
    case tune_params::AUTOPREFETCHER_WEAK:
      queue_depth = 0;
      break;
    case tune_params::AUTOPREFETCHER_STRONG:
      queue_depth = max_insn_queue_index + 1;
      break;
    default:
      gcc_unreachable ();
    }

  /* We don't mind passing in global_options_set here as we don't use
     the *options_set structs anyway.  */
  SET_OPTION_IF_UNSET (opts, &global_options_set,
                       param_sched_autopref_queue_depth, queue_depth);

  /* Set up parameters to be used in prefetching algorithm.  Do not
     override the defaults unless we are tuning for a core we have
     researched values for.  */
  if (aarch64_tune_params.prefetch->num_slots > 0)
    SET_OPTION_IF_UNSET (opts, &global_options_set,
                         param_simultaneous_prefetches,
                         aarch64_tune_params.prefetch->num_slots);
  if (aarch64_tune_params.prefetch->l1_cache_size >= 0)
    SET_OPTION_IF_UNSET (opts, &global_options_set,
                         param_l1_cache_size,
                         aarch64_tune_params.prefetch->l1_cache_size);
  if (aarch64_tune_params.prefetch->l1_cache_line_size >= 0)
    SET_OPTION_IF_UNSET (opts, &global_options_set,
                         param_l1_cache_line_size,
                         aarch64_tune_params.prefetch->l1_cache_line_size);

  if (aarch64_tune_params.prefetch->l1_cache_line_size >= 0)
    {
      SET_OPTION_IF_UNSET (opts, &global_options_set,
                           param_destruct_interfere_size,
                           aarch64_tune_params.prefetch->l1_cache_line_size);
      SET_OPTION_IF_UNSET (opts, &global_options_set,
                           param_construct_interfere_size,
                           aarch64_tune_params.prefetch->l1_cache_line_size);
    }
  else
    {
      /* For a generic AArch64 target, cover the current range of cache line
         sizes.  */
      SET_OPTION_IF_UNSET (opts, &global_options_set,
                           param_destruct_interfere_size,
                           256);
      SET_OPTION_IF_UNSET (opts, &global_options_set,
                           param_construct_interfere_size,
                           64);
    }

  if (aarch64_tune_params.prefetch->l2_cache_size >= 0)
    SET_OPTION_IF_UNSET (opts, &global_options_set,
                         param_l2_cache_size,
                         aarch64_tune_params.prefetch->l2_cache_size);
  if (!aarch64_tune_params.prefetch->prefetch_dynamic_strides)
    SET_OPTION_IF_UNSET (opts, &global_options_set,
                         param_prefetch_dynamic_strides, 0);
  if (aarch64_tune_params.prefetch->minimum_stride >= 0)
    SET_OPTION_IF_UNSET (opts, &global_options_set,
                         param_prefetch_minimum_stride,
                         aarch64_tune_params.prefetch->minimum_stride);

  /* Use the alternative scheduling-pressure algorithm by default.  */
  SET_OPTION_IF_UNSET (opts, &global_options_set,
                       param_sched_pressure_algorithm,
                       SCHED_PRESSURE_MODEL);

  /* Validate the guard size.  */
  int guard_size = param_stack_clash_protection_guard_size;

  if (guard_size != 12 && guard_size != 16)
    error ("only values 12 (4 KB) and 16 (64 KB) are supported for guard "
           "size.  Given value %d (%llu KB) is out of range",
           guard_size, (1ULL << guard_size) / 1024ULL);

  /* Enforce that interval is the same size as size so the mid-end does the
     right thing.  */
  SET_OPTION_IF_UNSET (opts, &global_options_set,
                       param_stack_clash_protection_probe_interval,
                       guard_size);

  /* The maybe_set calls won't update the value if the user has explicitly set
     one.  Which means we need to validate that probing interval and guard size
     are equal.  */
  int probe_interval
    = param_stack_clash_protection_probe_interval;
  if (guard_size != probe_interval)
    error ("stack clash guard size %<%d%> must be equal to probing interval "
           "%<%d%>", guard_size, probe_interval);

  /* Enable sw prefetching at specified optimization level for
     CPUS that have prefetch.  Lower optimization level threshold by 1
     when profiling is enabled.  */
  if (opts->x_flag_prefetch_loop_arrays < 0
      && !opts->x_optimize_size
      && aarch64_tune_params.prefetch->default_opt_level >= 0
      && opts->x_optimize >= aarch64_tune_params.prefetch->default_opt_level)
    opts->x_flag_prefetch_loop_arrays = 1;

  /* Avoid loop-dependent FMA chains.  */
  if (aarch64_tune_params.extra_tuning_flags
      & AARCH64_EXTRA_TUNE_AVOID_CROSS_LOOP_FMA)
    SET_OPTION_IF_UNSET (opts, &global_options_set, param_avoid_fma_max_bits,
                         512);

  /* Consider fully pipelined FMA in reassociation.  */
  if (aarch64_tune_params.extra_tuning_flags
      & AARCH64_EXTRA_TUNE_FULLY_PIPELINED_FMA)
    SET_OPTION_IF_UNSET (opts, &global_options_set, param_fully_pipelined_fma,
                         1);

  aarch64_override_options_after_change_1 (opts);
}
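/* Concretely, the guard-size validation above means that
   --param stack-clash-protection-guard-size=16 gives a 64 KB guard and
   forces the probing interval to 16 as well; an explicit mismatching
   interval such as 12 is rejected with the error above.  */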
/* Straight line speculation indicators.  */
enum aarch64_sls_hardening_type
{
  SLS_NONE = 0,
  SLS_RETBR = 1,
  SLS_BLR = 2,
  SLS_ALL = 3,
};
static enum aarch64_sls_hardening_type aarch64_sls_hardening;

/* Return whether we should mitigate Straight Line Speculation for the RET
   and BR instructions.  */
bool
aarch64_harden_sls_retbr_p (void)
{
  return aarch64_sls_hardening & SLS_RETBR;
}

/* Return whether we should mitigate Straight Line Speculation for the BLR
   instruction.  */
bool
aarch64_harden_sls_blr_p (void)
{
  return aarch64_sls_hardening & SLS_BLR;
}

/* As of yet we only allow setting these options globally, in the future we may
   allow setting them per function.  */
static void
aarch64_validate_sls_mitigation (const char *const_str)
{
  char *token_save = NULL;
  char *str = NULL;

  if (strcmp (const_str, "none") == 0)
    {
      aarch64_sls_hardening = SLS_NONE;
      return;
    }
  if (strcmp (const_str, "all") == 0)
    {
      aarch64_sls_hardening = SLS_ALL;
      return;
    }

  char *str_root = xstrdup (const_str);
  str = strtok_r (str_root, ",", &token_save);
  if (!str)
    error ("invalid argument given to %<-mharden-sls=%>");

  int temp = SLS_NONE;
  while (str)
    {
      if (strcmp (str, "blr") == 0)
        temp |= SLS_BLR;
      else if (strcmp (str, "retbr") == 0)
        temp |= SLS_RETBR;
      else if (strcmp (str, "none") == 0 || strcmp (str, "all") == 0)
        {
          error ("%qs must be by itself for %<-mharden-sls=%>", str);
          break;
        }
      else
        {
          error ("invalid argument %qs for %<-mharden-sls=%>", str);
          break;
        }
      str = strtok_r (NULL, ",", &token_save);
    }
  aarch64_sls_hardening = (aarch64_sls_hardening_type) temp;
  free (str_root);
}
/* Return the VG value associated with -msve-vector-bits= value VALUE.  */

static poly_uint16
aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits_enum value)
{
  /* 128-bit SVE and Advanced SIMD modes use different register layouts
     on big-endian targets, so we would need to forbid subregs that convert
     from one to the other.  By default a reinterpret sequence would then
     involve a store to memory in one mode and a load back in the other.
     Even if we optimize that sequence using reverse instructions,
     it would still be a significant potential overhead.

     For now, it seems better to generate length-agnostic code for that
     case instead.  */
  if (value == SVE_SCALABLE
      || (value == SVE_128 && BYTES_BIG_ENDIAN))
    return poly_uint16 (2, 2);

  return (int) value / 64;
}
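/* For example, -msve-vector-bits=256 yields a VG of 256 / 64 = 4, while
   -msve-vector-bits=scalable (and 128 on big-endian targets) keeps the
   length-agnostic poly_uint16 (2, 2) encoding.  */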
/* Set the global aarch64_asm_isa_flags to FLAGS and update
   aarch64_isa_flags accordingly.  */

void
aarch64_set_asm_isa_flags (aarch64_feature_flags flags)
{
  aarch64_set_asm_isa_flags (&global_options, flags);
}

static void
aarch64_handle_no_branch_protection (void)
{
  aarch_ra_sign_scope = AARCH_FUNCTION_NONE;
  aarch_enable_bti = 0;
  aarch64_enable_gcs = 0;
}

static void
aarch64_handle_standard_branch_protection (void)
{
  aarch_ra_sign_scope = AARCH_FUNCTION_NON_LEAF;
  aarch64_ra_sign_key = AARCH64_KEY_A;
  aarch_enable_bti = 1;
  aarch64_enable_gcs = 1;
}

static void
aarch64_handle_pac_ret_protection (void)
{
  aarch_ra_sign_scope = AARCH_FUNCTION_NON_LEAF;
  aarch64_ra_sign_key = AARCH64_KEY_A;
}

static void
aarch64_handle_pac_ret_leaf (void)
{
  aarch_ra_sign_scope = AARCH_FUNCTION_ALL;
}

static void
aarch64_handle_pac_ret_b_key (void)
{
  aarch64_ra_sign_key = AARCH64_KEY_B;
}

static void
aarch64_handle_bti_protection (void)
{
  aarch_enable_bti = 1;
}

static void
aarch64_handle_gcs_protection (void)
{
  aarch64_enable_gcs = 1;
}

static const struct aarch_branch_protect_type aarch64_pac_ret_subtypes[] = {
  { "leaf", false, aarch64_handle_pac_ret_leaf, NULL, 0 },
  { "b-key", false, aarch64_handle_pac_ret_b_key, NULL, 0 },
  { NULL, false, NULL, NULL, 0 }
};

static const struct aarch_branch_protect_type aarch64_branch_protect_types[] =
{
  { "none", true, aarch64_handle_no_branch_protection, NULL, 0 },
  { "standard", true, aarch64_handle_standard_branch_protection, NULL, 0 },
  { "pac-ret", false, aarch64_handle_pac_ret_protection,
    aarch64_pac_ret_subtypes, ARRAY_SIZE (aarch64_pac_ret_subtypes) },
  { "bti", false, aarch64_handle_bti_protection, NULL, 0 },
  { "gcs", false, aarch64_handle_gcs_protection, NULL, 0 },
  { NULL, false, NULL, NULL, 0 }
};
/* Implement TARGET_OPTION_OVERRIDE.  This is called once in the beginning
   and is used to parse the -m{cpu,tune,arch} strings and set up the initial
   tuning structs.  In particular it must set selected_tune and
   aarch64_asm_isa_flags that define the available ISA features and tuning
   decisions.  It must also set selected_arch as this will be used to
   output the .arch asm tags for each function.  */

static void
aarch64_override_options (void)
{
  aarch64_feature_flags cpu_isa = 0;
  aarch64_feature_flags arch_isa = 0;
  aarch64_set_asm_isa_flags (0);

  aarch64_cpu cpu = aarch64_no_cpu;
  aarch64_arch arch = aarch64_no_arch;
  aarch64_cpu tune = aarch64_no_cpu;

  if (aarch64_harden_sls_string)
    aarch64_validate_sls_mitigation (aarch64_harden_sls_string);

  if (aarch64_branch_protection_string)
    aarch_validate_mbranch_protection (aarch64_branch_protect_types,
                                       aarch64_branch_protection_string,
                                       "-mbranch-protection=");

  /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
     If either of -march or -mtune is given, they override their
     respective component of -mcpu.  */
  if (aarch64_cpu_string)
    aarch64_validate_mcpu (aarch64_cpu_string, &cpu, &cpu_isa);

  if (aarch64_arch_string)
    aarch64_validate_march (aarch64_arch_string, &arch, &arch_isa);

  if (aarch64_tune_string)
    aarch64_validate_mtune (aarch64_tune_string, &tune);

#ifdef SUBTARGET_OVERRIDE_OPTIONS
  SUBTARGET_OVERRIDE_OPTIONS;
#endif

  if (cpu != aarch64_no_cpu && arch != aarch64_no_arch)
    {
      /* If both -mcpu and -march are specified, warn if they are not
         feature compatible.  Feature compatible means that the inclusion of
         the cpu features would end up disabling an architecture feature.  In
         other words the cpu features need to be a strict superset of the arch
         features and if so prefer the -march ISA flags.  */
      if (~cpu_isa & arch_isa)
        {
          std::string ext_diff
            = aarch64_get_extension_string_for_isa_flags (arch_isa, cpu_isa);
          warning (0, "switch %<-mcpu=%s%> conflicts with %<-march=%s%> switch "
                      "and resulted in options %qs being added",
                   aarch64_cpu_string,
                   aarch64_arch_string,
                   ext_diff.c_str ());
        }

      selected_arch = arch;
      aarch64_set_asm_isa_flags (arch_isa | AARCH64_FL_DEFAULT_ISA_MODE);
    }
  else if (cpu != aarch64_no_cpu)
    {
      selected_arch = aarch64_get_tune_cpu (cpu)->arch;
      aarch64_set_asm_isa_flags (cpu_isa | AARCH64_FL_DEFAULT_ISA_MODE);
    }
  else if (arch != aarch64_no_arch)
    {
      cpu = aarch64_get_arch (arch)->ident;
      selected_arch = arch;
      aarch64_set_asm_isa_flags (arch_isa | AARCH64_FL_DEFAULT_ISA_MODE);
    }
  else
    {
      /* No -mcpu or -march specified, so use the default CPU.  */
      cpu = TARGET_CPU_DEFAULT;
      const processor *cpu_info = aarch64_get_tune_cpu (cpu);
      selected_arch = cpu_info->arch;
      aarch64_set_asm_isa_flags (cpu_info->flags
                                 | AARCH64_FL_DEFAULT_ISA_MODE);
    }

  selected_tune = (tune != aarch64_no_cpu) ? tune : cpu;

  if (aarch_enable_bti == 2)
    {
#ifdef TARGET_ENABLE_BTI
      aarch_enable_bti = 1;
#else
      aarch_enable_bti = 0;
#endif
    }

  if (aarch64_enable_gcs == 2)
    {
#ifdef TARGET_ENABLE_GCS
      aarch64_enable_gcs = 1;
#else
      aarch64_enable_gcs = 0;
#endif
    }

  /* Return address signing is currently not supported for ILP32 targets.  For
     LP64 targets use the configured option in the absence of a command-line
     option for -mbranch-protection.  */
  if (!TARGET_ILP32 && aarch64_branch_protection_string == NULL)
    {
#ifdef TARGET_ENABLE_PAC_RET
      aarch_ra_sign_scope = AARCH_FUNCTION_NON_LEAF;
#else
      aarch_ra_sign_scope = AARCH_FUNCTION_NONE;
#endif
    }

#ifndef HAVE_AS_MABI_OPTION
  /* The compiler may have been configured with 2.23.* binutils, which does
     not have support for ILP32.  */
  if (TARGET_ILP32)
    error ("assembler does not support %<-mabi=ilp32%>");
#endif

  if (TARGET_ILP32)
    warning (OPT_Wdeprecated, "%<-mabi=ilp32%> is deprecated");

  /* Convert -msve-vector-bits to a VG count.  */
  aarch64_sve_vg = aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits);

  if (aarch_ra_sign_scope != AARCH_FUNCTION_NONE && TARGET_ILP32)
    sorry ("return address signing is only supported for %<-mabi=lp64%>");

  /* The pass to insert speculation tracking runs before
     shrink-wrapping and the latter does not know how to update the
     tracking status.  So disable it in this case.  */
  if (aarch64_track_speculation)
    flag_shrink_wrap = 0;

  aarch64_override_options_internal (&global_options);

  /* Save these options as the default ones in case we push and pop them later
     while processing functions with potential target attributes.  */
  target_option_default_node = target_option_current_node
    = build_target_option_node (&global_options, &global_options_set);
}
/* Implement targetm.override_options_after_change.  */

static void
aarch64_override_options_after_change (void)
{
  aarch64_override_options_after_change_1 (&global_options);
}

/* Implement the TARGET_OFFLOAD_OPTIONS hook.  */
static char *
aarch64_offload_options (void)
{
  if (TARGET_ILP32)
    return xstrdup ("-foffload-abi=ilp32 -foffload-abi-host-opts=-mabi=ilp32");
  else
    return xstrdup ("-foffload-abi=lp64 -foffload-abi-host-opts=-mabi=lp64");
}

static struct machine_function *
aarch64_init_machine_status (void)
{
  struct machine_function *machine;
  machine = ggc_cleared_alloc<machine_function> ();
  return machine;
}

void
aarch64_init_expanders (void)
{
  init_machine_status = aarch64_init_machine_status;
}

/* A checking mechanism for the implementation of the various code models.  */
static void
initialize_aarch64_code_model (struct gcc_options *opts)
{
  aarch64_cmodel = opts->x_aarch64_cmodel_var;
  switch (opts->x_aarch64_cmodel_var)
    {
    case AARCH64_CMODEL_TINY:
      if (opts->x_flag_pic)
        aarch64_cmodel = AARCH64_CMODEL_TINY_PIC;
      break;
    case AARCH64_CMODEL_SMALL:
      if (opts->x_flag_pic)
        {
#ifdef HAVE_AS_SMALL_PIC_RELOCS
          aarch64_cmodel = (flag_pic == 2
                            ? AARCH64_CMODEL_SMALL_PIC
                            : AARCH64_CMODEL_SMALL_SPIC);
#else
          aarch64_cmodel = AARCH64_CMODEL_SMALL_PIC;
#endif
        }
      break;
    case AARCH64_CMODEL_LARGE:
      if (opts->x_flag_pic)
        sorry ("code model %qs with %<-f%s%>", "large",
               opts->x_flag_pic > 1 ? "PIC" : "pic");
      if (opts->x_aarch64_abi == AARCH64_ABI_ILP32)
        sorry ("code model %qs not supported in ilp32 mode", "large");
      break;
    case AARCH64_CMODEL_TINY_PIC:
    case AARCH64_CMODEL_SMALL_PIC:
    case AARCH64_CMODEL_SMALL_SPIC:
      gcc_unreachable ();
    }
}
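/* For example, -mcmodel=small together with -fpic selects
   AARCH64_CMODEL_SMALL_SPIC when the assembler supports the small PIC
   relocations, and AARCH64_CMODEL_SMALL_PIC otherwise; the *_PIC values
   themselves are never valid user inputs, hence the gcc_unreachable
   above.  */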
/* Implements TARGET_OPTION_RESTORE.  Restore the backend codegen decisions
   using the information saved in PTR.  */

static void
aarch64_option_restore (struct gcc_options *opts,
                        struct gcc_options * /* opts_set */,
                        struct cl_target_option * /* ptr */)
{
  aarch64_override_options_internal (opts);
}

/* Implement TARGET_OPTION_PRINT.  */

static void
aarch64_option_print (FILE *file, int indent, struct cl_target_option *ptr)
{
  const struct processor *cpu
    = aarch64_get_tune_cpu (ptr->x_selected_tune);
  const struct processor *arch = aarch64_get_arch (ptr->x_selected_arch);
  aarch64_feature_flags isa_flags = aarch64_get_asm_isa_flags (ptr);
  std::string extension
    = aarch64_get_extension_string_for_isa_flags (isa_flags, arch->flags);

  fprintf (file, "%*sselected tune = %s\n", indent, "", cpu->name);
  fprintf (file, "%*sselected arch = %s%s\n", indent, "",
           arch->name, extension.c_str ());
}
;
19172 aarch64_reset_previous_fndecl (void)
19174 aarch64_previous_fndecl
= NULL
;
19177 /* Restore or save the TREE_TARGET_GLOBALS from or to NEW_TREE.
19178 Used by aarch64_set_current_function and aarch64_pragma_target_parse to
19179 make sure optab availability predicates are recomputed when necessary. */
19182 aarch64_save_restore_target_globals (tree new_tree
)
19184 if (TREE_TARGET_GLOBALS (new_tree
))
19185 restore_target_globals (TREE_TARGET_GLOBALS (new_tree
));
19186 else if (new_tree
== target_option_default_node
)
19187 restore_target_globals (&default_target_globals
);
19189 TREE_TARGET_GLOBALS (new_tree
) = save_target_globals_default_opts ();
19192 /* Return the target_option_node for FNDECL, or the current options
19193 if FNDECL is null. */
19196 aarch64_fndecl_options (tree fndecl
)
19199 return target_option_current_node
;
19201 if (tree options
= DECL_FUNCTION_SPECIFIC_TARGET (fndecl
))
19204 return target_option_default_node
;
/* Implement TARGET_SET_CURRENT_FUNCTION.  Unpack the codegen decisions
   like tuning and ISA features from the DECL_FUNCTION_SPECIFIC_TARGET
   of the function, if such exists.  This function may be called multiple
   times on a single function so use aarch64_previous_fndecl to avoid
   setting up identical state.  */

static void
aarch64_set_current_function (tree fndecl)
{
  tree old_tree = aarch64_fndecl_options (aarch64_previous_fndecl);
  tree new_tree = aarch64_fndecl_options (fndecl);

  auto new_isa_mode = (fndecl
		       ? aarch64_fndecl_isa_mode (fndecl)
		       : AARCH64_DEFAULT_ISA_MODE);
  auto isa_flags = aarch64_get_isa_flags (TREE_TARGET_OPTION (new_tree));

  static bool reported_zt0_p;
  if (!reported_zt0_p
      && !(isa_flags & AARCH64_FL_SME2)
      && fndecl
      && aarch64_fndecl_has_state (fndecl, "zt0"))
    {
      error ("functions with %qs state require the ISA extension %qs",
	     "zt0", "sme2");
      inform (input_location, "you can enable %qs using the command-line"
	      " option %<-march%>, or by using the %<target%>"
	      " attribute or pragma", "sme2");
      reported_zt0_p = true;
    }

  /* If nothing to do, return.  #pragma GCC reset or #pragma GCC pop to
     the default have been handled by aarch64_save_restore_target_globals from
     aarch64_pragma_target_parse.  */
  if (old_tree == new_tree
      && (!fndecl || aarch64_previous_fndecl)
      && (isa_flags & AARCH64_FL_ISA_MODES).val[0] == new_isa_mode)
    {
      gcc_assert (AARCH64_ISA_MODE == new_isa_mode);
      return;
    }

  aarch64_previous_fndecl = fndecl;

  /* First set the target options.  */
  cl_target_option_restore (&global_options, &global_options_set,
			    TREE_TARGET_OPTION (new_tree));

  /* The ISA mode can vary based on function type attributes and
     function declaration attributes.  Make sure that the target
     options correctly reflect these attributes.  */
  if ((isa_flags & AARCH64_FL_ISA_MODES).val[0] != new_isa_mode)
    {
      auto base_flags = (aarch64_asm_isa_flags & ~AARCH64_FL_ISA_MODES);
      aarch64_set_asm_isa_flags (base_flags
				 | aarch64_feature_flags (new_isa_mode));

      aarch64_override_options_internal (&global_options);
      new_tree = build_target_option_node (&global_options,
					   &global_options_set);
      DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_tree;

      tree new_optimize = build_optimization_node (&global_options,
						   &global_options_set);
      if (new_optimize != optimization_default_node)
	DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
    }

  aarch64_save_restore_target_globals (new_tree);

  gcc_assert (AARCH64_ISA_MODE == new_isa_mode);
}
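
/* Illustrative example (not from the original source): the ISA mode can
   differ between functions in a single translation unit, e.g. when only
   some functions are marked streaming via the ACLE keyword:

     void plain (void);
     void streaming (void) __arm_streaming;

   Switching between their bodies goes through the hook above, which
   re-restores the target options and recomputes the ISA flags so that
   AARCH64_ISA_MODE matches the function currently being compiled.  */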
/* Enum describing the various ways we can handle attributes.
   In many cases we can reuse the generic option handling machinery.  */

enum aarch64_attr_opt_type
{
  aarch64_attr_mask,	/* Attribute should set a bit in target_flags.  */
  aarch64_attr_bool,	/* Attribute sets or unsets a boolean variable.  */
  aarch64_attr_enum,	/* Attribute sets an enum variable.  */
  aarch64_attr_custom	/* Attribute requires a custom handling function.  */
};

/* All the information needed to handle a target attribute.
   NAME is the name of the attribute.
   ATTR_TYPE specifies the type of behavior of the attribute as described
   in the definition of enum aarch64_attr_opt_type.
   ALLOW_NEG is true if the attribute supports a "no-" form.
   HANDLER is the function that takes the attribute string as an argument.
   It is needed only when the ATTR_TYPE is aarch64_attr_custom.
   OPT_NUM is the enum specifying the option that the attribute modifies.
   This is needed for attributes that mirror the behavior of a command-line
   option, that is it has ATTR_TYPE aarch64_attr_mask, aarch64_attr_bool or
   aarch64_attr_enum.  */

struct aarch64_attribute_info
{
  const char *name;
  enum aarch64_attr_opt_type attr_type;
  bool allow_neg;
  bool (*handler) (const char *);
  enum opt_code opt_num;
};
/* Handle the ARCH_STR argument to the arch= target attribute.  */

static bool
aarch64_handle_attr_arch (const char *str)
{
  aarch64_arch tmp_arch = aarch64_no_arch;
  std::string invalid_extension;
  aarch64_feature_flags tmp_flags;
  enum aarch_parse_opt_result parse_res
    = aarch64_parse_arch (str, &tmp_arch, &tmp_flags, &invalid_extension);

  if (parse_res == AARCH_PARSE_OK)
    {
      gcc_assert (tmp_arch != aarch64_no_arch);
      selected_arch = tmp_arch;
      aarch64_set_asm_isa_flags (tmp_flags | (aarch64_asm_isa_flags
					      & AARCH64_FL_ISA_MODES));
      return true;
    }

  switch (parse_res)
    {
    case AARCH_PARSE_MISSING_ARG:
      error ("missing name in %<target(\"arch=\")%> pragma or attribute");
      break;
    case AARCH_PARSE_INVALID_ARG:
      error ("invalid name %qs in %<target(\"arch=\")%> pragma or attribute", str);
      aarch64_print_hint_for_arch (str);
      break;
    case AARCH_PARSE_INVALID_FEATURE:
      error ("invalid feature modifier %s of value %qs in "
	     "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
      aarch64_print_hint_for_extensions (invalid_extension.c_str ());
      break;
    default:
      gcc_unreachable ();
    }

  return false;
}
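
/* For illustration only: a use of the arch= attribute that this handler
   accepts might look like

     __attribute__ ((target ("arch=armv8.2-a+sve")))
     void f (void);

   The string after "arch=" is what arrives here as STR; the architecture
   name selects SELECTED_ARCH and any "+ext" suffixes are folded into the
   asm ISA flags.  */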
/* Handle the argument CPU_STR to the cpu= target attribute.  */

static bool
aarch64_handle_attr_cpu (const char *str)
{
  aarch64_cpu tmp_cpu = aarch64_no_cpu;
  std::string invalid_extension;
  aarch64_feature_flags tmp_flags;
  enum aarch_parse_opt_result parse_res
    = aarch64_parse_cpu (str, &tmp_cpu, &tmp_flags, &invalid_extension);

  if (parse_res == AARCH_PARSE_OK)
    {
      gcc_assert (tmp_cpu != aarch64_no_cpu);
      selected_tune = tmp_cpu;
      selected_arch = aarch64_get_tune_cpu (tmp_cpu)->arch;
      aarch64_set_asm_isa_flags (tmp_flags | (aarch64_asm_isa_flags
					      & AARCH64_FL_ISA_MODES));
      return true;
    }

  switch (parse_res)
    {
    case AARCH_PARSE_MISSING_ARG:
      error ("missing name in %<target(\"cpu=\")%> pragma or attribute");
      break;
    case AARCH_PARSE_INVALID_ARG:
      error ("invalid name %qs in %<target(\"cpu=\")%> pragma or attribute", str);
      aarch64_print_hint_for_core (str);
      break;
    case AARCH_PARSE_INVALID_FEATURE:
      error ("invalid feature modifier %qs of value %qs in "
	     "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
      aarch64_print_hint_for_extensions (invalid_extension.c_str ());
      break;
    default:
      gcc_unreachable ();
    }

  return false;
}
/* Handle the argument STR to the branch-protection= attribute.  */

static bool
aarch64_handle_attr_branch_protection (const char* str)
{
  return aarch_validate_mbranch_protection (aarch64_branch_protect_types, str,
					    "target(\"branch-protection=\")");
}
/* Handle the argument STR to the tune= target attribute.  */

static bool
aarch64_handle_attr_tune (const char *str)
{
  aarch64_cpu tmp_tune = aarch64_no_cpu;
  enum aarch_parse_opt_result parse_res
    = aarch64_parse_tune (str, &tmp_tune);

  if (parse_res == AARCH_PARSE_OK)
    {
      gcc_assert (tmp_tune != aarch64_no_cpu);
      selected_tune = tmp_tune;
      return true;
    }

  switch (parse_res)
    {
    case AARCH_PARSE_INVALID_ARG:
      error ("invalid name %qs in %<target(\"tune=\")%> pragma or attribute", str);
      aarch64_print_hint_for_core (str);
      break;
    default:
      gcc_unreachable ();
    }

  return false;
}
/* Parse an architecture extensions target attribute string specified in STR.
   For example "+fp+nosimd".  Show any errors if needed.  Return TRUE
   if successful.  Update aarch64_isa_flags to reflect the ISA features
   turned on.  */

static bool
aarch64_handle_attr_isa_flags (char *str)
{
  enum aarch_parse_opt_result parse_res;
  auto isa_flags = aarch64_asm_isa_flags;

  /* We allow "+nothing" in the beginning to clear out all architectural
     features if the user wants to handpick specific features.  */
  if (strncmp ("+nothing", str, 8) == 0)
    {
      isa_flags &= AARCH64_FL_ISA_MODES;
      str += 8;
    }

  std::string invalid_extension;
  parse_res = aarch64_parse_extension (str, &isa_flags, &invalid_extension);

  if (parse_res == AARCH_PARSE_OK)
    {
      aarch64_set_asm_isa_flags (isa_flags);
      return true;
    }

  switch (parse_res)
    {
    case AARCH_PARSE_MISSING_ARG:
      error ("missing value in %<target()%> pragma or attribute");
      break;

    case AARCH_PARSE_INVALID_FEATURE:
      error ("invalid feature modifier %qs of value %qs in "
	     "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
      break;

    default:
      gcc_unreachable ();
    }

  return false;
}
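
/* For illustration only: an ISA-flags-only attribute string starts with '+',
   and "+nothing" resets all architectural features while keeping the ISA
   mode bits, e.g.

     __attribute__ ((target ("+nothing+simd")))
     void simd_only (void);

   leaves only the features implied by "+simd" enabled for that function.  */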
/* The target attributes that we support.  On top of these we also support just
   ISA extensions, like  __attribute__ ((target ("+crc"))), but that case is
   handled explicitly in aarch64_process_one_target_attr.  */

static const struct aarch64_attribute_info aarch64_attributes[] =
{
  { "general-regs-only", aarch64_attr_mask, false, NULL,
     OPT_mgeneral_regs_only },
  { "fix-cortex-a53-835769", aarch64_attr_bool, true, NULL,
     OPT_mfix_cortex_a53_835769 },
  { "fix-cortex-a53-843419", aarch64_attr_bool, true, NULL,
     OPT_mfix_cortex_a53_843419 },
  { "cmodel", aarch64_attr_enum, false, NULL, OPT_mcmodel_ },
  { "strict-align", aarch64_attr_mask, true, NULL, OPT_mstrict_align },
  { "omit-leaf-frame-pointer", aarch64_attr_bool, true, NULL,
     OPT_momit_leaf_frame_pointer },
  { "tls-dialect", aarch64_attr_enum, false, NULL, OPT_mtls_dialect_ },
  { "arch", aarch64_attr_custom, false, aarch64_handle_attr_arch,
     OPT_march_ },
  { "cpu", aarch64_attr_custom, false, aarch64_handle_attr_cpu, OPT_mcpu_ },
  { "tune", aarch64_attr_custom, false, aarch64_handle_attr_tune,
     OPT_mtune_ },
  { "branch-protection", aarch64_attr_custom, false,
     aarch64_handle_attr_branch_protection, OPT_mbranch_protection_ },
  { "sign-return-address", aarch64_attr_enum, false, NULL,
     OPT_msign_return_address_ },
  { "outline-atomics", aarch64_attr_bool, true, NULL,
     OPT_moutline_atomics },
  { NULL, aarch64_attr_custom, false, NULL, OPT____ }
};
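
/* For illustration only: the table above means that, for example,

     __attribute__ ((target ("cpu=cortex-a57,no-omit-leaf-frame-pointer")))
     void tuned (void);

   routes the first token to aarch64_handle_attr_cpu and the second through
   the generic boolean machinery, with the "no-" prefix inverting the
   value.  */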
/* Parse ARG_STR which contains the definition of one target attribute.
   Show appropriate errors if any or return true if the attribute is valid.  */

static bool
aarch64_process_one_target_attr (char *arg_str)
{
  bool invert = false;

  size_t len = strlen (arg_str);

  if (len == 0)
    {
      error ("malformed %<target()%> pragma or attribute");
      return false;
    }

  auto_vec<char, 32> buffer;
  buffer.safe_grow (len + 1);
  char *str_to_check = buffer.address ();
  memcpy (str_to_check, arg_str, len + 1);

  /* We have something like __attribute__ ((target ("+fp+nosimd"))).
     It is easier to detect and handle it explicitly here rather than going
     through the machinery for the rest of the target attributes in this
     function.  */
  if (*str_to_check == '+')
    return aarch64_handle_attr_isa_flags (str_to_check);

  if (len > 3 && startswith (str_to_check, "no-"))
    {
      invert = true;
      str_to_check += 3;
    }
  char *arg = strchr (str_to_check, '=');

  /* If we found opt=foo then terminate STR_TO_CHECK at the '='
     and point ARG to "foo".  */
  if (arg)
    {
      *arg = '\0';
      arg++;
    }
  const struct aarch64_attribute_info *p_attr;
  bool found = false;
  for (p_attr = aarch64_attributes; p_attr->name; p_attr++)
    {
      /* If the names don't match up, or the user has given an argument
	 to an attribute that doesn't accept one, or didn't give an argument
	 to an attribute that expects one, fail to match.  */
      if (strcmp (str_to_check, p_attr->name) != 0)
	continue;

      found = true;
      bool attr_need_arg_p = p_attr->attr_type == aarch64_attr_custom
			      || p_attr->attr_type == aarch64_attr_enum;

      if (attr_need_arg_p ^ (arg != NULL))
	{
	  error ("pragma or attribute %<target(\"%s\")%> does not accept an argument", str_to_check);
	  return false;
	}

      /* If the name matches but the attribute does not allow "no-" versions
	 then we can't match.  */
      if (invert && !p_attr->allow_neg)
	{
	  error ("pragma or attribute %<target(\"%s\")%> does not allow a negated form", str_to_check);
	  return false;
	}

      switch (p_attr->attr_type)
	{
	/* Has a custom handler registered.
	   For example, cpu=, arch=, tune=.  */
	case aarch64_attr_custom:
	  gcc_assert (p_attr->handler);
	  if (!p_attr->handler (arg))
	    return false;
	  break;

	/* Either set or unset a boolean option.  */
	case aarch64_attr_bool:
	  {
	    struct cl_decoded_option decoded;

	    generate_option (p_attr->opt_num, NULL, !invert,
			     CL_TARGET, &decoded);
	    aarch64_handle_option (&global_options, &global_options_set,
				   &decoded, input_location);
	    break;
	  }
	/* Set or unset a bit in the target_flags.  aarch64_handle_option
	   should know what mask to apply given the option number.  */
	case aarch64_attr_mask:
	  {
	    struct cl_decoded_option decoded;
	    /* We only need to specify the option number.
	       aarch64_handle_option will know which mask to apply.  */
	    decoded.opt_index = p_attr->opt_num;
	    decoded.value = !invert;
	    aarch64_handle_option (&global_options, &global_options_set,
				   &decoded, input_location);
	    break;
	  }
	/* Use the option setting machinery to set an option to an enum.  */
	case aarch64_attr_enum:
	  {
	    gcc_assert (arg);
	    bool valid;
	    int value;
	    valid = opt_enum_arg_to_value (p_attr->opt_num, arg,
					   &value, CL_TARGET);
	    if (valid)
	      set_option (&global_options, NULL, p_attr->opt_num, value,
			  NULL, DK_UNSPECIFIED, input_location,
			  global_dc);
	    else
	      error ("pragma or attribute %<target(\"%s=%s\")%> is not valid", str_to_check, arg);
	    break;
	  }
	default:
	  gcc_unreachable ();
	}
    }

  /* If we reached here we either have found an attribute and validated
     it or didn't match any.  If we matched an attribute but its arguments
     were malformed we will have returned false already.  */
  return found;
}
/* Count how many times the character C appears in
   NULL-terminated string STR.  */

static unsigned int
num_occurences_in_str (char c, char *str)
{
  unsigned int res = 0;
  while (*str != '\0')
    {
      if (*str == c)
	res++;

      str++;
    }

  return res;
}
/* Parse the tree in ARGS that contains the target attribute information
   and update the global target options space.  */

bool
aarch64_process_target_attr (tree args)
{
  if (TREE_CODE (args) == TREE_LIST)
    {
      do
	{
	  tree head = TREE_VALUE (args);
	  if (head)
	    {
	      if (!aarch64_process_target_attr (head))
		return false;
	    }
	  args = TREE_CHAIN (args);
	} while (args);

      return true;
    }

  if (TREE_CODE (args) != STRING_CST)
    {
      error ("attribute %<target%> argument not a string");
      return false;
    }

  size_t len = strlen (TREE_STRING_POINTER (args));
  auto_vec<char, 32> buffer;
  buffer.safe_grow (len + 1);
  char *str_to_check = buffer.address ();
  memcpy (str_to_check, TREE_STRING_POINTER (args), len + 1);

  if (len == 0)
    {
      error ("malformed %<target()%> pragma or attribute");
      return false;
    }

  /* Used to catch empty spaces between commas i.e.
     attribute ((target ("attr1,,attr2"))).  */
  unsigned int num_commas = num_occurences_in_str (',', str_to_check);

  /* Handle multiple target attributes separated by ','.  */
  char *token = strtok_r (str_to_check, ",", &str_to_check);

  unsigned int num_attrs = 0;
  while (token)
    {
      num_attrs++;
      if (!aarch64_process_one_target_attr (token))
	{
	  /* Check if token is possibly an arch extension without
	     a leading '+'.  */
	  aarch64_feature_flags isa_temp = 0;
	  auto with_plus = std::string ("+") + token;
	  enum aarch_parse_opt_result ext_res
	    = aarch64_parse_extension (with_plus.c_str (), &isa_temp, nullptr);

	  if (ext_res == AARCH_PARSE_OK)
	    error ("arch extension %qs should be prefixed by %<+%>",
		   token);
	  else
	    error ("pragma or attribute %<target(\"%s\")%> is not valid", token);
	  return false;
	}

      token = strtok_r (NULL, ",", &str_to_check);
    }

  if (num_attrs != num_commas + 1)
    {
      error ("malformed %<target(\"%s\")%> pragma or attribute", TREE_STRING_POINTER (args));
      return false;
    }

  return true;
}
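
/* For illustration only: bare extension names are rejected by the loop
   above, so

     __attribute__ ((target ("sve")))    is diagnosed ("should be prefixed by '+'"),
     __attribute__ ((target ("+sve")))   is accepted.  */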
static bool aarch64_process_target_version_attr (tree args);
19745 /* Implement TARGET_OPTION_VALID_ATTRIBUTE_P. This is used to
19746 process attribute ((target ("..."))). */
19749 aarch64_option_valid_attribute_p (tree fndecl
, tree
, tree args
, int)
19751 struct cl_target_option cur_target
;
19754 tree new_target
, new_optimize
;
19755 tree existing_target
= DECL_FUNCTION_SPECIFIC_TARGET (fndecl
);
19757 /* If what we're processing is the current pragma string then the
19758 target option node is already stored in target_option_current_node
19759 by aarch64_pragma_target_parse in aarch64-c.cc. Use that to avoid
19760 having to re-parse the string. This is especially useful to keep
19761 arm_neon.h compile times down since that header contains a lot
19762 of intrinsics enclosed in pragmas. */
19763 if (!existing_target
&& args
== current_target_pragma
)
19765 DECL_FUNCTION_SPECIFIC_TARGET (fndecl
) = target_option_current_node
;
19768 tree func_optimize
= DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl
);
19771 = build_optimization_node (&global_options
, &global_options_set
);
19772 func_optimize
= DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl
);
19774 /* If the function changed the optimization levels as well as setting
19775 target options, start with the optimizations specified. */
19776 if (func_optimize
&& func_optimize
!= old_optimize
)
19777 cl_optimization_restore (&global_options
, &global_options_set
,
19778 TREE_OPTIMIZATION (func_optimize
));
19780 /* Save the current target options to restore at the end. */
19781 cl_target_option_save (&cur_target
, &global_options
, &global_options_set
);
19783 /* If fndecl already has some target attributes applied to it, unpack
19784 them so that we add this attribute on top of them, rather than
19785 overwriting them. */
19786 if (existing_target
)
19788 struct cl_target_option
*existing_options
19789 = TREE_TARGET_OPTION (existing_target
);
19791 if (existing_options
)
19792 cl_target_option_restore (&global_options
, &global_options_set
,
19796 cl_target_option_restore (&global_options
, &global_options_set
,
19797 TREE_TARGET_OPTION (target_option_current_node
));
19799 ret
= aarch64_process_target_attr (args
);
19802 tree version_attr
= lookup_attribute ("target_version",
19803 DECL_ATTRIBUTES (fndecl
));
19804 if (version_attr
!= NULL_TREE
)
19806 /* Reapply any target_version attribute after target attribute.
19807 This should be equivalent to applying the target_version once
19808 after processing all target attributes. */
19809 tree version_args
= TREE_VALUE (version_attr
);
19810 ret
= aarch64_process_target_version_attr (version_args
);
19814 /* Set up any additional state. */
19817 aarch64_override_options_internal (&global_options
);
19818 new_target
= build_target_option_node (&global_options
,
19819 &global_options_set
);
19824 new_optimize
= build_optimization_node (&global_options
,
19825 &global_options_set
);
19829 DECL_FUNCTION_SPECIFIC_TARGET (fndecl
) = new_target
;
19831 if (old_optimize
!= new_optimize
)
19832 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl
) = new_optimize
;
19835 cl_target_option_restore (&global_options
, &global_options_set
, &cur_target
);
19837 if (old_optimize
!= new_optimize
)
19838 cl_optimization_restore (&global_options
, &global_options_set
,
19839 TREE_OPTIMIZATION (old_optimize
));
typedef unsigned long long aarch64_fmv_feature_mask;

typedef struct
{
  const char *name;
  aarch64_fmv_feature_mask feature_mask;
  aarch64_feature_flags opt_flags;
} aarch64_fmv_feature_datum;
#define AARCH64_FMV_FEATURE(NAME, FEAT_NAME, C) \
  {NAME, 1ULL << FEAT_##FEAT_NAME, ::feature_deps::fmv_deps_##FEAT_NAME},

/* The "rdma" alias uses a different FEAT_NAME to avoid a duplicate
   feature_deps name.  */
#define FEAT_RDMA FEAT_RDM

/* FMV features are listed in priority order, to make it easier to sort target
   strings.  */
static aarch64_fmv_feature_datum aarch64_fmv_feature_data[] = {
#include "config/aarch64/aarch64-option-extensions.def"
};
/* Parse a function multiversioning feature string STR, as found in a
   target_version or target_clones attribute.

   If ISA_FLAGS is nonnull, then update it with the specified architecture
   features turned on.  If FEATURE_MASK is nonnull, then assign to it a bitmask
   representing the set of features explicitly specified in the feature string.
   Return an aarch_parse_opt_result describing the result.

   When the STR string contains an invalid or duplicate extension, a copy of
   the extension string is created and stored to INVALID_EXTENSION.  */

static enum aarch_parse_opt_result
aarch64_parse_fmv_features (const char *str, aarch64_feature_flags *isa_flags,
			    aarch64_fmv_feature_mask *feature_mask,
			    std::string *invalid_extension)
{
  if (feature_mask)
    *feature_mask = 0ULL;

  if (strcmp (str, "default") == 0)
    return AARCH_PARSE_OK;

  while (str != NULL && *str != 0)
    {
      const char *ext;
      size_t len;

      ext = strchr (str, '+');

      if (ext != NULL)
	len = ext - str;
      else
	len = strlen (str);

      if (len == 0)
	return AARCH_PARSE_MISSING_ARG;

      int num_features = ARRAY_SIZE (aarch64_fmv_feature_data);
      int i;
      for (i = 0; i < num_features; i++)
	{
	  if (strlen (aarch64_fmv_feature_data[i].name) == len
	      && strncmp (aarch64_fmv_feature_data[i].name, str, len) == 0)
	    {
	      if (isa_flags)
		*isa_flags |= aarch64_fmv_feature_data[i].opt_flags;
	      if (feature_mask)
		{
		  auto old_feature_mask = *feature_mask;
		  *feature_mask |= aarch64_fmv_feature_data[i].feature_mask;
		  if (*feature_mask == old_feature_mask)
		    {
		      /* Duplicate feature.  */
		      if (invalid_extension)
			*invalid_extension = std::string (str, len);
		      return AARCH_PARSE_DUPLICATE_FEATURE;
		    }
		}
	      break;
	    }
	}

      if (i == num_features)
	{
	  /* Feature not found in list.  */
	  if (invalid_extension)
	    *invalid_extension = std::string (str, len);
	  return AARCH_PARSE_INVALID_FEATURE;
	}

      str = ext;
      if (str)
	/* Skip over the next '+'.  */
	str++;
    }

  return AARCH_PARSE_OK;
}
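
/* For illustration only: a feature string from a target_version or
   target_clones attribute is a '+'-separated list.  For example,
   "sve2+memtag" sets the FMV bits for both features, "default" selects the
   default version, and "sve2+sve2" is reported as a duplicate feature.  */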
19944 /* Parse the tree in ARGS that contains the target_version attribute
19945 information and update the global target options space. */
19948 aarch64_process_target_version_attr (tree args
)
19950 static bool issued_warning
= false;
19951 if (!issued_warning
)
19953 warning (OPT_Wexperimental_fmv_target
,
19954 "Function Multi Versioning support is experimental, and the "
19955 "behavior is likely to change");
19956 issued_warning
= true;
19959 if (TREE_CODE (args
) == TREE_LIST
)
19961 if (TREE_CHAIN (args
))
19963 error ("attribute %<target_version%> has multiple values");
19966 args
= TREE_VALUE (args
);
19969 if (!args
|| TREE_CODE (args
) != STRING_CST
)
19971 error ("attribute %<target_version%> argument not a string");
19975 const char *str
= TREE_STRING_POINTER (args
);
19977 enum aarch_parse_opt_result parse_res
;
19978 auto isa_flags
= aarch64_asm_isa_flags
;
19980 std::string invalid_extension
;
19981 parse_res
= aarch64_parse_fmv_features (str
, &isa_flags
, NULL
,
19982 &invalid_extension
);
19984 if (parse_res
== AARCH_PARSE_OK
)
19986 aarch64_set_asm_isa_flags (isa_flags
);
19992 case AARCH_PARSE_MISSING_ARG
:
19993 error ("missing value in %<target_version%> attribute");
19996 case AARCH_PARSE_INVALID_FEATURE
:
19997 error ("invalid feature modifier %qs of value %qs in "
19998 "%<target_version%> attribute", invalid_extension
.c_str (),
20002 case AARCH_PARSE_DUPLICATE_FEATURE
:
20003 error ("duplicate feature modifier %qs of value %qs in "
20004 "%<target_version%> attribute", invalid_extension
.c_str (),
20009 gcc_unreachable ();
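
/* For illustration only: a pair of versioned definitions processed by the
   function above might look like

     __attribute__ ((target_version ("default"))) int f (void);
     __attribute__ ((target_version ("sve2"))) int f (void);

   Each non-default version folds its feature string into the ISA flags for
   that particular definition.  */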
20015 /* Implement TARGET_OPTION_VALID_VERSION_ATTRIBUTE_P. This is used to
20016 process attribute ((target_version ("..."))). */
20019 aarch64_option_valid_version_attribute_p (tree fndecl
, tree
, tree args
, int)
20021 struct cl_target_option cur_target
;
20024 tree existing_target
= DECL_FUNCTION_SPECIFIC_TARGET (fndecl
);
20026 /* Save the current target options to restore at the end. */
20027 cl_target_option_save (&cur_target
, &global_options
, &global_options_set
);
20029 /* If fndecl already has some target attributes applied to it, unpack
20030 them so that we add this attribute on top of them, rather than
20031 overwriting them. */
20032 if (existing_target
)
20034 struct cl_target_option
*existing_options
20035 = TREE_TARGET_OPTION (existing_target
);
20037 if (existing_options
)
20038 cl_target_option_restore (&global_options
, &global_options_set
,
20042 cl_target_option_restore (&global_options
, &global_options_set
,
20043 TREE_TARGET_OPTION (target_option_current_node
));
20045 ret
= aarch64_process_target_version_attr (args
);
20047 /* Set up any additional state. */
20050 aarch64_override_options_internal (&global_options
);
20051 new_target
= build_target_option_node (&global_options
,
20052 &global_options_set
);
20058 DECL_FUNCTION_SPECIFIC_TARGET (fndecl
) = new_target
;
20060 cl_target_option_restore (&global_options
, &global_options_set
, &cur_target
);
20065 /* This parses the attribute arguments to target_version in DECL and the
20066 feature mask required to select those targets. No adjustments are made to
20067 add or remove redundant feature requirements. */
20069 static aarch64_fmv_feature_mask
20070 get_feature_mask_for_version (tree decl
)
20072 tree version_attr
= lookup_attribute ("target_version",
20073 DECL_ATTRIBUTES (decl
));
20074 if (version_attr
== NULL
)
20077 const char *version_string
= TREE_STRING_POINTER (TREE_VALUE (TREE_VALUE
20079 enum aarch_parse_opt_result parse_res
;
20080 aarch64_fmv_feature_mask feature_mask
;
20082 parse_res
= aarch64_parse_fmv_features (version_string
, NULL
, &feature_mask
,
20085 /* We should have detected any errors before getting here. */
20086 gcc_assert (parse_res
== AARCH_PARSE_OK
);
20088 return feature_mask
;
20091 /* Compare priorities of two feature masks. Return:
20092 1: mask1 is higher priority
20093 -1: mask2 is higher priority
20094 0: masks are equal. */
20097 compare_feature_masks (aarch64_fmv_feature_mask mask1
,
20098 aarch64_fmv_feature_mask mask2
)
20100 int pop1
= popcount_hwi (mask1
);
20101 int pop2
= popcount_hwi (mask2
);
20107 auto diff_mask
= mask1
^ mask2
;
20108 if (diff_mask
== 0ULL)
20110 int num_features
= ARRAY_SIZE (aarch64_fmv_feature_data
);
20111 for (int i
= num_features
- 1; i
>= 0; i
--)
20113 auto bit_mask
= aarch64_fmv_feature_data
[i
].feature_mask
;
20114 if (diff_mask
& bit_mask
)
20115 return (mask1
& bit_mask
) ? 1 : -1;
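
/* For illustration only (based on the comparison above): a version whose
   mask has more feature bits set is preferred outright; when the counts
   tie, the version containing the highest-priority differing feature (the
   one listed later in aarch64-option-extensions.def) wins.  */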
20120 /* Compare priorities of two version decls. */
20123 aarch64_compare_version_priority (tree decl1
, tree decl2
)
20125 auto mask1
= get_feature_mask_for_version (decl1
);
20126 auto mask2
= get_feature_mask_for_version (decl2
);
20128 return compare_feature_masks (mask1
, mask2
);
20131 /* Build the struct __ifunc_arg_t type:
20133 struct __ifunc_arg_t
20135 unsigned long _size; // Size of the struct, so it can grow.
20136 unsigned long _hwcap;
20137 unsigned long _hwcap2;
20142 build_ifunc_arg_type ()
20144 tree ifunc_arg_type
= lang_hooks
.types
.make_type (RECORD_TYPE
);
20145 tree field1
= build_decl (UNKNOWN_LOCATION
, FIELD_DECL
,
20146 get_identifier ("_size"),
20147 long_unsigned_type_node
);
20148 tree field2
= build_decl (UNKNOWN_LOCATION
, FIELD_DECL
,
20149 get_identifier ("_hwcap"),
20150 long_unsigned_type_node
);
20151 tree field3
= build_decl (UNKNOWN_LOCATION
, FIELD_DECL
,
20152 get_identifier ("_hwcap2"),
20153 long_unsigned_type_node
);
20155 DECL_FIELD_CONTEXT (field1
) = ifunc_arg_type
;
20156 DECL_FIELD_CONTEXT (field2
) = ifunc_arg_type
;
20157 DECL_FIELD_CONTEXT (field3
) = ifunc_arg_type
;
20159 TYPE_FIELDS (ifunc_arg_type
) = field1
;
20160 DECL_CHAIN (field1
) = field2
;
20161 DECL_CHAIN (field2
) = field3
;
20163 layout_type (ifunc_arg_type
);
20165 tree const_type
= build_qualified_type (ifunc_arg_type
, TYPE_QUAL_CONST
);
20166 tree pointer_type
= build_pointer_type (const_type
);
20168 return pointer_type
;
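
/* For illustration only: a resolver built against this type has the shape

     void *resolver (unsigned long hwcap, const struct __ifunc_arg_t *arg);

   matching the parameter list constructed in make_resolver_func below; the
   struct layout itself is sketched in the comment above
   build_ifunc_arg_type.  */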
20171 /* Implement TARGET_MANGLE_DECL_ASSEMBLER_NAME, to add function multiversioning
20175 aarch64_mangle_decl_assembler_name (tree decl
, tree id
)
20177 /* For function version, add the target suffix to the assembler name. */
20178 if (TREE_CODE (decl
) == FUNCTION_DECL
20179 && DECL_FUNCTION_VERSIONED (decl
))
20181 aarch64_fmv_feature_mask feature_mask
= get_feature_mask_for_version (decl
);
20183 std::string name
= IDENTIFIER_POINTER (id
);
20185 /* For the default version, append ".default". */
20186 if (feature_mask
== 0ULL)
20188 name
+= ".default";
20189 return get_identifier (name
.c_str());
20194 int num_features
= ARRAY_SIZE (aarch64_fmv_feature_data
);
20195 for (int i
= 0; i
< num_features
; i
++)
20197 if (feature_mask
& aarch64_fmv_feature_data
[i
].feature_mask
)
20200 name
+= aarch64_fmv_feature_data
[i
].name
;
20204 if (DECL_ASSEMBLER_NAME_SET_P (decl
))
20205 SET_DECL_RTL (decl
, NULL
);
20207 id
= get_identifier (name
.c_str());
20212 /* Return an identifier for the base assembler name of a versioned function.
20213 This is computed by taking the default version's assembler name, and
20214 stripping off the ".default" suffix if it's already been appended. */
20217 get_suffixed_assembler_name (tree default_decl
, const char *suffix
)
20219 std::string name
= IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (default_decl
));
20221 auto size
= name
.size ();
20222 if (size
>= 8 && name
.compare (size
- 8, 8, ".default") == 0)
20223 name
.resize (size
- 8);
20225 return get_identifier (name
.c_str());
20228 /* Make the resolver function decl to dispatch the versions of
20229 a multi-versioned function, DEFAULT_DECL. IFUNC_ALIAS_DECL is
20230 ifunc alias that will point to the created resolver. Create an
20231 empty basic block in the resolver and store the pointer in
20232 EMPTY_BB. Return the decl of the resolver function. */
20235 make_resolver_func (const tree default_decl
,
20236 const tree ifunc_alias_decl
,
20237 basic_block
*empty_bb
)
20239 tree decl
, type
, t
;
20241 /* Create resolver function name based on default_decl. We need to remove an
20242 existing ".default" suffix if this has already been appended. */
20243 tree decl_name
= get_suffixed_assembler_name (default_decl
, ".resolver");
20244 const char *resolver_name
= IDENTIFIER_POINTER (decl_name
);
20246 /* The resolver function should have signature
20247 (void *) resolver (uint64_t, const __ifunc_arg_t *) */
20248 type
= build_function_type_list (ptr_type_node
,
20250 build_ifunc_arg_type (),
20253 decl
= build_fn_decl (resolver_name
, type
);
20254 SET_DECL_ASSEMBLER_NAME (decl
, decl_name
);
20256 DECL_NAME (decl
) = decl_name
;
20257 TREE_USED (decl
) = 1;
20258 DECL_ARTIFICIAL (decl
) = 1;
20259 DECL_IGNORED_P (decl
) = 1;
20260 TREE_PUBLIC (decl
) = 0;
20261 DECL_UNINLINABLE (decl
) = 1;
20263 /* Resolver is not external, body is generated. */
20264 DECL_EXTERNAL (decl
) = 0;
20265 DECL_EXTERNAL (ifunc_alias_decl
) = 0;
20267 DECL_CONTEXT (decl
) = NULL_TREE
;
20268 DECL_INITIAL (decl
) = make_node (BLOCK
);
20269 DECL_STATIC_CONSTRUCTOR (decl
) = 0;
20271 if (DECL_COMDAT_GROUP (default_decl
)
20272 || TREE_PUBLIC (default_decl
))
20274 /* In this case, each translation unit with a call to this
20275 versioned function will put out a resolver. Ensure it
20276 is comdat to keep just one copy. */
20277 DECL_COMDAT (decl
) = 1;
20278 make_decl_one_only (decl
, DECL_ASSEMBLER_NAME (decl
));
20281 TREE_PUBLIC (ifunc_alias_decl
) = 0;
20283 /* Build result decl and add to function_decl. */
20284 t
= build_decl (UNKNOWN_LOCATION
, RESULT_DECL
, NULL_TREE
, ptr_type_node
);
20285 DECL_CONTEXT (t
) = decl
;
20286 DECL_ARTIFICIAL (t
) = 1;
20287 DECL_IGNORED_P (t
) = 1;
20288 DECL_RESULT (decl
) = t
;
20290 /* Build parameter decls and add to function_decl. */
20291 tree arg1
= build_decl (UNKNOWN_LOCATION
, PARM_DECL
,
20292 get_identifier ("hwcap"),
20294 tree arg2
= build_decl (UNKNOWN_LOCATION
, PARM_DECL
,
20295 get_identifier ("arg"),
20296 build_ifunc_arg_type());
20297 DECL_CONTEXT (arg1
) = decl
;
20298 DECL_CONTEXT (arg2
) = decl
;
20299 DECL_ARTIFICIAL (arg1
) = 1;
20300 DECL_ARTIFICIAL (arg2
) = 1;
20301 DECL_IGNORED_P (arg1
) = 1;
20302 DECL_IGNORED_P (arg2
) = 1;
20303 DECL_ARG_TYPE (arg1
) = uint64_type_node
;
20304 DECL_ARG_TYPE (arg2
) = build_ifunc_arg_type ();
20305 DECL_ARGUMENTS (decl
) = arg1
;
20306 TREE_CHAIN (arg1
) = arg2
;
20308 gimplify_function_tree (decl
);
20309 push_cfun (DECL_STRUCT_FUNCTION (decl
));
20310 *empty_bb
= init_lowered_empty_function (decl
, false,
20311 profile_count::uninitialized ());
20313 cgraph_node::add_new_function (decl
, true);
20314 symtab
->call_cgraph_insertion_hooks (cgraph_node::get_create (decl
));
20318 gcc_assert (ifunc_alias_decl
!= NULL
);
20319 /* Mark ifunc_alias_decl as "ifunc" with resolver as resolver_name. */
20320 DECL_ATTRIBUTES (ifunc_alias_decl
)
20321 = make_attribute ("ifunc", resolver_name
,
20322 DECL_ATTRIBUTES (ifunc_alias_decl
));
20324 /* Create the alias for dispatch to resolver here. */
20325 cgraph_node::create_same_body_alias (ifunc_alias_decl
, decl
);
20329 /* This adds a condition to the basic_block NEW_BB in function FUNCTION_DECL
20330 to return a pointer to VERSION_DECL if all feature bits specified in
20331 FEATURE_MASK are not set in MASK_VAR. This function will be called during
20332 version dispatch to decide which function version to execute. It returns
20333 the basic block at the end, to which more conditions can be added. */
20335 add_condition_to_bb (tree function_decl
, tree version_decl
,
20336 aarch64_fmv_feature_mask feature_mask
,
20337 tree mask_var
, basic_block new_bb
)
20339 gimple
*return_stmt
;
20340 tree convert_expr
, result_var
;
20341 gimple
*convert_stmt
;
20342 gimple
*if_else_stmt
;
20344 basic_block bb1
, bb2
, bb3
;
20349 push_cfun (DECL_STRUCT_FUNCTION (function_decl
));
20351 gcc_assert (new_bb
!= NULL
);
20352 gseq
= bb_seq (new_bb
);
20354 convert_expr
= build1 (CONVERT_EXPR
, ptr_type_node
,
20355 build_fold_addr_expr (version_decl
));
20356 result_var
= create_tmp_var (ptr_type_node
);
20357 convert_stmt
= gimple_build_assign (result_var
, convert_expr
);
20358 return_stmt
= gimple_build_return (result_var
);
20360 if (feature_mask
== 0ULL)
20362 /* Default version. */
20363 gimple_seq_add_stmt (&gseq
, convert_stmt
);
20364 gimple_seq_add_stmt (&gseq
, return_stmt
);
20365 set_bb_seq (new_bb
, gseq
);
20366 gimple_set_bb (convert_stmt
, new_bb
);
20367 gimple_set_bb (return_stmt
, new_bb
);
20372 tree and_expr_var
= create_tmp_var (long_long_unsigned_type_node
);
20373 tree and_expr
= build2 (BIT_AND_EXPR
,
20374 long_long_unsigned_type_node
,
20376 build_int_cst (long_long_unsigned_type_node
,
20378 gimple
*and_stmt
= gimple_build_assign (and_expr_var
, and_expr
);
20379 gimple_set_block (and_stmt
, DECL_INITIAL (function_decl
));
20380 gimple_set_bb (and_stmt
, new_bb
);
20381 gimple_seq_add_stmt (&gseq
, and_stmt
);
20383 tree zero_llu
= build_int_cst (long_long_unsigned_type_node
, 0);
20384 if_else_stmt
= gimple_build_cond (EQ_EXPR
, and_expr_var
, zero_llu
,
20385 NULL_TREE
, NULL_TREE
);
20386 gimple_set_block (if_else_stmt
, DECL_INITIAL (function_decl
));
20387 gimple_set_bb (if_else_stmt
, new_bb
);
20388 gimple_seq_add_stmt (&gseq
, if_else_stmt
);
20390 gimple_seq_add_stmt (&gseq
, convert_stmt
);
20391 gimple_seq_add_stmt (&gseq
, return_stmt
);
20392 set_bb_seq (new_bb
, gseq
);
20395 e12
= split_block (bb1
, if_else_stmt
);
20397 e12
->flags
&= ~EDGE_FALLTHRU
;
20398 e12
->flags
|= EDGE_TRUE_VALUE
;
20400 e23
= split_block (bb2
, return_stmt
);
20402 gimple_set_bb (convert_stmt
, bb2
);
20403 gimple_set_bb (return_stmt
, bb2
);
20406 make_edge (bb1
, bb3
, EDGE_FALSE_VALUE
);
20409 make_edge (bb2
, EXIT_BLOCK_PTR_FOR_FN (cfun
), 0);
20416 /* This function generates the dispatch function for
20417 multi-versioned functions. DISPATCH_DECL is the function which will
20418 contain the dispatch logic. FNDECLS are the function choices for
20419 dispatch, and is a tree chain. EMPTY_BB is the basic block pointer
20420 in DISPATCH_DECL in which the dispatch code is generated. */
20423 dispatch_function_versions (tree dispatch_decl
,
20425 basic_block
*empty_bb
)
20427 gimple
*ifunc_cpu_init_stmt
;
20429 vec
<tree
> *fndecls
;
20431 gcc_assert (dispatch_decl
!= NULL
20432 && fndecls_p
!= NULL
20433 && empty_bb
!= NULL
);
20435 push_cfun (DECL_STRUCT_FUNCTION (dispatch_decl
));
20437 gseq
= bb_seq (*empty_bb
);
20438 /* Function version dispatch is via IFUNC. IFUNC resolvers fire before
20439 constructors, so explicity call __init_cpu_features_resolver here. */
20440 tree init_fn_type
= build_function_type_list (void_type_node
,
20441 long_unsigned_type_node
,
20442 build_ifunc_arg_type(),
20444 tree init_fn_id
= get_identifier ("__init_cpu_features_resolver");
20445 tree init_fn_decl
= build_decl (UNKNOWN_LOCATION
, FUNCTION_DECL
,
20446 init_fn_id
, init_fn_type
);
20447 DECL_EXTERNAL (init_fn_decl
) = 1;
20448 TREE_PUBLIC (init_fn_decl
) = 1;
20449 DECL_VISIBILITY (init_fn_decl
) = VISIBILITY_HIDDEN
;
20450 DECL_VISIBILITY_SPECIFIED (init_fn_decl
) = 1;
20451 tree arg1
= DECL_ARGUMENTS (dispatch_decl
);
20452 tree arg2
= TREE_CHAIN (arg1
);
20453 ifunc_cpu_init_stmt
= gimple_build_call (init_fn_decl
, 2, arg1
, arg2
);
20454 gimple_seq_add_stmt (&gseq
, ifunc_cpu_init_stmt
);
20455 gimple_set_bb (ifunc_cpu_init_stmt
, *empty_bb
);
20457 /* Build the struct type for __aarch64_cpu_features. */
20458 tree global_type
= lang_hooks
.types
.make_type (RECORD_TYPE
);
20459 tree field1
= build_decl (UNKNOWN_LOCATION
, FIELD_DECL
,
20460 get_identifier ("features"),
20461 long_long_unsigned_type_node
);
20462 DECL_FIELD_CONTEXT (field1
) = global_type
;
20463 TYPE_FIELDS (global_type
) = field1
;
20464 layout_type (global_type
);
20466 tree global_var
= build_decl (UNKNOWN_LOCATION
, VAR_DECL
,
20467 get_identifier ("__aarch64_cpu_features"),
20469 DECL_EXTERNAL (global_var
) = 1;
20470 TREE_PUBLIC (global_var
) = 1;
20471 DECL_VISIBILITY (global_var
) = VISIBILITY_HIDDEN
;
20472 DECL_VISIBILITY_SPECIFIED (global_var
) = 1;
20473 tree mask_var
= create_tmp_var (long_long_unsigned_type_node
);
20475 tree component_expr
= build3 (COMPONENT_REF
, long_long_unsigned_type_node
,
20476 global_var
, field1
, NULL_TREE
);
20477 gimple
*component_stmt
= gimple_build_assign (mask_var
, component_expr
);
20478 gimple_set_block (component_stmt
, DECL_INITIAL (dispatch_decl
));
20479 gimple_set_bb (component_stmt
, *empty_bb
);
20480 gimple_seq_add_stmt (&gseq
, component_stmt
);
20482 tree not_expr
= build1 (BIT_NOT_EXPR
, long_long_unsigned_type_node
, mask_var
);
20483 gimple
*not_stmt
= gimple_build_assign (mask_var
, not_expr
);
20484 gimple_set_block (not_stmt
, DECL_INITIAL (dispatch_decl
));
20485 gimple_set_bb (not_stmt
, *empty_bb
);
20486 gimple_seq_add_stmt (&gseq
, not_stmt
);
20488 set_bb_seq (*empty_bb
, gseq
);
20492 /* fndecls_p is actually a vector. */
20493 fndecls
= static_cast<vec
<tree
> *> (fndecls_p
);
20495 /* At least one more version other than the default. */
20496 unsigned int num_versions
= fndecls
->length ();
20497 gcc_assert (num_versions
>= 2);
20499 struct function_version_info
20502 aarch64_fmv_feature_mask feature_mask
;
20503 } *function_versions
;
20505 function_versions
= (struct function_version_info
*)
20506 XNEWVEC (struct function_version_info
, (num_versions
));
20508 unsigned int actual_versions
= 0;
20510 for (tree version_decl
: *fndecls
)
20512 aarch64_fmv_feature_mask feature_mask
;
20513 /* Get attribute string, parse it and find the right features. */
20514 feature_mask
= get_feature_mask_for_version (version_decl
);
20515 function_versions
[actual_versions
].version_decl
= version_decl
;
20516 function_versions
[actual_versions
].feature_mask
= feature_mask
;
20520 auto compare_feature_version_info
= [](const void *p1
, const void *p2
) {
20521 const function_version_info v1
= *(const function_version_info
*)p1
;
20522 const function_version_info v2
= *(const function_version_info
*)p2
;
20523 return - compare_feature_masks (v1
.feature_mask
, v2
.feature_mask
);
20526 /* Sort the versions according to descending order of dispatch priority. */
20527 qsort (function_versions
, actual_versions
,
20528 sizeof (struct function_version_info
), compare_feature_version_info
);
20530 for (unsigned int i
= 0; i
< actual_versions
; ++i
)
20531 *empty_bb
= add_condition_to_bb (dispatch_decl
,
20532 function_versions
[i
].version_decl
,
20533 function_versions
[i
].feature_mask
,
20537 free (function_versions
);
20541 /* Implement TARGET_GENERATE_VERSION_DISPATCHER_BODY. */
20544 aarch64_generate_version_dispatcher_body (void *node_p
)
20546 tree resolver_decl
;
20547 basic_block empty_bb
;
20548 tree default_ver_decl
;
20549 struct cgraph_node
*versn
;
20550 struct cgraph_node
*node
;
20552 struct cgraph_function_version_info
*node_version_info
= NULL
;
20553 struct cgraph_function_version_info
*versn_info
= NULL
;
20555 node
= (cgraph_node
*)node_p
;
20557 node_version_info
= node
->function_version ();
20558 gcc_assert (node
->dispatcher_function
20559 && node_version_info
!= NULL
);
20561 if (node_version_info
->dispatcher_resolver
)
20562 return node_version_info
->dispatcher_resolver
;
20564 /* The first version in the chain corresponds to the default version. */
20565 default_ver_decl
= node_version_info
->next
->this_node
->decl
;
20567 /* node is going to be an alias, so remove the finalized bit. */
20568 node
->definition
= false;
20570 resolver_decl
= make_resolver_func (default_ver_decl
,
20571 node
->decl
, &empty_bb
);
20573 node_version_info
->dispatcher_resolver
= resolver_decl
;
20575 push_cfun (DECL_STRUCT_FUNCTION (resolver_decl
));
20577 auto_vec
<tree
, 2> fn_ver_vec
;
20579 for (versn_info
= node_version_info
->next
; versn_info
;
20580 versn_info
= versn_info
->next
)
20582 versn
= versn_info
->this_node
;
20583 /* Check for virtual functions here again, as by this time it should
20584 have been determined if this function needs a vtable index or
20585 not. This happens for methods in derived classes that override
20586 virtual methods in base classes but are not explicitly marked as
20588 if (DECL_VINDEX (versn
->decl
))
20589 sorry ("virtual function multiversioning not supported");
20591 fn_ver_vec
.safe_push (versn
->decl
);
20594 dispatch_function_versions (resolver_decl
, &fn_ver_vec
, &empty_bb
);
20595 cgraph_edge::rebuild_edges ();
20598 /* Fix up symbol names. First we need to obtain the base name, which may
20599 have already been mangled. */
20600 tree base_name
= get_suffixed_assembler_name (default_ver_decl
, "");
20602 /* We need to redo the version mangling on the non-default versions for the
20603 target_clones case. Redoing the mangling for the target_version case is
20604 redundant but does no harm. We need to skip the default version, because
20605 expand_clones will append ".default" later; fortunately that suffix is the
20606 one we want anyway. */
20607 for (versn_info
= node_version_info
->next
->next
; versn_info
;
20608 versn_info
= versn_info
->next
)
20610 tree version_decl
= versn_info
->this_node
->decl
;
20611 tree name
= aarch64_mangle_decl_assembler_name (version_decl
,
20613 symtab
->change_decl_assembler_name (version_decl
, name
);
20616 /* We also need to use the base name for the ifunc declaration. */
20617 symtab
->change_decl_assembler_name (node
->decl
, base_name
);
20619 return resolver_decl
;
20622 /* Make a dispatcher declaration for the multi-versioned function DECL.
20623 Calls to DECL function will be replaced with calls to the dispatcher
20624 by the front-end. Returns the decl of the dispatcher function. */
20627 aarch64_get_function_versions_dispatcher (void *decl
)
20629 tree fn
= (tree
) decl
;
20630 struct cgraph_node
*node
= NULL
;
20631 struct cgraph_node
*default_node
= NULL
;
20632 struct cgraph_function_version_info
*node_v
= NULL
;
20633 struct cgraph_function_version_info
*first_v
= NULL
;
20635 tree dispatch_decl
= NULL
;
20637 struct cgraph_function_version_info
*default_version_info
= NULL
;
20639 gcc_assert (fn
!= NULL
&& DECL_FUNCTION_VERSIONED (fn
));
20641 node
= cgraph_node::get (fn
);
20642 gcc_assert (node
!= NULL
);
20644 node_v
= node
->function_version ();
20645 gcc_assert (node_v
!= NULL
);
20647 if (node_v
->dispatcher_resolver
!= NULL
)
20648 return node_v
->dispatcher_resolver
;
20650 /* Find the default version and make it the first node. */
20652 /* Go to the beginning of the chain. */
20653 while (first_v
->prev
!= NULL
)
20654 first_v
= first_v
->prev
;
20655 default_version_info
= first_v
;
20656 while (default_version_info
!= NULL
)
20658 if (get_feature_mask_for_version
20659 (default_version_info
->this_node
->decl
) == 0ULL)
20661 default_version_info
= default_version_info
->next
;
20664 /* If there is no default node, just return NULL. */
20665 if (default_version_info
== NULL
)
20668 /* Make default info the first node. */
20669 if (first_v
!= default_version_info
)
20671 default_version_info
->prev
->next
= default_version_info
->next
;
20672 if (default_version_info
->next
)
20673 default_version_info
->next
->prev
= default_version_info
->prev
;
20674 first_v
->prev
= default_version_info
;
20675 default_version_info
->next
= first_v
;
20676 default_version_info
->prev
= NULL
;
20679 default_node
= default_version_info
->this_node
;
20681 if (targetm
.has_ifunc_p ())
20683 struct cgraph_function_version_info
*it_v
= NULL
;
20684 struct cgraph_node
*dispatcher_node
= NULL
;
20685 struct cgraph_function_version_info
*dispatcher_version_info
= NULL
;
20687 /* Right now, the dispatching is done via ifunc. */
20688 dispatch_decl
= make_dispatcher_decl (default_node
->decl
);
20689 TREE_NOTHROW (dispatch_decl
) = TREE_NOTHROW (fn
);
20691 dispatcher_node
= cgraph_node::get_create (dispatch_decl
);
20692 gcc_assert (dispatcher_node
!= NULL
);
20693 dispatcher_node
->dispatcher_function
= 1;
20694 dispatcher_version_info
20695 = dispatcher_node
->insert_new_function_version ();
20696 dispatcher_version_info
->next
= default_version_info
;
20697 dispatcher_node
->definition
= 1;
20699 /* Set the dispatcher for all the versions. */
20700 it_v
= default_version_info
;
20701 while (it_v
!= NULL
)
20703 it_v
->dispatcher_resolver
= dispatch_decl
;
20709 error_at (DECL_SOURCE_LOCATION (default_node
->decl
),
20710 "multiversioning needs %<ifunc%> which is not supported "
20714 return dispatch_decl
;
20717 /* This function returns true if FN1 and FN2 are versions of the same function,
20718 that is, the target_version attributes of the function decls are different.
20719 This assumes that FN1 and FN2 have the same signature. */
20722 aarch64_common_function_versions (tree fn1
, tree fn2
)
20724 if (TREE_CODE (fn1
) != FUNCTION_DECL
20725 || TREE_CODE (fn2
) != FUNCTION_DECL
)
20728 return (aarch64_compare_version_priority (fn1
, fn2
) != 0);
20731 /* Implement TARGET_FUNCTION_ATTRIBUTE_INLINABLE_P. Use an opt-out
20732 rather than an opt-in list. */
20735 aarch64_function_attribute_inlinable_p (const_tree fndecl
)
20737 /* A function that has local SME state cannot be inlined into its caller,
20738 since we only support managing PSTATE.ZA switches at function scope. */
20739 return (!aarch64_fndecl_has_new_state (fndecl
, "za")
20740 && !aarch64_fndecl_has_new_state (fndecl
, "zt0"));
20743 /* Helper for aarch64_can_inline_p. In the case where CALLER and CALLEE are
20744 tri-bool options (yes, no, don't care) and the default value is
20745 DEF, determine whether to reject inlining. */
20748 aarch64_tribools_ok_for_inlining_p (int caller
, int callee
,
20749 int dont_care
, int def
)
20751 /* If the callee doesn't care, always allow inlining. */
20752 if (callee
== dont_care
)
20755 /* If the caller doesn't care, always allow inlining. */
20756 if (caller
== dont_care
)
20759 /* Otherwise, allow inlining if either the callee and caller values
20760 agree, or if the callee is using the default value. */
20761 return (callee
== caller
|| callee
== def
);
20764 /* Bit allocations for ipa_fn_summary::target_info. */
20766 /* Set if the function contains a stmt that relies on the function's
20767 choice of PSTATE.SM setting (0 for non-streaming, 1 for streaming).
20768 Not meaningful for streaming-compatible functions. */
20769 constexpr auto AARCH64_IPA_SM_FIXED
= 1U << 0;
20771 /* Set if the function clobbers ZA and ZT0. Not meaningful for functions that
20773 constexpr auto AARCH64_IPA_CLOBBERS_ZA
= 1U << 1;
20774 constexpr auto AARCH64_IPA_CLOBBERS_ZT0
= 1U << 2;
20776 /* Implement TARGET_NEED_IPA_FN_TARGET_INFO. */
20779 aarch64_need_ipa_fn_target_info (const_tree
, unsigned int &)
20781 /* We could in principle skip this for streaming-compatible functions
20782 that have ZA state, but that's a rare combination. */
20786 /* Implement TARGET_UPDATE_IPA_FN_TARGET_INFO. */
20789 aarch64_update_ipa_fn_target_info (unsigned int &info
, const gimple
*stmt
)
20791 if (auto *ga
= dyn_cast
<const gasm
*> (stmt
))
20793 /* We don't know what the asm does, so conservatively assume that
20794 it requires the function's current SM mode. */
20795 info
|= AARCH64_IPA_SM_FIXED
;
20796 for (unsigned int i
= 0; i
< gimple_asm_nclobbers (ga
); ++i
)
20798 tree op
= gimple_asm_clobber_op (ga
, i
);
20799 const char *clobber
= TREE_STRING_POINTER (TREE_VALUE (op
));
20800 if (strcmp (clobber
, "za") == 0)
20801 info
|= AARCH64_IPA_CLOBBERS_ZA
;
20802 if (strcmp (clobber
, "zt0") == 0)
20803 info
|= AARCH64_IPA_CLOBBERS_ZT0
;
20806 if (auto *call
= dyn_cast
<const gcall
*> (stmt
))
20808 if (gimple_call_builtin_p (call
, BUILT_IN_MD
))
20810 /* The attributes on AArch64 builtins are supposed to be accurate.
20811 If the function isn't marked streaming-compatible then it
20812 needs whichever SM mode it selects. */
20813 tree decl
= gimple_call_fndecl (call
);
20814 if (aarch64_fndecl_pstate_sm (decl
) != 0)
20815 info
|= AARCH64_IPA_SM_FIXED
;
20821 /* Implement TARGET_CAN_INLINE_P. Decide whether it is valid
20822 to inline CALLEE into CALLER based on target-specific info.
20823 Make sure that the caller and callee have compatible architectural
20824 features. Then go through the other possible target attributes
20825 and see if they can block inlining. Try not to reject always_inline
20826 callees unless they are incompatible architecturally. */
20829 aarch64_can_inline_p (tree caller
, tree callee
)
20831 tree caller_tree
= DECL_FUNCTION_SPECIFIC_TARGET (caller
);
20832 tree callee_tree
= DECL_FUNCTION_SPECIFIC_TARGET (callee
);
20834 struct cl_target_option
*caller_opts
20835 = TREE_TARGET_OPTION (caller_tree
? caller_tree
20836 : target_option_default_node
);
20838 struct cl_target_option
*callee_opts
20839 = TREE_TARGET_OPTION (callee_tree
? callee_tree
20840 : target_option_default_node
);
20842 /* Callee's ISA flags should be a subset of the caller's. */
20843 auto caller_asm_isa
= (aarch64_get_asm_isa_flags (caller_opts
)
20844 & ~AARCH64_FL_ISA_MODES
);
20845 auto callee_asm_isa
= (aarch64_get_asm_isa_flags (callee_opts
)
20846 & ~AARCH64_FL_ISA_MODES
);
20847 if (callee_asm_isa
& ~caller_asm_isa
)
20850 auto caller_isa
= (aarch64_get_isa_flags (caller_opts
)
20851 & ~AARCH64_FL_ISA_MODES
);
20852 auto callee_isa
= (aarch64_get_isa_flags (callee_opts
)
20853 & ~AARCH64_FL_ISA_MODES
);
20854 if (callee_isa
& ~caller_isa
)
20857 /* Return true if the callee might have target_info property PROPERTY.
20858 The answer must be true unless we have positive proof to the contrary. */
20859 auto callee_has_property
= [&](unsigned int property
)
20861 if (ipa_fn_summaries
)
20862 if (auto *summary
= ipa_fn_summaries
->get (cgraph_node::get (callee
)))
20863 if (!(summary
->target_info
& property
))
20868 /* Streaming-compatible code can be inlined into functions with any
20869 PSTATE.SM mode. Otherwise the caller and callee must agree on
20870 PSTATE.SM mode, unless we can prove that the callee is naturally
20871 streaming-compatible. */
20872 auto caller_sm
= (aarch64_get_isa_flags (caller_opts
) & AARCH64_FL_SM_STATE
);
20873 auto callee_sm
= (aarch64_get_isa_flags (callee_opts
) & AARCH64_FL_SM_STATE
);
20875 && caller_sm
!= callee_sm
20876 && callee_has_property (AARCH64_IPA_SM_FIXED
))
20879 /* aarch64_function_attribute_inlinable_p prevents new-ZA and new-ZT0
20880 functions from being inlined into others. We also need to prevent
20881 inlining of shared-ZA functions into functions without ZA state,
20882 since this is an error condition.
20884 The only other problematic case for ZA is inlining a function that
20885 directly clobbers ZA or ZT0 into a function that has ZA or ZT0 state. */
20886 auto caller_za
= (aarch64_get_isa_flags (caller_opts
) & AARCH64_FL_ZA_ON
);
20887 auto callee_za
= (aarch64_get_isa_flags (callee_opts
) & AARCH64_FL_ZA_ON
);
20888 if (!caller_za
&& callee_za
)
20891 && aarch64_fndecl_has_state (caller
, "za")
20892 && callee_has_property (AARCH64_IPA_CLOBBERS_ZA
))
20895 && aarch64_fndecl_has_state (caller
, "zt0")
20896 && callee_has_property (AARCH64_IPA_CLOBBERS_ZT0
))
20899 /* Allow non-strict aligned functions inlining into strict
20901 if ((TARGET_STRICT_ALIGN_P (caller_opts
->x_target_flags
)
20902 != TARGET_STRICT_ALIGN_P (callee_opts
->x_target_flags
))
20903 && !(!TARGET_STRICT_ALIGN_P (callee_opts
->x_target_flags
)
20904 && TARGET_STRICT_ALIGN_P (caller_opts
->x_target_flags
)))
20907 bool always_inline
= lookup_attribute ("always_inline",
20908 DECL_ATTRIBUTES (callee
));
20910 /* If the architectural features match up and the callee is always_inline
20911 then the other attributes don't matter. */
20915 if (caller_opts
->x_aarch64_cmodel_var
20916 != callee_opts
->x_aarch64_cmodel_var
)
20919 if (caller_opts
->x_aarch64_tls_dialect
20920 != callee_opts
->x_aarch64_tls_dialect
)
20923 /* Honour explicit requests to workaround errata. */
20924 if (!aarch64_tribools_ok_for_inlining_p (
20925 caller_opts
->x_aarch64_fix_a53_err835769
,
20926 callee_opts
->x_aarch64_fix_a53_err835769
,
20927 2, TARGET_FIX_ERR_A53_835769_DEFAULT
))
20930 if (!aarch64_tribools_ok_for_inlining_p (
20931 caller_opts
->x_aarch64_fix_a53_err843419
,
20932 callee_opts
->x_aarch64_fix_a53_err843419
,
20933 2, TARGET_FIX_ERR_A53_843419
))
20936 /* If the user explicitly specified -momit-leaf-frame-pointer for the
20937 caller and calle and they don't match up, reject inlining. */
20938 if (!aarch64_tribools_ok_for_inlining_p (
20939 caller_opts
->x_flag_omit_leaf_frame_pointer
,
20940 callee_opts
->x_flag_omit_leaf_frame_pointer
,
20944 /* If the callee has specific tuning overrides, respect them. */
20945 if (callee_opts
->x_aarch64_override_tune_string
!= NULL
20946 && caller_opts
->x_aarch64_override_tune_string
== NULL
)
20949 /* If the user specified tuning override strings for the
20950 caller and callee and they don't match up, reject inlining.
20951 We just do a string compare here, we don't analyze the meaning
20952 of the string, as it would be too costly for little gain. */
20953 if (callee_opts
->x_aarch64_override_tune_string
20954 && caller_opts
->x_aarch64_override_tune_string
20955 && (strcmp (callee_opts
->x_aarch64_override_tune_string
,
20956 caller_opts
->x_aarch64_override_tune_string
) != 0))
20962 /* Return the ID of the TLDESC ABI, initializing the descriptor if hasn't
20966 aarch64_tlsdesc_abi_id ()
20968 predefined_function_abi
&tlsdesc_abi
= function_abis
[ARM_PCS_TLSDESC
];
20969 if (!tlsdesc_abi
.initialized_p ())
20971 HARD_REG_SET full_reg_clobbers
;
20972 CLEAR_HARD_REG_SET (full_reg_clobbers
);
20973 SET_HARD_REG_BIT (full_reg_clobbers
, R0_REGNUM
);
20974 SET_HARD_REG_BIT (full_reg_clobbers
, CC_REGNUM
);
20975 for (int regno
= P0_REGNUM
; regno
<= P15_REGNUM
; ++regno
)
20976 SET_HARD_REG_BIT (full_reg_clobbers
, regno
);
20977 tlsdesc_abi
.initialize (ARM_PCS_TLSDESC
, full_reg_clobbers
);
20979 return ARM_PCS_TLSDESC
;
20982 /* Return true if SYMBOL_REF X binds locally. */
20985 aarch64_symbol_binds_local_p (const_rtx x
)
20987 return (SYMBOL_REF_DECL (x
)
20988 ? targetm
.binds_local_p (SYMBOL_REF_DECL (x
))
20989 : SYMBOL_REF_LOCAL_P (x
));
20992 /* Return true if SYMBOL_REF X is thread local */
20994 aarch64_tls_symbol_p (rtx x
)
20996 if (! TARGET_HAVE_TLS
)
20999 x
= strip_salt (x
);
21000 if (!SYMBOL_REF_P (x
))
21003 return SYMBOL_REF_TLS_MODEL (x
) != 0;
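
/* For illustration only: a symbol such as

     __thread int counter;

   is thread-local, so aarch64_tls_symbol_p returns true for its SYMBOL_REF
   and the access sequence is then chosen by aarch64_classify_tls_symbol
   below, based on the TLS model and code model in effect.  */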
21006 /* Classify a TLS symbol into one of the TLS kinds. */
21007 enum aarch64_symbol_type
21008 aarch64_classify_tls_symbol (rtx x
)
21010 enum tls_model tls_kind
= tls_symbolic_operand_type (x
);
21014 case TLS_MODEL_GLOBAL_DYNAMIC
:
21015 case TLS_MODEL_LOCAL_DYNAMIC
:
21016 return TARGET_TLS_DESC
? SYMBOL_SMALL_TLSDESC
: SYMBOL_SMALL_TLSGD
;
21018 case TLS_MODEL_INITIAL_EXEC
:
21019 switch (aarch64_cmodel
)
21021 case AARCH64_CMODEL_TINY
:
21022 case AARCH64_CMODEL_TINY_PIC
:
21023 return SYMBOL_TINY_TLSIE
;
21025 return SYMBOL_SMALL_TLSIE
;
21028 case TLS_MODEL_LOCAL_EXEC
:
21029 if (aarch64_tls_size
== 12)
21030 return SYMBOL_TLSLE12
;
21031 else if (aarch64_tls_size
== 24)
21032 return SYMBOL_TLSLE24
;
21033 else if (aarch64_tls_size
== 32)
21034 return SYMBOL_TLSLE32
;
21035 else if (aarch64_tls_size
== 48)
21036 return SYMBOL_TLSLE48
;
21038 gcc_unreachable ();
21040 case TLS_MODEL_EMULATED
:
21041 case TLS_MODEL_NONE
:
21042 return SYMBOL_FORCE_TO_MEM
;
21045 gcc_unreachable ();
/* Return the correct method for accessing X + OFFSET, where X is either
   a SYMBOL_REF or LABEL_REF.  */

enum aarch64_symbol_type
aarch64_classify_symbol (rtx x, HOST_WIDE_INT offset)
{
  x = strip_salt (x);

  if (LABEL_REF_P (x))
    {
      switch (aarch64_cmodel)
	{
	case AARCH64_CMODEL_LARGE:
	  return SYMBOL_FORCE_TO_MEM;

	case AARCH64_CMODEL_TINY_PIC:
	case AARCH64_CMODEL_TINY:
	  return SYMBOL_TINY_ABSOLUTE;

	case AARCH64_CMODEL_SMALL_SPIC:
	case AARCH64_CMODEL_SMALL_PIC:
	case AARCH64_CMODEL_SMALL:
	  return SYMBOL_SMALL_ABSOLUTE;

	default:
	  gcc_unreachable ();
	}
    }

  if (SYMBOL_REF_P (x))
    {
      if (aarch64_tls_symbol_p (x))
	return aarch64_classify_tls_symbol (x);

      switch (aarch64_cmodel)
	{
	case AARCH64_CMODEL_TINY_PIC:
	case AARCH64_CMODEL_TINY:
	  /* With -fPIC non-local symbols use the GOT.  For orthogonality
	     always use the GOT for extern weak symbols.  */
	  if ((flag_pic || SYMBOL_REF_WEAK (x))
	      && !aarch64_symbol_binds_local_p (x))
	    return SYMBOL_TINY_GOT;

	  /* When we retrieve symbol + offset address, we have to make sure
	     the offset does not cause overflow of the final address.  But
	     we have no way of knowing the address of symbol at compile time
	     so we can't accurately say if the distance between the PC and
	     symbol + offset is outside the addressable range of +/-1MB in the
	     TINY code model.  So we limit the maximum offset to +/-64KB and
	     assume the offset to the symbol is not larger than +/-(1MB - 64KB).
	     If offset_within_block_p is true we allow larger offsets.  */
	  if (!(IN_RANGE (offset, -0x10000, 0x10000)
		|| offset_within_block_p (x, offset)))
	    return SYMBOL_FORCE_TO_MEM;

	  return SYMBOL_TINY_ABSOLUTE;

	case AARCH64_CMODEL_SMALL_SPIC:
	case AARCH64_CMODEL_SMALL_PIC:
	case AARCH64_CMODEL_SMALL:
	  if ((flag_pic || SYMBOL_REF_WEAK (x))
	      && !aarch64_symbol_binds_local_p (x))
	    return aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC
		   ? SYMBOL_SMALL_GOT_28K : SYMBOL_SMALL_GOT_4G;

	  /* Same reasoning as the tiny code model, but the offset cap here is
	     1MB, allowing +/-3.9GB for the offset to the symbol.  */
	  if (!(IN_RANGE (offset, -0x100000, 0x100000)
		|| offset_within_block_p (x, offset)))
	    return SYMBOL_FORCE_TO_MEM;

	  return SYMBOL_SMALL_ABSOLUTE;

	case AARCH64_CMODEL_LARGE:
	  /* This is alright even in PIC code as the constant
	     pool reference is always PC relative and within
	     the same translation unit.  */
	  if (!aarch64_pcrelative_literal_loads && CONSTANT_POOL_ADDRESS_P (x))
	    return SYMBOL_SMALL_ABSOLUTE;
	  else
	    return SYMBOL_FORCE_TO_MEM;

	default:
	  gcc_unreachable ();
	}
    }

  /* By default push everything into the constant pool.  */
  return SYMBOL_FORCE_TO_MEM;
}
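/* Illustrative sketch (not GCC code): the offset caps applied above when
   classifying symbol + offset addresses.  The constants mirror the IN_RANGE
   checks in aarch64_classify_symbol; the helper name is hypothetical and the
   block is guarded by "#if 0" so it is never compiled.  */
#if 0
#include <stdbool.h>
#include <stdint.h>

static bool
offset_within_cap (int64_t offset, bool tiny_code_model)
{
  /* TINY caps the offset at +/-64KB so that symbol + offset stays inside
     its +/-1MB PC-relative range; SMALL caps it at +/-1MB for the same
     reason relative to its wider range.  */
  int64_t cap = tiny_code_model ? 0x10000 : 0x100000;
  return offset >= -cap && offset <= cap;
}
#endif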
bool
aarch64_constant_address_p (rtx x)
{
  return (CONSTANT_P (x) && memory_address_p (DImode, x));
}

bool
aarch64_legitimate_pic_operand_p (rtx x)
{
  poly_int64 offset;
  x = strip_offset_and_salt (x, &offset);
  if (SYMBOL_REF_P (x))
    return false;

  return true;
}
/* Implement TARGET_LEGITIMATE_CONSTANT_P hook.  Return true for constants
   that should be rematerialized rather than spilled.  */

static bool
aarch64_legitimate_constant_p (machine_mode mode, rtx x)
{
  /* Support CSE and rematerialization of common constants.  */
  if (CONST_INT_P (x)
      || CONST_DOUBLE_P (x))
    return true;

  /* Only accept variable-length vector constants if they can be
     handled directly.

     ??? It would be possible (but complex) to handle rematerialization
     of other constants via secondary reloads.  */
  if (!GET_MODE_SIZE (mode).is_constant ())
    return aarch64_simd_valid_mov_imm (x);

  /* Otherwise, accept any CONST_VECTOR that, if all else fails, can at
     least be forced to memory and loaded from there.  */
  if (CONST_VECTOR_P (x))
    return !targetm.cannot_force_const_mem (mode, x);

  /* Do not allow vector struct mode constants for Advanced SIMD.
     We could support 0 and -1 easily, but they need support in
     aarch64-simd.md.  */
  unsigned int vec_flags = aarch64_classify_vector_mode (mode);
  if (vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
    return false;

  if (GET_CODE (x) == HIGH)
    x = XEXP (x, 0);

  /* Accept polynomial constants that can be calculated by using the
     destination of a move as the sole temporary.  Constants that
     require a second temporary cannot be rematerialized (they can't be
     forced to memory and also aren't legitimate constants).  */
  poly_int64 offset;
  if (poly_int_rtx_p (x, &offset))
    return aarch64_offset_temporaries (false, offset) <= 1;

  /* If an offset is being added to something else, we need to allow the
     base to be moved into the destination register, meaning that there
     are no free temporaries for the offset.  */
  x = strip_offset_and_salt (x, &offset);
  if (!offset.is_constant () && aarch64_offset_temporaries (true, offset) > 0)
    return false;

  /* Do not allow const (plus (anchor_symbol, const_int)).  */
  if (maybe_ne (offset, 0) && SYMBOL_REF_P (x) && SYMBOL_REF_ANCHOR_P (x))
    return false;

  /* Treat symbols as constants.  Avoid TLS symbols as they are complex,
     so spilling them is better than rematerialization.  */
  if (SYMBOL_REF_P (x) && !SYMBOL_REF_TLS_MODEL (x))
    return true;

  /* Label references are always constant.  */
  if (LABEL_REF_P (x))
    return true;

  return false;
}
static rtx
aarch64_load_tp (rtx target)
{
  if (!target
      || GET_MODE (target) != Pmode
      || !register_operand (target, Pmode))
    target = gen_reg_rtx (Pmode);

  /* Can return in any reg.  */
  emit_insn (gen_aarch64_load_tp_hard (target));
  return target;
}
/* On AAPCS systems, this is the "struct __va_list".  */
static GTY(()) tree va_list_type;

/* Implement TARGET_BUILD_BUILTIN_VA_LIST.
   Return the type to use as __builtin_va_list.

   AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:

       struct __va_list
       {
	 void *__stack;
	 void *__gr_top;
	 void *__vr_top;
	 int   __gr_offs;
	 int   __vr_offs;
       };  */

static tree
aarch64_build_builtin_va_list (void)
{
  tree va_list_name;
  tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;

  /* Create the type.  */
  va_list_type = lang_hooks.types.make_type (RECORD_TYPE);
  /* Give it the required name.  */
  va_list_name = build_decl (BUILTINS_LOCATION,
			     TYPE_DECL,
			     get_identifier ("__va_list"),
			     va_list_type);
  DECL_ARTIFICIAL (va_list_name) = 1;
  TREE_PUBLIC (va_list_name) = 1;
  TYPE_NAME (va_list_type) = va_list_name;
  TYPE_STUB_DECL (va_list_type) = va_list_name;

  /* Create the fields.  */
  f_stack = build_decl (BUILTINS_LOCATION,
			FIELD_DECL, get_identifier ("__stack"),
			ptr_type_node);
  f_grtop = build_decl (BUILTINS_LOCATION,
			FIELD_DECL, get_identifier ("__gr_top"),
			ptr_type_node);
  f_vrtop = build_decl (BUILTINS_LOCATION,
			FIELD_DECL, get_identifier ("__vr_top"),
			ptr_type_node);
  f_groff = build_decl (BUILTINS_LOCATION,
			FIELD_DECL, get_identifier ("__gr_offs"),
			integer_type_node);
  f_vroff = build_decl (BUILTINS_LOCATION,
			FIELD_DECL, get_identifier ("__vr_offs"),
			integer_type_node);

  /* Tell tree-stdarg pass about our internal offset fields.
     NOTE: va_list_gpr/fpr_counter_field are only used for tree comparison
     purposes, to identify whether the code is updating the va_list internal
     offset fields in an irregular way.  */
  va_list_gpr_counter_field = f_groff;
  va_list_fpr_counter_field = f_vroff;

  DECL_ARTIFICIAL (f_stack) = 1;
  DECL_ARTIFICIAL (f_grtop) = 1;
  DECL_ARTIFICIAL (f_vrtop) = 1;
  DECL_ARTIFICIAL (f_groff) = 1;
  DECL_ARTIFICIAL (f_vroff) = 1;

  DECL_FIELD_CONTEXT (f_stack) = va_list_type;
  DECL_FIELD_CONTEXT (f_grtop) = va_list_type;
  DECL_FIELD_CONTEXT (f_vrtop) = va_list_type;
  DECL_FIELD_CONTEXT (f_groff) = va_list_type;
  DECL_FIELD_CONTEXT (f_vroff) = va_list_type;

  TYPE_FIELDS (va_list_type) = f_stack;
  DECL_CHAIN (f_stack) = f_grtop;
  DECL_CHAIN (f_grtop) = f_vrtop;
  DECL_CHAIN (f_vrtop) = f_groff;
  DECL_CHAIN (f_groff) = f_vroff;

  /* Compute its layout.  */
  layout_type (va_list_type);

  return va_list_type;
}
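/* Illustrative sketch (not GCC code): the C-level layout that
   aarch64_build_builtin_va_list constructs, as required by AAPCS64 7.1.4.
   It mirrors the field names and types created above; the typedef name is
   hypothetical and the block is guarded by "#if 0" so it is never compiled.  */
#if 0
typedef struct __va_list
{
  void *__stack;   /* next stack argument */
  void *__gr_top;  /* top of the general register save area */
  void *__vr_top;  /* top of the vector register save area */
  int   __gr_offs; /* offset from __gr_top to the next GP argument */
  int   __vr_offs; /* offset from __vr_top to the next VR argument */
} va_list_example;
#endif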
/* Implement TARGET_EXPAND_BUILTIN_VA_START.  */
static void
aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)
{
  const CUMULATIVE_ARGS *cum;
  tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
  tree stack, grtop, vrtop, groff, vroff;
  tree t;
  int gr_save_area_size = cfun->va_list_gpr_size;
  int vr_save_area_size = cfun->va_list_fpr_size;
  int vr_offset;

  cum = &crtl->args.info;
  if (cfun->va_list_gpr_size)
    gr_save_area_size = MIN ((NUM_ARG_REGS - cum->aapcs_ncrn) * UNITS_PER_WORD,
			     cfun->va_list_gpr_size);
  if (cfun->va_list_fpr_size)
    vr_save_area_size = MIN ((NUM_FP_ARG_REGS - cum->aapcs_nvrn)
			     * UNITS_PER_VREG, cfun->va_list_fpr_size);

  if (!TARGET_FLOAT)
    {
      gcc_assert (cum->aapcs_nvrn == 0);
      vr_save_area_size = 0;
    }

  f_stack = TYPE_FIELDS (va_list_type_node);
  f_grtop = DECL_CHAIN (f_stack);
  f_vrtop = DECL_CHAIN (f_grtop);
  f_groff = DECL_CHAIN (f_vrtop);
  f_vroff = DECL_CHAIN (f_groff);

  stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), valist, f_stack,
		  NULL_TREE);
  grtop = build3 (COMPONENT_REF, TREE_TYPE (f_grtop), valist, f_grtop,
		  NULL_TREE);
  vrtop = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop), valist, f_vrtop,
		  NULL_TREE);
  groff = build3 (COMPONENT_REF, TREE_TYPE (f_groff), valist, f_groff,
		  NULL_TREE);
  vroff = build3 (COMPONENT_REF, TREE_TYPE (f_vroff), valist, f_vroff,
		  NULL_TREE);

  /* Emit code to initialize STACK, which points to the next varargs stack
     argument.  CUM->AAPCS_STACK_SIZE gives the number of stack words used
     by named arguments.  STACK is 8-byte aligned.  */
  t = make_tree (TREE_TYPE (stack), virtual_incoming_args_rtx);
  if (cum->aapcs_stack_size > 0)
    t = fold_build_pointer_plus_hwi (t, cum->aapcs_stack_size * UNITS_PER_WORD);
  t = build2 (MODIFY_EXPR, TREE_TYPE (stack), stack, t);
  expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);

  /* Emit code to initialize GRTOP, the top of the GR save area.
     virtual_incoming_args_rtx should have been 16 byte aligned.  */
  t = make_tree (TREE_TYPE (grtop), virtual_incoming_args_rtx);
  t = build2 (MODIFY_EXPR, TREE_TYPE (grtop), grtop, t);
  expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);

  /* Emit code to initialize VRTOP, the top of the VR save area.
     This address is gr_save_area_bytes below GRTOP, rounded
     down to the next 16-byte boundary.  */
  t = make_tree (TREE_TYPE (vrtop), virtual_incoming_args_rtx);
  vr_offset = ROUND_UP (gr_save_area_size,
			STACK_BOUNDARY / BITS_PER_UNIT);
  if (vr_offset)
    t = fold_build_pointer_plus_hwi (t, -vr_offset);
  t = build2 (MODIFY_EXPR, TREE_TYPE (vrtop), vrtop, t);
  expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);

  /* Emit code to initialize GROFF, the offset from GRTOP of the
     next GPR argument.  */
  t = build2 (MODIFY_EXPR, TREE_TYPE (groff), groff,
	      build_int_cst (TREE_TYPE (groff), -gr_save_area_size));
  expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);

  /* Likewise emit code to initialize VROFF, the offset from FTOP
     of the next VR argument.  */
  t = build2 (MODIFY_EXPR, TREE_TYPE (vroff), vroff,
	      build_int_cst (TREE_TYPE (vroff), -vr_save_area_size));
  expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
}
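/* Illustrative sketch (not GCC code): the save-area sizing performed above,
   specialised to the general registers.  It assumes the AAPCS64 values
   NUM_ARG_REGS == 8 and UNITS_PER_WORD == 8; the helper name is hypothetical
   and the block is guarded by "#if 0" so it is never compiled.  */
#if 0
static int
gr_save_area_bytes (int named_gp_regs_used, int va_list_gpr_size)
{
  /* Bytes needed to dump the remaining argument registers, clamped to
     what the tree-stdarg analysis says the function can actually read.  */
  int remaining = (8 - named_gp_regs_used) * 8;
  return remaining < va_list_gpr_size ? remaining : va_list_gpr_size;
}

/* Example: a variadic function with two named integer arguments leaves
   x2-x7 to be dumped, so gr_save_area_bytes (2, 1024) == 48.  */
#endif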
21405 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR. */
21408 aarch64_gimplify_va_arg_expr (tree valist
, tree type
, gimple_seq
*pre_p
,
21409 gimple_seq
*post_p ATTRIBUTE_UNUSED
)
21413 bool is_ha
; /* is HFA or HVA. */
21414 bool dw_align
; /* double-word align. */
21415 machine_mode ag_mode
= VOIDmode
;
21419 tree f_stack
, f_grtop
, f_vrtop
, f_groff
, f_vroff
;
21420 tree stack
, f_top
, f_off
, off
, arg
, roundup
, on_stack
;
21421 HOST_WIDE_INT size
, rsize
, adjust
, align
;
21422 tree t
, u
, cond1
, cond2
;
21424 indirect_p
= pass_va_arg_by_reference (type
);
21426 type
= build_pointer_type (type
);
21428 mode
= TYPE_MODE (type
);
21430 f_stack
= TYPE_FIELDS (va_list_type_node
);
21431 f_grtop
= DECL_CHAIN (f_stack
);
21432 f_vrtop
= DECL_CHAIN (f_grtop
);
21433 f_groff
= DECL_CHAIN (f_vrtop
);
21434 f_vroff
= DECL_CHAIN (f_groff
);
21436 stack
= build3 (COMPONENT_REF
, TREE_TYPE (f_stack
), unshare_expr (valist
),
21437 f_stack
, NULL_TREE
);
21438 size
= int_size_in_bytes (type
);
21440 unsigned int abi_break_gcc_9
;
21441 unsigned int abi_break_gcc_13
;
21442 unsigned int abi_break_gcc_14
;
21444 = aarch64_function_arg_alignment (mode
, type
, &abi_break_gcc_9
,
21445 &abi_break_gcc_13
, &abi_break_gcc_14
)
21450 if (aarch64_vfp_is_call_or_return_candidate (mode
, type
, &ag_mode
, &nregs
,
21453 /* No frontends can create types with variable-sized modes, so we
21454 shouldn't be asked to pass or return them. */
21455 unsigned int ag_size
= GET_MODE_SIZE (ag_mode
).to_constant ();
21457 /* TYPE passed in fp/simd registers. */
21459 aarch64_err_no_fpadvsimd (mode
);
21461 f_top
= build3 (COMPONENT_REF
, TREE_TYPE (f_vrtop
),
21462 unshare_expr (valist
), f_vrtop
, NULL_TREE
);
21463 f_off
= build3 (COMPONENT_REF
, TREE_TYPE (f_vroff
),
21464 unshare_expr (valist
), f_vroff
, NULL_TREE
);
21466 rsize
= nregs
* UNITS_PER_VREG
;
21470 if (BYTES_BIG_ENDIAN
&& ag_size
< UNITS_PER_VREG
)
21471 adjust
= UNITS_PER_VREG
- ag_size
;
21473 else if (BLOCK_REG_PADDING (mode
, type
, 1) == PAD_DOWNWARD
21474 && size
< UNITS_PER_VREG
)
21476 adjust
= UNITS_PER_VREG
- size
;
21481 /* TYPE passed in general registers. */
21482 f_top
= build3 (COMPONENT_REF
, TREE_TYPE (f_grtop
),
21483 unshare_expr (valist
), f_grtop
, NULL_TREE
);
21484 f_off
= build3 (COMPONENT_REF
, TREE_TYPE (f_groff
),
21485 unshare_expr (valist
), f_groff
, NULL_TREE
);
21486 rsize
= ROUND_UP (size
, UNITS_PER_WORD
);
21487 nregs
= rsize
/ UNITS_PER_WORD
;
21490 && abi_break_gcc_13
21492 && !bitint_or_aggr_of_bitint_p (type
))
21493 inform (input_location
, "parameter passing for argument of type "
21494 "%qT changed in GCC 13.1", type
);
21497 && abi_break_gcc_14
21498 && (abi_break_gcc_14
> 8 * BITS_PER_UNIT
) != (align
> 8)
21499 && !bitint_or_aggr_of_bitint_p (type
))
21500 inform (input_location
, "parameter passing for argument of type "
21501 "%qT changed in GCC 14.1", type
);
21505 if (abi_break_gcc_9
21507 && !bitint_or_aggr_of_bitint_p (type
))
21508 inform (input_location
, "parameter passing for argument of type "
21509 "%qT changed in GCC 9.1", type
);
21513 if (BLOCK_REG_PADDING (mode
, type
, 1) == PAD_DOWNWARD
21514 && size
< UNITS_PER_WORD
)
21516 adjust
= UNITS_PER_WORD
- size
;
21520 /* Get a local temporary for the field value. */
21521 off
= get_initialized_tmp_var (f_off
, pre_p
, NULL
);
21523 /* Emit code to branch if off >= 0. */
21524 t
= build2 (GE_EXPR
, boolean_type_node
, off
,
21525 build_int_cst (TREE_TYPE (off
), 0));
21526 cond1
= build3 (COND_EXPR
, ptr_type_node
, t
, NULL_TREE
, NULL_TREE
);
21530 /* Emit: offs = (offs + 15) & -16. */
21531 t
= build2 (PLUS_EXPR
, TREE_TYPE (off
), off
,
21532 build_int_cst (TREE_TYPE (off
), 15));
21533 t
= build2 (BIT_AND_EXPR
, TREE_TYPE (off
), t
,
21534 build_int_cst (TREE_TYPE (off
), -16));
21535 roundup
= build2 (MODIFY_EXPR
, TREE_TYPE (off
), off
, t
);
21540 /* Update ap.__[g|v]r_offs */
21541 t
= build2 (PLUS_EXPR
, TREE_TYPE (off
), off
,
21542 build_int_cst (TREE_TYPE (off
), rsize
));
21543 t
= build2 (MODIFY_EXPR
, TREE_TYPE (f_off
), unshare_expr (f_off
), t
);
21547 t
= build2 (COMPOUND_EXPR
, TREE_TYPE (t
), roundup
, t
);
21549 /* [cond2] if (ap.__[g|v]r_offs > 0) */
21550 u
= build2 (GT_EXPR
, boolean_type_node
, unshare_expr (f_off
),
21551 build_int_cst (TREE_TYPE (f_off
), 0));
21552 cond2
= build3 (COND_EXPR
, ptr_type_node
, u
, NULL_TREE
, NULL_TREE
);
21554 /* String up: make sure the assignment happens before the use. */
21555 t
= build2 (COMPOUND_EXPR
, TREE_TYPE (cond2
), t
, cond2
);
21556 COND_EXPR_ELSE (cond1
) = t
;
21558 /* Prepare the trees handling the argument that is passed on the stack;
21559 the top level node will store in ON_STACK. */
21560 arg
= get_initialized_tmp_var (stack
, pre_p
, NULL
);
21563 /* if (alignof(type) > 8) (arg = arg + 15) & -16; */
21564 t
= fold_build_pointer_plus_hwi (arg
, 15);
21565 t
= build2 (BIT_AND_EXPR
, TREE_TYPE (t
), t
,
21566 build_int_cst (TREE_TYPE (t
), -16));
21567 roundup
= build2 (MODIFY_EXPR
, TREE_TYPE (arg
), arg
, t
);
21571 /* Advance ap.__stack */
21572 t
= fold_build_pointer_plus_hwi (arg
, size
+ 7);
21573 t
= build2 (BIT_AND_EXPR
, TREE_TYPE (t
), t
,
21574 build_int_cst (TREE_TYPE (t
), -8));
21575 t
= build2 (MODIFY_EXPR
, TREE_TYPE (stack
), unshare_expr (stack
), t
);
21576 /* String up roundup and advance. */
21578 t
= build2 (COMPOUND_EXPR
, TREE_TYPE (t
), roundup
, t
);
21579 /* String up with arg */
21580 on_stack
= build2 (COMPOUND_EXPR
, TREE_TYPE (arg
), t
, arg
);
21581 /* Big-endianness related address adjustment. */
21582 if (BLOCK_REG_PADDING (mode
, type
, 1) == PAD_DOWNWARD
21583 && size
< UNITS_PER_WORD
)
21585 t
= build2 (POINTER_PLUS_EXPR
, TREE_TYPE (arg
), arg
,
21586 size_int (UNITS_PER_WORD
- size
));
21587 on_stack
= build2 (COMPOUND_EXPR
, TREE_TYPE (arg
), on_stack
, t
);
21590 COND_EXPR_THEN (cond1
) = unshare_expr (on_stack
);
21591 COND_EXPR_THEN (cond2
) = unshare_expr (on_stack
);
21593 /* Adjustment to OFFSET in the case of BIG_ENDIAN. */
21596 t
= build2 (PREINCREMENT_EXPR
, TREE_TYPE (off
), off
,
21597 build_int_cst (TREE_TYPE (off
), adjust
));
21599 t
= fold_convert (sizetype
, t
);
21600 t
= build2 (POINTER_PLUS_EXPR
, TREE_TYPE (f_top
), f_top
, t
);
21604 /* type ha; // treat as "struct {ftype field[n];}"
21605 ... [computing offs]
21606 for (i = 0; i <nregs; ++i, offs += 16)
21607 ha.field[i] = *((ftype *)(ap.__vr_top + offs));
21610 tree tmp_ha
, field_t
, field_ptr_t
;
21612 /* Declare a local variable. */
21613 tmp_ha
= create_tmp_var_raw (type
, "ha");
21614 gimple_add_tmp_var (tmp_ha
);
21616 /* Establish the base type. */
21620 field_t
= float_type_node
;
21621 field_ptr_t
= float_ptr_type_node
;
21624 field_t
= double_type_node
;
21625 field_ptr_t
= double_ptr_type_node
;
21628 field_t
= long_double_type_node
;
21629 field_ptr_t
= long_double_ptr_type_node
;
21632 field_t
= dfloat32_type_node
;
21633 field_ptr_t
= build_pointer_type (dfloat32_type_node
);
21636 field_t
= dfloat64_type_node
;
21637 field_ptr_t
= build_pointer_type (dfloat64_type_node
);
21640 field_t
= dfloat128_type_node
;
21641 field_ptr_t
= build_pointer_type (dfloat128_type_node
);
21644 field_t
= aarch64_fp16_type_node
;
21645 field_ptr_t
= aarch64_fp16_ptr_type_node
;
21648 field_t
= bfloat16_type_node
;
21649 field_ptr_t
= aarch64_bf16_ptr_type_node
;
21654 tree innertype
= make_signed_type (GET_MODE_PRECISION (SImode
));
21655 field_t
= build_vector_type_for_mode (innertype
, ag_mode
);
21656 field_ptr_t
= build_pointer_type (field_t
);
21663 /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area */
21664 TREE_ADDRESSABLE (tmp_ha
) = 1;
21665 tmp_ha
= build1 (ADDR_EXPR
, field_ptr_t
, tmp_ha
);
21667 t
= fold_convert (field_ptr_t
, addr
);
21668 t
= build2 (MODIFY_EXPR
, field_t
,
21669 build1 (INDIRECT_REF
, field_t
, tmp_ha
),
21670 build1 (INDIRECT_REF
, field_t
, t
));
21672 /* ha.field[i] = *((field_ptr_t)vr_saved_area + i) */
21673 for (i
= 1; i
< nregs
; ++i
)
21675 addr
= fold_build_pointer_plus_hwi (addr
, UNITS_PER_VREG
);
21676 u
= fold_convert (field_ptr_t
, addr
);
21677 u
= build2 (MODIFY_EXPR
, field_t
,
21678 build2 (MEM_REF
, field_t
, tmp_ha
,
21679 build_int_cst (field_ptr_t
,
21681 int_size_in_bytes (field_t
)))),
21682 build1 (INDIRECT_REF
, field_t
, u
));
21683 t
= build2 (COMPOUND_EXPR
, TREE_TYPE (t
), t
, u
);
21686 u
= fold_convert (TREE_TYPE (f_top
), tmp_ha
);
21687 t
= build2 (COMPOUND_EXPR
, TREE_TYPE (f_top
), t
, u
);
21690 COND_EXPR_ELSE (cond2
) = t
;
21691 addr
= fold_convert (build_pointer_type (type
), cond1
);
21692 addr
= build_va_arg_indirect_ref (addr
);
21695 addr
= build_va_arg_indirect_ref (addr
);
21700 /* Implement TARGET_SETUP_INCOMING_VARARGS. */
21703 aarch64_setup_incoming_varargs (cumulative_args_t cum_v
,
21704 const function_arg_info
&arg
,
21705 int *pretend_size ATTRIBUTE_UNUSED
, int no_rtl
)
21707 CUMULATIVE_ARGS
*cum
= get_cumulative_args (cum_v
);
21708 CUMULATIVE_ARGS local_cum
;
21709 int gr_saved
= cfun
->va_list_gpr_size
;
21710 int vr_saved
= cfun
->va_list_fpr_size
;
21712 /* The caller has advanced CUM up to, but not beyond, the last named
21713 argument. Advance a local copy of CUM past the last "real" named
21714 argument, to find out how many registers are left over. */
21716 if (!TYPE_NO_NAMED_ARGS_STDARG_P (TREE_TYPE (current_function_decl
)))
21717 aarch64_function_arg_advance (pack_cumulative_args(&local_cum
), arg
);
21719 /* Found out how many registers we need to save.
21720 Honor tree-stdvar analysis results. */
21721 if (cfun
->va_list_gpr_size
)
21722 gr_saved
= MIN (NUM_ARG_REGS
- local_cum
.aapcs_ncrn
,
21723 cfun
->va_list_gpr_size
/ UNITS_PER_WORD
);
21724 if (cfun
->va_list_fpr_size
)
21725 vr_saved
= MIN (NUM_FP_ARG_REGS
- local_cum
.aapcs_nvrn
,
21726 cfun
->va_list_fpr_size
/ UNITS_PER_VREG
);
21730 gcc_assert (local_cum
.aapcs_nvrn
== 0);
21740 /* virtual_incoming_args_rtx should have been 16-byte aligned. */
21741 ptr
= plus_constant (Pmode
, virtual_incoming_args_rtx
,
21742 - gr_saved
* UNITS_PER_WORD
);
21743 mem
= gen_frame_mem (BLKmode
, ptr
);
21744 set_mem_alias_set (mem
, get_varargs_alias_set ());
21746 move_block_from_reg (local_cum
.aapcs_ncrn
+ R0_REGNUM
,
21751 /* We can't use move_block_from_reg, because it will use
21752 the wrong mode, storing D regs only. */
21753 machine_mode mode
= TImode
;
21754 int off
, i
, vr_start
;
21756 /* Set OFF to the offset from virtual_incoming_args_rtx of
21757 the first vector register. The VR save area lies below
21758 the GR one, and is aligned to 16 bytes. */
21759 off
= -ROUND_UP (gr_saved
* UNITS_PER_WORD
,
21760 STACK_BOUNDARY
/ BITS_PER_UNIT
);
21761 off
-= vr_saved
* UNITS_PER_VREG
;
21763 vr_start
= V0_REGNUM
+ local_cum
.aapcs_nvrn
;
21764 for (i
= 0; i
< vr_saved
; ++i
)
21768 ptr
= plus_constant (Pmode
, virtual_incoming_args_rtx
, off
);
21769 mem
= gen_frame_mem (mode
, ptr
);
21770 set_mem_alias_set (mem
, get_varargs_alias_set ());
21771 aarch64_emit_move (mem
, gen_rtx_REG (mode
, vr_start
+ i
));
21772 off
+= UNITS_PER_VREG
;
21777 /* We don't save the size into *PRETEND_SIZE because we want to avoid
21778 any complication of having crtl->args.pretend_args_size changed. */
21779 cfun
->machine
->frame
.saved_varargs_size
21780 = (ROUND_UP (gr_saved
* UNITS_PER_WORD
,
21781 STACK_BOUNDARY
/ BITS_PER_UNIT
)
21782 + vr_saved
* UNITS_PER_VREG
);
static void
aarch64_conditional_register_usage (void)
{
  int i;
  if (!TARGET_FLOAT)
    {
      for (i = V0_REGNUM; i <= V31_REGNUM; i++)
	{
	  fixed_regs[i] = 1;
	  call_used_regs[i] = 1;
	  CLEAR_HARD_REG_BIT (operand_reg_set, i);
	}
    }
  if (!TARGET_SVE)
    for (i = P0_REGNUM; i <= P15_REGNUM; i++)
      {
	fixed_regs[i] = 1;
	call_used_regs[i] = 1;
      }

  /* Only allow these registers to be accessed via special patterns.  */
  CLEAR_HARD_REG_BIT (operand_reg_set, VG_REGNUM);
  CLEAR_HARD_REG_BIT (operand_reg_set, FFR_REGNUM);
  CLEAR_HARD_REG_BIT (operand_reg_set, FFRT_REGNUM);
  for (int i = FIRST_FAKE_REGNUM; i <= LAST_FAKE_REGNUM; ++i)
    CLEAR_HARD_REG_BIT (operand_reg_set, i);

  /* When tracking speculation, we need a couple of call-clobbered registers
     to track the speculation state.  It would be nice to just use
     IP0 and IP1, but currently there are numerous places that just
     assume these registers are free for other uses (eg pointer
     authentication).  */
  if (aarch64_track_speculation)
    {
      fixed_regs[SPECULATION_TRACKER_REGNUM] = 1;
      call_used_regs[SPECULATION_TRACKER_REGNUM] = 1;
      fixed_regs[SPECULATION_SCRATCH_REGNUM] = 1;
      call_used_regs[SPECULATION_SCRATCH_REGNUM] = 1;
    }
}
/* Implement TARGET_MEMBER_TYPE_FORCES_BLK.  */

static bool
aarch64_member_type_forces_blk (const_tree field_or_array, machine_mode mode)
{
  /* For records we're passed a FIELD_DECL, for arrays we're passed
     an ARRAY_TYPE.  In both cases we're interested in the TREE_TYPE.  */
  const_tree type = TREE_TYPE (field_or_array);

  /* Assign BLKmode to anything that contains more than 2 SVE predicates.
     For structures, the "multiple" case is indicated by MODE being
     VOIDmode.  */
  unsigned int num_zr, num_pr;
  if (aarch64_sve::builtin_type_p (type, &num_zr, &num_pr) && num_pr > 2)
    {
      if (TREE_CODE (field_or_array) == ARRAY_TYPE)
	return !simple_cst_equal (TYPE_SIZE (field_or_array),
				  TYPE_SIZE (type));
      return mode == VOIDmode;
    }

  return default_member_type_forces_blk (field_or_array, mode);
}
/* Bitmasks that indicate whether earlier versions of GCC would have
   taken a different path through the ABI logic.  This should result in
   a -Wpsabi warning if the earlier path led to a different ABI decision.

   WARN_PSABI_EMPTY_CXX17_BASE
      Indicates that the type includes an artificial empty C++17 base field
      that, prior to GCC 10.1, would prevent the type from being treated as
      a HFA or HVA.  See PR94383 for details.

   WARN_PSABI_NO_UNIQUE_ADDRESS
      Indicates that the type includes an empty [[no_unique_address]] field
      that, prior to GCC 10.1, would prevent the type from being treated as
      a HFA or HVA.  */
const unsigned int WARN_PSABI_EMPTY_CXX17_BASE = 1U << 0;
const unsigned int WARN_PSABI_NO_UNIQUE_ADDRESS = 1U << 1;
const unsigned int WARN_PSABI_ZERO_WIDTH_BITFIELD = 1U << 2;
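/* Illustrative sketch (not GCC code): what the HFA/HVA classification below
   is looking for -- aggregates of up to HA_MAX_NUM_FLDS (4) members of one
   floating-point base type, with no padding.  Such types are passed and
   returned in consecutive FP/SIMD registers.  The type names are
   hypothetical; the block is guarded by "#if 0" so it is never compiled.  */
#if 0
struct hfa2 { double x, y; };		/* HFA: 2 x double */
struct hfa4 { float a, b, c, d; };	/* HFA: 4 x float */
struct not_hfa { float a; double b; };	/* mixed base types: not an HFA */
#endif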
/* Walk down the type tree of TYPE counting consecutive base elements.
   If *MODEP is VOIDmode, then set it to the first valid floating point
   type.  If a non-floating point type is found, or if a floating point
   type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
   otherwise return the count in the sub-tree.

   The WARN_PSABI_FLAGS argument allows the caller to check whether this
   function has changed its behavior relative to earlier versions of GCC.
   Normally the argument should be nonnull and point to a zero-initialized
   variable.  The function then records whether the ABI decision might
   be affected by a known fix to the ABI logic, setting the associated
   WARN_PSABI_* bits if so.

   When the argument is instead a null pointer, the function tries to
   simulate the behavior of GCC before all such ABI fixes were made.
   This is useful to check whether the function returns something
   different after the ABI fixes.  */
21885 aapcs_vfp_sub_candidate (const_tree type
, machine_mode
*modep
,
21886 unsigned int *warn_psabi_flags
)
21889 HOST_WIDE_INT size
;
21891 if (aarch64_sve::builtin_type_p (type
))
21894 switch (TREE_CODE (type
))
21897 mode
= TYPE_MODE (type
);
21898 if (mode
!= DFmode
&& mode
!= SFmode
21899 && mode
!= TFmode
&& mode
!= HFmode
21900 && mode
!= SDmode
&& mode
!= DDmode
&& mode
!= TDmode
)
21903 if (*modep
== VOIDmode
)
21906 if (*modep
== mode
)
21912 mode
= TYPE_MODE (TREE_TYPE (type
));
21913 if (mode
!= DFmode
&& mode
!= SFmode
21914 && mode
!= TFmode
&& mode
!= HFmode
)
21917 if (*modep
== VOIDmode
)
21920 if (*modep
== mode
)
21926 /* Use V2SImode and V4SImode as representatives of all 64-bit
21927 and 128-bit vector types. */
21928 size
= int_size_in_bytes (type
);
21941 if (*modep
== VOIDmode
)
21944 /* Vector modes are considered to be opaque: two vectors are
21945 equivalent for the purposes of being homogeneous aggregates
21946 if they are the same size. */
21947 if (*modep
== mode
)
21955 tree index
= TYPE_DOMAIN (type
);
21957 /* Can't handle incomplete types nor sizes that are not
21959 if (!COMPLETE_TYPE_P (type
)
21960 || TREE_CODE (TYPE_SIZE (type
)) != INTEGER_CST
)
21963 count
= aapcs_vfp_sub_candidate (TREE_TYPE (type
), modep
,
21967 || !TYPE_MAX_VALUE (index
)
21968 || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index
))
21969 || !TYPE_MIN_VALUE (index
)
21970 || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index
))
21974 count
*= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index
))
21975 - tree_to_uhwi (TYPE_MIN_VALUE (index
)));
21977 /* There must be no padding. */
21978 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type
)),
21979 count
* GET_MODE_BITSIZE (*modep
)))
21991 /* Can't handle incomplete types nor sizes that are not
21993 if (!COMPLETE_TYPE_P (type
)
21994 || TREE_CODE (TYPE_SIZE (type
)) != INTEGER_CST
)
21997 for (field
= TYPE_FIELDS (type
); field
; field
= TREE_CHAIN (field
))
21999 if (TREE_CODE (field
) != FIELD_DECL
)
22002 if (DECL_FIELD_ABI_IGNORED (field
))
22004 /* See whether this is something that earlier versions of
22005 GCC failed to ignore. */
22007 if (lookup_attribute ("no_unique_address",
22008 DECL_ATTRIBUTES (field
)))
22009 flag
= WARN_PSABI_NO_UNIQUE_ADDRESS
;
22010 else if (cxx17_empty_base_field_p (field
))
22011 flag
= WARN_PSABI_EMPTY_CXX17_BASE
;
22013 /* No compatibility problem. */
22016 /* Simulate the old behavior when WARN_PSABI_FLAGS is null. */
22017 if (warn_psabi_flags
)
22019 *warn_psabi_flags
|= flag
;
22023 /* A zero-width bitfield may affect layout in some
22024 circumstances, but adds no members. The determination
22025 of whether or not a type is an HFA is performed after
22026 layout is complete, so if the type still looks like an
22027 HFA afterwards, it is still classed as one. This is
22028 potentially an ABI break for the hard-float ABI. */
22029 else if (DECL_BIT_FIELD (field
)
22030 && integer_zerop (DECL_SIZE (field
)))
22032 /* Prior to GCC-12 these fields were striped early,
22033 hiding them from the back-end entirely and
22034 resulting in the correct behaviour for argument
22035 passing. Simulate that old behaviour without
22036 generating a warning. */
22037 if (DECL_FIELD_CXX_ZERO_WIDTH_BIT_FIELD (field
))
22039 if (warn_psabi_flags
)
22041 *warn_psabi_flags
|= WARN_PSABI_ZERO_WIDTH_BITFIELD
;
22046 sub_count
= aapcs_vfp_sub_candidate (TREE_TYPE (field
), modep
,
22050 count
+= sub_count
;
22053 /* There must be no padding. */
22054 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type
)),
22055 count
* GET_MODE_BITSIZE (*modep
)))
22062 case QUAL_UNION_TYPE
:
22064 /* These aren't very interesting except in a degenerate case. */
22069 /* Can't handle incomplete types nor sizes that are not
22071 if (!COMPLETE_TYPE_P (type
)
22072 || TREE_CODE (TYPE_SIZE (type
)) != INTEGER_CST
)
22075 for (field
= TYPE_FIELDS (type
); field
; field
= TREE_CHAIN (field
))
22077 if (TREE_CODE (field
) != FIELD_DECL
)
22080 sub_count
= aapcs_vfp_sub_candidate (TREE_TYPE (field
), modep
,
22084 count
= count
> sub_count
? count
: sub_count
;
22087 /* There must be no padding. */
22088 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type
)),
22089 count
* GET_MODE_BITSIZE (*modep
)))
/* Return TRUE if the type, as described by TYPE and MODE, is a short vector
   type as described in AAPCS64 \S 4.1.2.

   See the comment above aarch64_composite_type_p for the notes on MODE.  */

static bool
aarch64_short_vector_p (const_tree type,
			machine_mode mode)
{
  poly_int64 size = -1;

  if (type && VECTOR_TYPE_P (type))
    {
      if (aarch64_sve::builtin_type_p (type))
	return false;
      size = int_size_in_bytes (type);
    }
  else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
	   || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
    {
      /* The containing "else if" is too loose: it means that we look at TYPE
	 if the type is a vector type (good), but that we otherwise ignore TYPE
	 and look only at the mode.  This is wrong because the type describes
	 the language-level information whereas the mode is purely an internal
	 GCC concept.  We can therefore reach here for types that are not
	 vectors in the AAPCS64 sense.

	 We can't "fix" that for the traditional Advanced SIMD vector modes
	 without breaking backwards compatibility.  However, there's no such
	 baggage for the structure modes, which were introduced in GCC 12.  */
      if (aarch64_advsimd_struct_mode_p (mode))
	return false;

      /* For similar reasons, rely only on the type, not the mode, when
	 processing SVE types.  */
      if (type && aarch64_some_values_include_pst_objects_p (type))
	/* Leave later code to report an error if SVE is disabled.  */
	gcc_assert (!TARGET_SVE || aarch64_sve_mode_p (mode));
      else
	size = GET_MODE_SIZE (mode);
    }
  if (known_eq (size, 8) || known_eq (size, 16))
    {
      /* 64-bit and 128-bit vectors should only acquire an SVE mode if
	 they are being treated as scalable AAPCS64 types.  */
      gcc_assert (!aarch64_sve_mode_p (mode)
		  && !aarch64_advsimd_struct_mode_p (mode));
      return true;
    }
  return false;
}
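/* Illustrative sketch (not GCC code): AAPCS64 short vectors are exactly 8 or
   16 bytes, matching the known_eq (size, 8/16) test above.  GNU vector
   extensions produce such types directly; the typedef names are hypothetical
   and the block is guarded by "#if 0" so it is never compiled.  */
#if 0
typedef int v2si __attribute__ ((vector_size (8)));	 /* 8-byte short vector */
typedef float v4sf __attribute__ ((vector_size (16)));	 /* 16-byte short vector */
typedef double v4df __attribute__ ((vector_size (32))); /* 32 bytes: not short */
#endif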
/* Return TRUE if the type, as described by TYPE and MODE, is a composite
   type as described in AAPCS64 \S 4.3.  This includes aggregate, union and
   array types.  The C99 floating-point complex types are also considered
   as composite types, according to AAPCS64 \S 7.1.1.  The complex integer
   types, which are GCC extensions and out of the scope of AAPCS64, are
   treated as composite types here as well.

   Note that MODE itself is not sufficient in determining whether a type
   is such a composite type or not.  This is because
   stor-layout.cc:compute_record_mode may have already changed the MODE
   (BLKmode) of a RECORD_TYPE TYPE to some other mode.  For example, a
   structure with only one field may have its MODE set to the mode of the
   field.  Also an integer mode whose size matches the size of the
   RECORD_TYPE type may be used to substitute the original mode
   (i.e. BLKmode) in certain circumstances.  In other words, MODE cannot be
   solely relied on.  */

static bool
aarch64_composite_type_p (const_tree type,
			  machine_mode mode)
{
  if (aarch64_short_vector_p (type, mode))
    return false;

  if (type && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE))
    return true;

  if (type
      && TREE_CODE (type) == BITINT_TYPE
      && int_size_in_bytes (type) > 16)
    return true;

  if (mode == BLKmode
      || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT
      || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
    return true;

  return false;
}
/* Return TRUE if an argument, whose type is described by TYPE and MODE,
   shall be passed or returned in simd/fp register(s) (providing these
   parameter passing registers are available).

   Upon successful return, *COUNT returns the number of needed registers,
   *BASE_MODE returns the mode of the individual register and when IS_HA
   is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
   floating-point aggregate or a homogeneous short-vector aggregate.

   SILENT_P is true if the function should refrain from reporting any
   diagnostics.  This should only be used if the caller is certain that
   any ABI decisions would eventually come through this function with
   SILENT_P set to false.  */
22209 aarch64_vfp_is_call_or_return_candidate (machine_mode mode
,
22211 machine_mode
*base_mode
,
22216 if (is_ha
!= NULL
) *is_ha
= false;
22218 machine_mode new_mode
= VOIDmode
;
22219 bool composite_p
= aarch64_composite_type_p (type
, mode
);
22222 && (GET_MODE_CLASS (mode
) == MODE_FLOAT
22223 || GET_MODE_CLASS (mode
) == MODE_DECIMAL_FLOAT
22224 || (type
&& TYPE_MAIN_VARIANT (type
) == aarch64_mfp8_type_node
)))
22225 || aarch64_short_vector_p (type
, mode
))
22230 else if (GET_MODE_CLASS (mode
) == MODE_COMPLEX_FLOAT
)
22232 if (is_ha
!= NULL
) *is_ha
= true;
22234 new_mode
= GET_MODE_INNER (mode
);
22236 else if (type
&& composite_p
)
22238 unsigned int warn_psabi_flags
= 0;
22239 int ag_count
= aapcs_vfp_sub_candidate (type
, &new_mode
,
22240 &warn_psabi_flags
);
22241 if (ag_count
> 0 && ag_count
<= HA_MAX_NUM_FLDS
)
22243 static unsigned last_reported_type_uid
;
22244 unsigned uid
= TYPE_UID (TYPE_MAIN_VARIANT (type
));
22248 && warn_psabi_flags
22249 && uid
!= last_reported_type_uid
22250 && ((alt
= aapcs_vfp_sub_candidate (type
, &new_mode
, NULL
))
22254 = CHANGES_ROOT_URL
"gcc-10/changes.html#empty_base";
22256 = CHANGES_ROOT_URL
"gcc-12/changes.html#zero_width_bitfields";
22257 gcc_assert (alt
== -1);
22258 last_reported_type_uid
= uid
;
22259 /* Use TYPE_MAIN_VARIANT to strip any redundant const
22261 if (warn_psabi_flags
& WARN_PSABI_NO_UNIQUE_ADDRESS
)
22262 inform (input_location
, "parameter passing for argument of "
22263 "type %qT with %<[[no_unique_address]]%> members "
22264 "changed %{in GCC 10.1%}",
22265 TYPE_MAIN_VARIANT (type
), url10
);
22266 else if (warn_psabi_flags
& WARN_PSABI_EMPTY_CXX17_BASE
)
22267 inform (input_location
, "parameter passing for argument of "
22268 "type %qT when C++17 is enabled changed to match "
22269 "C++14 %{in GCC 10.1%}",
22270 TYPE_MAIN_VARIANT (type
), url10
);
22271 else if (warn_psabi_flags
& WARN_PSABI_ZERO_WIDTH_BITFIELD
)
22272 inform (input_location
, "parameter passing for argument of "
22273 "type %qT changed %{in GCC 12.1%}",
22274 TYPE_MAIN_VARIANT (type
), url12
);
22277 if (is_ha
!= NULL
) *is_ha
= true;
22286 gcc_assert (!aarch64_sve_mode_p (new_mode
));
22287 *base_mode
= new_mode
;
/* Implement TARGET_STRUCT_VALUE_RTX.  */

static rtx
aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED,
			  int incoming ATTRIBUTE_UNUSED)
{
  return gen_rtx_REG (Pmode, AARCH64_STRUCT_VALUE_REGNUM);
}
/* Implements target hook vector_mode_supported_p.  */
static bool
aarch64_vector_mode_supported_p (machine_mode mode)
{
  unsigned int vec_flags = aarch64_classify_vector_mode (mode);
  return vec_flags != 0 && (vec_flags & VEC_STRUCT) == 0;
}

/* Implements target hook vector_mode_supported_any_target_p.  */
static bool
aarch64_vector_mode_supported_any_target_p (machine_mode mode)
{
  unsigned int vec_flags = aarch64_classify_vector_mode (mode, true);
  return vec_flags != 0 && (vec_flags & VEC_STRUCT) == 0;
}
22316 /* Return the full-width SVE vector mode for element mode MODE, if one
22319 aarch64_full_sve_mode (scalar_mode mode
)
22338 return VNx16QImode
;
22340 return opt_machine_mode ();
22344 /* Return the 64-bit Advanced SIMD vector mode for element mode MODE,
22347 aarch64_v64_mode (scalar_mode mode
)
22368 /* Return the 128-bit Advanced SIMD vector mode for element mode MODE,
22371 aarch64_v128_mode (scalar_mode mode
)
22392 return opt_machine_mode ();
/* Return appropriate SIMD container
   for MODE within a vector of WIDTH bits.  */
static machine_mode
aarch64_simd_container_mode (scalar_mode mode, poly_int64 width)
{
  if (TARGET_SVE
      && maybe_ne (width, 128)
      && known_eq (width, BITS_PER_SVE_VECTOR))
    return aarch64_full_sve_mode (mode).else_mode (word_mode);

  gcc_assert (known_eq (width, 64) || known_eq (width, 128));
  if (TARGET_BASE_SIMD)
    {
      if (known_eq (width, 128))
	return aarch64_v128_mode (mode).else_mode (word_mode);
      else
	return aarch64_v64_mode (mode).else_mode (word_mode);
    }
  return word_mode;
}
/* Compare an SVE mode SVE_M and an Advanced SIMD mode ASIMD_M
   and return whether the SVE mode should be preferred over the
   Advanced SIMD one in aarch64_autovectorize_vector_modes.  */
static bool
aarch64_cmp_autovec_modes (machine_mode sve_m, machine_mode asimd_m)
{
  /* Take into account the aarch64-autovec-preference param if non-zero.  */
  bool only_asimd_p = aarch64_autovec_preference == AARCH64_AUTOVEC_ASIMD_ONLY;
  bool only_sve_p = aarch64_autovec_preference == AARCH64_AUTOVEC_SVE_ONLY;
  if (only_asimd_p)
    return false;
  if (only_sve_p)
    return true;

  /* The preference in case of a tie in costs.  */
  bool prefer_asimd = aarch64_autovec_preference == AARCH64_AUTOVEC_PREFER_ASIMD;
  bool prefer_sve = aarch64_autovec_preference == AARCH64_AUTOVEC_PREFER_SVE;

  poly_int64 nunits_sve = GET_MODE_NUNITS (sve_m);
  poly_int64 nunits_asimd = GET_MODE_NUNITS (asimd_m);
  /* If the CPU information does not have an SVE width registered use the
     generic poly_int comparison that prefers SVE.  If a preference is
     explicitly requested avoid this path.  */
  if (aarch64_tune_params.sve_width == SVE_SCALABLE
      && !prefer_asimd
      && !prefer_sve)
    return maybe_gt (nunits_sve, nunits_asimd);

  /* Otherwise estimate the runtime width of the modes involved.  */
  HOST_WIDE_INT est_sve = estimated_poly_value (nunits_sve);
  HOST_WIDE_INT est_asimd = estimated_poly_value (nunits_asimd);

  /* Preferring SVE means picking it first unless the Advanced SIMD mode
     is clearly wider.  */
  if (prefer_sve)
    return est_sve >= est_asimd;
  /* Conversely, preferring Advanced SIMD means picking SVE only if SVE
     is clearly wider.  */
  if (prefer_asimd)
    return est_sve > est_asimd;

  /* In the default case prefer Advanced SIMD over SVE in case of a tie.  */
  return est_sve > est_asimd;
}
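/* Illustrative sketch (not GCC code): the tie-breaking rule applied above
   once both modes have been reduced to estimated element counts.  The helper
   name is hypothetical; the block is guarded by "#if 0" so it is never
   compiled.  */
#if 0
#include <stdbool.h>

static bool
prefer_sve_mode (long est_sve_units, long est_asimd_units,
		 bool prefer_sve, bool prefer_asimd)
{
  if (prefer_sve)
    return est_sve_units >= est_asimd_units;  /* SVE wins ties */
  if (prefer_asimd)
    return est_sve_units > est_asimd_units;   /* Advanced SIMD wins ties */
  return est_sve_units > est_asimd_units;     /* default: Advanced SIMD wins ties */
}
#endif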
/* Return 128-bit container as the preferred SIMD mode for MODE.  */
static machine_mode
aarch64_preferred_simd_mode (scalar_mode mode)
{
  /* Take into account explicit auto-vectorization ISA preferences through
     aarch64_cmp_autovec_modes.  */
  if (TARGET_SVE && aarch64_cmp_autovec_modes (VNx16QImode, V16QImode))
    return aarch64_full_sve_mode (mode).else_mode (word_mode);
  if (TARGET_SIMD)
    return aarch64_v128_mode (mode).else_mode (word_mode);
  return word_mode;
}
/* Return a list of possible vector sizes for the vectorizer
   to iterate over.  */
static unsigned int
aarch64_autovectorize_vector_modes (vector_modes *modes, bool)
{
  static const machine_mode sve_modes[] = {
    /* Try using full vectors for all element types.  */
    VNx16QImode,

    /* Try using 16-bit containers for 8-bit elements and full vectors
       for wider elements.  */
    VNx8QImode,

    /* Try using 32-bit containers for 8-bit and 16-bit elements and
       full vectors for wider elements.  */
    VNx4QImode,

    /* Try using 64-bit containers for all element types.  */
    VNx2QImode
  };

  static const machine_mode advsimd_modes[] = {
    /* Try using 128-bit vectors for all element types.  */
    V16QImode,

    /* Try using 64-bit vectors for 8-bit elements and 128-bit vectors
       for wider elements.  */
    V8QImode,

    /* Try using 64-bit vectors for 16-bit elements and 128-bit vectors
       for wider elements.

       TODO: We could support a limited form of V4QImode too, so that
       we use 32-bit vectors for 8-bit elements.  */
    V4HImode,

    /* Try using 64-bit vectors for 32-bit elements and 128-bit vectors
       for 64-bit elements.

       TODO: We could similarly support limited forms of V2QImode and V2HImode
       for this case.  */
    V2SImode
  };

  /* Try using N-byte SVE modes only after trying N-byte Advanced SIMD mode.
     This is because:

     - If we can't use N-byte Advanced SIMD vectors then the placement
       doesn't matter; we'll just continue as though the Advanced SIMD
       entry didn't exist.

     - If an SVE main loop with N bytes ends up being cheaper than an
       Advanced SIMD main loop with N bytes then by default we'll replace
       the Advanced SIMD version with the SVE one.

     - If an Advanced SIMD main loop with N bytes ends up being cheaper
       than an SVE main loop with N bytes then by default we'll try to
       use the SVE loop to vectorize the epilogue instead.  */

  bool only_asimd_p = aarch64_autovec_preference == AARCH64_AUTOVEC_ASIMD_ONLY;
  bool only_sve_p = aarch64_autovec_preference == AARCH64_AUTOVEC_SVE_ONLY;

  unsigned int sve_i = (TARGET_SVE && !only_asimd_p) ? 0 : ARRAY_SIZE (sve_modes);
  unsigned int advsimd_i = 0;

  while (!only_sve_p && advsimd_i < ARRAY_SIZE (advsimd_modes))
    {
      if (sve_i < ARRAY_SIZE (sve_modes)
	  && aarch64_cmp_autovec_modes (sve_modes[sve_i],
					advsimd_modes[advsimd_i]))
	modes->safe_push (sve_modes[sve_i++]);
      else
	modes->safe_push (advsimd_modes[advsimd_i++]);
    }
  while (sve_i < ARRAY_SIZE (sve_modes))
    modes->safe_push (sve_modes[sve_i++]);

  unsigned int flags = 0;
  if (aarch64_vect_compare_costs)
    flags |= VECT_COMPARE_COSTS;
  return flags;
}
/* Implement TARGET_MANGLE_TYPE.  */

static const char *
aarch64_mangle_type (const_tree type)
{
  /* The AArch64 ABI documents say that "__va_list" has to be
     mangled as if it is in the "std" namespace.  */
  if (lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type))
    return "St9__va_list";

  /* Half-precision floating point types.  */
  if (SCALAR_FLOAT_TYPE_P (type) && TYPE_PRECISION (type) == 16)
    {
      if (TYPE_MAIN_VARIANT (type) == float16_type_node)
	return NULL;
      if (TYPE_MODE (type) == BFmode)
	return "u6__bf16";
      else
	return "Dh";
    }

  /* Modal 8 bit floating point types.  */
  if (TYPE_MAIN_VARIANT (type) == aarch64_mfp8_type_node)
    return "u6__mfp8";

  /* Mangle AArch64-specific internal types.  TYPE_NAME is non-NULL_TREE for
     builtin types.  */
  if (TYPE_NAME (type) != NULL)
    {
      const char *res;
      if ((res = aarch64_general_mangle_builtin_type (type))
	  || (res = aarch64_sve::mangle_builtin_type (type)))
	return res;
    }

  /* Use the default mangling.  */
  return NULL;
}
/* Implement TARGET_INVALID_CONVERSION.  */

static const char *
aarch64_invalid_conversion (const_tree fromtype, const_tree totype)
{
  /* Do not allow conversions to/from FP8.  But do allow conversions between
     volatile and const variants of __mfp8.  */
  bool fromtype_is_fp8
    = (TYPE_MAIN_VARIANT (fromtype) == aarch64_mfp8_type_node);
  bool totype_is_fp8 = (TYPE_MAIN_VARIANT (totype) == aarch64_mfp8_type_node);

  if (fromtype_is_fp8 && totype_is_fp8)
    return NULL;

  if (fromtype_is_fp8)
    return N_ ("invalid conversion from type %<mfloat8_t%>");
  if (totype_is_fp8)
    return N_ ("invalid conversion to type %<mfloat8_t%>");

  /* Conversion allowed.  */
  return NULL;
}
/* Implement TARGET_VERIFY_TYPE_CONTEXT.  */

static bool
aarch64_verify_type_context (location_t loc, type_context_kind context,
			     const_tree type, bool silent_p)
{
  return aarch64_sve::verify_type_context (loc, context, type, silent_p);
}
/* Find the first rtx_insn before insn that will generate an assembly
   instruction.  */

static rtx_insn *
aarch64_prev_real_insn (rtx_insn *insn)
{
  if (!insn)
    return NULL;

  do
    {
      insn = prev_real_insn (insn);
    }
  while (insn && recog_memoized (insn) < 0);

  return insn;
}
static bool
is_madd_op (enum attr_type t1)
{
  unsigned int i;
  /* A number of these may be AArch32 only.  */
  enum attr_type mlatypes[] = {
    TYPE_MLA, TYPE_MLAS, TYPE_SMLAD, TYPE_SMLADX, TYPE_SMLAL, TYPE_SMLALD,
    TYPE_SMLALS, TYPE_SMLALXY, TYPE_SMLAWX, TYPE_SMLAWY, TYPE_SMLAXY,
    TYPE_SMMLA, TYPE_UMLAL, TYPE_UMLALS, TYPE_SMLSD, TYPE_SMLSDX, TYPE_SMLSLD
  };

  for (i = 0; i < ARRAY_SIZE (mlatypes); i++)
    {
      if (t1 == mlatypes[i])
	return true;
    }

  return false;
}
/* Check if there is a register dependency between a load and the insn
   for which we hold recog_data.  */

static bool
dep_between_memop_and_curr (rtx memop)
{
  rtx load_reg;
  int opno;

  gcc_assert (GET_CODE (memop) == SET);

  if (!REG_P (SET_DEST (memop)))
    return false;

  load_reg = SET_DEST (memop);
  for (opno = 1; opno < recog_data.n_operands; opno++)
    {
      rtx operand = recog_data.operand[opno];
      if (REG_P (operand)
	  && reg_overlap_mentioned_p (load_reg, operand))
	return true;
    }

  return false;
}
/* When working around the Cortex-A53 erratum 835769,
   given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
   instruction and has a preceding memory instruction such that a NOP
   should be inserted between them.  */

bool
aarch64_madd_needs_nop (rtx_insn *insn)
{
  enum attr_type attr_type;
  rtx_insn *prev;
  rtx body;

  if (!TARGET_FIX_ERR_A53_835769)
    return false;

  if (!INSN_P (insn) || recog_memoized (insn) < 0)
    return false;

  attr_type = get_attr_type (insn);
  if (!is_madd_op (attr_type))
    return false;

  prev = aarch64_prev_real_insn (insn);
  /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
     Restore recog state to INSN to avoid state corruption.  */
  extract_constrain_insn_cached (insn);

  if (!prev || !contains_mem_rtx_p (PATTERN (prev)))
    return false;

  body = single_set (prev);

  /* If the previous insn is a memory op and there is no dependency between
     it and the DImode madd, emit a NOP between them.  If body is NULL then we
     have a complex memory operation, probably a load/store pair.
     Be conservative for now and emit a NOP.  */
  if (GET_MODE (recog_data.operand[0]) == DImode
      && (!body || !dep_between_memop_and_curr (body)))
    return true;

  return false;
}
/* Implement FINAL_PRESCAN_INSN.  */

void
aarch64_final_prescan_insn (rtx_insn *insn)
{
  if (aarch64_madd_needs_nop (insn))
    fprintf (asm_out_file, "\tnop // between mem op and mult-accumulate\n");
}
/* Return true if BASE_OR_STEP is a valid immediate operand for an SVE INDEX
   instruction.  */

bool
aarch64_sve_index_immediate_p (rtx base_or_step)
{
  return (CONST_INT_P (base_or_step)
	  && IN_RANGE (INTVAL (base_or_step), -16, 15));
}

/* Return true if X is a valid immediate for the SVE ADD and SUB instructions
   when applied to mode MODE.  Negate X first if NEGATE_P is true.  */

bool
aarch64_sve_arith_immediate_p (machine_mode mode, rtx x, bool negate_p)
{
  rtx elt = unwrap_const_vec_duplicate (x);
  if (!CONST_INT_P (elt))
    return false;

  HOST_WIDE_INT val = INTVAL (elt);
  if (negate_p)
    val = -val;
  val &= GET_MODE_MASK (GET_MODE_INNER (mode));

  if (val & 0xff)
    return IN_RANGE (val, 0, 0xff);
  return IN_RANGE (val, 0, 0xff00);
}

/* Return true if X is a valid immediate for the SVE SQADD and SQSUB
   instructions when applied to mode MODE.  Negate X first if NEGATE_P
   is true.  */

bool
aarch64_sve_sqadd_sqsub_immediate_p (machine_mode mode, rtx x, bool negate_p)
{
  if (!aarch64_sve_arith_immediate_p (mode, x, negate_p))
    return false;

  /* After the optional negation, the immediate must be nonnegative.
     E.g. a saturating add of -127 must be done via SQSUB Zn.B, Zn.B, #127
     instead of SQADD Zn.B, Zn.B, #129.  */
  rtx elt = unwrap_const_vec_duplicate (x);
  return negate_p == (INTVAL (elt) < 0);
}

/* Return true if X is a valid immediate operand for an SVE logical
   instruction such as AND.  */

bool
aarch64_sve_bitmask_immediate_p (rtx x)
{
  rtx elt;

  return (const_vec_duplicate_p (x, &elt)
	  && CONST_INT_P (elt)
	  && aarch64_bitmask_imm (INTVAL (elt),
				  GET_MODE_INNER (GET_MODE (x))));
}

/* Return true if X is a valid immediate for the SVE DUP and CPY
   instructions.  */

bool
aarch64_sve_dup_immediate_p (rtx x)
{
  x = aarch64_bit_representation (unwrap_const_vec_duplicate (x));
  if (!CONST_INT_P (x))
    return false;

  HOST_WIDE_INT val = INTVAL (x);
  if (val & 0xff)
    return IN_RANGE (val, -0x80, 0x7f);
  return IN_RANGE (val, -0x8000, 0x7f00);
}

/* Return true if X is a valid immediate operand for an SVE CMP instruction.
   SIGNED_P says whether the operand is signed rather than unsigned.  */

bool
aarch64_sve_cmp_immediate_p (rtx x, bool signed_p)
{
  x = unwrap_const_vec_duplicate (x);
  return (CONST_INT_P (x)
	  && (signed_p
	      ? IN_RANGE (INTVAL (x), -16, 15)
	      : IN_RANGE (INTVAL (x), 0, 127)));
}

/* Return true if X is a valid immediate operand for an SVE FADD or FSUB
   instruction.  Negate X first if NEGATE_P is true.  */

bool
aarch64_sve_float_arith_immediate_p (rtx x, bool negate_p)
{
  rtx elt;
  REAL_VALUE_TYPE r;

  if (GET_MODE_INNER (GET_MODE (x)) == BFmode
      || !const_vec_duplicate_p (x, &elt)
      || !CONST_DOUBLE_P (elt))
    return false;

  r = *CONST_DOUBLE_REAL_VALUE (elt);

  if (negate_p)
    r = real_value_negate (&r);

  if (real_equal (&r, &dconst1))
    return true;
  if (real_equal (&r, &dconsthalf))
    return true;

  return false;
}

/* Return true if X is a valid immediate operand for an SVE FMUL
   instruction.  */

bool
aarch64_sve_float_mul_immediate_p (rtx x)
{
  rtx elt;

  return (GET_MODE_INNER (GET_MODE (x)) != BFmode
	  && const_vec_duplicate_p (x, &elt)
	  && CONST_DOUBLE_P (elt)
	  && (real_equal (CONST_DOUBLE_REAL_VALUE (elt), &dconsthalf)
	      || real_equal (CONST_DOUBLE_REAL_VALUE (elt), &dconst2)));
}
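/* Illustrative sketch (not GCC code): the SVE ADD/SUB immediate test above
   accepts 0-255 directly, or a multiple of 256 up to 0xff00 (encoded with
   LSL #8).  The helper name is hypothetical; the block is guarded by
   "#if 0" so it is never compiled.  */
#if 0
#include <stdbool.h>
#include <stdint.h>

static bool
sve_add_sub_imm_ok (uint64_t val)
{
  if (val & 0xff)
    return val <= 0xff;		/* unshifted 8-bit immediate */
  return val <= 0xff00;		/* 8-bit immediate, LSL #8 */
}
#endif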
/* Return true if replicating VAL32 is a valid 2-byte or 4-byte immediate
   for the Advanced SIMD operation described by WHICH and INSN.  If INFO
   is nonnull, use it to describe valid immediates.  */
static bool
aarch64_advsimd_valid_immediate_hs (unsigned int val32,
				    simd_immediate_info *info,
				    enum simd_immediate_check which,
				    simd_immediate_info::insn_type insn)
{
  /* Try a 4-byte immediate with LSL.  */
  for (unsigned int shift = 0; shift < 32; shift += 8)
    if ((val32 & (0xff << shift)) == val32)
      {
	if (info)
	  *info = simd_immediate_info (SImode, val32 >> shift, insn,
				       simd_immediate_info::LSL, shift);
	return true;
      }

  /* Try a 2-byte immediate with LSL.  */
  unsigned int imm16 = val32 & 0xffff;
  if (imm16 == (val32 >> 16))
    for (unsigned int shift = 0; shift < 16; shift += 8)
      if ((imm16 & (0xff << shift)) == imm16)
	{
	  if (info)
	    *info = simd_immediate_info (HImode, imm16 >> shift, insn,
					 simd_immediate_info::LSL, shift);
	  return true;
	}

  /* Try a 4-byte immediate with MSL, except for cases that MVN
     can handle.  */
  if (which == AARCH64_CHECK_MOV)
    for (unsigned int shift = 8; shift < 24; shift += 8)
      {
	unsigned int low = (1 << shift) - 1;
	if (((val32 & (0xff << shift)) | low) == val32)
	  {
	    if (info)
	      *info = simd_immediate_info (SImode, val32 >> shift, insn,
					   simd_immediate_info::MSL, shift);
	    return true;
	  }
      }

  return false;
}
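/* Illustrative sketch (not GCC code): the MOVI-style test above accepts a
   32-bit value consisting of a single byte shifted left by 0, 8, 16 or 24
   bits.  The helper name is hypothetical; the block is guarded by "#if 0"
   so it is never compiled.  */
#if 0
#include <stdbool.h>
#include <stdint.h>

static bool
is_byte_lsl_imm (uint32_t val32)
{
  for (unsigned int shift = 0; shift < 32; shift += 8)
    if ((val32 & (0xffu << shift)) == val32)
      return true;		/* e.g. 0x00ab0000 passes with shift == 16 */
  return false;
}
#endif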
/* Return true if replicating VAL64 with mode MODE is a valid immediate for the
   Advanced SIMD operation described by WHICH.  If INFO is nonnull,
   use it to describe valid immediates.  */
static bool
aarch64_advsimd_valid_immediate (unsigned HOST_WIDE_INT val64,
				 scalar_int_mode mode,
				 simd_immediate_info *info,
				 enum simd_immediate_check which)
{
  unsigned int val32 = val64 & 0xffffffff;
  unsigned int val8 = val64 & 0xff;

  if (mode != DImode)
    {
      if ((which == AARCH64_CHECK_MOV || which == AARCH64_CHECK_ORR)
	  && aarch64_advsimd_valid_immediate_hs (val32, info, which,
						 simd_immediate_info::MOV))
	return true;

      if ((which == AARCH64_CHECK_MOV || which == AARCH64_CHECK_AND)
	  && aarch64_advsimd_valid_immediate_hs (~val32, info, which,
						 simd_immediate_info::MVN))
	return true;

      /* Try using a replicated byte.  */
      if (which == AARCH64_CHECK_MOV && mode == QImode)
	{
	  if (info)
	    *info = simd_immediate_info (QImode, val8);
	  return true;
	}
    }

  /* Try using a bit-to-bytemask.  */
  if (which == AARCH64_CHECK_MOV)
    {
      unsigned int i;
      for (i = 0; i < 64; i += 8)
	{
	  unsigned char byte = (val64 >> i) & 0xff;
	  if (byte != 0 && byte != 0xff)
	    break;
	}
      if (i == 64)
	{
	  if (info)
	    *info = simd_immediate_info (DImode, val64);
	  return true;
	}
    }

  return false;
}
/* Return true if replicating IVAL with MODE gives a valid immediate for an SVE
   MOV instruction.  If INFO is nonnull, use it to describe valid
   immediates.  */
static bool
aarch64_sve_valid_immediate (unsigned HOST_WIDE_INT ival, scalar_int_mode mode,
			     simd_immediate_info *info,
			     enum simd_immediate_check which)
{
  HOST_WIDE_INT val = trunc_int_for_mode (ival, mode);

  if (which == AARCH64_CHECK_MOV)
    {
      if (IN_RANGE (val, -0x80, 0x7f))
	{
	  /* DUP with no shift.  */
	  if (info)
	    *info = simd_immediate_info (mode, val,
					 simd_immediate_info::SVE_MOV);
	  return true;
	}
      if ((val & 0xff) == 0 && IN_RANGE (val, -0x8000, 0x7f00))
	{
	  /* DUP with LSL #8.  */
	  if (info)
	    *info = simd_immediate_info (mode, val,
					 simd_immediate_info::SVE_MOV);
	  return true;
	}
    }
  if (aarch64_bitmask_imm (ival, mode))
    {
      if (info)
	*info = simd_immediate_info (mode, val, simd_immediate_info::SVE_MOV);
      return true;
    }
  return false;
}
/* Return true if X is an UNSPEC_PTRUE constant of the form:

       (const (unspec [PATTERN ZERO] UNSPEC_PTRUE))

   where PATTERN is the svpattern as a CONST_INT and where ZERO
   is a zero constant of the required PTRUE mode (which can have
   fewer elements than X's mode, if zero bits are significant).

   If so, and if INFO is nonnull, describe the immediate in INFO.  */
static bool
aarch64_sve_ptrue_svpattern_p (rtx x, struct simd_immediate_info *info)
{
  if (GET_CODE (x) != CONST)
    return false;

  x = XEXP (x, 0);
  if (GET_CODE (x) != UNSPEC || XINT (x, 1) != UNSPEC_PTRUE)
    return false;

  if (info)
    {
      aarch64_svpattern pattern
	= (aarch64_svpattern) INTVAL (XVECEXP (x, 0, 0));
      machine_mode pred_mode = GET_MODE (XVECEXP (x, 0, 1));
      scalar_int_mode int_mode = aarch64_sve_element_int_mode (pred_mode);
      *info = simd_immediate_info (int_mode, pattern);
    }
  return true;
}
/* Return true if X is a valid SVE predicate.  If INFO is nonnull, use
   it to describe valid immediates.  */

static bool
aarch64_sve_pred_valid_immediate (rtx x, simd_immediate_info *info)
{
  if (aarch64_sve_ptrue_svpattern_p (x, info))
    return true;

  if (x == CONST0_RTX (GET_MODE (x)))
    {
      if (info)
	*info = simd_immediate_info (DImode, 0);
      return true;
    }

  /* Analyze the value as a VNx16BImode.  This should be relatively
     efficient, since rtx_vector_builder has enough built-in capacity
     to store all VLA predicate constants without needing the heap.  */
  rtx_vector_builder builder;
  if (!aarch64_get_sve_pred_bits (builder, x))
    return false;

  unsigned int elt_size = aarch64_widest_sve_pred_elt_size (builder);
  if (int vl = aarch64_partial_ptrue_length (builder, elt_size))
    {
      machine_mode mode = aarch64_sve_pred_mode (elt_size).require ();
      aarch64_svpattern pattern = aarch64_svpattern_for_vl (mode, vl);
      if (pattern != AARCH64_NUM_SVPATTERNS)
	{
	  if (info)
	    {
	      scalar_int_mode int_mode = aarch64_sve_element_int_mode (mode);
	      *info = simd_immediate_info (int_mode, pattern);
	    }
	  return true;
	}
    }
  return false;
}
23094 /* We can only represent floating point constants which will fit in
23095 "quarter-precision" values. These values are characterised by
23096 a sign bit, a 4-bit mantissa and a 3-bit exponent. And are given
23099 (-1)^s * (n/16) * 2^r
23102 's' is the sign bit.
23103 'n' is an integer in the range 16 <= n <= 31.
23104 'r' is an integer in the range -3 <= r <= 4.
23106 Return true iff R represents a vale encodable into an AArch64 floating point
23107 move instruction as an immediate. Othewise false. */
23110 aarch64_real_float_const_representable_p (REAL_VALUE_TYPE r
)
23112 /* This represents our current view of how many bits
23113 make up the mantissa. */
23114 int point_pos
= 2 * HOST_BITS_PER_WIDE_INT
- 1;
23116 unsigned HOST_WIDE_INT mantissa
, mask
;
23120 /* We cannot represent infinities, NaNs or +/-zero. We won't
23121 know if we have +zero until we analyse the mantissa, but we
23122 can reject the other invalid values. */
23123 if (REAL_VALUE_ISINF (r
) || REAL_VALUE_ISNAN (r
)
23124 || REAL_VALUE_MINUS_ZERO (r
))
23127 /* Extract exponent. */
23128 r
= real_value_abs (&r
);
23129 exponent
= REAL_EXP (&r
);
23131 /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
23132 highest (sign) bit, with a fixed binary point at bit point_pos.
23133 m1 holds the low part of the mantissa, m2 the high part.
23134 WARNING: If we ever have a representation using more than 2 * H_W_I - 1
23135 bits for the mantissa, this can fail (low bits will be lost). */
23136 real_ldexp (&m
, &r
, point_pos
- exponent
);
23137 wide_int w
= real_to_integer (&m
, &fail
, HOST_BITS_PER_WIDE_INT
* 2);
/* If the low part of the mantissa has bits set we cannot represent
   the value.  */
23141 if (fail
|| w
.ulow () != 0)
23144 /* We have rejected the lower HOST_WIDE_INT, so update our
23145 understanding of how many bits lie in the mantissa and
23146 look only at the high HOST_WIDE_INT. */
23147 mantissa
= w
.elt (1);
23148 point_pos
-= HOST_BITS_PER_WIDE_INT
;
23150 /* We can only represent values with a mantissa of the form 1.xxxx. */
23151 mask
= ((unsigned HOST_WIDE_INT
)1 << (point_pos
- 5)) - 1;
23152 if ((mantissa
& mask
) != 0)
23155 /* Having filtered unrepresentable values, we may now remove all
23156 but the highest 5 bits. */
23157 mantissa
>>= point_pos
- 5;
23159 /* We cannot represent the value 0.0, so reject it. This is handled
23164 /* Then, as bit 4 is always set, we can mask it off, leaving
23165 the mantissa in the range [0, 15]. */
23166 mantissa
&= ~(1 << 4);
23167 gcc_assert (mantissa
<= 15);
23169 /* GCC internally does not use IEEE754-like encoding (where normalized
23170 significands are in the range [1, 2). GCC uses [0.5, 1) (see real.cc).
23171 Our mantissa values are shifted 4 places to the left relative to
23172 normalized IEEE754 so we must modify the exponent returned by REAL_EXP
23173 by 5 places to correct for GCC's representation. */
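/* For example, GCC stores 1.0 as 0.5 * 2^1, so REAL_EXP returns 1 and the
   adjusted exponent below is 5 - 1 == 4, which lies in [0, 7] and is
   accepted.  32.0 is stored as 0.5 * 2^6, giving 5 - 6 == -1, which is
   rejected (32.0 exceeds the maximum representable value 31.0).  */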
23174 exponent
= 5 - exponent
;
23176 return (exponent
>= 0 && exponent
<= 7);
/* Return true if OP is a valid SIMD immediate for the operation
   described by WHICH.  If INFO is nonnull, use it to describe valid
   immediates.  */
23183 aarch64_simd_valid_imm (rtx op
, simd_immediate_info
*info
,
23184 enum simd_immediate_check which
)
23186 machine_mode mode
= GET_MODE (op
);
23187 unsigned int vec_flags
= aarch64_classify_vector_mode (mode
);
23188 if (vec_flags
== 0 || vec_flags
== (VEC_ADVSIMD
| VEC_STRUCT
))
23191 if ((vec_flags
& VEC_ADVSIMD
) && !TARGET_SIMD
)
23194 if (vec_flags
== (VEC_SVE_PRED
| VEC_STRUCT
))
23195 return op
== CONST0_RTX (mode
) || op
== CONSTM1_RTX (mode
);
23197 if (vec_flags
& VEC_SVE_PRED
)
23198 return aarch64_sve_pred_valid_immediate (op
, info
);
23200 scalar_mode elt_mode
= GET_MODE_INNER (mode
);
23202 unsigned int n_elts
;
23203 if (CONST_VECTOR_P (op
)
23204 && CONST_VECTOR_DUPLICATE_P (op
))
23205 n_elts
= CONST_VECTOR_NPATTERNS (op
);
23206 else if (which
== AARCH64_CHECK_MOV
23208 && const_vec_series_p (op
, &base
, &step
))
23210 gcc_assert (GET_MODE_CLASS (mode
) == MODE_VECTOR_INT
);
23211 if (!aarch64_sve_index_immediate_p (base
)
23212 || !aarch64_sve_index_immediate_p (step
))
23217 /* Get the corresponding container mode. E.g. an INDEX on V2SI
23218 should yield two integer values per 128-bit block, meaning
23219 that we need to treat it in the same way as V2DI and then
23220 ignore the upper 32 bits of each element. */
23221 elt_mode
= aarch64_sve_container_int_mode (mode
);
23222 *info
= simd_immediate_info (elt_mode
, base
, step
);
23226 else if (CONST_VECTOR_P (op
)
23227 && CONST_VECTOR_NUNITS (op
).is_constant (&n_elts
))
23228 /* N_ELTS set above. */;
23232 /* If all elements in an SVE vector have the same value, we have a free
23233 choice between using the element mode and using the container mode.
23234 Using the element mode means that unused parts of the vector are
23235 duplicates of the used elements, while using the container mode means
23236 that the unused parts are an extension of the used elements. Using the
23237 element mode is better for (say) VNx4HI 0x101, since 0x01010101 is valid
23238 for its container mode VNx4SI while 0x00000101 isn't.
23240 If not all elements in an SVE vector have the same value, we need the
23241 transition from one element to the next to occur at container boundaries.
23242 E.g. a fixed-length VNx4HI containing { 1, 2, 3, 4 } should be treated
23243 in the same way as a VNx4SI containing { 1, 2, 3, 4 }. */
23244 scalar_int_mode elt_int_mode
;
23245 if ((vec_flags
& VEC_SVE_DATA
) && n_elts
> 1)
23246 elt_int_mode
= aarch64_sve_container_int_mode (mode
);
23248 elt_int_mode
= int_mode_for_mode (elt_mode
).require ();
23250 unsigned int elt_size
= GET_MODE_SIZE (elt_int_mode
);
23254 /* Expand the vector constant out into a byte vector, with the least
23255 significant byte of the register first. */
23256 auto_vec
<unsigned char, 16> bytes
;
23257 bytes
.reserve (n_elts
* elt_size
);
23258 for (unsigned int i
= 0; i
< n_elts
; i
++)
23260 /* The vector is provided in gcc endian-neutral fashion.
23261 For aarch64_be Advanced SIMD, it must be laid out in the vector
23262 register in reverse order. */
23263 bool swap_p
= ((vec_flags
& VEC_ADVSIMD
) != 0 && BYTES_BIG_ENDIAN
);
23264 rtx elt
= CONST_VECTOR_ELT (op
, swap_p
? (n_elts
- 1 - i
) : i
);
23266 if (elt_mode
!= elt_int_mode
)
23267 elt
= gen_lowpart (elt_int_mode
, elt
);
23269 if (!CONST_INT_P (elt
))
23272 unsigned HOST_WIDE_INT elt_val
= INTVAL (elt
);
23273 for (unsigned int byte
= 0; byte
< elt_size
; byte
++)
23275 bytes
.quick_push (elt_val
& 0xff);
23276 elt_val
>>= BITS_PER_UNIT
;
23280 /* The immediate must repeat every eight bytes. */
23281 unsigned int nbytes
= bytes
.length ();
23282 for (unsigned i
= 8; i
< nbytes
; ++i
)
23283 if (bytes
[i
] != bytes
[i
- 8])
23286 /* Get the repeating 8-byte value as an integer. No endian correction
23287 is needed here because bytes is already in lsb-first order. */
23288 unsigned HOST_WIDE_INT val64
= 0;
23289 for (unsigned int i
= 0; i
< 8; i
++)
23290 val64
|= ((unsigned HOST_WIDE_INT
) bytes
[i
% nbytes
]
23291 << (i
* BITS_PER_UNIT
));
23293 /* Try encoding the integer immediate as a floating point value if it's an
23295 scalar_float_mode fmode
= DFmode
;
23296 scalar_int_mode imode
= DImode
;
23297 unsigned HOST_WIDE_INT ival
= val64
;
23298 unsigned int val32
= val64
& 0xffffffff;
23299 if (val32
== (val64
>> 32))
23304 unsigned int val16
= val32
& 0xffff;
23305 if (val16
== (val32
>> 16))
23310 unsigned int val8
= val16
& 0xff;
23311 if (val8
== (val16
>> 8))
23319 if (which
== AARCH64_CHECK_MOV
23321 && (imode
!= HImode
|| TARGET_FP_F16INST
))
23323 long int as_long_ints
[2];
23324 as_long_ints
[0] = ival
& 0xFFFFFFFF;
23325 as_long_ints
[1] = (ival
>> 32) & 0xFFFFFFFF;
23328 real_from_target (&r
, as_long_ints
, fmode
);
23329 if (aarch64_real_float_const_representable_p (r
))
23333 rtx float_val
= const_double_from_real_value (r
, fmode
);
23334 *info
= simd_immediate_info (fmode
, float_val
);
23340 if (vec_flags
& VEC_SVE_DATA
)
23341 return aarch64_sve_valid_immediate (ival
, imode
, info
, which
);
23343 if (aarch64_advsimd_valid_immediate (val64
, imode
, info
, which
))
23347 return aarch64_sve_valid_immediate (ival
, imode
, info
, which
);
23351 /* Return true if OP is a valid SIMD move immediate for SVE or AdvSIMD. */
23353 aarch64_simd_valid_mov_imm (rtx op
)
23355 return aarch64_simd_valid_imm (op
, NULL
, AARCH64_CHECK_MOV
);
23358 /* Return true if OP is a valid SIMD orr immediate for SVE or AdvSIMD. */
23360 aarch64_simd_valid_orr_imm (rtx op
)
23362 return aarch64_simd_valid_imm (op
, NULL
, AARCH64_CHECK_ORR
);
23365 /* Return true if OP is a valid SIMD and immediate for SVE or AdvSIMD. */
23367 aarch64_simd_valid_and_imm (rtx op
)
23369 return aarch64_simd_valid_imm (op
, NULL
, AARCH64_CHECK_AND
);
23372 /* Return true if OP is a valid SIMD xor immediate for SVE. */
23374 aarch64_simd_valid_xor_imm (rtx op
)
23376 return aarch64_simd_valid_imm (op
, NULL
, AARCH64_CHECK_XOR
);
23379 /* Check whether X is a VEC_SERIES-like constant that starts at 0 and
23380 has a step in the range of INDEX. Return the index expression if so,
23381 otherwise return null. */
23383 aarch64_check_zero_based_sve_index_immediate (rtx x
)
23386 if (const_vec_series_p (x
, &base
, &step
)
23387 && base
== const0_rtx
23388 && aarch64_sve_index_immediate_p (step
))
/* Check whether immediate shift constants are within range.  */
23395 aarch64_simd_shift_imm_p (rtx x
, machine_mode mode
, bool left
)
23397 x
= unwrap_const_vec_duplicate (x
);
23398 if (!CONST_INT_P (x
))
23400 int bit_width
= GET_MODE_UNIT_SIZE (mode
) * BITS_PER_UNIT
;
23402 return IN_RANGE (INTVAL (x
), 0, bit_width
- 1);
23404 return IN_RANGE (INTVAL (x
), 1, bit_width
);
23407 /* Return the bitmask CONST_INT to select the bits required by a zero extract
23408 operation of width WIDTH at bit position POS. */
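/* For example, WIDTH == 8 and POS == 16 select bits [23:16] and yield the
   mask 0xff0000.  */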
23411 aarch64_mask_from_zextract_ops (rtx width
, rtx pos
)
23413 gcc_assert (CONST_INT_P (width
));
23414 gcc_assert (CONST_INT_P (pos
));
23416 unsigned HOST_WIDE_INT mask
23417 = ((unsigned HOST_WIDE_INT
) 1 << UINTVAL (width
)) - 1;
23418 return GEN_INT (mask
<< UINTVAL (pos
));
23422 aarch64_mov_operand_p (rtx x
, machine_mode mode
)
23424 if (GET_CODE (x
) == HIGH
23425 && aarch64_valid_symref (XEXP (x
, 0), GET_MODE (XEXP (x
, 0))))
23428 if (CONST_INT_P (x
))
23431 if (VECTOR_MODE_P (GET_MODE (x
)))
23433 /* Require predicate constants to be VNx16BI before RA, so that we
23434 force everything to have a canonical form. */
23435 if (!lra_in_progress
23436 && !reload_completed
23437 && aarch64_sve_pred_mode_p (GET_MODE (x
))
23438 && known_eq (GET_MODE_SIZE (GET_MODE (x
)), BYTES_PER_SVE_PRED
)
23439 && GET_MODE (x
) != VNx16BImode
)
23442 return aarch64_simd_valid_mov_imm (x
);
23445 /* Remove UNSPEC_SALT_ADDR before checking symbol reference. */
23446 x
= strip_salt (x
);
23448 /* GOT accesses are valid moves. */
23449 if (SYMBOL_REF_P (x
)
23450 && aarch64_classify_symbolic_expression (x
) == SYMBOL_SMALL_GOT_4G
)
23453 if (SYMBOL_REF_P (x
) && mode
== DImode
&& CONSTANT_ADDRESS_P (x
))
23457 && (aarch64_sve_cnt_immediate_p (x
)
23458 || aarch64_sve_rdvl_immediate_p (x
)))
23461 if (aarch64_rdsvl_immediate_p (x
))
23464 return aarch64_classify_symbolic_expression (x
)
23465 == SYMBOL_TINY_ABSOLUTE
;
23468 /* Return a function-invariant register that contains VALUE. *CACHED_INSN
23469 caches instructions that set up such registers, so that they can be
23470 reused by future calls. */
23473 aarch64_get_shareable_reg (rtx_insn
**cached_insn
, rtx value
)
23475 rtx_insn
*insn
= *cached_insn
;
23476 if (insn
&& INSN_P (insn
) && !insn
->deleted ())
23478 rtx pat
= PATTERN (insn
);
23479 if (GET_CODE (pat
) == SET
)
23481 rtx dest
= SET_DEST (pat
);
23483 && !HARD_REGISTER_P (dest
)
23484 && rtx_equal_p (SET_SRC (pat
), value
))
23488 rtx reg
= gen_reg_rtx (GET_MODE (value
));
23489 *cached_insn
= emit_insn_before (gen_rtx_SET (reg
, value
),
23490 function_beg_insn
);
23494 /* Create a 0 constant that is based on V4SI to allow CSE to optimally share
23495 the constant creation. */
23498 aarch64_gen_shareable_zero (machine_mode mode
)
23500 rtx reg
= aarch64_get_shareable_reg (&cfun
->machine
->advsimd_zero_insn
,
23501 CONST0_RTX (V4SImode
));
23502 return lowpart_subreg (mode
, reg
, GET_MODE (reg
));
23505 /* INSN is some form of extension or shift that can be split into a
23506 permutation involving a shared zero. Return true if we should
23507 perform such a split.
23509 ??? For now, make sure that the split instruction executes more
23510 frequently than the zero that feeds it. In future it would be good
23511 to split without that restriction and instead recombine shared zeros
23512 if they turn out not to be worthwhile. This would allow splits in
23513 single-block functions and would also cope more naturally with
23514 rematerialization. The downside of not doing this is that we lose the
23515 optimizations for vector epilogues as well. */
23518 aarch64_split_simd_shift_p (rtx_insn
*insn
)
23520 return (can_create_pseudo_p ()
23521 && optimize_bb_for_speed_p (BLOCK_FOR_INSN (insn
))
23522 && (ENTRY_BLOCK_PTR_FOR_FN (cfun
)->count
23523 < BLOCK_FOR_INSN (insn
)->count
));
23526 /* Return a const_int vector of VAL. */
23528 aarch64_simd_gen_const_vector_dup (machine_mode mode
, HOST_WIDE_INT val
)
23530 rtx c
= gen_int_mode (val
, GET_MODE_INNER (mode
));
23531 return gen_const_vec_duplicate (mode
, c
);
23534 /* Check OP is a legal scalar immediate for the MOVI instruction. */
23537 aarch64_simd_scalar_immediate_valid_for_move (rtx op
, scalar_int_mode mode
)
23539 machine_mode vmode
;
23541 vmode
= aarch64_simd_container_mode (mode
, 64);
23542 rtx op_v
= aarch64_simd_gen_const_vector_dup (vmode
, INTVAL (op
));
23543 return aarch64_simd_valid_mov_imm (op_v
);
23546 /* Construct and return a PARALLEL RTX vector with elements numbering the
23547 lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
23548 the vector - from the perspective of the architecture. This does not
23549 line up with GCC's perspective on lane numbers, so we end up with
23550 different masks depending on our target endian-ness. The diagram
23551 below may help. We must draw the distinction when building masks
23552 which select one half of the vector. An instruction selecting
23553 architectural low-lanes for a big-endian target, must be described using
23554 a mask selecting GCC high-lanes.
23556 Big-Endian Little-Endian
23558 GCC 0 1 2 3 3 2 1 0
23559 | x | x | x | x | | x | x | x | x |
23560 Architecture 3 2 1 0 3 2 1 0
23562 Low Mask: { 2, 3 } { 0, 1 }
23563 High Mask: { 0, 1 } { 2, 3 }
23565 MODE Is the mode of the vector and NUNITS is the number of units in it. */
23568 aarch64_simd_vect_par_cnst_half (machine_mode mode
, int nunits
, bool high
)
23570 rtvec v
= rtvec_alloc (nunits
/ 2);
23571 int high_base
= nunits
/ 2;
23577 if (BYTES_BIG_ENDIAN
)
23578 base
= high
? low_base
: high_base
;
23580 base
= high
? high_base
: low_base
;
23582 for (i
= 0; i
< nunits
/ 2; i
++)
23583 RTVEC_ELT (v
, i
) = GEN_INT (base
+ i
);
23585 t1
= gen_rtx_PARALLEL (mode
, v
);
23589 /* Check OP for validity as a PARALLEL RTX vector with elements
23590 numbering the lanes of either the high (HIGH == TRUE) or low lanes,
23591 from the perspective of the architecture. See the diagram above
23592 aarch64_simd_vect_par_cnst_half for more details. */
23595 aarch64_simd_check_vect_par_cnst_half (rtx op
, machine_mode mode
,
23599 if (!VECTOR_MODE_P (mode
) || !GET_MODE_NUNITS (mode
).is_constant (&nelts
))
23602 rtx ideal
= aarch64_simd_vect_par_cnst_half (mode
, nelts
, high
);
23603 HOST_WIDE_INT count_op
= XVECLEN (op
, 0);
23604 HOST_WIDE_INT count_ideal
= XVECLEN (ideal
, 0);
23607 if (count_op
!= count_ideal
)
23610 for (i
= 0; i
< count_ideal
; i
++)
23612 rtx elt_op
= XVECEXP (op
, 0, i
);
23613 rtx elt_ideal
= XVECEXP (ideal
, 0, i
);
23615 if (!CONST_INT_P (elt_op
)
23616 || INTVAL (elt_ideal
) != INTVAL (elt_op
))
23622 /* Return a PARALLEL containing NELTS elements, with element I equal
23623 to BASE + I * STEP. */
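/* For example, NELTS == 4, BASE == 1 and STEP == 2 produce
   (parallel [(const_int 1) (const_int 3) (const_int 5) (const_int 7)]).  */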
23626 aarch64_gen_stepped_int_parallel (unsigned int nelts
, int base
, int step
)
23628 rtvec vec
= rtvec_alloc (nelts
);
23629 for (unsigned int i
= 0; i
< nelts
; ++i
)
23630 RTVEC_ELT (vec
, i
) = gen_int_mode (base
+ i
* step
, DImode
);
23631 return gen_rtx_PARALLEL (VOIDmode
, vec
);
23634 /* Return true if OP is a PARALLEL of CONST_INTs that form a linear
23635 series with step STEP. */
23638 aarch64_stepped_int_parallel_p (rtx op
, int step
)
23640 if (GET_CODE (op
) != PARALLEL
|| !CONST_INT_P (XVECEXP (op
, 0, 0)))
23643 unsigned HOST_WIDE_INT base
= UINTVAL (XVECEXP (op
, 0, 0));
23644 for (int i
= 1; i
< XVECLEN (op
, 0); ++i
)
23645 if (!CONST_INT_P (XVECEXP (op
, 0, i
))
23646 || UINTVAL (XVECEXP (op
, 0, i
)) != base
+ i
* step
)
/* Return true if OPERANDS[0] to OPERANDS[NUM_OPERANDS - 1] form a
   sequence of strided registers, with the stride being equal to STRIDE.
   The operands are already known to be FPRs.  */
23656 aarch64_strided_registers_p (rtx
*operands
, unsigned int num_operands
,
23657 unsigned int stride
)
23659 for (unsigned int i
= 1; i
< num_operands
; ++i
)
23660 if (REGNO (operands
[i
]) != REGNO (operands
[0]) + i
* stride
)
23665 /* Bounds-check lanes. Ensure OPERAND lies between LOW (inclusive) and
23666 HIGH (exclusive). */
23668 aarch64_simd_lane_bounds (rtx operand
, HOST_WIDE_INT low
, HOST_WIDE_INT high
,
23671 HOST_WIDE_INT lane
;
23672 gcc_assert (CONST_INT_P (operand
));
23673 lane
= INTVAL (operand
);
23675 if (lane
< low
|| lane
>= high
)
23678 error_at (EXPR_LOCATION (exp
), "lane %wd out of range %wd - %wd",
23679 lane
, low
, high
- 1);
23681 error ("lane %wd out of range %wd - %wd", lane
, low
, high
- 1);
/* Perform endian correction on lane number N, which indexes a vector
   of mode MODE, and return the result as an SImode rtx.  */
23689 aarch64_endian_lane_rtx (machine_mode mode
, unsigned int n
)
23691 return gen_int_mode (ENDIAN_LANE_N (GET_MODE_NUNITS (mode
), n
), SImode
);
23694 /* Return TRUE if OP is a valid vector addressing mode. */
23697 aarch64_simd_mem_operand_p (rtx op
)
23700 && (GET_CODE (XEXP (op
, 0)) == POST_INC
|| REG_P (XEXP (op
, 0)))
23701 && memory_operand (op
, VOIDmode
));
23704 /* Return true if OP is a valid MEM operand for an SVE LD1R instruction. */
23707 aarch64_sve_ld1r_operand_p (rtx op
)
23709 struct aarch64_address_info addr
;
23713 && is_a
<scalar_mode
> (GET_MODE (op
), &mode
)
23714 && aarch64_classify_address (&addr
, XEXP (op
, 0), mode
, false)
23715 && addr
.type
== ADDRESS_REG_IMM
23716 && offset_6bit_unsigned_scaled_p (mode
, addr
.const_offset
));
/* Return true if OP is a valid MEM operand for an SVE LD1R{Q,O} instruction,
   where the size of the read data is specified by `mode` and the size of the
   vector elements is specified by `elem_mode`.  */
23723 aarch64_sve_ld1rq_ld1ro_operand_p (rtx op
, machine_mode mode
,
23724 scalar_mode elem_mode
)
23726 struct aarch64_address_info addr
;
23728 || !aarch64_classify_address (&addr
, XEXP (op
, 0), elem_mode
, false))
23731 if (addr
.type
== ADDRESS_REG_IMM
)
23732 return offset_4bit_signed_scaled_p (mode
, addr
.const_offset
);
23734 if (addr
.type
== ADDRESS_REG_REG
)
23735 return (1U << addr
.shift
) == GET_MODE_SIZE (elem_mode
);
23740 /* Return true if OP is a valid MEM operand for an SVE LD1RQ instruction. */
23742 aarch64_sve_ld1rq_operand_p (rtx op
)
23744 return aarch64_sve_ld1rq_ld1ro_operand_p (op
, TImode
,
23745 GET_MODE_INNER (GET_MODE (op
)));
23748 /* Return true if OP is a valid MEM operand for an SVE LD1RO instruction for
23749 accessing a vector where the element size is specified by `elem_mode`. */
23751 aarch64_sve_ld1ro_operand_p (rtx op
, scalar_mode elem_mode
)
23753 return aarch64_sve_ld1rq_ld1ro_operand_p (op
, OImode
, elem_mode
);
23756 /* Return true if OP is a valid MEM operand for an SVE LDFF1 instruction. */
23758 aarch64_sve_ldff1_operand_p (rtx op
)
23763 struct aarch64_address_info addr
;
23764 if (!aarch64_classify_address (&addr
, XEXP (op
, 0), GET_MODE (op
), false))
23767 if (addr
.type
== ADDRESS_REG_IMM
)
23768 return known_eq (addr
.const_offset
, 0);
23770 return addr
.type
== ADDRESS_REG_REG
;
23773 /* Return true if OP is a valid MEM operand for an SVE LDNF1 instruction. */
23775 aarch64_sve_ldnf1_operand_p (rtx op
)
23777 struct aarch64_address_info addr
;
23780 && aarch64_classify_address (&addr
, XEXP (op
, 0),
23781 GET_MODE (op
), false)
23782 && addr
.type
== ADDRESS_REG_IMM
);
23785 /* Return true if OP is a valid MEM operand for an SVE LDR instruction.
23786 The conditions for STR are the same. */
23788 aarch64_sve_ldr_operand_p (rtx op
)
23790 struct aarch64_address_info addr
;
23793 && aarch64_classify_address (&addr
, XEXP (op
, 0), GET_MODE (op
),
23794 false, ADDR_QUERY_ANY
)
23795 && addr
.type
== ADDRESS_REG_IMM
);
23798 /* Return true if OP is a valid address for an SVE PRF[BHWD] instruction,
23799 addressing memory of mode MODE. */
23801 aarch64_sve_prefetch_operand_p (rtx op
, machine_mode mode
)
23803 struct aarch64_address_info addr
;
23804 if (!aarch64_classify_address (&addr
, op
, mode
, false, ADDR_QUERY_ANY
))
23807 if (addr
.type
== ADDRESS_REG_IMM
)
23808 return offset_6bit_signed_scaled_p (mode
, addr
.const_offset
);
23810 return addr
.type
== ADDRESS_REG_REG
;
23813 /* Return true if OP is a valid MEM operand for an SVE_STRUCT mode.
23814 We need to be able to access the individual pieces, so the range
23815 is different from LD[234] and ST[234]. */
23817 aarch64_sve_struct_memory_operand_p (rtx op
)
23822 machine_mode mode
= GET_MODE (op
);
23823 struct aarch64_address_info addr
;
23824 if (!aarch64_classify_address (&addr
, XEXP (op
, 0), SVE_BYTE_MODE
, false,
23826 || addr
.type
!= ADDRESS_REG_IMM
)
23829 poly_int64 first
= addr
.const_offset
;
23830 poly_int64 last
= first
+ GET_MODE_SIZE (mode
) - BYTES_PER_SVE_VECTOR
;
23831 return (offset_4bit_signed_scaled_p (SVE_BYTE_MODE
, first
)
23832 && offset_4bit_signed_scaled_p (SVE_BYTE_MODE
, last
));
23835 /* Return true if OFFSET is a constant integer and if VNUM is
23836 OFFSET * the number of bytes in an SVE vector. This is the requirement
23837 that exists in SME LDR and STR instructions, where the VL offset must
23838 equal the ZA slice offset. */
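/* For example, in streaming mode an OFFSET of (const_int 3) is only valid
   when VNUM is the poly_int constant 3 * BYTES_PER_SVE_VECTOR, i.e. three
   whole SVE vectors.  */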
23840 aarch64_sme_ldr_vnum_offset_p (rtx offset
, rtx vnum
)
23842 if (!CONST_INT_P (offset
) || !IN_RANGE (INTVAL (offset
), 0, 15))
23845 if (TARGET_STREAMING
)
23847 poly_int64 const_vnum
;
23848 return (poly_int_rtx_p (vnum
, &const_vnum
)
23849 && known_eq (const_vnum
,
23850 INTVAL (offset
) * BYTES_PER_SVE_VECTOR
));
23854 HOST_WIDE_INT factor
;
23855 return (aarch64_sme_vq_unspec_p (vnum
, &factor
)
23856 && factor
== INTVAL (offset
) * 16);
/* Emit a register copy from operand to operand, taking care not to
   early-clobber source registers in the process.

   COUNT is the number of components into which the copy needs to be
   decomposed.  */
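/* For instance, copying a two-register tuple from {v1, v2} to {v2, v3}
   must start from the highest register (v3 <- v2, then v2 <- v1) so that
   v2 is not overwritten before it has been read; the reversed loop below
   handles that case.  */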
23866 aarch64_simd_emit_reg_reg_move (rtx
*operands
, machine_mode mode
,
23867 unsigned int count
)
23870 int rdest
= REGNO (operands
[0]);
23871 int rsrc
= REGNO (operands
[1]);
23873 if (!reg_overlap_mentioned_p (operands
[0], operands
[1])
23875 for (i
= 0; i
< count
; i
++)
23876 emit_move_insn (gen_rtx_REG (mode
, rdest
+ i
),
23877 gen_rtx_REG (mode
, rsrc
+ i
));
23879 for (i
= 0; i
< count
; i
++)
23880 emit_move_insn (gen_rtx_REG (mode
, rdest
+ count
- i
- 1),
23881 gen_rtx_REG (mode
, rsrc
+ count
- i
- 1));
23884 /* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
23885 one of VSTRUCT modes: OI, CI, or XI. */
23887 aarch64_simd_attr_length_rglist (machine_mode mode
)
23889 /* This is only used (and only meaningful) for Advanced SIMD, not SVE. */
23890 return (GET_MODE_SIZE (mode
).to_constant () / UNITS_PER_VREG
) * 4;
/* Implement target hook TARGET_VECTOR_ALIGNMENT.  The AAPCS64 sets the maximum
   alignment of a vector to 128 bits.  SVE predicates have an alignment of
   16 bits.  */
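/* For example, a 256-bit fixed-length GNU vector type is therefore given an
   alignment of 128 bits, while a 64-bit vector keeps its natural 64-bit
   alignment.  */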
23896 static HOST_WIDE_INT
23897 aarch64_simd_vector_alignment (const_tree type
)
23899 /* ??? Checking the mode isn't ideal, but VECTOR_BOOLEAN_TYPE_P can
23900 be set for non-predicate vectors of booleans. Modes are the most
23901 direct way we have of identifying real SVE predicate types. */
23902 if (GET_MODE_CLASS (TYPE_MODE (type
)) == MODE_VECTOR_BOOL
)
23904 widest_int min_size
23905 = constant_lower_bound (wi::to_poly_widest (TYPE_SIZE (type
)));
23906 return wi::umin (min_size
, 128).to_uhwi ();
23909 /* Implement target hook TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT. */
23911 aarch64_vectorize_preferred_vector_alignment (const_tree type
)
23913 if (aarch64_sve_data_mode_p (TYPE_MODE (type
)))
23915 /* If the length of the vector is a fixed power of 2, try to align
23916 to that length, otherwise don't try to align at all. */
23917 HOST_WIDE_INT result
;
23918 if (!GET_MODE_BITSIZE (TYPE_MODE (type
)).is_constant (&result
)
23919 || !pow2p_hwi (result
))
23920 result
= TYPE_ALIGN (TREE_TYPE (type
));
23923 return TYPE_ALIGN (type
);
23926 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE. */
23928 aarch64_simd_vector_alignment_reachable (const_tree type
, bool is_packed
)
23933 /* For fixed-length vectors, check that the vectorizer will aim for
23934 full-vector alignment. This isn't true for generic GCC vectors
23935 that are wider than the ABI maximum of 128 bits. */
23936 poly_uint64 preferred_alignment
=
23937 aarch64_vectorize_preferred_vector_alignment (type
);
23938 if (TREE_CODE (TYPE_SIZE (type
)) == INTEGER_CST
23939 && maybe_ne (wi::to_widest (TYPE_SIZE (type
)),
23940 preferred_alignment
))
23943 /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned. */
23947 /* Return true if the vector misalignment factor is supported by the
23950 aarch64_builtin_support_vector_misalignment (machine_mode mode
,
23951 const_tree type
, int misalignment
,
23954 if (TARGET_SIMD
&& STRICT_ALIGNMENT
)
23956 /* Return if movmisalign pattern is not supported for this mode. */
23957 if (optab_handler (movmisalign_optab
, mode
) == CODE_FOR_nothing
)
23960 /* Misalignment factor is unknown at compile time. */
23961 if (misalignment
== -1)
23964 return default_builtin_support_vector_misalignment (mode
, type
, misalignment
,
23968 /* If VALS is a vector constant that can be loaded into a register
23969 using DUP, generate instructions to do so and return an RTX to
23970 assign to the register. Otherwise return NULL_RTX. */
23972 aarch64_simd_dup_constant (rtx vals
)
23974 machine_mode mode
= GET_MODE (vals
);
23975 machine_mode inner_mode
= GET_MODE_INNER (mode
);
23978 if (!const_vec_duplicate_p (vals
, &x
))
/* We can load this constant by using DUP and a constant in a
   single ARM register.  This will be cheaper than a vector
   load.  */
23984 x
= force_reg (inner_mode
, x
);
23985 return gen_vec_duplicate (mode
, x
);
23989 /* Generate code to load VALS, which is a PARALLEL containing only
23990 constants (for vec_init) or CONST_VECTOR, efficiently into a
23991 register. Returns an RTX to copy into the register, or NULL_RTX
23992 for a PARALLEL that cannot be converted into a CONST_VECTOR. */
23994 aarch64_simd_make_constant (rtx vals
)
23996 machine_mode mode
= GET_MODE (vals
);
23998 rtx const_vec
= NULL_RTX
;
24002 if (CONST_VECTOR_P (vals
))
24004 else if (GET_CODE (vals
) == PARALLEL
)
24006 /* A CONST_VECTOR must contain only CONST_INTs and
24007 CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
24008 Only store valid constants in a CONST_VECTOR. */
24009 int n_elts
= XVECLEN (vals
, 0);
24010 for (i
= 0; i
< n_elts
; ++i
)
24012 rtx x
= XVECEXP (vals
, 0, i
);
24013 if (CONST_INT_P (x
) || CONST_DOUBLE_P (x
))
24016 if (n_const
== n_elts
)
24017 const_vec
= gen_rtx_CONST_VECTOR (mode
, XVEC (vals
, 0));
24020 gcc_unreachable ();
24022 if (const_vec
!= NULL_RTX
24023 && aarch64_simd_valid_mov_imm (const_vec
))
24024 /* Load using MOVI/MVNI. */
24026 else if ((const_dup
= aarch64_simd_dup_constant (vals
)) != NULL_RTX
)
24027 /* Loaded using DUP. */
24029 else if (const_vec
!= NULL_RTX
)
24030 /* Load from constant pool. We cannot take advantage of single-cycle
24031 LD1 because we need a PC-relative addressing mode. */
24034 /* A PARALLEL containing something not valid inside CONST_VECTOR.
24035 We cannot construct an initializer. */
24039 /* A subroutine of aarch64_expand_vector_init, with the same interface.
24040 The caller has already tried a divide-and-conquer approach, so do
24041 not consider that case here. */
24044 aarch64_expand_vector_init_fallback (rtx target
, rtx vals
)
24046 machine_mode mode
= GET_MODE (target
);
24047 scalar_mode inner_mode
= GET_MODE_INNER (mode
);
24048 /* The number of vector elements. */
24049 int n_elts
= XVECLEN (vals
, 0);
24050 /* The number of vector elements which are not constant. */
24052 rtx any_const
= NULL_RTX
;
24053 /* The first element of vals. */
24054 rtx v0
= XVECEXP (vals
, 0, 0);
24055 bool all_same
= true;
/* This is a special vec_init<M><N> where N is not an element mode but a
   vector mode with half the elements of M.  We expect to find two entries
   of mode N in VALS and we must put their concatenation into TARGET.  */
24060 if (XVECLEN (vals
, 0) == 2 && VECTOR_MODE_P (GET_MODE (XVECEXP (vals
, 0, 0))))
24062 machine_mode narrow_mode
= GET_MODE (XVECEXP (vals
, 0, 0));
24063 gcc_assert (GET_MODE_INNER (narrow_mode
) == inner_mode
24064 && known_eq (GET_MODE_SIZE (mode
),
24065 2 * GET_MODE_SIZE (narrow_mode
)));
24066 emit_insn (gen_aarch64_vec_concat (narrow_mode
, target
,
24067 XVECEXP (vals
, 0, 0),
24068 XVECEXP (vals
, 0, 1)));
24072 /* Count the number of variable elements to initialise. */
24073 for (int i
= 0; i
< n_elts
; ++i
)
24075 rtx x
= XVECEXP (vals
, 0, i
);
24076 if (!(CONST_INT_P (x
) || CONST_DOUBLE_P (x
)))
24081 all_same
&= rtx_equal_p (x
, v0
);
24084 /* No variable elements, hand off to aarch64_simd_make_constant which knows
24085 how best to handle this. */
24088 rtx constant
= aarch64_simd_make_constant (vals
);
24089 if (constant
!= NULL_RTX
)
24091 emit_move_insn (target
, constant
);
24096 /* Splat a single non-constant element if we can. */
24099 rtx x
= force_reg (inner_mode
, v0
);
24100 aarch64_emit_move (target
, gen_vec_duplicate (mode
, x
));
24104 enum insn_code icode
= optab_handler (vec_set_optab
, mode
);
24105 gcc_assert (icode
!= CODE_FOR_nothing
);
24107 /* If there are only variable elements, try to optimize
24108 the insertion using dup for the most common element
24109 followed by insertions. */
24111 /* The algorithm will fill matches[*][0] with the earliest matching element,
24112 and matches[X][1] with the count of duplicate elements (if X is the
24113 earliest element which has duplicates). */
24115 if (n_var
>= n_elts
- 1 && n_elts
<= 16)
24117 int matches
[16][2] = {0};
24118 for (int i
= 0; i
< n_elts
; i
++)
24120 for (int j
= 0; j
<= i
; j
++)
24122 if (rtx_equal_p (XVECEXP (vals
, 0, i
), XVECEXP (vals
, 0, j
)))
24130 int maxelement
= 0;
24132 rtx const_elem
= NULL_RTX
;
24133 int const_elem_pos
= 0;
24135 for (int i
= 0; i
< n_elts
; i
++)
24137 if (matches
[i
][1] > maxv
)
24140 maxv
= matches
[i
][1];
24142 if (CONST_INT_P (XVECEXP (vals
, 0, i
))
24143 || CONST_DOUBLE_P (XVECEXP (vals
, 0, i
)))
24145 const_elem_pos
= i
;
24146 const_elem
= XVECEXP (vals
, 0, i
);
24150 /* Create a duplicate of the most common element, unless all elements
24151 are equally useless to us, in which case just immediately set the
24152 vector register using the first element. */
24156 /* For vectors of two 64-bit elements, we can do even better. */
24158 && (inner_mode
== E_DImode
24159 || inner_mode
== E_DFmode
))
24162 rtx x0
= XVECEXP (vals
, 0, 0);
24163 rtx x1
= XVECEXP (vals
, 0, 1);
24164 /* Combine can pick up this case, but handling it directly
24165 here leaves clearer RTL.
24167 This is load_pair_lanes<mode>, and also gives us a clean-up
24168 for store_pair_lanes<mode>. */
24169 if (memory_operand (x0
, inner_mode
)
24170 && memory_operand (x1
, inner_mode
)
24171 && aarch64_mergeable_load_pair_p (mode
, x0
, x1
))
24174 if (inner_mode
== DFmode
)
24175 t
= gen_load_pair_lanesdf (target
, x0
, x1
);
24177 t
= gen_load_pair_lanesdi (target
, x0
, x1
);
24182 /* The subreg-move sequence below will move into lane zero of the
24183 vector register. For big-endian we want that position to hold
24184 the last element of VALS. */
24185 maxelement
= BYTES_BIG_ENDIAN
? n_elts
- 1 : 0;
/* If we have a single constant element, use that for duplicating
   instead.  */
24191 maxelement
= const_elem_pos
;
24192 aarch64_emit_move (target
, gen_vec_duplicate (mode
, const_elem
));
24196 rtx x
= force_reg (inner_mode
, XVECEXP (vals
, 0, maxelement
));
24197 aarch64_emit_move (target
, lowpart_subreg (mode
, x
, inner_mode
));
24202 rtx x
= force_reg (inner_mode
, XVECEXP (vals
, 0, maxelement
));
24203 aarch64_emit_move (target
, gen_vec_duplicate (mode
, x
));
24206 /* Insert the rest. */
24207 for (int i
= 0; i
< n_elts
; i
++)
24209 rtx x
= XVECEXP (vals
, 0, i
);
24210 if (matches
[i
][0] == maxelement
)
24212 x
= force_reg (inner_mode
, x
);
24213 emit_insn (GEN_FCN (icode
) (target
, x
, GEN_INT (i
)));
/* Initialise a vector which is part-variable.  We want to first try
   to build those lanes which are constant in the most efficient way we
   can.  */
24221 if (n_var
!= n_elts
)
24223 rtx copy
= copy_rtx (vals
);
24225 /* Load constant part of vector. We really don't care what goes into the
24226 parts we will overwrite, but we're more likely to be able to load the
24227 constant efficiently if it has fewer, larger, repeating parts
24228 (see aarch64_simd_valid_imm). */
24229 for (int i
= 0; i
< n_elts
; i
++)
24231 rtx x
= XVECEXP (vals
, 0, i
);
24232 if (CONST_INT_P (x
) || CONST_DOUBLE_P (x
))
24234 rtx subst
= any_const
;
24235 for (int bit
= n_elts
/ 2; bit
> 0; bit
/= 2)
24237 /* Look in the copied vector, as more elements are const. */
24238 rtx test
= XVECEXP (copy
, 0, i
^ bit
);
24239 if (CONST_INT_P (test
) || CONST_DOUBLE_P (test
))
24245 XVECEXP (copy
, 0, i
) = subst
;
24247 aarch64_expand_vector_init_fallback (target
, copy
);
24250 /* Insert the variable lanes directly. */
24251 for (int i
= 0; i
< n_elts
; i
++)
24253 rtx x
= XVECEXP (vals
, 0, i
);
24254 if (CONST_INT_P (x
) || CONST_DOUBLE_P (x
))
24256 x
= force_reg (inner_mode
, x
);
24257 emit_insn (GEN_FCN (icode
) (target
, x
, GEN_INT (i
)));
24261 /* Return even or odd half of VALS depending on EVEN_P. */
24264 aarch64_unzip_vector_init (machine_mode mode
, rtx vals
, bool even_p
)
24266 int n
= XVECLEN (vals
, 0);
24267 machine_mode new_mode
24268 = aarch64_simd_container_mode (GET_MODE_INNER (mode
),
24269 GET_MODE_BITSIZE (mode
).to_constant () / 2);
24270 rtvec vec
= rtvec_alloc (n
/ 2);
24271 for (int i
= 0; i
< n
/ 2; i
++)
24272 RTVEC_ELT (vec
, i
) = (even_p
) ? XVECEXP (vals
, 0, 2 * i
)
24273 : XVECEXP (vals
, 0, 2 * i
+ 1);
24274 return gen_rtx_PARALLEL (new_mode
, vec
);
24277 /* Return true if SET is a scalar move. */
24280 scalar_move_insn_p (rtx set
)
24282 rtx src
= SET_SRC (set
);
24283 rtx dest
= SET_DEST (set
);
24284 return (is_a
<scalar_mode
> (GET_MODE (dest
))
24285 && aarch64_mov_operand (src
, GET_MODE (dest
)));
24288 /* Similar to seq_cost, but ignore cost for scalar moves. */
24291 seq_cost_ignoring_scalar_moves (const rtx_insn
*seq
, bool speed
)
24295 for (; seq
; seq
= NEXT_INSN (seq
))
24296 if (NONDEBUG_INSN_P (seq
))
24298 if (rtx set
= single_set (seq
))
24300 if (!scalar_move_insn_p (set
))
24301 cost
+= set_rtx_cost (set
, speed
);
24305 int this_cost
= insn_cost (CONST_CAST_RTX_INSN (seq
), speed
);
24316 /* Expand a vector initialization sequence, such that TARGET is
24317 initialized to contain VALS. */
24320 aarch64_expand_vector_init (rtx target
, rtx vals
)
24322 /* Try decomposing the initializer into even and odd halves and
24323 then ZIP them together. Use the resulting sequence if it is
24324 strictly cheaper than loading VALS directly.
24326 Prefer the fallback sequence in the event of a tie, since it
24327 will tend to use fewer registers. */
24329 machine_mode mode
= GET_MODE (target
);
24330 int n_elts
= XVECLEN (vals
, 0);
24333 || maybe_ne (GET_MODE_BITSIZE (mode
), 128))
24335 aarch64_expand_vector_init_fallback (target
, vals
);
24342 for (int i
= 0; i
< 2; i
++)
24345 rtx new_vals
= aarch64_unzip_vector_init (mode
, vals
, i
== 0);
24346 rtx tmp_reg
= gen_reg_rtx (GET_MODE (new_vals
));
24347 aarch64_expand_vector_init (tmp_reg
, new_vals
);
24348 halves
[i
] = gen_rtx_SUBREG (mode
, tmp_reg
, 0);
24349 rtx_insn
*rec_seq
= get_insns ();
24351 costs
[i
] = seq_cost_ignoring_scalar_moves (rec_seq
, !optimize_size
);
24352 emit_insn (rec_seq
);
24355 rtvec v
= gen_rtvec (2, halves
[0], halves
[1]);
24356 rtx_insn
*zip1_insn
24357 = emit_set_insn (target
, gen_rtx_UNSPEC (mode
, v
, UNSPEC_ZIP1
));
24358 unsigned seq_total_cost
24359 = (!optimize_size
) ? std::max (costs
[0], costs
[1]) : costs
[0] + costs
[1];
24360 seq_total_cost
+= insn_cost (zip1_insn
, !optimize_size
);
24362 rtx_insn
*seq
= get_insns ();
24366 aarch64_expand_vector_init_fallback (target
, vals
);
24367 rtx_insn
*fallback_seq
= get_insns ();
24368 unsigned fallback_seq_cost
24369 = seq_cost_ignoring_scalar_moves (fallback_seq
, !optimize_size
);
24372 emit_insn (seq_total_cost
< fallback_seq_cost
? seq
: fallback_seq
);
24375 /* Emit RTL corresponding to:
24376 insr TARGET, ELEM. */
24379 emit_insr (rtx target
, rtx elem
)
24381 machine_mode mode
= GET_MODE (target
);
24382 scalar_mode elem_mode
= GET_MODE_INNER (mode
);
24383 elem
= force_reg (elem_mode
, elem
);
24385 insn_code icode
= optab_handler (vec_shl_insert_optab
, mode
);
24386 gcc_assert (icode
!= CODE_FOR_nothing
);
24387 emit_insn (GEN_FCN (icode
) (target
, target
, elem
));
24390 /* Subroutine of aarch64_sve_expand_vector_init for handling
24391 trailing constants.
24392 This function works as follows:
24393 (a) Create a new vector consisting of trailing constants.
24394 (b) Initialize TARGET with the constant vector using emit_move_insn.
24395 (c) Insert remaining elements in TARGET using insr.
   NELTS is the total number of elements in the original vector, while
   NELTS_REQD is the number of elements that are actually significant.

   ??? The heuristic used is to do the above only if the number of trailing
   constants is at least half the total number of elements.  May need
   fine-tuning.  */
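/* For example, for an 8-element vector { a, b, 1, 2, 3, 4, 5, 6 } the six
   trailing constants meet the "at least half" threshold, so TARGET is first
   loaded with the constant vector { 1, 2, 3, 4, 5, 6, 0, 0 } and then b and
   a are inserted with two INSRs, each of which shifts the vector up by one
   element and writes the new value into element 0.  */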
24404 aarch64_sve_expand_vector_init_handle_trailing_constants
24405 (rtx target
, const rtx_vector_builder
&builder
, int nelts
, int nelts_reqd
)
24407 machine_mode mode
= GET_MODE (target
);
24408 scalar_mode elem_mode
= GET_MODE_INNER (mode
);
24409 int n_trailing_constants
= 0;
24411 for (int i
= nelts_reqd
- 1;
24412 i
>= 0 && valid_for_const_vector_p (elem_mode
, builder
.elt (i
));
24414 n_trailing_constants
++;
24416 if (n_trailing_constants
>= nelts_reqd
/ 2)
24418 /* Try to use the natural pattern of BUILDER to extend the trailing
24419 constant elements to a full vector. Replace any variables in the
24420 extra elements with zeros.
24422 ??? It would be better if the builders supported "don't care"
24423 elements, with the builder filling in whichever elements
24424 give the most compact encoding. */
24425 rtx_vector_builder
v (mode
, nelts
, 1);
24426 for (int i
= 0; i
< nelts
; i
++)
24428 rtx x
= builder
.elt (i
+ nelts_reqd
- n_trailing_constants
);
24429 if (!valid_for_const_vector_p (elem_mode
, x
))
24430 x
= CONST0_RTX (elem_mode
);
24433 rtx const_vec
= v
.build ();
24434 emit_move_insn (target
, const_vec
);
24436 for (int i
= nelts_reqd
- n_trailing_constants
- 1; i
>= 0; i
--)
24437 emit_insr (target
, builder
.elt (i
));
24445 /* Subroutine of aarch64_sve_expand_vector_init.
24447 (a) Initialize TARGET by broadcasting element NELTS_REQD - 1 of BUILDER.
24448 (b) Skip trailing elements from BUILDER, which are the same as
24449 element NELTS_REQD - 1.
24450 (c) Insert earlier elements in reverse order in TARGET using insr. */
24453 aarch64_sve_expand_vector_init_insert_elems (rtx target
,
24454 const rtx_vector_builder
&builder
,
24457 machine_mode mode
= GET_MODE (target
);
24458 scalar_mode elem_mode
= GET_MODE_INNER (mode
);
24460 struct expand_operand ops
[2];
24461 enum insn_code icode
= optab_handler (vec_duplicate_optab
, mode
);
24462 gcc_assert (icode
!= CODE_FOR_nothing
);
24464 create_output_operand (&ops
[0], target
, mode
);
24465 create_input_operand (&ops
[1], builder
.elt (nelts_reqd
- 1), elem_mode
);
24466 expand_insn (icode
, 2, ops
);
24468 int ndups
= builder
.count_dups (nelts_reqd
- 1, -1, -1);
24469 for (int i
= nelts_reqd
- ndups
- 1; i
>= 0; i
--)
24470 emit_insr (target
, builder
.elt (i
));
24473 /* Subroutine of aarch64_sve_expand_vector_init to handle case
24474 when all trailing elements of builder are same.
24475 This works as follows:
24476 (a) Use expand_insn interface to broadcast last vector element in TARGET.
24477 (b) Insert remaining elements in TARGET using insr.
   ??? The heuristic used is to do the above if the number of identical
   trailing elements is at least 3/4 of the total number of elements,
   loosely based on the heuristic from mostly_zeros_p.  May need
   fine-tuning.  */
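/* For example, with nelts_reqd == 8 and a vector { a, b, x, x, x, x, x, x }
   the six trailing copies of x meet the 3/4 threshold: x is broadcast into
   TARGET and then b and a are inserted with two INSRs.  */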
24484 aarch64_sve_expand_vector_init_handle_trailing_same_elem
24485 (rtx target
, const rtx_vector_builder
&builder
, int nelts_reqd
)
24487 int ndups
= builder
.count_dups (nelts_reqd
- 1, -1, -1);
24488 if (ndups
>= (3 * nelts_reqd
) / 4)
24490 aarch64_sve_expand_vector_init_insert_elems (target
, builder
,
24491 nelts_reqd
- ndups
+ 1);
24498 /* Initialize register TARGET from BUILDER. NELTS is the constant number
24499 of elements in BUILDER.
24501 The function tries to initialize TARGET from BUILDER if it fits one
24502 of the special cases outlined below.
24504 Failing that, the function divides BUILDER into two sub-vectors:
24505 v_even = even elements of BUILDER;
24506 v_odd = odd elements of BUILDER;
24508 and recursively calls itself with v_even and v_odd.
24510 if (recursive call succeeded for v_even or v_odd)
24511 TARGET = zip (v_even, v_odd)
24513 The function returns true if it managed to build TARGET from BUILDER
24514 with one of the special cases, false otherwise.
24516 Example: {a, 1, b, 2, c, 3, d, 4}
24518 The vector gets divided into:
24519 v_even = {a, b, c, d}
24520 v_odd = {1, 2, 3, 4}
24522 aarch64_sve_expand_vector_init(v_odd) hits case 1 and
24523 initialize tmp2 from constant vector v_odd using emit_move_insn.
24525 aarch64_sve_expand_vector_init(v_even) fails since v_even contains
24526 4 elements, so we construct tmp1 from v_even using insr:
24533 TARGET = zip (tmp1, tmp2)
24534 which sets TARGET to {a, 1, b, 2, c, 3, d, 4}. */
24537 aarch64_sve_expand_vector_init (rtx target
, const rtx_vector_builder
&builder
,
24538 int nelts
, int nelts_reqd
)
24540 machine_mode mode
= GET_MODE (target
);
24542 /* Case 1: Vector contains trailing constants. */
24544 if (aarch64_sve_expand_vector_init_handle_trailing_constants
24545 (target
, builder
, nelts
, nelts_reqd
))
24548 /* Case 2: Vector contains leading constants. */
24550 rtx_vector_builder
rev_builder (mode
, nelts_reqd
, 1);
24551 for (int i
= 0; i
< nelts_reqd
; i
++)
24552 rev_builder
.quick_push (builder
.elt (nelts_reqd
- i
- 1));
24553 rev_builder
.finalize ();
24555 if (aarch64_sve_expand_vector_init_handle_trailing_constants
24556 (target
, rev_builder
, nelts
, nelts_reqd
))
24558 emit_insn (gen_aarch64_sve_rev (mode
, target
, target
));
24562 /* Case 3: Vector contains trailing same element. */
24564 if (aarch64_sve_expand_vector_init_handle_trailing_same_elem
24565 (target
, builder
, nelts_reqd
))
24568 /* Case 4: Vector contains leading same element. */
24570 if (aarch64_sve_expand_vector_init_handle_trailing_same_elem
24571 (target
, rev_builder
, nelts_reqd
) && nelts_reqd
== nelts
)
24573 emit_insn (gen_aarch64_sve_rev (mode
, target
, target
));
24577 /* Avoid recursing below 4-elements.
24578 ??? The threshold 4 may need fine-tuning. */
24580 if (nelts_reqd
<= 4)
24583 rtx_vector_builder
v_even (mode
, nelts
, 1);
24584 rtx_vector_builder
v_odd (mode
, nelts
, 1);
24586 for (int i
= 0; i
< nelts
* 2; i
+= 2)
24588 v_even
.quick_push (builder
.elt (i
));
24589 v_odd
.quick_push (builder
.elt (i
+ 1));
24592 v_even
.finalize ();
24595 rtx tmp1
= gen_reg_rtx (mode
);
24596 bool did_even_p
= aarch64_sve_expand_vector_init (tmp1
, v_even
,
24597 nelts
, nelts_reqd
/ 2);
24599 rtx tmp2
= gen_reg_rtx (mode
);
24600 bool did_odd_p
= aarch64_sve_expand_vector_init (tmp2
, v_odd
,
24601 nelts
, nelts_reqd
/ 2);
24603 if (!did_even_p
&& !did_odd_p
)
24606 /* Initialize v_even and v_odd using INSR if it didn't match any of the
24607 special cases and zip v_even, v_odd. */
24610 aarch64_sve_expand_vector_init_insert_elems (tmp1
, v_even
, nelts_reqd
/ 2);
24613 aarch64_sve_expand_vector_init_insert_elems (tmp2
, v_odd
, nelts_reqd
/ 2);
24615 rtvec v
= gen_rtvec (2, tmp1
, tmp2
);
24616 emit_set_insn (target
, gen_rtx_UNSPEC (mode
, v
, UNSPEC_ZIP1
));
24620 /* Initialize register TARGET from the elements in PARALLEL rtx VALS. */
24623 aarch64_sve_expand_vector_init (rtx target
, rtx vals
)
24625 machine_mode mode
= GET_MODE (target
);
24626 int nelts
= XVECLEN (vals
, 0);
24628 rtx_vector_builder
v (mode
, nelts
, 1);
24629 for (int i
= 0; i
< nelts
; i
++)
24630 v
.quick_push (XVECEXP (vals
, 0, i
));
24633 /* If neither sub-vectors of v could be initialized specially,
24634 then use INSR to insert all elements from v into TARGET.
24635 ??? This might not be optimal for vectors with large
24636 initializers like 16-element or above.
24637 For nelts < 4, it probably isn't useful to handle specially. */
24640 || !aarch64_sve_expand_vector_init (target
, v
, nelts
, nelts
))
24641 aarch64_sve_expand_vector_init_insert_elems (target
, v
, nelts
);
/* Initialize register TARGET from the two vector subelements in PARALLEL
   rtx VALS.  */
24648 aarch64_sve_expand_vector_init_subvector (rtx target
, rtx vals
)
24650 machine_mode mode
= GET_MODE (target
);
24651 int nelts
= XVECLEN (vals
, 0);
24653 gcc_assert (nelts
% 2 == 0);
/* We must be concatenating vectors.  */
24656 machine_mode elem_mode
= GET_MODE (XVECEXP (vals
, 0, 0));
24657 gcc_assert (VECTOR_MODE_P (elem_mode
));
24659 auto_vec
<rtx
> worklist
;
24660 machine_mode wider_mode
= elem_mode
;
24662 for (int i
= 0; i
< nelts
; i
++)
24663 worklist
.safe_push (force_reg (elem_mode
, XVECEXP (vals
, 0, i
)));
24665 /* Keep widening pairwise to have maximum throughput. */
24669 = related_vector_mode (wider_mode
, GET_MODE_INNER (wider_mode
),
24670 GET_MODE_NUNITS (wider_mode
) * 2).require ();
24672 for (int i
= 0; i
< nelts
; i
+= 2)
24674 rtx arg0
= worklist
[i
];
24675 rtx arg1
= worklist
[i
+1];
24676 gcc_assert (GET_MODE (arg0
) == GET_MODE (arg1
));
24678 rtx tmp
= gen_reg_rtx (wider_mode
);
24679 emit_insn (gen_aarch64_pack_partial (wider_mode
, tmp
, arg0
, arg1
));
24680 worklist
[i
/ 2] = tmp
;
24686 gcc_assert (wider_mode
== mode
);
24687 emit_move_insn (target
, worklist
[0]);
24692 /* Check whether VALUE is a vector constant in which every element
24693 is either a power of 2 or a negated power of 2. If so, return
24694 a constant vector of log2s, and flip CODE between PLUS and MINUS
24695 if VALUE contains negated powers of 2. Return NULL_RTX otherwise. */
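/* For example, multiplying by { -4, -4, -4, -4 } with CODE == PLUS returns
   the shift vector { 2, 2, 2, 2 } and flips CODE to MINUS, so that
   a + b * -4 can be generated as a - (b << 2).  */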
24698 aarch64_convert_mult_to_shift (rtx value
, rtx_code
&code
)
24700 if (!CONST_VECTOR_P (value
))
24703 rtx_vector_builder builder
;
24704 if (!builder
.new_unary_operation (GET_MODE (value
), value
, false))
24707 scalar_mode int_mode
= GET_MODE_INNER (GET_MODE (value
));
24708 /* 1 if the result of the multiplication must be negated,
24709 0 if it mustn't, or -1 if we don't yet care. */
24711 unsigned int encoded_nelts
= const_vector_encoded_nelts (value
);
24712 for (unsigned int i
= 0; i
< encoded_nelts
; ++i
)
24714 rtx elt
= CONST_VECTOR_ENCODED_ELT (value
, i
);
24715 if (!CONST_SCALAR_INT_P (elt
))
24717 rtx_mode_t
val (elt
, int_mode
);
24718 wide_int pow2
= wi::neg (val
);
24721 /* It matters whether we negate or not. Make that choice,
24722 and make sure that it's consistent with previous elements. */
24723 if (negate
== !wi::neg_p (val
))
24725 negate
= wi::neg_p (val
);
24729 /* POW2 is now the value that we want to be a power of 2. */
24730 int shift
= wi::exact_log2 (pow2
);
24733 builder
.quick_push (gen_int_mode (shift
, int_mode
));
24736 /* PLUS and MINUS are equivalent; canonicalize on PLUS. */
24738 else if (negate
== 1)
24739 code
= code
== PLUS
? MINUS
: PLUS
;
24740 return builder
.build ();
24743 /* Prepare for an integer SVE multiply-add or multiply-subtract pattern;
24744 CODE is PLUS for the former and MINUS for the latter. OPERANDS is the
24745 operands array, in the same order as for fma_optab. Return true if
24746 the function emitted all the necessary instructions, false if the caller
24747 should generate the pattern normally with the new OPERANDS array. */
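/* For example, if OPERANDS[2] is a constant vector of 8s, the multiply-add
   is rewritten as a vector shift followed by an add:
   OPERANDS[0] = OPERANDS[3] + (OPERANDS[1] << 3).  */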
24750 aarch64_prepare_sve_int_fma (rtx
*operands
, rtx_code code
)
24752 machine_mode mode
= GET_MODE (operands
[0]);
24753 if (rtx shifts
= aarch64_convert_mult_to_shift (operands
[2], code
))
24755 rtx product
= expand_binop (mode
, vashl_optab
, operands
[1], shifts
,
24756 NULL_RTX
, true, OPTAB_DIRECT
);
24757 force_expand_binop (mode
, code
== PLUS
? add_optab
: sub_optab
,
24758 operands
[3], product
, operands
[0], true,
24762 operands
[2] = force_reg (mode
, operands
[2]);
24766 /* Likewise, but for a conditional pattern. */
24769 aarch64_prepare_sve_cond_int_fma (rtx
*operands
, rtx_code code
)
24771 machine_mode mode
= GET_MODE (operands
[0]);
24772 if (rtx shifts
= aarch64_convert_mult_to_shift (operands
[3], code
))
24774 rtx product
= expand_binop (mode
, vashl_optab
, operands
[2], shifts
,
24775 NULL_RTX
, true, OPTAB_DIRECT
);
24776 emit_insn (gen_cond (code
, mode
, operands
[0], operands
[1],
24777 operands
[4], product
, operands
[5]));
24780 operands
[3] = force_reg (mode
, operands
[3]);
24784 static unsigned HOST_WIDE_INT
24785 aarch64_shift_truncation_mask (machine_mode mode
)
24787 if (!SHIFT_COUNT_TRUNCATED
|| aarch64_vector_data_mode_p (mode
))
24789 return GET_MODE_UNIT_BITSIZE (mode
) - 1;
24792 /* Select a format to encode pointers in exception handling data. */
24794 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED
, int global
)
24797 switch (aarch64_cmodel
)
24799 case AARCH64_CMODEL_TINY
:
24800 case AARCH64_CMODEL_TINY_PIC
:
24801 case AARCH64_CMODEL_SMALL
:
24802 case AARCH64_CMODEL_SMALL_PIC
:
24803 case AARCH64_CMODEL_SMALL_SPIC
:
/* text+got+data < 4Gb.  4-byte signed relocs are sufficient
   for everything.  */
24806 type
= DW_EH_PE_sdata4
;
24809 /* No assumptions here. 8-byte relocs required. */
24810 type
= DW_EH_PE_sdata8
;
24813 return (global
? DW_EH_PE_indirect
: 0) | DW_EH_PE_pcrel
| type
;
24816 /* Output .variant_pcs for aarch64_vector_pcs function symbols. */
24819 aarch64_asm_output_variant_pcs (FILE *stream
, const tree decl
, const char* name
)
24821 if (TREE_CODE (decl
) == FUNCTION_DECL
)
24823 arm_pcs pcs
= (arm_pcs
) fndecl_abi (decl
).id ();
24824 if (pcs
== ARM_PCS_SIMD
|| pcs
== ARM_PCS_SVE
)
24826 fprintf (stream
, "\t.variant_pcs\t");
24827 assemble_name (stream
, name
);
24828 fprintf (stream
, "\n");
24833 /* The last .arch and .tune assembly strings that we printed. */
24834 static std::string aarch64_last_printed_arch_string
;
24835 static std::string aarch64_last_printed_tune_string
;
24837 /* Implement ASM_DECLARE_FUNCTION_NAME. Output the ISA features used
24838 by the function fndecl. */
24841 aarch64_declare_function_name (FILE *stream
, const char* name
,
24844 tree target_parts
= DECL_FUNCTION_SPECIFIC_TARGET (fndecl
);
24846 struct cl_target_option
*targ_options
;
24848 targ_options
= TREE_TARGET_OPTION (target_parts
);
24850 targ_options
= TREE_TARGET_OPTION (target_option_current_node
);
24851 gcc_assert (targ_options
);
24853 auto isa_flags
= aarch64_get_asm_isa_flags (targ_options
);
24854 aarch64_arch arch
= targ_options
->x_selected_arch
;
24855 std::string to_print
24856 = aarch64_get_arch_string_for_assembler (arch
, isa_flags
);
24857 /* Only update the assembler .arch string if it is distinct from the last
24858 such string we printed. */
24859 if (to_print
!= aarch64_last_printed_arch_string
)
24861 asm_fprintf (asm_out_file
, "\t.arch %s\n", to_print
.c_str ());
24862 aarch64_last_printed_arch_string
= to_print
;
24865 /* Print the cpu name we're tuning for in the comments, might be
24866 useful to readers of the generated asm. Do it only when it changes
24867 from function to function and verbose assembly is requested. */
24868 const struct processor
*this_tune
24869 = aarch64_get_tune_cpu (targ_options
->x_selected_tune
);
24871 if (flag_debug_asm
&& aarch64_last_printed_tune_string
!= this_tune
->name
)
24873 asm_fprintf (asm_out_file
, "\t" ASM_COMMENT_START
".tune %s\n",
24875 aarch64_last_printed_tune_string
= this_tune
->name
;
24878 aarch64_asm_output_variant_pcs (stream
, fndecl
, name
);
24880 /* Don't forget the type directive for ELF. */
24881 ASM_OUTPUT_TYPE_DIRECTIVE (stream
, name
, "function");
24882 ASM_OUTPUT_FUNCTION_LABEL (stream
, name
, fndecl
);
24884 cfun
->machine
->label_is_assembled
= true;
24887 /* Implement PRINT_PATCHABLE_FUNCTION_ENTRY. */
24890 aarch64_print_patchable_function_entry (FILE *file
,
24891 unsigned HOST_WIDE_INT patch_area_size
,
24894 if (!cfun
->machine
->label_is_assembled
)
24896 /* Emit the patching area before the entry label, if any. */
24897 default_print_patchable_function_entry (file
, patch_area_size
,
24902 rtx pa
= gen_patchable_area (GEN_INT (patch_area_size
),
24903 GEN_INT (record_p
));
24904 basic_block bb
= ENTRY_BLOCK_PTR_FOR_FN (cfun
)->next_bb
;
24906 if (!aarch_bti_enabled ()
24907 || cgraph_node::get (cfun
->decl
)->only_called_directly_p ())
24909 /* Emit the patchable_area at the beginning of the function. */
24910 rtx_insn
*insn
= emit_insn_before (pa
, BB_HEAD (bb
));
24911 INSN_ADDRESSES_NEW (insn
, -1);
24915 rtx_insn
*insn
= next_real_nondebug_insn (get_insns ());
24918 || GET_CODE (PATTERN (insn
)) != UNSPEC_VOLATILE
24919 || XINT (PATTERN (insn
), 1) != UNSPECV_BTI_C
)
24921 /* Emit a BTI_C. */
24922 insn
= emit_insn_before (gen_bti_c (), BB_HEAD (bb
));
24925 /* Emit the patchable_area after BTI_C. */
24926 insn
= emit_insn_after (pa
, insn
);
24927 INSN_ADDRESSES_NEW (insn
, -1);
24930 /* Output patchable area. */
24933 aarch64_output_patchable_area (unsigned int patch_area_size
, bool record_p
)
24935 default_print_patchable_function_entry (asm_out_file
, patch_area_size
,
24939 /* Implement ASM_OUTPUT_DEF_FROM_DECLS. Output .variant_pcs for aliases. */
24942 aarch64_asm_output_alias (FILE *stream
, const tree decl
, const tree target
)
24944 const char *name
= XSTR (XEXP (DECL_RTL (decl
), 0), 0);
24945 const char *value
= IDENTIFIER_POINTER (target
);
24946 aarch64_asm_output_variant_pcs (stream
, decl
, name
);
24947 ASM_OUTPUT_DEF (stream
, name
, value
);
24950 /* Implement ASM_OUTPUT_EXTERNAL. Output .variant_pcs for undefined
24951 function symbol references. */
24954 aarch64_asm_output_external (FILE *stream
, tree decl
, const char* name
)
24956 default_elf_asm_output_external (stream
, decl
, name
);
24957 aarch64_asm_output_variant_pcs (stream
, decl
, name
);
24960 /* Triggered after a .cfi_startproc directive is emitted into the assembly file.
24961 Used to output the .cfi_b_key_frame directive when signing the current
24962 function with the B key. */
24965 aarch64_post_cfi_startproc (FILE *f
, tree ignored ATTRIBUTE_UNUSED
)
24967 if (cfun
->machine
->frame
.laid_out
&& aarch64_return_address_signing_enabled ()
24968 && aarch64_ra_sign_key
== AARCH64_KEY_B
)
24969 asm_fprintf (f
, "\t.cfi_b_key_frame\n");
24972 /* Implements TARGET_ASM_FILE_START. Output the assembly header. */
24975 aarch64_start_file (void)
24977 struct cl_target_option
*default_options
24978 = TREE_TARGET_OPTION (target_option_default_node
);
24980 aarch64_arch default_arch
= default_options
->x_selected_arch
;
24981 auto default_isa_flags
= aarch64_get_asm_isa_flags (default_options
);
24982 std::string arch_string
24983 = aarch64_get_arch_string_for_assembler (default_arch
, default_isa_flags
);
24984 aarch64_last_printed_arch_string
= arch_string
;
24985 aarch64_last_printed_tune_string
= "";
24986 asm_fprintf (asm_out_file
, "\t.arch %s\n",
24987 arch_string
.c_str ());
24989 default_file_start ();
24992 /* Emit load exclusive. */
24995 aarch64_emit_load_exclusive (machine_mode mode
, rtx rval
,
24996 rtx mem
, rtx model_rtx
)
24998 if (mode
== TImode
)
24999 emit_insn (gen_aarch64_load_exclusive_pair (gen_lowpart (DImode
, rval
),
25000 gen_highpart (DImode
, rval
),
25003 emit_insn (gen_aarch64_load_exclusive (mode
, rval
, mem
, model_rtx
));
25006 /* Emit store exclusive. */
25009 aarch64_emit_store_exclusive (machine_mode mode
, rtx bval
,
25010 rtx mem
, rtx rval
, rtx model_rtx
)
25012 if (mode
== TImode
)
25013 emit_insn (gen_aarch64_store_exclusive_pair
25014 (bval
, mem
, operand_subword (rval
, 0, 0, TImode
),
25015 operand_subword (rval
, 1, 0, TImode
), model_rtx
));
25017 emit_insn (gen_aarch64_store_exclusive (mode
, bval
, mem
, rval
, model_rtx
));
/* Emit jump instruction INSN and mark it as unlikely to be taken.  */
25023 aarch64_emit_unlikely_jump (rtx insn
)
25025 rtx_insn
*jump
= emit_jump_insn (insn
);
25026 add_reg_br_prob_note (jump
, profile_probability::very_unlikely ());
25029 /* We store the names of the various atomic helpers in a 5x5 array.
25030 Return the libcall function given MODE, MODEL and NAMES. */
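/* For example, a 4-byte compare-and-swap with the C++11 acquire memory
   model maps to the libgcc helper "__aarch64_cas4_acq".  */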
25033 aarch64_atomic_ool_func(machine_mode mode
, rtx model_rtx
,
25034 const atomic_ool_names
*names
)
25036 memmodel model
= memmodel_from_int (INTVAL (model_rtx
));
25037 int mode_idx
, model_idx
;
25057 gcc_unreachable ();
25062 case MEMMODEL_RELAXED
:
25065 case MEMMODEL_CONSUME
:
25066 case MEMMODEL_ACQUIRE
:
25069 case MEMMODEL_RELEASE
:
25072 case MEMMODEL_ACQ_REL
:
25073 case MEMMODEL_SEQ_CST
:
25076 case MEMMODEL_SYNC_ACQUIRE
:
25077 case MEMMODEL_SYNC_RELEASE
:
25078 case MEMMODEL_SYNC_SEQ_CST
:
25082 gcc_unreachable ();
25085 return init_one_libfunc_visibility (names
->str
[mode_idx
][model_idx
],
25086 VISIBILITY_HIDDEN
);
#define DEF0(B, N) \
  { "__aarch64_" #B #N "_relax", \
    "__aarch64_" #B #N "_acq", \
    "__aarch64_" #B #N "_rel", \
    "__aarch64_" #B #N "_acq_rel", \
    "__aarch64_" #B #N "_sync" }

#define DEF4(B)  DEF0(B, 1), DEF0(B, 2), DEF0(B, 4), DEF0(B, 8), \
		 { NULL, NULL, NULL, NULL }
#define DEF5(B)  DEF0(B, 1), DEF0(B, 2), DEF0(B, 4), DEF0(B, 8), DEF0(B, 16)

static const atomic_ool_names aarch64_ool_cas_names = { { DEF5(cas) } };
const atomic_ool_names aarch64_ool_swp_names = { { DEF4(swp) } };
const atomic_ool_names aarch64_ool_ldadd_names = { { DEF4(ldadd) } };
const atomic_ool_names aarch64_ool_ldset_names = { { DEF4(ldset) } };
const atomic_ool_names aarch64_ool_ldclr_names = { { DEF4(ldclr) } };
const atomic_ool_names aarch64_ool_ldeor_names = { { DEF4(ldeor) } };
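
/* As an illustration of the table layout above (a reading of the DEF0/DEF4/
   DEF5 macros, not an exhaustive list): the names are indexed first by access
   size and then by memory model, so a 4-byte SWP with acquire semantics
   resolves to "__aarch64_swp4_acq" and a 16-byte CAS with sequential
   consistency resolves to "__aarch64_cas16_acq_rel".  Only the CAS table
   (DEF5) has a 16-byte row; the DEF4 tables leave that row unset.  */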
/* Expand a compare and swap pattern.  */

void
aarch64_expand_compare_and_swap (rtx operands[])
{
  rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x, cc_reg;
  machine_mode mode, r_mode;

  bval = operands[0];
  rval = operands[1];
  mem = operands[2];
  oldval = operands[3];
  newval = operands[4];
  is_weak = operands[5];
  mod_s = operands[6];
  mod_f = operands[7];
  mode = GET_MODE (mem);

  /* Normally the succ memory model must be stronger than fail, but in the
     unlikely event of fail being ACQUIRE and succ being RELEASE we need to
     promote succ to ACQ_REL so that we don't lose the acquire semantics.  */
  if (is_mm_acquire (memmodel_from_int (INTVAL (mod_f)))
      && is_mm_release (memmodel_from_int (INTVAL (mod_s))))
    mod_s = GEN_INT (MEMMODEL_ACQ_REL);

  r_mode = mode;
  if (mode == QImode || mode == HImode)
    {
      r_mode = SImode;
      rval = gen_reg_rtx (r_mode);
    }

  if (TARGET_LSE)
    {
      /* The CAS insn requires oldval and rval overlap, but we need to
	 have a copy of oldval saved across the operation to tell if
	 the operation is successful.  */
      if (reg_overlap_mentioned_p (rval, oldval))
	rval = copy_to_mode_reg (r_mode, oldval);
      else
	emit_move_insn (rval, gen_lowpart (r_mode, oldval));
      if (mode == TImode)
	newval = force_reg (mode, newval);

      emit_insn (gen_aarch64_compare_and_swap_lse (mode, rval, mem,
						   newval, mod_s));
      cc_reg = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
    }
  else if (TARGET_OUTLINE_ATOMICS)
    {
      /* Oldval must satisfy compare afterward.  */
      if (!aarch64_plus_operand (oldval, mode))
	oldval = force_reg (mode, oldval);
      rtx func = aarch64_atomic_ool_func (mode, mod_s, &aarch64_ool_cas_names);
      rval = emit_library_call_value (func, NULL_RTX, LCT_NORMAL, r_mode,
				      oldval, mode, newval, mode,
				      XEXP (mem, 0), Pmode);
      cc_reg = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
    }
  else
    {
      /* The oldval predicate varies by mode.  Test it and force to reg.  */
      insn_code code = code_for_aarch64_compare_and_swap (mode);
      if (!insn_data[code].operand[2].predicate (oldval, mode))
	oldval = force_reg (mode, oldval);

      emit_insn (GEN_FCN (code) (rval, mem, oldval, newval,
				 is_weak, mod_s, mod_f));
      cc_reg = gen_rtx_REG (CCmode, CC_REGNUM);
    }

  if (r_mode != mode)
    rval = gen_lowpart (mode, rval);
  emit_move_insn (operands[1], rval);

  x = gen_rtx_EQ (SImode, cc_reg, const0_rtx);
  emit_insn (gen_rtx_SET (bval, x));
}
/* Emit a barrier, that is appropriate for memory model MODEL, at the end of a
   sequence implementing an atomic operation.  */

static void
aarch64_emit_post_barrier (enum memmodel model)
{
  const enum memmodel base_model = memmodel_base (model);

  if (is_mm_sync (model)
      && (base_model == MEMMODEL_ACQUIRE
	  || base_model == MEMMODEL_ACQ_REL
	  || base_model == MEMMODEL_SEQ_CST))
    emit_insn (gen_mem_thread_fence (GEN_INT (MEMMODEL_SEQ_CST)));
}
/* Split a compare and swap pattern.  */

void
aarch64_split_compare_and_swap (rtx operands[])
{
  /* Split after prolog/epilog to avoid interactions with shrinkwrapping.  */
  gcc_assert (epilogue_completed);

  rtx rval, mem, oldval, newval, scratch, x, model_rtx;
  machine_mode mode;
  bool is_weak;
  rtx_code_label *label1, *label2;
  enum memmodel model;

  rval = operands[0];
  mem = operands[1];
  oldval = operands[2];
  newval = operands[3];
  model_rtx = operands[5];
  scratch = operands[7];
  mode = GET_MODE (mem);
  model = memmodel_from_int (INTVAL (model_rtx));
  is_weak = operands[4] != const0_rtx && mode != TImode;

  /* When OLDVAL is zero and we want the strong version we can emit a tighter
    loop:
    .label1:
	LD[A]XR	rval, [mem]
	CBNZ	rval, .label2
	ST[L]XR	scratch, newval, [mem]
	CBNZ	scratch, .label1
    .label2:
	CMP	rval, 0.  */
  bool strong_zero_p = (!is_weak && !aarch64_track_speculation &&
			oldval == const0_rtx && mode != TImode);

  label1 = NULL;
  if (!is_weak)
    {
      label1 = gen_label_rtx ();
      emit_label (label1);
    }
  label2 = gen_label_rtx ();

  /* The initial load can be relaxed for a __sync operation since a final
     barrier will be emitted to stop code hoisting.  */
  if (is_mm_sync (model))
    aarch64_emit_load_exclusive (mode, rval, mem, GEN_INT (MEMMODEL_RELAXED));
  else
    aarch64_emit_load_exclusive (mode, rval, mem, model_rtx);

  if (strong_zero_p)
    x = gen_rtx_NE (VOIDmode, rval, const0_rtx);
  else
    {
      rtx cc_reg = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
      x = gen_rtx_NE (VOIDmode, cc_reg, const0_rtx);
    }
  x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
			    gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
  aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));

  aarch64_emit_store_exclusive (mode, scratch, mem, newval, model_rtx);

  if (!is_weak)
    {
      x = aarch64_gen_compare_zero_and_branch (NE, scratch, label1);
      aarch64_emit_unlikely_jump (x);
    }
  else
    aarch64_gen_compare_reg (NE, scratch, const0_rtx);

  /* 128-bit LDAXP is not atomic unless STLXP succeeds.  So for a mismatch,
     store the returned value and loop if the STLXP fails.  */
  if (mode == TImode)
    {
      rtx_code_label *label3 = gen_label_rtx ();
      emit_jump_insn (gen_rtx_SET (pc_rtx, gen_rtx_LABEL_REF (Pmode, label3)));
      emit_barrier ();

      emit_label (label2);
      aarch64_emit_store_exclusive (mode, scratch, mem, rval, model_rtx);

      if (!is_weak)
	{
	  x = aarch64_gen_compare_zero_and_branch (NE, scratch, label1);
	  aarch64_emit_unlikely_jump (x);
	}
      else
	aarch64_gen_compare_reg (NE, scratch, const0_rtx);

      label2 = label3;
    }

  emit_label (label2);

  /* If we used a CBNZ in the exchange loop emit an explicit compare with RVAL
     to set the condition flags.  If this is not used it will be removed by
     later passes.  */
  if (strong_zero_p)
    aarch64_gen_compare_reg (NE, rval, const0_rtx);

  /* Emit any final barrier needed for a __sync operation.  */
  if (is_mm_sync (model))
    aarch64_emit_post_barrier (model);
}
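
/* Illustrative only (assumed typical output; the exact sequence depends on
   the memory model, -moutline-atomics and -mtrack-speculation): a strong
   __atomic_compare_exchange_n on a 32-bit value, without LSE, splits into a
   loop of the shape

	.L1:	ldaxr	w0, [x1]
		cmp	w0, w2
		bne	.L2
		stlxr	w3, w4, [x1]
		cbnz	w3, .L1
	.L2:

   which is the pattern the code above emits via the load/store-exclusive
   helpers and aarch64_emit_unlikely_jump.  */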
/* Split an atomic operation.  */

void
aarch64_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
			 rtx value, rtx model_rtx, rtx cond)
{
  /* Split after prolog/epilog to avoid interactions with shrinkwrapping.  */
  gcc_assert (epilogue_completed);

  machine_mode mode = GET_MODE (mem);
  machine_mode wmode = (mode == DImode ? DImode : SImode);
  const enum memmodel model = memmodel_from_int (INTVAL (model_rtx));
  const bool is_sync = is_mm_sync (model);
  rtx_code_label *label;
  rtx x;

  /* Split the atomic operation into a sequence.  */
  label = gen_label_rtx ();
  emit_label (label);

  if (new_out)
    new_out = gen_lowpart (wmode, new_out);
  if (old_out)
    old_out = gen_lowpart (wmode, old_out);
  else
    old_out = new_out;
  value = simplify_gen_subreg (wmode, value, mode, 0);

  /* The initial load can be relaxed for a __sync operation since a final
     barrier will be emitted to stop code hoisting.  */
  if (is_sync)
    aarch64_emit_load_exclusive (mode, old_out, mem,
				 GEN_INT (MEMMODEL_RELAXED));
  else
    aarch64_emit_load_exclusive (mode, old_out, mem, model_rtx);

  switch (code)
    {
    case SET:
      new_out = value;
      break;

    case NOT:
      x = gen_rtx_AND (wmode, old_out, value);
      emit_insn (gen_rtx_SET (new_out, x));
      x = gen_rtx_NOT (wmode, new_out);
      emit_insn (gen_rtx_SET (new_out, x));
      break;

    case MINUS:
      if (CONST_INT_P (value))
	{
	  value = GEN_INT (-UINTVAL (value));
	  code = PLUS;
	}
      /* Fall through.  */

    default:
      x = gen_rtx_fmt_ee (code, wmode, old_out, value);
      emit_insn (gen_rtx_SET (new_out, x));
      break;
    }

  aarch64_emit_store_exclusive (mode, cond, mem,
				gen_lowpart (mode, new_out), model_rtx);

  x = aarch64_gen_compare_zero_and_branch (NE, cond, label);
  aarch64_emit_unlikely_jump (x);

  /* Emit any final barrier needed for a __sync operation.  */
  if (is_sync)
    aarch64_emit_post_barrier (model);
}
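
/* Rough sketch of what the split above produces for, say,
   __atomic_fetch_add (ptr, 1, __ATOMIC_RELAXED) without LSE (assumed
   typical output; register choices are arbitrary):

	.L1:	ldxr	w0, [x1]	// old_out
		add	w2, w0, #1	// new_out = code (old_out, value)
		stxr	w3, w2, [x1]	// cond
		cbnz	w3, .L1

   A __sync variant would instead load with a relaxed model and append a
   final barrier via aarch64_emit_post_barrier.  */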
static void
aarch64_init_libfuncs (void)
{
  /* Half-precision float operations.  The compiler handles all operations
     with NULL libfuncs by converting to SFmode.  */

  /* Conversions.  */
  set_conv_libfunc (trunc_optab, HFmode, SFmode, "__gnu_f2h_ieee");
  set_conv_libfunc (sext_optab, SFmode, HFmode, "__gnu_h2f_ieee");

  /* Arithmetic.  */
  set_optab_libfunc (add_optab, HFmode, NULL);
  set_optab_libfunc (sdiv_optab, HFmode, NULL);
  set_optab_libfunc (smul_optab, HFmode, NULL);
  set_optab_libfunc (neg_optab, HFmode, NULL);
  set_optab_libfunc (sub_optab, HFmode, NULL);

  /* Comparisons.  */
  set_optab_libfunc (eq_optab, HFmode, NULL);
  set_optab_libfunc (ne_optab, HFmode, NULL);
  set_optab_libfunc (lt_optab, HFmode, NULL);
  set_optab_libfunc (le_optab, HFmode, NULL);
  set_optab_libfunc (ge_optab, HFmode, NULL);
  set_optab_libfunc (gt_optab, HFmode, NULL);
  set_optab_libfunc (unord_optab, HFmode, NULL);
}
/* Target hook for c_mode_for_suffix.  */
static machine_mode
aarch64_c_mode_for_suffix (char suffix)
{
  if (suffix == 'q')
    return TFmode;

  return VOIDmode;
}

/* Return true iff X, with mode MODE, can be represented by a quarter-precision
   floating point immediate operand.  Note, we cannot represent 0.0.  */

bool
aarch64_float_const_representable_p (rtx x)
{
  x = unwrap_const_vec_duplicate (x);
  machine_mode mode = GET_MODE (x);
  if (!CONST_DOUBLE_P (x))
    return false;

  if ((mode == HFmode && !TARGET_FP_F16INST)
      || mode == BFmode)
    return false;

  REAL_VALUE_TYPE r = *CONST_DOUBLE_REAL_VALUE (x);

  return aarch64_real_float_const_representable_p (r);
}
/* Returns the string with the instruction for the SIMD immediate
   CONST_VECTOR of MODE and WIDTH.  WHICH selects a move, and (bic),
   orr or eor.  */
char*
aarch64_output_simd_imm (rtx const_vector, unsigned width,
			 enum simd_immediate_check which)
{
  bool is_valid;
  static char templ[40];
  const char *mnemonic;
  const char *shift_op;
  unsigned int lane_count = 0;
  char element_char;

  struct simd_immediate_info info;

  is_valid = aarch64_simd_valid_imm (const_vector, &info, which);
  gcc_assert (is_valid);

  element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
  lane_count = width / GET_MODE_BITSIZE (info.elt_mode);

  if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
    {
      gcc_assert (info.insn == simd_immediate_info::MOV
		  && info.u.mov.shift == 0);
      /* For FP zero change it to a CONST_INT 0 and use the integer SIMD
	 move immediate path.  */
      if (aarch64_float_const_zero_rtx_p (info.u.mov.value))
	info.u.mov.value = GEN_INT (0);
      else
	{
	  const unsigned int buf_size = 20;
	  char float_buf[buf_size] = {'\0'};
	  real_to_decimal_for_mode (float_buf,
				    CONST_DOUBLE_REAL_VALUE (info.u.mov.value),
				    buf_size, buf_size, 1, info.elt_mode);

	  if (lane_count == 1)
	    snprintf (templ, sizeof (templ), "fmov\t%%d0, %s", float_buf);
	  else
	    snprintf (templ, sizeof (templ), "fmov\t%%0.%d%c, %s",
		      lane_count, element_char, float_buf);
	  return templ;
	}
    }

  gcc_assert (CONST_INT_P (info.u.mov.value));

  if (which == AARCH64_CHECK_MOV)
    {
      if (info.insn == simd_immediate_info::INDEX)
	{
	  gcc_assert (TARGET_SVE);
	  snprintf (templ, sizeof (templ), "index\t%%Z0.%c, #"
		    HOST_WIDE_INT_PRINT_DEC ", #" HOST_WIDE_INT_PRINT_DEC,
		    element_char, INTVAL (info.u.index.base),
		    INTVAL (info.u.index.step));
	  return templ;
	}

      if (info.insn == simd_immediate_info::SVE_MOV)
	{
	  gcc_assert (TARGET_SVE);
	  snprintf (templ, sizeof (templ), "mov\t%%Z0.%c, #" HOST_WIDE_INT_PRINT_DEC,
		    element_char, INTVAL (info.u.mov.value));
	  return templ;
	}

      mnemonic = info.insn == simd_immediate_info::MVN ? "mvni" : "movi";
      shift_op = (info.u.mov.modifier == simd_immediate_info::MSL
		  ? "msl" : "lsl");
      if (lane_count == 1)
	snprintf (templ, sizeof (templ), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX,
		  mnemonic, UINTVAL (info.u.mov.value));
      else if (info.u.mov.shift)
	snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
		  HOST_WIDE_INT_PRINT_HEX ", %s %d", mnemonic, lane_count,
		  element_char, UINTVAL (info.u.mov.value), shift_op,
		  info.u.mov.shift);
      else
	snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
		  HOST_WIDE_INT_PRINT_HEX, mnemonic, lane_count,
		  element_char, UINTVAL (info.u.mov.value));
    }
  else
    {
      /* AARCH64_CHECK_ORR, AARCH64_CHECK_AND or AARCH64_CHECK_XOR.  */
      mnemonic = "orr";
      if (which == AARCH64_CHECK_AND)
	mnemonic = info.insn == simd_immediate_info::MVN ? "bic" : "and";
      else if (which == AARCH64_CHECK_XOR)
	mnemonic = "eor";

      if (info.insn == simd_immediate_info::SVE_MOV)
	{
	  gcc_assert (TARGET_SVE);
	  snprintf (templ, sizeof (templ), "%s\t%%Z0.%c, %%Z0.%c, "
		    HOST_WIDE_INT_PRINT_DEC, mnemonic, element_char,
		    element_char, INTVAL (info.u.mov.value));
	}
      else if (info.u.mov.shift)
	snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
		  HOST_WIDE_INT_PRINT_DEC ", %s #%d", mnemonic, lane_count,
		  element_char, UINTVAL (info.u.mov.value), "lsl",
		  info.u.mov.shift);
      else
	snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
		  HOST_WIDE_INT_PRINT_DEC, mnemonic, lane_count,
		  element_char, UINTVAL (info.u.mov.value));
    }
  return templ;
}
/* Returns the string with the ORR instruction for the SIMD immediate
   CONST_VECTOR of WIDTH bits.  */
char*
aarch64_output_simd_orr_imm (rtx const_vector, unsigned width)
{
  return aarch64_output_simd_imm (const_vector, width, AARCH64_CHECK_ORR);
}

/* Returns the string with the AND/BIC instruction for the SIMD immediate
   CONST_VECTOR of WIDTH bits.  */
char*
aarch64_output_simd_and_imm (rtx const_vector, unsigned width)
{
  return aarch64_output_simd_imm (const_vector, width, AARCH64_CHECK_AND);
}

/* Returns the string with the EOR instruction for the SIMD immediate
   CONST_VECTOR of WIDTH bits.  */
char*
aarch64_output_simd_xor_imm (rtx const_vector, unsigned width)
{
  return aarch64_output_simd_imm (const_vector, width, AARCH64_CHECK_XOR);
}

/* Returns the string with the MOV instruction for the SIMD immediate
   CONST_VECTOR of WIDTH bits.  */
char*
aarch64_output_simd_mov_imm (rtx const_vector, unsigned width)
{
  return aarch64_output_simd_imm (const_vector, width, AARCH64_CHECK_MOV);
}
char*
aarch64_output_scalar_simd_mov_immediate (rtx immediate, scalar_int_mode mode)
{
  /* If a floating point number was passed and we desire to use it in an
     integer mode do the conversion to integer.  */
  if (CONST_DOUBLE_P (immediate) && GET_MODE_CLASS (mode) == MODE_INT)
    {
      unsigned HOST_WIDE_INT ival;
      if (!aarch64_reinterpret_float_as_int (immediate, &ival))
	gcc_unreachable ();
      immediate = gen_int_mode (ival, mode);
    }

  machine_mode vmode;
  /* Use a 64-bit mode for everything except for DI/DF/DD mode, where we use
     a 128-bit vector mode.  */
  int width = GET_MODE_BITSIZE (mode) == 64 ? 128 : 64;

  vmode = aarch64_simd_container_mode (mode, width);
  rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (immediate));
  return aarch64_output_simd_mov_imm (v_op, width);
}
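
/* Worked example of the round trip above (illustrative; the exact template
   chosen depends on how aarch64_simd_valid_imm encodes the value): an SImode
   immediate of 0x100 is duplicated into a V2SI vector, for which the MOV
   path prints something like "movi\tv0.2s, 0x1, lsl 8".  */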
/* Return the output string to use for moving immediate CONST_VECTOR
   into an SVE register.  */

char *
aarch64_output_sve_mov_immediate (rtx const_vector)
{
  static char templ[40];
  struct simd_immediate_info info;
  char element_char;
  bool is_valid;

  is_valid = aarch64_simd_valid_imm (const_vector, &info, AARCH64_CHECK_MOV);
  gcc_assert (is_valid);

  element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));

  machine_mode vec_mode = GET_MODE (const_vector);
  if (aarch64_sve_pred_mode_p (vec_mode))
    {
      static char buf[sizeof ("ptrue\t%0.N, vlNNNNN")];
      if (info.insn == simd_immediate_info::MOV)
	{
	  gcc_assert (info.u.mov.value == const0_rtx);
	  snprintf (buf, sizeof (buf), "pfalse\t%%0.b");
	}
      else
	{
	  gcc_assert (info.insn == simd_immediate_info::PTRUE);
	  unsigned int total_bytes;
	  if (info.u.pattern == AARCH64_SV_ALL
	      && BYTES_PER_SVE_VECTOR.is_constant (&total_bytes))
	    snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, vl%d", element_char,
		      total_bytes / GET_MODE_SIZE (info.elt_mode));
	  else
	    snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, %s", element_char,
		      svpattern_token (info.u.pattern));
	}
      return buf;
    }

  if (info.insn == simd_immediate_info::INDEX)
    {
      snprintf (templ, sizeof (templ), "index\t%%0.%c, #"
		HOST_WIDE_INT_PRINT_DEC ", #" HOST_WIDE_INT_PRINT_DEC,
		element_char, INTVAL (info.u.index.base),
		INTVAL (info.u.index.step));
      return templ;
    }

  if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
    {
      if (aarch64_float_const_zero_rtx_p (info.u.mov.value))
	info.u.mov.value = GEN_INT (0);
      else
	{
	  const int buf_size = 20;
	  char float_buf[buf_size] = {};
	  real_to_decimal_for_mode (float_buf,
				    CONST_DOUBLE_REAL_VALUE (info.u.mov.value),
				    buf_size, buf_size, 1, info.elt_mode);

	  snprintf (templ, sizeof (templ), "fmov\t%%0.%c, #%s",
		    element_char, float_buf);
	  return templ;
	}
    }

  if (info.u.mov.value == const0_rtx && TARGET_NON_STREAMING)
    snprintf (templ, sizeof (templ), "movi\t%%d0, #0");
  else
    snprintf (templ, sizeof (templ), "mov\t%%0.%c, #" HOST_WIDE_INT_PRINT_DEC,
	      element_char, INTVAL (info.u.mov.value));
  return templ;
}
/* Return the asm template for a PTRUES.  CONST_UNSPEC is the
   aarch64_sve_ptrue_svpattern_immediate that describes the predicate
   pattern.  */

char *
aarch64_output_sve_ptrues (rtx const_unspec)
{
  static char templ[40];
  struct simd_immediate_info info;
  bool is_valid;

  is_valid = aarch64_simd_valid_imm (const_unspec, &info, AARCH64_CHECK_MOV);
  gcc_assert (is_valid && info.insn == simd_immediate_info::PTRUE);

  char element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
  snprintf (templ, sizeof (templ), "ptrues\t%%0.%c, %s", element_char,
	    svpattern_token (info.u.pattern));
  return templ;
}
/* Split operands into moves from op[1] + op[2] into op[0].  */

void
aarch64_split_combinev16qi (rtx operands[3])
{
  machine_mode halfmode = GET_MODE (operands[1]);

  gcc_assert (halfmode == V16QImode);

  rtx destlo = simplify_gen_subreg (halfmode, operands[0],
				    GET_MODE (operands[0]), 0);
  rtx desthi = simplify_gen_subreg (halfmode, operands[0],
				    GET_MODE (operands[0]),
				    GET_MODE_SIZE (halfmode));

  bool skiplo = rtx_equal_p (destlo, operands[1]);
  bool skiphi = rtx_equal_p (desthi, operands[2]);

  if (skiplo && skiphi)
    {
      /* No-op move.  Can't split to nothing; emit something.  */
      emit_note (NOTE_INSN_DELETED);
      return;
    }

  /* Special case of reversed high/low parts.  */
  if (reg_overlap_mentioned_p (operands[2], destlo)
      && reg_overlap_mentioned_p (operands[1], desthi))
    {
      emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
      emit_insn (gen_xorv16qi3 (operands[2], operands[1], operands[2]));
      emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
    }
  else if (!reg_overlap_mentioned_p (operands[2], destlo))
    {
      /* Try to avoid unnecessary moves if part of the result
	 is in the right place already.  */
      if (!skiplo)
	emit_move_insn (destlo, operands[1]);
      if (!skiphi)
	emit_move_insn (desthi, operands[2]);
    }
  else
    {
      if (!skiphi)
	emit_move_insn (desthi, operands[2]);
      if (!skiplo)
	emit_move_insn (destlo, operands[1]);
    }
}
/* vec_perm support.  */

struct expand_vec_perm_d
{
  rtx target, op0, op1;
  vec_perm_indices perm;
  machine_mode vmode;
  machine_mode op_mode;
  unsigned int vec_flags;
  unsigned int op_vec_flags;
  bool one_vector_p;
  bool zero_op0_p, zero_op1_p;
  bool testing_p;
};

static bool aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d);
/* Generate a variable permutation.  */

static void
aarch64_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel)
{
  machine_mode vmode = GET_MODE (target);
  bool one_vector_p = rtx_equal_p (op0, op1);

  gcc_checking_assert (vmode == V8QImode || vmode == V16QImode);
  gcc_checking_assert (GET_MODE (op0) == vmode);
  gcc_checking_assert (GET_MODE (op1) == vmode);
  gcc_checking_assert (GET_MODE (sel) == vmode);
  gcc_checking_assert (TARGET_SIMD);

  if (one_vector_p)
    {
      if (vmode == V8QImode)
	{
	  /* Expand the argument to a V16QI mode by duplicating it.  */
	  rtx pair = gen_reg_rtx (V16QImode);
	  emit_insn (gen_aarch64_combinev8qi (pair, op0, op0));
	  emit_insn (gen_aarch64_qtbl1v8qi (target, pair, sel));
	}
      else
	emit_insn (gen_aarch64_qtbl1v16qi (target, op0, sel));
    }
  else
    {
      rtx pair;

      if (vmode == V8QImode)
	{
	  pair = gen_reg_rtx (V16QImode);
	  emit_insn (gen_aarch64_combinev8qi (pair, op0, op1));
	  emit_insn (gen_aarch64_qtbl1v8qi (target, pair, sel));
	}
      else
	{
	  pair = gen_reg_rtx (V2x16QImode);
	  emit_insn (gen_aarch64_combinev16qi (pair, op0, op1));
	  emit_insn (gen_aarch64_qtbl2v16qi (target, pair, sel));
	}
    }
}
/* Expand a vec_perm with the operands given by TARGET, OP0, OP1 and SEL.
   NELT is the number of elements in the vector.  */

void
aarch64_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel,
			 unsigned int nelt)
{
  machine_mode vmode = GET_MODE (target);
  bool one_vector_p = rtx_equal_p (op0, op1);
  rtx mask;

  /* The TBL instruction does not use a modulo index, so we must take care
     of that ourselves.  */
  mask = aarch64_simd_gen_const_vector_dup (vmode,
					    one_vector_p ? nelt - 1 : 2 * nelt - 1);
  sel = expand_simple_binop (vmode, AND, sel, mask, NULL, 0, OPTAB_LIB_WIDEN);

  /* For big-endian, we also need to reverse the index within the vector
     (but not which vector).  */
  if (BYTES_BIG_ENDIAN)
    {
      /* If one_vector_p, mask is a vector of (nelt - 1)'s already.  */
      if (!one_vector_p)
	mask = aarch64_simd_gen_const_vector_dup (vmode, nelt - 1);
      sel = expand_simple_binop (vmode, XOR, sel, mask,
				 NULL, 0, OPTAB_LIB_WIDEN);
    }
  aarch64_expand_vec_perm_1 (target, op0, op1, sel);
}
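
/* Concrete example of the masking above (illustrative): for a two-input
   V8QI permute, nelt is 8, so SEL is ANDed with 15 to emulate the modulo
   wrap that vec_perm requires.  On big-endian targets each index is then
   XORed with nelt - 1 (7 here) so that lane numbering matches the
   architectural byte order expected by TBL.  */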
/* Generate (set TARGET (unspec [OP0 OP1] CODE)).  */

static void
emit_unspec2 (rtx target, int code, rtx op0, rtx op1)
{
  emit_insn (gen_rtx_SET (target,
			  gen_rtx_UNSPEC (GET_MODE (target),
					  gen_rtvec (2, op0, op1), code)));
}
/* Expand an SVE vec_perm with the given operands.  */

void
aarch64_expand_sve_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
{
  machine_mode data_mode = GET_MODE (target);
  machine_mode sel_mode = GET_MODE (sel);
  /* Enforced by the pattern condition.  */
  int nunits = GET_MODE_NUNITS (sel_mode).to_constant ();

  /* Note: vec_perm indices are supposed to wrap when they go beyond the
     size of the two value vectors, i.e. the upper bits of the indices
     are effectively ignored.  SVE TBL instead produces 0 for any
     out-of-range indices, so we need to modulo all the vec_perm indices
     to ensure they are all in range.  */
  rtx sel_reg = force_reg (sel_mode, sel);

  /* Check if the sel only references the first values vector.  */
  if (CONST_VECTOR_P (sel)
      && aarch64_const_vec_all_in_range_p (sel, 0, nunits - 1))
    {
      emit_unspec2 (target, UNSPEC_TBL, op0, sel_reg);
      return;
    }

  /* Check if the two values vectors are the same.  */
  if (rtx_equal_p (op0, op1))
    {
      rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode, nunits - 1);
      rtx sel_mod = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
					 NULL, 0, OPTAB_DIRECT);
      emit_unspec2 (target, UNSPEC_TBL, op0, sel_mod);
      return;
    }

  /* Run TBL for each value vector and combine the results.  */

  rtx res0 = gen_reg_rtx (data_mode);
  rtx res1 = gen_reg_rtx (data_mode);
  rtx neg_num_elems = aarch64_simd_gen_const_vector_dup (sel_mode, -nunits);
  if (!CONST_VECTOR_P (sel)
      || !aarch64_const_vec_all_in_range_p (sel, 0, 2 * nunits - 1))
    {
      rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode,
						       2 * nunits - 1);
      sel_reg = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
				     NULL, 0, OPTAB_DIRECT);
    }
  emit_unspec2 (res0, UNSPEC_TBL, op0, sel_reg);
  rtx sel_sub = expand_simple_binop (sel_mode, PLUS, sel_reg, neg_num_elems,
				     NULL, 0, OPTAB_DIRECT);
  emit_unspec2 (res1, UNSPEC_TBL, op1, sel_sub);
  if (GET_MODE_CLASS (data_mode) == MODE_VECTOR_INT)
    emit_insn (gen_rtx_SET (target, gen_rtx_IOR (data_mode, res0, res1)));
  else
    emit_unspec2 (target, UNSPEC_IORF, res0, res1);
}
/* Recognize patterns suitable for the TRN instructions.  */
static bool
aarch64_evpc_trn (struct expand_vec_perm_d *d)
{
  HOST_WIDE_INT odd;
  poly_uint64 nelt = d->perm.length ();
  rtx out, in0, in1;
  machine_mode vmode = d->vmode;

  if (GET_MODE_UNIT_SIZE (vmode) > 8)
    return false;

  /* Note that these are little-endian tests.
     We correct for big-endian later.  */
  if (!d->perm[0].is_constant (&odd)
      || (odd != 0 && odd != 1)
      || !d->perm.series_p (0, 2, odd, 2)
      || !d->perm.series_p (1, 2, nelt + odd, 2))
    return false;

  /* Success!  */
  if (d->testing_p)
    return true;

  in0 = d->op0;
  in1 = d->op1;
  /* We don't need a big-endian lane correction for SVE; see the comment
     at the head of aarch64-sve.md for details.  */
  if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
    {
      std::swap (in0, in1);
      odd = !odd;
    }
  out = d->target;

  emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
				      odd ? UNSPEC_TRN2 : UNSPEC_TRN1));
  return true;
}
/* Try to re-encode the PERM constant so it combines odd and even elements.
   This rewrites constants such as {0, 1, 4, 5}/V4SF to {0, 2}/V2DI.
   We retry with this new constant with the full suite of patterns.  */
static bool
aarch64_evpc_reencode (struct expand_vec_perm_d *d)
{
  expand_vec_perm_d newd;

  /* The subregs that we'd create are not supported for big-endian SVE;
     see aarch64_modes_compatible_p for details.  */
  if (BYTES_BIG_ENDIAN && (d->vec_flags & VEC_ANY_SVE))
    return false;

  /* Get the new mode.  Always twice the size of the inner
     and half the elements.  */
  machine_mode new_mode;
  if (!aarch64_coalesce_units (d->vmode, 2).exists (&new_mode))
    return false;

  vec_perm_indices newpermindices;
  if (!newpermindices.new_shrunk_vector (d->perm, 2))
    return false;

  newd.vmode = new_mode;
  newd.vec_flags = d->vec_flags;
  newd.op_mode = newd.vmode;
  newd.op_vec_flags = newd.vec_flags;
  newd.target = d->target ? gen_lowpart (new_mode, d->target) : NULL;
  newd.op0 = d->op0 ? gen_lowpart (new_mode, d->op0) : NULL;
  newd.op1 = d->op1 ? gen_lowpart (new_mode, d->op1) : NULL;
  newd.testing_p = d->testing_p;
  newd.one_vector_p = d->one_vector_p;

  newd.perm.new_vector (newpermindices.encoding (), newd.one_vector_p ? 1 : 2,
			newpermindices.nelts_per_input ());
  return aarch64_expand_vec_perm_const_1 (&newd);
}
/* Recognize patterns suitable for the UZP instructions.  */
static bool
aarch64_evpc_uzp (struct expand_vec_perm_d *d)
{
  HOST_WIDE_INT odd;
  rtx out, in0, in1;
  machine_mode vmode = d->vmode;

  if (GET_MODE_UNIT_SIZE (vmode) > 8)
    return false;

  /* Note that these are little-endian tests.
     We correct for big-endian later.  */
  if (!d->perm[0].is_constant (&odd)
      || (odd != 0 && odd != 1)
      || !d->perm.series_p (0, 1, odd, 2))
    return false;

  /* Success!  */
  if (d->testing_p)
    return true;

  in0 = d->op0;
  in1 = d->op1;
  /* We don't need a big-endian lane correction for SVE; see the comment
     at the head of aarch64-sve.md for details.  */
  if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
    {
      std::swap (in0, in1);
      odd = !odd;
    }
  out = d->target;

  emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
				      odd ? UNSPEC_UZP2 : UNSPEC_UZP1));
  return true;
}
/* Recognize patterns suitable for the ZIP instructions.  */
static bool
aarch64_evpc_zip (struct expand_vec_perm_d *d)
{
  unsigned int high;
  poly_uint64 nelt = d->perm.length ();
  rtx out, in0, in1;
  machine_mode vmode = d->vmode;

  if (GET_MODE_UNIT_SIZE (vmode) > 8)
    return false;

  /* Note that these are little-endian tests.
     We correct for big-endian later.  */
  poly_uint64 first = d->perm[0];
  if ((maybe_ne (first, 0U) && maybe_ne (first * 2, nelt))
      || !d->perm.series_p (0, 2, first, 1)
      || !d->perm.series_p (1, 2, first + nelt, 1))
    return false;
  high = maybe_ne (first, 0U);

  /* Success!  */
  if (d->testing_p)
    return true;

  in0 = d->op0;
  in1 = d->op1;
  /* We don't need a big-endian lane correction for SVE; see the comment
     at the head of aarch64-sve.md for details.  */
  if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
    {
      std::swap (in0, in1);
      high = !high;
    }
  out = d->target;

  emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
				      high ? UNSPEC_ZIP2 : UNSPEC_ZIP1));
  return true;
}
26077 aarch64_evpc_ext (struct expand_vec_perm_d
*d
)
26079 HOST_WIDE_INT location
;
26082 /* The first element always refers to the first vector.
26083 Check if the extracted indices are increasing by one. */
26084 if ((d
->vec_flags
& VEC_SVE_PRED
)
26085 || !d
->perm
[0].is_constant (&location
)
26086 || !d
->perm
.series_p (0, 1, location
, 1))
26093 /* The case where (location == 0) is a no-op for both big- and little-endian,
26094 and is removed by the mid-end at optimization levels -O1 and higher.
26096 We don't need a big-endian lane correction for SVE; see the comment
26097 at the head of aarch64-sve.md for details. */
26098 if (BYTES_BIG_ENDIAN
&& location
!= 0 && d
->vec_flags
== VEC_ADVSIMD
)
26100 /* After setup, we want the high elements of the first vector (stored
26101 at the LSB end of the register), and the low elements of the second
26102 vector (stored at the MSB end of the register). So swap. */
26103 std::swap (d
->op0
, d
->op1
);
26104 /* location != 0 (above), so safe to assume (nelt - location) < nelt.
26105 to_constant () is safe since this is restricted to Advanced SIMD
26107 location
= d
->perm
.length ().to_constant () - location
;
26110 offset
= GEN_INT (location
);
26111 emit_set_insn (d
->target
,
26112 gen_rtx_UNSPEC (d
->vmode
,
26113 gen_rtvec (3, d
->op0
, d
->op1
, offset
),
/* Recognize patterns for the REV{64,32,16} insns, which reverse elements
   within each 64-bit, 32-bit or 16-bit granule.  */

static bool
aarch64_evpc_rev_local (struct expand_vec_perm_d *d)
{
  HOST_WIDE_INT diff;
  unsigned int i, size, unspec;
  machine_mode pred_mode;

  if ((d->vec_flags & VEC_SVE_PRED)
      || !d->one_vector_p
      || !d->perm[0].is_constant (&diff)
      || !diff)
    return false;

  if (d->vec_flags & VEC_SVE_DATA)
    size = (diff + 1) * aarch64_sve_container_bits (d->vmode);
  else
    size = (diff + 1) * GET_MODE_UNIT_BITSIZE (d->vmode);
  if (size == 64)
    {
      unspec = UNSPEC_REV64;
      pred_mode = VNx2BImode;
    }
  else if (size == 32)
    {
      unspec = UNSPEC_REV32;
      pred_mode = VNx4BImode;
    }
  else if (size == 16)
    {
      unspec = UNSPEC_REV16;
      pred_mode = VNx8BImode;
    }
  else
    return false;

  unsigned int step = diff + 1;
  for (i = 0; i < step; ++i)
    if (!d->perm.series_p (i, step, diff - i, step))
      return false;

  /* Success! */
  if (d->testing_p)
    return true;

  if (d->vec_flags & VEC_SVE_DATA)
    {
      rtx pred = aarch64_ptrue_reg (pred_mode);
      emit_insn (gen_aarch64_sve_revbhw (d->vmode, pred_mode,
					 d->target, pred, d->op0));
      return true;
    }
  rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), unspec);
  emit_set_insn (d->target, src);
  return true;
}
/* Recognize patterns for the REV insn, which reverses elements within
   a full vector.  */

static bool
aarch64_evpc_rev_global (struct expand_vec_perm_d *d)
{
  poly_uint64 nelt = d->perm.length ();

  if (!d->one_vector_p || d->vec_flags == VEC_ADVSIMD)
    return false;

  if (!d->perm.series_p (0, 1, nelt - 1, -1))
    return false;

  /* Success! */
  if (d->testing_p)
    return true;

  rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), UNSPEC_REV);
  emit_set_insn (d->target, src);
  return true;
}
static bool
aarch64_evpc_dup (struct expand_vec_perm_d *d)
{
  rtx out = d->target;
  rtx in0;
  HOST_WIDE_INT elt;
  machine_mode vmode = d->vmode;
  rtx lane;

  if ((d->vec_flags & VEC_SVE_PRED)
      || d->perm.encoding ().encoded_nelts () != 1
      || !d->perm[0].is_constant (&elt))
    return false;

  if ((d->vec_flags & VEC_SVE_DATA)
      && elt * (aarch64_sve_container_bits (vmode) / 8) >= 64)
    return false;

  /* Success! */
  if (d->testing_p)
    return true;

  /* The generic preparation in aarch64_expand_vec_perm_const_1
     swaps the operand order and the permute indices if it finds
     d->perm[0] to be in the second operand.  Thus, we can always
     use d->op0 and need not do any extra arithmetic to get the
     correct lane number.  */
  in0 = d->op0;
  lane = GEN_INT (elt); /* The pattern corrects for big-endian.  */

  rtx parallel = gen_rtx_PARALLEL (vmode, gen_rtvec (1, lane));
  rtx select = gen_rtx_VEC_SELECT (GET_MODE_INNER (vmode), in0, parallel);
  emit_set_insn (out, gen_rtx_VEC_DUPLICATE (vmode, select));
  return true;
}
/* Recognize things that can be done using the SVE2p1 Hybrid-VLA
   permutations, which apply Advanced-SIMD-style permutations to each
   individual 128-bit block.  */

static bool
aarch64_evpc_hvla (struct expand_vec_perm_d *d)
{
  machine_mode vmode = d->vmode;
  if (!TARGET_SVE2p1
      || !TARGET_NON_STREAMING
      || BYTES_BIG_ENDIAN
      || d->vec_flags != VEC_SVE_DATA
      || GET_MODE_UNIT_BITSIZE (vmode) > 64)
    return false;

  /* Set SUBELTS to the number of elements in an Advanced SIMD vector
     and make sure that adding SUBELTS to each block of SUBELTS indices
     gives the next block of SUBELTS indices.  That is, it must be possible
     to interpret the index vector as SUBELTS interleaved linear series in
     which each series has step SUBELTS.  */
  unsigned int subelts = 128U / GET_MODE_UNIT_BITSIZE (vmode);
  unsigned int pairs = subelts / 2;
  for (unsigned int i = 0; i < subelts; ++i)
    if (!d->perm.series_p (i, subelts, d->perm[i], subelts))
      return false;

  /* Used once we have verified that we can use UNSPEC to do the operation.  */
  auto use_binary = [&](int unspec) -> bool
    {
      if (!d->testing_p)
	{
	  rtvec vec = gen_rtvec (2, d->op0, d->op1);
	  emit_set_insn (d->target, gen_rtx_UNSPEC (vmode, vec, unspec));
	}
      return true;
    };

  /* Now check whether the first SUBELTS elements match a supported
     Advanced-SIMD-style operation.  */
  poly_int64 first = d->perm[0];
  poly_int64 nelt = d->perm.length ();
  auto try_zip = [&]() -> bool
    {
      if (maybe_ne (first, 0) && maybe_ne (first, pairs))
	return false;
      for (unsigned int i = 0; i < pairs; ++i)
	if (maybe_ne (d->perm[i * 2], first + i)
	    || maybe_ne (d->perm[i * 2 + 1], first + nelt + i))
	  return false;
      return use_binary (maybe_ne (first, 0) ? UNSPEC_ZIPQ2 : UNSPEC_ZIPQ1);
    };
  auto try_uzp = [&]() -> bool
    {
      if (maybe_ne (first, 0) && maybe_ne (first, 1))
	return false;
      for (unsigned int i = 0; i < pairs; ++i)
	if (maybe_ne (d->perm[i], first + i * 2)
	    || maybe_ne (d->perm[i + pairs], first + nelt + i * 2))
	  return false;
      return use_binary (maybe_ne (first, 0) ? UNSPEC_UZPQ2 : UNSPEC_UZPQ1);
    };
  auto try_extq = [&]() -> bool
    {
      HOST_WIDE_INT start;
      if (!first.is_constant (&start) || !IN_RANGE (start, 0, subelts - 1))
	return false;
      for (unsigned int i = 0; i < subelts; ++i)
	{
	  poly_int64 next = (start + i >= subelts
			     ? start + i - subelts + nelt
			     : start + i);
	  if (maybe_ne (d->perm[i], next))
	    return false;
	}
      if (!d->testing_p)
	{
	  rtx op2 = gen_int_mode (start, SImode);
	  emit_insn (gen_aarch64_sve_extq (vmode, d->target,
					   d->op0, d->op1, op2));
	}
      return true;
    };
  auto try_dupq = [&]() -> bool
    {
      HOST_WIDE_INT start;
      if (!first.is_constant (&start) || !IN_RANGE (start, 0, subelts - 1))
	return false;
      for (unsigned int i = 0; i < subelts; ++i)
	if (maybe_ne (d->perm[i], start))
	  return false;
      if (!d->testing_p)
	{
	  rtx op1 = gen_int_mode (start, SImode);
	  emit_insn (gen_aarch64_sve_dupq (vmode, d->target, d->op0, op1));
	}
      return true;
    };

  return try_zip () || try_uzp () || try_extq () || try_dupq ();
}
static bool
aarch64_evpc_tbl (struct expand_vec_perm_d *d)
{
  rtx rperm[MAX_COMPILE_TIME_VEC_BYTES], sel;
  machine_mode vmode = d->vmode;

  /* Make sure that the indices are constant.  */
  unsigned int encoded_nelts = d->perm.encoding ().encoded_nelts ();
  for (unsigned int i = 0; i < encoded_nelts; ++i)
    if (!d->perm[i].is_constant ())
      return false;

  if (d->testing_p)
    return true;

  /* Generic code will try constant permutation twice.  Once with the
     original mode and again with the elements lowered to QImode.
     So wait and don't do the selector expansion ourselves.  */
  if (vmode != V8QImode && vmode != V16QImode)
    return false;

  /* to_constant is safe since this routine is specific to Advanced SIMD
     vectors.  */
  unsigned int nelt = d->perm.length ().to_constant ();

  /* If one register is the constant vector of 0 then we only need
     a one reg TBL and we map any accesses to the vector of 0 to -1.  We can't
     do this earlier since vec_perm_indices clamps elements to within range so
     we can only do it during codegen.  */
  if (d->zero_op0_p)
    d->op0 = d->op1;
  else if (d->zero_op1_p)
    d->op1 = d->op0;

  for (unsigned int i = 0; i < nelt; ++i)
    {
      auto val = d->perm[i].to_constant ();

      /* If we're selecting from a 0 vector, we can just use an out of range
	 index instead.  */
      if ((d->zero_op0_p && val < nelt) || (d->zero_op1_p && val >= nelt))
	rperm[i] = constm1_rtx;
      else
	{
	  /* If we are remapping a zero register as the first parameter we need
	     to adjust the indices of the non-zero register.  */
	  if (d->zero_op0_p)
	    val = val % nelt;

	  /* If big-endian and two vectors we end up with a weird mixed-endian
	     mode on NEON.  Reverse the index within each word but not the word
	     itself.  to_constant is safe because we checked is_constant
	     above.  */
	  rperm[i] = GEN_INT (BYTES_BIG_ENDIAN ? val ^ (nelt - 1) : val);
	}
    }

  sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
  sel = force_reg (vmode, sel);

  aarch64_expand_vec_perm_1 (d->target, d->op0, d->op1, sel);
  return true;
}
26404 aarch64_evpc_sve_tbl (struct expand_vec_perm_d
*d
)
26406 unsigned HOST_WIDE_INT nelt
;
26408 /* Permuting two variable-length vectors could overflow the
26410 if (!d
->one_vector_p
&& !d
->perm
.length ().is_constant (&nelt
))
26416 machine_mode sel_mode
= related_int_vector_mode (d
->vmode
).require ();
26417 rtx sel
= vec_perm_indices_to_rtx (sel_mode
, d
->perm
);
26418 if (d
->one_vector_p
)
26419 emit_unspec2 (d
->target
, UNSPEC_TBL
, d
->op0
, force_reg (sel_mode
, sel
));
26421 aarch64_expand_sve_vec_perm (d
->target
, d
->op0
, d
->op1
, sel
);
/* Try to implement D using SVE dup instruction.  */

static bool
aarch64_evpc_sve_dup (struct expand_vec_perm_d *d)
{
  if (BYTES_BIG_ENDIAN
      || !d->one_vector_p
      || d->vec_flags != VEC_SVE_DATA
      || d->op_vec_flags != VEC_ADVSIMD
      || d->perm.encoding ().nelts_per_pattern () != 1
      || !known_eq (d->perm.encoding ().npatterns (),
		    GET_MODE_NUNITS (d->op_mode))
      || !known_eq (GET_MODE_BITSIZE (d->op_mode), 128))
    return false;

  int npatterns = d->perm.encoding ().npatterns ();
  for (int i = 0; i < npatterns; i++)
    if (!known_eq (d->perm[i], i))
      return false;

  if (d->testing_p)
    return true;

  aarch64_expand_sve_dupq (d->target, GET_MODE (d->target), d->op0);
  return true;
}
/* Try to implement D using SVE SEL instruction.  */

static bool
aarch64_evpc_sel (struct expand_vec_perm_d *d)
{
  machine_mode vmode = d->vmode;
  int unit_size = GET_MODE_UNIT_SIZE (vmode);

  if (d->vec_flags != VEC_SVE_DATA
      || unit_size > 8)
    return false;

  int n_patterns = d->perm.encoding ().npatterns ();
  poly_int64 vec_len = d->perm.length ();

  for (int i = 0; i < n_patterns; ++i)
    if (!known_eq (d->perm[i], i)
	&& !known_eq (d->perm[i], vec_len + i))
      return false;

  for (int i = n_patterns; i < n_patterns * 2; i++)
    if (!d->perm.series_p (i, n_patterns, i, n_patterns)
	&& !d->perm.series_p (i, n_patterns, vec_len + i, n_patterns))
      return false;

  if (d->testing_p)
    return true;

  machine_mode pred_mode = aarch64_sve_pred_mode (vmode);

  /* Build a predicate that is true when op0 elements should be used.  */
  rtx_vector_builder builder (pred_mode, n_patterns, 2);
  for (int i = 0; i < n_patterns * 2; i++)
    {
      rtx elem = known_eq (d->perm[i], i) ? CONST1_RTX (BImode)
					  : CONST0_RTX (BImode);
      builder.quick_push (elem);
    }

  rtx const_vec = builder.build ();
  rtx pred = force_reg (pred_mode, const_vec);
  /* TARGET = PRED ? OP0 : OP1.  */
  emit_insn (gen_vcond_mask (vmode, vmode, d->target, d->op0, d->op1, pred));
  return true;
}
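
/* Example of a permutation handled above (illustrative): for VNx4SI, the
   repeating index pattern {0, n+1, 2, n+3, ...} (where n is the runtime
   vector length) keeps even elements from OP0 and takes odd elements from
   OP1, so it is emitted as a SEL under the constant predicate
   {1, 0, 1, 0, ...}.  */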
/* Recognize patterns suitable for the INS instructions.  */
static bool
aarch64_evpc_ins (struct expand_vec_perm_d *d)
{
  machine_mode mode = d->vmode;
  unsigned HOST_WIDE_INT nelt;

  if (d->vec_flags != VEC_ADVSIMD)
    return false;

  /* to_constant is safe since this routine is specific to Advanced SIMD
     vectors.  */
  nelt = d->perm.length ().to_constant ();
  rtx insv = d->op0;

  HOST_WIDE_INT idx = -1;

  for (unsigned HOST_WIDE_INT i = 0; i < nelt; i++)
    {
      HOST_WIDE_INT elt;
      if (!d->perm[i].is_constant (&elt))
	return false;
      if (elt == (HOST_WIDE_INT) i)
	continue;
      if (idx != -1)
	{
	  idx = -1;
	  break;
	}
      idx = i;
    }

  if (idx == -1)
    {
      insv = d->op1;
      for (unsigned HOST_WIDE_INT i = 0; i < nelt; i++)
	{
	  if (d->perm[i].to_constant () == (HOST_WIDE_INT) (i + nelt))
	    continue;
	  if (idx != -1)
	    return false;
	  idx = i;
	}

      if (idx == -1)
	return false;
    }

  if (d->testing_p)
    return true;

  gcc_assert (idx != -1);

  unsigned extractindex = d->perm[idx].to_constant ();
  rtx extractv = d->op0;
  if (extractindex >= nelt)
    {
      extractv = d->op1;
      extractindex -= nelt;
    }
  gcc_assert (extractindex < nelt);

  insn_code icode = code_for_aarch64_simd_vec_copy_lane (mode);
  expand_operand ops[5];
  create_output_operand (&ops[0], d->target, mode);
  create_input_operand (&ops[1], insv, mode);
  create_integer_operand (&ops[2], 1 << idx);
  create_input_operand (&ops[3], extractv, mode);
  create_integer_operand (&ops[4], extractindex);
  expand_insn (icode, 5, ops);

  return true;
}
static bool
aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
{
  gcc_assert (d->op_mode != E_VOIDmode);

  /* The pattern matching functions above are written to look for a small
     number to begin the sequence (0, 1, N/2).  If we begin with an index
     from the second operand, we can swap the operands.  */
  poly_int64 nelt = d->perm.length ();
  if (known_ge (d->perm[0], nelt))
    {
      d->perm.rotate_inputs (1);
      std::swap (d->op0, d->op1);
    }

  if (((d->vec_flags == VEC_ADVSIMD && TARGET_SIMD)
       || d->vec_flags == VEC_SVE_DATA
       || d->vec_flags == (VEC_SVE_DATA | VEC_PARTIAL)
       || d->vec_flags == VEC_SVE_PRED)
      && known_gt (nelt, 1))
    {
      if (d->vmode == d->op_mode)
	{
	  if (aarch64_evpc_rev_local (d))
	    return true;
	  else if (aarch64_evpc_rev_global (d))
	    return true;
	  else if (aarch64_evpc_ext (d))
	    return true;
	  else if (aarch64_evpc_dup (d))
	    return true;
	  else if (aarch64_evpc_zip (d))
	    return true;
	  else if (aarch64_evpc_uzp (d))
	    return true;
	  else if (aarch64_evpc_trn (d))
	    return true;
	  else if (aarch64_evpc_sel (d))
	    return true;
	  else if (aarch64_evpc_ins (d))
	    return true;
	  else if (aarch64_evpc_hvla (d))
	    return true;
	  else if (aarch64_evpc_reencode (d))
	    return true;

	  if (d->vec_flags == VEC_SVE_DATA)
	    return aarch64_evpc_sve_tbl (d);
	  else if (d->vec_flags == VEC_ADVSIMD)
	    return aarch64_evpc_tbl (d);
	}
      else
	{
	  if (aarch64_evpc_sve_dup (d))
	    return true;
	}
    }

  return false;
}
/* Implement TARGET_VECTORIZE_VEC_PERM_CONST.  */

static bool
aarch64_vectorize_vec_perm_const (machine_mode vmode, machine_mode op_mode,
				  rtx target, rtx op0, rtx op1,
				  const vec_perm_indices &sel)
{
  struct expand_vec_perm_d d;

  /* Check whether the mask can be applied to a single vector.  */
  if (sel.ninputs () == 1
      || (op0 && rtx_equal_p (op0, op1)))
    d.one_vector_p = true;
  else if (sel.all_from_input_p (0))
    {
      d.one_vector_p = true;
      op1 = op0;
    }
  else if (sel.all_from_input_p (1))
    {
      d.one_vector_p = true;
      op0 = op1;
    }
  else
    d.one_vector_p = false;

  d.zero_op0_p = op0 == CONST0_RTX (op_mode);
  d.zero_op1_p = op1 == CONST0_RTX (op_mode);
  d.perm.new_vector (sel.encoding (), d.one_vector_p ? 1 : 2,
		     sel.nelts_per_input ());
  d.vmode = vmode;
  d.vec_flags = aarch64_classify_vector_mode (d.vmode);
  d.op_mode = op_mode;
  d.op_vec_flags = aarch64_classify_vector_mode (d.op_mode);
  d.target = target;
  d.op0 = op0 ? force_reg (op_mode, op0) : NULL_RTX;
  if (op0 == op1)
    d.op1 = d.op0;
  else
    d.op1 = op1 ? force_reg (op_mode, op1) : NULL_RTX;
  d.testing_p = !target;

  if (!d.testing_p)
    return aarch64_expand_vec_perm_const_1 (&d);

  rtx_insn *last = get_last_insn ();
  bool ret = aarch64_expand_vec_perm_const_1 (&d);
  gcc_assert (last == get_last_insn ());

  return ret;
}
/* Generate a byte permute mask for a register of mode MODE,
   which has NUNITS units.  */

rtx
aarch64_reverse_mask (machine_mode mode, unsigned int nunits)
{
  /* We have to reverse each vector because we don't have
     a permuted load that can reverse-load according to ABI rules.  */
  rtx mask;
  rtvec v = rtvec_alloc (16);
  unsigned int i, j;
  unsigned int usize = GET_MODE_UNIT_SIZE (mode);

  gcc_assert (BYTES_BIG_ENDIAN);
  gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode));

  for (i = 0; i < nunits; i++)
    for (j = 0; j < usize; j++)
      RTVEC_ELT (v, i * usize + j) = GEN_INT ((i + 1) * usize - 1 - j);
  mask = gen_rtx_CONST_VECTOR (V16QImode, v);
  return force_reg (V16QImode, mask);
}
/* Expand an SVE integer comparison using the SVE equivalent of:

     (set TARGET (CODE OP0 OP1)).  */

void
aarch64_expand_sve_vec_cmp_int (rtx target, rtx_code code, rtx op0, rtx op1)
{
  machine_mode pred_mode = GET_MODE (target);
  machine_mode data_mode = GET_MODE (op0);
  rtx res = aarch64_sve_emit_int_cmp (target, pred_mode, code, data_mode,
				      op0, op1);
  if (!rtx_equal_p (target, res))
    emit_move_insn (target, res);
}
/* Return the UNSPEC_COND_* code for comparison CODE.  */

static unsigned int
aarch64_unspec_cond_code (rtx_code code)
{
  switch (code)
    {
    case NE:
      return UNSPEC_COND_FCMNE;
    case EQ:
      return UNSPEC_COND_FCMEQ;
    case LT:
      return UNSPEC_COND_FCMLT;
    case GT:
      return UNSPEC_COND_FCMGT;
    case LE:
      return UNSPEC_COND_FCMLE;
    case GE:
      return UNSPEC_COND_FCMGE;
    case UNORDERED:
      return UNSPEC_COND_FCMUO;
    default:
      gcc_unreachable ();
    }
}
/* Emit the SVE equivalent of:

     (set TARGET (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X>))

   where <X> is the operation associated with comparison CODE.
   KNOWN_PTRUE_P is true if PRED is known to be a PTRUE.  */

static void
aarch64_emit_sve_fp_cond (rtx target, rtx_code code, rtx pred,
			  bool known_ptrue_p, rtx op0, rtx op1)
{
  rtx flag = gen_int_mode (known_ptrue_p, SImode);
  rtx unspec = gen_rtx_UNSPEC (GET_MODE (pred),
			       gen_rtvec (4, pred, flag, op0, op1),
			       aarch64_unspec_cond_code (code));
  emit_set_insn (target, unspec);
}
/* Emit the SVE equivalent of:

     (set TMP1 (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X1>))
     (set TMP2 (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X2>))
     (set TARGET (ior:PRED_MODE TMP1 TMP2))

   where <Xi> is the operation associated with comparison CODEi.
   KNOWN_PTRUE_P is true if PRED is known to be a PTRUE.  */

static void
aarch64_emit_sve_or_fp_conds (rtx target, rtx_code code1, rtx_code code2,
			      rtx pred, bool known_ptrue_p, rtx op0, rtx op1)
{
  machine_mode pred_mode = GET_MODE (pred);
  rtx tmp1 = gen_reg_rtx (pred_mode);
  aarch64_emit_sve_fp_cond (tmp1, code1, pred, known_ptrue_p, op0, op1);
  rtx tmp2 = gen_reg_rtx (pred_mode);
  aarch64_emit_sve_fp_cond (tmp2, code2, pred, known_ptrue_p, op0, op1);
  aarch64_emit_binop (target, ior_optab, tmp1, tmp2);
}
/* Emit the SVE equivalent of:

     (set TMP (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X>))
     (set TARGET (not TMP))

   where <X> is the operation associated with comparison CODE.
   KNOWN_PTRUE_P is true if PRED is known to be a PTRUE.  */

static void
aarch64_emit_sve_invert_fp_cond (rtx target, rtx_code code, rtx pred,
				 bool known_ptrue_p, rtx op0, rtx op1)
{
  machine_mode pred_mode = GET_MODE (pred);
  rtx tmp = gen_reg_rtx (pred_mode);
  aarch64_emit_sve_fp_cond (tmp, code, pred, known_ptrue_p, op0, op1);
  aarch64_emit_unop (target, one_cmpl_optab, tmp);
}
/* Expand an SVE floating-point comparison using the SVE equivalent of:

     (set TARGET (CODE OP0 OP1))

   If CAN_INVERT_P is true, the caller can also handle inverted results;
   return true if the result is in fact inverted.  */

bool
aarch64_expand_sve_vec_cmp_float (rtx target, rtx_code code,
				  rtx op0, rtx op1, bool can_invert_p)
{
  machine_mode pred_mode = GET_MODE (target);
  machine_mode data_mode = GET_MODE (op0);

  rtx ptrue = aarch64_ptrue_reg (pred_mode);
  switch (code)
    {
    case UNORDERED:
      /* UNORDERED has no immediate form.  */
      op1 = force_reg (data_mode, op1);
      /* fall through */
    case LT:
    case LE:
    case GT:
    case GE:
    case EQ:
    case NE:
      /* There is native support for the comparison.  */
      aarch64_emit_sve_fp_cond (target, code, ptrue, true, op0, op1);
      return false;

    case LTGT:
      /* This is a trapping operation (LT or GT).  */
      aarch64_emit_sve_or_fp_conds (target, LT, GT, ptrue, true, op0, op1);
      return false;

    case UNEQ:
      if (!flag_trapping_math)
	{
	  /* This would trap for signaling NaNs.  */
	  op1 = force_reg (data_mode, op1);
	  aarch64_emit_sve_or_fp_conds (target, UNORDERED, EQ,
					ptrue, true, op0, op1);
	  return false;
	}
      /* fall through */
    case UNLT:
    case UNLE:
    case UNGT:
    case UNGE:
      if (flag_trapping_math)
	{
	  /* Work out which elements are ordered.  */
	  rtx ordered = gen_reg_rtx (pred_mode);
	  op1 = force_reg (data_mode, op1);
	  aarch64_emit_sve_invert_fp_cond (ordered, UNORDERED,
					   ptrue, true, op0, op1);

	  /* Test the opposite condition for the ordered elements,
	     then invert the result.  */
	  if (code == UNEQ)
	    code = NE;
	  else
	    code = reverse_condition_maybe_unordered (code);
	  if (can_invert_p)
	    {
	      aarch64_emit_sve_fp_cond (target, code,
					ordered, false, op0, op1);
	      return true;
	    }
	  aarch64_emit_sve_invert_fp_cond (target, code,
					   ordered, false, op0, op1);
	  return false;
	}
      break;

    case ORDERED:
      /* ORDERED has no immediate form.  */
      op1 = force_reg (data_mode, op1);
      break;

    default:
      gcc_unreachable ();
    }

  /* There is native support for the inverse comparison.  */
  code = reverse_condition_maybe_unordered (code);
  if (can_invert_p)
    {
      aarch64_emit_sve_fp_cond (target, code, ptrue, true, op0, op1);
      return true;
    }
  aarch64_emit_sve_invert_fp_cond (target, code, ptrue, true, op0, op1);
  return false;
}
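
/* Example of the trapping-math path above (illustrative): expanding UNGT
   with -ftrapping-math first builds ORDERED as the inverse of an FCMUO,
   then tests the reversed condition (LE) on just the ordered lanes and
   finally inverts the result, so unordered lanes end up selected without
   raising an invalid-operation exception for quiet NaNs.  */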
/* Return true if:

   (a) MODE1 and MODE2 use the same layout for bytes that are common
       to both modes;

   (b) subregs involving the two modes behave as the target-independent
       subreg rules require; and

   (c) there is at least one register that can hold both modes.

   Return false otherwise.  */

static bool
aarch64_modes_compatible_p (machine_mode mode1, machine_mode mode2)
{
  unsigned int flags1 = aarch64_classify_vector_mode (mode1);
  unsigned int flags2 = aarch64_classify_vector_mode (mode2);

  bool sve1_p = (flags1 & VEC_ANY_SVE);
  bool sve2_p = (flags2 & VEC_ANY_SVE);

  bool partial_sve1_p = sve1_p && (flags1 & VEC_PARTIAL);
  bool partial_sve2_p = sve2_p && (flags2 & VEC_PARTIAL);

  bool pred1_p = (flags1 & VEC_SVE_PRED);
  bool pred2_p = (flags2 & VEC_SVE_PRED);

  bool partial_advsimd_struct1_p = (flags1 == (VEC_ADVSIMD | VEC_STRUCT
					       | VEC_PARTIAL));
  bool partial_advsimd_struct2_p = (flags2 == (VEC_ADVSIMD | VEC_STRUCT
					       | VEC_PARTIAL));

  /* Don't allow changes between predicate modes and other modes.
     Only predicate registers can hold predicate modes and only
     non-predicate registers can hold non-predicate modes, so any
     attempt to mix them would require a round trip through memory.  */
  if (pred1_p != pred2_p)
    return false;

  /* The contents of partial SVE modes are distributed evenly across
     the register, whereas GCC expects them to be clustered together.
     We therefore need to be careful about mode changes involving them.  */
  if (partial_sve1_p && partial_sve2_p)
    {
      /* Reject changes between partial SVE modes that have different
	 patterns of significant and insignificant bits.  */
      if ((aarch64_sve_container_bits (mode1)
	   != aarch64_sve_container_bits (mode2))
	  || GET_MODE_UNIT_SIZE (mode1) != GET_MODE_UNIT_SIZE (mode2))
	return false;
    }
  else if (partial_sve1_p)
    {
      /* The first lane of MODE1 is where GCC expects it, but anything
	 bigger than that is not.  */
      if (maybe_gt (GET_MODE_SIZE (mode2), GET_MODE_UNIT_SIZE (mode1)))
	return false;
    }
  else if (partial_sve2_p)
    {
      /* Similarly in reverse.  */
      if (maybe_gt (GET_MODE_SIZE (mode1), GET_MODE_UNIT_SIZE (mode2)))
	return false;
    }

  /* Don't allow changes between partial Advanced SIMD structure modes
     and other modes that are bigger than 8 bytes.  E.g. V16QI and V2x8QI
     are the same size, but the former occupies one Q register while the
     latter occupies two D registers.  */
  if (partial_advsimd_struct1_p != partial_advsimd_struct2_p
      && maybe_gt (GET_MODE_SIZE (mode1), 8)
      && maybe_gt (GET_MODE_SIZE (mode2), 8))
    return false;

  if (maybe_ne (BITS_PER_SVE_VECTOR, 128u))
    {
      /* Don't allow changes between SVE modes and other modes that might
	 be bigger than 128 bits.  In particular, OImode, CImode and XImode
	 divide into 128-bit quantities while SVE modes divide into
	 BITS_PER_SVE_VECTOR quantities.  */
      if (sve1_p && !sve2_p && maybe_gt (GET_MODE_BITSIZE (mode2), 128))
	return false;
      if (sve2_p && !sve1_p && maybe_gt (GET_MODE_BITSIZE (mode1), 128))
	return false;
    }

  if (BYTES_BIG_ENDIAN)
    {
      /* Don't allow changes between SVE data modes and non-SVE modes.
	 See the comment at the head of aarch64-sve.md for details.  */
      if (sve1_p != sve2_p)
	return false;

      /* Don't allow changes in element size: lane 0 of the new vector
	 would not then be lane 0 of the old vector.  See the comment
	 above aarch64_maybe_expand_sve_subreg_move for a more detailed
	 description.

	 In the worst case, this forces a register to be spilled in
	 one mode and reloaded in the other, which handles the
	 endianness correctly.  */
      if (sve1_p && GET_MODE_UNIT_SIZE (mode1) != GET_MODE_UNIT_SIZE (mode2))
	return false;
    }
  return true;
}
/* Implement TARGET_MODES_TIEABLE_P.  In principle we should always defer
   to aarch64_modes_compatible_p.  However due to issues with register
   allocation it is preferable to avoid tying integer scalar and FP
   scalar modes.  Executing integer operations in general registers is
   better than treating them as scalar vector operations.  This reduces
   latency and avoids redundant int<->FP moves.  So tie modes if they
   are either the same class, or one of them is a vector mode.  */

static bool
aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2)
{
  if (aarch64_modes_compatible_p (mode1, mode2))
    {
      if (GET_MODE_CLASS (mode1) == GET_MODE_CLASS (mode2))
	return true;
      if (VECTOR_MODE_P (mode1) || VECTOR_MODE_P (mode2))
	return true;
    }

  return false;
}
/* Return a new RTX holding the result of moving POINTER forward by
   AMOUNT bytes.  */

static rtx
aarch64_move_pointer (rtx pointer, poly_int64 amount)
{
  rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);

  return adjust_automodify_address (pointer, GET_MODE (pointer),
				    next, amount);
}
/* Expand a cpymem/movmem using the MOPS extension.  OPERANDS are taken
   from the cpymem/movmem pattern.  IS_MEMMOVE is true if this is a memmove
   rather than memcpy.  Return true iff we succeeded.  */
bool
aarch64_expand_cpymem_mops (rtx *operands, bool is_memmove)
{
  if (!TARGET_MOPS)
    return false;

  /* All three registers are changed by the instruction, so each one
     must be a fresh pseudo.  */
  rtx dst_addr = copy_to_mode_reg (Pmode, XEXP (operands[0], 0));
  rtx src_addr = copy_to_mode_reg (Pmode, XEXP (operands[1], 0));
  rtx dst_mem = replace_equiv_address (operands[0], dst_addr);
  rtx src_mem = replace_equiv_address (operands[1], src_addr);
  rtx sz_reg = copy_to_mode_reg (DImode, operands[2]);
  if (is_memmove)
    emit_insn (gen_aarch64_movmemdi (dst_mem, src_mem, sz_reg));
  else
    emit_insn (gen_aarch64_cpymemdi (dst_mem, src_mem, sz_reg));

  return true;
}
/* Expand cpymem/movmem, as if from a __builtin_memcpy/memmove.
   OPERANDS are taken from the cpymem/movmem pattern.  IS_MEMMOVE is true
   if this is a memmove rather than memcpy.  Return true if we succeed,
   otherwise return false, indicating that a libcall should be emitted.  */

bool
aarch64_expand_cpymem (rtx *operands, bool is_memmove)
{
  int mode_bytes;
  rtx dst = operands[0];
  rtx src = operands[1];
  unsigned align = UINTVAL (operands[3]);
  rtx base;
  machine_mode mode = BLKmode, next_mode;

  /* Variable-sized or strict-align copies may use the MOPS expansion.  */
  if (!CONST_INT_P (operands[2]) || (STRICT_ALIGNMENT && align < 16))
    return aarch64_expand_cpymem_mops (operands, is_memmove);

  unsigned HOST_WIDE_INT size = UINTVAL (operands[2]);

  /* Set inline limits for memmove/memcpy.  MOPS has a separate threshold.  */
  unsigned max_copy_size = TARGET_SIMD ? 256 : 128;
  unsigned mops_threshold = is_memmove ? aarch64_mops_memmove_size_threshold
				       : aarch64_mops_memcpy_size_threshold;

  /* Reduce the maximum size with -Os.  */
  if (optimize_function_for_size_p (cfun))
    max_copy_size /= 4;

  /* Large copies use MOPS when available or a library call.  */
  if (size > max_copy_size || (TARGET_MOPS && size > mops_threshold))
    return aarch64_expand_cpymem_mops (operands, is_memmove);

  /* Default to 32-byte LDP/STP on large copies, however small copies or
     no SIMD support fall back to 16-byte chunks.
     ??? Although it would be possible to use LDP/STP Qn in streaming mode
     (so using TARGET_BASE_SIMD instead of TARGET_SIMD), it isn't clear
     whether that would improve performance.  */
  bool use_qregs = size > 24 && TARGET_SIMD;

  base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
  dst = adjust_automodify_address (dst, VOIDmode, base, 0);

  base = copy_to_mode_reg (Pmode, XEXP (src, 0));
  src = adjust_automodify_address (src, VOIDmode, base, 0);

  auto_vec<std::pair<rtx, rtx>, 16> ops;
  int offset = 0;

  while (size > 0)
    {
      /* Find the largest mode in which to do the copy without over reading
	 or writing.  */
      opt_scalar_int_mode mode_iter;
      FOR_EACH_MODE_IN_CLASS (mode_iter, MODE_INT)
	if (GET_MODE_SIZE (mode_iter.require ()) <= MIN (size, 16))
	  mode = mode_iter.require ();

      gcc_assert (mode != BLKmode);

      mode_bytes = GET_MODE_SIZE (mode).to_constant ();

      /* Prefer Q-register accesses.  */
      if (mode_bytes == 16 && use_qregs)
	mode = V4SImode;

      rtx reg = gen_reg_rtx (mode);
      rtx load = gen_move_insn (reg, adjust_address (src, mode, offset));
      rtx store = gen_move_insn (adjust_address (dst, mode, offset), reg);
      ops.safe_push ({ load, store });
      size -= mode_bytes;
      offset += mode_bytes;

      /* Emit trailing copies using overlapping unaligned accesses
	 (when !STRICT_ALIGNMENT) - this is smaller and faster.  */
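      /* For example, a 35-byte copy is emitted as two 16-byte chunks at
	 offsets 0 and 16 plus a 4-byte chunk at offset 31, overlapping the
	 previous chunk by one byte rather than emitting separate 2-byte and
	 1-byte tail copies.  */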
      if (size > 0 && size < 16 && !STRICT_ALIGNMENT)
	{
	  next_mode = smallest_mode_for_size
	    (size * BITS_PER_UNIT, MODE_INT).require ();
	  int n_bytes = GET_MODE_SIZE (next_mode).to_constant ();
	  gcc_assert (n_bytes <= mode_bytes);
	  offset -= n_bytes - size;
	  size = n_bytes;
	}
    }

  /* Memcpy interleaves loads with stores, memmove emits all loads first.  */
  int nops = ops.length ();
  int inc = is_memmove || nops <= 8 ? nops : 6;
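  /* For example, a memcpy that needs 10 chunk copies emits 6 loads, 6
     stores, then the remaining 4 loads and 4 stores, whereas a memmove
     emits all 10 loads before any store so that overlapping source bytes
     are read before they can be overwritten.  */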
  for (int i = 0; i < nops; i += inc)
    {
      int m = MIN (nops, i + inc);
      /* Emit loads.  */
      for (int j = i; j < m; j++)
	emit_insn (ops[j].first);
      /* Emit stores.  */
      for (int j = i; j < m; j++)
	emit_insn (ops[j].second);
    }
  return true;
}
/* Expand a setmem using the MOPS instructions.  OPERANDS are the same
   as for the setmem pattern.  Return true iff we succeed.  */

static bool
aarch64_expand_setmem_mops (rtx *operands)
{
  if (!TARGET_MOPS)
    return false;

  /* The first two registers are changed by the instruction, so both
     of them must be a fresh pseudo.  */
  rtx dst_addr = copy_to_mode_reg (Pmode, XEXP (operands[0], 0));
  rtx dst_mem = replace_equiv_address (operands[0], dst_addr);
  rtx sz_reg = copy_to_mode_reg (DImode, operands[1]);
  rtx val = operands[2];
  if (val != CONST0_RTX (QImode))
    val = force_reg (QImode, val);
  emit_insn (gen_aarch64_setmemdi (dst_mem, val, sz_reg));
  return true;
}
/* Expand setmem, as if from a __builtin_memset.  Return true if
   we succeed, otherwise return false.  */

bool
aarch64_expand_setmem (rtx *operands)
{
  int mode_bytes;
  unsigned HOST_WIDE_INT len;
  rtx dst = operands[0];
  rtx val = operands[2], src;
  unsigned align = UINTVAL (operands[3]);
  rtx base;
  machine_mode mode = BLKmode, next_mode;

  /* Variable-sized or strict-align memset may use the MOPS expansion.  */
  if (!CONST_INT_P (operands[1]) || !TARGET_SIMD
      || (STRICT_ALIGNMENT && align < 16))
    return aarch64_expand_setmem_mops (operands);

  /* Set inline limits for memset.  MOPS has a separate threshold.  */
  unsigned max_set_size = MAX_SET_SIZE (optimize_function_for_speed_p (cfun));
  unsigned mops_threshold = aarch64_mops_memset_size_threshold;

  len = UINTVAL (operands[1]);

  /* Large memset uses MOPS when available or a library call.  */
  if (len > max_set_size || (TARGET_MOPS && len > mops_threshold))
    return aarch64_expand_setmem_mops (operands);

  base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
  dst = adjust_automodify_address (dst, VOIDmode, base, 0);

  /* Prepare the val using a DUP/MOVI v0.16B, val.  */
  val = expand_vector_broadcast (V16QImode, val);
  val = force_reg (V16QImode, val);

  int offset = 0;
  while (len > 0)
    {
      /* Find the largest mode in which to do the copy without
	 over writing.  */
      opt_scalar_int_mode mode_iter;
      FOR_EACH_MODE_IN_CLASS (mode_iter, MODE_INT)
	if (GET_MODE_SIZE (mode_iter.require ()) <= MIN (len, 16))
	  mode = mode_iter.require ();

      gcc_assert (mode != BLKmode);

      mode_bytes = GET_MODE_SIZE (mode).to_constant ();

      src = val;

      /* Prefer Q-register accesses.  */
      if (mode_bytes == 16)
	mode = V16QImode;
      else
	src = lowpart_subreg (mode, src, GET_MODE (val));

      emit_move_insn (adjust_address (dst, mode, offset), src);
      len -= mode_bytes;
      offset += mode_bytes;

      /* Emit trailing writes using overlapping unaligned accesses
	 (when !STRICT_ALIGNMENT) - this is smaller and faster.  */
      if (len > 0 && len < 16 && !STRICT_ALIGNMENT)
	{
	  next_mode = smallest_mode_for_size
	    (len * BITS_PER_UNIT, MODE_INT).require ();
	  int n_bytes = GET_MODE_SIZE (next_mode).to_constant ();
	  gcc_assert (n_bytes <= mode_bytes);
	  offset -= n_bytes - len;
	  len = n_bytes;
	}
    }

  return true;
}
/* Split a DImode store of a CONST_INT SRC to MEM DST as two
   SImode stores.  Handle the case when the constant has identical
   bottom and top halves.  This is beneficial when the two stores can be
   merged into an STP and we avoid synthesising potentially expensive
   immediates twice.  Return true if such a split is possible.  */

bool
aarch64_split_dimode_const_store (rtx dst, rtx src)
{
  rtx lo = gen_lowpart (SImode, src);
  rtx hi = gen_highpart_mode (SImode, DImode, src);

  if (!rtx_equal_p (lo, hi))
    return false;

  unsigned int orig_cost
    = aarch64_internal_mov_immediate (NULL_RTX, src, false, DImode);
  unsigned int lo_cost
    = aarch64_internal_mov_immediate (NULL_RTX, lo, false, SImode);

  /* We want to transform:
     MOV	x1, 49370
     MOVK	x1, 0x140, lsl 16
     MOVK	x1, 0xc0da, lsl 32
     MOVK	x1, 0x140, lsl 48
     STR	x1, [x0]
   into:
     MOV	w1, 49370
     MOVK	w1, 0x140, lsl 16
     STP	w1, w1, [x0]
   So we want to perform this when we save at least one instruction.  */
  if (orig_cost <= lo_cost)
    return false;

  rtx mem_lo = adjust_address (dst, SImode, 0);
  if (!aarch64_mem_pair_operand (mem_lo, SImode))
    return false;

  rtx tmp_reg = gen_reg_rtx (SImode);
  aarch64_expand_mov_immediate (tmp_reg, lo);
  rtx mem_hi = aarch64_move_pointer (mem_lo, GET_MODE_SIZE (SImode));
  /* Don't emit an explicit store pair as this may not always be profitable.
     Let the sched-fusion logic decide whether to merge them.  */
  emit_move_insn (mem_lo, tmp_reg);
  emit_move_insn (mem_hi, tmp_reg);

  return true;
}
/* Generate RTL for a conditional branch with rtx comparison CODE in
   mode CC_MODE.  The destination of the unlikely conditional branch
   is LABEL_REF.  */

void
aarch64_gen_unlikely_cbranch (enum rtx_code code, machine_mode cc_mode,
			      rtx label_ref)
{
  rtx x;
  x = gen_rtx_fmt_ee (code, VOIDmode,
		      gen_rtx_REG (cc_mode, CC_REGNUM),
		      const0_rtx);

  x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
			    gen_rtx_LABEL_REF (VOIDmode, label_ref),
			    pc_rtx);
  aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
}
/* Generate DImode scratch registers for 128-bit (TImode) addition.

   OP1 represents the TImode destination operand 1
   OP2 represents the TImode destination operand 2
   LOW_DEST represents the low half (DImode) of TImode operand 0
   LOW_IN1 represents the low half (DImode) of TImode operand 1
   LOW_IN2 represents the low half (DImode) of TImode operand 2
   HIGH_DEST represents the high half (DImode) of TImode operand 0
   HIGH_IN1 represents the high half (DImode) of TImode operand 1
   HIGH_IN2 represents the high half (DImode) of TImode operand 2.  */

void
aarch64_addti_scratch_regs (rtx op1, rtx op2, rtx *low_dest,
			    rtx *low_in1, rtx *low_in2,
			    rtx *high_dest, rtx *high_in1,
			    rtx *high_in2)
{
  *low_dest = gen_reg_rtx (DImode);
  *low_in1 = force_lowpart_subreg (DImode, op1, TImode);
  *low_in2 = force_lowpart_subreg (DImode, op2, TImode);
  *high_dest = gen_reg_rtx (DImode);
  *high_in1 = force_highpart_subreg (DImode, op1, TImode);
  *high_in2 = force_highpart_subreg (DImode, op2, TImode);
}
/* Generate DImode scratch registers for 128-bit (TImode) subtraction.

   OP1 represents the TImode destination operand 1
   OP2 represents the TImode destination operand 2
   LOW_DEST represents the low half (DImode) of TImode operand 0
   LOW_IN1 represents the low half (DImode) of TImode operand 1
   LOW_IN2 represents the low half (DImode) of TImode operand 2
   HIGH_DEST represents the high half (DImode) of TImode operand 0
   HIGH_IN1 represents the high half (DImode) of TImode operand 1
   HIGH_IN2 represents the high half (DImode) of TImode operand 2.  */

void
aarch64_subvti_scratch_regs (rtx op1, rtx op2, rtx *low_dest,
			     rtx *low_in1, rtx *low_in2,
			     rtx *high_dest, rtx *high_in1,
			     rtx *high_in2)
{
  *low_dest = gen_reg_rtx (DImode);
  *low_in1 = force_lowpart_subreg (DImode, op1, TImode);
  *low_in2 = force_lowpart_subreg (DImode, op2, TImode);
  *high_dest = gen_reg_rtx (DImode);

  *high_in1 = force_highpart_subreg (DImode, op1, TImode);
  *high_in2 = force_highpart_subreg (DImode, op2, TImode);
}
/* Generate RTL for 128-bit (TImode) subtraction with overflow.

   OP0 represents the TImode destination operand 0
   LOW_DEST represents the low half (DImode) of TImode operand 0
   LOW_IN1 represents the low half (DImode) of TImode operand 1
   LOW_IN2 represents the low half (DImode) of TImode operand 2
   HIGH_DEST represents the high half (DImode) of TImode operand 0
   HIGH_IN1 represents the high half (DImode) of TImode operand 1
   HIGH_IN2 represents the high half (DImode) of TImode operand 2
   UNSIGNED_P is true if the operation is being performed on unsigned
   values.  */
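/* Roughly, the general case below expands to a flag-setting SUBS on the low
   halves followed by an SBCS on the high halves, whose carry/overflow result
   feeds the TImode overflow check.  */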
void
aarch64_expand_subvti (rtx op0, rtx low_dest, rtx low_in1,
		       rtx low_in2, rtx high_dest, rtx high_in1,
		       rtx high_in2, bool unsigned_p)
{
  if (low_in2 == const0_rtx)
    {
      low_dest = low_in1;
      high_in2 = force_reg (DImode, high_in2);
      if (unsigned_p)
	emit_insn (gen_subdi3_compare1 (high_dest, high_in1, high_in2));
      else
	emit_insn (gen_subvdi_insn (high_dest, high_in1, high_in2));
    }
  else
    {
      if (aarch64_plus_immediate (low_in2, DImode))
	emit_insn (gen_subdi3_compare1_imm (low_dest, low_in1, low_in2,
					    GEN_INT (-UINTVAL (low_in2))));
      else
	{
	  low_in2 = force_reg (DImode, low_in2);
	  emit_insn (gen_subdi3_compare1 (low_dest, low_in1, low_in2));
	}
      high_in2 = force_reg (DImode, high_in2);

      if (unsigned_p)
	emit_insn (gen_usubdi3_carryinC (high_dest, high_in1, high_in2));
      else
	emit_insn (gen_subdi3_carryinV (high_dest, high_in1, high_in2));
    }

  emit_move_insn (gen_lowpart (DImode, op0), low_dest);
  emit_move_insn (gen_highpart (DImode, op0), high_dest);
}
/* Implement the TARGET_ASAN_SHADOW_OFFSET hook.  */

static unsigned HOST_WIDE_INT
aarch64_asan_shadow_offset (void)
{
  if (TARGET_ILP32)
    return (HOST_WIDE_INT_1 << 29);
  else
    return (HOST_WIDE_INT_1 << 36);
}
27447 aarch64_gen_ccmp_first (rtx_insn
**prep_seq
, rtx_insn
**gen_seq
,
27448 rtx_code code
, tree treeop0
, tree treeop1
)
27450 machine_mode op_mode
, cmp_mode
, cc_mode
= CCmode
;
27452 int unsignedp
= TYPE_UNSIGNED (TREE_TYPE (treeop0
));
27454 struct expand_operand ops
[4];
27457 expand_operands (treeop0
, treeop1
, NULL_RTX
, &op0
, &op1
, EXPAND_NORMAL
);
27459 op_mode
= GET_MODE (op0
);
27460 if (op_mode
== VOIDmode
)
27461 op_mode
= GET_MODE (op1
);
27463 if (CONST_SCALAR_INT_P (op1
))
27464 canonicalize_comparison (op_mode
, &code
, &op1
);
27472 icode
= CODE_FOR_cmpsi
;
27477 icode
= CODE_FOR_cmpdi
;
27482 cc_mode
= aarch64_select_cc_mode (code
, op0
, op1
);
27483 icode
= cc_mode
== CCFPEmode
? CODE_FOR_fcmpesf
: CODE_FOR_fcmpsf
;
27488 cc_mode
= aarch64_select_cc_mode (code
, op0
, op1
);
27489 icode
= cc_mode
== CCFPEmode
? CODE_FOR_fcmpedf
: CODE_FOR_fcmpdf
;
27497 op0
= prepare_operand (icode
, op0
, 0, op_mode
, cmp_mode
, unsignedp
);
27498 op1
= prepare_operand (icode
, op1
, 1, op_mode
, cmp_mode
, unsignedp
);
27504 *prep_seq
= get_insns ();
27507 create_fixed_operand (&ops
[0], op0
);
27508 create_fixed_operand (&ops
[1], op1
);
27511 if (!maybe_expand_insn (icode
, 2, ops
))
27516 *gen_seq
= get_insns ();
27519 return gen_rtx_fmt_ee (code
, cc_mode
,
27520 gen_rtx_REG (cc_mode
, CC_REGNUM
), const0_rtx
);
27524 aarch64_gen_ccmp_next (rtx_insn
**prep_seq
, rtx_insn
**gen_seq
, rtx prev
,
27525 rtx_code cmp_code
, tree treeop0
, tree treeop1
,
27528 rtx op0
, op1
, target
;
27529 machine_mode op_mode
, cmp_mode
, cc_mode
= CCmode
;
27530 int unsignedp
= TYPE_UNSIGNED (TREE_TYPE (treeop0
));
27532 struct expand_operand ops
[6];
27535 push_to_sequence (*prep_seq
);
27536 expand_operands (treeop0
, treeop1
, NULL_RTX
, &op0
, &op1
, EXPAND_NORMAL
);
27538 op_mode
= GET_MODE (op0
);
27539 if (op_mode
== VOIDmode
)
27540 op_mode
= GET_MODE (op1
);
27542 if (CONST_SCALAR_INT_P (op1
))
27543 canonicalize_comparison (op_mode
, &cmp_code
, &op1
);
27559 cc_mode
= aarch64_select_cc_mode (cmp_code
, op0
, op1
);
27564 cc_mode
= aarch64_select_cc_mode (cmp_code
, op0
, op1
);
27572 icode
= code_for_ccmp (cc_mode
, cmp_mode
);
27574 op0
= prepare_operand (icode
, op0
, 2, op_mode
, cmp_mode
, unsignedp
);
27575 op1
= prepare_operand (icode
, op1
, 3, op_mode
, cmp_mode
, unsignedp
);
27581 *prep_seq
= get_insns ();
27584 target
= gen_rtx_REG (cc_mode
, CC_REGNUM
);
27585 aarch64_cond
= aarch64_get_condition_code_1 (cc_mode
, cmp_code
);
27587 if (bit_code
!= AND
)
27589 /* Treat the ccmp patterns as canonical and use them where possible,
27590 but fall back to ccmp_rev patterns if there's no other option. */
27591 rtx_code prev_code
= GET_CODE (prev
);
27592 machine_mode prev_mode
= GET_MODE (XEXP (prev
, 0));
27593 if ((prev_mode
== CCFPmode
|| prev_mode
== CCFPEmode
)
27594 && !(prev_code
== EQ
27596 || prev_code
== ORDERED
27597 || prev_code
== UNORDERED
))
27598 icode
= code_for_ccmp_rev (cc_mode
, cmp_mode
);
27601 rtx_code code
= reverse_condition (prev_code
);
27602 prev
= gen_rtx_fmt_ee (code
, VOIDmode
, XEXP (prev
, 0), const0_rtx
);
27604 aarch64_cond
= AARCH64_INVERSE_CONDITION_CODE (aarch64_cond
);
27607 create_fixed_operand (&ops
[0], XEXP (prev
, 0));
27608 create_fixed_operand (&ops
[1], target
);
27609 create_fixed_operand (&ops
[2], op0
);
27610 create_fixed_operand (&ops
[3], op1
);
27611 create_fixed_operand (&ops
[4], prev
);
27612 create_fixed_operand (&ops
[5], GEN_INT (aarch64_cond
));
27614 push_to_sequence (*gen_seq
);
27615 if (!maybe_expand_insn (icode
, 6, ops
))
27621 *gen_seq
= get_insns ();
27624 return gen_rtx_fmt_ee (cmp_code
, VOIDmode
, target
, const0_rtx
);
#undef TARGET_GEN_CCMP_FIRST
#define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first

#undef TARGET_GEN_CCMP_NEXT
#define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next

/* Implement TARGET_SCHED_MACRO_FUSION_P.  Return true if target supports
   instruction fusion of some sort.  */

static bool
aarch64_macro_fusion_p (void)
{
  return aarch64_tune_params.fusible_ops != AARCH64_FUSE_NOTHING;
}
27643 /* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P. Return true if PREV and CURR
27644 should be kept together during scheduling. */
27647 aarch_macro_fusion_pair_p (rtx_insn
*prev
, rtx_insn
*curr
)
27650 rtx prev_set
= single_set (prev
);
27651 rtx curr_set
= single_set (curr
);
27652 /* prev and curr are simple SET insns i.e. no flag setting or branching. */
27653 bool simple_sets_p
= prev_set
&& curr_set
&& !any_condjump_p (curr
);
27655 if (!aarch64_macro_fusion_p ())
27658 if (simple_sets_p
&& aarch64_fusion_enabled_p (AARCH64_FUSE_MOV_MOVK
))
27660 /* We are trying to match:
27661 prev (mov) == (set (reg r0) (const_int imm16))
27662 curr (movk) == (set (zero_extract (reg r0)
27665 (const_int imm16_1)) */
27667 set_dest
= SET_DEST (curr_set
);
27669 if (GET_CODE (set_dest
) == ZERO_EXTRACT
27670 && CONST_INT_P (SET_SRC (curr_set
))
27671 && CONST_INT_P (SET_SRC (prev_set
))
27672 && CONST_INT_P (XEXP (set_dest
, 2))
27673 && INTVAL (XEXP (set_dest
, 2)) == 16
27674 && REG_P (XEXP (set_dest
, 0))
27675 && REG_P (SET_DEST (prev_set
))
27676 && REGNO (XEXP (set_dest
, 0)) == REGNO (SET_DEST (prev_set
)))
27682 if (simple_sets_p
&& aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_ADD
))
27685 /* We're trying to match:
27686 prev (adrp) == (set (reg r1)
27687 (high (symbol_ref ("SYM"))))
27688 curr (add) == (set (reg r0)
27690 (symbol_ref ("SYM"))))
27691 Note that r0 need not necessarily be the same as r1, especially
27692 during pre-regalloc scheduling. */
27694 if (satisfies_constraint_Ush (SET_SRC (prev_set
))
27695 && REG_P (SET_DEST (prev_set
)) && REG_P (SET_DEST (curr_set
)))
27697 if (GET_CODE (SET_SRC (curr_set
)) == LO_SUM
27698 && REG_P (XEXP (SET_SRC (curr_set
), 0))
27699 && REGNO (XEXP (SET_SRC (curr_set
), 0))
27700 == REGNO (SET_DEST (prev_set
))
27701 && rtx_equal_p (XEXP (SET_SRC (prev_set
), 0),
27702 XEXP (SET_SRC (curr_set
), 1)))
27707 if (simple_sets_p
&& aarch64_fusion_enabled_p (AARCH64_FUSE_MOVK_MOVK
))
27710 /* We're trying to match:
27711 prev (movk) == (set (zero_extract (reg r0)
27714 (const_int imm16_1))
27715 curr (movk) == (set (zero_extract (reg r0)
27718 (const_int imm16_2)) */
27720 if (GET_CODE (SET_DEST (prev_set
)) == ZERO_EXTRACT
27721 && GET_CODE (SET_DEST (curr_set
)) == ZERO_EXTRACT
27722 && REG_P (XEXP (SET_DEST (prev_set
), 0))
27723 && REG_P (XEXP (SET_DEST (curr_set
), 0))
27724 && REGNO (XEXP (SET_DEST (prev_set
), 0))
27725 == REGNO (XEXP (SET_DEST (curr_set
), 0))
27726 && CONST_INT_P (XEXP (SET_DEST (prev_set
), 2))
27727 && CONST_INT_P (XEXP (SET_DEST (curr_set
), 2))
27728 && INTVAL (XEXP (SET_DEST (prev_set
), 2)) == 32
27729 && INTVAL (XEXP (SET_DEST (curr_set
), 2)) == 48
27730 && CONST_INT_P (SET_SRC (prev_set
))
27731 && CONST_INT_P (SET_SRC (curr_set
)))
27735 if (simple_sets_p
&& aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_LDR
))
27737 /* We're trying to match:
27738 prev (adrp) == (set (reg r0)
27739 (high (symbol_ref ("SYM"))))
27740 curr (ldr) == (set (reg r1)
27741 (mem (lo_sum (reg r0)
27742 (symbol_ref ("SYM")))))
27744 curr (ldr) == (set (reg r1)
27747 (symbol_ref ("SYM")))))) */
27748 if (satisfies_constraint_Ush (SET_SRC (prev_set
))
27749 && REG_P (SET_DEST (prev_set
)) && REG_P (SET_DEST (curr_set
)))
27751 rtx curr_src
= SET_SRC (curr_set
);
27753 if (GET_CODE (curr_src
) == ZERO_EXTEND
)
27754 curr_src
= XEXP (curr_src
, 0);
27756 if (MEM_P (curr_src
) && GET_CODE (XEXP (curr_src
, 0)) == LO_SUM
27757 && REG_P (XEXP (XEXP (curr_src
, 0), 0))
27758 && REGNO (XEXP (XEXP (curr_src
, 0), 0))
27759 == REGNO (SET_DEST (prev_set
))
27760 && rtx_equal_p (XEXP (XEXP (curr_src
, 0), 1),
27761 XEXP (SET_SRC (prev_set
), 0)))
27766 /* Fuse compare (CMP/CMN/TST/BICS) and conditional branch. */
27767 if (aarch64_fusion_enabled_p (AARCH64_FUSE_CMP_BRANCH
)
27768 && prev_set
&& curr_set
&& any_condjump_p (curr
)
27769 && GET_CODE (SET_SRC (prev_set
)) == COMPARE
27770 && SCALAR_INT_MODE_P (GET_MODE (XEXP (SET_SRC (prev_set
), 0)))
27771 && reg_referenced_p (SET_DEST (prev_set
), PATTERN (curr
)))
27774 /* Fuse CMP and CSEL/CSET. */
27775 if (prev_set
&& curr_set
27776 && GET_CODE (SET_SRC (prev_set
)) == COMPARE
27777 && SCALAR_INT_MODE_P (GET_MODE (XEXP (SET_SRC (prev_set
), 0)))
27778 && reg_referenced_p (SET_DEST (prev_set
), PATTERN (curr
)))
27780 enum attr_type prev_type
= get_attr_type (prev
);
27781 if ((prev_type
== TYPE_ALUS_SREG
|| prev_type
== TYPE_ALUS_IMM
)
27782 && ((aarch64_fusion_enabled_p (AARCH64_FUSE_CMP_CSEL
)
27783 && GET_CODE (SET_SRC (curr_set
)) == IF_THEN_ELSE
27784 && aarch64_reg_or_zero (XEXP (SET_SRC (curr_set
), 1), VOIDmode
)
27785 && aarch64_reg_or_zero (XEXP (SET_SRC (curr_set
), 2), VOIDmode
)
27786 && SCALAR_INT_MODE_P (GET_MODE (XEXP (SET_SRC (curr_set
), 1))))
27787 || (aarch64_fusion_enabled_p (AARCH64_FUSE_CMP_CSET
)
27788 && GET_RTX_CLASS (GET_CODE (SET_SRC (curr_set
)))
27790 && REG_P (SET_DEST (curr_set
)))))
27794 /* Fuse flag-setting ALU instructions and conditional branch. */
27795 if (aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_BRANCH
)
27796 && any_condjump_p (curr
))
27798 unsigned int condreg1
, condreg2
;
27800 aarch64_fixed_condition_code_regs (&condreg1
, &condreg2
);
27801 cc_reg_1
= gen_rtx_REG (CCmode
, condreg1
);
27803 if (reg_referenced_p (cc_reg_1
, PATTERN (curr
))
27805 && modified_in_p (cc_reg_1
, prev
))
27807 enum attr_type prev_type
= get_attr_type (prev
);
	  /* FIXME: this misses some instructions which ThunderX considers
	     simple arithmetic instructions.  Simple shifts are missed
	     here.  */
27811 if (prev_type
== TYPE_ALUS_SREG
27812 || prev_type
== TYPE_ALUS_IMM
27813 || prev_type
== TYPE_LOGICS_REG
27814 || prev_type
== TYPE_LOGICS_IMM
)
27819 /* Fuse ALU instructions and CBZ/CBNZ. */
27822 && aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_CBZ
)
27823 && any_condjump_p (curr
))
27825 /* We're trying to match:
27826 prev (alu_insn) == (set (r0) plus ((r0) (r1/imm)))
27827 curr (cbz) == (set (pc) (if_then_else (eq/ne) (r0)
27829 (label_ref ("SYM"))
27831 if (SET_DEST (curr_set
) == (pc_rtx
)
27832 && GET_CODE (SET_SRC (curr_set
)) == IF_THEN_ELSE
27833 && REG_P (XEXP (XEXP (SET_SRC (curr_set
), 0), 0))
27834 && REG_P (SET_DEST (prev_set
))
27835 && REGNO (SET_DEST (prev_set
))
27836 == REGNO (XEXP (XEXP (SET_SRC (curr_set
), 0), 0)))
27838 /* Fuse ALU operations followed by conditional branch instruction. */
27839 switch (get_attr_type (prev
))
27842 case TYPE_ALU_SREG
:
27845 case TYPE_ADCS_REG
:
27846 case TYPE_ADCS_IMM
:
27847 case TYPE_LOGIC_REG
:
27848 case TYPE_LOGIC_IMM
:
27852 case TYPE_SHIFT_REG
:
27853 case TYPE_SHIFT_IMM
:
27865 /* Fuse A+B+1 and A-B-1 */
27867 && aarch64_fusion_enabled_p (AARCH64_FUSE_ADDSUB_2REG_CONST1
))
27869 /* We're trying to match:
27870 prev == (set (r0) (plus (r0) (r1)))
27871 curr == (set (r0) (plus (r0) (const_int 1)))
27873 prev == (set (r0) (minus (r0) (r1)))
27874 curr == (set (r0) (plus (r0) (const_int -1))) */
27876 rtx prev_src
= SET_SRC (prev_set
);
27877 rtx curr_src
= SET_SRC (curr_set
);
27880 if (GET_CODE (prev_src
) == MINUS
)
27883 if (GET_CODE (curr_src
) == PLUS
27884 && (GET_CODE (prev_src
) == PLUS
|| GET_CODE (prev_src
) == MINUS
)
27885 && CONST_INT_P (XEXP (curr_src
, 1))
27886 && INTVAL (XEXP (curr_src
, 1)) == polarity
27887 && REG_P (XEXP (curr_src
, 0))
27888 && REG_P (SET_DEST (prev_set
))
27889 && REGNO (SET_DEST (prev_set
)) == REGNO (XEXP (curr_src
, 0)))
/* Return true iff the instruction fusion described by OP is enabled.  */

bool
aarch64_fusion_enabled_p (enum aarch64_fusion_pairs op)
{
  return (aarch64_tune_params.fusible_ops & op) != 0;
}

/* If MEM is in the form of [base+offset], extract the two parts
   of address and set to BASE and OFFSET, otherwise return false
   after clearing BASE and OFFSET.  */

static bool
extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
{
  rtx addr;

  gcc_assert (MEM_P (mem));

  addr = XEXP (mem, 0);

  if (REG_P (addr))
    {
      *base = addr;
      *offset = const0_rtx;
      return true;
    }

  if (GET_CODE (addr) == PLUS
      && REG_P (XEXP (addr, 0)) && CONST_INT_P (XEXP (addr, 1)))
    {
      *base = XEXP (addr, 0);
      *offset = XEXP (addr, 1);
      return true;
    }

  *base = NULL_RTX;
  *offset = NULL_RTX;

  return false;
}
/* Types for scheduling fusion.  */
enum sched_fusion_type
{
  SCHED_FUSION_NONE = 0,
  SCHED_FUSION_LD_SIGN_EXTEND,
  SCHED_FUSION_LD_ZERO_EXTEND,
  SCHED_FUSION_LD,
  SCHED_FUSION_ST,
  SCHED_FUSION_NUM
};
27949 /* If INSN is a load or store of address in the form of [base+offset],
27950 extract the two parts and set to BASE and OFFSET. Return scheduling
27951 fusion type this INSN is. */
27953 static enum sched_fusion_type
27954 fusion_load_store (rtx_insn
*insn
, rtx
*base
, rtx
*offset
)
27957 enum sched_fusion_type fusion
= SCHED_FUSION_LD
;
27959 gcc_assert (INSN_P (insn
));
27960 x
= PATTERN (insn
);
27961 if (GET_CODE (x
) != SET
)
27962 return SCHED_FUSION_NONE
;
27965 dest
= SET_DEST (x
);
27967 machine_mode dest_mode
= GET_MODE (dest
);
27969 if (!aarch64_mode_valid_for_sched_fusion_p (dest_mode
))
27970 return SCHED_FUSION_NONE
;
27972 if (GET_CODE (src
) == SIGN_EXTEND
)
27974 fusion
= SCHED_FUSION_LD_SIGN_EXTEND
;
27975 src
= XEXP (src
, 0);
27976 if (!MEM_P (src
) || GET_MODE (src
) != SImode
)
27977 return SCHED_FUSION_NONE
;
27979 else if (GET_CODE (src
) == ZERO_EXTEND
)
27981 fusion
= SCHED_FUSION_LD_ZERO_EXTEND
;
27982 src
= XEXP (src
, 0);
27983 if (!MEM_P (src
) || GET_MODE (src
) != SImode
)
27984 return SCHED_FUSION_NONE
;
27987 if (MEM_P (src
) && REG_P (dest
))
27988 extract_base_offset_in_addr (src
, base
, offset
);
27989 else if (MEM_P (dest
) && (REG_P (src
) || src
== const0_rtx
))
27991 fusion
= SCHED_FUSION_ST
;
27992 extract_base_offset_in_addr (dest
, base
, offset
);
27995 return SCHED_FUSION_NONE
;
27997 if (*base
== NULL_RTX
|| *offset
== NULL_RTX
)
27998 fusion
= SCHED_FUSION_NONE
;
/* Implement the TARGET_SCHED_FUSION_PRIORITY hook.

   Currently we only support fusing ldr or str instructions, so FUSION_PRI
   and PRI are only calculated for these instructions.  For other
   instructions, FUSION_PRI and PRI are simply set to MAX_PRI - 1.  In the
   future, other types of instruction fusion can be added by returning
   different priorities.

   It's important that irrelevant instructions get the largest FUSION_PRI.  */
28013 aarch64_sched_fusion_priority (rtx_insn
*insn
, int max_pri
,
28014 int *fusion_pri
, int *pri
)
28018 enum sched_fusion_type fusion
;
28020 gcc_assert (INSN_P (insn
));
28023 fusion
= fusion_load_store (insn
, &base
, &offset
);
28024 if (fusion
== SCHED_FUSION_NONE
)
28031 /* Set FUSION_PRI according to fusion type and base register. */
28032 *fusion_pri
= tmp
- fusion
* FIRST_PSEUDO_REGISTER
- REGNO (base
);
28034 /* Calculate PRI. */
28037 /* INSN with smaller offset goes first. */
28038 off_val
= (int)(INTVAL (offset
));
28040 tmp
-= (off_val
& 0xfffff);
28042 tmp
+= ((- off_val
) & 0xfffff);
28048 /* Implement the TARGET_SCHED_ADJUST_PRIORITY hook.
28049 Adjust priority of sha1h instructions so they are scheduled before
28050 other SHA1 instructions. */
28053 aarch64_sched_adjust_priority (rtx_insn
*insn
, int priority
)
28055 rtx x
= PATTERN (insn
);
28057 if (GET_CODE (x
) == SET
)
28061 if (GET_CODE (x
) == UNSPEC
&& XINT (x
, 1) == UNSPEC_SHA1H
)
28062 return priority
+ 10;
28068 /* If REVERSED is null, return true if memory reference *MEM2 comes
28069 immediately after memory reference *MEM1. Do not change the references
28072 Otherwise, check if *MEM1 and *MEM2 are consecutive memory references and,
28073 if they are, try to make them use constant offsets from the same base
28074 register. Return true on success. When returning true, set *REVERSED
28075 to true if *MEM1 comes after *MEM2, false if *MEM1 comes before *MEM2. */
28077 aarch64_check_consecutive_mems (rtx
*mem1
, rtx
*mem2
, bool *reversed
)
28082 if (GET_RTX_CLASS (GET_CODE (XEXP (*mem1
, 0))) == RTX_AUTOINC
28083 || GET_RTX_CLASS (GET_CODE (XEXP (*mem2
, 0))) == RTX_AUTOINC
)
28086 if (!MEM_SIZE_KNOWN_P (*mem1
) || !MEM_SIZE_KNOWN_P (*mem2
))
28089 auto size1
= MEM_SIZE (*mem1
);
28090 auto size2
= MEM_SIZE (*mem2
);
28092 rtx base1
, base2
, offset1
, offset2
;
28093 extract_base_offset_in_addr (*mem1
, &base1
, &offset1
);
28094 extract_base_offset_in_addr (*mem2
, &base2
, &offset2
);
28096 /* Make sure at least one memory is in base+offset form. */
28097 if (!(base1
&& offset1
) && !(base2
&& offset2
))
28100 /* If both mems already use the same base register, just check the
28102 if (base1
&& base2
&& rtx_equal_p (base1
, base2
))
28104 if (!offset1
|| !offset2
)
28107 if (known_eq (UINTVAL (offset1
) + size1
, UINTVAL (offset2
)))
28110 if (known_eq (UINTVAL (offset2
) + size2
, UINTVAL (offset1
)) && reversed
)
28119 /* Otherwise, check whether the MEM_EXPRs and MEM_OFFSETs together
28120 guarantee that the values are consecutive. */
28121 if (MEM_EXPR (*mem1
)
28122 && MEM_EXPR (*mem2
)
28123 && MEM_OFFSET_KNOWN_P (*mem1
)
28124 && MEM_OFFSET_KNOWN_P (*mem2
))
28126 poly_int64 expr_offset1
;
28127 poly_int64 expr_offset2
;
28128 tree expr_base1
= get_addr_base_and_unit_offset (MEM_EXPR (*mem1
),
28130 tree expr_base2
= get_addr_base_and_unit_offset (MEM_EXPR (*mem2
),
28134 || !DECL_P (expr_base1
)
28135 || !operand_equal_p (expr_base1
, expr_base2
, OEP_ADDRESS_OF
))
28138 expr_offset1
+= MEM_OFFSET (*mem1
);
28139 expr_offset2
+= MEM_OFFSET (*mem2
);
28141 if (known_eq (expr_offset1
+ size1
, expr_offset2
))
28143 else if (known_eq (expr_offset2
+ size2
, expr_offset1
) && reversed
)
28152 rtx addr1
= plus_constant (Pmode
, XEXP (*mem2
, 0),
28153 expr_offset1
- expr_offset2
);
28154 *mem1
= replace_equiv_address_nv (*mem1
, addr1
);
28158 rtx addr2
= plus_constant (Pmode
, XEXP (*mem1
, 0),
28159 expr_offset2
- expr_offset1
);
28160 *mem2
= replace_equiv_address_nv (*mem2
, addr2
);
/* Test if MODE is suitable for a single transfer register in an ldp or stp
   instruction.  */
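/* For example, SImode, DImode, DFmode and 16-byte modes such as TImode or
   V4SImode pass this test, whereas modes that need more than one vector
   register, or whose size is not 4, 8 or 16 bytes, do not.  */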
bool
aarch64_ldpstp_operand_mode_p (machine_mode mode)
{
  if (!targetm.hard_regno_mode_ok (V0_REGNUM, mode)
      || hard_regno_nregs (V0_REGNUM, mode) > 1)
    return false;

  const auto size = GET_MODE_SIZE (mode);
  return known_eq (size, 4) || known_eq (size, 8) || known_eq (size, 16);
}
28183 /* Return true if MEM1 and MEM2 can be combined into a single access
28184 of mode MODE, with the combined access having the same address as MEM1. */
28187 aarch64_mergeable_load_pair_p (machine_mode mode
, rtx mem1
, rtx mem2
)
28189 if (STRICT_ALIGNMENT
&& MEM_ALIGN (mem1
) < GET_MODE_ALIGNMENT (mode
))
28191 return aarch64_check_consecutive_mems (&mem1
, &mem2
, nullptr);
28194 /* Return true if MEM agrees with the ldp-stp policy model.
28195 Otherwise, false. */
28198 aarch64_mem_ok_with_ldpstp_policy_model (rtx mem
, bool load
, machine_mode mode
)
28200 auto policy
= (load
28201 ? aarch64_tune_params
.ldp_policy_model
28202 : aarch64_tune_params
.stp_policy_model
);
28204 /* If we have AARCH64_LDP_STP_POLICY_NEVER, reject the load pair. */
28205 if (policy
== AARCH64_LDP_STP_POLICY_NEVER
)
28208 /* If we have AARCH64_LDP_STP_POLICY_ALIGNED,
28209 do not emit the load pair unless the alignment is checked to be
28210 at least double the alignment of the type. */
28211 if (policy
== AARCH64_LDP_STP_POLICY_ALIGNED
28212 && !optimize_function_for_size_p (cfun
)
28213 && MEM_ALIGN (mem
) < 2 * GET_MODE_ALIGNMENT (mode
))
28219 /* Given OPERANDS of consecutive load/store, check if we can merge
28220 them into ldp/stp. LOAD is true if they are load instructions. */
28223 aarch64_operands_ok_for_ldpstp (rtx
*operands
, bool load
)
28225 enum reg_class rclass_1
, rclass_2
;
28226 rtx mem_1
, mem_2
, reg_1
, reg_2
;
28230 mem_1
= operands
[1];
28231 mem_2
= operands
[3];
28232 reg_1
= operands
[0];
28233 reg_2
= operands
[2];
28234 gcc_assert (REG_P (reg_1
) && REG_P (reg_2
));
28235 if (REGNO (reg_1
) == REGNO (reg_2
))
28237 if (reg_overlap_mentioned_p (reg_1
, mem_2
))
28242 mem_1
= operands
[0];
28243 mem_2
= operands
[2];
28244 reg_1
= operands
[1];
28245 reg_2
= operands
[3];
28248 /* The mems cannot be volatile. */
28249 if (MEM_VOLATILE_P (mem_1
) || MEM_VOLATILE_P (mem_2
))
28252 /* Check if the addresses are in the form of [base+offset]. */
28253 bool reversed
= false;
28254 if (!aarch64_check_consecutive_mems (&mem_1
, &mem_2
, &reversed
))
28257 /* The operands must be of the same size. */
28258 gcc_assert (known_eq (GET_MODE_SIZE (GET_MODE (mem_1
)),
28259 GET_MODE_SIZE (GET_MODE (mem_2
))));
28261 /* The lower memory access must be a mem-pair operand. */
28262 rtx lower_mem
= reversed
? mem_2
: mem_1
;
28263 machine_mode lower_mem_mode
= GET_MODE (lower_mem
);
28264 if (!aarch64_mem_pair_operand (lower_mem
, lower_mem_mode
))
28267 /* Check if lower_mem is ok with the ldp-stp policy model. */
28268 if (!aarch64_mem_ok_with_ldpstp_policy_model (lower_mem
, load
,
28272 if (REG_P (reg_1
) && FP_REGNUM_P (REGNO (reg_1
)))
28273 rclass_1
= FP_REGS
;
28275 rclass_1
= GENERAL_REGS
;
28277 if (REG_P (reg_2
) && FP_REGNUM_P (REGNO (reg_2
)))
28278 rclass_2
= FP_REGS
;
28280 rclass_2
= GENERAL_REGS
;
28282 /* Check if the registers are of same class. */
28283 if (rclass_1
!= rclass_2
)
28289 /* Given OPERANDS of consecutive load/store that can be merged,
28290 swap them if they are not in ascending order. */
28292 aarch64_swap_ldrstr_operands (rtx
* operands
, bool load
)
28294 int mem_op
= load
? 1 : 0;
28295 bool reversed
= false;
28296 if (!aarch64_check_consecutive_mems (operands
+ mem_op
,
28297 operands
+ mem_op
+ 2, &reversed
))
28298 gcc_unreachable ();
28302 /* Irrespective of whether this is a load or a store,
28303 we do the same swap. */
28304 std::swap (operands
[0], operands
[2]);
28305 std::swap (operands
[1], operands
[3]);
28309 /* Helper function used for generation of load/store pair instructions, called
28310 from peepholes in aarch64-ldpstp.md. OPERANDS is an array of
28311 operands as matched by the peepholes in that file. LOAD_P is true if we're
28312 generating a load pair, otherwise we're generating a store pair. CODE is
28313 either {ZERO,SIGN}_EXTEND for extending loads or UNKNOWN if we're generating a
28314 standard load/store pair. */
28317 aarch64_finish_ldpstp_peephole (rtx
*operands
, bool load_p
, enum rtx_code code
)
28319 aarch64_swap_ldrstr_operands (operands
, load_p
);
28322 emit_insn (aarch64_gen_load_pair (operands
[0], operands
[2],
28323 operands
[1], code
));
28326 gcc_assert (code
== UNKNOWN
);
28327 emit_insn (aarch64_gen_store_pair (operands
[0], operands
[1],
28332 /* Taking X and Y to be HOST_WIDE_INT pointers, return the result of a
28333 comparison between the two. */
28335 aarch64_host_wide_int_compare (const void *x
, const void *y
)
28337 return wi::cmps (* ((const HOST_WIDE_INT
*) x
),
28338 * ((const HOST_WIDE_INT
*) y
));
28341 /* Taking X and Y to be pairs of RTX, one pointing to a MEM rtx and the
28342 other pointing to a REG rtx containing an offset, compare the offsets
28347 1 iff offset (X) > offset (Y)
28348 0 iff offset (X) == offset (Y)
28349 -1 iff offset (X) < offset (Y) */
28351 aarch64_ldrstr_offset_compare (const void *x
, const void *y
)
28353 const rtx
* operands_1
= (const rtx
*) x
;
28354 const rtx
* operands_2
= (const rtx
*) y
;
28355 rtx mem_1
, mem_2
, base
, offset_1
, offset_2
;
28357 if (MEM_P (operands_1
[0]))
28358 mem_1
= operands_1
[0];
28360 mem_1
= operands_1
[1];
28362 if (MEM_P (operands_2
[0]))
28363 mem_2
= operands_2
[0];
28365 mem_2
= operands_2
[1];
28367 /* Extract the offsets. */
28368 extract_base_offset_in_addr (mem_1
, &base
, &offset_1
);
28369 extract_base_offset_in_addr (mem_2
, &base
, &offset_2
);
28371 gcc_assert (offset_1
!= NULL_RTX
&& offset_2
!= NULL_RTX
);
28373 return wi::cmps (INTVAL (offset_1
), INTVAL (offset_2
));
28376 /* Given OPERANDS of consecutive load/store, check if we can merge
28377 them into ldp/stp by adjusting the offset. LOAD is true if they
28378 are load instructions. MODE is the mode of memory operands.
28380 Given below consecutive stores:
28382 str w1, [xb, 0x100]
28383 str w1, [xb, 0x104]
28384 str w1, [xb, 0x108]
28385 str w1, [xb, 0x10c]
28387 Though the offsets are out of the range supported by stp, we can
28388 still pair them after adjusting the offset, like:
28390 add scratch, xb, 0x100
28391 stp w1, w1, [scratch]
28392 stp w1, w1, [scratch, 0x8]
   The peephole patterns detecting this opportunity should guarantee
   the scratch register is available.  */
28398 aarch64_operands_adjust_ok_for_ldpstp (rtx
*operands
, bool load
,
28401 const int num_insns
= 4;
28402 enum reg_class rclass
;
28403 HOST_WIDE_INT offvals
[num_insns
], msize
;
28404 rtx mem
[num_insns
], reg
[num_insns
], base
[num_insns
], offset
[num_insns
];
28408 for (int i
= 0; i
< num_insns
; i
++)
28410 reg
[i
] = operands
[2 * i
];
28411 mem
[i
] = operands
[2 * i
+ 1];
28413 gcc_assert (REG_P (reg
[i
]));
28416 /* Do not attempt to merge the loads if the loads clobber each other. */
28417 for (int i
= 0; i
< 8; i
+= 2)
28418 for (int j
= i
+ 2; j
< 8; j
+= 2)
28419 if (reg_overlap_mentioned_p (operands
[i
], operands
[j
]))
28423 for (int i
= 0; i
< num_insns
; i
++)
28425 mem
[i
] = operands
[2 * i
];
28426 reg
[i
] = operands
[2 * i
+ 1];
28429 /* Skip if memory operand is by itself valid for ldp/stp. */
28430 if (!MEM_P (mem
[0]) || aarch64_mem_pair_operand (mem
[0], mode
))
28433 for (int i
= 0; i
< num_insns
; i
++)
28435 /* The mems cannot be volatile. */
28436 if (MEM_VOLATILE_P (mem
[i
]))
28439 /* Check if the addresses are in the form of [base+offset]. */
28440 extract_base_offset_in_addr (mem
[i
], base
+ i
, offset
+ i
);
28441 if (base
[i
] == NULL_RTX
|| offset
[i
] == NULL_RTX
)
28445 /* Check if the registers are of same class. */
28446 rclass
= REG_P (reg
[0]) && FP_REGNUM_P (REGNO (reg
[0]))
28447 ? FP_REGS
: GENERAL_REGS
;
28449 for (int i
= 1; i
< num_insns
; i
++)
28450 if (REG_P (reg
[i
]) && FP_REGNUM_P (REGNO (reg
[i
])))
28452 if (rclass
!= FP_REGS
)
28457 if (rclass
!= GENERAL_REGS
)
28461 /* Only the last register in the order in which they occur
28462 may be clobbered by the load. */
28463 if (rclass
== GENERAL_REGS
&& load
)
28464 for (int i
= 0; i
< num_insns
- 1; i
++)
28465 if (reg_mentioned_p (reg
[i
], mem
[i
]))
28468 /* Check if the bases are same. */
28469 for (int i
= 0; i
< num_insns
- 1; i
++)
28470 if (!rtx_equal_p (base
[i
], base
[i
+ 1]))
28473 for (int i
= 0; i
< num_insns
; i
++)
28474 offvals
[i
] = INTVAL (offset
[i
]);
28476 msize
= GET_MODE_SIZE (mode
).to_constant ();
28478 /* Check if the offsets can be put in the right order to do a ldp/stp. */
28479 qsort (offvals
, num_insns
, sizeof (HOST_WIDE_INT
),
28480 aarch64_host_wide_int_compare
);
28482 if (!(offvals
[1] == offvals
[0] + msize
28483 && offvals
[3] == offvals
[2] + msize
))
28486 /* Check that offsets are within range of each other. The ldp/stp
28487 instructions have 7 bit immediate offsets, so use 0x80. */
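  /* For example, with SImode accesses (msize == 4) the sorted offsets must
     all fall within a 4 * 0x80 = 512 byte window for both adjusted ldp/stp
     offsets to stay representable.  */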
28488 if (offvals
[2] - offvals
[0] >= msize
* 0x80)
28491 /* The offsets must be aligned with respect to each other. */
28492 if (offvals
[0] % msize
!= offvals
[2] % msize
)
28495 /* Check if mem[0] is ok with the ldp-stp policy model. */
28496 if (!aarch64_mem_ok_with_ldpstp_policy_model (mem
[0], load
, mode
))
28502 /* Given OPERANDS of consecutive load/store, this function pairs them
28503 into LDP/STP after adjusting the offset. It depends on the fact
28504 that the operands can be sorted so the offsets are correct for STP.
28505 MODE is the mode of memory operands. CODE is the rtl operator
28506 which should be applied to all memory operands, it's SIGN_EXTEND,
28507 ZERO_EXTEND or UNKNOWN. */
28510 aarch64_gen_adjusted_ldpstp (rtx
*operands
, bool load
,
28511 machine_mode mode
, RTX_CODE code
)
28513 rtx base
, offset_1
, offset_2
;
28515 rtx temp_operands
[8];
28516 HOST_WIDE_INT off_val_1
, off_val_2
, base_off
, new_off_1
, new_off_2
,
28517 stp_off_upper_limit
, stp_off_lower_limit
, msize
;
28519 /* We make changes on a copy as we may still bail out. */
28520 for (int i
= 0; i
< 8; i
++)
28521 temp_operands
[i
] = operands
[i
];
  /* Sort the operands.  Note for cases as below:
       [base + 0x310] = A
       [base + 0x320] = B
       [base + 0x330] = C
       [base + 0x320] = D
     We need stable sorting, otherwise wrong data may be stored to offset
     0x320.  Also note the dead store in the above case should be optimized
     away, but there are no guarantees here.  */
28531 gcc_stablesort(temp_operands
, 4, 2 * sizeof (rtx
*),
28532 aarch64_ldrstr_offset_compare
);
28534 /* Copy the memory operands so that if we have to bail for some
28535 reason the original addresses are unchanged. */
28538 mem_1
= copy_rtx (temp_operands
[1]);
28539 mem_2
= copy_rtx (temp_operands
[5]);
28543 mem_1
= copy_rtx (temp_operands
[0]);
28544 mem_2
= copy_rtx (temp_operands
[4]);
28545 gcc_assert (code
== UNKNOWN
);
28548 extract_base_offset_in_addr (mem_1
, &base
, &offset_1
);
28549 extract_base_offset_in_addr (mem_2
, &base
, &offset_2
);
28550 gcc_assert (base
!= NULL_RTX
&& offset_1
!= NULL_RTX
28551 && offset_2
!= NULL_RTX
);
28553 /* Adjust offset so it can fit in LDP/STP instruction. */
28554 msize
= GET_MODE_SIZE (mode
).to_constant();
28555 stp_off_upper_limit
= msize
* (0x40 - 1);
28556 stp_off_lower_limit
= - msize
* 0x40;
28558 off_val_1
= INTVAL (offset_1
);
28559 off_val_2
= INTVAL (offset_2
);
28561 /* The base offset is optimally half way between the two STP/LDP offsets. */
28563 base_off
= (off_val_1
+ off_val_2
) / 2;
28565 /* However, due to issues with negative LDP/STP offset generation for
28566 larger modes, for DF, DD, DI and vector modes. we must not use negative
28567 addresses smaller than 9 signed unadjusted bits can store. This
28568 provides the most range in this case. */
28569 base_off
= off_val_1
;
28571 /* Adjust the base so that it is aligned with the addresses but still
28573 if (base_off
% msize
!= off_val_1
% msize
)
28574 /* Fix the offset, bearing in mind we want to make it bigger not
28576 base_off
+= (((base_off
% msize
) - (off_val_1
% msize
)) + msize
) % msize
;
28577 else if (msize
<= 4)
28578 /* The negative range of LDP/STP is one larger than the positive range. */
28581 /* Check if base offset is too big or too small. We can attempt to resolve
28582 this issue by setting it to the maximum value and seeing if the offsets
28584 if (base_off
>= 0x1000)
28586 base_off
= 0x1000 - 1;
28587 /* We must still make sure that the base offset is aligned with respect
28588 to the address. But it may not be made any bigger. */
28589 base_off
-= (((base_off
% msize
) - (off_val_1
% msize
)) + msize
) % msize
;
28592 /* Likewise for the case where the base is too small. */
28593 if (base_off
<= -0x1000)
28595 base_off
= -0x1000 + 1;
28596 base_off
+= (((base_off
% msize
) - (off_val_1
% msize
)) + msize
) % msize
;
28599 /* Offset of the first STP/LDP. */
28600 new_off_1
= off_val_1
- base_off
;
28602 /* Offset of the second STP/LDP. */
28603 new_off_2
= off_val_2
- base_off
;
28605 /* The offsets must be within the range of the LDP/STP instructions. */
28606 if (new_off_1
> stp_off_upper_limit
|| new_off_1
< stp_off_lower_limit
28607 || new_off_2
> stp_off_upper_limit
|| new_off_2
< stp_off_lower_limit
)
28610 replace_equiv_address_nv (mem_1
, plus_constant (Pmode
, operands
[8],
28612 replace_equiv_address_nv (mem_2
, plus_constant (Pmode
, operands
[8],
28615 if (!aarch64_mem_pair_operand (mem_1
, mode
)
28616 || !aarch64_mem_pair_operand (mem_2
, mode
))
28621 operands
[0] = temp_operands
[0];
28622 operands
[1] = mem_1
;
28623 operands
[2] = temp_operands
[2];
28624 operands
[4] = temp_operands
[4];
28625 operands
[5] = mem_2
;
28626 operands
[6] = temp_operands
[6];
28630 operands
[0] = mem_1
;
28631 operands
[1] = temp_operands
[1];
28632 operands
[3] = temp_operands
[3];
28633 operands
[4] = mem_2
;
28634 operands
[5] = temp_operands
[5];
28635 operands
[7] = temp_operands
[7];
28638 /* Emit adjusting instruction. */
28639 emit_insn (gen_rtx_SET (operands
[8], plus_constant (DImode
, base
, base_off
)));
28640 /* Emit ldp/stp instructions. */
28643 emit_insn (aarch64_gen_load_pair (operands
[0], operands
[2],
28644 operands
[1], code
));
28645 emit_insn (aarch64_gen_load_pair (operands
[4], operands
[6],
28646 operands
[5], code
));
28650 emit_insn (aarch64_gen_store_pair (operands
[0], operands
[1],
28652 emit_insn (aarch64_gen_store_pair (operands
[4], operands
[5],
28658 /* Implement TARGET_VECTORIZE_CONDITIONAL_OPERATION_IS_EXPENSIVE. Assume that
28659 predicated operations when available are beneficial. */
28662 aarch64_conditional_operation_is_expensive (unsigned)
28667 /* Implement TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE. Assume for now that
28668 it isn't worth branching around empty masked ops (including masked
28672 aarch64_empty_mask_is_expensive (unsigned)
/* Return 1 if pseudo register should be created and used to hold
   GOT address for PIC code.  */

static bool
aarch64_use_pseudo_pic_reg (void)
{
  return aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC;
}
/* Implement TARGET_UNSPEC_MAY_TRAP_P.  */

static int
aarch64_unspec_may_trap_p (const_rtx x, unsigned flags)
{
  switch (XINT (x, 1))
    {
    case UNSPEC_GOTSMALLPIC:
    case UNSPEC_GOTSMALLPIC28K:
    case UNSPEC_GOTTINYPIC:
      return 0;
    default:
      break;
    }

  return default_unspec_may_trap_p (x, flags);
}
/* If X is a positive CONST_DOUBLE with a value that is a power of 2
   return the log2 of that value.  Otherwise return -1.  */

int
aarch64_fpconst_pow_of_2 (rtx x)
{
  const REAL_VALUE_TYPE *r;

  if (!CONST_DOUBLE_P (x))
    return -1;

  r = CONST_DOUBLE_REAL_VALUE (x);

  if (REAL_VALUE_NEGATIVE (*r)
      || REAL_VALUE_ISNAN (*r)
      || REAL_VALUE_ISINF (*r)
      || !real_isinteger (r, DFmode))
    return -1;

  return exact_log2 (real_to_integer (r));
}
28727 /* If X is a positive CONST_DOUBLE with a value that is the reciprocal of a
28728 power of 2 (i.e 1/2^n) return the number of float bits. e.g. for x==(1/2^n)
28729 return n. Otherwise return -1. */
28732 aarch64_fpconst_pow2_recip (rtx x
)
28734 REAL_VALUE_TYPE r0
;
28736 if (!CONST_DOUBLE_P (x
))
28739 r0
= *CONST_DOUBLE_REAL_VALUE (x
);
28740 if (exact_real_inverse (DFmode
, &r0
)
28741 && !REAL_VALUE_NEGATIVE (r0
))
28743 int ret
= exact_log2 (real_to_integer (&r0
));
28744 if (ret
>= 1 && ret
<= 32)
28750 /* If X is a vector of equal CONST_DOUBLE values and that value is
28751 Y, return the aarch64_fpconst_pow_of_2 of Y. Otherwise return -1. */
28754 aarch64_vec_fpconst_pow_of_2 (rtx x
)
28757 if (!CONST_VECTOR_P (x
)
28758 || !CONST_VECTOR_NUNITS (x
).is_constant (&nelts
))
28761 if (GET_MODE_CLASS (GET_MODE (x
)) != MODE_VECTOR_FLOAT
)
28764 int firstval
= aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x
, 0));
28768 for (int i
= 1; i
< nelts
; i
++)
28769 if (aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x
, i
)) != firstval
)
/* Implement TARGET_PROMOTED_TYPE to promote 16-bit floating point types
   to float.

   __fp16 always promotes through this hook.
   _Float16 may promote if TARGET_FLT_EVAL_METHOD is 16, but we do that
   through the generic excess precision logic rather than here.  */

static tree
aarch64_promoted_type (const_tree t)
{
  if (SCALAR_FLOAT_TYPE_P (t)
      && TYPE_MAIN_VARIANT (t) == aarch64_fp16_type_node)
    return float_type_node;

  return NULL_TREE;
}
28792 /* Implement the TARGET_OPTAB_SUPPORTED_P hook. */
28795 aarch64_optab_supported_p (int op
, machine_mode mode1
, machine_mode
,
28796 optimization_type opt_type
)
28801 return opt_type
== OPTIMIZE_FOR_SPEED
&& use_rsqrt_p (mode1
);
28808 /* Implement the TARGET_DWARF_POLY_INDETERMINATE_VALUE hook. */
28810 static unsigned int
28811 aarch64_dwarf_poly_indeterminate_value (unsigned int i
, unsigned int *factor
,
28814 /* Polynomial invariant 1 == (VG / 2) - 1. */
28815 gcc_assert (i
== 1);
28818 return AARCH64_DWARF_VG
;
28821 /* Implement TARGET_LIBGCC_FLOATING_POINT_MODE_SUPPORTED_P - return TRUE
28822 if MODE is [BH]Fmode, and punt to the generic implementation otherwise. */
28825 aarch64_libgcc_floating_mode_supported_p (scalar_float_mode mode
)
28827 return ((mode
== HFmode
|| mode
== BFmode
)
28829 : default_libgcc_floating_mode_supported_p (mode
));
28832 /* Implement TARGET_SCALAR_MODE_SUPPORTED_P - return TRUE
28833 if MODE is [BH]Fmode, and punt to the generic implementation otherwise. */
28836 aarch64_scalar_mode_supported_p (scalar_mode mode
)
28838 if (DECIMAL_FLOAT_MODE_P (mode
))
28839 return default_decimal_float_supported_p ();
28841 return ((mode
== HFmode
|| mode
== BFmode
)
28843 : default_scalar_mode_supported_p (mode
));
28846 /* Set the value of FLT_EVAL_METHOD.
28847 ISO/IEC TS 18661-3 defines two values that we'd like to make use of:
28849 0: evaluate all operations and constants, whose semantic type has at
28850 most the range and precision of type float, to the range and
28851 precision of float; evaluate all other operations and constants to
28852 the range and precision of the semantic type;
28854 N, where _FloatN is a supported interchange floating type
28855 evaluate all operations and constants, whose semantic type has at
28856 most the range and precision of _FloatN type, to the range and
28857 precision of the _FloatN type; evaluate all other operations and
28858 constants to the range and precision of the semantic type;
28860 If we have the ARMv8.2-A extensions then we support _Float16 in native
28861 precision, so we should set this to 16. Otherwise, we support the type,
28862 but want to evaluate expressions in float precision, so set this to
28865 static enum flt_eval_method
28866 aarch64_excess_precision (enum excess_precision_type type
)
28870 case EXCESS_PRECISION_TYPE_FAST
:
28871 case EXCESS_PRECISION_TYPE_STANDARD
:
28872 /* We can calculate either in 16-bit range and precision or
28873 32-bit range and precision. Make that decision based on whether
28874 we have native support for the ARMv8.2-A 16-bit floating-point
28875 instructions or not. */
28876 return (TARGET_FP_F16INST
28877 ? FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16
28878 : FLT_EVAL_METHOD_PROMOTE_TO_FLOAT
);
28879 case EXCESS_PRECISION_TYPE_IMPLICIT
:
28880 case EXCESS_PRECISION_TYPE_FLOAT16
:
28881 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16
;
28883 gcc_unreachable ();
28885 return FLT_EVAL_METHOD_UNPREDICTABLE
;
28888 /* Implement TARGET_C_BITINT_TYPE_INFO.
28889 Return true if _BitInt(N) is supported and fill its details into *INFO. */
28891 aarch64_bitint_type_info (int n
, struct bitint_info
*info
)
28893 if (TARGET_BIG_END
)
28897 info
->limb_mode
= QImode
;
28899 info
->limb_mode
= HImode
;
28901 info
->limb_mode
= SImode
;
28903 info
->limb_mode
= DImode
;
28905 info
->limb_mode
= TImode
;
28907 /* The AAPCS for AArch64 defines _BitInt(N > 128) as an array with
28908 type {signed,unsigned} __int128[M] where M*128 >= N. However, to be
28909 able to use libgcc's implementation to support large _BitInt's we need
28910 to use a LIMB_MODE that is no larger than 'long long'. This is why we
28911 use DImode for our internal LIMB_MODE and we define the ABI_LIMB_MODE to
28912 be TImode to ensure we are ABI compliant. */
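/* For example, under this scheme a _BitInt(200) is passed and returned as
   if it were an unsigned __int128[2] (ABI_LIMB_MODE of TImode), while the
   libgcc support routines operate on 64-bit DImode limbs internally.  */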
28913 info
->limb_mode
= DImode
;
28916 info
->abi_limb_mode
= TImode
;
28918 info
->abi_limb_mode
= info
->limb_mode
;
28919 info
->big_endian
= TARGET_BIG_END
;
28920 info
->extended
= false;
/* Implement TARGET_C_MODE_FOR_FLOATING_TYPE.  Return TFmode for
   TI_LONG_DOUBLE_TYPE which is for long double type, go with the default
   one for the others.  */

static machine_mode
aarch64_c_mode_for_floating_type (enum tree_index ti)
{
  if (ti == TI_LONG_DOUBLE_TYPE)
    return TFmode;
  return default_mode_for_floating_type (ti);
}
28936 /* Implement TARGET_SCHED_CAN_SPECULATE_INSN. Return true if INSN can be
28937 scheduled for speculative execution. Reject the long-running division
28938 and square-root instructions. */
28941 aarch64_sched_can_speculate_insn (rtx_insn
*insn
)
28943 switch (get_attr_type (insn
))
28951 case TYPE_NEON_FP_SQRT_S
:
28952 case TYPE_NEON_FP_SQRT_D
:
28953 case TYPE_NEON_FP_SQRT_S_Q
:
28954 case TYPE_NEON_FP_SQRT_D_Q
:
28955 case TYPE_NEON_FP_DIV_S
:
28956 case TYPE_NEON_FP_DIV_D
:
28957 case TYPE_NEON_FP_DIV_S_Q
:
28958 case TYPE_NEON_FP_DIV_D_Q
:
/* Implement TARGET_COMPUTE_PRESSURE_CLASSES.  */

static int
aarch64_compute_pressure_classes (reg_class *classes)
{
  int i = 0;
  classes[i++] = GENERAL_REGS;
  classes[i++] = FP_REGS;
  /* PR_REGS isn't a useful pressure class because many predicate pseudo
     registers need to go in PR_LO_REGS at some point during their
     lifetime.  Splitting it into two halves has the effect of making
     all predicates count against PR_LO_REGS, so that we try whenever
     possible to restrict the number of live predicates to 8.  This
     greatly reduces the amount of spilling in certain loops.  */
  classes[i++] = PR_LO_REGS;
  classes[i++] = PR_HI_REGS;
  return i;
}
/* Implement TARGET_CAN_CHANGE_MODE_CLASS.  */

static bool
aarch64_can_change_mode_class (machine_mode from,
			       machine_mode to, reg_class_t)
{
  return aarch64_modes_compatible_p (from, to);
}
/* Implement TARGET_EARLY_REMAT_MODES.  */

static void
aarch64_select_early_remat_modes (sbitmap modes)
{
  /* SVE values are not normally live across a call, so it should be
     worth doing early rematerialization even in VL-specific mode.  */
  for (int i = 0; i < NUM_MACHINE_MODES; ++i)
    if (aarch64_sve_mode_p ((machine_mode) i))
      bitmap_set_bit (modes, i);
}
/* Override the default target speculation_safe_value.  */

static rtx
aarch64_speculation_safe_value (machine_mode mode,
				rtx result, rtx val, rtx failval)
{
  /* Maybe we should warn if falling back to hard barriers.  They are
     likely to be noticeably more expensive than the alternative below.  */
  if (!aarch64_track_speculation)
    return default_speculation_safe_value (mode, result, val, failval);

  if (!REG_P (val))
    val = copy_to_mode_reg (mode, val);

  if (!aarch64_reg_or_zero (failval, mode))
    failval = copy_to_mode_reg (mode, failval);

  emit_insn (gen_despeculate_copy (mode, result, val, failval));
  return result;
}
29025 /* Implement TARGET_ESTIMATED_POLY_VALUE.
29026 Look into the tuning structure for an estimate.
29027 KIND specifies the type of requested estimate: min, max or likely.
29028 For cores with a known SVE width all three estimates are the same.
29029 For generic SVE tuning we want to distinguish the maximum estimate from
29030 the minimum and likely ones.
29031 The likely estimate is the same as the minimum in that case to give a
29032 conservative behavior of auto-vectorizing with SVE when it is a win
29033 even for 128-bit SVE.
29034 When SVE width information is available VAL.coeffs[1] is multiplied by
29035 the number of VQ chunks over the initial Advanced SIMD 128 bits. */
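/* For example, with generic SVE tuning (SVE_SCALABLE) a poly_int value of
   4 + 4x is estimated as 4 for the minimum and likely cases and as
   4 + 4 * 15 = 64 for the maximum, whereas a core known to use 256-bit SVE
   yields 4 + 4 * (256 - 128) / 128 = 8 for all three estimates.  */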
29037 static HOST_WIDE_INT
29038 aarch64_estimated_poly_value (poly_int64 val
,
29039 poly_value_estimate_kind kind
29040 = POLY_VALUE_LIKELY
)
29042 unsigned int width_source
= aarch64_tune_params
.sve_width
;
29044 /* If there is no core-specific information then the minimum and likely
29045 values are based on 128-bit vectors and the maximum is based on
29046 the architectural maximum of 2048 bits. */
29047 if (width_source
== SVE_SCALABLE
)
29050 case POLY_VALUE_MIN
:
29051 case POLY_VALUE_LIKELY
:
29052 return val
.coeffs
[0];
29053 case POLY_VALUE_MAX
:
29054 return val
.coeffs
[0] + val
.coeffs
[1] * 15;
29057 /* Allow sve_width to be a bitmask of different VL, treating the lowest
29058 as likely. This could be made more general if future -mtune options
29060 if (kind
== POLY_VALUE_MAX
)
29061 width_source
= 1 << floor_log2 (width_source
);
29063 width_source
= least_bit_hwi (width_source
);
29065 /* If the core provides width information, use that. */
29066 HOST_WIDE_INT over_128
= width_source
- 128;
29067 return val
.coeffs
[0] + val
.coeffs
[1] * over_128
/ 128;
29071 /* Return true for types that could be supported as SIMD return or
29075 supported_simd_type (tree t
)
29077 if (SCALAR_FLOAT_TYPE_P (t
) || INTEGRAL_TYPE_P (t
) || POINTER_TYPE_P (t
))
29079 HOST_WIDE_INT s
= tree_to_shwi (TYPE_SIZE_UNIT (t
));
29080 return s
== 1 || s
== 2 || s
== 4 || s
== 8;
29085 /* Determine the lane size for the clone argument/return type. This follows
29086 the LS(P) rule in the VFABIA64. */
29089 lane_size (cgraph_simd_clone_arg_type clone_arg_type
, tree type
)
29091 gcc_assert (clone_arg_type
!= SIMD_CLONE_ARG_TYPE_MASK
);
29093 /* For non map-to-vector types that are pointers we use the element type it
29095 if (POINTER_TYPE_P (type
))
29096 switch (clone_arg_type
)
29100 case SIMD_CLONE_ARG_TYPE_UNIFORM
:
29101 case SIMD_CLONE_ARG_TYPE_LINEAR_CONSTANT_STEP
:
29102 case SIMD_CLONE_ARG_TYPE_LINEAR_VARIABLE_STEP
:
29103 type
= TREE_TYPE (type
);
29107 /* For types (or pointers of non map-to-vector types point to) that are
29108 integers or floating point, we use their size if they are 1, 2, 4 or 8.
29110 if (INTEGRAL_TYPE_P (type
)
29111 || SCALAR_FLOAT_TYPE_P (type
))
29112 switch (TYPE_PRECISION (type
) / BITS_PER_UNIT
)
29120 return TYPE_PRECISION (type
);
29122 /* For any other we use the size of uintptr_t. For map-to-vector types that
29123 are pointers, using the size of uintptr_t is the same as using the size of
29124 their type, seeing all pointers are the same size as uintptr_t. */
29125 return POINTER_SIZE
;
29129 /* Implement TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN. */
29132 aarch64_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node
*node
,
29133 struct cgraph_simd_clone
*clonei
,
29134 tree base_type ATTRIBUTE_UNUSED
,
29135 int num
, bool explicit_p
)
29138 unsigned int nds_elt_bits
, wds_elt_bits
;
29139 unsigned HOST_WIDE_INT const_simdlen
;
29144 /* For now, SVE simdclones won't produce illegal simdlen, So only check
29145 const simdlens here. */
29146 if (maybe_ne (clonei
->simdlen
, 0U)
29147 && clonei
->simdlen
.is_constant (&const_simdlen
)
29148 && (const_simdlen
< 2
29149 || const_simdlen
> 1024
29150 || (const_simdlen
& (const_simdlen
- 1)) != 0))
29153 warning_at (DECL_SOURCE_LOCATION (node
->decl
), 0,
29154 "unsupported simdlen %wd", const_simdlen
);
29158 ret_type
= TREE_TYPE (TREE_TYPE (node
->decl
));
29159 /* According to AArch64's Vector ABI the type that determines the simdlen is
29160 the narrowest of types, so we ignore base_type for AArch64. */
29161 if (TREE_CODE (ret_type
) != VOID_TYPE
29162 && !supported_simd_type (ret_type
))
29166 else if (COMPLEX_FLOAT_TYPE_P (ret_type
))
29167 warning_at (DECL_SOURCE_LOCATION (node
->decl
), 0,
29168 "GCC does not currently support return type %qT "
29169 "for simd", ret_type
);
29171 warning_at (DECL_SOURCE_LOCATION (node
->decl
), 0,
29172 "unsupported return type %qT for simd",
29177 auto_vec
<std::pair
<tree
, unsigned int>> vec_elts (clonei
->nargs
+ 1);
29179 /* We are looking for the NDS type here according to the VFABIA64. */
29180 if (TREE_CODE (ret_type
) != VOID_TYPE
)
29182 nds_elt_bits
= lane_size (SIMD_CLONE_ARG_TYPE_VECTOR
, ret_type
);
29183 wds_elt_bits
= nds_elt_bits
;
29184 vec_elts
.safe_push (std::make_pair (ret_type
, nds_elt_bits
));
29188 nds_elt_bits
= POINTER_SIZE
;
29193 tree type_arg_types
= TYPE_ARG_TYPES (TREE_TYPE (node
->decl
));
29194 bool decl_arg_p
= (node
->definition
|| type_arg_types
== NULL_TREE
);
29195 for (t
= (decl_arg_p
? DECL_ARGUMENTS (node
->decl
) : type_arg_types
), i
= 0;
29196 t
&& t
!= void_list_node
; t
= TREE_CHAIN (t
), i
++)
29198 tree type
= decl_arg_p
? TREE_TYPE (t
) : TREE_VALUE (t
);
29199 if (clonei
->args
[i
].arg_type
!= SIMD_CLONE_ARG_TYPE_UNIFORM
29200 && !supported_simd_type (type
))
29204 else if (COMPLEX_FLOAT_TYPE_P (type
))
29205 warning_at (DECL_SOURCE_LOCATION (node
->decl
), 0,
29206 "GCC does not currently support argument type %qT "
29209 warning_at (DECL_SOURCE_LOCATION (node
->decl
), 0,
29210 "unsupported argument type %qT for simd",
29214 unsigned lane_bits
= lane_size (clonei
->args
[i
].arg_type
, type
);
29215 if (clonei
->args
[i
].arg_type
== SIMD_CLONE_ARG_TYPE_VECTOR
)
29216 vec_elts
.safe_push (std::make_pair (type
, lane_bits
));
29217 if (nds_elt_bits
> lane_bits
)
29218 nds_elt_bits
= lane_bits
;
29219 if (wds_elt_bits
< lane_bits
)
29220 wds_elt_bits
= lane_bits
;
29223 /* If we could not determine the WDS type from available parameters/return,
29224 then fallback to using uintptr_t. */
29225 if (wds_elt_bits
== 0)
29226 wds_elt_bits
= POINTER_SIZE
;
29228 clonei
->mask_mode
= VOIDmode
;
29229 poly_uint64 simdlen
;
29234 } aarch64_clone_info
;
29235 auto_vec
<aarch64_clone_info
, 3> clones
;
29237 /* Keep track of the possible simdlens the clones of this function can have,
29238 and check them later to see if we support them. */
29239 if (known_eq (clonei
->simdlen
, 0U))
29241 simdlen
= exact_div (poly_uint64 (64), nds_elt_bits
);
29242 if (maybe_ne (simdlen
, 1U))
29243 clones
.safe_push ({simdlen
, 'n'});
29244 clones
.safe_push ({simdlen
* 2, 'n'});
29245 /* Only create an SVE simd clone if we aren't dealing with an unprototyped
29247 We have also disabled support for creating SVE simdclones for functions
29248 with function bodies and any simdclones when -msve-vector-bits is used.
29249 TODO: add support for these. */
29250 if (prototype_p (TREE_TYPE (node
->decl
))
29251 && !node
->definition
29252 && !aarch64_sve_vg
.is_constant ())
29253 clones
.safe_push ({exact_div (BITS_PER_SVE_VECTOR
, wds_elt_bits
), 's'});
29256 clones
.safe_push ({clonei
->simdlen
, 'n'});
29258 clonei
->vecsize_int
= 0;
29259 clonei
->vecsize_float
= 0;
29261 /* We currently do not support generating simdclones where vector arguments
29262 do not fit into a single vector register, i.e. vector types that are more
29263 than 128-bits large. This is because of how we currently represent such
29264 types in ACLE, where we use a struct to allow us to pass them as arguments
29266 Hence why we have to check whether the simdlens available for this
29267 simdclone would cause a vector type to be larger than 128-bits, and reject
29270 while (j
< clones
.length ())
29272 bool remove_simdlen
= false;
29273 for (auto elt
: vec_elts
)
29274 if (clones
[j
].mangle
== 'n'
29275 && known_gt (clones
[j
].len
* elt
.second
, 128U))
29277 /* Don't issue a warning for every simdclone when there is no
29278 specific simdlen clause. */
29279 if (explicit_p
&& maybe_ne (clonei
->simdlen
, 0U))
29280 warning_at (DECL_SOURCE_LOCATION (node
->decl
), 0,
29281 "GCC does not currently support simdlen %wd for "
29283 constant_lower_bound (clones
[j
].len
), elt
.first
);
29284 remove_simdlen
= true;
29287 if (remove_simdlen
)
29288 clones
.ordered_remove (j
);
29293 int count
= clones
.length ();
29296 if (explicit_p
&& known_eq (clonei
->simdlen
, 0U))
29298 /* Warn the user if we can't generate any simdclone. */
29299 simdlen
= exact_div (poly_uint64 (64), nds_elt_bits
);
29300 warning_at (DECL_SOURCE_LOCATION (node
->decl
), 0,
29301 "GCC does not currently support a simdclone with simdlens"
29302 " %wd and %wd for these types.",
29303 constant_lower_bound (simdlen
),
29304 constant_lower_bound (simdlen
*2));
29309 gcc_assert (num
< count
);
29310 clonei
->simdlen
= clones
[num
].len
;
29311 clonei
->vecsize_mangle
= clones
[num
].mangle
;
29312 /* SVE simdclones always have a Mask, so set inbranch to 1. */
29313 if (clonei
->vecsize_mangle
== 's')
29314 clonei
->inbranch
= 1;
/* Helper function to adjust an SVE vector type of an SVE simd clone.  Returns
   an SVE vector type based on the element type of the vector TYPE, with SIMDLEN
   number of elements.  If IS_MASK, returns an SVE mask type appropriate for use
   with the SVE type it would otherwise return.  */

static tree
simd_clone_adjust_sve_vector_type (tree type, bool is_mask, poly_uint64 simdlen)
{
  unsigned int num_zr = 0;
  unsigned int num_pr = 0;
  machine_mode vector_mode;
  type = TREE_TYPE (type);
  scalar_mode scalar_m = SCALAR_TYPE_MODE (type);
  vector_mode = aarch64_sve_data_mode (scalar_m, simdlen).require ();
  type = build_vector_type_for_mode (type, vector_mode);
  if (is_mask)
    {
      type = truth_type_for (type);
      num_pr = 1;
    }
  else
    num_zr = 1;

  /* We create new types here with the SVE type attribute instead of using ACLE
     types as we need to support unpacked vectors which aren't available as
     ACLE SVE types.  */

  /* ??? This creates anonymous "SVE type" attributes for all types,
     even those that correspond to <arm_sve.h> types.  This affects type
     compatibility in C/C++, but not in gimple.  (Gimple type equivalence
     is instead decided by TARGET_COMPATIBLE_VECTOR_TYPES_P.)

     Thus a C/C++ definition of the implementation function will have a
     different function type from the declaration that this code creates.
     However, it doesn't seem worth trying to fix that until we have a
     way of handling implementations that operate on unpacked types.  */
  type = build_distinct_type_copy (type);
  aarch64_sve::add_sve_type_attribute (type, num_zr, num_pr, NULL, NULL);
  return type;
}
29359 /* Implement TARGET_SIMD_CLONE_ADJUST. */
29361 aarch64_simd_clone_adjust (struct cgraph_node
*node
)
29363 tree t
= TREE_TYPE (node
->decl
);
29365 if (node
->simdclone
->vecsize_mangle
== 's')
29367 /* This is additive and has no effect if SVE, or a superset thereof, is
29368 already enabled. */
29369 tree target
= build_string (strlen ("+sve") + 1, "+sve");
29370 if (!aarch64_option_valid_attribute_p (node
->decl
, NULL_TREE
, target
, 0))
29371 gcc_unreachable ();
29372 push_function_decl (node
->decl
);
29376 /* Add aarch64_vector_pcs target attribute to SIMD clones so they
29377 use the correct ABI. */
29378 TYPE_ATTRIBUTES (t
) = make_attribute ("aarch64_vector_pcs", "default",
29379 TYPE_ATTRIBUTES (t
));
29382 cgraph_simd_clone
*sc
= node
->simdclone
;
29384 for (unsigned i
= 0; i
< sc
->nargs
; ++i
)
29386 bool is_mask
= false;
29388 switch (sc
->args
[i
].arg_type
)
29390 case SIMD_CLONE_ARG_TYPE_MASK
:
29392 gcc_fallthrough ();
29393 case SIMD_CLONE_ARG_TYPE_VECTOR
:
29394 case SIMD_CLONE_ARG_TYPE_LINEAR_VAL_CONSTANT_STEP
:
29395 case SIMD_CLONE_ARG_TYPE_LINEAR_VAL_VARIABLE_STEP
:
29396 type
= sc
->args
[i
].vector_type
;
29397 gcc_assert (VECTOR_TYPE_P (type
));
29398 if (node
->simdclone
->vecsize_mangle
== 's')
29399 type
= simd_clone_adjust_sve_vector_type (type
, is_mask
,
29401 sc
->args
[i
].vector_type
= type
;
29407 if (node
->simdclone
->vecsize_mangle
== 's')
29409 tree ret_type
= TREE_TYPE (t
);
29410 if (VECTOR_TYPE_P (ret_type
))
29412 = simd_clone_adjust_sve_vector_type (ret_type
, false,
29413 node
->simdclone
->simdlen
);
29414 pop_function_decl ();
/* Implement TARGET_SIMD_CLONE_USABLE.  */

static int
aarch64_simd_clone_usable (struct cgraph_node *node, machine_mode vector_mode)
{
  switch (node->simdclone->vecsize_mangle)
    {
    case 'n':
      if (!TARGET_SIMD || aarch64_sve_mode_p (vector_mode))
	return -1;
      return 0;
    case 's':
      if (!TARGET_SVE
	  || !aarch64_sve_mode_p (vector_mode))
	return -1;
      return 0;
    default:
      gcc_unreachable ();
    }
}
/* Implement TARGET_COMP_TYPE_ATTRIBUTES  */

static int
aarch64_comp_type_attributes (const_tree type1, const_tree type2)
{
  auto check_attr = [&](const char *ns, const char *name) {
    tree attr1 = lookup_attribute (ns, name, TYPE_ATTRIBUTES (type1));
    tree attr2 = lookup_attribute (ns, name, TYPE_ATTRIBUTES (type2));
    if (!attr1 && !attr2)
      return true;

    return attr1 && attr2 && attribute_value_equal (attr1, attr2);
  };

  if (!check_attr ("gnu", "aarch64_vector_pcs"))
    return 0;
  if (!check_attr ("gnu", "indirect_return"))
    return 0;
  if (!check_attr ("gnu", "Advanced SIMD type"))
    return 0;
  if (!check_attr ("gnu", "SVE type"))
    return 0;
  if (!check_attr ("gnu", "SVE sizeless type"))
    return 0;
  if (!check_attr ("arm", "streaming"))
    return 0;
  if (!check_attr ("arm", "streaming_compatible"))
    return 0;
  if (aarch64_lookup_shared_state_flags (TYPE_ATTRIBUTES (type1), "za")
      != aarch64_lookup_shared_state_flags (TYPE_ATTRIBUTES (type2), "za"))
    return 0;
  if (aarch64_lookup_shared_state_flags (TYPE_ATTRIBUTES (type1), "zt0")
      != aarch64_lookup_shared_state_flags (TYPE_ATTRIBUTES (type2), "zt0"))
    return 0;
  return 1;
}
/* Implement TARGET_MERGE_DECL_ATTRIBUTES.  */

static tree
aarch64_merge_decl_attributes (tree olddecl, tree newdecl)
{
  tree old_attrs = DECL_ATTRIBUTES (olddecl);
  tree old_new = lookup_attribute ("arm", "new", old_attrs);

  tree new_attrs = DECL_ATTRIBUTES (newdecl);
  tree new_new = lookup_attribute ("arm", "new", new_attrs);

  if (DECL_INITIAL (olddecl) && new_new)
    {
      error ("cannot apply attribute %qs to %q+D after the function"
	     " has been defined", "new", newdecl);
      inform (DECL_SOURCE_LOCATION (olddecl), "%q+D defined here",
	      newdecl);
    }
  else
    {
      if (old_new && new_new)
	{
	  old_attrs = remove_attribute ("arm", "new", old_attrs);
	  TREE_VALUE (new_new) = chainon (TREE_VALUE (new_new),
					  TREE_VALUE (old_new));
	}
      if (new_new)
	aarch64_check_arm_new_against_type (TREE_VALUE (new_new), newdecl);
    }

  return merge_attributes (old_attrs, new_attrs);
}
/* Implement TARGET_GET_MULTILIB_ABI_NAME */

static const char *
aarch64_get_multilib_abi_name (void)
{
  if (TARGET_BIG_END)
    return TARGET_ILP32 ? "aarch64_be_ilp32" : "aarch64_be";
  return TARGET_ILP32 ? "aarch64_ilp32" : "aarch64";
}
/* Implement TARGET_STACK_PROTECT_GUARD.  In case of a
   global variable based guard use the default else
   return a null tree.  */
static tree
aarch64_stack_protect_guard (void)
{
  if (aarch64_stack_protector_guard == SSP_GLOBAL)
    return default_stack_protect_guard ();

  return NULL_TREE;
}
/* Implement TARGET_INVALID_UNARY_OP.  */

static const char *
aarch64_invalid_unary_op (int op, const_tree type)
{
  /* Reject all single-operand operations on __mfp8 except for &.  */
  if (TYPE_MAIN_VARIANT (type) == aarch64_mfp8_type_node && op != ADDR_EXPR)
    return N_ ("operation not permitted on type %<mfloat8_t%>");

  /* Operation allowed.  */
  return NULL;
}
/* Implement TARGET_INVALID_BINARY_OP.  */

static const char *
aarch64_invalid_binary_op (int op ATTRIBUTE_UNUSED, const_tree type1,
			   const_tree type2)
{
  if (VECTOR_TYPE_P (type1)
      && VECTOR_TYPE_P (type2)
      && !TYPE_INDIVISIBLE_P (type1)
      && !TYPE_INDIVISIBLE_P (type2)
      && (aarch64_sve::builtin_type_p (type1)
	  != aarch64_sve::builtin_type_p (type2)))
    return N_("cannot combine GNU and SVE vectors in a binary operation");

  /* Reject all 2-operand operations on __mfp8.  */
  if (TYPE_MAIN_VARIANT (type1) == aarch64_mfp8_type_node
      || TYPE_MAIN_VARIANT (type2) == aarch64_mfp8_type_node)
    return N_ ("operation not permitted on type %<mfloat8_t%>");

  /* Operation allowed.  */
  return NULL;
}
/* Implement TARGET_MEMTAG_CAN_TAG_ADDRESSES.  Here we tell the rest of the
   compiler that we automatically ignore the top byte of our pointers, which
   allows using -fsanitize=hwaddress.  */
static bool
aarch64_can_tag_addresses ()
{
  return !TARGET_ILP32;
}
/* Implement TARGET_ASM_FILE_END for AArch64.  This adds the AArch64 GNU NOTE
   section at the end if needed.  */
#define GNU_PROPERTY_AARCH64_FEATURE_1_AND	0xc0000000
#define GNU_PROPERTY_AARCH64_FEATURE_1_BTI	(1U << 0)
#define GNU_PROPERTY_AARCH64_FEATURE_1_PAC	(1U << 1)
#define GNU_PROPERTY_AARCH64_FEATURE_1_GCS	(1U << 2)
void
aarch64_file_end_indicate_exec_stack ()
{
  file_end_indicate_exec_stack ();

  unsigned feature_1_and = 0;
  if (aarch_bti_enabled ())
    feature_1_and |= GNU_PROPERTY_AARCH64_FEATURE_1_BTI;

  if (aarch_ra_sign_scope != AARCH_FUNCTION_NONE)
    feature_1_and |= GNU_PROPERTY_AARCH64_FEATURE_1_PAC;

  if (aarch64_gcs_enabled ())
    feature_1_and |= GNU_PROPERTY_AARCH64_FEATURE_1_GCS;

  if (feature_1_and)
    {
      /* Generate .note.gnu.property section.  */
      switch_to_section (get_section (".note.gnu.property",
				      SECTION_NOTYPE, NULL));

      /* PT_NOTE header: namesz, descsz, type.
	 namesz = 4 ("GNU\0")
	 descsz = 16 (Size of the program property array)
		  [(12 + padding) * Number of array elements]
	 type   = 5 (NT_GNU_PROPERTY_TYPE_0).  */
      assemble_align (POINTER_SIZE);
      assemble_integer (GEN_INT (4), 4, 32, 1);
      assemble_integer (GEN_INT (ROUND_UP (12, POINTER_BYTES)), 4, 32, 1);
      assemble_integer (GEN_INT (5), 4, 32, 1);

      /* PT_NOTE name.  */
      assemble_string ("GNU", 4);

      /* PT_NOTE contents for NT_GNU_PROPERTY_TYPE_0:
	 type   = GNU_PROPERTY_AARCH64_FEATURE_1_AND
	 datasz = 4
	 data   = feature_1_and.  */
      assemble_integer (GEN_INT (GNU_PROPERTY_AARCH64_FEATURE_1_AND), 4, 32, 1);
      assemble_integer (GEN_INT (4), 4, 32, 1);
      assemble_integer (GEN_INT (feature_1_and), 4, 32, 1);

      /* Pad the size of the note to the required alignment.  */
      assemble_align (POINTER_SIZE);
    }
}
#undef GNU_PROPERTY_AARCH64_FEATURE_1_GCS
#undef GNU_PROPERTY_AARCH64_FEATURE_1_PAC
#undef GNU_PROPERTY_AARCH64_FEATURE_1_BTI
#undef GNU_PROPERTY_AARCH64_FEATURE_1_AND
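
/* For reference, with BTI and PAC enabled (and LP64) the note emitted above
   corresponds roughly to the following assembly; this is an illustrative
   sketch, the exact directives depend on the object format:

	.section	.note.gnu.property
	.p2align	3
	.word	4		// namesz ("GNU\0")
	.word	16		// descsz
	.word	5		// NT_GNU_PROPERTY_TYPE_0
	.asciz	"GNU"
	.word	0xc0000000	// GNU_PROPERTY_AARCH64_FEATURE_1_AND
	.word	4		// datasz
	.word	3		// BTI | PAC
	.p2align	3  */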
/* Helper function for straight line speculation.
   Return what barrier should be emitted for straight line speculation
   mitigation.
   When not mitigating against straight line speculation this function returns
   an empty string.
   When mitigating against straight line speculation, use:
   * SB when the v8.5-A SB extension is enabled.
   * DSB+ISB otherwise.  */
const char *
aarch64_sls_barrier (int mitigation_required)
{
  return mitigation_required
    ? (TARGET_SB ? "sb" : "dsb\tsy\n\tisb")
    : "";
}
static GTY (()) tree aarch64_sls_shared_thunks[30];
static GTY (()) bool aarch64_sls_shared_thunks_needed = false;
const char *indirect_symbol_names[30] = {
    "__call_indirect_x0",
    "__call_indirect_x1",
    "__call_indirect_x2",
    "__call_indirect_x3",
    "__call_indirect_x4",
    "__call_indirect_x5",
    "__call_indirect_x6",
    "__call_indirect_x7",
    "__call_indirect_x8",
    "__call_indirect_x9",
    "__call_indirect_x10",
    "__call_indirect_x11",
    "__call_indirect_x12",
    "__call_indirect_x13",
    "__call_indirect_x14",
    "__call_indirect_x15",
    "", /* "__call_indirect_x16",  */
    "", /* "__call_indirect_x17",  */
    "__call_indirect_x18",
    "__call_indirect_x19",
    "__call_indirect_x20",
    "__call_indirect_x21",
    "__call_indirect_x22",
    "__call_indirect_x23",
    "__call_indirect_x24",
    "__call_indirect_x25",
    "__call_indirect_x26",
    "__call_indirect_x27",
    "__call_indirect_x28",
    "__call_indirect_x29",
};
29684 /* Function to create a BLR thunk. This thunk is used to mitigate straight
29685 line speculation. Instead of a simple BLR that can be speculated past,
29686 we emit a BL to this thunk, and this thunk contains a BR to the relevant
29687 register. These thunks have the relevant speculation barries put after
29688 their indirect branch so that speculation is blocked.
29690 We use such a thunk so the speculation barriers are kept off the
29691 architecturally executed path in order to reduce the performance overhead.
29693 When optimizing for size we use stubs shared by the linked object.
29694 When optimizing for performance we emit stubs for each function in the hope
29695 that the branch predictor can better train on jumps specific for a given
29698 aarch64_sls_create_blr_label (int regnum
)
29700 gcc_assert (STUB_REGNUM_P (regnum
));
29701 if (optimize_function_for_size_p (cfun
))
29703 /* For the thunks shared between different functions in this compilation
29704 unit we use a named symbol -- this is just for users to more easily
29705 understand the generated assembly. */
29706 aarch64_sls_shared_thunks_needed
= true;
29707 const char *thunk_name
= indirect_symbol_names
[regnum
];
29708 if (aarch64_sls_shared_thunks
[regnum
] == NULL
)
29710 /* Build a decl representing this function stub and record it for
29711 later. We build a decl here so we can use the GCC machinery for
29712 handling sections automatically (through `get_named_section` and
29713 `make_decl_one_only`). That saves us a lot of trouble handling
29714 the specifics of different output file formats. */
29715 tree decl
= build_decl (BUILTINS_LOCATION
, FUNCTION_DECL
,
29716 get_identifier (thunk_name
),
29717 build_function_type_list (void_type_node
,
29719 DECL_RESULT (decl
) = build_decl (BUILTINS_LOCATION
, RESULT_DECL
,
29720 NULL_TREE
, void_type_node
);
29721 TREE_PUBLIC (decl
) = 1;
29722 TREE_STATIC (decl
) = 1;
29723 DECL_IGNORED_P (decl
) = 1;
29724 DECL_ARTIFICIAL (decl
) = 1;
29725 make_decl_one_only (decl
, DECL_ASSEMBLER_NAME (decl
));
29726 resolve_unique_section (decl
, 0, false);
29727 aarch64_sls_shared_thunks
[regnum
] = decl
;
29730 return gen_rtx_SYMBOL_REF (Pmode
, thunk_name
);
29733 if (cfun
->machine
->call_via
[regnum
] == NULL
)
29734 cfun
->machine
->call_via
[regnum
]
29735 = gen_rtx_LABEL_REF (Pmode
, gen_label_rtx ());
29736 return cfun
->machine
->call_via
[regnum
];
/* Helper function for aarch64_sls_emit_blr_function_thunks and
   aarch64_sls_emit_shared_blr_thunks below.  */
static void
aarch64_sls_emit_function_stub (FILE *out_file, int regnum)
{
  /* Save in x16 and branch to that function so this transformation does
     not prevent jumping to `BTI c` instructions.  */
  asm_fprintf (out_file, "\tmov\tx16, x%d\n", regnum);
  asm_fprintf (out_file, "\tbr\tx16\n");
}
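
/* As an illustration, the stub emitted for x1 is just:

	mov	x16, x1
	br	x16

   followed by a speculation barrier that the callers below append
   (aarch64_sls_barrier for the per-function stubs, DSB SY + ISB for the
   shared ones).  */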
29750 /* Emit all BLR stubs for this particular function.
29751 Here we emit all the BLR stubs needed for the current function. Since we
29752 emit these stubs in a consecutive block we know there will be no speculation
29753 gadgets between each stub, and hence we only emit a speculation barrier at
29754 the end of the stub sequences.
29756 This is called in the TARGET_ASM_FUNCTION_EPILOGUE hook. */
29758 aarch64_sls_emit_blr_function_thunks (FILE *out_file
)
29760 if (! aarch64_harden_sls_blr_p ())
29763 bool any_functions_emitted
= false;
29764 /* We must save and restore the current function section since this assembly
29765 is emitted at the end of the function. This means it can be emitted *just
29766 after* the cold section of a function. That cold part would be emitted in
29767 a different section. That switch would trigger a `.cfi_endproc` directive
29768 to be emitted in the original section and a `.cfi_startproc` directive to
29769 be emitted in the new section. Switching to the original section without
29770 restoring would mean that the `.cfi_endproc` emitted as a function ends
29771 would happen in a different section -- leaving an unmatched
29772 `.cfi_startproc` in the cold text section and an unmatched `.cfi_endproc`
29773 in the standard text section. */
29774 section
*save_text_section
= in_section
;
29775 switch_to_section (function_section (current_function_decl
));
29776 for (int regnum
= 0; regnum
< 30; ++regnum
)
29778 rtx specu_label
= cfun
->machine
->call_via
[regnum
];
29779 if (specu_label
== NULL
)
29782 targetm
.asm_out
.print_operand (out_file
, specu_label
, 0);
29783 asm_fprintf (out_file
, ":\n");
29784 aarch64_sls_emit_function_stub (out_file
, regnum
);
29785 any_functions_emitted
= true;
29787 if (any_functions_emitted
)
29788 /* Can use the SB if needs be here, since this stub will only be used
29789 by the current function, and hence for the current target. */
29790 asm_fprintf (out_file
, "\t%s\n", aarch64_sls_barrier (true));
29791 switch_to_section (save_text_section
);
29794 /* Emit shared BLR stubs for the current compilation unit.
29795 Over the course of compiling this unit we may have converted some BLR
29796 instructions to a BL to a shared stub function. This is where we emit those
29798 This function is for the stubs shared between different functions in this
29799 compilation unit. We share when optimizing for size instead of speed.
29801 This function is called through the TARGET_ASM_FILE_END hook. */
29803 aarch64_sls_emit_shared_blr_thunks (FILE *out_file
)
29805 if (! aarch64_sls_shared_thunks_needed
)
29808 for (int regnum
= 0; regnum
< 30; ++regnum
)
29810 tree decl
= aarch64_sls_shared_thunks
[regnum
];
29814 const char *name
= indirect_symbol_names
[regnum
];
29815 switch_to_section (get_named_section (decl
, NULL
, 0));
29816 ASM_OUTPUT_ALIGN (out_file
, 2);
29817 targetm
.asm_out
.globalize_label (out_file
, name
);
29818 /* Only emits if the compiler is configured for an assembler that can
29819 handle visibility directives. */
29820 targetm
.asm_out
.assemble_visibility (decl
, VISIBILITY_HIDDEN
);
29821 ASM_OUTPUT_TYPE_DIRECTIVE (out_file
, name
, "function");
29822 ASM_OUTPUT_LABEL (out_file
, name
);
29823 aarch64_sls_emit_function_stub (out_file
, regnum
);
29824 /* Use the most conservative target to ensure it can always be used by any
29825 function in the translation unit. */
29826 asm_fprintf (out_file
, "\tdsb\tsy\n\tisb\n");
29827 ASM_DECLARE_FUNCTION_SIZE (out_file
, name
, decl
);
/* Implement TARGET_ASM_FILE_END.  */
void
aarch64_asm_file_end ()
{
  aarch64_sls_emit_shared_blr_thunks (asm_out_file);
  /* Since this function will be called for the ASM_FILE_END hook, we ensure
     that what would be called otherwise (e.g. `file_end_indicate_exec_stack`
     for FreeBSD) still gets called.  */
#ifdef TARGET_ASM_FILE_END
  TARGET_ASM_FILE_END ();
#endif
}
const char *
aarch64_indirect_call_asm (rtx addr)
{
  gcc_assert (REG_P (addr));
  if (aarch64_harden_sls_blr_p ())
    {
      rtx stub_label = aarch64_sls_create_blr_label (REGNO (addr));
      output_asm_insn ("bl\t%0", &stub_label);
    }
  else
    output_asm_insn ("blr\t%0", &addr);
  return "";
}
/* Emit the assembly instruction to load the thread pointer into DEST.
   Select between different tpidr_elN registers depending on -mtp= setting.  */

const char *
aarch64_output_load_tp (rtx dest)
{
  const char *tpidrs[] = {"tpidr_el0", "tpidr_el1", "tpidr_el2",
			  "tpidr_el3", "tpidrro_el0"};
  char buffer[64];
  snprintf (buffer, sizeof (buffer), "mrs\t%%0, %s",
	    tpidrs[aarch64_tpidr_register]);
  output_asm_insn (buffer, &dest);
  return "";
}
/* Set up the value of REG_ALLOC_ORDER from scratch.

   It was previously good practice to put call-clobbered registers ahead
   of call-preserved registers, but that isn't necessary these days.
   IRA's model of register save/restore costs is much more sophisticated
   than the model that a simple ordering could provide.  We leave
   HONOR_REG_ALLOC_ORDER undefined so that we can get the full benefit
   of IRA's model.

   However, it is still useful to list registers that are members of
   multiple classes after registers that are members of fewer classes.
   For example, we have:

   - FP_LO8_REGS: v0-v7
   - FP_LO_REGS: v0-v15
   - FP_REGS: v0-v31

   If, as a tie-breaker, we allocate FP_REGS in the order v0-v31,
   we run the risk of starving other (lower-priority) pseudos that
   require FP_LO8_REGS or FP_LO_REGS.  Allocating FP_LO_REGS in the
   order v0-v15 could similarly starve pseudos that require FP_LO8_REGS.
   Allocating downwards rather than upwards avoids this problem, at least
   in code that has reasonable register pressure.

   The situation for predicate registers is similar.  */

void
aarch64_adjust_reg_alloc_order ()
{
  for (int i = 0; i < FIRST_PSEUDO_REGISTER; ++i)
    if (IN_RANGE (i, V0_REGNUM, V31_REGNUM))
      reg_alloc_order[i] = V31_REGNUM - (i - V0_REGNUM);
    else if (IN_RANGE (i, P0_REGNUM, P15_REGNUM))
      reg_alloc_order[i] = P15_REGNUM - (i - P0_REGNUM);
    else
      reg_alloc_order[i] = i;
}
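
/* Illustrative result: within the FP/SIMD block the allocation order becomes
   v31, v30, ..., v1, v0, and within the SVE predicate block it becomes
   p15, p14, ..., p1, p0; every other register keeps its own number as its
   position in reg_alloc_order.  */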
/* Return true if the PARALLEL PAR can be used in a VEC_SELECT expression
   of vector mode MODE to select half the elements of that vector.
   Allow any combination of indices except duplicates (or out of range of
   the mode units).  */

bool
aarch64_parallel_select_half_p (machine_mode mode, rtx par)
{
  int nunits = XVECLEN (par, 0);
  if (!known_eq (GET_MODE_NUNITS (mode), nunits * 2))
    return false;
  int mode_nunits = nunits * 2;
  /* Put all the elements of PAR into a hash_set and use its
     uniqueness guarantees to check that we don't try to insert the same
     element twice.  */
  hash_set<rtx> parset;
  for (int i = 0; i < nunits; ++i)
    {
      rtx elt = XVECEXP (par, 0, i);
      if (!CONST_INT_P (elt)
	  || !IN_RANGE (INTVAL (elt), 0, mode_nunits - 1)
	  || parset.add (elt))
	return false;
    }
  return true;
}
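
/* Example (illustrative): for V4SImode (four elements), a PARALLEL of
   [0, 2] or [3, 1] is accepted, whereas [1, 1] is rejected for containing
   a duplicate and [0, 4] for being out of range.  */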
/* Return true if PAR1 and PAR2, two PARALLEL rtxes of CONST_INT values,
   contain any common elements.  */

bool
aarch64_pars_overlap_p (rtx par1, rtx par2)
{
  int len1 = XVECLEN (par1, 0);
  int len2 = XVECLEN (par2, 0);
  hash_set<rtx> parset;
  for (int i = 0; i < len1; ++i)
    parset.add (XVECEXP (par1, 0, i));
  for (int i = 0; i < len2; ++i)
    if (parset.contains (XVECEXP (par2, 0, i)))
      return true;
  return false;
}
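
/* Example (illustrative): PARALLELs [0, 2] and [1, 3] do not overlap,
   while [0, 2] and [2, 5] do.  */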
/* Implement OPTIMIZE_MODE_SWITCHING.  */

bool
aarch64_optimize_mode_switching (aarch64_mode_entity entity)
{
  bool have_sme_state = (aarch64_cfun_incoming_pstate_za () != 0
			 || (aarch64_cfun_has_new_state ("za")
			     && df_regs_ever_live_p (ZA_REGNUM))
			 || (aarch64_cfun_has_new_state ("zt0")
			     && df_regs_ever_live_p (ZT0_REGNUM)));

  if (have_sme_state && nonlocal_goto_handler_labels)
    {
      static bool reported;
      if (!reported)
	{
	  sorry ("non-local gotos in functions with SME state");
	  reported = true;
	}
    }

  switch (entity)
    {
    case aarch64_mode_entity::HAVE_ZA_SAVE_BUFFER:
    case aarch64_mode_entity::LOCAL_SME_STATE:
      return have_sme_state && !nonlocal_goto_handler_labels;
    }
  gcc_unreachable ();
}
/* Implement TARGET_MODE_EMIT for ZA_SAVE_BUFFER.  */

static void
aarch64_mode_emit_za_save_buffer (aarch64_tristate_mode mode,
				  aarch64_tristate_mode prev_mode)
{
  if (mode == aarch64_tristate_mode::YES)
    {
      gcc_assert (prev_mode == aarch64_tristate_mode::NO);
      aarch64_init_tpidr2_block ();
    }
  else
    gcc_unreachable ();
}
30000 /* Implement TARGET_MODE_EMIT for LOCAL_SME_STATE. */
30003 aarch64_mode_emit_local_sme_state (aarch64_local_sme_state mode
,
30004 aarch64_local_sme_state prev_mode
)
30006 /* Back-propagation should ensure that we're always starting from
30008 gcc_assert (prev_mode
!= aarch64_local_sme_state::ANY
);
30010 if (prev_mode
== aarch64_local_sme_state::INACTIVE_CALLER
)
30012 /* Commit any uncommitted lazy save. This leaves ZA either active
30013 and zero (lazy save case) or off (normal case).
30017 mrs <temp>, tpidr2_el0
30018 cbz <temp>, no_save
30019 bl __arm_tpidr2_save
30020 msr tpidr2_el0, xzr
30021 zero { za } // Only if ZA is live
30022 zero { zt0 } // Only if ZT0 is live
30024 auto tmp_reg
= gen_reg_rtx (DImode
);
30025 emit_insn (gen_aarch64_read_tpidr2 (tmp_reg
));
30026 auto label
= gen_label_rtx ();
30027 rtx branch
= aarch64_gen_compare_zero_and_branch (EQ
, tmp_reg
, label
);
30028 auto jump
= emit_jump_insn (branch
);
30029 JUMP_LABEL (jump
) = label
;
30030 emit_insn (gen_aarch64_tpidr2_save ());
30031 emit_insn (gen_aarch64_clear_tpidr2 ());
30032 if (mode
== aarch64_local_sme_state::ACTIVE_LIVE
30033 || mode
== aarch64_local_sme_state::ACTIVE_DEAD
)
30035 if (aarch64_cfun_has_state ("za"))
30036 emit_insn (gen_aarch64_initial_zero_za ());
30037 if (aarch64_cfun_has_state ("zt0"))
30038 emit_insn (gen_aarch64_sme_zero_zt0 ());
30040 emit_label (label
);
30043 if (mode
== aarch64_local_sme_state::ACTIVE_LIVE
30044 || mode
== aarch64_local_sme_state::ACTIVE_DEAD
)
30046 if (prev_mode
== aarch64_local_sme_state::INACTIVE_LOCAL
)
30048 /* Make ZA active after being inactive.
30050 First handle the case in which the lazy save we set up was
30051 committed by a callee. If the function's source-level ZA state
30052 is live then we must conditionally restore it from the lazy
30053 save buffer. Otherwise we can just force PSTATE.ZA to 1. */
30054 if (mode
== aarch64_local_sme_state::ACTIVE_LIVE
)
30055 emit_insn (gen_aarch64_restore_za (aarch64_get_tpidr2_ptr ()));
30057 emit_insn (gen_aarch64_smstart_za ());
30059 /* Now handle the case in which the lazy save was not committed.
30060 In that case, ZA still contains the current function's ZA state,
30061 and we just need to cancel the lazy save. */
30062 emit_insn (gen_aarch64_clear_tpidr2 ());
30064 /* Restore the ZT0 state, if we have some. */
30065 if (aarch64_cfun_has_state ("zt0"))
30066 aarch64_restore_zt0 (true);
30071 if (prev_mode
== aarch64_local_sme_state::SAVED_LOCAL
)
30073 /* Retrieve the current function's ZA state from the lazy save
30075 aarch64_restore_za (aarch64_get_tpidr2_ptr ());
30077 /* Restore the ZT0 state, if we have some. */
30078 if (aarch64_cfun_has_state ("zt0"))
30079 aarch64_restore_zt0 (true);
30083 if (prev_mode
== aarch64_local_sme_state::INACTIVE_CALLER
30084 || prev_mode
== aarch64_local_sme_state::OFF
)
30086 /* INACTIVE_CALLER means that we are enabling ZA for the first
30087 time in this function. The code above means that ZA is either
30088 active and zero (if we committed a lazy save) or off. Handle
30089 the latter case by forcing ZA on.
30091 OFF means that PSTATE.ZA is guaranteed to be 0. We just need
30094 Both cases leave ZA zeroed. */
30095 emit_insn (gen_aarch64_smstart_za ());
30097 /* Restore the ZT0 state, if we have some. */
30098 if (prev_mode
== aarch64_local_sme_state::OFF
30099 && aarch64_cfun_has_state ("zt0"))
30100 aarch64_restore_zt0 (true);
30104 if (prev_mode
== aarch64_local_sme_state::ACTIVE_DEAD
30105 || prev_mode
== aarch64_local_sme_state::ACTIVE_LIVE
)
30106 /* A simple change in liveness, such as in a CFG structure where
30107 ZA is only conditionally defined. No code is needed. */
30110 gcc_unreachable ();
30113 if (mode
== aarch64_local_sme_state::INACTIVE_LOCAL
)
30115 if (prev_mode
== aarch64_local_sme_state::ACTIVE_LIVE
30116 || prev_mode
== aarch64_local_sme_state::ACTIVE_DEAD
30117 || prev_mode
== aarch64_local_sme_state::INACTIVE_CALLER
)
30119 /* Save the ZT0 state, if we have some. */
30120 if (aarch64_cfun_has_state ("zt0"))
30121 aarch64_save_zt0 ();
30123 /* A transition from ACTIVE_LIVE to INACTIVE_LOCAL is the usual
30124 case of setting up a lazy save buffer before a call.
30125 A transition from INACTIVE_CALLER is similar, except that
30126 the contents of ZA are known to be zero.
30128 A transition from ACTIVE_DEAD means that ZA is live at the
30129 point of the transition, but is dead on at least one incoming
30130 edge. (That is, ZA is only conditionally initialized.)
30131 For efficiency, we want to set up a lazy save even for
30132 dead contents, since forcing ZA off would make later code
30133 restore ZA from the lazy save buffer. */
30134 emit_insn (gen_aarch64_write_tpidr2 (aarch64_get_tpidr2_ptr ()));
30138 if (prev_mode
== aarch64_local_sme_state::SAVED_LOCAL
30139 || prev_mode
== aarch64_local_sme_state::OFF
)
30140 /* We're simply discarding the information about which inactive
30144 gcc_unreachable ();
30147 if (mode
== aarch64_local_sme_state::INACTIVE_CALLER
30148 || mode
== aarch64_local_sme_state::OFF
)
30150 /* Save the ZT0 state, if we have some. */
30151 if ((prev_mode
== aarch64_local_sme_state::ACTIVE_LIVE
30152 || prev_mode
== aarch64_local_sme_state::ACTIVE_DEAD
)
30153 && mode
== aarch64_local_sme_state::OFF
30154 && aarch64_cfun_has_state ("zt0"))
30155 aarch64_save_zt0 ();
30157 /* The transition to INACTIVE_CALLER is used before returning from
30158 new("za") functions. Any state in ZA belongs to the current
30159 function rather than a caller, but that state is no longer
30160 needed. Clear any pending lazy save and turn ZA off.
30162 The transition to OFF is used before calling a private-ZA function.
30163 We committed any incoming lazy save above, so at this point any
30164 contents in ZA belong to the current function. */
30165 if (prev_mode
== aarch64_local_sme_state::INACTIVE_LOCAL
)
30166 emit_insn (gen_aarch64_clear_tpidr2 ());
30168 if (prev_mode
!= aarch64_local_sme_state::OFF
30169 && prev_mode
!= aarch64_local_sme_state::SAVED_LOCAL
)
30170 emit_insn (gen_aarch64_smstop_za ());
30175 if (mode
== aarch64_local_sme_state::SAVED_LOCAL
)
30177 /* This is a transition to an exception handler. */
30178 gcc_assert (prev_mode
== aarch64_local_sme_state::OFF
30179 || prev_mode
== aarch64_local_sme_state::INACTIVE_LOCAL
);
30183 gcc_unreachable ();
30186 /* Implement TARGET_MODE_EMIT. */
30189 aarch64_mode_emit (int entity
, int mode
, int prev_mode
, HARD_REG_SET live
)
30191 if (mode
== prev_mode
)
30195 switch (aarch64_mode_entity (entity
))
30197 case aarch64_mode_entity::HAVE_ZA_SAVE_BUFFER
:
30198 aarch64_mode_emit_za_save_buffer (aarch64_tristate_mode (mode
),
30199 aarch64_tristate_mode (prev_mode
));
30202 case aarch64_mode_entity::LOCAL_SME_STATE
:
30203 aarch64_mode_emit_local_sme_state (aarch64_local_sme_state (mode
),
30204 aarch64_local_sme_state (prev_mode
));
30207 rtx_insn
*seq
= get_insns ();
30210 /* Get the set of clobbered registers that are currently live. */
30211 HARD_REG_SET clobbers
= {};
30212 for (rtx_insn
*insn
= seq
; insn
; insn
= NEXT_INSN (insn
))
30214 if (!NONDEBUG_INSN_P (insn
))
30216 vec_rtx_properties properties
;
30217 properties
.add_insn (insn
, false);
30218 for (rtx_obj_reference ref
: properties
.refs ())
30219 if (ref
.is_write () && HARD_REGISTER_NUM_P (ref
.regno
))
30220 SET_HARD_REG_BIT (clobbers
, ref
.regno
);
30224 /* Emit instructions to save clobbered registers to pseudos. Queue
30225 instructions to restore the registers afterwards.
30227 This should only needed in rare situations. */
30228 auto_vec
<rtx
, 33> after
;
30229 for (unsigned int regno
= R0_REGNUM
; regno
< R30_REGNUM
; ++regno
)
30230 if (TEST_HARD_REG_BIT (clobbers
, regno
))
30232 rtx hard_reg
= gen_rtx_REG (DImode
, regno
);
30233 rtx pseudo_reg
= gen_reg_rtx (DImode
);
30234 emit_move_insn (pseudo_reg
, hard_reg
);
30235 after
.quick_push (gen_move_insn (hard_reg
, pseudo_reg
));
30237 if (TEST_HARD_REG_BIT (clobbers
, CC_REGNUM
))
30239 rtx pseudo_reg
= gen_reg_rtx (DImode
);
30240 emit_insn (gen_aarch64_save_nzcv (pseudo_reg
));
30241 after
.quick_push (gen_aarch64_restore_nzcv (pseudo_reg
));
30244 /* Emit the transition instructions themselves. */
30247 /* Restore the clobbered registers. */
30248 for (auto *insn
: after
)
/* Return true if INSN references the SME state represented by hard register
   REGNO.  */

static bool
aarch64_insn_references_sme_state_p (rtx_insn *insn, unsigned int regno)
{
  df_ref ref;
  FOR_EACH_INSN_DEF (ref, insn)
    if (!DF_REF_FLAGS_IS_SET (ref, DF_REF_MUST_CLOBBER)
	&& DF_REF_REGNO (ref) == regno)
      return true;
  FOR_EACH_INSN_USE (ref, insn)
    if (DF_REF_REGNO (ref) == regno)
      return true;
  return false;
}
30269 /* Implement TARGET_MODE_NEEDED for LOCAL_SME_STATE. */
30271 static aarch64_local_sme_state
30272 aarch64_mode_needed_local_sme_state (rtx_insn
*insn
, HARD_REG_SET live
)
30275 && find_reg_note (insn
, REG_EH_REGION
, NULL_RTX
))
30277 static bool reported
;
30280 sorry ("catching non-call exceptions in functions with SME state");
30283 /* Aim for graceful error recovery by picking the value that is
30284 least likely to generate an ICE. */
30285 return aarch64_local_sme_state::INACTIVE_LOCAL
;
30288 /* A non-local goto is equivalent to a return. We disallow non-local
30289 receivers in functions with SME state, so we know that the target
30290 expects ZA to be dormant or off. */
30292 && find_reg_note (insn
, REG_NON_LOCAL_GOTO
, NULL_RTX
))
30293 return aarch64_local_sme_state::INACTIVE_CALLER
;
30295 /* start_private_za_call and end_private_za_call bracket a sequence
30296 that calls a private-ZA function. Force ZA to be turned off if the
30297 function doesn't have any live ZA state, otherwise require ZA to be
30299 auto icode
= recog_memoized (insn
);
30300 if (icode
== CODE_FOR_aarch64_start_private_za_call
30301 || icode
== CODE_FOR_aarch64_end_private_za_call
)
30302 return (TEST_HARD_REG_BIT (live
, ZA_REGNUM
)
30303 ? aarch64_local_sme_state::INACTIVE_LOCAL
30304 : aarch64_local_sme_state::OFF
);
30306 /* Force ZA to contain the current function's ZA state if INSN wants
30307 to access it. Do the same for accesses to ZT0, since ZA and ZT0
30308 are both controlled by PSTATE.ZA. */
30309 if (aarch64_insn_references_sme_state_p (insn
, ZA_REGNUM
)
30310 || aarch64_insn_references_sme_state_p (insn
, ZT0_REGNUM
))
30311 return (TEST_HARD_REG_BIT (live
, ZA_REGNUM
)
30312 ? aarch64_local_sme_state::ACTIVE_LIVE
30313 : aarch64_local_sme_state::ACTIVE_DEAD
);
30315 return aarch64_local_sme_state::ANY
;
/* Implement TARGET_MODE_NEEDED for ZA_SAVE_BUFFER.  */

static aarch64_tristate_mode
aarch64_mode_needed_za_save_buffer (rtx_insn *insn, HARD_REG_SET live)
{
  /* We need to set up a lazy save buffer no later than the first
     transition to INACTIVE_LOCAL (which involves setting up a lazy save).  */
  if (aarch64_mode_needed_local_sme_state (insn, live)
      == aarch64_local_sme_state::INACTIVE_LOCAL)
    return aarch64_tristate_mode::YES;

  /* Also make sure that the lazy save buffer is set up before the first
     insn that throws internally.  The exception handler will sometimes
     need it.  */
  if (find_reg_note (insn, REG_EH_REGION, NULL_RTX))
    return aarch64_tristate_mode::YES;

  return aarch64_tristate_mode::MAYBE;
}
/* Implement TARGET_MODE_NEEDED.  */

static int
aarch64_mode_needed (int entity, rtx_insn *insn, HARD_REG_SET live)
{
  switch (aarch64_mode_entity (entity))
    {
    case aarch64_mode_entity::HAVE_ZA_SAVE_BUFFER:
      return int (aarch64_mode_needed_za_save_buffer (insn, live));

    case aarch64_mode_entity::LOCAL_SME_STATE:
      return int (aarch64_mode_needed_local_sme_state (insn, live));
    }
  gcc_unreachable ();
}
/* Implement TARGET_MODE_AFTER for LOCAL_SME_STATE.  */

static aarch64_local_sme_state
aarch64_mode_after_local_sme_state (aarch64_local_sme_state mode,
				    HARD_REG_SET live)
{
  /* Note places where ZA dies, so that we can try to avoid saving and
     restoring state that isn't needed.  */
  if (mode == aarch64_local_sme_state::ACTIVE_LIVE
      && !TEST_HARD_REG_BIT (live, ZA_REGNUM))
    return aarch64_local_sme_state::ACTIVE_DEAD;

  /* Note where ZA is born, e.g. when moving past an __arm_out("za")
     function.  */
  if (mode == aarch64_local_sme_state::ACTIVE_DEAD
      && TEST_HARD_REG_BIT (live, ZA_REGNUM))
    return aarch64_local_sme_state::ACTIVE_LIVE;

  return mode;
}
/* Implement TARGET_MODE_AFTER.  */

static int
aarch64_mode_after (int entity, int mode, rtx_insn *, HARD_REG_SET live)
{
  switch (aarch64_mode_entity (entity))
    {
    case aarch64_mode_entity::HAVE_ZA_SAVE_BUFFER:
      return mode;

    case aarch64_mode_entity::LOCAL_SME_STATE:
      return int (aarch64_mode_after_local_sme_state
		  (aarch64_local_sme_state (mode), live));
    }
  gcc_unreachable ();
}
/* Implement TARGET_MODE_CONFLUENCE for LOCAL_SME_STATE.  */

static aarch64_local_sme_state
aarch64_local_sme_confluence (aarch64_local_sme_state mode1,
			      aarch64_local_sme_state mode2)
{
  /* Perform a symmetrical check for two values.  */
  auto is_pair = [&](aarch64_local_sme_state val1,
		     aarch64_local_sme_state val2)
    {
      return ((mode1 == val1 && mode2 == val2)
	      || (mode1 == val2 && mode2 == val1));
    };

  /* INACTIVE_CALLER means ZA is off or it has dormant contents belonging
     to a caller.  OFF is one of the options.  */
  if (is_pair (aarch64_local_sme_state::INACTIVE_CALLER,
	       aarch64_local_sme_state::OFF))
    return aarch64_local_sme_state::INACTIVE_CALLER;

  /* Similarly for dormant contents belonging to the current function.  */
  if (is_pair (aarch64_local_sme_state::INACTIVE_LOCAL,
	       aarch64_local_sme_state::OFF))
    return aarch64_local_sme_state::INACTIVE_LOCAL;

  /* Treat a conditionally-initialized value as a fully-initialized value.  */
  if (is_pair (aarch64_local_sme_state::ACTIVE_LIVE,
	       aarch64_local_sme_state::ACTIVE_DEAD))
    return aarch64_local_sme_state::ACTIVE_LIVE;

  return aarch64_local_sme_state::ANY;
}
/* Implement TARGET_MODE_CONFLUENCE.  */

static int
aarch64_mode_confluence (int entity, int mode1, int mode2)
{
  gcc_assert (mode1 != mode2);
  switch (aarch64_mode_entity (entity))
    {
    case aarch64_mode_entity::HAVE_ZA_SAVE_BUFFER:
      return int (aarch64_tristate_mode::MAYBE);

    case aarch64_mode_entity::LOCAL_SME_STATE:
      return int (aarch64_local_sme_confluence
		  (aarch64_local_sme_state (mode1),
		   aarch64_local_sme_state (mode2)));
    }
  gcc_unreachable ();
}
/* Implement TARGET_MODE_BACKPROP for an entity that either stays
   NO throughout, or makes one transition from NO to YES.  */

static aarch64_tristate_mode
aarch64_one_shot_backprop (aarch64_tristate_mode mode1,
			   aarch64_tristate_mode mode2)
{
  /* Keep bringing the transition forward until it starts from NO.  */
  if (mode1 == aarch64_tristate_mode::MAYBE
      && mode2 == aarch64_tristate_mode::YES)
    return mode2;

  return aarch64_tristate_mode::MAYBE;
}
/* Implement TARGET_MODE_BACKPROP for LOCAL_SME_STATE.  */

static aarch64_local_sme_state
aarch64_local_sme_backprop (aarch64_local_sme_state mode1,
			    aarch64_local_sme_state mode2)
{
  /* We always need to know what the current state is when transitioning
     to a new state.  Force any location with indeterminate starting state
     to be active.  */
  if (mode1 == aarch64_local_sme_state::ANY)
    switch (mode2)
      {
      case aarch64_local_sme_state::INACTIVE_CALLER:
      case aarch64_local_sme_state::OFF:
      case aarch64_local_sme_state::ACTIVE_DEAD:
	/* The current function's ZA state is not live.  */
	return aarch64_local_sme_state::ACTIVE_DEAD;

      case aarch64_local_sme_state::INACTIVE_LOCAL:
      case aarch64_local_sme_state::ACTIVE_LIVE:
	/* The current function's ZA state is live.  */
	return aarch64_local_sme_state::ACTIVE_LIVE;

      case aarch64_local_sme_state::SAVED_LOCAL:
	/* This is a transition to an exception handler.  Since we don't
	   support non-call exceptions for SME functions, the source of
	   the transition must be known.  We'll assert later if that's
	   not the case.  */
	return aarch64_local_sme_state::ANY;

      case aarch64_local_sme_state::ANY:
	return aarch64_local_sme_state::ANY;
      }

  return aarch64_local_sme_state::ANY;
}
/* Implement TARGET_MODE_BACKPROP.  */

static int
aarch64_mode_backprop (int entity, int mode1, int mode2)
{
  switch (aarch64_mode_entity (entity))
    {
    case aarch64_mode_entity::HAVE_ZA_SAVE_BUFFER:
      return int (aarch64_one_shot_backprop (aarch64_tristate_mode (mode1),
					     aarch64_tristate_mode (mode2)));

    case aarch64_mode_entity::LOCAL_SME_STATE:
      return int (aarch64_local_sme_backprop
		  (aarch64_local_sme_state (mode1),
		   aarch64_local_sme_state (mode2)));
    }
  gcc_unreachable ();
}
/* Implement TARGET_MODE_ENTRY.  */

static int
aarch64_mode_entry (int entity)
{
  switch (aarch64_mode_entity (entity))
    {
    case aarch64_mode_entity::HAVE_ZA_SAVE_BUFFER:
      return int (aarch64_tristate_mode::NO);

    case aarch64_mode_entity::LOCAL_SME_STATE:
      return int (aarch64_cfun_shared_flags ("za") != 0
		  ? aarch64_local_sme_state::ACTIVE_LIVE
		  : aarch64_cfun_incoming_pstate_za () != 0
		  ? aarch64_local_sme_state::ACTIVE_DEAD
		  : aarch64_local_sme_state::INACTIVE_CALLER);
    }
  gcc_unreachable ();
}
/* Implement TARGET_MODE_EXIT.  */

static int
aarch64_mode_exit (int entity)
{
  switch (aarch64_mode_entity (entity))
    {
    case aarch64_mode_entity::HAVE_ZA_SAVE_BUFFER:
      return int (aarch64_tristate_mode::MAYBE);

    case aarch64_mode_entity::LOCAL_SME_STATE:
      return int (aarch64_cfun_shared_flags ("za") != 0
		  ? aarch64_local_sme_state::ACTIVE_LIVE
		  : aarch64_cfun_incoming_pstate_za () != 0
		  ? aarch64_local_sme_state::ACTIVE_DEAD
		  : aarch64_local_sme_state::INACTIVE_CALLER);
    }
  gcc_unreachable ();
}
/* Implement TARGET_MODE_EH_HANDLER.  */

static int
aarch64_mode_eh_handler (int entity)
{
  switch (aarch64_mode_entity (entity))
    {
    case aarch64_mode_entity::HAVE_ZA_SAVE_BUFFER:
      /* Require a lazy save buffer to be allocated before the first
	 insn that can throw.  */
      return int (aarch64_tristate_mode::YES);

    case aarch64_mode_entity::LOCAL_SME_STATE:
      return int (aarch64_local_sme_state::SAVED_LOCAL);
    }
  gcc_unreachable ();
}
/* Implement TARGET_MODE_PRIORITY.  */

static int
aarch64_mode_priority (int, int n)
{
  return n;
}
30581 /* Implement TARGET_MD_ASM_ADJUST. */
30584 aarch64_md_asm_adjust (vec
<rtx
> &outputs
, vec
<rtx
> &inputs
,
30585 vec
<machine_mode
> &input_modes
,
30586 vec
<const char *> &constraints
,
30587 vec
<rtx
> &uses
, vec
<rtx
> &clobbers
,
30588 HARD_REG_SET
&clobbered_regs
, location_t loc
)
30590 rtx_insn
*seq
= arm_md_asm_adjust (outputs
, inputs
, input_modes
, constraints
,
30591 uses
, clobbers
, clobbered_regs
, loc
);
30593 /* "za" in the clobber list of a function with ZA state is defined to
30594 mean that the asm can read from and write to ZA. We can model the
30595 read using a USE, but unfortunately, it's not possible to model the
30596 write directly. Use a separate insn to model the effect.
30598 We must ensure that ZA is active on entry, which is enforced by using
30599 SME_STATE_REGNUM. The asm must ensure that ZA is active on return.
30601 The same thing applies to ZT0. */
30603 for (unsigned int i
= clobbers
.length (); i
-- > 0; )
30605 rtx x
= clobbers
[i
];
30607 && (REGNO (x
) == ZA_REGNUM
|| REGNO (x
) == ZT0_REGNUM
))
30609 auto id
= cfun
->machine
->next_asm_update_za_id
++;
30614 rtx id_rtx
= gen_int_mode (id
, SImode
);
30615 emit_insn (REGNO (x
) == ZA_REGNUM
30616 ? gen_aarch64_asm_update_za (id_rtx
)
30617 : gen_aarch64_asm_update_zt0 (id_rtx
));
30618 seq
= get_insns ();
30621 auto mode
= REGNO (x
) == ZA_REGNUM
? VNx16QImode
: V8DImode
;
30622 uses
.safe_push (gen_rtx_REG (mode
, REGNO (x
)));
30623 uses
.safe_push (gen_rtx_REG (DImode
, SME_STATE_REGNUM
));
30625 clobbers
.ordered_remove (i
);
30626 CLEAR_HARD_REG_BIT (clobbered_regs
, REGNO (x
));
30632 /* BB is the target of an exception or nonlocal goto edge, which means
30633 that PSTATE.SM is known to be 0 on entry. Put it into the state that
30634 the current function requires. */
30637 aarch64_switch_pstate_sm_for_landing_pad (basic_block bb
)
30639 if (TARGET_NON_STREAMING
)
30643 rtx_insn
*guard_label
= nullptr;
30644 if (TARGET_STREAMING_COMPATIBLE
)
30645 guard_label
= aarch64_guard_switch_pstate_sm (IP0_REGNUM
,
30646 AARCH64_ISA_MODE_SM_OFF
);
30647 aarch64_sme_mode_switch_regs args_switch
;
30648 args_switch
.add_call_preserved_regs (df_get_live_in (bb
));
30649 args_switch
.emit_prologue ();
30650 aarch64_switch_pstate_sm (AARCH64_ISA_MODE_SM_OFF
, AARCH64_ISA_MODE_SM_ON
);
30651 args_switch
.emit_epilogue ();
30653 emit_label (guard_label
);
30654 auto seq
= get_insns ();
30657 emit_insn_after (seq
, bb_note (bb
));
30661 /* JUMP is a nonlocal goto. Its target requires PSTATE.SM to be 0 on entry,
30662 so arrange to make it so. */
30665 aarch64_switch_pstate_sm_for_jump (rtx_insn
*jump
)
30667 if (TARGET_NON_STREAMING
)
30671 rtx_insn
*guard_label
= nullptr;
30672 if (TARGET_STREAMING_COMPATIBLE
)
30673 guard_label
= aarch64_guard_switch_pstate_sm (IP0_REGNUM
,
30674 AARCH64_ISA_MODE_SM_OFF
);
30675 aarch64_switch_pstate_sm (AARCH64_ISA_MODE_SM_ON
, AARCH64_ISA_MODE_SM_OFF
);
30677 emit_label (guard_label
);
30678 auto seq
= get_insns ();
30681 emit_insn_before (seq
, jump
);
30685 /* If CALL involves a change in PSTATE.SM, emit the instructions needed
30686 to switch to the new mode and the instructions needed to restore the
30687 original mode. Return true if something changed. */
30689 aarch64_switch_pstate_sm_for_call (rtx_call_insn
*call
)
30691 /* Mode switches for sibling calls are handled via the epilogue. */
30692 if (SIBLING_CALL_P (call
))
30695 auto callee_isa_mode
= aarch64_insn_callee_isa_mode (call
);
30696 if (!aarch64_call_switches_pstate_sm (callee_isa_mode
))
30699 /* Switch mode before the call, preserving any argument registers
30700 across the switch. */
30702 rtx_insn
*args_guard_label
= nullptr;
30703 if (TARGET_STREAMING_COMPATIBLE
)
30704 args_guard_label
= aarch64_guard_switch_pstate_sm (IP0_REGNUM
,
30706 aarch64_sme_mode_switch_regs args_switch
;
30707 args_switch
.add_call_args (call
);
30708 args_switch
.emit_prologue ();
30709 aarch64_switch_pstate_sm (AARCH64_ISA_MODE
, callee_isa_mode
);
30710 args_switch
.emit_epilogue ();
30711 if (args_guard_label
)
30712 emit_label (args_guard_label
);
30713 auto args_seq
= get_insns ();
30715 emit_insn_before (args_seq
, call
);
30717 if (find_reg_note (call
, REG_NORETURN
, NULL_RTX
))
30720 /* Switch mode after the call, preserving any return registers across
30723 rtx_insn
*return_guard_label
= nullptr;
30724 if (TARGET_STREAMING_COMPATIBLE
)
30725 return_guard_label
= aarch64_guard_switch_pstate_sm (IP0_REGNUM
,
30727 aarch64_sme_mode_switch_regs return_switch
;
30728 return_switch
.add_call_result (call
);
30729 return_switch
.emit_prologue ();
30730 aarch64_switch_pstate_sm (callee_isa_mode
, AARCH64_ISA_MODE
);
30731 return_switch
.emit_epilogue ();
30732 if (return_guard_label
)
30733 emit_label (return_guard_label
);
30734 auto result_seq
= get_insns ();
30736 emit_insn_after (result_seq
, call
);
30742 const pass_data pass_data_switch_pstate_sm
=
30745 "smstarts", // name
30746 OPTGROUP_NONE
, // optinfo_flags
30748 0, // properties_required
30749 0, // properties_provided
30750 0, // properties_destroyed
30751 0, // todo_flags_start
30752 TODO_df_finish
, // todo_flags_finish
30755 class pass_switch_pstate_sm
: public rtl_opt_pass
30758 pass_switch_pstate_sm (gcc::context
*ctxt
)
30759 : rtl_opt_pass (pass_data_switch_pstate_sm
, ctxt
)
30762 // opt_pass methods:
30763 bool gate (function
*) override final
;
30764 unsigned int execute (function
*) override final
;
30768 pass_switch_pstate_sm::gate (function
*fn
)
30770 return (aarch64_fndecl_pstate_sm (fn
->decl
) != AARCH64_ISA_MODE_SM_OFF
30771 || cfun
->machine
->call_switches_pstate_sm
);
30774 /* Emit any instructions needed to switch PSTATE.SM. */
30776 pass_switch_pstate_sm::execute (function
*fn
)
30780 auto_sbitmap
blocks (last_basic_block_for_fn (cfun
));
30781 bitmap_clear (blocks
);
30782 FOR_EACH_BB_FN (bb
, fn
)
30784 if (has_abnormal_call_or_eh_pred_edge_p (bb
)
30785 && aarch64_switch_pstate_sm_for_landing_pad (bb
))
30786 bitmap_set_bit (blocks
, bb
->index
);
30788 if (cfun
->machine
->call_switches_pstate_sm
)
30791 FOR_BB_INSNS (bb
, insn
)
30792 if (auto *call
= dyn_cast
<rtx_call_insn
*> (insn
))
30793 if (aarch64_switch_pstate_sm_for_call (call
))
30794 bitmap_set_bit (blocks
, bb
->index
);
30797 auto end
= BB_END (bb
);
30799 && find_reg_note (end
, REG_NON_LOCAL_GOTO
, NULL_RTX
)
30800 && aarch64_switch_pstate_sm_for_jump (end
))
30801 bitmap_set_bit (blocks
, bb
->index
);
30803 find_many_sub_basic_blocks (blocks
);
30804 clear_aux_for_blocks ();
30811 make_pass_switch_pstate_sm (gcc::context
*ctxt
)
30813 return new pass_switch_pstate_sm (ctxt
);
30816 /* Parse an implementation-defined system register name of
30817 the form S[0-3]_[0-7]_C[0-15]_C[0-15]_[0-7].
30818 Return true if name matched against above pattern, false
30821 aarch64_is_implem_def_reg (const char *regname
)
30824 unsigned name_len
= strlen (regname
);
30825 if (name_len
< 12 || name_len
> 14)
30828 auto cterm_valid_p
= [&]()
30830 bool leading_zero_p
= false;
30834 if (regname
[pos
] != 'c')
30837 while (regname
[pos
] != '_')
30839 if (leading_zero_p
)
30841 if (i
== 0 && regname
[pos
] == '0')
30842 leading_zero_p
= true;
30845 if (!ISDIGIT (regname
[pos
]))
30847 n
[i
++] = regname
[pos
++];
30854 if (regname
[pos
] != 's')
30857 if (regname
[pos
] < '0' || regname
[pos
] > '3')
30860 if (regname
[pos
++] != '_')
30862 if (regname
[pos
] < '0' || regname
[pos
] > '7')
30865 if (regname
[pos
++] != '_')
30867 if (!cterm_valid_p ())
30869 if (regname
[pos
++] != '_')
30871 if (!cterm_valid_p ())
30873 if (regname
[pos
++] != '_')
30875 if (regname
[pos
] < '0' || regname
[pos
] > '7')
/* Return true if REGNAME matches either a known permitted system
   register name, or a generic sysreg specification.  For use in
   back-end predicate `aarch64_sysreg_string'.  */
bool
aarch64_valid_sysreg_name_p (const char *regname)
{
  const sysreg_t *sysreg = aarch64_lookup_sysreg_map (regname);
  if (sysreg == NULL)
    return aarch64_is_implem_def_reg (regname);
  if (sysreg->arch_reqs)
    return bool (aarch64_isa_flags & sysreg->arch_reqs);
  return true;
}
/* Return the generic sysreg specification for a valid system register
   name, otherwise NULL.  WRITE_P is true iff the register is being
   written to.  IS128OP indicates the requested system register should
   be checked for a 128-bit implementation.  */
const char *
aarch64_retrieve_sysreg (const char *regname, bool write_p, bool is128op)
{
  const sysreg_t *sysreg = aarch64_lookup_sysreg_map (regname);
  if (sysreg == NULL)
    {
      if (aarch64_is_implem_def_reg (regname))
	return regname;
      else
	return NULL;
    }
  if (is128op && !(sysreg->properties & F_REG_128))
    return NULL;
  if ((write_p && (sysreg->properties & F_REG_READ))
      || (!write_p && (sysreg->properties & F_REG_WRITE)))
    return NULL;
  if ((~aarch64_isa_flags & sysreg->arch_reqs) != 0)
    return NULL;
  return sysreg->encoding;
}
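
/* Examples (illustrative): aarch64_valid_sysreg_name_p is expected to accept
   a named register such as "tpidr_el0" (subject to the architecture checks
   above) and an implementation-defined specification such as "s2_1_c0_c3_1",
   but to reject "s2_1_c16_c3_1", since the CRn/CRm fields are limited to
   c0-c15.  */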
30919 /* Report that LOCATION has a call to FNDECL in which argument ARGNO
30920 was not an integer constant expression. ARGNO counts from zero. */
30922 aarch64::report_non_ice (location_t location
, tree fndecl
, unsigned int argno
)
30924 error_at (location
, "argument %d of %qE must be an integer constant"
30925 " expression", argno
+ 1, fndecl
);
30928 /* Report that LOCATION has a call to FNDECL in which argument ARGNO has
30929 the value ACTUAL, whereas the function requires a value in the range
30930 [MIN, MAX]. ARGNO counts from zero. */
30932 aarch64::report_out_of_range (location_t location
, tree fndecl
,
30933 unsigned int argno
, HOST_WIDE_INT actual
,
30934 HOST_WIDE_INT min
, HOST_WIDE_INT max
)
30937 error_at (location
, "passing %wd to argument %d of %qE, which expects"
30938 " the value %wd", actual
, argno
+ 1, fndecl
, min
);
30940 error_at (location
, "passing %wd to argument %d of %qE, which expects"
30941 " a value in the range [%wd, %wd]", actual
, argno
+ 1, fndecl
,
/* Report that LOCATION has a call to FNDECL in which argument ARGNO has
   the value ACTUAL, whereas the function requires either VALUE0 or
   VALUE1.  ARGNO counts from zero.  */
void
aarch64::report_neither_nor (location_t location, tree fndecl,
			     unsigned int argno, HOST_WIDE_INT actual,
			     HOST_WIDE_INT value0, HOST_WIDE_INT value1)
{
  error_at (location, "passing %wd to argument %d of %qE, which expects"
	    " either %wd or %wd", actual, argno + 1, fndecl, value0, value1);
}
/* Report that LOCATION has a call to FNDECL in which argument ARGNO has
   the value ACTUAL, whereas the function requires one of VALUE0..3.
   ARGNO counts from zero.  */
void
aarch64::report_not_one_of (location_t location, tree fndecl,
			    unsigned int argno, HOST_WIDE_INT actual,
			    HOST_WIDE_INT value0, HOST_WIDE_INT value1,
			    HOST_WIDE_INT value2, HOST_WIDE_INT value3)
{
  error_at (location, "passing %wd to argument %d of %qE, which expects"
	    " %wd, %wd, %wd or %wd", actual, argno + 1, fndecl, value0, value1,
	    value2, value3);
}
/* Report that LOCATION has a call to FNDECL in which argument ARGNO has
   the value ACTUAL, whereas the function requires a valid value of
   enum type ENUMTYPE.  ARGNO counts from zero.  */
void
aarch64::report_not_enum (location_t location, tree fndecl, unsigned int argno,
			  HOST_WIDE_INT actual, tree enumtype)
{
  error_at (location, "passing %wd to argument %d of %qE, which expects"
	    " a valid %qT value", actual, argno + 1, fndecl, enumtype);
}
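/* Illustrative sketch (not part of the build): the pattern in which the
   reporting helpers above are typically used when checking an integer
   constant builtin argument.  The function name and parameters here are
   hypothetical placeholders, not an existing GCC interface.  */
#if 0
static bool
example_check_integer_argument (location_t loc, tree fndecl, tree arg,
				unsigned int argno, HOST_WIDE_INT min,
				HOST_WIDE_INT max)
{
  if (TREE_CODE (arg) != INTEGER_CST)
    {
      aarch64::report_non_ice (loc, fndecl, argno);
      return false;
    }
  HOST_WIDE_INT actual = tree_to_shwi (arg);
  if (actual < min || actual > max)
    {
      aarch64::report_out_of_range (loc, fndecl, argno, actual, min, max);
      return false;
    }
  return true;
}
#endif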
/* Generate assembly to calculate CRC
   using carry-less multiplication instruction.
   OPERANDS[1] is input CRC,
   OPERANDS[2] is data (message),
   OPERANDS[3] is the polynomial without the leading 1.  */

void
aarch64_expand_crc_using_pmull (scalar_mode crc_mode,
				scalar_mode data_mode,
				rtx *operands)
{
  /* Check and keep arguments.  */
  gcc_assert (!CONST_INT_P (operands[0]));
  gcc_assert (CONST_INT_P (operands[3]));
  rtx crc = operands[1];
  rtx data = operands[2];
  rtx polynomial = operands[3];

  unsigned HOST_WIDE_INT crc_size = GET_MODE_BITSIZE (crc_mode);
  unsigned HOST_WIDE_INT data_size = GET_MODE_BITSIZE (data_mode);
  gcc_assert (crc_size <= 32);
  gcc_assert (data_size <= crc_size);

  /* Calculate the quotient.  */
  unsigned HOST_WIDE_INT
    q = gf2n_poly_long_div_quotient (UINTVAL (polynomial), crc_size);
  /* CRC calculation's main part.  */
  if (crc_size > data_size)
    crc = expand_shift (RSHIFT_EXPR, DImode, crc, crc_size - data_size,
			NULL_RTX, 1);

  rtx t0 = force_reg (DImode, gen_int_mode (q, DImode));
  polynomial = simplify_gen_unary (ZERO_EXTEND, DImode, polynomial,
				   GET_MODE (polynomial));
  rtx t1 = force_reg (DImode, polynomial);

  rtx a0 = expand_binop (DImode, xor_optab, crc, data, NULL_RTX, 1,
			 OPTAB_WIDEN);

  rtx pmull_res = gen_reg_rtx (TImode);
  emit_insn (gen_aarch64_crypto_pmulldi (pmull_res, a0, t0));
  a0 = gen_lowpart (DImode, pmull_res);

  a0 = expand_shift (RSHIFT_EXPR, DImode, a0, crc_size, NULL_RTX, 1);

  emit_insn (gen_aarch64_crypto_pmulldi (pmull_res, a0, t1));
  a0 = gen_lowpart (DImode, pmull_res);

  if (crc_size > data_size)
    {
      rtx crc_part = expand_shift (LSHIFT_EXPR, DImode, operands[1], data_size,
				   NULL_RTX, 0);
      a0 = expand_binop (DImode, xor_optab, a0, crc_part, NULL_RTX, 1,
			 OPTAB_WIDEN);
    }

  aarch64_emit_move (operands[0], gen_lowpart (crc_mode, a0));
}
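/* Illustrative sketch (not part of the build): a plain C, bit-at-a-time
   reference for the MSB-first CRC update that the expansion above is
   intended to compute with two PMULLs.  The function name is ours and the
   code assumes, as the expander does, that DATA_BITS <= CRC_BITS <= 32 and
   that POLY is given without its leading 1.  */
#if 0
static unsigned int
reference_crc_update (unsigned int crc, unsigned int data, unsigned int poly,
		      unsigned int crc_bits, unsigned int data_bits)
{
  unsigned int mask = crc_bits == 32 ? ~0u : (1u << crc_bits) - 1;
  /* Fold the data chunk into the top DATA_BITS of the CRC register.  */
  crc ^= data << (crc_bits - data_bits);
  /* Clock the register once per data bit, reducing modulo the polynomial
     whenever the bit shifted out is set.  */
  for (unsigned int i = 0; i < data_bits; i++)
    crc = (crc & (1u << (crc_bits - 1))) ? (crc << 1) ^ poly : crc << 1;
  return crc & mask;
}
#endif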
/* Generate assembly to calculate reversed CRC
   using carry-less multiplication instruction.
   OPERANDS[1] is input CRC,
   OPERANDS[2] is data,
   OPERANDS[3] is the polynomial without the leading 1.  */

void
aarch64_expand_reversed_crc_using_pmull (scalar_mode crc_mode,
					 scalar_mode data_mode,
					 rtx *operands)
{
  /* Check and keep arguments.  */
  gcc_assert (!CONST_INT_P (operands[0]));
  gcc_assert (CONST_INT_P (operands[3]));
  rtx crc = operands[1];
  rtx data = operands[2];
  rtx polynomial = operands[3];

  unsigned HOST_WIDE_INT crc_size = GET_MODE_BITSIZE (crc_mode);
  unsigned HOST_WIDE_INT data_size = GET_MODE_BITSIZE (data_mode);
  gcc_assert (crc_size <= 32);
  gcc_assert (data_size <= crc_size);

  /* Calculate the quotient.  */
  unsigned HOST_WIDE_INT
    q = gf2n_poly_long_div_quotient (UINTVAL (polynomial), crc_size);
  /* Reflect the calculated quotient.  */
  q = reflect_hwi (q, crc_size + 1);
  rtx t0 = force_reg (DImode, gen_int_mode (q, DImode));

  /* Reflect the polynomial.  */
  unsigned HOST_WIDE_INT ref_polynomial = reflect_hwi (UINTVAL (polynomial),
						       crc_size);
  /* An unshifted multiplier would require the final result to be extracted
     using a shift right by DATA_SIZE - 1 bits.  Shift the multiplier left
     so that the shift right can be by CRC_SIZE bits instead.  */
  ref_polynomial <<= crc_size - data_size + 1;
  rtx t1 = force_reg (DImode, gen_int_mode (ref_polynomial, DImode));

  /* CRC calculation's main part.  */
  rtx a0 = expand_binop (DImode, xor_optab, crc, data, NULL_RTX, 1,
			 OPTAB_WIDEN);

  /* Perform carry-less multiplication and get low part.  */
  rtx pmull_res = gen_reg_rtx (TImode);
  emit_insn (gen_aarch64_crypto_pmulldi (pmull_res, a0, t0));
  a0 = gen_lowpart (DImode, pmull_res);

  a0 = expand_binop (DImode, and_optab, a0,
		     gen_int_mode (GET_MODE_MASK (data_mode), DImode),
		     NULL_RTX, 1, OPTAB_WIDEN);

  /* Perform carry-less multiplication.  */
  emit_insn (gen_aarch64_crypto_pmulldi (pmull_res, a0, t1));

  /* Perform a shift right by CRC_SIZE as an extraction of lane 1.  */
  machine_mode crc_vmode = aarch64_v128_mode (crc_mode).require ();
  a0 = (crc_size > data_size ? gen_reg_rtx (crc_mode) : operands[0]);
  emit_insn (gen_aarch64_get_lane (crc_vmode, a0,
				   gen_lowpart (crc_vmode, pmull_res),
				   aarch64_endian_lane_rtx (crc_vmode, 1)));

  if (crc_size > data_size)
    {
      rtx crc_part = expand_shift (RSHIFT_EXPR, crc_mode, crc, data_size,
				   NULL_RTX, 1);
      a0 = expand_binop (crc_mode, xor_optab, a0, crc_part, operands[0], 1,
			 OPTAB_WIDEN);
      aarch64_emit_move (operands[0], a0);
    }
}
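/* Illustrative sketch (not part of the build): a plain C, bit-at-a-time
   reference for the reflected (LSB-first) CRC update that the expansion
   above targets.  The helper name is ours; it reflects the polynomial
   directly rather than relying on reflect_hwi, and assumes the same
   DATA_BITS <= CRC_BITS <= 32 constraints as the expander.  */
#if 0
static unsigned int
reference_crc_update_reflected (unsigned int crc, unsigned int data,
				unsigned int poly, unsigned int crc_bits,
				unsigned int data_bits)
{
  unsigned int mask = crc_bits == 32 ? ~0u : (1u << crc_bits) - 1;
  /* Bit-reverse the CRC_BITS-bit polynomial (given without its leading 1).  */
  unsigned int ref_poly = 0;
  for (unsigned int i = 0; i < crc_bits; i++)
    if (poly & (1u << i))
      ref_poly |= 1u << (crc_bits - 1 - i);

  /* The data chunk occupies the low DATA_BITS of the register.  */
  crc ^= data;
  for (unsigned int i = 0; i < data_bits; i++)
    crc = (crc & 1) ? (crc >> 1) ^ ref_poly : crc >> 1;
  return crc & mask;
}
#endif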
/* Target-specific selftests.  */

#if CHECKING_P

namespace selftest {

/* Selftest for the RTL loader.
   Verify that the RTL loader copes with a dump from
   print_rtx_function.  This is essentially just a test that class
   function_reader can handle a real dump, but it also verifies
   that lookup_reg_by_dump_name correctly handles hard regs.
   The presence of hard reg names in the dump means that the test is
   target-specific, hence it is in this file.  */

static void
aarch64_test_loading_full_dump ()
{
  rtl_dump_test t (SELFTEST_LOCATION, locate_file ("aarch64/times-two.rtl"));

  ASSERT_STREQ ("times_two", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));

  rtx_insn *insn_1 = get_insn_by_uid (1);
  ASSERT_EQ (NOTE, GET_CODE (insn_1));

  rtx_insn *insn_15 = get_insn_by_uid (15);
  ASSERT_EQ (INSN, GET_CODE (insn_15));
  ASSERT_EQ (USE, GET_CODE (PATTERN (insn_15)));

  /* Verify crtl->return_rtx.  */
  ASSERT_EQ (REG, GET_CODE (crtl->return_rtx));
  ASSERT_EQ (0, REGNO (crtl->return_rtx));
  ASSERT_EQ (SImode, GET_MODE (crtl->return_rtx));
}
/* Test the fractional_cost class.  */

static void
aarch64_test_fractional_cost ()
{
  using cf = fractional_cost;

  ASSERT_EQ (cf (0, 20), 0);

  ASSERT_EQ (cf (4, 2), 2);
  ASSERT_EQ (3, cf (9, 3));

  ASSERT_NE (cf (5, 2), 2);
  ASSERT_NE (3, cf (8, 3));

  ASSERT_EQ (cf (7, 11) + cf (15, 11), 2);
  ASSERT_EQ (cf (2, 3) + cf (3, 5), cf (19, 15));
  ASSERT_EQ (cf (2, 3) + cf (1, 6) + cf (1, 6), 1);

  ASSERT_EQ (cf (14, 15) - cf (4, 15), cf (2, 3));
  ASSERT_EQ (cf (1, 4) - cf (1, 2), 0);
  ASSERT_EQ (cf (3, 5) - cf (1, 10), cf (1, 2));
  ASSERT_EQ (cf (11, 3) - 3, cf (2, 3));
  ASSERT_EQ (3 - cf (7, 3), cf (2, 3));
  ASSERT_EQ (3 - cf (10, 3), 0);

  ASSERT_EQ (cf (2, 3) * 5, cf (10, 3));
  ASSERT_EQ (14 * cf (11, 21), cf (22, 3));

  ASSERT_TRUE (cf (4, 15) <= cf (5, 15));
  ASSERT_TRUE (cf (5, 15) <= cf (5, 15));
  ASSERT_FALSE (cf (6, 15) <= cf (5, 15));
  ASSERT_TRUE (cf (1, 3) <= cf (2, 5));
  ASSERT_TRUE (cf (1, 12) <= cf (1, 6));
  ASSERT_TRUE (cf (5, 3) <= cf (5, 3));
  ASSERT_TRUE (cf (239, 240) <= 1);
  ASSERT_TRUE (cf (240, 240) <= 1);
  ASSERT_FALSE (cf (241, 240) <= 1);
  ASSERT_FALSE (2 <= cf (207, 104));
  ASSERT_TRUE (2 <= cf (208, 104));
  ASSERT_TRUE (2 <= cf (209, 104));

  ASSERT_TRUE (cf (4, 15) < cf (5, 15));
  ASSERT_FALSE (cf (5, 15) < cf (5, 15));
  ASSERT_FALSE (cf (6, 15) < cf (5, 15));
  ASSERT_TRUE (cf (1, 3) < cf (2, 5));
  ASSERT_TRUE (cf (1, 12) < cf (1, 6));
  ASSERT_FALSE (cf (5, 3) < cf (5, 3));
  ASSERT_TRUE (cf (239, 240) < 1);
  ASSERT_FALSE (cf (240, 240) < 1);
  ASSERT_FALSE (cf (241, 240) < 1);
  ASSERT_FALSE (2 < cf (207, 104));
  ASSERT_FALSE (2 < cf (208, 104));
  ASSERT_TRUE (2 < cf (209, 104));

  ASSERT_FALSE (cf (4, 15) >= cf (5, 15));
  ASSERT_TRUE (cf (5, 15) >= cf (5, 15));
  ASSERT_TRUE (cf (6, 15) >= cf (5, 15));
  ASSERT_FALSE (cf (1, 3) >= cf (2, 5));
  ASSERT_FALSE (cf (1, 12) >= cf (1, 6));
  ASSERT_TRUE (cf (5, 3) >= cf (5, 3));
  ASSERT_FALSE (cf (239, 240) >= 1);
  ASSERT_TRUE (cf (240, 240) >= 1);
  ASSERT_TRUE (cf (241, 240) >= 1);
  ASSERT_TRUE (2 >= cf (207, 104));
  ASSERT_TRUE (2 >= cf (208, 104));
  ASSERT_FALSE (2 >= cf (209, 104));

  ASSERT_FALSE (cf (4, 15) > cf (5, 15));
  ASSERT_FALSE (cf (5, 15) > cf (5, 15));
  ASSERT_TRUE (cf (6, 15) > cf (5, 15));
  ASSERT_FALSE (cf (1, 3) > cf (2, 5));
  ASSERT_FALSE (cf (1, 12) > cf (1, 6));
  ASSERT_FALSE (cf (5, 3) > cf (5, 3));
  ASSERT_FALSE (cf (239, 240) > 1);
  ASSERT_FALSE (cf (240, 240) > 1);
  ASSERT_TRUE (cf (241, 240) > 1);
  ASSERT_TRUE (2 > cf (207, 104));
  ASSERT_FALSE (2 > cf (208, 104));
  ASSERT_FALSE (2 > cf (209, 104));

  ASSERT_EQ (cf (1, 2).ceil (), 1);
  ASSERT_EQ (cf (11, 7).ceil (), 2);
  ASSERT_EQ (cf (20, 1).ceil (), 20);
  ASSERT_EQ ((cf (0xfffffffd) + 1).ceil (), 0xfffffffe);
  ASSERT_EQ ((cf (0xfffffffd) + 2).ceil (), 0xffffffff);
  ASSERT_EQ ((cf (0xfffffffd) + 3).ceil (), 0xffffffff);
  ASSERT_EQ ((cf (0x7fffffff) * 2).ceil (), 0xfffffffe);
  ASSERT_EQ ((cf (0x80000000) * 2).ceil (), 0xffffffff);

  ASSERT_EQ (cf (1, 2).as_double (), 0.5);
}
/* Calculate whether our system register data, as imported from
   `aarch64-sys-reg.def', has any duplicate entries.  */
static void
aarch64_test_sysreg_encoding_clashes (void)
{
  using dup_instances_t = hash_map<nofree_string_hash,
				   std::vector<const sysreg_t *>>;

  dup_instances_t duplicate_instances;

  /* Every time an encoding is established to come up more than once
     we add it to a "clash-analysis queue", which is then used to extract
     necessary information from our hash map when establishing whether
     repeated encodings are valid.  */

  /* 1) Collect recurrence information.  */
  for (unsigned i = 0; i < ARRAY_SIZE (aarch64_sysregs); i++)
    {
      const sysreg_t *reg = aarch64_sysregs + i;

      std::vector<const sysreg_t *> *tmp
	= &duplicate_instances.get_or_insert (reg->encoding);

      tmp->push_back (reg);
    }

  /* 2) Carry out analysis on collected data.  */
  for (auto instance : duplicate_instances)
    {
      unsigned nrep = instance.second.size ();
      if (nrep > 1)
	for (unsigned i = 0; i < nrep; i++)
	  for (unsigned j = i + 1; j < nrep; j++)
	    {
	      const sysreg_t *a = instance.second[i];
	      const sysreg_t *b = instance.second[j];
	      ASSERT_TRUE ((a->properties != b->properties)
			   || (a->arch_reqs != b->arch_reqs));
	    }
    }
}
/* Run all target-specific selftests.  */
static void
aarch64_run_selftests (void)
{
  aarch64_test_loading_full_dump ();
  aarch64_test_fractional_cost ();
  aarch64_test_sysreg_encoding_clashes ();
}

} // namespace selftest

#endif /* #if CHECKING_P */
#undef TARGET_STACK_PROTECT_GUARD
#define TARGET_STACK_PROTECT_GUARD aarch64_stack_protect_guard

#undef TARGET_ADDRESS_COST
#define TARGET_ADDRESS_COST aarch64_address_cost

/* This hook will determine whether unnamed bitfields affect the alignment
   of the containing structure.  The hook returns true if the structure
   should inherit the alignment requirements of an unnamed bitfield's
   type.  */
#undef TARGET_ALIGN_ANON_BITFIELD
#define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true

#undef TARGET_ASM_ALIGNED_DI_OP
#define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"

#undef TARGET_ASM_ALIGNED_HI_OP
#define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"

#undef TARGET_ASM_ALIGNED_SI_OP
#define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"

#undef TARGET_ASM_UNALIGNED_HI_OP
#define TARGET_ASM_UNALIGNED_HI_OP TARGET_ASM_ALIGNED_HI_OP
#undef TARGET_ASM_UNALIGNED_SI_OP
#define TARGET_ASM_UNALIGNED_SI_OP TARGET_ASM_ALIGNED_SI_OP
#undef TARGET_ASM_UNALIGNED_DI_OP
#define TARGET_ASM_UNALIGNED_DI_OP TARGET_ASM_ALIGNED_DI_OP

#undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
#define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
  hook_bool_const_tree_hwi_hwi_const_tree_true

#undef TARGET_ASM_FILE_START
#define TARGET_ASM_FILE_START aarch64_start_file

#undef TARGET_ASM_OUTPUT_MI_THUNK
#define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk

#undef TARGET_ASM_SELECT_RTX_SECTION
#define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section

#undef TARGET_ASM_TRAMPOLINE_TEMPLATE
#define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template

#undef TARGET_ASM_PRINT_PATCHABLE_FUNCTION_ENTRY
#define TARGET_ASM_PRINT_PATCHABLE_FUNCTION_ENTRY aarch64_print_patchable_function_entry

#undef TARGET_BUILD_BUILTIN_VA_LIST
#define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list

#undef TARGET_CALLEE_COPIES
#define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_arg_info_false

#undef TARGET_FRAME_POINTER_REQUIRED
#define TARGET_FRAME_POINTER_REQUIRED aarch64_frame_pointer_required

#undef TARGET_CAN_ELIMINATE
#define TARGET_CAN_ELIMINATE aarch64_can_eliminate

#undef TARGET_FUNCTION_ATTRIBUTE_INLINABLE_P
#define TARGET_FUNCTION_ATTRIBUTE_INLINABLE_P \
  aarch64_function_attribute_inlinable_p

#undef TARGET_NEED_IPA_FN_TARGET_INFO
#define TARGET_NEED_IPA_FN_TARGET_INFO aarch64_need_ipa_fn_target_info

#undef TARGET_UPDATE_IPA_FN_TARGET_INFO
#define TARGET_UPDATE_IPA_FN_TARGET_INFO aarch64_update_ipa_fn_target_info

#undef TARGET_CAN_INLINE_P
#define TARGET_CAN_INLINE_P aarch64_can_inline_p

#undef TARGET_CANNOT_FORCE_CONST_MEM
#define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem

#undef TARGET_CASE_VALUES_THRESHOLD
#define TARGET_CASE_VALUES_THRESHOLD aarch64_case_values_threshold

#undef TARGET_CONDITIONAL_REGISTER_USAGE
#define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage

#undef TARGET_MEMBER_TYPE_FORCES_BLK
#define TARGET_MEMBER_TYPE_FORCES_BLK aarch64_member_type_forces_blk

/* Only the least significant bit is used for initialization guard
   variables.  */
#undef TARGET_CXX_GUARD_MASK_BIT
#define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true

#undef TARGET_C_MODE_FOR_SUFFIX
#define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix

#ifdef TARGET_BIG_ENDIAN_DEFAULT
#undef TARGET_DEFAULT_TARGET_FLAGS
#define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
#endif
#undef TARGET_CLASS_MAX_NREGS
#define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs

#undef TARGET_BUILTIN_DECL
#define TARGET_BUILTIN_DECL aarch64_builtin_decl

#undef TARGET_BUILTIN_RECIPROCAL
#define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal

#undef TARGET_C_EXCESS_PRECISION
#define TARGET_C_EXCESS_PRECISION aarch64_excess_precision

#undef TARGET_C_BITINT_TYPE_INFO
#define TARGET_C_BITINT_TYPE_INFO aarch64_bitint_type_info

#undef TARGET_C_MODE_FOR_FLOATING_TYPE
#define TARGET_C_MODE_FOR_FLOATING_TYPE aarch64_c_mode_for_floating_type

#undef TARGET_EXPAND_BUILTIN
#define TARGET_EXPAND_BUILTIN aarch64_expand_builtin

#undef TARGET_EXPAND_BUILTIN_VA_START
#define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start

#undef TARGET_FOLD_BUILTIN
#define TARGET_FOLD_BUILTIN aarch64_fold_builtin

#undef TARGET_FUNCTION_ARG
#define TARGET_FUNCTION_ARG aarch64_function_arg

#undef TARGET_FUNCTION_ARG_ADVANCE
#define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance

#undef TARGET_FUNCTION_ARG_BOUNDARY
#define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary

#undef TARGET_FUNCTION_ARG_PADDING
#define TARGET_FUNCTION_ARG_PADDING aarch64_function_arg_padding

#undef TARGET_GET_RAW_RESULT_MODE
#define TARGET_GET_RAW_RESULT_MODE aarch64_get_reg_raw_mode
#undef TARGET_GET_RAW_ARG_MODE
#define TARGET_GET_RAW_ARG_MODE aarch64_get_reg_raw_mode

#undef TARGET_FUNCTION_OK_FOR_SIBCALL
#define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall

#undef TARGET_FUNCTION_VALUE
#define TARGET_FUNCTION_VALUE aarch64_function_value

#undef TARGET_FUNCTION_VALUE_REGNO_P
#define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p

#undef TARGET_START_CALL_ARGS
#define TARGET_START_CALL_ARGS aarch64_start_call_args

#undef TARGET_END_CALL_ARGS
#define TARGET_END_CALL_ARGS aarch64_end_call_args

#undef TARGET_GIMPLE_FOLD_BUILTIN
#define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin

#undef TARGET_GIMPLIFY_VA_ARG_EXPR
#define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr

#undef TARGET_INIT_BUILTINS
#define TARGET_INIT_BUILTINS aarch64_init_builtins

#undef TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
#define TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS \
  aarch64_ira_change_pseudo_allocno_class

#undef TARGET_LEGITIMATE_ADDRESS_P
#define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p

#undef TARGET_LEGITIMATE_CONSTANT_P
#define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p

#undef TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT
#define TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT \
  aarch64_legitimize_address_displacement

#undef TARGET_LIBGCC_CMP_RETURN_MODE
#define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode

#undef TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P
#define TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P \
  aarch64_libgcc_floating_mode_supported_p

#undef TARGET_MANGLE_TYPE
#define TARGET_MANGLE_TYPE aarch64_mangle_type

#undef TARGET_INVALID_CONVERSION
#define TARGET_INVALID_CONVERSION aarch64_invalid_conversion

#undef TARGET_INVALID_UNARY_OP
#define TARGET_INVALID_UNARY_OP aarch64_invalid_unary_op

#undef TARGET_INVALID_BINARY_OP
#define TARGET_INVALID_BINARY_OP aarch64_invalid_binary_op

#undef TARGET_VERIFY_TYPE_CONTEXT
#define TARGET_VERIFY_TYPE_CONTEXT aarch64_verify_type_context

#undef TARGET_MEMORY_MOVE_COST
#define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost

#undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
#define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul

#undef TARGET_MUST_PASS_IN_STACK
#define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size

/* This target hook should return true if accesses to volatile bitfields
   should use the narrowest mode possible.  It should return false if these
   accesses should use the bitfield container type.  */
#undef TARGET_NARROW_VOLATILE_BITFIELD
#define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false
#undef TARGET_OPTION_OVERRIDE
#define TARGET_OPTION_OVERRIDE aarch64_override_options

#undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
#define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
  aarch64_override_options_after_change

#undef TARGET_OFFLOAD_OPTIONS
#define TARGET_OFFLOAD_OPTIONS aarch64_offload_options

#undef TARGET_OPTION_RESTORE
#define TARGET_OPTION_RESTORE aarch64_option_restore

#undef TARGET_OPTION_PRINT
#define TARGET_OPTION_PRINT aarch64_option_print

#undef TARGET_OPTION_VALID_ATTRIBUTE_P
#define TARGET_OPTION_VALID_ATTRIBUTE_P aarch64_option_valid_attribute_p

#undef TARGET_OPTION_VALID_VERSION_ATTRIBUTE_P
#define TARGET_OPTION_VALID_VERSION_ATTRIBUTE_P \
  aarch64_option_valid_version_attribute_p

#undef TARGET_SET_CURRENT_FUNCTION
#define TARGET_SET_CURRENT_FUNCTION aarch64_set_current_function

#undef TARGET_PASS_BY_REFERENCE
#define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference

#undef TARGET_PREFERRED_RELOAD_CLASS
#define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class

#undef TARGET_SCHED_REASSOCIATION_WIDTH
#define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width

#undef TARGET_DWARF_FRAME_REG_MODE
#define TARGET_DWARF_FRAME_REG_MODE aarch64_dwarf_frame_reg_mode

#undef TARGET_OUTPUT_CFI_DIRECTIVE
#define TARGET_OUTPUT_CFI_DIRECTIVE aarch64_output_cfi_directive

#undef TARGET_DW_CFI_OPRND1_DESC
#define TARGET_DW_CFI_OPRND1_DESC aarch64_dw_cfi_oprnd1_desc

#undef TARGET_PROMOTED_TYPE
#define TARGET_PROMOTED_TYPE aarch64_promoted_type

#undef TARGET_SECONDARY_RELOAD
#define TARGET_SECONDARY_RELOAD aarch64_secondary_reload

#undef TARGET_SECONDARY_MEMORY_NEEDED
#define TARGET_SECONDARY_MEMORY_NEEDED aarch64_secondary_memory_needed

#undef TARGET_SHIFT_TRUNCATION_MASK
#define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask

#undef TARGET_SETUP_INCOMING_VARARGS
#define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs

#undef TARGET_STRUCT_VALUE_RTX
#define TARGET_STRUCT_VALUE_RTX aarch64_struct_value_rtx

#undef TARGET_REGISTER_MOVE_COST
#define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost

#undef TARGET_RETURN_IN_MEMORY
#define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory

#undef TARGET_RETURN_IN_MSB
#define TARGET_RETURN_IN_MSB aarch64_return_in_msb

#undef TARGET_RTX_COSTS
#define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper

#undef TARGET_INSN_COST
#define TARGET_INSN_COST aarch64_insn_cost

#undef TARGET_SCALAR_MODE_SUPPORTED_P
#define TARGET_SCALAR_MODE_SUPPORTED_P aarch64_scalar_mode_supported_p

#undef TARGET_SCHED_ISSUE_RATE
#define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate

#undef TARGET_SCHED_VARIABLE_ISSUE
#define TARGET_SCHED_VARIABLE_ISSUE aarch64_sched_variable_issue

#undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
#define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
  aarch64_sched_first_cycle_multipass_dfa_lookahead

#undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD
#define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD \
  aarch64_first_cycle_multipass_dfa_lookahead_guard

#undef TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS
#define TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS \
  aarch64_get_separate_components

#undef TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB
#define TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB \
  aarch64_components_for_bb

#undef TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS
#define TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS \
  aarch64_disqualify_components

#undef TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS
#define TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS \
  aarch64_emit_prologue_components

#undef TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS
#define TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS \
  aarch64_emit_epilogue_components

#undef TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS
#define TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS \
  aarch64_set_handled_components

#undef TARGET_TRAMPOLINE_INIT
#define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init

#undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
#define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p

#undef TARGET_VECTOR_MODE_SUPPORTED_P
#define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p

#undef TARGET_VECTOR_MODE_SUPPORTED_ANY_TARGET_P
#define TARGET_VECTOR_MODE_SUPPORTED_ANY_TARGET_P aarch64_vector_mode_supported_any_target_p

#undef TARGET_COMPATIBLE_VECTOR_TYPES_P
#define TARGET_COMPATIBLE_VECTOR_TYPES_P aarch64_compatible_vector_types_p

#undef TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT
#define TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT \
  aarch64_builtin_support_vector_misalignment

#undef TARGET_ARRAY_MODE
#define TARGET_ARRAY_MODE aarch64_array_mode

#undef TARGET_ARRAY_MODE_SUPPORTED_P
#define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p

#undef TARGET_VECTORIZE_CREATE_COSTS
#define TARGET_VECTORIZE_CREATE_COSTS aarch64_vectorize_create_costs

#undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
#define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
  aarch64_builtin_vectorization_cost

#undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
#define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode

#undef TARGET_VECTORIZE_BUILTINS
#define TARGET_VECTORIZE_BUILTINS

#undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_MODES
#define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_MODES \
  aarch64_autovectorize_vector_modes

#undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
#define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
  aarch64_atomic_assign_expand_fenv

/* Section anchor support.  */

#undef TARGET_MIN_ANCHOR_OFFSET
#define TARGET_MIN_ANCHOR_OFFSET -256

/* Limit the maximum anchor offset to 4k-1, since that's the limit for a
   byte offset; we can do much more for larger data types, but have no way
   to determine the size of the access.  We assume accesses are aligned.  */
#undef TARGET_MAX_ANCHOR_OFFSET
#define TARGET_MAX_ANCHOR_OFFSET 4095
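/* Illustrative note (our example, not from the original source): the 4095
   limit corresponds to the unsigned 12-bit immediate of a byte load such as

     ldrb	w0, [x1, #4095]

   so, together with the -256 minimum above, any object placed within
   [anchor - 256, anchor + 4095] can be addressed directly from the anchor
   register without extra address arithmetic.  */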
#undef TARGET_VECTORIZE_PREFERRED_DIV_AS_SHIFTS_OVER_MULT
#define TARGET_VECTORIZE_PREFERRED_DIV_AS_SHIFTS_OVER_MULT \
  aarch64_vectorize_preferred_div_as_shifts_over_mult

#undef TARGET_VECTOR_ALIGNMENT
#define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment

#undef TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT
#define TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT \
  aarch64_vectorize_preferred_vector_alignment
#undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
#define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
  aarch64_simd_vector_alignment_reachable

/* vec_perm support.  */

#undef TARGET_VECTORIZE_VEC_PERM_CONST
#define TARGET_VECTORIZE_VEC_PERM_CONST \
  aarch64_vectorize_vec_perm_const

#undef TARGET_VECTORIZE_RELATED_MODE
#define TARGET_VECTORIZE_RELATED_MODE aarch64_vectorize_related_mode
#undef TARGET_VECTORIZE_GET_MASK_MODE
#define TARGET_VECTORIZE_GET_MASK_MODE aarch64_get_mask_mode
#undef TARGET_VECTORIZE_CONDITIONAL_OPERATION_IS_EXPENSIVE
#define TARGET_VECTORIZE_CONDITIONAL_OPERATION_IS_EXPENSIVE \
  aarch64_conditional_operation_is_expensive
#undef TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE
#define TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE \
  aarch64_empty_mask_is_expensive
#undef TARGET_PREFERRED_ELSE_VALUE
#define TARGET_PREFERRED_ELSE_VALUE \
  aarch64_preferred_else_value

#undef TARGET_INIT_LIBFUNCS
#define TARGET_INIT_LIBFUNCS aarch64_init_libfuncs

#undef TARGET_FIXED_CONDITION_CODE_REGS
#define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs

#undef TARGET_FLAGS_REGNUM
#define TARGET_FLAGS_REGNUM CC_REGNUM

#undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
#define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true

#undef TARGET_ASAN_SHADOW_OFFSET
#define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset

#undef TARGET_LEGITIMIZE_ADDRESS
#define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address

#undef TARGET_SCHED_CAN_SPECULATE_INSN
#define TARGET_SCHED_CAN_SPECULATE_INSN aarch64_sched_can_speculate_insn

#undef TARGET_CAN_USE_DOLOOP_P
#define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost

#undef TARGET_SCHED_ADJUST_PRIORITY
#define TARGET_SCHED_ADJUST_PRIORITY aarch64_sched_adjust_priority

#undef TARGET_SCHED_MACRO_FUSION_P
#define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p

#undef TARGET_SCHED_MACRO_FUSION_PAIR_P
#define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p

#undef TARGET_SCHED_FUSION_PRIORITY
#define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority

#undef TARGET_UNSPEC_MAY_TRAP_P
#define TARGET_UNSPEC_MAY_TRAP_P aarch64_unspec_may_trap_p

#undef TARGET_USE_PSEUDO_PIC_REG
#define TARGET_USE_PSEUDO_PIC_REG aarch64_use_pseudo_pic_reg

#undef TARGET_PRINT_OPERAND
#define TARGET_PRINT_OPERAND aarch64_print_operand

#undef TARGET_PRINT_OPERAND_ADDRESS
#define TARGET_PRINT_OPERAND_ADDRESS aarch64_print_operand_address

#undef TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA
#define TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA aarch64_output_addr_const_extra

#undef TARGET_OPTAB_SUPPORTED_P
#define TARGET_OPTAB_SUPPORTED_P aarch64_optab_supported_p

#undef TARGET_OMIT_STRUCT_RETURN_REG
#define TARGET_OMIT_STRUCT_RETURN_REG true

#undef TARGET_DWARF_POLY_INDETERMINATE_VALUE
#define TARGET_DWARF_POLY_INDETERMINATE_VALUE \
  aarch64_dwarf_poly_indeterminate_value

/* The architecture reserves bits 0 and 1 so use bit 2 for descriptors.  */
#undef TARGET_CUSTOM_FUNCTION_DESCRIPTORS
#define TARGET_CUSTOM_FUNCTION_DESCRIPTORS 4

#undef TARGET_HARD_REGNO_NREGS
#define TARGET_HARD_REGNO_NREGS aarch64_hard_regno_nregs
#undef TARGET_HARD_REGNO_MODE_OK
#define TARGET_HARD_REGNO_MODE_OK aarch64_hard_regno_mode_ok

#undef TARGET_MODES_TIEABLE_P
#define TARGET_MODES_TIEABLE_P aarch64_modes_tieable_p

#undef TARGET_HARD_REGNO_CALL_PART_CLOBBERED
#define TARGET_HARD_REGNO_CALL_PART_CLOBBERED \
  aarch64_hard_regno_call_part_clobbered

#undef TARGET_INSN_CALLEE_ABI
#define TARGET_INSN_CALLEE_ABI aarch64_insn_callee_abi

#undef TARGET_CONSTANT_ALIGNMENT
#define TARGET_CONSTANT_ALIGNMENT aarch64_constant_alignment

#undef TARGET_STACK_CLASH_PROTECTION_ALLOCA_PROBE_RANGE
#define TARGET_STACK_CLASH_PROTECTION_ALLOCA_PROBE_RANGE \
  aarch64_stack_clash_protection_alloca_probe_range

#undef TARGET_COMPUTE_PRESSURE_CLASSES
#define TARGET_COMPUTE_PRESSURE_CLASSES aarch64_compute_pressure_classes

#undef TARGET_CAN_CHANGE_MODE_CLASS
#define TARGET_CAN_CHANGE_MODE_CLASS aarch64_can_change_mode_class
#undef TARGET_SELECT_EARLY_REMAT_MODES
#define TARGET_SELECT_EARLY_REMAT_MODES aarch64_select_early_remat_modes

#undef TARGET_SPECULATION_SAFE_VALUE
#define TARGET_SPECULATION_SAFE_VALUE aarch64_speculation_safe_value

#undef TARGET_ESTIMATED_POLY_VALUE
#define TARGET_ESTIMATED_POLY_VALUE aarch64_estimated_poly_value

#undef TARGET_ATTRIBUTE_TABLE
#define TARGET_ATTRIBUTE_TABLE aarch64_attribute_table

#undef TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN
#define TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN \
  aarch64_simd_clone_compute_vecsize_and_simdlen

#undef TARGET_SIMD_CLONE_ADJUST
#define TARGET_SIMD_CLONE_ADJUST aarch64_simd_clone_adjust

#undef TARGET_SIMD_CLONE_USABLE
#define TARGET_SIMD_CLONE_USABLE aarch64_simd_clone_usable

#undef TARGET_COMP_TYPE_ATTRIBUTES
#define TARGET_COMP_TYPE_ATTRIBUTES aarch64_comp_type_attributes

#undef TARGET_MERGE_DECL_ATTRIBUTES
#define TARGET_MERGE_DECL_ATTRIBUTES aarch64_merge_decl_attributes

#undef TARGET_GET_MULTILIB_ABI_NAME
#define TARGET_GET_MULTILIB_ABI_NAME aarch64_get_multilib_abi_name

#undef TARGET_FNTYPE_ABI
#define TARGET_FNTYPE_ABI aarch64_fntype_abi

#undef TARGET_MEMTAG_CAN_TAG_ADDRESSES
#define TARGET_MEMTAG_CAN_TAG_ADDRESSES aarch64_can_tag_addresses

#if CHECKING_P
#undef TARGET_RUN_TARGET_SELFTESTS
#define TARGET_RUN_TARGET_SELFTESTS selftest::aarch64_run_selftests
#endif /* #if CHECKING_P */

#undef TARGET_ASM_POST_CFI_STARTPROC
#define TARGET_ASM_POST_CFI_STARTPROC aarch64_post_cfi_startproc

#undef TARGET_STRICT_ARGUMENT_NAMING
#define TARGET_STRICT_ARGUMENT_NAMING hook_bool_CUMULATIVE_ARGS_true

#undef TARGET_MODE_EMIT
#define TARGET_MODE_EMIT aarch64_mode_emit

#undef TARGET_MODE_NEEDED
#define TARGET_MODE_NEEDED aarch64_mode_needed

#undef TARGET_MODE_AFTER
#define TARGET_MODE_AFTER aarch64_mode_after

#undef TARGET_MODE_CONFLUENCE
#define TARGET_MODE_CONFLUENCE aarch64_mode_confluence

#undef TARGET_MODE_BACKPROP
#define TARGET_MODE_BACKPROP aarch64_mode_backprop

#undef TARGET_MODE_ENTRY
#define TARGET_MODE_ENTRY aarch64_mode_entry

#undef TARGET_MODE_EXIT
#define TARGET_MODE_EXIT aarch64_mode_exit

#undef TARGET_MODE_EH_HANDLER
#define TARGET_MODE_EH_HANDLER aarch64_mode_eh_handler

#undef TARGET_MODE_PRIORITY
#define TARGET_MODE_PRIORITY aarch64_mode_priority

#undef TARGET_MD_ASM_ADJUST
#define TARGET_MD_ASM_ADJUST aarch64_md_asm_adjust

#undef TARGET_ASM_FILE_END
#define TARGET_ASM_FILE_END aarch64_asm_file_end

#undef TARGET_ASM_FUNCTION_EPILOGUE
#define TARGET_ASM_FUNCTION_EPILOGUE aarch64_sls_emit_blr_function_thunks

#undef TARGET_HAVE_SHADOW_CALL_STACK
#define TARGET_HAVE_SHADOW_CALL_STACK true

#undef TARGET_CONST_ANCHOR
#define TARGET_CONST_ANCHOR 0x1000000

#undef TARGET_EXTRA_LIVE_ON_ENTRY
#define TARGET_EXTRA_LIVE_ON_ENTRY aarch64_extra_live_on_entry

#undef TARGET_USE_LATE_PROLOGUE_EPILOGUE
#define TARGET_USE_LATE_PROLOGUE_EPILOGUE aarch64_use_late_prologue_epilogue

#undef TARGET_EMIT_EPILOGUE_FOR_SIBCALL
#define TARGET_EMIT_EPILOGUE_FOR_SIBCALL aarch64_expand_epilogue

#undef TARGET_OPTION_FUNCTION_VERSIONS
#define TARGET_OPTION_FUNCTION_VERSIONS aarch64_common_function_versions

#undef TARGET_COMPARE_VERSION_PRIORITY
#define TARGET_COMPARE_VERSION_PRIORITY aarch64_compare_version_priority

#undef TARGET_GENERATE_VERSION_DISPATCHER_BODY
#define TARGET_GENERATE_VERSION_DISPATCHER_BODY \
  aarch64_generate_version_dispatcher_body

#undef TARGET_GET_FUNCTION_VERSIONS_DISPATCHER
#define TARGET_GET_FUNCTION_VERSIONS_DISPATCHER \
  aarch64_get_function_versions_dispatcher

#undef TARGET_MANGLE_DECL_ASSEMBLER_NAME
#define TARGET_MANGLE_DECL_ASSEMBLER_NAME aarch64_mangle_decl_assembler_name

#undef TARGET_DOCUMENTATION_NAME
#define TARGET_DOCUMENTATION_NAME "AArch64"

struct gcc_target targetm = TARGET_INITIALIZER;

#include "gt-aarch64.h"