/* Machine description for AArch64 architecture.
   Copyright (C) 2009-2025 Free Software Foundation, Inc.
   Contributed by ARM Ltd.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify it
   under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   GCC is distributed in the hope that it will be useful, but
   WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with GCC; see the file COPYING3.  If not see
   <http://www.gnu.org/licenses/>.  */

#define IN_TARGET_CODE 1

#define INCLUDE_STRING
#define INCLUDE_ALGORITHM
#define INCLUDE_VECTOR
#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "backend.h"
#include "target.h"
#include "rtl.h"
#include "tree.h"
#include "memmodel.h"
#include "gimple.h"
#include "cfghooks.h"
#include "cfgloop.h"
#include "df.h"
#include "tm_p.h"
#include "stringpool.h"
#include "attribs.h"
#include "optabs.h"
#include "regs.h"
#include "emit-rtl.h"
#include "recog.h"
#include "cgraph.h"
#include "diagnostic.h"
#include "insn-attr.h"
#include "alias.h"
#include "fold-const.h"
#include "stor-layout.h"
#include "calls.h"
#include "varasm.h"
#include "output.h"
#include "flags.h"
#include "explow.h"
#include "expr.h"
#include "reload.h"
#include "langhooks.h"
#include "opts.h"
#include "gimplify.h"
#include "dwarf2.h"
#include "dwarf2out.h"
#include "gimple-iterator.h"
#include "tree-vectorizer.h"
#include "aarch64-cost-tables.h"
#include "dumpfile.h"
#include "builtins.h"
#include "rtl-iter.h"
#include "tm-constrs.h"
#include "sched-int.h"
#include "target-globals.h"
#include "common/common-target.h"
#include "cfgrtl.h"
#include "selftest.h"
#include "selftest-rtl.h"
#include "rtx-vector-builder.h"
#include "intl.h"
#include "expmed.h"
#include "function-abi.h"
#include "gimple-pretty-print.h"
#include "tree-ssa-loop-niter.h"
#include "fractional-cost.h"
#include "rtlanal.h"
#include "tree-dfa.h"
#include "asan.h"
#include "aarch64-feature-deps.h"
#include "config/arm/aarch-common.h"
#include "config/arm/aarch-common-protos.h"
#include "common/config/aarch64/cpuinfo.h"
#include "ssa.h"
#include "except.h"
#include "tree-pass.h"
#include "cfgbuild.h"
#include "symbol-summary.h"
#include "sreal.h"
#include "ipa-cp.h"
#include "ipa-prop.h"
#include "ipa-fnsummary.h"
#include "hash-map.h"

/* This file should be included last.  */
#include "target-def.h"

/* Defined for convenience.  */
#define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)

/* Maximum bytes set for an inline memset expansion.  With -Os use 3 STP
   and 1 MOVI/DUP (same size as a call).  */
#define MAX_SET_SIZE(speed) (speed ? 256 : 96)

/* Flags that describe how a function shares certain architectural state
   with its callers.

   - AARCH64_STATE_SHARED indicates that the function does share the state
     with callers.

   - AARCH64_STATE_IN indicates that the function reads (or might read) the
     incoming state.  The converse is that the function ignores the incoming
     state.

   - AARCH64_STATE_OUT indicates that the function returns new state.
     The converse is that the state on return is the same as it was on entry.

   A function that partially modifies the state treats it as both IN
   and OUT (because the value on return depends to some extent on the
   value on input).  */
constexpr auto AARCH64_STATE_SHARED = 1U << 0;
constexpr auto AARCH64_STATE_IN = 1U << 1;
constexpr auto AARCH64_STATE_OUT = 1U << 2;
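
/* For example (see aarch64_attribute_shared_state_flags below), arm::in
   corresponds to AARCH64_STATE_SHARED | AARCH64_STATE_IN, arm::inout to
   all three flags, and arm::preserves to AARCH64_STATE_SHARED alone.  */
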
/* Enum to distinguish which type of check is to be done in
   aarch64_simd_valid_imm.  */
enum simd_immediate_check {
  AARCH64_CHECK_MOV,
  AARCH64_CHECK_ORR,
  AARCH64_CHECK_AND,
  AARCH64_CHECK_XOR
};

/* Information about a legitimate vector immediate operand.  */
struct simd_immediate_info
{
  enum insn_type { MOV, MVN, INDEX, PTRUE, SVE_MOV };
  enum modifier_type { LSL, MSL };

  simd_immediate_info () {}
  simd_immediate_info (scalar_float_mode, rtx);
  simd_immediate_info (scalar_int_mode, unsigned HOST_WIDE_INT,
		       insn_type = MOV, modifier_type = LSL,
		       unsigned int = 0);
  simd_immediate_info (scalar_mode, rtx, rtx);
  simd_immediate_info (scalar_int_mode, aarch64_svpattern);

  /* The mode of the elements.  */
  scalar_mode elt_mode;

  /* The instruction to use to move the immediate into a vector.  */
  insn_type insn;

  union
  {
    /* For MOV and MVN.  */
    struct
    {
      /* The value of each element.  */
      rtx value;

      /* The kind of shift modifier to use, and the number of bits to shift.
	 This is (LSL, 0) if no shift is needed.  */
      modifier_type modifier;
      unsigned int shift;
    } mov;

    /* For INDEX.  */
    struct
    {
      /* The value of the first element and the step to be added for each
	 subsequent element.  */
      rtx base, step;
    } index;

    /* For PTRUE.  */
    aarch64_svpattern pattern;
  } u;
};

/* Construct a floating-point immediate in which each element has mode
   ELT_MODE_IN and value VALUE_IN.  */
inline simd_immediate_info
::simd_immediate_info (scalar_float_mode elt_mode_in, rtx value_in)
  : elt_mode (elt_mode_in), insn (MOV)
{
  u.mov.value = value_in;
  u.mov.modifier = LSL;
  u.mov.shift = 0;
}

/* Construct an integer immediate in which each element has mode ELT_MODE_IN
   and value VALUE_IN.  The other parameters are as for the structure
   fields.  */
inline simd_immediate_info
::simd_immediate_info (scalar_int_mode elt_mode_in,
		       unsigned HOST_WIDE_INT value_in,
		       insn_type insn_in, modifier_type modifier_in,
		       unsigned int shift_in)
  : elt_mode (elt_mode_in), insn (insn_in)
{
  u.mov.value = gen_int_mode (value_in, elt_mode_in);
  u.mov.modifier = modifier_in;
  u.mov.shift = shift_in;
}

/* Construct an integer immediate in which each element has mode ELT_MODE_IN
   and where element I is equal to BASE_IN + I * STEP_IN.  */
inline simd_immediate_info
::simd_immediate_info (scalar_mode elt_mode_in, rtx base_in, rtx step_in)
  : elt_mode (elt_mode_in), insn (INDEX)
{
  u.index.base = base_in;
  u.index.step = step_in;
}

/* Construct a predicate that controls elements of mode ELT_MODE_IN
   and has PTRUE pattern PATTERN_IN.  */
inline simd_immediate_info
::simd_immediate_info (scalar_int_mode elt_mode_in,
		       aarch64_svpattern pattern_in)
  : elt_mode (elt_mode_in), insn (PTRUE)
{
  u.pattern = pattern_in;
}
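
/* Illustrative example (not taken from the original sources): a caller
   that recognizes the immediate 0x5500 in each 16-bit element could
   describe it as

     simd_immediate_info (HImode, 0x55, simd_immediate_info::MOV,
			  simd_immediate_info::LSL, 8);

   i.e. the value 0x55 with an LSL #8 modifier.  */
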
namespace {

/* Describes types that map to Pure Scalable Types (PSTs) in the AAPCS64.  */
class pure_scalable_type_info
{
public:
  /* Represents the result of analyzing a type.  All values are nonzero,
     in the possibly forlorn hope that accidental conversions to bool
     trigger a warning.  */
  enum analysis_result
  {
    /* The type does not have an ABI identity; i.e. it doesn't contain
       at least one object whose type is a Fundamental Data Type.  */
    NO_ABI_IDENTITY = 1,

    /* The type is definitely a Pure Scalable Type.  */
    IS_PST,

    /* The type is definitely not a Pure Scalable Type.  */
    ISNT_PST,

    /* It doesn't matter for PCS purposes whether the type is a Pure
       Scalable Type or not, since the type will be handled the same
       way regardless.

       Specifically, this means that if the type is a Pure Scalable Type,
       there aren't enough argument registers to hold it, and so it will
       need to be passed or returned in memory.  If the type isn't a
       Pure Scalable Type, it's too big to be passed or returned in core
       or SIMD&FP registers, and so again will need to go in memory.  */
    DOESNT_MATTER
  };

  /* Aggregates of 17 bytes or more are normally passed and returned
     in memory, so aggregates of that size can safely be analyzed as
     DOESNT_MATTER.  We need to be able to collect enough pieces to
     represent a PST that is smaller than that.  Since predicates are
     2 bytes in size for -msve-vector-bits=128, that means we need to be
     able to store at least 8 pieces.

     We also need to be able to store enough pieces to represent
     a single vector in each vector argument register and a single
     predicate in each predicate argument register.  This means that
     we need at least 12 pieces.  */
  static const unsigned int MAX_PIECES = NUM_FP_ARG_REGS + NUM_PR_ARG_REGS;
  static_assert (MAX_PIECES >= 8, "Need to store at least 8 predicates");
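
  /* With the AAPCS64 argument registers assumed here (NUM_FP_ARG_REGS == 8,
     for z0-z7, and NUM_PR_ARG_REGS == 4, for p0-p3), MAX_PIECES is 12.  */
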
  /* Describes one piece of a PST.  Each piece is one of:

     - a single Scalable Vector Type (SVT)
     - a single Scalable Predicate Type (SPT)
     - a PST containing 2, 3 or 4 SVTs, with no padding

     It either represents a single built-in type or a PST formed from
     multiple homogeneous built-in types.  */
  struct piece
  {
    rtx get_rtx (unsigned int, unsigned int) const;

    /* The number of vector and predicate registers that the piece
       occupies.  One of the two is always zero.  */
    unsigned int num_zr;
    unsigned int num_pr;

    /* The mode of the registers described above.  */
    machine_mode mode;

    /* If this piece is formed from multiple homogeneous built-in types,
       this is the mode of the built-in types, otherwise it is MODE.  */
    machine_mode orig_mode;

    /* The offset in bytes of the piece from the start of the type.  */
    poly_uint64 offset;
  };

  /* Divides types analyzed as IS_PST into individual pieces.  The pieces
     are in memory order.  */
  auto_vec<piece, MAX_PIECES> pieces;

  unsigned int num_zr () const;
  unsigned int num_pr () const;

  rtx get_rtx (machine_mode mode, unsigned int, unsigned int) const;

  analysis_result analyze (const_tree);
  bool analyze_registers (const_tree);

private:
  analysis_result analyze_array (const_tree);
  analysis_result analyze_record (const_tree);
  void add_piece (const piece &);
};
}

/* The current code model.  */
enum aarch64_code_model aarch64_cmodel;

enum aarch64_tp_reg aarch64_tpidr_register;

/* The number of 64-bit elements in an SVE vector.  */
poly_uint16 aarch64_sve_vg;

#ifdef HAVE_AS_TLS
#undef TARGET_HAVE_TLS
#define TARGET_HAVE_TLS 1
#endif

static bool aarch64_composite_type_p (const_tree, machine_mode);
static bool aarch64_return_in_memory_1 (const_tree);
static bool aarch64_vfp_is_call_or_return_candidate (machine_mode,
						     const_tree,
						     machine_mode *, int *,
						     bool *, bool);
static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED;
static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
static void aarch64_override_options_after_change (void);
static bool aarch64_vector_mode_supported_p (machine_mode);
static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool);
static bool aarch64_builtin_support_vector_misalignment (machine_mode mode,
							 const_tree type,
							 int misalignment,
							 bool is_packed);
static machine_mode aarch64_simd_container_mode (scalar_mode, poly_int64);
static bool aarch64_print_address_internal (FILE*, machine_mode, rtx,
					    aarch64_addr_query_type);

/* The processor for which instructions should be scheduled.  */
enum aarch64_cpu aarch64_tune = AARCH64_CPU_cortexa53;

/* Global flag for PC relative loads.  */
bool aarch64_pcrelative_literal_loads;

/* Global flag for whether frame pointer is enabled.  */
bool aarch64_use_frame_pointer;

/* Support for command line parsing of boolean flags in the tuning
   structures.  */
struct aarch64_flag_desc
{
  const char* name;
  unsigned int flag;
};

#define AARCH64_FUSION_PAIR(name, internal_name) \
  { name, AARCH64_FUSE_##internal_name },
static const struct aarch64_flag_desc aarch64_fusible_pairs[] =
{
  { "none", AARCH64_FUSE_NOTHING },
#include "aarch64-fusion-pairs.def"
  { "all", AARCH64_FUSE_ALL },
  { NULL, AARCH64_FUSE_NOTHING }
};

#define AARCH64_EXTRA_TUNING_OPTION(name, internal_name) \
  { name, AARCH64_EXTRA_TUNE_##internal_name },
static const struct aarch64_flag_desc aarch64_tuning_flags[] =
{
  { "none", AARCH64_EXTRA_TUNE_NONE },
#include "aarch64-tuning-flags.def"
  { "all", AARCH64_EXTRA_TUNE_ALL },
  { NULL, AARCH64_EXTRA_TUNE_NONE }
};

/* Tuning parameters.  */
#include "tuning_models/generic.h"
#include "tuning_models/generic_armv8_a.h"
#include "tuning_models/generic_armv9_a.h"
#include "tuning_models/cortexa35.h"
#include "tuning_models/cortexa53.h"
#include "tuning_models/cortexa57.h"
#include "tuning_models/cortexa72.h"
#include "tuning_models/cortexa73.h"
#include "tuning_models/cortexx925.h"
#include "tuning_models/exynosm1.h"
#include "tuning_models/thunderxt88.h"
#include "tuning_models/thunderx.h"
#include "tuning_models/tsv110.h"
#include "tuning_models/xgene1.h"
#include "tuning_models/emag.h"
#include "tuning_models/qdf24xx.h"
#include "tuning_models/saphira.h"
#include "tuning_models/thunderx2t99.h"
#include "tuning_models/thunderx3t110.h"
#include "tuning_models/neoversen1.h"
#include "tuning_models/ampere1.h"
#include "tuning_models/ampere1a.h"
#include "tuning_models/ampere1b.h"
#include "tuning_models/neoversev1.h"
#include "tuning_models/neoverse512tvb.h"
#include "tuning_models/neoversen2.h"
#include "tuning_models/neoversen3.h"
#include "tuning_models/neoversev2.h"
#include "tuning_models/neoversev3.h"
#include "tuning_models/neoversev3ae.h"
#include "tuning_models/a64fx.h"
#include "tuning_models/fujitsu_monaka.h"

/* Support for fine-grained override of the tuning structures.  */
struct aarch64_tuning_override_function
{
  const char* name;
  void (*parse_override)(const char*, struct tune_params*);
};

static void aarch64_parse_fuse_string (const char*, struct tune_params*);
static void aarch64_parse_tune_string (const char*, struct tune_params*);
static void aarch64_parse_sve_width_string (const char*, struct tune_params*);

static const struct aarch64_tuning_override_function
  aarch64_tuning_override_functions[] =
{
  { "fuse", aarch64_parse_fuse_string },
  { "tune", aarch64_parse_tune_string },
  { "sve_width", aarch64_parse_sve_width_string },
  { NULL, NULL }
};

/* A processor implementing AArch64.  */
struct processor
{
  const char *name;
  aarch64_cpu ident;
  aarch64_cpu sched_core;
  aarch64_arch arch;
  aarch64_feature_flags flags;
  const tune_params *tune;
};

/* Architectures implementing AArch64.  */
static CONSTEXPR const processor all_architectures[] =
{
#define AARCH64_ARCH(NAME, CORE, ARCH_IDENT, D, E) \
  {NAME, AARCH64_CPU_##CORE, AARCH64_CPU_##CORE, AARCH64_ARCH_##ARCH_IDENT, \
   feature_deps::ARCH_IDENT ().enable, NULL},
#include "aarch64-arches.def"
  {NULL, aarch64_no_cpu, aarch64_no_cpu, aarch64_no_arch, 0, NULL}
};

/* Processor cores implementing AArch64.  */
static const struct processor all_cores[] =
{
#define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, E, COSTS, G, H, I) \
  {NAME, AARCH64_CPU_##IDENT, AARCH64_CPU_##SCHED, AARCH64_ARCH_##ARCH, \
   feature_deps::cpu_##IDENT, &COSTS##_tunings},
#include "aarch64-cores.def"
  {NULL, aarch64_no_cpu, aarch64_no_cpu, aarch64_no_arch, 0, NULL}
};
/* Internal representation of system registers.  */
typedef struct {
  const char *name;
  /* Stringified sysreg encoding values, represented as
     s<sn>_<op1>_c<cn>_c<cm>_<op2>.  */
  const char *encoding;
  /* Flags affecting sysreg usage, such as read/write-only.  */
  unsigned properties;
  /* Architectural features implied by sysreg.  */
  aarch64_feature_flags arch_reqs;
} sysreg_t;
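
/* For example, a register encoded with op0=3, op1=3, CRn=13, CRm=0, op2=2
   (tpidr_el0 in the architecture) would be stringified as
   "s3_3_c13_c0_2".  */
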
/* An aarch64_feature_set initializer for a single feature,
   AARCH64_FEATURE_<FEAT>.  */
#define AARCH64_FEATURE(FEAT) AARCH64_FL_##FEAT

/* Used by AARCH64_FEATURES.  */
#define AARCH64_OR_FEATURES_1(X, F1) \
  AARCH64_FEATURE (F1)
#define AARCH64_OR_FEATURES_2(X, F1, F2) \
  (AARCH64_FEATURE (F1) | AARCH64_OR_FEATURES_1 (X, F2))
#define AARCH64_OR_FEATURES_3(X, F1, ...) \
  (AARCH64_FEATURE (F1) | AARCH64_OR_FEATURES_2 (X, __VA_ARGS__))

/* An aarch64_feature_set initializer for the N features listed in "...".  */
#define AARCH64_FEATURES(N, ...) \
  AARCH64_OR_FEATURES_##N (0, __VA_ARGS__)
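
/* For example, AARCH64_FEATURES (2, F1, F2) expands via
   AARCH64_OR_FEATURES_2 to (AARCH64_FEATURE (F1) | AARCH64_FEATURE (F2)),
   i.e. (AARCH64_FL_F1 | AARCH64_FL_F2).  */
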
#define AARCH64_NO_FEATURES 0

/* Flags associated with the properties of system registers.  They mainly
   serve to mark particular registers as read- or write-only.  */
#define F_DEPRECATED (1 << 1)
#define F_REG_READ (1 << 2)
#define F_REG_WRITE (1 << 3)
#define F_ARCHEXT (1 << 4)
/* Flag indicating register name is alias for another system register.  */
#define F_REG_ALIAS (1 << 5)
/* Flag indicating registers which may be implemented with 128-bits.  */
#define F_REG_128 (1 << 6)

/* Database of system registers, their encodings and architectural
   requirements.  */
const sysreg_t aarch64_sysregs[] =
{
#define CPENC(SN, OP1, CN, CM, OP2) "s"#SN"_"#OP1"_c"#CN"_c"#CM"_"#OP2
#define SYSREG(NAME, ENC, FLAGS, ARCH) \
  { NAME, ENC, FLAGS, ARCH },
#include "aarch64-sys-regs.def"
#undef CPENC
};

#undef AARCH64_NO_FEATURES

using sysreg_map_t = hash_map<nofree_string_hash, const sysreg_t *>;
static sysreg_map_t *sysreg_map = nullptr;

/* Map system register names to their hardware metadata: encoding,
   feature flags and architectural feature requirements, all of which
   are encoded in a sysreg_t struct.  */
void
aarch64_register_sysreg (const char *name, const sysreg_t *metadata)
{
  bool dup = sysreg_map->put (name, metadata);
  gcc_checking_assert (!dup);
}

/* Lazily initialize the hash table used for system register validation,
   checking the validity of each supplied register name and recording the
   register's associated metadata.  */
static void
aarch64_init_sysregs (void)
{
  gcc_assert (!sysreg_map);
  sysreg_map = new sysreg_map_t;

  for (unsigned i = 0; i < ARRAY_SIZE (aarch64_sysregs); i++)
    {
      const sysreg_t *reg = aarch64_sysregs + i;
      aarch64_register_sysreg (reg->name, reg);
    }
}

/* No direct access to the sysreg hash-map should be made: doing so risks
   trying to access an uninitialized hash-map, and dereferencing the
   returned double pointer without due care risks dereferencing a
   null pointer.  */
const sysreg_t *
aarch64_lookup_sysreg_map (const char *regname)
{
  if (!sysreg_map)
    aarch64_init_sysregs ();

  const sysreg_t **sysreg_entry = sysreg_map->get (regname);
  if (sysreg_entry != NULL)
    return *sysreg_entry;
  return NULL;
}
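
/* Illustrative use (register name assumed): a caller that needs to
   validate a user-supplied system register name can do

     const sysreg_t *sysreg = aarch64_lookup_sysreg_map ("tpidr_el0");

   and treat a null result as an unknown register.  */
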
/* The current tuning set.  */
struct tune_params aarch64_tune_params = generic_tunings;

/* If NAME is the name of an arm:: attribute that describes shared state,
   return its associated AARCH64_STATE_* flags, otherwise return 0.  */
static unsigned int
aarch64_attribute_shared_state_flags (const char *name)
{
  if (strcmp (name, "in") == 0)
    return AARCH64_STATE_SHARED | AARCH64_STATE_IN;
  if (strcmp (name, "inout") == 0)
    return AARCH64_STATE_SHARED | AARCH64_STATE_IN | AARCH64_STATE_OUT;
  if (strcmp (name, "out") == 0)
    return AARCH64_STATE_SHARED | AARCH64_STATE_OUT;
  if (strcmp (name, "preserves") == 0)
    return AARCH64_STATE_SHARED;
  return 0;
}

/* See whether attribute list ATTRS has any sharing information
   for state STATE_NAME.  Return the associated state flags if so,
   otherwise return 0.  */
static unsigned int
aarch64_lookup_shared_state_flags (tree attrs, const char *state_name)
{
  for (tree attr = attrs; attr; attr = TREE_CHAIN (attr))
    {
      if (!is_attribute_namespace_p ("arm", attr))
	continue;

      auto attr_name = IDENTIFIER_POINTER (get_attribute_name (attr));
      auto flags = aarch64_attribute_shared_state_flags (attr_name);
      if (!flags)
	continue;

      for (tree arg = TREE_VALUE (attr); arg; arg = TREE_CHAIN (arg))
	{
	  tree value = TREE_VALUE (arg);
	  if (TREE_CODE (value) == STRING_CST
	      && strcmp (TREE_STRING_POINTER (value), state_name) == 0)
	    return flags;
	}
    }
  return 0;
}

/* Return true if DECL creates a new scope for state STATE_NAME.  */
static bool
aarch64_fndecl_has_new_state (const_tree decl, const char *state_name)
{
  if (tree attr = lookup_attribute ("arm", "new", DECL_ATTRIBUTES (decl)))
    for (tree arg = TREE_VALUE (attr); arg; arg = TREE_CHAIN (arg))
      {
	tree value = TREE_VALUE (arg);
	if (TREE_CODE (value) == STRING_CST
	    && strcmp (TREE_STRING_POINTER (value), state_name) == 0)
	  return true;
      }
  return false;
}

/* Return true if attribute argument VALUE is a recognized state string,
   otherwise report an error.  NAME is the name of the attribute to which
   VALUE is being passed.  */
static bool
aarch64_check_state_string (tree name, tree value)
{
  if (TREE_CODE (value) != STRING_CST)
    {
      error ("the arguments to %qE must be constant strings", name);
      return false;
    }

  const char *state_name = TREE_STRING_POINTER (value);
  if (strcmp (state_name, "za") != 0
      && strcmp (state_name, "zt0") != 0)
    {
      error ("unrecognized state string %qs", state_name);
      return false;
    }

  return true;
}

/* qsort callback to compare two STRING_CSTs.  */
static int
cmp_string_csts (const void *a, const void *b)
{
  return strcmp (TREE_STRING_POINTER (*(const_tree const *) a),
		 TREE_STRING_POINTER (*(const_tree const *) b));
}

/* Canonicalize a list of state strings.  ARGS contains the arguments to
   a new attribute while OLD_ATTR, if nonnull, contains a previous attribute
   of the same type.  If CAN_MERGE_IN_PLACE, it is safe to adjust OLD_ATTR's
   arguments and drop the new attribute.  Otherwise, the new attribute must
   be kept and ARGS must include the information in OLD_ATTR.

   In both cases, the new arguments must be a sorted list of state strings
   with duplicates removed.

   Return true if the new attribute should be kept, false if it should be
   dropped.  */
static bool
aarch64_merge_string_arguments (tree args, tree old_attr,
				bool can_merge_in_place)
{
  /* Get a sorted list of all state strings (including duplicates).  */
  auto add_args = [](vec<tree> &strings, const_tree args)
    {
      for (const_tree arg = args; arg; arg = TREE_CHAIN (arg))
	if (TREE_CODE (TREE_VALUE (arg)) == STRING_CST)
	  strings.safe_push (TREE_VALUE (arg));
    };
  auto_vec<tree, 16> strings;
  add_args (strings, args);
  if (old_attr)
    add_args (strings, TREE_VALUE (old_attr));
  strings.qsort (cmp_string_csts);

  /* The list can be empty if there was no previous attribute and if all
     the new arguments are erroneous.  Drop the attribute in that case.  */
  if (strings.is_empty ())
    return false;

  /* Destructively modify one of the argument lists, removing duplicates
     on the fly.  */
  bool use_old_attr = old_attr && can_merge_in_place;
  tree *end = use_old_attr ? &TREE_VALUE (old_attr) : &args;
  tree prev = NULL_TREE;
  for (tree arg : strings)
    {
      if (prev && simple_cst_equal (arg, prev))
	continue;
      prev = arg;
      if (!*end)
	*end = tree_cons (NULL_TREE, arg, NULL_TREE);
      else
	TREE_VALUE (*end) = arg;
      end = &TREE_CHAIN (*end);
    }
  *end = NULL_TREE;
  return !use_old_attr;
}
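
/* For example, merging a new arm::inout("za") attribute with an existing
   arm::inout("zt0", "za") attribute leaves the single sorted argument
   list ("za", "zt0"), with the duplicate "za" removed.  */
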
/* Check whether an 'aarch64_vector_pcs' attribute is valid.  */

static tree
handle_aarch64_vector_pcs_attribute (tree *node, tree name, tree,
				     int, bool *no_add_attrs)
{
  /* Since we set fn_type_req to true, the caller should have checked
     this for us.  */
  gcc_assert (FUNC_OR_METHOD_TYPE_P (*node));
  switch ((arm_pcs) fntype_abi (*node).id ())
    {
    case ARM_PCS_AAPCS64:
    case ARM_PCS_SIMD:
      return NULL_TREE;

    case ARM_PCS_SVE:
      error ("the %qE attribute cannot be applied to an SVE function type",
	     name);
      *no_add_attrs = true;
      return NULL_TREE;

    case ARM_PCS_TLSDESC:
    case ARM_PCS_UNKNOWN:
      break;
    }
  gcc_unreachable ();
}

/* Return true if arm::new(ARGS) is compatible with the type of decl DECL,
   otherwise report an error.  */
static bool
aarch64_check_arm_new_against_type (tree args, tree decl)
{
  tree type_attrs = TYPE_ATTRIBUTES (TREE_TYPE (decl));
  for (tree arg = args; arg; arg = TREE_CHAIN (arg))
    {
      tree value = TREE_VALUE (arg);
      if (TREE_CODE (value) == STRING_CST)
	{
	  const char *state_name = TREE_STRING_POINTER (value);
	  if (aarch64_lookup_shared_state_flags (type_attrs, state_name))
	    {
	      error_at (DECL_SOURCE_LOCATION (decl),
			"cannot create a new %qs scope since %qs is shared"
			" with callers", state_name, state_name);
	      return false;
	    }
	}
    }
  return true;
}

/* Callback for arm::new attributes.  */
static tree
handle_arm_new (tree *node, tree name, tree args, int, bool *no_add_attrs)
{
  tree decl = *node;
  if (TREE_CODE (decl) != FUNCTION_DECL)
    {
      error ("%qE attribute applies only to function definitions", name);
      *no_add_attrs = true;
      return NULL_TREE;
    }
  if (TREE_TYPE (decl) == error_mark_node)
    {
      *no_add_attrs = true;
      return NULL_TREE;
    }

  for (tree arg = args; arg; arg = TREE_CHAIN (arg))
    aarch64_check_state_string (name, TREE_VALUE (arg));

  if (!aarch64_check_arm_new_against_type (args, decl))
    {
      *no_add_attrs = true;
      return NULL_TREE;
    }

  /* If there is an old attribute, we should try to update it in-place,
     so that there is only one (definitive) arm::new attribute on the decl.  */
  tree old_attr = lookup_attribute ("arm", "new", DECL_ATTRIBUTES (decl));
  if (!aarch64_merge_string_arguments (args, old_attr, true))
    *no_add_attrs = true;

  return NULL_TREE;
}

/* Callback for arm::{in,out,inout,preserves} attributes.  */
static tree
handle_arm_shared (tree *node, tree name, tree args,
		   int, bool *no_add_attrs)
{
  tree type = *node;
  tree old_attrs = TYPE_ATTRIBUTES (type);
  auto flags = aarch64_attribute_shared_state_flags (IDENTIFIER_POINTER (name));
  for (tree arg = args; arg; arg = TREE_CHAIN (arg))
    {
      tree value = TREE_VALUE (arg);
      if (aarch64_check_state_string (name, value))
	{
	  const char *state_name = TREE_STRING_POINTER (value);
	  auto old_flags = aarch64_lookup_shared_state_flags (old_attrs,
							      state_name);
	  if (old_flags && old_flags != flags)
	    {
	      error ("inconsistent attributes for state %qs", state_name);
	      *no_add_attrs = true;
	      return NULL_TREE;
	    }
	}
    }

  /* We can't update an old attribute in-place, since types are shared.
     Instead make sure that this new attribute contains all the
     information, so that the old attribute becomes redundant.  */
  tree old_attr = lookup_attribute ("arm", IDENTIFIER_POINTER (name),
				    old_attrs);
  if (!aarch64_merge_string_arguments (args, old_attr, false))
    *no_add_attrs = true;

  return NULL_TREE;
}

/* Mutually-exclusive function type attributes for controlling PSTATE.SM.  */
static const struct attribute_spec::exclusions attr_streaming_exclusions[] =
{
  /* Attribute name     exclusion applies to:
			function, type, variable */
  { "streaming", false, true, false },
  { "streaming_compatible", false, true, false },
  { NULL, false, false, false }
};

/* Table of machine attributes.  */
static const attribute_spec aarch64_gnu_attributes[] =
{
  /* { name, min_len, max_len, decl_req, type_req, fn_type_req,
       affects_type_identity, handler, exclude } */
  { "aarch64_vector_pcs", 0, 0, false, true, true, true,
    handle_aarch64_vector_pcs_attribute, NULL },
  { "indirect_return", 0, 0, false, true, true, true, NULL, NULL },
  { "arm_sve_vector_bits", 1, 1, false, true, false, true,
    aarch64_sve::handle_arm_sve_vector_bits_attribute,
    NULL },
  { "Advanced SIMD type", 1, 1, false, true, false, true, NULL, NULL },
  { "SVE type", 3, 3, false, true, false, true, NULL, NULL },
  { "SVE sizeless type", 0, 0, false, true, false, true, NULL, NULL },
#if TARGET_DLLIMPORT_DECL_ATTRIBUTES
  { "dllimport", 0, 0, false, false, false, false, handle_dll_attribute, NULL },
  { "dllexport", 0, 0, false, false, false, false, handle_dll_attribute, NULL },
#endif
#ifdef SUBTARGET_ATTRIBUTE_TABLE
  SUBTARGET_ATTRIBUTE_TABLE
#endif
};

static const scoped_attribute_specs aarch64_gnu_attribute_table =
{
  "gnu", { aarch64_gnu_attributes }
};

static const attribute_spec aarch64_arm_attributes[] =
{
  { "streaming", 0, 0, false, true, true, true,
    NULL, attr_streaming_exclusions },
  { "streaming_compatible", 0, 0, false, true, true, true,
    NULL, attr_streaming_exclusions },
  { "locally_streaming", 0, 0, true, false, false, false, NULL, NULL },
  { "new", 1, -1, true, false, false, false,
    handle_arm_new, NULL },
  { "preserves", 1, -1, false, true, true, true,
    handle_arm_shared, NULL },
  { "in", 1, -1, false, true, true, true,
    handle_arm_shared, NULL },
  { "out", 1, -1, false, true, true, true,
    handle_arm_shared, NULL },
  { "inout", 1, -1, false, true, true, true,
    handle_arm_shared, NULL }
};

static const scoped_attribute_specs aarch64_arm_attribute_table =
{
  "arm", { aarch64_arm_attributes }
};

static const scoped_attribute_specs *const aarch64_attribute_table[] =
{
  &aarch64_gnu_attribute_table,
  &aarch64_arm_attribute_table
};

typedef enum aarch64_cond_code
{
  AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
  AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
  AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV
}
aarch64_cc;

#define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
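
/* For example, AARCH64_INVERSE_CONDITION_CODE (AARCH64_EQ) is AARCH64_NE
   and AARCH64_INVERSE_CONDITION_CODE (AARCH64_GE) is AARCH64_LT: each
   even/odd pair in aarch64_cond_code is a condition and its inverse.  */
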
/* The condition codes of the processor, and the inverse function.  */
static const char * const aarch64_condition_codes[] =
{
  "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
  "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
};

/* The preferred condition codes for SVE conditions.  */
static const char *const aarch64_sve_condition_codes[] =
{
  "none", "any", "nlast", "last", "first", "nfrst", "vs", "vc",
  "pmore", "plast", "tcont", "tstop", "gt", "le", "al", "nv"
};

/* Return the assembly token for svpattern value PATTERN.  */

static const char *
svpattern_token (enum aarch64_svpattern pattern)
{
  switch (pattern)
    {
#define CASE(UPPER, LOWER, VALUE) case AARCH64_SV_##UPPER: return #LOWER;
    AARCH64_FOR_SVPATTERN (CASE)
#undef CASE
    case AARCH64_NUM_SVPATTERNS:
      break;
    }
  gcc_unreachable ();
}

/* Return the location of a piece that is known to be passed or returned
   in registers.  FIRST_ZR is the first unused vector argument register
   and FIRST_PR is the first unused predicate argument register.  */

rtx
pure_scalable_type_info::piece::get_rtx (unsigned int first_zr,
					 unsigned int first_pr) const
{
  gcc_assert (VECTOR_MODE_P (mode)
	      && first_zr + num_zr <= V0_REGNUM + NUM_FP_ARG_REGS
	      && first_pr + num_pr <= P0_REGNUM + NUM_PR_ARG_REGS);

  if (num_zr > 0 && num_pr == 0)
    return gen_rtx_REG (mode, first_zr);

  if (num_zr == 0 && num_pr > 0)
    return gen_rtx_REG (mode, first_pr);

  gcc_unreachable ();
}

/* Return the total number of vector registers required by the PST.  */

unsigned int
pure_scalable_type_info::num_zr () const
{
  unsigned int res = 0;
  for (unsigned int i = 0; i < pieces.length (); ++i)
    res += pieces[i].num_zr;
  return res;
}

/* Return the total number of predicate registers required by the PST.  */

unsigned int
pure_scalable_type_info::num_pr () const
{
  unsigned int res = 0;
  for (unsigned int i = 0; i < pieces.length (); ++i)
    res += pieces[i].num_pr;
  return res;
}

/* Return the location of a PST that is known to be passed or returned
   in registers.  FIRST_ZR is the first unused vector argument register
   and FIRST_PR is the first unused predicate argument register.  */

rtx
pure_scalable_type_info::get_rtx (machine_mode mode,
				  unsigned int first_zr,
				  unsigned int first_pr) const
{
  /* Try to return a single REG if possible.  This leads to better
     code generation; it isn't required for correctness.  */
  if (mode == pieces[0].mode)
    {
      gcc_assert (pieces.length () == 1);
      return pieces[0].get_rtx (first_zr, first_pr);
    }

  /* Build up a PARALLEL that contains the individual pieces.  */
  rtvec rtxes = rtvec_alloc (pieces.length ());
  for (unsigned int i = 0; i < pieces.length (); ++i)
    {
      rtx reg = pieces[i].get_rtx (first_zr, first_pr);
      rtx offset = gen_int_mode (pieces[i].offset, Pmode);
      RTVEC_ELT (rtxes, i) = gen_rtx_EXPR_LIST (VOIDmode, reg, offset);
      first_zr += pieces[i].num_zr;
      first_pr += pieces[i].num_pr;
    }
  return gen_rtx_PARALLEL (mode, rtxes);
}

/* Analyze whether TYPE is a Pure Scalable Type according to the rules
   in the AAPCS64.  */

pure_scalable_type_info::analysis_result
pure_scalable_type_info::analyze (const_tree type)
{
  /* Prevent accidental reuse.  */
  gcc_assert (pieces.is_empty ());

  /* No code will be generated for erroneous types, so we won't establish
     an ABI mapping.  */
  if (type == error_mark_node)
    return NO_ABI_IDENTITY;

  /* Zero-sized types disappear in the language->ABI mapping.  */
  if (TYPE_SIZE (type) && integer_zerop (TYPE_SIZE (type)))
    return NO_ABI_IDENTITY;

  /* Check for SVTs, SPTs, and built-in tuple types that map to PSTs.  */
  piece p = {};
  if (aarch64_sve::builtin_type_p (type, &p.num_zr, &p.num_pr))
    {
      machine_mode mode = TYPE_MODE_RAW (type);
      gcc_assert (VECTOR_MODE_P (mode)
		  && (!TARGET_SVE || aarch64_sve_mode_p (mode)));

      p.mode = p.orig_mode = mode;
      add_piece (p);
      return IS_PST;
    }

  /* Check for user-defined PSTs.  */
  if (TREE_CODE (type) == ARRAY_TYPE)
    return analyze_array (type);
  if (TREE_CODE (type) == RECORD_TYPE)
    return analyze_record (type);

  return ISNT_PST;
}
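
/* Illustrative example (assuming the <arm_sve.h> ACLE types): a structure
   such as

     struct pst { svfloat32_t v; svbool_t p; };

   would be analyzed as IS_PST with two pieces, one occupying a single
   vector register and one occupying a single predicate register.  */
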
/* Analyze a type that is known not to be passed or returned in memory.
   Return true if it has an ABI identity and is a Pure Scalable Type.  */

bool
pure_scalable_type_info::analyze_registers (const_tree type)
{
  analysis_result result = analyze (type);
  gcc_assert (result != DOESNT_MATTER);
  return result == IS_PST;
}

/* Subroutine of analyze for handling ARRAY_TYPEs.  */

pure_scalable_type_info::analysis_result
pure_scalable_type_info::analyze_array (const_tree type)
{
  /* Analyze the element type.  */
  pure_scalable_type_info element_info;
  analysis_result result = element_info.analyze (TREE_TYPE (type));
  if (result != IS_PST)
    return result;

  /* An array of unknown, flexible or variable length will be passed and
     returned by reference whatever we do.  */
  tree nelts_minus_one = array_type_nelts_minus_one (type);
  if (!tree_fits_uhwi_p (nelts_minus_one))
    return DOESNT_MATTER;

  /* Likewise if the array is constant-sized but too big to be interesting.
     The double checks against MAX_PIECES are to protect against overflow.  */
  unsigned HOST_WIDE_INT count = tree_to_uhwi (nelts_minus_one);
  if (count > MAX_PIECES)
    return DOESNT_MATTER;
  count += 1;
  if (count * element_info.pieces.length () > MAX_PIECES)
    return DOESNT_MATTER;

  /* The above checks should have weeded out elements of unknown size.  */
  poly_uint64 element_bytes;
  if (!poly_int_tree_p (TYPE_SIZE_UNIT (TREE_TYPE (type)), &element_bytes))
    gcc_unreachable ();

  /* Build up the list of individual vectors and predicates.  */
  gcc_assert (!element_info.pieces.is_empty ());
  for (unsigned int i = 0; i < count; ++i)
    for (unsigned int j = 0; j < element_info.pieces.length (); ++j)
      {
	piece p = element_info.pieces[j];
	p.offset += i * element_bytes;
	add_piece (p);
      }
  return IS_PST;
}

/* Subroutine of analyze for handling RECORD_TYPEs.  */

pure_scalable_type_info::analysis_result
pure_scalable_type_info::analyze_record (const_tree type)
{
  for (tree field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
    {
      if (TREE_CODE (field) != FIELD_DECL)
	continue;

      /* Zero-sized fields disappear in the language->ABI mapping.  */
      if (DECL_SIZE (field) && integer_zerop (DECL_SIZE (field)))
	continue;

      /* All fields with an ABI identity must be PSTs for the record as
	 a whole to be a PST.  If any individual field is too big to be
	 interesting then the record is too.  */
      pure_scalable_type_info field_info;
      analysis_result subresult = field_info.analyze (TREE_TYPE (field));
      if (subresult == NO_ABI_IDENTITY)
	continue;
      if (subresult != IS_PST)
	return subresult;

      /* Since all previous fields are PSTs, we ought to be able to track
	 the field offset using poly_ints.  */
      tree bitpos = bit_position (field);
      gcc_assert (poly_int_tree_p (bitpos));

      /* For the same reason, it shouldn't be possible to create a PST field
	 whose offset isn't byte-aligned.  */
      poly_widest_int wide_bytepos = exact_div (wi::to_poly_widest (bitpos),
						BITS_PER_UNIT);

      /* Punt if the record is too big to be interesting.  */
      poly_uint64 bytepos;
      if (!wide_bytepos.to_uhwi (&bytepos)
	  || pieces.length () + field_info.pieces.length () > MAX_PIECES)
	return DOESNT_MATTER;

      /* Add the individual vectors and predicates in the field to the
	 record's list.  */
      gcc_assert (!field_info.pieces.is_empty ());
      for (unsigned int i = 0; i < field_info.pieces.length (); ++i)
	{
	  piece p = field_info.pieces[i];
	  p.offset += bytepos;
	  add_piece (p);
	}
    }
  /* Empty structures disappear in the language->ABI mapping.  */
  return pieces.is_empty () ? NO_ABI_IDENTITY : IS_PST;
}

/* Add P to the list of pieces in the type.  */

void
pure_scalable_type_info::add_piece (const piece &p)
{
  /* Try to fold the new piece into the previous one to form a
     single-mode PST.  For example, if we see three consecutive vectors
     of the same mode, we can represent them using the corresponding
     3-tuple mode.

     This is purely an optimization.  */
  if (!pieces.is_empty ())
    {
      piece &prev = pieces.last ();
      gcc_assert (VECTOR_MODE_P (p.mode) && VECTOR_MODE_P (prev.mode));
      unsigned int nelems1, nelems2;
      if (prev.orig_mode == p.orig_mode
	  && GET_MODE_CLASS (p.orig_mode) != MODE_VECTOR_BOOL
	  && known_eq (prev.offset + GET_MODE_SIZE (prev.mode), p.offset)
	  && constant_multiple_p (GET_MODE_NUNITS (prev.mode),
				  GET_MODE_NUNITS (p.orig_mode), &nelems1)
	  && constant_multiple_p (GET_MODE_NUNITS (p.mode),
				  GET_MODE_NUNITS (p.orig_mode), &nelems2)
	  && targetm.array_mode (p.orig_mode,
				 nelems1 + nelems2).exists (&prev.mode))
	{
	  prev.num_zr += p.num_zr;
	  prev.num_pr += p.num_pr;
	  return;
	}
    }
  pieces.quick_push (p);
}

/* Return true if at least one possible value of type TYPE includes at
   least one object of Pure Scalable Type, in the sense of the AAPCS64.

   This is a relatively expensive test for some types, so it should
   generally be made as late as possible.  */

static bool
aarch64_some_values_include_pst_objects_p (const_tree type)
{
  if (TYPE_SIZE (type) && integer_zerop (TYPE_SIZE (type)))
    return false;

  if (aarch64_sve::builtin_type_p (type))
    return true;

  if (TREE_CODE (type) == ARRAY_TYPE || TREE_CODE (type) == COMPLEX_TYPE)
    return aarch64_some_values_include_pst_objects_p (TREE_TYPE (type));

  if (RECORD_OR_UNION_TYPE_P (type))
    for (tree field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
      if (TREE_CODE (field) == FIELD_DECL
	  && aarch64_some_values_include_pst_objects_p (TREE_TYPE (field)))
	return true;

  return false;
}

/* Return the descriptor of the SIMD ABI.  */

static const predefined_function_abi &
aarch64_simd_abi (void)
{
  predefined_function_abi &simd_abi = function_abis[ARM_PCS_SIMD];
  if (!simd_abi.initialized_p ())
    {
      HARD_REG_SET full_reg_clobbers
	= default_function_abi.full_reg_clobbers ();
      for (int regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
	if (FP_SIMD_SAVED_REGNUM_P (regno))
	  CLEAR_HARD_REG_BIT (full_reg_clobbers, regno);
      simd_abi.initialize (ARM_PCS_SIMD, full_reg_clobbers);
    }
  return simd_abi;
}

/* Return the descriptor of the SVE PCS.  */

static const predefined_function_abi &
aarch64_sve_abi (void)
{
  predefined_function_abi &sve_abi = function_abis[ARM_PCS_SVE];
  if (!sve_abi.initialized_p ())
    {
      HARD_REG_SET full_reg_clobbers
	= default_function_abi.full_reg_clobbers ();
      for (int regno = V8_REGNUM; regno <= V23_REGNUM; ++regno)
	CLEAR_HARD_REG_BIT (full_reg_clobbers, regno);
      for (int regno = P4_REGNUM; regno <= P15_REGNUM; ++regno)
	CLEAR_HARD_REG_BIT (full_reg_clobbers, regno);
      sve_abi.initialize (ARM_PCS_SVE, full_reg_clobbers);
    }
  return sve_abi;
}

/* If X is an UNSPEC_SALT_ADDR expression, return the address that it
   wraps, otherwise return X itself.  */

static rtx
strip_salt (rtx x)
{
  rtx search = x;
  if (GET_CODE (search) == CONST)
    search = XEXP (search, 0);
  if (GET_CODE (search) == UNSPEC && XINT (search, 1) == UNSPEC_SALT_ADDR)
    x = XVECEXP (search, 0, 0);
  return x;
}

/* Like strip_offset, but also strip any UNSPEC_SALT_ADDR from the
   expression.  */

static rtx
strip_offset_and_salt (rtx addr, poly_int64 *offset)
{
  return strip_salt (strip_offset (addr, offset));
}

/* Generate code to enable conditional branches in functions over 1 MiB.  */
const char *
aarch64_gen_far_branch (rtx * operands, int pos_label, const char * dest,
			const char * branch_format)
{
  rtx_code_label * tmp_label = gen_label_rtx ();
  char label_buf[256];
  char buffer[128];
  ASM_GENERATE_INTERNAL_LABEL (label_buf, dest,
			       CODE_LABEL_NUMBER (tmp_label));
  const char *label_ptr = targetm.strip_name_encoding (label_buf);
  rtx dest_label = operands[pos_label];
  operands[pos_label] = tmp_label;

  snprintf (buffer, sizeof (buffer), "%s%s", branch_format, label_ptr);
  output_asm_insn (buffer, operands);

  snprintf (buffer, sizeof (buffer), "b\t%%l%d\n%s:", pos_label, label_ptr);
  operands[pos_label] = dest_label;
  output_asm_insn (buffer, operands);
  return "";
}

void
aarch64_err_no_fpadvsimd (machine_mode mode)
{
  if (TARGET_GENERAL_REGS_ONLY)
    if (FLOAT_MODE_P (mode))
      error ("%qs is incompatible with the use of floating-point types",
	     "-mgeneral-regs-only");
    else
      error ("%qs is incompatible with the use of vector types",
	     "-mgeneral-regs-only");
  else
    if (FLOAT_MODE_P (mode))
      error ("%qs feature modifier is incompatible with the use of"
	     " floating-point types", "+nofp");
    else
      error ("%qs feature modifier is incompatible with the use of"
	     " vector types", "+nofp");
}

/* Report when we try to do something that requires SVE when SVE is disabled.
   This is an error of last resort and isn't very high-quality.  It usually
   involves attempts to measure the vector length in some way.  */
static void
aarch64_report_sve_required (void)
{
  static bool reported_p = false;

  /* Avoid reporting a slew of messages for a single oversight.  */
  if (reported_p)
    return;

  error ("this operation requires the SVE ISA extension");
  inform (input_location, "you can enable SVE using the command-line"
	  " option %<-march%>, or by using the %<target%>"
	  " attribute or pragma");
  reported_p = true;
}

/* Return true if REGNO is P0-P15 or one of the special FFR-related
   registers.  */
inline bool
pr_or_ffr_regnum_p (unsigned int regno)
{
  return PR_REGNUM_P (regno) || regno == FFR_REGNUM || regno == FFRT_REGNUM;
}

/* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS.
   The register allocator chooses POINTER_AND_FP_REGS if FP_REGS and
   GENERAL_REGS have the same cost - even if POINTER_AND_FP_REGS has a much
   higher cost.  POINTER_AND_FP_REGS is also used if the cost of both FP_REGS
   and GENERAL_REGS is lower than the memory cost (in this case the best class
   is the lowest cost one).  Using POINTER_AND_FP_REGS irrespectively of its
   cost results in bad allocations with many redundant int<->FP moves which
   are expensive on various cores.
   To avoid this we don't allow POINTER_AND_FP_REGS as the allocno class, but
   force a decision between FP_REGS and GENERAL_REGS.  We use the allocno class
   if it isn't POINTER_AND_FP_REGS.  Similarly, use the best class if it isn't
   POINTER_AND_FP_REGS.  Otherwise set the allocno class depending on the mode.
   The result of this is that it is no longer inefficient to have a higher
   memory move cost than the register move cost.
*/

static reg_class_t
aarch64_ira_change_pseudo_allocno_class (int regno, reg_class_t allocno_class,
					 reg_class_t best_class)
{
  machine_mode mode;

  if (!reg_class_subset_p (GENERAL_REGS, allocno_class)
      || !reg_class_subset_p (FP_REGS, allocno_class))
    return allocno_class;

  if (!reg_class_subset_p (GENERAL_REGS, best_class)
      || !reg_class_subset_p (FP_REGS, best_class))
    return best_class;

  mode = PSEUDO_REGNO_MODE (regno);
  return FLOAT_MODE_P (mode) || VECTOR_MODE_P (mode) ? FP_REGS : GENERAL_REGS;
}

static unsigned int
aarch64_min_divisions_for_recip_mul (machine_mode mode)
{
  if (GET_MODE_UNIT_SIZE (mode) == 4)
    return aarch64_tune_params.min_div_recip_mul_sf;
  return aarch64_tune_params.min_div_recip_mul_df;
}

/* Return the reassociation width of treeop OPC with mode MODE.  */
static int
aarch64_reassociation_width (unsigned opc, machine_mode mode)
{
  if (VECTOR_MODE_P (mode))
    return aarch64_tune_params.vec_reassoc_width;
  if (INTEGRAL_MODE_P (mode))
    return aarch64_tune_params.int_reassoc_width;
  /* Reassociation reduces the number of FMAs which may result in worse
     performance.  Use a per-CPU setting for FMA reassociation which allows
     narrow CPUs with few FP pipes to switch it off (value of 1), and wider
     CPUs with many FP pipes to enable reassociation.
     Since the reassociation pass doesn't understand FMA at all, assume
     that any FP addition might turn into FMA.  */
  if (FLOAT_MODE_P (mode))
    return opc == PLUS_EXPR ? aarch64_tune_params.fma_reassoc_width
			    : aarch64_tune_params.fp_reassoc_width;
  return 1;
}

/* Provide a mapping from gcc register numbers to dwarf register numbers.  */
unsigned
aarch64_debugger_regno (unsigned regno)
{
  if (GP_REGNUM_P (regno))
    return AARCH64_DWARF_R0 + regno - R0_REGNUM;
  else if (regno == SP_REGNUM)
    return AARCH64_DWARF_SP;
  else if (FP_REGNUM_P (regno))
    return AARCH64_DWARF_V0 + regno - V0_REGNUM;
  else if (PR_REGNUM_P (regno))
    return AARCH64_DWARF_P0 + regno - P0_REGNUM;
  else if (regno == VG_REGNUM)
    return AARCH64_DWARF_VG;

  /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
     equivalent DWARF register.  */
  return DWARF_FRAME_REGISTERS;
}
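
/* For example, with the standard AArch64 DWARF numbering assumed here,
   x0-x30 map to DWARF registers 0-30, sp to 31 and v0-v31 to 64-95.  */
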
/* Implement TARGET_DWARF_FRAME_REG_MODE.  */
static machine_mode
aarch64_dwarf_frame_reg_mode (int regno)
{
  /* Predicate registers are call-clobbered in the EH ABI (which is
     ARM_PCS_AAPCS64), so they should not be described by CFI.
     Their size changes as VL changes, so any values computed by
     __builtin_init_dwarf_reg_size_table might not be valid for
     all frames.  */
  if (PR_REGNUM_P (regno))
    return VOIDmode;
  return default_dwarf_frame_reg_mode (regno);
}

/* Implement TARGET_OUTPUT_CFI_DIRECTIVE.  */
static bool
aarch64_output_cfi_directive (FILE *f, dw_cfi_ref cfi)
{
  bool found = false;
  if (cfi->dw_cfi_opc == DW_CFA_AARCH64_negate_ra_state)
    {
      fprintf (f, "\t.cfi_negate_ra_state\n");
      found = true;
    }
  return found;
}

/* Implement TARGET_DW_CFI_OPRND1_DESC.  */
static bool
aarch64_dw_cfi_oprnd1_desc (dwarf_call_frame_info cfi_opc,
			    dw_cfi_oprnd_type &oprnd_type)
{
  if (cfi_opc == DW_CFA_AARCH64_negate_ra_state)
    {
      oprnd_type = dw_cfi_oprnd_unused;
      return true;
    }
  return false;
}

/* If X is a CONST_DOUBLE, return its bit representation as a constant
   integer, otherwise return X unmodified.  */
static rtx
aarch64_bit_representation (rtx x)
{
  if (CONST_DOUBLE_P (x))
    x = gen_lowpart (int_mode_for_mode (GET_MODE (x)).require (), x);
  return x;
}
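
/* For example, the SFmode constant 1.0 would be rewritten as the SImode
   integer 0x3f800000, its IEEE-754 bit pattern.  */
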
/* Return an estimate for the number of quadwords in an SVE vector.  This is
   equivalent to the number of Advanced SIMD vectors in an SVE vector.  */
static unsigned int
aarch64_estimated_sve_vq ()
{
  return estimated_poly_value (BITS_PER_SVE_VECTOR) / 128;
}

/* Return true if MODE is an SVE predicate mode.  */
static bool
aarch64_sve_pred_mode_p (machine_mode mode)
{
  return (TARGET_SVE
	  && (mode == VNx16BImode
	      || mode == VNx8BImode
	      || mode == VNx4BImode
	      || mode == VNx2BImode));
}

/* Three mutually-exclusive flags describing a vector or predicate type.  */
const unsigned int VEC_ADVSIMD = 1;
const unsigned int VEC_SVE_DATA = 2;
const unsigned int VEC_SVE_PRED = 4;
/* Indicates a structure of 2, 3 or 4 vectors or predicates.  */
const unsigned int VEC_STRUCT = 8;
/* Can be used in combination with VEC_SVE_DATA to indicate that the
   vector has fewer significant bytes than a full SVE vector.  */
const unsigned int VEC_PARTIAL = 16;
/* Useful combinations of the above.  */
const unsigned int VEC_ANY_SVE = VEC_SVE_DATA | VEC_SVE_PRED;
const unsigned int VEC_ANY_DATA = VEC_ADVSIMD | VEC_SVE_DATA;

/* Return a set of flags describing the vector properties of mode MODE.
   If ANY_TARGET_P is false (the default), ignore modes that are not supported
   by the current target.  Otherwise categorize the modes that can be used
   with the set of all targets supported by the port.  */

static unsigned int
aarch64_classify_vector_mode (machine_mode mode, bool any_target_p = false)
{
  if (aarch64_sve_pred_mode_p (mode))
    return VEC_SVE_PRED;

  /* Make the decision based on the mode's enum value rather than its
     properties, so that we keep the correct classification regardless
     of -msve-vector-bits.  */
  switch (mode)
    {
    /* Partial SVE QI vectors.  */
    case E_VNx2QImode:
    case E_VNx4QImode:
    case E_VNx8QImode:
    /* Partial SVE HI vectors.  */
    case E_VNx2HImode:
    case E_VNx4HImode:
    /* Partial SVE SI vector.  */
    case E_VNx2SImode:
    /* Partial SVE HF vectors.  */
    case E_VNx2HFmode:
    case E_VNx4HFmode:
    /* Partial SVE BF vectors.  */
    case E_VNx2BFmode:
    case E_VNx4BFmode:
    /* Partial SVE SF vector.  */
    case E_VNx2SFmode:
      return (TARGET_SVE || any_target_p) ? VEC_SVE_DATA | VEC_PARTIAL : 0;

    case E_VNx16QImode:
    case E_VNx8HImode:
    case E_VNx4SImode:
    case E_VNx2DImode:
    case E_VNx8BFmode:
    case E_VNx8HFmode:
    case E_VNx4SFmode:
    case E_VNx2DFmode:
      return (TARGET_SVE || any_target_p) ? VEC_SVE_DATA : 0;

    /* x2 SVE vectors.  */
    case E_VNx32QImode:
    case E_VNx16HImode:
    case E_VNx8SImode:
    case E_VNx4DImode:
    case E_VNx16BFmode:
    case E_VNx16HFmode:
    case E_VNx8SFmode:
    case E_VNx4DFmode:
    /* x3 SVE vectors.  */
    case E_VNx48QImode:
    case E_VNx24HImode:
    case E_VNx12SImode:
    case E_VNx6DImode:
    case E_VNx24BFmode:
    case E_VNx24HFmode:
    case E_VNx12SFmode:
    case E_VNx6DFmode:
    /* x4 SVE vectors.  */
    case E_VNx64QImode:
    case E_VNx32HImode:
    case E_VNx16SImode:
    case E_VNx8DImode:
    case E_VNx32BFmode:
    case E_VNx32HFmode:
    case E_VNx16SFmode:
    case E_VNx8DFmode:
      return (TARGET_SVE || any_target_p) ? VEC_SVE_DATA | VEC_STRUCT : 0;

    case E_OImode:
    case E_CImode:
    case E_XImode:
      return (TARGET_FLOAT || any_target_p) ? VEC_ADVSIMD | VEC_STRUCT : 0;

    /* Structures of 64-bit Advanced SIMD vectors.  */
    case E_V2x8QImode:
    case E_V2x4HImode:
    case E_V2x2SImode:
    case E_V2x1DImode:
    case E_V2x4BFmode:
    case E_V2x4HFmode:
    case E_V2x2SFmode:
    case E_V2x1DFmode:
    case E_V3x8QImode:
    case E_V3x4HImode:
    case E_V3x2SImode:
    case E_V3x1DImode:
    case E_V3x4BFmode:
    case E_V3x4HFmode:
    case E_V3x2SFmode:
    case E_V3x1DFmode:
    case E_V4x8QImode:
    case E_V4x4HImode:
    case E_V4x2SImode:
    case E_V4x1DImode:
    case E_V4x4BFmode:
    case E_V4x4HFmode:
    case E_V4x2SFmode:
    case E_V4x1DFmode:
      return (TARGET_FLOAT || any_target_p)
	     ? VEC_ADVSIMD | VEC_STRUCT | VEC_PARTIAL : 0;

    /* Structures of 128-bit Advanced SIMD vectors.  */
    case E_V2x16QImode:
    case E_V2x8HImode:
    case E_V2x4SImode:
    case E_V2x2DImode:
    case E_V2x8BFmode:
    case E_V2x8HFmode:
    case E_V2x4SFmode:
    case E_V2x2DFmode:
    case E_V3x16QImode:
    case E_V3x8HImode:
    case E_V3x4SImode:
    case E_V3x2DImode:
    case E_V3x8BFmode:
    case E_V3x8HFmode:
    case E_V3x4SFmode:
    case E_V3x2DFmode:
    case E_V4x16QImode:
    case E_V4x8HImode:
    case E_V4x4SImode:
    case E_V4x2DImode:
    case E_V4x8BFmode:
    case E_V4x8HFmode:
    case E_V4x4SFmode:
    case E_V4x2DFmode:
      return (TARGET_FLOAT || any_target_p) ? VEC_ADVSIMD | VEC_STRUCT : 0;

    /* 64-bit Advanced SIMD vectors.  */
    case E_V8QImode:
    case E_V4HImode:
    case E_V2SImode:
    case E_V1DImode:
    case E_V4HFmode:
    case E_V4BFmode:
    case E_V2SFmode:
    case E_V1DFmode:
    /* 128-bit Advanced SIMD vectors.  */
    case E_V16QImode:
    case E_V8HImode:
    case E_V4SImode:
    case E_V2DImode:
    case E_V8HFmode:
    case E_V8BFmode:
    case E_V4SFmode:
    case E_V2DFmode:
      return (TARGET_FLOAT || any_target_p) ? VEC_ADVSIMD : 0;

    case E_VNx32BImode:
    case E_VNx64BImode:
      return TARGET_SVE ? VEC_SVE_PRED | VEC_STRUCT : 0;

    default:
      return 0;
    }
}
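
/* For example, V4SImode is classified as VEC_ADVSIMD, VNx4SImode as
   VEC_SVE_DATA, VNx2SImode as VEC_SVE_DATA | VEC_PARTIAL, and VNx32QImode
   as VEC_SVE_DATA | VEC_STRUCT, assuming the relevant target features
   are enabled.  */
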
1695 /* Like aarch64_classify_vector_mode, but also include modes that are used
1696 for memory operands but not register operands. Such modes do not count
1697 as real vector modes; they are just an internal construct to make things
1698 easier to describe. */
1699 static unsigned int
1700 aarch64_classify_vector_memory_mode (machine_mode mode)
1702 switch (mode)
1704 case VNx1SImode:
1705 case VNx1DImode:
1706 return TARGET_SVE ? VEC_SVE_DATA | VEC_PARTIAL : 0;
1708 case VNx1TImode:
1709 return TARGET_SVE ? VEC_SVE_DATA : 0;
1711 case VNx2TImode:
1712 case VNx3TImode:
1713 case VNx4TImode:
1714 return TARGET_SVE ? VEC_SVE_DATA | VEC_STRUCT : 0;
1716 default:
1717 return aarch64_classify_vector_mode (mode);
1721 /* Return true if MODE is any of the Advanced SIMD structure modes. */
1722 bool
1723 aarch64_advsimd_struct_mode_p (machine_mode mode)
1725 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
1726 return (vec_flags & VEC_ADVSIMD) && (vec_flags & VEC_STRUCT);
1729 /* Return true if MODE is an Advanced SIMD D-register structure mode. */
1730 static bool
1731 aarch64_advsimd_partial_struct_mode_p (machine_mode mode)
1733 return (aarch64_classify_vector_mode (mode)
1734 == (VEC_ADVSIMD | VEC_STRUCT | VEC_PARTIAL));
1737 /* Return true if MODE is an Advanced SIMD Q-register structure mode. */
1738 static bool
1739 aarch64_advsimd_full_struct_mode_p (machine_mode mode)
1741 return (aarch64_classify_vector_mode (mode) == (VEC_ADVSIMD | VEC_STRUCT));
1744 /* Return true if MODE is any of the data vector modes, including
1745 structure modes. */
1746 static bool
1747 aarch64_vector_data_mode_p (machine_mode mode)
1749 return aarch64_classify_vector_mode (mode) & VEC_ANY_DATA;
1752 /* Return true if MODE is any form of SVE mode, including predicates,
1753 vectors and structures. */
1754 bool
1755 aarch64_sve_mode_p (machine_mode mode)
1757 return aarch64_classify_vector_mode (mode) & VEC_ANY_SVE;
1760 /* Return true if MODE is an SVE data vector mode; either a single vector
1761 or a structure of vectors. */
1762 static bool
1763 aarch64_sve_data_mode_p (machine_mode mode)
1765 return aarch64_classify_vector_mode (mode) & VEC_SVE_DATA;
1768 /* Return the number of defined bytes in one constituent vector of
1769 SVE mode MODE, which has vector flags VEC_FLAGS. */
1770 static poly_int64
1771 aarch64_vl_bytes (machine_mode mode, unsigned int vec_flags)
1773 if (vec_flags & VEC_PARTIAL)
1774 /* A single partial vector. */
1775 return GET_MODE_SIZE (mode);
1777 if (vec_flags & VEC_SVE_DATA)
1778 /* A single vector or a tuple. */
1779 return BYTES_PER_SVE_VECTOR;
1781 /* A single predicate. */
1782 gcc_assert (vec_flags & VEC_SVE_PRED);
1783 return BYTES_PER_SVE_PRED;
1786 /* If MODE holds an array of vectors, return the number of vectors
1787 in the array, otherwise return 1. */
1789 static unsigned int
1790 aarch64_ldn_stn_vectors (machine_mode mode)
1792 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
1793 if (vec_flags == (VEC_ADVSIMD | VEC_PARTIAL | VEC_STRUCT))
1794 return exact_div (GET_MODE_SIZE (mode), 8).to_constant ();
1795 if (vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
1796 return exact_div (GET_MODE_SIZE (mode), 16).to_constant ();
1797 if (vec_flags == (VEC_SVE_DATA | VEC_STRUCT))
1798 return exact_div (GET_MODE_SIZE (mode),
1799 BYTES_PER_SVE_VECTOR).to_constant ();
1800 return 1;
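/* Editorial aside: a minimal standalone sketch (not part of GCC) of the
   arithmetic above.  The count is simply the tuple size divided by the
   per-vector size: 8 bytes for D-register (partial) structures, 16 bytes
   for Q-register structures.  The example_* helper below is hypothetical.  */

constexpr unsigned int
example_ldn_stn_vectors (unsigned int tuple_bytes, bool q_registers)
{
  return tuple_bytes / (q_registers ? 16 : 8);
}

/* E.g. a V3x16QI-style tuple occupies 48 bytes of Q registers (3 vectors)
   and a V2x8QI-style tuple occupies 16 bytes of D registers (2 vectors).  */
static_assert (example_ldn_stn_vectors (48, true) == 3, "three Q vectors");
static_assert (example_ldn_stn_vectors (16, false) == 2, "two D vectors");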
1803 /* Given an Advanced SIMD vector mode MODE and a tuple size NELEMS, return the
1804 corresponding vector structure mode. */
1805 opt_machine_mode
1806 aarch64_advsimd_vector_array_mode (machine_mode mode,
1807 unsigned HOST_WIDE_INT nelems)
1809 unsigned int flags = VEC_ADVSIMD | VEC_STRUCT;
1810 if (known_eq (GET_MODE_SIZE (mode), 8))
1811 flags |= VEC_PARTIAL;
1813 machine_mode struct_mode;
1814 FOR_EACH_MODE_IN_CLASS (struct_mode, GET_MODE_CLASS (mode))
1815 if (aarch64_classify_vector_mode (struct_mode) == flags
1816 && GET_MODE_INNER (struct_mode) == GET_MODE_INNER (mode)
1817 && known_eq (GET_MODE_NUNITS (struct_mode),
1818 GET_MODE_NUNITS (mode) * nelems))
1819 return struct_mode;
1820 return opt_machine_mode ();
1823 /* Return the SVE vector mode that has NUNITS elements of mode INNER_MODE. */
1825 opt_machine_mode
1826 aarch64_sve_data_mode (scalar_mode inner_mode, poly_uint64 nunits)
1828 enum mode_class mclass = (is_a <scalar_float_mode> (inner_mode)
1829 ? MODE_VECTOR_FLOAT : MODE_VECTOR_INT);
1830 machine_mode mode;
1831 FOR_EACH_MODE_IN_CLASS (mode, mclass)
1832 if (inner_mode == GET_MODE_INNER (mode)
1833 && known_eq (nunits, GET_MODE_NUNITS (mode))
1834 && aarch64_sve_data_mode_p (mode))
1835 return mode;
1836 return opt_machine_mode ();
1839 /* Implement target hook TARGET_ARRAY_MODE. */
1840 static opt_machine_mode
1841 aarch64_array_mode (machine_mode mode, unsigned HOST_WIDE_INT nelems)
1843 if (TARGET_SVE && GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL)
1845 /* Use VNx32BI and VNx64BI for tuples of predicates, but explicitly
1846 reject giving a mode to other array sizes. Using integer modes
1847 requires a round trip through memory and generates terrible code. */
1848 if (nelems == 1)
1849 return mode;
1850 if (mode == VNx16BImode && nelems == 2)
1851 return VNx32BImode;
1852 if (mode == VNx16BImode && nelems == 4)
1853 return VNx64BImode;
1854 return BLKmode;
1857 auto flags = aarch64_classify_vector_mode (mode);
1858 if (flags == VEC_SVE_DATA && IN_RANGE (nelems, 2, 4))
1859 return aarch64_sve_data_mode (GET_MODE_INNER (mode),
1860 GET_MODE_NUNITS (mode) * nelems);
1862 if (flags == VEC_ADVSIMD && IN_RANGE (nelems, 2, 4))
1863 return aarch64_advsimd_vector_array_mode (mode, nelems);
1865 return opt_machine_mode ();
1868 /* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P. */
1869 static bool
1870 aarch64_array_mode_supported_p (machine_mode mode,
1871 unsigned HOST_WIDE_INT nelems)
1873 if (TARGET_BASE_SIMD
1874 && (AARCH64_VALID_SIMD_QREG_MODE (mode)
1875 || AARCH64_VALID_SIMD_DREG_MODE (mode))
1876 && (nelems >= 2 && nelems <= 4))
1877 return true;
1879 return false;
1882 /* MODE is some form of SVE vector mode. For data modes, return the number
1883 of vector register bits that each element of MODE occupies, such as 64
1884 for both VNx2DImode and VNx2SImode (where each 32-bit value is stored
1885 in a 64-bit container). For predicate modes, return the number of
1886 data bits controlled by each significant predicate bit. */
1888 static unsigned int
1889 aarch64_sve_container_bits (machine_mode mode)
1891 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
1892 poly_uint64 vector_bits = (vec_flags & (VEC_PARTIAL | VEC_SVE_PRED)
1893 ? BITS_PER_SVE_VECTOR
1894 : GET_MODE_BITSIZE (mode));
1895 return vector_element_size (vector_bits, GET_MODE_NUNITS (mode));
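/* Editorial aside: a minimal standalone sketch (not part of GCC) of the
   container-size calculation above, assuming a hypothetical fixed 128-bit
   vector length instead of the poly_int quantities GCC really uses.  */

constexpr unsigned int
example_sve_container_bits (unsigned int vector_bits, unsigned int nunits)
{
  return vector_bits / nunits;
}

/* VNx2DImode and VNx2SImode both hold 2 elements per 128 bits, so each
   element occupies a 64-bit container; VNx4SImode holds 4, so each element
   occupies a 32-bit container.  */
static_assert (example_sve_container_bits (128, 2) == 64, "64-bit containers");
static_assert (example_sve_container_bits (128, 4) == 32, "32-bit containers");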
1898 /* Return the SVE predicate mode to use for elements that have
1899 ELEM_NBYTES bytes, if such a mode exists. */
1901 opt_machine_mode
1902 aarch64_sve_pred_mode (unsigned int elem_nbytes)
1904 if (TARGET_SVE)
1906 if (elem_nbytes == 1)
1907 return VNx16BImode;
1908 if (elem_nbytes == 2)
1909 return VNx8BImode;
1910 if (elem_nbytes == 4)
1911 return VNx4BImode;
1912 if (elem_nbytes == 8)
1913 return VNx2BImode;
1915 return opt_machine_mode ();
1918 /* Return the SVE predicate mode that should be used to control
1919 SVE mode MODE. */
1921 machine_mode
1922 aarch64_sve_pred_mode (machine_mode mode)
1924 unsigned int bits = aarch64_sve_container_bits (mode);
1925 return aarch64_sve_pred_mode (bits / BITS_PER_UNIT).require ();
1928 /* Implement TARGET_VECTORIZE_GET_MASK_MODE. */
1930 static opt_machine_mode
1931 aarch64_get_mask_mode (machine_mode mode)
1933 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
1934 if (vec_flags & VEC_SVE_DATA)
1935 return aarch64_sve_pred_mode (mode);
1937 return default_get_mask_mode (mode);
1940 /* Return the integer element mode associated with SVE mode MODE. */
1942 static scalar_int_mode
1943 aarch64_sve_element_int_mode (machine_mode mode)
1945 poly_uint64 vector_bits = (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL
1946 ? BITS_PER_SVE_VECTOR
1947 : GET_MODE_BITSIZE (mode));
1948 unsigned int elt_bits = vector_element_size (vector_bits,
1949 GET_MODE_NUNITS (mode));
1950 return int_mode_for_size (elt_bits, 0).require ();
1953 /* Return an integer element mode that contains exactly
1954 aarch64_sve_container_bits (MODE) bits. This is wider than
1955 aarch64_sve_element_int_mode if MODE is a partial vector,
1956 otherwise it's the same. */
1958 static scalar_int_mode
1959 aarch64_sve_container_int_mode (machine_mode mode)
1961 return int_mode_for_size (aarch64_sve_container_bits (mode), 0).require ();
1964 /* Return the integer vector mode associated with SVE mode MODE.
1965 Unlike related_int_vector_mode, this can handle the case in which
1966 MODE is a predicate (and thus has a different total size). */
1968 machine_mode
1969 aarch64_sve_int_mode (machine_mode mode)
1971 scalar_int_mode int_mode = aarch64_sve_element_int_mode (mode);
1972 return aarch64_sve_data_mode (int_mode, GET_MODE_NUNITS (mode)).require ();
1975 /* Look for a vector mode with the same classification as VEC_MODE,
1976 but with each group of FACTOR elements coalesced into a single element.
1977 In other words, look for a mode in which the elements are FACTOR times
1978 larger and in which the number of elements is FACTOR times smaller.
1980 Return the mode found, if one exists. */
1982 static opt_machine_mode
1983 aarch64_coalesce_units (machine_mode vec_mode, unsigned int factor)
1985 auto elt_bits = vector_element_size (GET_MODE_BITSIZE (vec_mode),
1986 GET_MODE_NUNITS (vec_mode));
1987 auto vec_flags = aarch64_classify_vector_mode (vec_mode);
1988 if (vec_flags & VEC_SVE_PRED)
1990 if (known_eq (GET_MODE_SIZE (vec_mode), BYTES_PER_SVE_PRED))
1991 return aarch64_sve_pred_mode (elt_bits * factor);
1992 return {};
1995 scalar_mode new_elt_mode;
1996 if (!int_mode_for_size (elt_bits * factor, false).exists (&new_elt_mode))
1997 return {};
1999 if (vec_flags == VEC_ADVSIMD)
2001 auto mode = aarch64_simd_container_mode (new_elt_mode,
2002 GET_MODE_BITSIZE (vec_mode));
2003 if (mode != word_mode)
2004 return mode;
2006 else if (vec_flags & VEC_SVE_DATA)
2008 poly_uint64 new_nunits;
2009 if (multiple_p (GET_MODE_NUNITS (vec_mode), factor, &new_nunits))
2010 return aarch64_sve_data_mode (new_elt_mode, new_nunits);
2012 return {};
2015 /* Implement TARGET_VECTORIZE_RELATED_MODE. */
2017 static opt_machine_mode
2018 aarch64_vectorize_related_mode (machine_mode vector_mode,
2019 scalar_mode element_mode,
2020 poly_uint64 nunits)
2022 unsigned int vec_flags = aarch64_classify_vector_mode (vector_mode);
2024 /* If we're operating on SVE vectors, try to return an SVE mode. */
2025 poly_uint64 sve_nunits;
2026 if ((vec_flags & VEC_SVE_DATA)
2027 && multiple_p (BYTES_PER_SVE_VECTOR,
2028 GET_MODE_SIZE (element_mode), &sve_nunits))
2030 machine_mode sve_mode;
2031 if (maybe_ne (nunits, 0U))
2033 /* Try to find a full or partial SVE mode with exactly
2034 NUNITS units. */
2035 if (multiple_p (sve_nunits, nunits)
2036 && aarch64_sve_data_mode (element_mode,
2037 nunits).exists (&sve_mode))
2038 return sve_mode;
2040 else
2042 /* Take the preferred number of units from the number of bytes
2043 that fit in VECTOR_MODE. We always start by "autodetecting"
2044 a full vector mode with preferred_simd_mode, so vectors
2045 chosen here will also be full vector modes. Then
2046 autovectorize_vector_modes tries smaller starting modes
2047 and thus smaller preferred numbers of units. */
2048 sve_nunits = ordered_min (sve_nunits, GET_MODE_SIZE (vector_mode));
2049 if (aarch64_sve_data_mode (element_mode,
2050 sve_nunits).exists (&sve_mode))
2051 return sve_mode;
2055 /* Prefer to use 1 128-bit vector instead of 2 64-bit vectors. */
2056 if (TARGET_SIMD
2057 && (vec_flags & VEC_ADVSIMD)
2058 && known_eq (nunits, 0U)
2059 && known_eq (GET_MODE_BITSIZE (vector_mode), 64U)
2060 && maybe_ge (GET_MODE_BITSIZE (element_mode)
2061 * GET_MODE_NUNITS (vector_mode), 128U))
2063 machine_mode res = aarch64_simd_container_mode (element_mode, 128);
2064 if (VECTOR_MODE_P (res))
2065 return res;
2068 return default_vectorize_related_mode (vector_mode, element_mode, nunits);
2071 /* Implement TARGET_VECTORIZE_PREFERRED_DIV_AS_SHIFTS_OVER_MULT. */
2073 static bool
2074 aarch64_vectorize_preferred_div_as_shifts_over_mult (const_tree type)
2076 machine_mode mode = TYPE_MODE (type);
2077 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
2078 bool sve_p = (vec_flags & VEC_ANY_SVE);
2079 bool simd_p = (vec_flags & VEC_ADVSIMD);
2081 return (sve_p && TARGET_SVE2) || (simd_p && TARGET_SIMD);
2084 /* Implement TARGET_PREFERRED_ELSE_VALUE. For binary operations,
2085 prefer to use the first arithmetic operand as the else value if
2086 the else value doesn't matter, since that exactly matches the SVE
2087 destructive merging form. For ternary operations we could either
2088 pick the first operand and use FMAD-like instructions or the last
2089 operand and use FMLA-like instructions; the latter seems more
2090 natural. */
2092 static tree
2093 aarch64_preferred_else_value (unsigned, tree, unsigned int nops, tree *ops)
2095 return nops == 3 ? ops[2] : ops[0];
2098 /* Implement TARGET_HARD_REGNO_NREGS. */
2100 static unsigned int
2101 aarch64_hard_regno_nregs (unsigned regno, machine_mode mode)
2103 /* ??? Logically we should only need to provide a value when
2104 HARD_REGNO_MODE_OK says that the combination is valid,
2105 but at the moment we need to handle all modes. Just ignore
2106 any runtime parts for registers that can't store them. */
2107 HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
2108 switch (aarch64_regno_regclass (regno))
2110 case FP_REGS:
2111 case FP_LO_REGS:
2112 case FP_LO8_REGS:
2114 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
2115 if (vec_flags & VEC_SVE_DATA)
2116 return exact_div (GET_MODE_SIZE (mode),
2117 aarch64_vl_bytes (mode, vec_flags)).to_constant ();
2118 if (vec_flags == (VEC_ADVSIMD | VEC_STRUCT | VEC_PARTIAL))
2119 return GET_MODE_SIZE (mode).to_constant () / 8;
2120 return CEIL (lowest_size, UNITS_PER_VREG);
2123 case PR_REGS:
2124 case PR_LO_REGS:
2125 case PR_HI_REGS:
2126 return mode == VNx64BImode ? 4 : mode == VNx32BImode ? 2 : 1;
2128 case MOVEABLE_SYSREGS:
2129 case FFR_REGS:
2130 case PR_AND_FFR_REGS:
2131 case FAKE_REGS:
2132 return 1;
2134 default:
2135 return CEIL (lowest_size, UNITS_PER_WORD);
2137 gcc_unreachable ();
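/* Editorial aside: a minimal standalone sketch (not part of GCC) of the
   FP-register case above for non-SVE modes, assuming the usual 16-byte
   Advanced SIMD register size.  D-register tuples count one register per
   8 bytes; everything else rounds up to whole 16-byte registers.  The
   example_* helper below is hypothetical.  */

constexpr unsigned int
example_fp_regno_nregs (unsigned int mode_bytes, bool d_reg_tuple)
{
  return d_reg_tuple ? mode_bytes / 8
                     : (mode_bytes + 15) / 16;  /* CEIL (size, 16).  */
}

static_assert (example_fp_regno_nregs (48, false) == 3, "V3x16QI: 3 registers");
static_assert (example_fp_regno_nregs (24, true) == 3, "V3x8QI: 3 registers");
static_assert (example_fp_regno_nregs (8, false) == 1, "V8QI: 1 register");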
2140 /* Implement TARGET_HARD_REGNO_MODE_OK. */
2142 static bool
2143 aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode)
2145 if (mode == V8DImode)
2146 return IN_RANGE (regno, R0_REGNUM, R23_REGNUM)
2147 && multiple_p (regno - R0_REGNUM, 2);
2149 if (GET_MODE_CLASS (mode) == MODE_CC)
2150 return regno == CC_REGNUM;
2152 if (regno == VG_REGNUM)
2153 /* This must have the same size as _Unwind_Word. */
2154 return mode == DImode;
2156 if (regno == FPM_REGNUM)
2157 return mode == QImode || mode == HImode || mode == SImode || mode == DImode;
2159 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
2160 if (vec_flags == VEC_SVE_PRED)
2161 return pr_or_ffr_regnum_p (regno);
2163 if (vec_flags == (VEC_SVE_PRED | VEC_STRUCT))
2164 return PR_REGNUM_P (regno);
2166 if (pr_or_ffr_regnum_p (regno))
2167 return false;
2169 /* These registers are abstract; their modes don't matter. */
2170 if (FAKE_REGNUM_P (regno))
2171 return true;
2173 if (regno == SP_REGNUM)
2174 /* The purpose of comparing with ptr_mode is to support the
2175 global register variable associated with the stack pointer
2176 register via the syntax of asm ("wsp") in ILP32. */
2177 return mode == Pmode || mode == ptr_mode;
2179 if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM)
2180 return mode == Pmode;
2182 if (GP_REGNUM_P (regno))
2184 if (vec_flags & (VEC_ANY_SVE | VEC_STRUCT))
2185 return false;
2186 if (known_le (GET_MODE_SIZE (mode), 8))
2187 return true;
2188 if (known_le (GET_MODE_SIZE (mode), 16))
2189 return (regno & 1) == 0;
2191 else if (FP_REGNUM_P (regno))
2193 if (vec_flags & VEC_STRUCT)
2194 return end_hard_regno (mode, regno) - 1 <= V31_REGNUM;
2195 else
2196 return !VECTOR_MODE_P (mode) || vec_flags != 0;
2199 return false;
2202 /* Return true if a function with type FNTYPE returns its value in
2203 SVE vector or predicate registers. */
2205 static bool
2206 aarch64_returns_value_in_sve_regs_p (const_tree fntype)
2208 tree return_type = TREE_TYPE (fntype);
2210 pure_scalable_type_info pst_info;
2211 switch (pst_info.analyze (return_type))
2213 case pure_scalable_type_info::IS_PST:
2214 return (pst_info.num_zr () <= NUM_FP_ARG_REGS
2215 && pst_info.num_pr () <= NUM_PR_ARG_REGS);
2217 case pure_scalable_type_info::DOESNT_MATTER:
2218 gcc_assert (aarch64_return_in_memory_1 (return_type));
2219 return false;
2221 case pure_scalable_type_info::NO_ABI_IDENTITY:
2222 case pure_scalable_type_info::ISNT_PST:
2223 return false;
2225 gcc_unreachable ();
2228 /* Return true if a function with type FNTYPE takes arguments in
2229 SVE vector or predicate registers. */
2231 static bool
2232 aarch64_takes_arguments_in_sve_regs_p (const_tree fntype)
2234 CUMULATIVE_ARGS args_so_far_v;
2235 aarch64_init_cumulative_args (&args_so_far_v, NULL_TREE, NULL_RTX,
2236 NULL_TREE, 0, true);
2237 cumulative_args_t args_so_far = pack_cumulative_args (&args_so_far_v);
2239 for (tree chain = TYPE_ARG_TYPES (fntype);
2240 chain && chain != void_list_node;
2241 chain = TREE_CHAIN (chain))
2243 tree arg_type = TREE_VALUE (chain);
2244 if (arg_type == error_mark_node)
2245 return false;
2247 function_arg_info arg (arg_type, /*named=*/true);
2248 apply_pass_by_reference_rules (&args_so_far_v, arg);
2249 pure_scalable_type_info pst_info;
2250 if (pst_info.analyze_registers (arg.type))
2252 unsigned int end_zr = args_so_far_v.aapcs_nvrn + pst_info.num_zr ();
2253 unsigned int end_pr = args_so_far_v.aapcs_nprn + pst_info.num_pr ();
2254 gcc_assert (end_zr <= NUM_FP_ARG_REGS && end_pr <= NUM_PR_ARG_REGS);
2255 return true;
2258 targetm.calls.function_arg_advance (args_so_far, arg);
2260 return false;
2263 /* Implement TARGET_FNTYPE_ABI. */
2265 static const predefined_function_abi &
2266 aarch64_fntype_abi (const_tree fntype)
2268 if (lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (fntype)))
2269 return aarch64_simd_abi ();
2271 if (aarch64_returns_value_in_sve_regs_p (fntype)
2272 || aarch64_takes_arguments_in_sve_regs_p (fntype))
2273 return aarch64_sve_abi ();
2275 return default_function_abi;
2278 /* Return the state of PSTATE.SM on entry to functions of type FNTYPE. */
2280 static aarch64_isa_mode
2281 aarch64_fntype_pstate_sm (const_tree fntype)
2283 if (lookup_attribute ("arm", "streaming", TYPE_ATTRIBUTES (fntype)))
2284 return AARCH64_ISA_MODE_SM_ON;
2286 if (lookup_attribute ("arm", "streaming_compatible",
2287 TYPE_ATTRIBUTES (fntype)))
2288 return 0;
2290 return AARCH64_ISA_MODE_SM_OFF;
2293 /* Return state flags that describe whether and how functions of type
2294 FNTYPE share state STATE_NAME with their callers. */
2296 static unsigned int
2297 aarch64_fntype_shared_flags (const_tree fntype, const char *state_name)
2299 return aarch64_lookup_shared_state_flags (TYPE_ATTRIBUTES (fntype),
2300 state_name);
2303 /* Return the state of PSTATE.ZA on entry to functions of type FNTYPE. */
2305 static aarch64_isa_mode
2306 aarch64_fntype_pstate_za (const_tree fntype)
2308 if (aarch64_fntype_shared_flags (fntype, "za")
2309 || aarch64_fntype_shared_flags (fntype, "zt0"))
2310 return AARCH64_ISA_MODE_ZA_ON;
2312 return 0;
2315 /* Return the ISA mode on entry to functions of type FNTYPE. */
2317 static aarch64_isa_mode
2318 aarch64_fntype_isa_mode (const_tree fntype)
2320 return (aarch64_fntype_pstate_sm (fntype)
2321 | aarch64_fntype_pstate_za (fntype));
2324 /* Return true if FNDECL uses streaming mode internally, as an
2325 implementation choice. */
2327 static bool
2328 aarch64_fndecl_is_locally_streaming (const_tree fndecl)
2330 return lookup_attribute ("arm", "locally_streaming",
2331 DECL_ATTRIBUTES (fndecl));
2334 /* Return the state of PSTATE.SM when compiling the body of
2335 function FNDECL. This might be different from the state of
2336 PSTATE.SM on entry. */
2338 static aarch64_isa_mode
2339 aarch64_fndecl_pstate_sm (const_tree fndecl)
2341 if (aarch64_fndecl_is_locally_streaming (fndecl))
2342 return AARCH64_ISA_MODE_SM_ON;
2344 return aarch64_fntype_pstate_sm (TREE_TYPE (fndecl));
2347 /* Return true if function FNDECL has state STATE_NAME, either by creating
2348 new state itself or by sharing state with callers. */
2350 static bool
2351 aarch64_fndecl_has_state (tree fndecl, const char *state_name)
2353 return (aarch64_fndecl_has_new_state (fndecl, state_name)
2354 || aarch64_fntype_shared_flags (TREE_TYPE (fndecl),
2355 state_name) != 0);
2358 /* Return the state of PSTATE.ZA when compiling the body of function FNDECL.
2359 This might be different from the state of PSTATE.ZA on entry. */
2361 static aarch64_isa_mode
2362 aarch64_fndecl_pstate_za (const_tree fndecl)
2364 if (aarch64_fndecl_has_new_state (fndecl, "za")
2365 || aarch64_fndecl_has_new_state (fndecl, "zt0"))
2366 return AARCH64_ISA_MODE_ZA_ON;
2368 return aarch64_fntype_pstate_za (TREE_TYPE (fndecl));
2371 /* Return the ISA mode that should be used to compile the body of
2372 function FNDECL. */
2374 static aarch64_isa_mode
2375 aarch64_fndecl_isa_mode (const_tree fndecl)
2377 return (aarch64_fndecl_pstate_sm (fndecl)
2378 | aarch64_fndecl_pstate_za (fndecl));
2381 /* Return the state of PSTATE.SM on entry to the current function.
2382 This might be different from the state of PSTATE.SM in the function
2383 body. */
2385 static aarch64_isa_mode
2386 aarch64_cfun_incoming_pstate_sm ()
2388 return aarch64_fntype_pstate_sm (TREE_TYPE (cfun->decl));
2391 /* Return the state of PSTATE.ZA on entry to the current function.
2392 This might be different from the state of PSTATE.ZA in the function
2393 body. */
2395 static aarch64_isa_mode
2396 aarch64_cfun_incoming_pstate_za ()
2398 return aarch64_fntype_pstate_za (TREE_TYPE (cfun->decl));
2401 /* Return state flags that describe whether and how the current function shares
2402 state STATE_NAME with callers. */
2404 static unsigned int
2405 aarch64_cfun_shared_flags (const char *state_name)
2407 return aarch64_fntype_shared_flags (TREE_TYPE (cfun->decl), state_name);
2410 /* Return true if the current function creates new state of type STATE_NAME
2411 (as opposed to sharing the state with its callers or ignoring the state
2412 altogether). */
2414 static bool
2415 aarch64_cfun_has_new_state (const char *state_name)
2417 return aarch64_fndecl_has_new_state (cfun->decl, state_name);
2420 /* Return true if PSTATE.SM is 1 in the body of the current function,
2421 but is not guaranteed to be 1 on entry. */
2423 static bool
2424 aarch64_cfun_enables_pstate_sm ()
2426 return (aarch64_fndecl_is_locally_streaming (cfun->decl)
2427 && aarch64_cfun_incoming_pstate_sm () != AARCH64_ISA_MODE_SM_ON);
2430 /* Return true if the current function has state STATE_NAME, either by
2431 creating new state itself or by sharing state with callers. */
2433 static bool
2434 aarch64_cfun_has_state (const char *state_name)
2436 return aarch64_fndecl_has_state (cfun->decl, state_name);
2439 /* Return true if a call from the current function to a function with
2440 ISA mode CALLEE_MODE would involve a change to PSTATE.SM around
2441 the BL instruction. */
2443 static bool
2444 aarch64_call_switches_pstate_sm (aarch64_isa_mode callee_mode)
2446 return (bool) (callee_mode & ~AARCH64_ISA_MODE & AARCH64_ISA_MODE_SM_STATE);
2449 /* Implement TARGET_COMPATIBLE_VECTOR_TYPES_P. */
2451 static bool
2452 aarch64_compatible_vector_types_p (const_tree type1, const_tree type2)
2454 return (aarch64_sve::builtin_type_p (type1)
2455 == aarch64_sve::builtin_type_p (type2));
2458 /* Return true if we should emit CFI for register REGNO. */
2460 static bool
2461 aarch64_emit_cfi_for_reg_p (unsigned int regno)
2463 return (GP_REGNUM_P (regno)
2464 || !default_function_abi.clobbers_full_reg_p (regno));
2467 /* Return the mode we should use to save and restore register REGNO. */
2469 static machine_mode
2470 aarch64_reg_save_mode (unsigned int regno)
2472 if (GP_REGNUM_P (regno) || regno == VG_REGNUM)
2473 return DImode;
2475 if (FP_REGNUM_P (regno))
2476 switch (crtl->abi->id ())
2478 case ARM_PCS_AAPCS64:
2479 /* Only the low 64 bits are saved by the base PCS. */
2480 return DFmode;
2482 case ARM_PCS_SIMD:
2483 /* The vector PCS saves the low 128 bits (which is the full
2484 register on non-SVE targets). */
2485 return V16QImode;
2487 case ARM_PCS_SVE:
2488 /* Use vectors of DImode for registers that need frame
2489 information, so that the first 64 bytes of the save slot
2490 are always the equivalent of what storing D<n> would give. */
2491 if (aarch64_emit_cfi_for_reg_p (regno))
2492 return VNx2DImode;
2494 /* Use vectors of bytes otherwise, so that the layout is
2495 endian-agnostic, and so that we can use LDR and STR for
2496 big-endian targets. */
2497 return VNx16QImode;
2499 case ARM_PCS_TLSDESC:
2500 case ARM_PCS_UNKNOWN:
2501 break;
2504 if (PR_REGNUM_P (regno))
2505 /* Save the full predicate register. */
2506 return VNx16BImode;
2508 gcc_unreachable ();
2511 /* Return the CONST_INT that should be placed in an UNSPEC_CALLEE_ABI rtx.
2512 This value encodes the following information:
2513 - the ISA mode on entry to a callee (ISA_MODE)
2514 - the ABI of the callee (PCS_VARIANT)
2515 - whether the callee has an indirect_return
2516 attribute (INDIRECT_RETURN). */
2519 aarch64_gen_callee_cookie (aarch64_isa_mode isa_mode, arm_pcs pcs_variant,
2520 bool indirect_return)
2522 unsigned int im = (unsigned int) isa_mode;
2523 unsigned int ir = (indirect_return ? 1 : 0) << AARCH64_NUM_ISA_MODES;
2524 unsigned int pv = (unsigned int) pcs_variant
2525 << (AARCH64_NUM_ABI_ATTRIBUTES + AARCH64_NUM_ISA_MODES);
2526 return gen_int_mode (im | ir | pv, DImode);
2529 /* COOKIE is a CONST_INT from an UNSPEC_CALLEE_ABI rtx. Return the
2530 callee's ABI. */
2532 static const predefined_function_abi &
2533 aarch64_callee_abi (rtx cookie)
2535 return function_abis[UINTVAL (cookie)
2536 >> (AARCH64_NUM_ABI_ATTRIBUTES + AARCH64_NUM_ISA_MODES)];
2539 /* COOKIE is a CONST_INT from an UNSPEC_CALLEE_ABI rtx. Return the
2540 required ISA mode on entry to the callee, which is also the ISA
2541 mode on return from the callee. */
2543 static aarch64_isa_mode
2544 aarch64_callee_isa_mode (rtx cookie)
2546 return UINTVAL (cookie) & ((1 << AARCH64_NUM_ISA_MODES) - 1);
2549 /* COOKIE is a CONST_INT from an UNSPEC_CALLEE_ABI rtx. Return
2550 whether function was marked with an indirect_return attribute. */
2552 static bool
2553 aarch64_callee_indirect_return (rtx cookie)
2555 return ((UINTVAL (cookie) >> AARCH64_NUM_ISA_MODES) & 1) == 1;
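/* Editorial aside: a minimal standalone sketch (not part of GCC) of the bit
   layout used by the cookie above: the low AARCH64_NUM_ISA_MODES bits hold
   the ISA mode, the next AARCH64_NUM_ABI_ATTRIBUTES bits hold attribute
   flags such as indirect_return, and the remaining bits hold the PCS
   variant.  The EXAMPLE_* widths and example_* helper are hypothetical
   stand-ins for the real constants.  */

constexpr unsigned int EXAMPLE_NUM_ISA_MODES = 3;
constexpr unsigned int EXAMPLE_NUM_ABI_ATTRIBUTES = 1;

constexpr unsigned long long
example_encode_cookie (unsigned int isa_mode, bool indirect_return,
                       unsigned int pcs_variant)
{
  return (isa_mode
          | ((indirect_return ? 1ULL : 0ULL) << EXAMPLE_NUM_ISA_MODES)
          | ((unsigned long long) pcs_variant
             << (EXAMPLE_NUM_ABI_ATTRIBUTES + EXAMPLE_NUM_ISA_MODES)));
}

/* Decoding mirrors aarch64_callee_abi, aarch64_callee_isa_mode and
   aarch64_callee_indirect_return above.  */
static_assert ((example_encode_cookie (5, true, 2)
                >> (EXAMPLE_NUM_ABI_ATTRIBUTES + EXAMPLE_NUM_ISA_MODES)) == 2,
               "the PCS variant occupies the high bits");
static_assert ((example_encode_cookie (5, true, 2)
                & ((1u << EXAMPLE_NUM_ISA_MODES) - 1)) == 5,
               "the ISA mode occupies the low bits");
static_assert (((example_encode_cookie (5, true, 2)
                 >> EXAMPLE_NUM_ISA_MODES) & 1) == 1,
               "indirect_return sits just above the ISA mode bits");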
2558 /* INSN is a call instruction. Return the CONST_INT stored in its
2559 UNSPEC_CALLEE_ABI rtx. */
2561 static rtx
2562 aarch64_insn_callee_cookie (const rtx_insn *insn)
2564 rtx pat = PATTERN (insn);
2565 gcc_assert (GET_CODE (pat) == PARALLEL);
2566 rtx unspec = XVECEXP (pat, 0, 1);
2567 gcc_assert (GET_CODE (unspec) == UNSPEC
2568 && XINT (unspec, 1) == UNSPEC_CALLEE_ABI);
2569 return XVECEXP (unspec, 0, 0);
2572 /* INSN is a call instruction. Return true if the callee has an
2573 indirect_return attribute. */
2575 bool
2576 aarch_fun_is_indirect_return (rtx_insn *insn)
2578 rtx cookie = aarch64_insn_callee_cookie (insn);
2579 return aarch64_callee_indirect_return (cookie);
2582 /* Implement TARGET_INSN_CALLEE_ABI. */
2584 const predefined_function_abi &
2585 aarch64_insn_callee_abi (const rtx_insn *insn)
2587 return aarch64_callee_abi (aarch64_insn_callee_cookie (insn));
2590 /* INSN is a call instruction. Return the required ISA mode on entry to
2591 the callee, which is also the ISA mode on return from the callee. */
2593 static aarch64_isa_mode
2594 aarch64_insn_callee_isa_mode (const rtx_insn *insn)
2596 return aarch64_callee_isa_mode (aarch64_insn_callee_cookie (insn));
2599 /* Implement TARGET_HARD_REGNO_CALL_PART_CLOBBERED. The callee only saves
2600 the lower 64 bits of a 128-bit register. Tell the compiler the callee
2601 clobbers the top 64 bits when restoring the bottom 64 bits. */
2603 static bool
2604 aarch64_hard_regno_call_part_clobbered (unsigned int abi_id,
2605 unsigned int regno,
2606 machine_mode mode)
2608 if (FP_REGNUM_P (regno) && abi_id != ARM_PCS_SVE)
2610 poly_int64 per_register_size = GET_MODE_SIZE (mode);
2611 unsigned int nregs = hard_regno_nregs (regno, mode);
2612 if (nregs > 1)
2613 per_register_size = exact_div (per_register_size, nregs);
2614 if (abi_id == ARM_PCS_SIMD || abi_id == ARM_PCS_TLSDESC)
2615 return maybe_gt (per_register_size, 16);
2616 return maybe_gt (per_register_size, 8);
2618 return false;
2621 /* Implement REGMODE_NATURAL_SIZE. */
2622 poly_uint64
2623 aarch64_regmode_natural_size (machine_mode mode)
2625 /* The natural size for SVE data modes is one SVE data vector,
2626 and similarly for predicates. We can't independently modify
2627 anything smaller than that. */
2628 /* ??? For now, only do this for variable-width SVE registers.
2629 Doing it for constant-sized registers breaks lower-subreg.cc. */
2630 /* ??? And once that's fixed, we should probably have similar
2631 code for Advanced SIMD. */
2632 if (!aarch64_sve_vg.is_constant ())
2634 /* REGMODE_NATURAL_SIZE influences general subreg validity rules,
2635 so we need to handle memory-only modes as well. */
2636 unsigned int vec_flags = aarch64_classify_vector_memory_mode (mode);
2637 if (vec_flags & VEC_SVE_PRED)
2638 return BYTES_PER_SVE_PRED;
2639 if (vec_flags & VEC_SVE_DATA)
2640 return BYTES_PER_SVE_VECTOR;
2642 return UNITS_PER_WORD;
2645 /* Implement HARD_REGNO_CALLER_SAVE_MODE. */
2646 machine_mode
2647 aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned,
2648 machine_mode mode)
2650 /* The predicate mode determines which bits are significant and
2651 which are "don't care". Decreasing the number of lanes would
2652 lose data while increasing the number of lanes would make bits
2653 unnecessarily significant. */
2654 if (PR_REGNUM_P (regno))
2655 return mode;
2656 if (known_lt (GET_MODE_SIZE (mode), 4)
2657 && REG_CAN_CHANGE_MODE_P (regno, mode, SImode)
2658 && REG_CAN_CHANGE_MODE_P (regno, SImode, mode))
2659 return SImode;
2660 return mode;
2663 /* Return true if I's bits are consecutive ones from the MSB. */
2664 bool
2665 aarch64_high_bits_all_ones_p (HOST_WIDE_INT i)
2667 return exact_log2 (-i) != HOST_WIDE_INT_M1;
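/* Editorial aside: a value whose set bits form one contiguous run starting
   at the MSB is the negation of a power of two, which is what the
   exact_log2 test above checks.  A minimal standalone sketch (not part of
   GCC) using a single-set-bit test; the example_* helper is hypothetical.  */

constexpr bool
example_high_bits_all_ones_p (long long i)
{
  /* -i must be a power of two (and i must be nonzero).  */
  return i != 0 && (-(unsigned long long) i
                    & (-(unsigned long long) i - 1)) == 0;
}

static_assert (example_high_bits_all_ones_p (-16), "0xff...f0 qualifies");
static_assert (!example_high_bits_all_ones_p (0xf0), "0xf0 does not");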
2670 /* Implement TARGET_CONSTANT_ALIGNMENT. Make strings word-aligned so
2671 that strcpy from constants will be faster. */
2673 static HOST_WIDE_INT
2674 aarch64_constant_alignment (const_tree exp, HOST_WIDE_INT align)
2676 if (TREE_CODE (exp) == STRING_CST && !optimize_size)
2677 return MAX (align, BITS_PER_WORD);
2678 return align;
2681 /* Align definitions of arrays, unions and structures so that
2682 initializations and copies can be made more efficient. This is not
2683 ABI-changing, so it only affects places where we can see the
2684 definition. Increasing the alignment tends to introduce padding,
2685 so don't do this when optimizing for size/conserving stack space. */
2687 unsigned
2688 aarch64_data_alignment (const_tree type, unsigned align)
2690 if (optimize_size)
2691 return align;
2693 if (AGGREGATE_TYPE_P (type))
2695 unsigned HOST_WIDE_INT size = 0;
2697 if (TYPE_SIZE (type) && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
2698 && tree_fits_uhwi_p (TYPE_SIZE (type)))
2699 size = tree_to_uhwi (TYPE_SIZE (type));
2701 /* Align small structs/arrays to 32 bits, or 64 bits if larger. */
2702 if (align < 32 && size <= 32)
2703 align = 32;
2704 else if (align < 64)
2705 align = 64;
2708 return align;
2711 unsigned
2712 aarch64_stack_alignment (const_tree type, unsigned align)
2714 if (flag_conserve_stack)
2715 return align;
2717 if (AGGREGATE_TYPE_P (type))
2719 unsigned HOST_WIDE_INT size = 0;
2721 if (TYPE_SIZE (type) && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
2722 && tree_fits_uhwi_p (TYPE_SIZE (type)))
2723 size = tree_to_uhwi (TYPE_SIZE (type));
2725 /* Align small structs/arrays to 32 bits, or 64 bits if larger. */
2726 if (align < 32 && size <= 32)
2727 align = 32;
2728 else if (align < 64)
2729 align = 64;
2732 return align;
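/* Editorial aside: a minimal standalone sketch (not part of GCC) of the
   rule shared by the two functions above, with sizes and alignments in
   bits: aggregates of at most 32 bits get at least 32-bit alignment,
   anything else gets at least 64-bit alignment, and a stricter existing
   alignment is never reduced.  The example_* helper is hypothetical.  */

constexpr unsigned int
example_aggregate_alignment (unsigned int size_bits, unsigned int align_bits)
{
  return (align_bits < 32 && size_bits <= 32) ? 32
         : align_bits < 64 ? 64
         : align_bits;
}

static_assert (example_aggregate_alignment (24, 8) == 32,
               "a 3-byte struct becomes 32-bit aligned");
static_assert (example_aggregate_alignment (96, 8) == 64,
               "a 12-byte array becomes 64-bit aligned");
static_assert (example_aggregate_alignment (256, 128) == 128,
               "stricter alignments are kept");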
2735 /* Return true if calls to DECL should be treated as
2736 long-calls (i.e. called via a register). */
2737 static bool
2738 aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED)
2740 return false;
2743 /* Return true if calls to symbol-ref SYM should be treated as
2744 long-calls (i.e. called via a register). */
2745 bool
2746 aarch64_is_long_call_p (rtx sym)
2748 return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym));
2751 /* Return true if calls to symbol-ref SYM should not go through
2752 plt stubs. */
2754 bool
2755 aarch64_is_noplt_call_p (rtx sym)
2757 const_tree decl = SYMBOL_REF_DECL (sym);
2759 if (flag_pic
2760 && decl
2761 && (!flag_plt
2762 || lookup_attribute ("noplt", DECL_ATTRIBUTES (decl)))
2763 && !targetm.binds_local_p (decl))
2764 return true;
2766 return false;
2769 /* Emit an insn that's a simple single-set. Both the operands must be
2770 known to be valid. */
2771 inline static rtx_insn *
2772 emit_set_insn (rtx x, rtx y)
2774 return emit_insn (gen_rtx_SET (x, y));
2777 /* X and Y are two things to compare using CODE. Emit the compare insn and
2778 return the rtx for register 0 in the proper mode. */
2780 aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
2782 machine_mode cmp_mode = GET_MODE (x);
2783 machine_mode cc_mode;
2784 rtx cc_reg;
2786 if (cmp_mode == TImode)
2788 gcc_assert (code == NE);
2790 cc_mode = CCmode;
2791 cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);
2793 rtx x_lo = operand_subword (x, 0, 0, TImode);
2794 rtx y_lo = operand_subword (y, 0, 0, TImode);
2795 emit_set_insn (cc_reg, gen_rtx_COMPARE (cc_mode, x_lo, y_lo));
2797 rtx x_hi = operand_subword (x, 1, 0, TImode);
2798 rtx y_hi = operand_subword (y, 1, 0, TImode);
2799 emit_insn (gen_ccmpccdi (cc_reg, cc_reg, x_hi, y_hi,
2800 gen_rtx_EQ (cc_mode, cc_reg, const0_rtx),
2801 GEN_INT (AARCH64_EQ)));
2803 else
2805 cc_mode = SELECT_CC_MODE (code, x, y);
2806 cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);
2807 emit_set_insn (cc_reg, gen_rtx_COMPARE (cc_mode, x, y));
2809 return cc_reg;
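/* Editorial aside: the CMP/CCMP pair emitted for the TImode case above
   computes a 128-bit inequality as "the low halves differ, or, if they are
   equal, the high halves differ".  A minimal standalone sketch (not part
   of GCC) of that decomposition; the example_* helper is hypothetical.  */

constexpr bool
example_ti_ne (unsigned long long x_lo, unsigned long long x_hi,
               unsigned long long y_lo, unsigned long long y_hi)
{
  return x_lo != y_lo || x_hi != y_hi;
}

static_assert (example_ti_ne (1, 0, 1, 2), "values differing in the high half");
static_assert (!example_ti_ne (1, 2, 1, 2), "equal values compare equal");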
2812 /* Similarly, but maybe zero-extend Y if Y_MODE < SImode. */
2814 static rtx
2815 aarch64_gen_compare_reg_maybe_ze (RTX_CODE code, rtx x, rtx y,
2816 machine_mode y_mode)
2818 if (y_mode == E_QImode || y_mode == E_HImode)
2820 if (CONST_INT_P (y))
2822 y = GEN_INT (INTVAL (y) & GET_MODE_MASK (y_mode));
2823 y_mode = SImode;
2825 else
2827 rtx t, cc_reg;
2828 machine_mode cc_mode;
2830 t = gen_rtx_ZERO_EXTEND (SImode, y);
2831 t = gen_rtx_COMPARE (CC_SWPmode, t, x);
2832 cc_mode = CC_SWPmode;
2833 cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);
2834 emit_set_insn (cc_reg, t);
2835 return cc_reg;
2839 if (!aarch64_plus_operand (y, y_mode))
2840 y = force_reg (y_mode, y);
2842 return aarch64_gen_compare_reg (code, x, y);
2845 /* Generate conditional branch to LABEL, comparing X to 0 using CODE.
2846 Return the jump instruction. */
2848 static rtx
2849 aarch64_gen_compare_zero_and_branch (rtx_code code, rtx x,
2850 rtx_code_label *label)
2852 if (aarch64_track_speculation)
2854 /* Emit an explicit compare instruction, so that we can correctly
2855 track the condition codes. */
2856 rtx cc_reg = aarch64_gen_compare_reg (code, x, const0_rtx);
2857 x = gen_rtx_fmt_ee (code, GET_MODE (cc_reg), cc_reg, const0_rtx);
2859 else
2860 x = gen_rtx_fmt_ee (code, VOIDmode, x, const0_rtx);
2862 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
2863 gen_rtx_LABEL_REF (Pmode, label), pc_rtx);
2864 return gen_rtx_SET (pc_rtx, x);
2867 /* Return an rtx that branches to LABEL based on the value of bit BITNUM of X.
2868 If CODE is NE, it branches to LABEL when the bit is set; if CODE is EQ,
2869 it branches to LABEL when the bit is clear. */
2871 static rtx
2872 aarch64_gen_test_and_branch (rtx_code code, rtx x, int bitnum,
2873 rtx_code_label *label)
2875 auto mode = GET_MODE (x);
2876 if (aarch64_track_speculation)
2878 auto mask = gen_int_mode (HOST_WIDE_INT_1U << bitnum, mode);
2879 emit_insn (gen_aarch64_and3nr_compare0 (mode, x, mask));
2880 rtx cc_reg = gen_rtx_REG (CC_NZVmode, CC_REGNUM);
2881 rtx x = gen_rtx_fmt_ee (code, CC_NZVmode, cc_reg, const0_rtx);
2882 return gen_condjump (x, cc_reg, label);
2884 return gen_aarch64_tb (code, mode, mode,
2885 x, gen_int_mode (bitnum, mode), label);
2888 /* Consider the operation:
2890 OPERANDS[0] = CODE (OPERANDS[1], OPERANDS[2]) + OPERANDS[3]
2892 where:
2894 - CODE is [SU]MAX or [SU]MIN
2895 - OPERANDS[2] and OPERANDS[3] are constant integers
2896 - OPERANDS[3] is a positive or negative shifted 12-bit immediate
2897 - all operands have mode MODE
2899 Decide whether it is possible to implement the operation using:
2901 SUBS <tmp>, OPERANDS[1], -OPERANDS[3]
2903 ADDS <tmp>, OPERANDS[1], OPERANDS[3]
2905 followed by:
2907 <insn> OPERANDS[0], <tmp>, [wx]zr, <cond>
2909 where <insn> is one of CSEL, CSINV or CSINC. Return true if so.
2910 If GENERATE_P is true, also update OPERANDS as follows:
2912 OPERANDS[4] = -OPERANDS[3]
2913 OPERANDS[5] = the rtl condition representing <cond>
2914 OPERANDS[6] = <tmp>
2915 OPERANDS[7] = 0 for CSEL, -1 for CSINV or 1 for CSINC. */
2916 bool
2917 aarch64_maxmin_plus_const (rtx_code code, rtx *operands, bool generate_p)
2919 signop sgn = (code == UMAX || code == UMIN ? UNSIGNED : SIGNED);
2920 rtx dst = operands[0];
2921 rtx maxmin_op = operands[2];
2922 rtx add_op = operands[3];
2923 machine_mode mode = GET_MODE (dst);
2925 /* max (x, y) - z == (x >= y + 1 ? x : y) - z
2926 == (x >= y ? x : y) - z
2927 == (x > y ? x : y) - z
2928 == (x > y - 1 ? x : y) - z
2930 min (x, y) - z == (x <= y - 1 ? x : y) - z
2931 == (x <= y ? x : y) - z
2932 == (x < y ? x : y) - z
2933 == (x < y + 1 ? x : y) - z
2935 Check whether z is in { y - 1, y, y + 1 } and pick the form(s) for
2936 which x is compared with z. Set DIFF to y - z. Thus the supported
2937 combinations are as follows, with DIFF being the value after the ":":
2939 max (x, y) - z == x >= y + 1 ? x - (y + 1) : -1 [z == y + 1]
2940 == x >= y ? x - y : 0 [z == y]
2941 == x > y ? x - y : 0 [z == y]
2942 == x > y - 1 ? x - (y - 1) : 1 [z == y - 1]
2944 min (x, y) - z == x <= y - 1 ? x - (y - 1) : 1 [z == y - 1]
2945 == x <= y ? x - y : 0 [z == y]
2946 == x < y ? x - y : 0 [z == y]
2947 == x < y + 1 ? x - (y + 1) : -1 [z == y + 1]. */
2948 auto maxmin_val = rtx_mode_t (maxmin_op, mode);
2949 auto add_val = rtx_mode_t (add_op, mode);
2950 auto sub_val = wi::neg (add_val);
2951 auto diff = wi::sub (maxmin_val, sub_val);
2952 if (!(diff == 0
2953 || (diff == 1 && wi::gt_p (maxmin_val, sub_val, sgn))
2954 || (diff == -1 && wi::lt_p (maxmin_val, sub_val, sgn))))
2955 return false;
2957 if (!generate_p)
2958 return true;
2960 rtx_code cmp;
2961 switch (code)
2963 case SMAX:
2964 cmp = diff == 1 ? GT : GE;
2965 break;
2966 case UMAX:
2967 cmp = diff == 1 ? GTU : GEU;
2968 break;
2969 case SMIN:
2970 cmp = diff == -1 ? LT : LE;
2971 break;
2972 case UMIN:
2973 cmp = diff == -1 ? LTU : LEU;
2974 break;
2975 default:
2976 gcc_unreachable ();
2978 rtx cc = gen_rtx_REG (CCmode, CC_REGNUM);
2980 operands[4] = immed_wide_int_const (sub_val, mode);
2981 operands[5] = gen_rtx_fmt_ee (cmp, VOIDmode, cc, const0_rtx);
2982 if (can_create_pseudo_p ())
2983 operands[6] = gen_reg_rtx (mode);
2984 else
2985 operands[6] = dst;
2986 operands[7] = immed_wide_int_const (diff, mode);
2988 return true;
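/* Editorial aside: a minimal standalone sketch (not part of GCC) of the
   DIFF == 0 case above for signed max: MAX (x, y) - y equals
   (x >= y ? x - y : 0), which is what the SUBS/CSEL sequence computes.
   The example_* helpers are hypothetical.  */

constexpr long long
example_smax_minus_y (long long x, long long y)
{
  return (x > y ? x : y) - y;   /* max (x, y) + OPERANDS[3], with a == -y.  */
}

constexpr long long
example_smax_minus_y_csel (long long x, long long y)
{
  return x >= y ? x - y : 0;    /* SUBS <tmp>, x, y; CSEL dst, <tmp>, xzr, GE.  */
}

static_assert (example_smax_minus_y (7, 3) == example_smax_minus_y_csel (7, 3),
               "x > y case");
static_assert (example_smax_minus_y (2, 3) == example_smax_minus_y_csel (2, 3),
               "x < y case");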
2992 /* Build the SYMBOL_REF for __tls_get_addr. */
2994 static GTY(()) rtx tls_get_addr_libfunc;
2997 aarch64_tls_get_addr (void)
2999 if (!tls_get_addr_libfunc)
3000 tls_get_addr_libfunc = init_one_libfunc ("__tls_get_addr");
3001 return tls_get_addr_libfunc;
3004 /* Return the TLS model to use for ADDR. */
3006 static enum tls_model
3007 tls_symbolic_operand_type (rtx addr)
3009 enum tls_model tls_kind = TLS_MODEL_NONE;
3010 poly_int64 offset;
3011 addr = strip_offset_and_salt (addr, &offset);
3012 if (SYMBOL_REF_P (addr))
3013 tls_kind = SYMBOL_REF_TLS_MODEL (addr);
3015 return tls_kind;
3018 /* We allow lo_sum expressions in our legitimate addresses, so that
3019 combine can take care of combining addresses where necessary, but
3020 for generation purposes we generate the address as follows:
3022 RTL Absolute
3023 tmp = hi (symbol_ref); adrp x1, foo
3024 dest = lo_sum (tmp, symbol_ref); add dest, x1, :lo12:foo
3027 PIC TLS
3028 adrp x1, :got:foo adrp tmp, :tlsgd:foo
3029 ldr x1, [:got_lo12:foo] add dest, tmp, :tlsgd_lo12:foo
3030 bl __tls_get_addr
3033 Load TLS symbol, depending on TLS mechanism and TLS access model.
3035 Global Dynamic - Traditional TLS:
3036 adrp tmp, :tlsgd:imm
3037 add dest, tmp, #:tlsgd_lo12:imm
3038 bl __tls_get_addr
3040 Global Dynamic - TLS Descriptors:
3041 adrp dest, :tlsdesc:imm
3042 ldr tmp, [dest, #:tlsdesc_lo12:imm]
3043 add dest, dest, #:tlsdesc_lo12:imm
3044 blr tmp
3045 mrs tp, tpidr_el0
3046 add dest, dest, tp
3048 Initial Exec:
3049 mrs tp, tpidr_el0
3050 adrp tmp, :gottprel:imm
3051 ldr dest, [tmp, #:gottprel_lo12:imm]
3052 add dest, dest, tp
3054 Local Exec:
3055 mrs tp, tpidr_el0
3056 add t0, tp, #:tprel_hi12:imm, lsl #12
3057 add t0, t0, #:tprel_lo12_nc:imm
3060 static void
3061 aarch64_load_symref_appropriately (rtx dest, rtx imm,
3062 enum aarch64_symbol_type type)
3064 #if TARGET_PECOFF
3065 rtx tmp = legitimize_pe_coff_symbol (imm, true);
3066 if (tmp)
3068 emit_insn (gen_rtx_SET (dest, tmp));
3069 return;
3071 #endif
3073 switch (type)
3075 case SYMBOL_SMALL_ABSOLUTE:
3077 /* In ILP32, the mode of dest can be either SImode or DImode. */
3078 rtx tmp_reg = dest;
3079 machine_mode mode = GET_MODE (dest);
3081 gcc_assert (mode == Pmode || mode == ptr_mode);
3083 if (can_create_pseudo_p ())
3084 tmp_reg = gen_reg_rtx (mode);
3086 HOST_WIDE_INT mid_const = 0;
3087 if (TARGET_PECOFF)
3089 poly_int64 offset;
3090 strip_offset (imm, &offset);
3092 HOST_WIDE_INT const_offset;
3093 if (offset.is_constant (&const_offset))
3094 /* Written this way for the sake of negative offsets. */
3095 mid_const = const_offset / (1 << 20) * (1 << 20);
3097 imm = plus_constant (mode, imm, -mid_const);
3099 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, copy_rtx (imm)));
3100 if (mid_const)
3101 emit_set_insn (tmp_reg, plus_constant (mode, tmp_reg, mid_const));
3102 emit_insn (gen_add_losym (dest, tmp_reg, imm));
3103 return;
3106 case SYMBOL_TINY_ABSOLUTE:
3107 emit_insn (gen_rtx_SET (dest, imm));
3108 return;
3110 case SYMBOL_SMALL_GOT_28K:
3112 machine_mode mode = GET_MODE (dest);
3113 rtx gp_rtx = pic_offset_table_rtx;
3114 rtx insn;
3115 rtx mem;
3117 /* NOTE: pic_offset_table_rtx can be NULL_RTX, because we can reach
3118 here before RTL expansion.  Tree IVOPTS generates RTL patterns to
3119 decide rtx costs, in which case pic_offset_table_rtx is not
3120 initialized.  In that case there is no need to generate the first
3121 adrp instruction, as the final cost for global variable access is
3122 one instruction.  */
3123 if (gp_rtx != NULL)
3125 /* -fpic with -mcmodel=small allows a 32K GOT table size (but because
3126 we use the page base as the GOT base, the first page may be wasted;
3127 in the worst case there is only 28K of space for the GOT).
3129 The generated instruction sequence for accessing a global variable is:
3132 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym]
3134 Only one instruction is needed.  But we must initialize
3135 pic_offset_table_rtx properly.  We generate an initialization insn
3136 for every global access, and allow CSE to remove all redundant ones.
3138 The final instruction sequence will look like the following
3139 for multiple global variable accesses:
3141 adrp pic_offset_table_rtx, _GLOBAL_OFFSET_TABLE_
3143 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym1]
3144 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym2]
3145 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym3]
3146 ... */
3148 rtx s = gen_rtx_SYMBOL_REF (Pmode, "_GLOBAL_OFFSET_TABLE_");
3149 crtl->uses_pic_offset_table = 1;
3150 emit_move_insn (gp_rtx, gen_rtx_HIGH (Pmode, s));
3152 if (mode != GET_MODE (gp_rtx))
3153 gp_rtx = gen_lowpart (mode, gp_rtx);
3157 if (mode == ptr_mode)
3159 if (mode == DImode)
3160 insn = gen_ldr_got_small_28k_di (dest, gp_rtx, imm);
3161 else
3162 insn = gen_ldr_got_small_28k_si (dest, gp_rtx, imm);
3164 mem = XVECEXP (SET_SRC (insn), 0, 0);
3166 else
3168 gcc_assert (mode == Pmode);
3170 insn = gen_ldr_got_small_28k_sidi (dest, gp_rtx, imm);
3171 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
3174 /* The operand is expected to be a MEM.  Whenever the related insn
3175 pattern changes, the code above that computes MEM should be
3176 updated. */
3177 gcc_assert (MEM_P (mem));
3178 MEM_READONLY_P (mem) = 1;
3179 MEM_NOTRAP_P (mem) = 1;
3180 emit_insn (insn);
3181 return;
3184 case SYMBOL_SMALL_GOT_4G:
3185 emit_insn (gen_rtx_SET (dest, imm));
3186 return;
3188 case SYMBOL_SMALL_TLSGD:
3190 rtx_insn *insns;
3191 /* The return type of __tls_get_addr is the C pointer type
3192 so use ptr_mode. */
3193 rtx result = gen_rtx_REG (ptr_mode, R0_REGNUM);
3194 rtx tmp_reg = dest;
3196 if (GET_MODE (dest) != ptr_mode)
3197 tmp_reg = can_create_pseudo_p () ? gen_reg_rtx (ptr_mode) : result;
3199 start_sequence ();
3200 if (ptr_mode == SImode)
3201 aarch64_emit_call_insn (gen_tlsgd_small_si (result, imm));
3202 else
3203 aarch64_emit_call_insn (gen_tlsgd_small_di (result, imm));
3204 insns = get_insns ();
3205 end_sequence ();
3207 RTL_CONST_CALL_P (insns) = 1;
3208 emit_libcall_block (insns, tmp_reg, result, imm);
3209 /* Convert back to the mode of the dest adding a zero_extend
3210 from SImode (ptr_mode) to DImode (Pmode). */
3211 if (dest != tmp_reg)
3212 convert_move (dest, tmp_reg, true);
3213 return;
3216 case SYMBOL_SMALL_TLSDESC:
3218 machine_mode mode = GET_MODE (dest);
3219 rtx x0 = gen_rtx_REG (mode, R0_REGNUM);
3220 rtx tp;
3222 gcc_assert (mode == Pmode || mode == ptr_mode);
3224 /* In ILP32, the got entry is always of SImode size. Unlike
3225 small GOT, the dest is fixed at reg 0. */
3226 if (TARGET_ILP32)
3227 emit_insn (gen_tlsdesc_small_si (imm));
3228 else
3229 emit_insn (gen_tlsdesc_small_di (imm));
3230 tp = aarch64_load_tp (NULL);
3232 if (mode != Pmode)
3233 tp = gen_lowpart (mode, tp);
3235 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, x0)));
3236 if (REG_P (dest))
3237 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
3238 return;
3241 case SYMBOL_SMALL_TLSIE:
3243 /* In ILP32, the mode of dest can be either SImode or DImode,
3244 while the GOT entry is always of SImode size.  The mode of
3245 dest depends on how dest is used: if dest is assigned to a
3246 pointer (e.g. stored to memory), it has SImode; it may have
3247 DImode if dest is dereferenced to access memory.
3248 This is why we have to handle three different tlsie_small
3249 patterns here (two patterns for ILP32). */
3250 machine_mode mode = GET_MODE (dest);
3251 rtx tmp_reg = gen_reg_rtx (mode);
3252 rtx tp = aarch64_load_tp (NULL);
3254 if (mode == ptr_mode)
3256 if (mode == DImode)
3257 emit_insn (gen_tlsie_small_di (tmp_reg, imm));
3258 else
3260 emit_insn (gen_tlsie_small_si (tmp_reg, imm));
3261 tp = gen_lowpart (mode, tp);
3264 else
3266 gcc_assert (mode == Pmode);
3267 emit_insn (gen_tlsie_small_sidi (tmp_reg, imm));
3270 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, tmp_reg)));
3271 if (REG_P (dest))
3272 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
3273 return;
3276 case SYMBOL_TLSLE12:
3277 case SYMBOL_TLSLE24:
3278 case SYMBOL_TLSLE32:
3279 case SYMBOL_TLSLE48:
3281 machine_mode mode = GET_MODE (dest);
3282 rtx tp = aarch64_load_tp (NULL);
3284 if (mode != Pmode)
3285 tp = gen_lowpart (mode, tp);
3287 switch (type)
3289 case SYMBOL_TLSLE12:
3290 emit_insn ((mode == DImode ? gen_tlsle12_di : gen_tlsle12_si)
3291 (dest, tp, imm));
3292 break;
3293 case SYMBOL_TLSLE24:
3294 emit_insn ((mode == DImode ? gen_tlsle24_di : gen_tlsle24_si)
3295 (dest, tp, imm));
3296 break;
3297 case SYMBOL_TLSLE32:
3298 emit_insn ((mode == DImode ? gen_tlsle32_di : gen_tlsle32_si)
3299 (dest, imm));
3300 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
3301 (dest, dest, tp));
3302 break;
3303 case SYMBOL_TLSLE48:
3304 emit_insn ((mode == DImode ? gen_tlsle48_di : gen_tlsle48_si)
3305 (dest, imm));
3306 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
3307 (dest, dest, tp));
3308 break;
3309 default:
3310 gcc_unreachable ();
3313 if (REG_P (dest))
3314 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
3315 return;
3318 case SYMBOL_TINY_GOT:
3320 rtx insn;
3321 machine_mode mode = GET_MODE (dest);
3323 if (mode == ptr_mode)
3324 insn = gen_ldr_got_tiny (mode, dest, imm);
3325 else
3327 gcc_assert (mode == Pmode);
3328 insn = gen_ldr_got_tiny_sidi (dest, imm);
3331 emit_insn (insn);
3332 return;
3335 case SYMBOL_TINY_TLSIE:
3337 machine_mode mode = GET_MODE (dest);
3338 rtx tp = aarch64_load_tp (NULL);
3340 if (mode == ptr_mode)
3342 if (mode == DImode)
3343 emit_insn (gen_tlsie_tiny_di (dest, imm, tp));
3344 else
3346 tp = gen_lowpart (mode, tp);
3347 emit_insn (gen_tlsie_tiny_si (dest, imm, tp));
3350 else
3352 gcc_assert (mode == Pmode);
3353 emit_insn (gen_tlsie_tiny_sidi (dest, imm, tp));
3356 if (REG_P (dest))
3357 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
3358 return;
3361 default:
3362 gcc_unreachable ();
3366 /* Emit a move from SRC to DEST. Assume that the move expanders can
3367 handle all moves if !can_create_pseudo_p (). The distinction is
3368 important because, unlike emit_move_insn, the move expanders know
3369 how to force Pmode objects into the constant pool even when the
3370 constant pool address is not itself legitimate. */
3371 static rtx
3372 aarch64_emit_move (rtx dest, rtx src)
3374 return (can_create_pseudo_p ()
3375 ? emit_move_insn (dest, src)
3376 : emit_move_insn_1 (dest, src));
3379 /* Apply UNOPTAB to OP and store the result in DEST. */
3381 static void
3382 aarch64_emit_unop (rtx dest, optab unoptab, rtx op)
3384 rtx tmp = expand_unop (GET_MODE (dest), unoptab, op, dest, 0);
3385 if (dest != tmp)
3386 emit_move_insn (dest, tmp);
3389 /* Apply BINOPTAB to OP0 and OP1 and store the result in DEST. */
3391 static void
3392 aarch64_emit_binop (rtx dest, optab binoptab, rtx op0, rtx op1)
3394 rtx tmp = expand_binop (GET_MODE (dest), binoptab, op0, op1, dest, 0,
3395 OPTAB_DIRECT);
3396 if (dest != tmp)
3397 emit_move_insn (dest, tmp);
3400 /* Split a move from SRC to DST into multiple moves of mode SINGLE_MODE. */
3402 void
3403 aarch64_split_move (rtx dst, rtx src, machine_mode single_mode)
3405 machine_mode mode = GET_MODE (dst);
3406 auto npieces = exact_div (GET_MODE_SIZE (mode),
3407 GET_MODE_SIZE (single_mode)).to_constant ();
3408 auto_vec<rtx, 4> dst_pieces, src_pieces;
3410 for (unsigned int i = 0; i < npieces; ++i)
3412 auto off = i * GET_MODE_SIZE (single_mode);
3413 dst_pieces.safe_push (simplify_gen_subreg (single_mode, dst, mode, off));
3414 src_pieces.safe_push (simplify_gen_subreg (single_mode, src, mode, off));
3417 /* At most one pairing may overlap. */
3418 if (reg_overlap_mentioned_p (dst_pieces[0], src))
3419 for (unsigned int i = npieces; i-- > 0;)
3420 aarch64_emit_move (dst_pieces[i], src_pieces[i]);
3421 else
3422 for (unsigned int i = 0; i < npieces; ++i)
3423 aarch64_emit_move (dst_pieces[i], src_pieces[i]);
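/* Editorial aside: the reverse-order loop above matters when the
   destination's first piece aliases the source, e.g. when copying a
   two-word value from words {0,1} to words {1,2} of the same register
   group.  A minimal standalone sketch (not part of GCC) of the ordering
   rule, using a plain array; the example_* helper is hypothetical.  */

constexpr bool
example_overlapping_piecewise_copy ()
{
  int words[3] = {10, 20, 0};
  /* Move the two-word value at words[0..1] to words[1..2].  words[1] (the
     destination's first piece) overlaps the source, so copy the high piece
     first, as the overlap branch above does; copying forwards would
     clobber words[1] before it is read.  */
  words[2] = words[1];
  words[1] = words[0];
  return words[1] == 10 && words[2] == 20;
}

static_assert (example_overlapping_piecewise_copy (),
               "copying high-to-low preserves the value");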
3426 /* Split a 128-bit move operation into two 64-bit move operations,
3427 taking care to handle partial overlap of register to register
3428 copies. Special cases are needed when moving between GP regs and
3429 FP regs. SRC can be a register, constant or memory; DST a register
3430 or memory. If either operand is memory it must not have any side
3431 effects. */
3432 void
3433 aarch64_split_128bit_move (rtx dst, rtx src)
3435 machine_mode mode = GET_MODE (dst);
3437 gcc_assert (mode == TImode || mode == TFmode || mode == TDmode);
3438 gcc_assert (!(side_effects_p (src) || side_effects_p (dst)));
3439 gcc_assert (mode == GET_MODE (src) || GET_MODE (src) == VOIDmode);
3441 if (REG_P (dst) && REG_P (src))
3443 int src_regno = REGNO (src);
3444 int dst_regno = REGNO (dst);
3446 /* Handle FP <-> GP regs. */
3447 if (FP_REGNUM_P (dst_regno) && GP_REGNUM_P (src_regno))
3449 rtx src_lo = gen_lowpart (word_mode, src);
3450 rtx src_hi = gen_highpart (word_mode, src);
3452 emit_insn (gen_aarch64_movlow_di (mode, dst, src_lo));
3453 emit_insn (gen_aarch64_movhigh_di (mode, dst, src_hi));
3454 return;
3456 else if (GP_REGNUM_P (dst_regno) && FP_REGNUM_P (src_regno))
3458 rtx dst_lo = gen_lowpart (word_mode, dst);
3459 rtx dst_hi = gen_highpart (word_mode, dst);
3461 emit_insn (gen_aarch64_movdi_low (mode, dst_lo, src));
3462 emit_insn (gen_aarch64_movdi_high (mode, dst_hi, src));
3463 return;
3467 aarch64_split_move (dst, src, word_mode);
3470 /* Return true if we should split a move from 128-bit value SRC
3471 to 128-bit register DEST. */
3473 bool
3474 aarch64_split_128bit_move_p (rtx dst, rtx src)
3476 if (FP_REGNUM_P (REGNO (dst)))
3477 return REG_P (src) && !FP_REGNUM_P (REGNO (src));
3478 /* All moves to GPRs need to be split. */
3479 return true;
3482 /* Split a complex SIMD move. */
3484 void
3485 aarch64_split_simd_move (rtx dst, rtx src)
3487 machine_mode src_mode = GET_MODE (src);
3488 machine_mode dst_mode = GET_MODE (dst);
3490 gcc_assert (VECTOR_MODE_P (dst_mode));
3492 if (REG_P (dst) && REG_P (src))
3494 gcc_assert (VECTOR_MODE_P (src_mode));
3495 emit_insn (gen_aarch64_split_simd_mov (src_mode, dst, src));
3499 /* Return a register that contains SVE value X reinterpreted as SVE mode MODE.
3500 The semantics are those of svreinterpret rather than those of subregs;
3501 see the comment at the head of aarch64-sve.md for details about the
3502 difference. */
3505 aarch64_sve_reinterpret (machine_mode mode, rtx x)
3507 if (GET_MODE (x) == mode)
3508 return x;
3510 /* can_change_mode_class must only return true if subregs and svreinterprets
3511 have the same semantics. */
3512 if (targetm.can_change_mode_class (GET_MODE (x), mode, FP_REGS))
3513 return force_lowpart_subreg (mode, x, GET_MODE (x));
3515 rtx res = gen_reg_rtx (mode);
3516 x = force_reg (GET_MODE (x), x);
3517 emit_insn (gen_aarch64_sve_reinterpret (mode, res, x));
3518 return res;
3521 bool
3522 aarch64_zero_extend_const_eq (machine_mode xmode, rtx x,
3523 machine_mode ymode, rtx y)
3525 rtx r = simplify_const_unary_operation (ZERO_EXTEND, xmode, y, ymode);
3526 gcc_assert (r != NULL);
3527 return rtx_equal_p (x, r);
3530 /* Return TARGET if it is nonnull and a register of mode MODE.
3531 Otherwise, return a fresh register of mode MODE if we can,
3532 or TARGET reinterpreted as MODE if we can't. */
3534 static rtx
3535 aarch64_target_reg (rtx target, machine_mode mode)
3537 if (target && REG_P (target) && GET_MODE (target) == mode)
3538 return target;
3539 if (!can_create_pseudo_p ())
3541 gcc_assert (target);
3542 return gen_lowpart (mode, target);
3544 return gen_reg_rtx (mode);
3547 /* Return a register that contains the constant in BUILDER, given that
3548 the constant is a legitimate move operand. Use TARGET as the register
3549 if it is nonnull and convenient. */
3551 static rtx
3552 aarch64_emit_set_immediate (rtx target, rtx_vector_builder &builder)
3554 rtx src = builder.build ();
3555 target = aarch64_target_reg (target, GET_MODE (src));
3556 emit_insn (gen_rtx_SET (target, src));
3557 return target;
3560 static rtx
3561 aarch64_force_temporary (machine_mode mode, rtx x, rtx value)
3563 if (can_create_pseudo_p ())
3564 return force_reg (mode, value);
3565 else
3567 gcc_assert (x);
3568 aarch64_emit_move (x, value);
3569 return x;
3573 /* Return true if predicate value X is a constant in which every element
3574 is a CONST_INT. When returning true, describe X in BUILDER as a VNx16BI
3575 value, i.e. as a predicate in which all bits are significant. */
3577 static bool
3578 aarch64_get_sve_pred_bits (rtx_vector_builder &builder, rtx x)
3580 if (!CONST_VECTOR_P (x))
3581 return false;
3583 unsigned int factor = vector_element_size (GET_MODE_NUNITS (VNx16BImode),
3584 GET_MODE_NUNITS (GET_MODE (x)));
3585 unsigned int npatterns = CONST_VECTOR_NPATTERNS (x) * factor;
3586 unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (x);
3587 builder.new_vector (VNx16BImode, npatterns, nelts_per_pattern);
3589 unsigned int nelts = const_vector_encoded_nelts (x);
3590 for (unsigned int i = 0; i < nelts; ++i)
3592 rtx elt = CONST_VECTOR_ENCODED_ELT (x, i);
3593 if (!CONST_INT_P (elt))
3594 return false;
3596 builder.quick_push (elt);
3597 for (unsigned int j = 1; j < factor; ++j)
3598 builder.quick_push (const0_rtx);
3600 builder.finalize ();
3601 return true;
3604 /* BUILDER contains a predicate constant of mode VNx16BI. Return the
3605 widest predicate element size it can have (that is, the largest size
3606 for which each element would still be 0 or 1). */
3608 unsigned int
3609 aarch64_widest_sve_pred_elt_size (rtx_vector_builder &builder)
3611 /* Start with the most optimistic assumption: that we only need
3612 one bit per pattern. This is what we will use if only the first
3613 bit in each pattern is ever set. */
3614 unsigned int mask = GET_MODE_SIZE (DImode);
3615 mask |= builder.npatterns ();
3617 /* Look for set bits. */
3618 unsigned int nelts = builder.encoded_nelts ();
3619 for (unsigned int i = 1; i < nelts; ++i)
3620 if (INTVAL (builder.elt (i)) != 0)
3622 if (i & 1)
3623 return 1;
3624 mask |= i;
3626 return mask & -mask;
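/* Editorial aside: the loop above ORs together the indexes of all set bits
   (plus the pattern count and the 8-byte cap), and "mask & -mask" then
   extracts the largest power of two that divides all of them.  A minimal
   standalone sketch (not part of GCC); the example_* helper is
   hypothetical.  */

constexpr unsigned int
example_lowest_set_bit (unsigned int mask)
{
  return mask & -mask;
}

/* With 8 patterns and set bits only at indexes 0 and 4, the accumulated
   mask is 8 | 8 | 4 == 12, whose lowest set bit is 4: the predicate can be
   treated as having 4-byte elements.  An odd set-bit index would instead
   hit the early "return 1" above.  */
static_assert (example_lowest_set_bit (8 | 8 | 4) == 4, "4-byte elements");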
3629 /* If VNx16BImode rtx X is a canonical PTRUE for a predicate mode,
3630 return that predicate mode, otherwise return opt_machine_mode (). */
3632 opt_machine_mode
3633 aarch64_ptrue_all_mode (rtx x)
3635 gcc_assert (GET_MODE (x) == VNx16BImode);
3636 if (!CONST_VECTOR_P (x)
3637 || !CONST_VECTOR_DUPLICATE_P (x)
3638 || !CONST_INT_P (CONST_VECTOR_ENCODED_ELT (x, 0))
3639 || INTVAL (CONST_VECTOR_ENCODED_ELT (x, 0)) == 0)
3640 return opt_machine_mode ();
3642 unsigned int nelts = const_vector_encoded_nelts (x);
3643 for (unsigned int i = 1; i < nelts; ++i)
3644 if (CONST_VECTOR_ENCODED_ELT (x, i) != const0_rtx)
3645 return opt_machine_mode ();
3647 return aarch64_sve_pred_mode (nelts);
3650 /* BUILDER is a predicate constant of mode VNx16BI. Consider the value
3651 that the constant would have with predicate element size ELT_SIZE
3652 (ignoring the upper bits in each element) and return:
3654 * -1 if all bits are set
3655 * N if the predicate has N leading set bits followed by all clear bits
3656 * 0 if the predicate does not have any of these forms. */
3659 aarch64_partial_ptrue_length (rtx_vector_builder &builder,
3660 unsigned int elt_size)
3662 /* If nelts_per_pattern is 3, we have set bits followed by clear bits
3663 followed by set bits. */
3664 if (builder.nelts_per_pattern () == 3)
3665 return 0;
3667 /* Skip over leading set bits. */
3668 unsigned int nelts = builder.encoded_nelts ();
3669 unsigned int i = 0;
3670 for (; i < nelts; i += elt_size)
3671 if (INTVAL (builder.elt (i)) == 0)
3672 break;
3673 unsigned int vl = i / elt_size;
3675 /* Check for the all-true case. */
3676 if (i == nelts)
3677 return -1;
3679 /* If nelts_per_pattern is 1, then either VL is zero, or we have a
3680 repeating pattern of set bits followed by clear bits. */
3681 if (builder.nelts_per_pattern () != 2)
3682 return 0;
3684 /* We have a "foreground" value and a duplicated "background" value.
3685 If the background might repeat and the last set bit belongs to it,
3686 we might have set bits followed by clear bits followed by set bits. */
3687 if (i > builder.npatterns () && maybe_ne (nelts, builder.full_nelts ()))
3688 return 0;
3690 /* Make sure that the rest are all clear. */
3691 for (; i < nelts; i += elt_size)
3692 if (INTVAL (builder.elt (i)) != 0)
3693 return 0;
3695 return vl;
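/* Illustrative example: a VNx16BI constant with the first 8 bits set and
the rest clear is encoded with npatterns () == 8 and
nelts_per_pattern () == 2 (a foreground of ones over a zero background).
With ELT_SIZE == 1 the first loop stops at i == 8, giving a length of 8;
with ELT_SIZE == 2 (a .h view) only indices 0, 2, 4 and 6 are set,
giving a length of 4.  */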
3698 /* See if there is an svpattern that encodes an SVE predicate of mode
3699 PRED_MODE in which the first VL bits are set and the rest are clear.
3700 Return the pattern if so, otherwise return AARCH64_NUM_SVPATTERNS.
3701 A VL of -1 indicates an all-true vector. */
3703 aarch64_svpattern
3704 aarch64_svpattern_for_vl (machine_mode pred_mode, int vl)
3706 if (vl < 0)
3707 return AARCH64_SV_ALL;
3709 if (maybe_gt (vl, GET_MODE_NUNITS (pred_mode)))
3710 return AARCH64_NUM_SVPATTERNS;
3712 if (vl >= 1 && vl <= 8)
3713 return aarch64_svpattern (AARCH64_SV_VL1 + (vl - 1));
3715 if (vl >= 16 && vl <= 256 && pow2p_hwi (vl))
3716 return aarch64_svpattern (AARCH64_SV_VL16 + (exact_log2 (vl) - 4));
3718 int max_vl;
3719 if (GET_MODE_NUNITS (pred_mode).is_constant (&max_vl))
3721 if (vl == (max_vl / 3) * 3)
3722 return AARCH64_SV_MUL3;
3723 /* These would only trigger for non-power-of-2 lengths. */
3724 if (vl == (max_vl & -4))
3725 return AARCH64_SV_MUL4;
3726 if (vl == (1 << floor_log2 (max_vl)))
3727 return AARCH64_SV_POW2;
3728 if (vl == max_vl)
3729 return AARCH64_SV_ALL;
3731 return AARCH64_NUM_SVPATTERNS;
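/* As an illustrative example, assume -msve-vector-bits=512 and
PRED_MODE == VNx16BImode, so that MAX_VL is 64: VL 7 maps to
AARCH64_SV_VL7, VL 32 to AARCH64_SV_VL32, VL 63 to AARCH64_SV_MUL3,
while VL 42 has no single svpattern and yields AARCH64_NUM_SVPATTERNS.  */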
3734 /* Return a VNx16BImode constant in which every sequence of ELT_SIZE
3735 bits has the lowest bit set and the upper bits clear. This is the
3736 VNx16BImode equivalent of a PTRUE for controlling elements of
3737 ELT_SIZE bytes. However, because the constant is VNx16BImode,
3738 all bits are significant, even the upper zeros. */
3741 aarch64_ptrue_all (unsigned int elt_size)
3743 rtx_vector_builder builder (VNx16BImode, elt_size, 1);
3744 builder.quick_push (const1_rtx);
3745 for (unsigned int i = 1; i < elt_size; ++i)
3746 builder.quick_push (const0_rtx);
3747 return builder.build ();
3750 /* Return an all-true predicate register of mode MODE. */
3753 aarch64_ptrue_reg (machine_mode mode)
3755 gcc_assert (aarch64_sve_pred_mode_p (mode));
3756 rtx reg = force_reg (VNx16BImode, CONSTM1_RTX (VNx16BImode));
3757 return gen_lowpart (mode, reg);
3760 /* Return an all-true (restricted to the leading VL bits) predicate register of
3761 mode MODE. */
3764 aarch64_ptrue_reg (machine_mode mode, unsigned int vl)
3766 gcc_assert (aarch64_sve_pred_mode_p (mode));
3768 rtx_vector_builder builder (VNx16BImode, vl, 2);
3770 for (unsigned i = 0; i < vl; i++)
3771 builder.quick_push (CONST1_RTX (BImode));
3773 for (unsigned i = 0; i < vl; i++)
3774 builder.quick_push (CONST0_RTX (BImode));
3776 rtx const_vec = builder.build ();
3777 rtx reg = force_reg (VNx16BImode, const_vec);
3778 return gen_lowpart (mode, reg);
3781 /* Return a register of mode PRED_MODE for controlling data of mode DATA_MODE.
3783 DATA_MODE can be a scalar, an Advanced SIMD vector, or an SVE vector.
3784 If it's an N-byte scalar or an Advanced SIMD vector, the first N bits
3785 of the predicate will be active and the rest will be inactive.
3786 If DATA_MODE is an SVE mode, every bit of the predicate will be active. */
3788 aarch64_ptrue_reg (machine_mode pred_mode, machine_mode data_mode)
3790 if (aarch64_sve_mode_p (data_mode))
3791 return aarch64_ptrue_reg (pred_mode);
3793 auto size = GET_MODE_SIZE (data_mode).to_constant ();
3794 return aarch64_ptrue_reg (pred_mode, size);
3797 /* Return an all-false predicate register of mode MODE. */
3800 aarch64_pfalse_reg (machine_mode mode)
3802 gcc_assert (aarch64_sve_pred_mode_p (mode));
3803 rtx reg = force_reg (VNx16BImode, CONST0_RTX (VNx16BImode));
3804 return gen_lowpart (mode, reg);
3807 /* PRED1[0] is a PTEST predicate and PRED1[1] is an aarch64_sve_ptrue_flag
3808 for it. PRED2[0] is the predicate for the instruction whose result
3809 is tested by the PTEST and PRED2[1] is again an aarch64_sve_ptrue_flag
3810 for it. Return true if we can prove that the two predicates are
3811 equivalent for PTEST purposes; that is, if we can replace PRED2[0]
3812 with PRED1[0] without changing behavior. */
3814 bool
3815 aarch64_sve_same_pred_for_ptest_p (rtx *pred1, rtx *pred2)
3817 machine_mode mode = GET_MODE (pred1[0]);
3818 gcc_assert (aarch64_sve_pred_mode_p (mode)
3819 && mode == GET_MODE (pred2[0])
3820 && aarch64_sve_ptrue_flag (pred1[1], SImode)
3821 && aarch64_sve_ptrue_flag (pred2[1], SImode));
3823 bool ptrue1_p = (pred1[0] == CONSTM1_RTX (mode)
3824 || INTVAL (pred1[1]) == SVE_KNOWN_PTRUE);
3825 bool ptrue2_p = (pred2[0] == CONSTM1_RTX (mode)
3826 || INTVAL (pred2[1]) == SVE_KNOWN_PTRUE);
3827 return (ptrue1_p && ptrue2_p) || rtx_equal_p (pred1[0], pred2[0]);
3830 /* Emit a comparison CMP between OP1 and OP2, both of which have mode
3831 DATA_MODE, and return the result in a predicate of mode PRED_MODE.
3832 Use TARGET as the target register if nonnull and convenient. */
3834 static rtx
3835 aarch64_sve_emit_int_cmp (rtx target, machine_mode pred_mode, rtx_code cmp,
3836 machine_mode data_mode, rtx op1, rtx op2)
3838 insn_code icode = code_for_aarch64_pred_cmp (cmp, data_mode);
3839 expand_operand ops[5];
3840 create_output_operand (&ops[0], target, pred_mode);
3841 create_input_operand (&ops[1], CONSTM1_RTX (pred_mode), pred_mode);
3842 create_integer_operand (&ops[2], SVE_KNOWN_PTRUE);
3843 create_input_operand (&ops[3], op1, data_mode);
3844 create_input_operand (&ops[4], op2, data_mode);
3845 expand_insn (icode, 5, ops);
3846 return ops[0].value;
3849 /* Use a comparison to convert integer vector SRC into MODE, which is
3850 the corresponding SVE predicate mode. Use TARGET for the result
3851 if it's nonnull and convenient. */
3854 aarch64_convert_sve_data_to_pred (rtx target, machine_mode mode, rtx src)
3856 machine_mode src_mode = GET_MODE (src);
3857 return aarch64_sve_emit_int_cmp (target, mode, NE, src_mode,
3858 src, CONST0_RTX (src_mode));
3861 /* Return the assembly token for svprfop value PRFOP. */
3863 static const char *
3864 svprfop_token (enum aarch64_svprfop prfop)
3866 switch (prfop)
3868 #define CASE(UPPER, LOWER, VALUE) case AARCH64_SV_##UPPER: return #LOWER;
3869 AARCH64_FOR_SVPRFOP (CASE)
3870 #undef CASE
3871 case AARCH64_NUM_SVPRFOPS:
3872 break;
3874 gcc_unreachable ();
3877 /* Return the assembly string for an SVE prefetch operation with
3878 mnemonic MNEMONIC, given that PRFOP_RTX is the prefetch operation
3879 and that SUFFIX is the format for the remaining operands. */
3881 char *
3882 aarch64_output_sve_prefetch (const char *mnemonic, rtx prfop_rtx,
3883 const char *suffix)
3885 static char buffer[128];
3886 aarch64_svprfop prfop = (aarch64_svprfop) INTVAL (prfop_rtx);
3887 unsigned int written = snprintf (buffer, sizeof (buffer), "%s\t%s, %s",
3888 mnemonic, svprfop_token (prfop), suffix);
3889 gcc_assert (written < sizeof (buffer));
3890 return buffer;
3893 /* Check whether we can calculate the number of elements in PATTERN
3894 at compile time, given that there are NELTS_PER_VQ elements per
3895 128-bit block. Return the value if so, otherwise return -1. */
3897 HOST_WIDE_INT
3898 aarch64_fold_sve_cnt_pat (aarch64_svpattern pattern, unsigned int nelts_per_vq)
3900 unsigned int vl, const_vg;
3901 if (pattern >= AARCH64_SV_VL1 && pattern <= AARCH64_SV_VL8)
3902 vl = 1 + (pattern - AARCH64_SV_VL1);
3903 else if (pattern >= AARCH64_SV_VL16 && pattern <= AARCH64_SV_VL256)
3904 vl = 16 << (pattern - AARCH64_SV_VL16);
3905 else if (aarch64_sve_vg.is_constant (&const_vg))
3907 /* There are two vector granules per quadword. */
3908 unsigned int nelts = (const_vg / 2) * nelts_per_vq;
3909 switch (pattern)
3911 case AARCH64_SV_POW2: return 1 << floor_log2 (nelts);
3912 case AARCH64_SV_MUL4: return nelts & -4;
3913 case AARCH64_SV_MUL3: return (nelts / 3) * 3;
3914 case AARCH64_SV_ALL: return nelts;
3915 default: gcc_unreachable ();
3918 else
3919 return -1;
3921 /* There are two vector granules per quadword. */
3922 poly_uint64 nelts_all = exact_div (aarch64_sve_vg, 2) * nelts_per_vq;
3923 if (known_le (vl, nelts_all))
3924 return vl;
3926 /* Requesting more elements than are available results in a PFALSE. */
3927 if (known_gt (vl, nelts_all))
3928 return 0;
3930 return -1;
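/* Two illustrative cases: with -msve-vector-bits=256 (aarch64_sve_vg == 4),
AARCH64_SV_ALL with NELTS_PER_VQ == 2 (.d elements) folds to
(4 / 2) * 2 == 4.  With a variable vector length, AARCH64_SV_VL16 with
NELTS_PER_VQ == 2 cannot be folded, since the vector may or may not hold
16 .d elements, so the result is -1.  */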
3933 /* Return true if a single CNT[BHWD] instruction can multiply FACTOR
3934 by the number of 128-bit quadwords in an SVE vector. */
3936 static bool
3937 aarch64_sve_cnt_factor_p (HOST_WIDE_INT factor)
3939 /* The coefficient must be [1, 16] * {2, 4, 8, 16}. */
3940 return (IN_RANGE (factor, 2, 16 * 16)
3941 && (factor & 1) == 0
3942 && factor <= 16 * (factor & -factor));
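/* For example, FACTOR 6 (2 * 3) is accepted and corresponds to CNTD with
MUL #3, FACTOR 48 (16 * 3) to CNTB with MUL #3, whereas FACTOR 34 (2 * 17)
is rejected because the required multiplier 17 lies outside [1, 16].  */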
3945 /* Return true if we can move VALUE into a register using a single
3946 CNT[BHWD] instruction. */
3948 static bool
3949 aarch64_sve_cnt_immediate_p (poly_int64 value)
3951 HOST_WIDE_INT factor = value.coeffs[0];
3952 return value.coeffs[1] == factor && aarch64_sve_cnt_factor_p (factor);
3955 /* Likewise for rtx X. */
3957 bool
3958 aarch64_sve_cnt_immediate_p (rtx x)
3960 poly_int64 value;
3961 return poly_int_rtx_p (x, &value) && aarch64_sve_cnt_immediate_p (value);
3964 /* Return the asm string for an instruction with a CNT-like vector size
3965 operand (a vector pattern followed by a multiplier in the range [1, 16]).
3966 PREFIX is the mnemonic without the size suffix and OPERANDS is the
3967 first part of the operands template (the part that comes before the
3968 vector size itself). PATTERN is the pattern to use. FACTOR is the
3969 number of quadwords. NELTS_PER_VQ, if nonzero, is the number of elements
3970 in each quadword. If it is zero, we can use any element size. */
3972 static char *
3973 aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
3974 aarch64_svpattern pattern,
3975 unsigned int factor,
3976 unsigned int nelts_per_vq)
3978 static char buffer[sizeof ("sqincd\t%x0, %w0, vl256, mul #16")];
3980 if (nelts_per_vq == 0)
3981 /* There is some overlap in the ranges of the four CNT instructions.
3982 Here we always use the smallest possible element size, so that the
3983 multiplier is 1 wherever possible. */
3984 nelts_per_vq = factor & -factor;
3985 int shift = std::min (exact_log2 (nelts_per_vq), 4);
3986 gcc_assert (IN_RANGE (shift, 1, 4));
3987 char suffix = "dwhb"[shift - 1];
3989 factor >>= shift;
3990 unsigned int written;
3991 if (pattern == AARCH64_SV_ALL && factor == 1)
3992 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s",
3993 prefix, suffix, operands);
3994 else if (factor == 1)
3995 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s, %s",
3996 prefix, suffix, operands, svpattern_token (pattern));
3997 else
3998 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s, %s, mul #%d",
3999 prefix, suffix, operands, svpattern_token (pattern),
4000 factor);
4001 gcc_assert (written < sizeof (buffer));
4002 return buffer;
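/* Illustrative outputs: PREFIX "cnt", OPERANDS "%x0", AARCH64_SV_ALL,
FACTOR 2 and NELTS_PER_VQ 0 produce "cntd\t%x0", while FACTOR 32 produces
"cntb\t%x0, all, mul #2" (the byte form gives the smallest multiplier).  */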
4005 /* Return the asm string for an instruction with a CNT-like vector size
4006 operand (a vector pattern followed by a multiplier in the range [1, 16]).
4007 PREFIX is the mnemonic without the size suffix and OPERANDS is the
4008 first part of the operands template (the part that comes before the
4009 vector size itself). X is the value of the vector size operand,
4010 as a polynomial integer rtx; we need to convert this into an "all"
4011 pattern with a multiplier. */
4013 char *
4014 aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
4015 rtx x)
4017 poly_int64 value = rtx_to_poly_int64 (x);
4018 gcc_assert (aarch64_sve_cnt_immediate_p (value));
4019 return aarch64_output_sve_cnt_immediate (prefix, operands, AARCH64_SV_ALL,
4020 value.coeffs[1], 0);
4023 /* Return the asm string for an instruction with a CNT-like vector size
4024 operand (a vector pattern followed by a multiplier in the range [1, 16]).
4025 PREFIX is the mnemonic without the size suffix and OPERANDS is the
4026 first part of the operands template (the part that comes before the
4027 vector size itself). CNT_PAT[0..2] are the operands of the
4028 UNSPEC_SVE_CNT_PAT; see aarch64_sve_cnt_pat for details. */
4030 char *
4031 aarch64_output_sve_cnt_pat_immediate (const char *prefix,
4032 const char *operands, rtx *cnt_pat)
4034 aarch64_svpattern pattern = (aarch64_svpattern) INTVAL (cnt_pat[0]);
4035 unsigned int nelts_per_vq = INTVAL (cnt_pat[1]);
4036 unsigned int factor = INTVAL (cnt_pat[2]) * nelts_per_vq;
4037 return aarch64_output_sve_cnt_immediate (prefix, operands, pattern,
4038 factor, nelts_per_vq);
4041 /* Return true if we can add X using a single SVE INC or DEC instruction. */
4043 bool
4044 aarch64_sve_scalar_inc_dec_immediate_p (rtx x)
4046 poly_int64 value;
4047 return (poly_int_rtx_p (x, &value)
4048 && (aarch64_sve_cnt_immediate_p (value)
4049 || aarch64_sve_cnt_immediate_p (-value)));
4052 /* Return the asm string for adding SVE INC/DEC immediate OFFSET to
4053 operand 0. */
4055 char *
4056 aarch64_output_sve_scalar_inc_dec (rtx offset)
4058 poly_int64 offset_value = rtx_to_poly_int64 (offset);
4059 gcc_assert (offset_value.coeffs[0] == offset_value.coeffs[1]);
4060 if (offset_value.coeffs[1] > 0)
4061 return aarch64_output_sve_cnt_immediate ("inc", "%x0", AARCH64_SV_ALL,
4062 offset_value.coeffs[1], 0);
4063 else
4064 return aarch64_output_sve_cnt_immediate ("dec", "%x0", AARCH64_SV_ALL,
4065 -offset_value.coeffs[1], 0);
4068 /* Return true if a single RDVL instruction can multiply FACTOR by the
4069 number of 128-bit quadwords in an SVE vector. This is also the
4070 range of ADDVL. */
4072 static bool
4073 aarch64_sve_rdvl_addvl_factor_p (HOST_WIDE_INT factor)
4075 return (multiple_p (factor, 16)
4076 && IN_RANGE (factor, -32 * 16, 31 * 16));
4079 /* Return true if ADDPL can be used to add FACTOR multiplied by the number
4080 of quadwords in an SVE vector. */
4082 static bool
4083 aarch64_sve_addpl_factor_p (HOST_WIDE_INT factor)
4085 return (multiple_p (factor, 2)
4086 && IN_RANGE (factor, -32 * 2, 31 * 2));
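/* Worked examples of the two range checks above: FACTOR 32 can be handled
by RDVL/ADDVL #2 (32 == 2 * 16), FACTOR 6 by ADDPL #3 (6 == 3 * 2), and
FACTOR 8 only by ADDPL #4, since it is not a multiple of 16.  */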
4089 /* Return true if we can move VALUE into a register using a single
4090 RDVL instruction. */
4092 static bool
4093 aarch64_sve_rdvl_immediate_p (poly_int64 value)
4095 HOST_WIDE_INT factor = value.coeffs[0];
4096 return value.coeffs[1] == factor && aarch64_sve_rdvl_addvl_factor_p (factor);
4099 /* Likewise for rtx X. */
4101 bool
4102 aarch64_sve_rdvl_immediate_p (rtx x)
4104 poly_int64 value;
4105 return poly_int_rtx_p (x, &value) && aarch64_sve_rdvl_immediate_p (value);
4108 /* Return the asm string for moving RDVL immediate OFFSET into register
4109 operand 0. */
4111 char *
4112 aarch64_output_sve_rdvl (rtx offset)
4114 static char buffer[sizeof ("rdvl\t%x0, #-") + 3 * sizeof (int)];
4115 poly_int64 offset_value = rtx_to_poly_int64 (offset);
4116 gcc_assert (aarch64_sve_rdvl_immediate_p (offset_value));
4118 int factor = offset_value.coeffs[1];
4119 snprintf (buffer, sizeof (buffer), "rdvl\t%%x0, #%d", factor / 16);
4120 return buffer;
4123 /* Return true if we can add VALUE to a register using a single ADDVL
4124 or ADDPL instruction. */
4126 static bool
4127 aarch64_sve_addvl_addpl_immediate_p (poly_int64 value)
4129 HOST_WIDE_INT factor = value.coeffs[0];
4130 if (factor == 0 || value.coeffs[1] != factor)
4131 return false;
4132 return (aarch64_sve_rdvl_addvl_factor_p (factor)
4133 || aarch64_sve_addpl_factor_p (factor));
4136 /* Likewise for rtx X. */
4138 bool
4139 aarch64_sve_addvl_addpl_immediate_p (rtx x)
4141 poly_int64 value;
4142 return (poly_int_rtx_p (x, &value)
4143 && aarch64_sve_addvl_addpl_immediate_p (value));
4146 /* Return the asm string for adding ADDVL or ADDPL immediate OFFSET
4147 to operand 1 and storing the result in operand 0. */
4149 char *
4150 aarch64_output_sve_addvl_addpl (rtx offset)
4152 static char buffer[sizeof ("addpl\t%x0, %x1, #-") + 3 * sizeof (int)];
4153 poly_int64 offset_value = rtx_to_poly_int64 (offset);
4154 gcc_assert (aarch64_sve_addvl_addpl_immediate_p (offset_value));
4156 int factor = offset_value.coeffs[1];
4157 if ((factor & 15) == 0)
4158 snprintf (buffer, sizeof (buffer), "addvl\t%%x0, %%x1, #%d", factor / 16);
4159 else
4160 snprintf (buffer, sizeof (buffer), "addpl\t%%x0, %%x1, #%d", factor / 2);
4161 return buffer;
4164 /* Return true if X is a valid immediate for an SVE vector INC or DEC
4165 instruction. If it is, store the number of elements in each vector
4166 quadword in *NELTS_PER_VQ_OUT (if nonnull) and store the multiplication
4167 factor in *FACTOR_OUT (if nonnull). */
4169 bool
4170 aarch64_sve_vector_inc_dec_immediate_p (rtx x, int *factor_out,
4171 unsigned int *nelts_per_vq_out)
4173 rtx elt;
4174 poly_int64 value;
4176 if (!const_vec_duplicate_p (x, &elt)
4177 || !poly_int_rtx_p (elt, &value))
4178 return false;
4180 unsigned int nelts_per_vq = 128 / GET_MODE_UNIT_BITSIZE (GET_MODE (x));
4181 if (nelts_per_vq != 8 && nelts_per_vq != 4 && nelts_per_vq != 2)
4182 /* There's no vector INCB. */
4183 return false;
4185 HOST_WIDE_INT factor = value.coeffs[0];
4186 if (value.coeffs[1] != factor)
4187 return false;
4189 /* The coefficient must be [1, 16] * NELTS_PER_VQ. */
4190 if ((factor % nelts_per_vq) != 0
4191 || !IN_RANGE (abs (factor), nelts_per_vq, 16 * nelts_per_vq))
4192 return false;
4194 if (factor_out)
4195 *factor_out = factor;
4196 if (nelts_per_vq_out)
4197 *nelts_per_vq_out = nelts_per_vq;
4198 return true;
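/* Illustrative example: a VNx4SImode constant that duplicates the
poly_int64 value (12, 12) has NELTS_PER_VQ == 4 (.s elements) and
FACTOR == 12 == 3 * 4, so it is accepted and is later emitted as INCW
(or DECW for the negated value) with MUL #3.  */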
4201 /* Return true if X is a valid immediate for an SVE vector INC or DEC
4202 instruction. */
4204 bool
4205 aarch64_sve_vector_inc_dec_immediate_p (rtx x)
4207 return aarch64_sve_vector_inc_dec_immediate_p (x, NULL, NULL);
4210 /* Return the asm template for an SVE vector INC or DEC instruction.
4211 OPERANDS gives the operands before the vector count and X is the
4212 value of the vector count operand itself. */
4214 char *
4215 aarch64_output_sve_vector_inc_dec (const char *operands, rtx x)
4217 int factor;
4218 unsigned int nelts_per_vq;
4219 if (!aarch64_sve_vector_inc_dec_immediate_p (x, &factor, &nelts_per_vq))
4220 gcc_unreachable ();
4221 if (factor < 0)
4222 return aarch64_output_sve_cnt_immediate ("dec", operands, AARCH64_SV_ALL,
4223 -factor, nelts_per_vq);
4224 else
4225 return aarch64_output_sve_cnt_immediate ("inc", operands, AARCH64_SV_ALL,
4226 factor, nelts_per_vq);
4229 /* Return a constant that represents FACTOR multiplied by the
4230 number of 128-bit quadwords in an SME vector. ISA_MODE is the
4231 ISA mode in which the calculation is being performed. */
4234 aarch64_sme_vq_immediate (machine_mode mode, HOST_WIDE_INT factor,
4235 aarch64_isa_mode isa_mode)
4237 gcc_assert (aarch64_sve_rdvl_addvl_factor_p (factor));
4238 if (isa_mode & AARCH64_ISA_MODE_SM_ON)
4239 /* We're in streaming mode, so we can use normal poly-int values. */
4240 return gen_int_mode ({ factor, factor }, mode);
4242 rtvec vec = gen_rtvec (1, gen_int_mode (factor, SImode));
4243 rtx unspec = gen_rtx_UNSPEC (mode, vec, UNSPEC_SME_VQ);
4244 return gen_rtx_CONST (mode, unspec);
4247 /* Return true if X is a constant that represents some number Y
4248 multiplied by the number of quadwords in an SME vector. Store this Y
4249 in *FACTOR if so. */
4251 static bool
4252 aarch64_sme_vq_unspec_p (const_rtx x, HOST_WIDE_INT *factor)
4254 if (!TARGET_SME || GET_CODE (x) != CONST)
4255 return false;
4257 x = XEXP (x, 0);
4258 if (GET_CODE (x) != UNSPEC
4259 || XINT (x, 1) != UNSPEC_SME_VQ
4260 || XVECLEN (x, 0) != 1)
4261 return false;
4263 x = XVECEXP (x, 0, 0);
4264 if (!CONST_INT_P (x))
4265 return false;
4267 *factor = INTVAL (x);
4268 return true;
4271 /* Return true if X is a constant that represents some number Y
4272 multiplied by the number of quadwords in an SME vector, and if
4273 that Y is in the range of RDSVL. */
4275 bool
4276 aarch64_rdsvl_immediate_p (const_rtx x)
4278 HOST_WIDE_INT factor;
4279 return (aarch64_sme_vq_unspec_p (x, &factor)
4280 && aarch64_sve_rdvl_addvl_factor_p (factor));
4283 /* Return the asm string for an RDSVL instruction that calculates X,
4284 which is a constant that satisfies aarch64_rdsvl_immediate_p. */
4286 char *
4287 aarch64_output_rdsvl (const_rtx x)
4289 gcc_assert (aarch64_rdsvl_immediate_p (x));
4290 static char buffer[sizeof ("rdsvl\t%x0, #-") + 3 * sizeof (int)];
4291 x = XVECEXP (XEXP (x, 0), 0, 0);
4292 snprintf (buffer, sizeof (buffer), "rdsvl\t%%x0, #%d",
4293 (int) INTVAL (x) / 16);
4294 return buffer;
4297 /* Return true if X is a constant that can be added using ADDSVL or ADDSPL. */
4299 bool
4300 aarch64_addsvl_addspl_immediate_p (const_rtx x)
4302 HOST_WIDE_INT factor;
4303 return (aarch64_sme_vq_unspec_p (x, &factor)
4304 && (aarch64_sve_rdvl_addvl_factor_p (factor)
4305 || aarch64_sve_addpl_factor_p (factor)));
4308 /* X is a constant that satisfies aarch64_addsvl_addspl_immediate_p.
4309 Return the asm string for the associated instruction. */
4311 char *
4312 aarch64_output_addsvl_addspl (rtx x)
4314 static char buffer[sizeof ("addspl\t%x0, %x1, #-") + 3 * sizeof (int)];
4315 HOST_WIDE_INT factor;
4316 if (!aarch64_sme_vq_unspec_p (x, &factor))
4317 gcc_unreachable ();
4318 if (aarch64_sve_rdvl_addvl_factor_p (factor))
4319 snprintf (buffer, sizeof (buffer), "addsvl\t%%x0, %%x1, #%d",
4320 (int) factor / 16);
4321 else if (aarch64_sve_addpl_factor_p (factor))
4322 snprintf (buffer, sizeof (buffer), "addspl\t%%x0, %%x1, #%d",
4323 (int) factor / 2);
4324 else
4325 gcc_unreachable ();
4326 return buffer;
4329 /* Multipliers for repeating bitmasks of width 32, 16, 8, 4, and 2. */
4331 static const unsigned HOST_WIDE_INT bitmask_imm_mul[] =
4333 0x0000000100000001ull,
4334 0x0001000100010001ull,
4335 0x0101010101010101ull,
4336 0x1111111111111111ull,
4337 0x5555555555555555ull,
4342 /* Return true if 64-bit VAL is a valid bitmask immediate. */
4343 static bool
4344 aarch64_bitmask_imm (unsigned HOST_WIDE_INT val)
4346 unsigned HOST_WIDE_INT tmp, mask, first_one, next_one;
4347 int bits;
4349 /* Check for a single sequence of one bits and return quickly if so.
4350 The special cases of all ones and all zeroes return false. */
4351 tmp = val + (val & -val);
4353 if (tmp == (tmp & -tmp))
4354 return (val + 1) > 1;
4356 /* Invert if the immediate doesn't start with a zero bit - this means we
4357 only need to search for sequences of one bits. */
4358 if (val & 1)
4359 val = ~val;
4361 /* Find the first set bit and set tmp to val with the first sequence of one
4362 bits removed. Return success if there is a single sequence of ones. */
4363 first_one = val & -val;
4364 tmp = val & (val + first_one);
4366 if (tmp == 0)
4367 return true;
4369 /* Find the next set bit and compute the difference in bit position. */
4370 next_one = tmp & -tmp;
4371 bits = clz_hwi (first_one) - clz_hwi (next_one);
4372 mask = val ^ tmp;
4374 /* Check the bit position difference is a power of 2, and that the first
4375 sequence of one bits fits within 'bits' bits. */
4376 if ((mask >> bits) != 0 || bits != (bits & -bits))
4377 return false;
4379 /* Check the sequence of one bits is repeated 64/bits times. */
4380 return val == mask * bitmask_imm_mul[__builtin_clz (bits) - 26];
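/* Examples: 0x0003ffc0 (a single contiguous run of ones),
0x00ff00ff00ff00ff (an 8-bit run repeated every 16 bits) and
0x5555555555555555 (a 1-bit run repeated every 2 bits) are all valid
bitmask immediates, whereas 0, ~0 and 0x1234567812345678 are not.  */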
4384 /* Return true if VAL is a valid bitmask immediate for MODE. */
4385 bool
4386 aarch64_bitmask_imm (unsigned HOST_WIDE_INT val, machine_mode mode)
4388 if (mode == DImode)
4389 return aarch64_bitmask_imm (val);
4391 if (mode == SImode)
4392 return aarch64_bitmask_imm ((val & 0xffffffff) | (val << 32));
4394 /* Replicate small immediates to fit 64 bits. */
4395 int size = GET_MODE_UNIT_PRECISION (mode);
4396 val &= (HOST_WIDE_INT_1U << size) - 1;
4397 val *= bitmask_imm_mul[__builtin_clz (size) - 26];
4399 return aarch64_bitmask_imm (val);
4403 /* Return true if the immediate VAL can be a bitmask immediate
4404 by changing the given MASK bits in VAL to zeroes, ones or bits
4405 from the other half of VAL. Return the new immediate in VAL2. */
4406 static inline bool
4407 aarch64_check_bitmask (unsigned HOST_WIDE_INT val,
4408 unsigned HOST_WIDE_INT &val2,
4409 unsigned HOST_WIDE_INT mask)
4411 val2 = val & ~mask;
4412 if (val2 != val && aarch64_bitmask_imm (val2))
4413 return true;
4414 val2 = val | mask;
4415 if (val2 != val && aarch64_bitmask_imm (val2))
4416 return true;
4417 val = val & ~mask;
4418 val2 = val | (((val >> 32) | (val << 32)) & mask);
4419 if (val2 != val && aarch64_bitmask_imm (val2))
4420 return true;
4421 val2 = val | (((val >> 16) | (val << 48)) & mask);
4422 if (val2 != val && aarch64_bitmask_imm (val2))
4423 return true;
4424 return false;
4428 /* Return true if VAL is a valid MOVZ immediate. */
4429 static inline bool
4430 aarch64_is_movz (unsigned HOST_WIDE_INT val)
4432 return (val >> (ctz_hwi (val) & 48)) < 65536;
4436 /* Return true if immediate VAL can be created by a 64-bit MOVI/MOVN/MOVZ. */
4437 bool
4438 aarch64_is_mov_xn_imm (unsigned HOST_WIDE_INT val)
4440 return aarch64_is_movz (val) || aarch64_is_movz (~val)
4441 || aarch64_bitmask_imm (val);
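/* For example, 0x0000123400000000 can be built with a single MOVZ (one
16-bit chunk at a 16-bit-aligned position), 0xffffffffffff1234 with a
single MOVN (its complement 0xedcb fits in 16 bits), and
0x00ff00ff00ff00ff with the bitmask-immediate form of MOV.  */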
4445 /* Return true if VAL is an immediate that can be created by a single
4446 MOV instruction. */
4447 bool
4448 aarch64_move_imm (unsigned HOST_WIDE_INT val, machine_mode mode)
4450 gcc_assert (mode == SImode || mode == DImode);
4452 if (val < 65536)
4453 return true;
4455 unsigned HOST_WIDE_INT mask =
4456 (val >> 32) == 0 || mode == SImode ? 0xffffffff : HOST_WIDE_INT_M1U;
4458 if (aarch64_is_movz (val & mask) || aarch64_is_movz (~val & mask))
4459 return true;
4461 val = (val & mask) | ((val << 32) & ~mask);
4462 return aarch64_bitmask_imm (val);
4466 static int
4467 aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
4468 machine_mode mode)
4470 int i;
4471 unsigned HOST_WIDE_INT val, val2, val3, mask;
4472 int one_match, zero_match;
4473 int num_insns;
4475 gcc_assert (mode == SImode || mode == DImode);
4477 val = INTVAL (imm);
4479 if (aarch64_move_imm (val, mode))
4481 if (generate)
4482 emit_insn (gen_rtx_SET (dest, imm));
4483 return 1;
4486 if ((val >> 32) == 0 || mode == SImode)
4488 if (generate)
4490 emit_insn (gen_rtx_SET (dest, GEN_INT (val & 0xffff)));
4491 if (mode == SImode)
4492 emit_insn (gen_insv_immsi (dest, GEN_INT (16),
4493 GEN_INT ((val >> 16) & 0xffff)));
4494 else
4495 emit_insn (gen_insv_immdi (dest, GEN_INT (16),
4496 GEN_INT ((val >> 16) & 0xffff)));
4498 return 2;
4501 /* Remaining cases are all for DImode. */
4503 mask = 0xffff;
4504 zero_match = ((val & mask) == 0) + ((val & (mask << 16)) == 0) +
4505 ((val & (mask << 32)) == 0) + ((val & (mask << 48)) == 0);
4506 one_match = ((~val & mask) == 0) + ((~val & (mask << 16)) == 0) +
4507 ((~val & (mask << 32)) == 0) + ((~val & (mask << 48)) == 0);
4509 /* Try a bitmask immediate and a movk to generate the immediate
4510 in 2 instructions. */
4512 if (zero_match < 2 && one_match < 2)
4514 for (i = 0; i < 64; i += 16)
4516 if (aarch64_check_bitmask (val, val2, mask << i))
4517 break;
4519 val2 = val & ~(mask << i);
4520 if ((val2 >> 32) == 0 && aarch64_move_imm (val2, DImode))
4521 break;
4524 if (i != 64)
4526 if (generate)
4528 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
4529 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
4530 GEN_INT ((val >> i) & 0xffff)));
4532 return 2;
4535 /* Try 2 bitmask immediates which are xor'd together. */
4536 for (i = 0; i < 64; i += 16)
4538 val2 = (val >> i) & mask;
4539 val2 |= val2 << 16;
4540 val2 |= val2 << 32;
4541 if (aarch64_bitmask_imm (val2) && aarch64_bitmask_imm (val ^ val2))
4542 break;
4545 if (i != 64)
4547 if (generate)
4549 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
4550 emit_insn (gen_xordi3 (dest, dest, GEN_INT (val ^ val2)));
4552 return 2;
4556 /* Try a bitmask plus 2 movk to generate the immediate in 3 instructions. */
4557 if (zero_match + one_match == 0)
4559 for (i = 0; i < 48; i += 16)
4560 for (int j = i + 16; j < 64; j += 16)
4561 if (aarch64_check_bitmask (val, val2, (mask << i) | (mask << j)))
4563 if (generate)
4565 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
4566 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
4567 GEN_INT ((val >> i) & 0xffff)));
4568 emit_insn (gen_insv_immdi (dest, GEN_INT (j),
4569 GEN_INT ((val >> j) & 0xffff)));
4571 return 3;
4574 /* Try shifting and inserting the bottom 32 bits into the top bits. */
4575 val2 = val & 0xffffffff;
4576 val3 = 0xffffffff;
4577 val3 = val2 | (val3 << 32);
4578 for (i = 17; i < 48; i++)
4579 if ((val2 | (val2 << i)) == val)
4581 if (generate)
4583 emit_insn (gen_rtx_SET (dest, GEN_INT (val2 & 0xffff)));
4584 emit_insn (gen_insv_immdi (dest, GEN_INT (16),
4585 GEN_INT (val2 >> 16)));
4586 emit_insn (gen_ior_ashldi3 (dest, dest, GEN_INT (i), dest));
4588 return 3;
4590 else if ((val3 & ~(val3 << i)) == val)
4592 if (generate)
4594 emit_insn (gen_rtx_SET (dest, GEN_INT (val3 | 0xffff0000)));
4595 emit_insn (gen_insv_immdi (dest, GEN_INT (16),
4596 GEN_INT (val2 >> 16)));
4597 emit_insn (gen_and_one_cmpl_ashldi3 (dest, dest, GEN_INT (i),
4598 dest));
4600 return 3;
4604 /* Generate 2-4 instructions, skipping 16 bits of all zeroes or ones which
4605 are emitted by the initial mov. If one_match > zero_match, skip set bits,
4606 otherwise skip zero bits. */
4608 num_insns = 1;
4609 mask = 0xffff;
4610 val2 = one_match > zero_match ? ~val : val;
4611 i = (val2 & mask) != 0 ? 0 : (val2 & (mask << 16)) != 0 ? 16 : 32;
4613 if (generate)
4614 emit_insn (gen_rtx_SET (dest, GEN_INT (one_match > zero_match
4615 ? (val | ~(mask << i))
4616 : (val & (mask << i)))));
4617 for (i += 16; i < 64; i += 16)
4619 if ((val2 & (mask << i)) == 0)
4620 continue;
4621 if (generate)
4622 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
4623 GEN_INT ((val >> i) & 0xffff)));
4624 num_insns++;
4627 return num_insns;
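/* Illustrative instruction counts: 0x12345678 takes 2 instructions (MOV of
0x5678 followed by MOVK of 0x1234 at bit 16), as does 0x1234000000005678
(the two all-zero halfwords are skipped).  In the worst case an immediate
with no zero or 0xffff halfwords and no usable bitmask structure takes 4
instructions (MOV plus three MOVKs).  */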
4630 /* Return whether imm is a 128-bit immediate which is simple enough to
4631 expand inline. */
4632 bool
4633 aarch64_mov128_immediate (rtx imm)
4635 if (CONST_INT_P (imm))
4636 return true;
4638 gcc_assert (CONST_WIDE_INT_NUNITS (imm) == 2);
4640 rtx lo = GEN_INT (CONST_WIDE_INT_ELT (imm, 0));
4641 rtx hi = GEN_INT (CONST_WIDE_INT_ELT (imm, 1));
4643 return aarch64_internal_mov_immediate (NULL_RTX, lo, false, DImode)
4644 + aarch64_internal_mov_immediate (NULL_RTX, hi, false, DImode) <= 4;
4648 /* Return true if VAL can be encoded as a 12-bit unsigned immediate with
4649 a left shift of 0 or 12 bits. */
4650 bool
4651 aarch64_uimm12_shift (unsigned HOST_WIDE_INT val)
4653 return val < 4096 || (val & 0xfff000) == val;
4656 /* Return the largest value not exceeding VAL that will fit as a 12-bit unsigned immediate
4657 that can be created with a left shift of 0 or 12. */
4658 static HOST_WIDE_INT
4659 aarch64_clamp_to_uimm12_shift (unsigned HOST_WIDE_INT val)
4661 /* Check to see if the value fits in 24 bits, as that is the maximum we can
4662 handle correctly. */
4663 gcc_assert (val < 0x1000000);
4665 if (val < 4096)
4666 return val;
4668 return val & 0xfff000;
4672 /* Test whether:
4674 X = (X & AND_VAL) | IOR_VAL;
4676 can be implemented using:
4678 MOVK X, #(IOR_VAL >> shift), LSL #shift
4680 Return the shift if so, otherwise return -1. */
4682 aarch64_movk_shift (const wide_int_ref &and_val,
4683 const wide_int_ref &ior_val)
4685 unsigned int precision = and_val.get_precision ();
4686 unsigned HOST_WIDE_INT mask = 0xffff;
4687 for (unsigned int shift = 0; shift < precision; shift += 16)
4689 if (and_val == ~mask && (ior_val & mask) == ior_val)
4690 return shift;
4691 mask <<= 16;
4693 return -1;
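/* Worked example: AND_VAL 0xffffffff0000ffff and IOR_VAL 0x12340000 match
at SHIFT 16, since ~(0xffff << 16) equals AND_VAL and IOR_VAL only has
bits within that halfword, so the update can be performed as
MOVK x, #0x1234, LSL #16.  */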
4696 /* Create mask of ones, covering the lowest to highest bits set in VAL_IN.
4697 Assumed precondition: VAL_IN is not zero. */
4699 unsigned HOST_WIDE_INT
4700 aarch64_and_split_imm1 (HOST_WIDE_INT val_in)
4702 int lowest_bit_set = ctz_hwi (val_in);
4703 int highest_bit_set = floor_log2 (val_in);
4704 gcc_assert (val_in != 0);
4706 return ((HOST_WIDE_INT_UC (2) << highest_bit_set) -
4707 (HOST_WIDE_INT_1U << lowest_bit_set));
4710 /* Create constant where bits outside of lowest bit set to highest bit set
4711 are set to 1. */
4713 unsigned HOST_WIDE_INT
4714 aarch64_and_split_imm2 (HOST_WIDE_INT val_in)
4716 return val_in | ~aarch64_and_split_imm1 (val_in);
4719 /* Return true if VAL_IN is a valid 'and' bitmask immediate. */
4721 bool
4722 aarch64_and_bitmask_imm (unsigned HOST_WIDE_INT val_in, machine_mode mode)
4724 scalar_int_mode int_mode;
4725 if (!is_a <scalar_int_mode> (mode, &int_mode))
4726 return false;
4728 if (aarch64_bitmask_imm (val_in, int_mode))
4729 return false;
4731 if (aarch64_move_imm (val_in, int_mode))
4732 return false;
4734 unsigned HOST_WIDE_INT imm2 = aarch64_and_split_imm2 (val_in);
4736 return aarch64_bitmask_imm (imm2, int_mode);
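/* Worked example: VAL_IN 0x000000ff000000f0 is neither a bitmask nor a MOV
immediate.  aarch64_and_split_imm1 gives 0x000000fffffffff0 (ones from
bit 4 to bit 39) and aarch64_and_split_imm2 gives 0xffffffff000000ff;
both are valid bitmask immediates and their AND reproduces VAL_IN, so the
original AND can be split into two ANDs.  */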
4739 /* Return the number of temporary registers that aarch64_add_offset_1
4740 would need to add OFFSET to a register. */
4742 static unsigned int
4743 aarch64_add_offset_1_temporaries (HOST_WIDE_INT offset)
4745 return absu_hwi (offset) < 0x1000000 ? 0 : 1;
4748 /* A subroutine of aarch64_add_offset. Set DEST to SRC + OFFSET for
4749 a non-polynomial OFFSET. MODE is the mode of the addition.
4750 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
4751 be set and CFA adjustments added to the generated instructions.
4753 TEMP1, if nonnull, is a register of mode MODE that can be used as a
4754 temporary if register allocation is already complete. This temporary
4755 register may overlap DEST but must not overlap SRC. If TEMP1 is known
4756 to hold abs (OFFSET), EMIT_MOVE_IMM can be set to false to avoid emitting
4757 the immediate again.
4759 Since this function may be used to adjust the stack pointer, we must
4760 ensure that it cannot cause transient stack deallocation (for example
4761 by first incrementing SP and then decrementing when adjusting by a
4762 large immediate). */
4764 static void
4765 aarch64_add_offset_1 (scalar_int_mode mode, rtx dest,
4766 rtx src, HOST_WIDE_INT offset, rtx temp1,
4767 bool frame_related_p, bool emit_move_imm)
4769 gcc_assert (emit_move_imm || temp1 != NULL_RTX);
4770 gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
4772 unsigned HOST_WIDE_INT moffset = absu_hwi (offset);
4773 rtx_insn *insn;
4775 if (!moffset)
4777 if (!rtx_equal_p (dest, src))
4779 insn = emit_insn (gen_rtx_SET (dest, src));
4780 RTX_FRAME_RELATED_P (insn) = frame_related_p;
4782 return;
4785 /* Single instruction adjustment. */
4786 if (aarch64_uimm12_shift (moffset))
4788 insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (offset)));
4789 RTX_FRAME_RELATED_P (insn) = frame_related_p;
4790 return;
4793 /* Emit 2 additions/subtractions if the adjustment is less than 24 bits
4794 and either:
4796 a) the offset cannot be loaded by a 16-bit move or
4797 b) there is no spare register into which we can move it. */
4798 if (moffset < 0x1000000
4799 && ((!temp1 && !can_create_pseudo_p ())
4800 || !aarch64_move_imm (moffset, mode)))
4802 HOST_WIDE_INT low_off = moffset & 0xfff;
4804 low_off = offset < 0 ? -low_off : low_off;
4805 insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (low_off)));
4806 RTX_FRAME_RELATED_P (insn) = frame_related_p;
4807 insn = emit_insn (gen_add2_insn (dest, GEN_INT (offset - low_off)));
4808 RTX_FRAME_RELATED_P (insn) = frame_related_p;
4809 return;
4812 /* Emit a move immediate if required and an addition/subtraction. */
4813 if (emit_move_imm)
4815 gcc_assert (temp1 != NULL_RTX || can_create_pseudo_p ());
4816 temp1 = aarch64_force_temporary (mode, temp1,
4817 gen_int_mode (moffset, mode));
4819 insn = emit_insn (offset < 0
4820 ? gen_sub3_insn (dest, src, temp1)
4821 : gen_add3_insn (dest, src, temp1));
4822 if (frame_related_p)
4824 RTX_FRAME_RELATED_P (insn) = frame_related_p;
4825 rtx adj = plus_constant (mode, src, offset);
4826 add_reg_note (insn, REG_CFA_ADJUST_CFA, gen_rtx_SET (dest, adj));
4830 /* Return the number of temporary registers that aarch64_add_offset
4831 would need to move OFFSET into a register or add OFFSET to a register;
4832 ADD_P is true if we want the latter rather than the former. */
4834 static unsigned int
4835 aarch64_offset_temporaries (bool add_p, poly_int64 offset)
4837 /* This follows the same structure as aarch64_add_offset. */
4838 if (add_p && aarch64_sve_addvl_addpl_immediate_p (offset))
4839 return 0;
4841 unsigned int count = 0;
4842 HOST_WIDE_INT factor = offset.coeffs[1];
4843 HOST_WIDE_INT constant = offset.coeffs[0] - factor;
4844 poly_int64 poly_offset (factor, factor);
4845 if (add_p && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
4846 /* Need one register for the ADDVL/ADDPL result. */
4847 count += 1;
4848 else if (factor != 0)
4850 factor /= (HOST_WIDE_INT) least_bit_hwi (factor);
4851 if (!IN_RANGE (factor, -32, 31))
4852 /* Need one register for the CNT or RDVL result and one for the
4853 multiplication factor. If necessary, the second temporary
4854 can be reused for the constant part of the offset. */
4855 return 2;
4856 /* Need one register for the CNT or RDVL result (which might then
4857 be shifted). */
4858 count += 1;
4860 return count + aarch64_add_offset_1_temporaries (constant);
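/* Illustrative counts: a constant offset such as 0x123456 needs no
temporaries and 0x1234567 needs one, regardless of ADD_P; with ADD_P set,
an offset of three whole SVE vectors (poly_int64 (48, 48)) needs none
either, since a single ADDVL can add it.  */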
4863 /* If X can be represented as a poly_int64, return the number
4864 of temporaries that are required to add it to a register.
4865 Return -1 otherwise. */
4868 aarch64_add_offset_temporaries (rtx x)
4870 poly_int64 offset;
4871 if (!poly_int_rtx_p (x, &offset))
4872 return -1;
4873 return aarch64_offset_temporaries (true, offset);
4876 /* Set DEST to SRC + OFFSET. MODE is the mode of the addition.
4877 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
4878 be set and CFA adjustments added to the generated instructions.
4880 TEMP1, if nonnull, is a register of mode MODE that can be used as a
4881 temporary if register allocation is already complete. This temporary
4882 register may overlap DEST if !FRAME_RELATED_P but must not overlap SRC.
4883 If TEMP1 is known to hold abs (OFFSET), EMIT_MOVE_IMM can be set to
4884 false to avoid emitting the immediate again.
4886 TEMP2, if nonnull, is a second temporary register that doesn't
4887 overlap either DEST or SRC.
4889 FORCE_ISA_MODE is AARCH64_ISA_MODE_SM_ON if any variable component of OFFSET
4890 is measured relative to the SME vector length instead of the current
4891 prevailing vector length. It is 0 otherwise.
4893 Since this function may be used to adjust the stack pointer, we must
4894 ensure that it cannot cause transient stack deallocation (for example
4895 by first incrementing SP and then decrementing when adjusting by a
4896 large immediate). */
4898 static void
4899 aarch64_add_offset (scalar_int_mode mode, rtx dest, rtx src,
4900 poly_int64 offset, rtx temp1, rtx temp2,
4901 aarch64_isa_mode force_isa_mode,
4902 bool frame_related_p, bool emit_move_imm = true)
4904 gcc_assert (emit_move_imm || temp1 != NULL_RTX);
4905 gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
4906 gcc_assert (temp1 == NULL_RTX
4907 || !frame_related_p
4908 || !reg_overlap_mentioned_p (temp1, dest));
4909 gcc_assert (temp2 == NULL_RTX || !reg_overlap_mentioned_p (dest, temp2));
4911 /* Try using ADDVL or ADDPL to add the whole value. */
4912 if (src != const0_rtx && aarch64_sve_addvl_addpl_immediate_p (offset))
4914 gcc_assert (offset.coeffs[0] == offset.coeffs[1]);
4915 rtx offset_rtx;
4916 if (force_isa_mode == 0)
4917 offset_rtx = gen_int_mode (offset, mode);
4918 else
4919 offset_rtx = aarch64_sme_vq_immediate (mode, offset.coeffs[0], 0);
4920 rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
4921 RTX_FRAME_RELATED_P (insn) = frame_related_p;
4922 if (frame_related_p && (force_isa_mode & AARCH64_ISA_MODE_SM_ON))
4923 add_reg_note (insn, REG_CFA_ADJUST_CFA,
4924 gen_rtx_SET (dest, plus_constant (Pmode, src,
4925 offset)));
4926 return;
4929 /* Coefficient 1 is multiplied by the number of 128-bit blocks in an
4930 SVE vector register, over and above the minimum size of 128 bits.
4931 This is equivalent to half the value returned by CNTD with a
4932 vector shape of ALL. */
4933 HOST_WIDE_INT factor = offset.coeffs[1];
4934 HOST_WIDE_INT constant = offset.coeffs[0] - factor;
4936 /* Try using ADDVL or ADDPL to add the VG-based part. */
4937 poly_int64 poly_offset (factor, factor);
4938 if (src != const0_rtx
4939 && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
4941 rtx offset_rtx;
4942 if (force_isa_mode == 0)
4943 offset_rtx = gen_int_mode (poly_offset, mode);
4944 else
4945 offset_rtx = aarch64_sme_vq_immediate (mode, factor, 0);
4946 if (frame_related_p)
4948 rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
4949 RTX_FRAME_RELATED_P (insn) = true;
4950 if (force_isa_mode & AARCH64_ISA_MODE_SM_ON)
4951 add_reg_note (insn, REG_CFA_ADJUST_CFA,
4952 gen_rtx_SET (dest, plus_constant (Pmode, src,
4953 poly_offset)));
4954 src = dest;
4956 else
4958 rtx addr = gen_rtx_PLUS (mode, src, offset_rtx);
4959 src = aarch64_force_temporary (mode, temp1, addr);
4960 temp1 = temp2;
4961 temp2 = NULL_RTX;
4964 /* Otherwise use a CNT-based sequence. */
4965 else if (factor != 0)
4967 /* Calculate CNTB * FACTOR / 16 as CNTB * REL_FACTOR * 2**SHIFT,
4968 with negative shifts indicating a shift right. */
4969 HOST_WIDE_INT low_bit = least_bit_hwi (factor);
4970 HOST_WIDE_INT rel_factor = factor / low_bit;
4971 int shift = exact_log2 (low_bit) - 4;
4972 gcc_assert (shift >= -4 && (rel_factor & 1) != 0);
4974 /* Set CODE, VAL and SHIFT so that [+-] VAL * 2**SHIFT is
4975 equal to CNTB * FACTOR / 16, with CODE being the [+-].
4977 We can avoid a multiplication if REL_FACTOR is in the range
4978 of RDVL, although there are then various optimizations that
4979 we can try on top. */
4980 rtx_code code = PLUS;
4981 rtx val;
4982 if (IN_RANGE (rel_factor, -32, 31))
4984 if (force_isa_mode & AARCH64_ISA_MODE_SM_ON)
4986 /* Try to use an unshifted RDSVL, otherwise fall back on
4987 a shifted RDSVL #1. */
4988 if (aarch64_sve_rdvl_addvl_factor_p (factor))
4989 shift = 0;
4990 else
4991 factor = rel_factor * 16;
4992 val = aarch64_sme_vq_immediate (mode, factor, 0);
4994 /* Try to use an unshifted CNT[BHWD] or RDVL. */
4995 else if (aarch64_sve_cnt_factor_p (factor)
4996 || aarch64_sve_rdvl_addvl_factor_p (factor))
4998 val = gen_int_mode (poly_int64 (factor, factor), mode);
4999 shift = 0;
5001 /* Try to subtract an unshifted CNT[BHWD]. */
5002 else if (aarch64_sve_cnt_factor_p (-factor))
5004 code = MINUS;
5005 val = gen_int_mode (poly_int64 (-factor, -factor), mode);
5006 shift = 0;
5008 /* If subtraction is free, prefer to load a positive constant.
5009 In the best case this will fit a shifted CNTB. */
5010 else if (src != const0_rtx && rel_factor < 0)
5012 code = MINUS;
5013 val = gen_int_mode (-rel_factor * BYTES_PER_SVE_VECTOR, mode);
5015 /* Otherwise use a shifted RDVL or CNT[BHWD]. */
5016 else
5017 val = gen_int_mode (rel_factor * BYTES_PER_SVE_VECTOR, mode);
5019 else
5021 /* If we can calculate CNTB << SHIFT directly, prefer to do that,
5022 since it should increase the chances of being able to use
5023 a shift and add sequence for the multiplication.
5024 If CNTB << SHIFT is out of range, stick with the current
5025 shift factor. */
5026 if (force_isa_mode == 0
5027 && IN_RANGE (low_bit, 2, 16 * 16))
5029 val = gen_int_mode (poly_int64 (low_bit, low_bit), mode);
5030 shift = 0;
5032 else if ((force_isa_mode & AARCH64_ISA_MODE_SM_ON)
5033 && aarch64_sve_rdvl_addvl_factor_p (low_bit))
5035 val = aarch64_sme_vq_immediate (mode, low_bit, 0);
5036 shift = 0;
5038 else
5039 val = gen_int_mode (BYTES_PER_SVE_VECTOR, mode);
5041 val = aarch64_force_temporary (mode, temp1, val);
5043 /* Prefer to multiply by a positive factor and subtract rather
5044 than multiply by a negative factor and add, since positive
5045 values are usually easier to move. */
5046 if (rel_factor < 0 && src != const0_rtx)
5048 rel_factor = -rel_factor;
5049 code = MINUS;
5052 if (can_create_pseudo_p ())
5054 rtx coeff1 = gen_int_mode (rel_factor, mode);
5055 val = expand_mult (mode, val, coeff1, NULL_RTX, true, true);
5057 else
5059 rtx coeff1 = gen_int_mode (rel_factor, mode);
5060 coeff1 = aarch64_force_temporary (mode, temp2, coeff1);
5061 val = gen_rtx_MULT (mode, val, coeff1);
5065 /* Multiply by 2 ** SHIFT. */
5066 if (shift > 0)
5068 val = aarch64_force_temporary (mode, temp1, val);
5069 val = gen_rtx_ASHIFT (mode, val, GEN_INT (shift));
5071 else if (shift < 0)
5073 val = aarch64_force_temporary (mode, temp1, val);
5074 val = gen_rtx_ASHIFTRT (mode, val, GEN_INT (-shift));
5077 /* Add the result to SRC or subtract the result from SRC. */
5078 if (src != const0_rtx)
5080 val = aarch64_force_temporary (mode, temp1, val);
5081 val = gen_rtx_fmt_ee (code, mode, src, val);
5083 else if (code == MINUS)
5085 val = aarch64_force_temporary (mode, temp1, val);
5086 val = gen_rtx_NEG (mode, val);
5089 if (constant == 0 || frame_related_p)
5091 rtx_insn *insn = emit_insn (gen_rtx_SET (dest, val));
5092 if (frame_related_p)
5094 RTX_FRAME_RELATED_P (insn) = true;
5095 add_reg_note (insn, REG_CFA_ADJUST_CFA,
5096 gen_rtx_SET (dest, plus_constant (Pmode, src,
5097 poly_offset)));
5099 src = dest;
5100 if (constant == 0)
5101 return;
5103 else
5105 src = aarch64_force_temporary (mode, temp1, val);
5106 temp1 = temp2;
5107 temp2 = NULL_RTX;
5110 emit_move_imm = true;
5113 aarch64_add_offset_1 (mode, dest, src, constant, temp1,
5114 frame_related_p, emit_move_imm);
5117 /* Like aarch64_add_offset, but the offset is given as an rtx rather
5118 than a poly_int64. */
5120 void
5121 aarch64_split_add_offset (scalar_int_mode mode, rtx dest, rtx src,
5122 rtx offset_rtx, rtx temp1, rtx temp2)
5124 aarch64_add_offset (mode, dest, src, rtx_to_poly_int64 (offset_rtx),
5125 temp1, temp2, 0, false);
5128 /* Add DELTA to the stack pointer, marking the instructions frame-related.
5129 TEMP1 is available as a temporary if nonnull. FORCE_ISA_MODE is as
5130 for aarch64_add_offset. EMIT_MOVE_IMM is false if TEMP1 already
5131 contains abs (DELTA). */
5133 static inline void
5134 aarch64_add_sp (rtx temp1, rtx temp2, poly_int64 delta,
5135 aarch64_isa_mode force_isa_mode, bool emit_move_imm)
5137 aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, delta,
5138 temp1, temp2, force_isa_mode, true, emit_move_imm);
5141 /* Subtract DELTA from the stack pointer, marking the instructions
5142 frame-related if FRAME_RELATED_P. FORCE_ISA_MODE is as for
5143 aarch64_add_offset. TEMP1 is available as a temporary if nonnull. */
5145 static inline void
5146 aarch64_sub_sp (rtx temp1, rtx temp2, poly_int64 delta,
5147 aarch64_isa_mode force_isa_mode,
5148 bool frame_related_p, bool emit_move_imm = true)
5150 aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, -delta,
5151 temp1, temp2, force_isa_mode, frame_related_p,
5152 emit_move_imm);
5155 /* A streaming-compatible function needs to switch temporarily to the known
5156 PSTATE.SM mode described by LOCAL_MODE. The low bit of OLD_SVCR contains
5157 the runtime state of PSTATE.SM in the streaming-compatible code, before
5158 the start of the switch to LOCAL_MODE.
5160 Emit instructions to branch around the mode switch if PSTATE.SM already
5161 matches LOCAL_MODE. Return the label that the branch jumps to. */
5163 static rtx_insn *
5164 aarch64_guard_switch_pstate_sm (rtx old_svcr, aarch64_isa_mode local_mode)
5166 local_mode &= AARCH64_ISA_MODE_SM_STATE;
5167 gcc_assert (local_mode != 0);
5168 auto already_ok_cond = (local_mode & AARCH64_ISA_MODE_SM_ON ? NE : EQ);
5169 auto *label = gen_label_rtx ();
5170 auto branch = aarch64_gen_test_and_branch (already_ok_cond, old_svcr, 0,
5171 label);
5172 auto *jump = emit_jump_insn (branch);
5173 JUMP_LABEL (jump) = label;
5174 return label;
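/* For example, when LOCAL_MODE requires PSTATE.SM to be on, ALREADY_OK_COND
is NE, so the generated branch tests the low bit of OLD_SVCR and skips the
upcoming SMSTART when that bit is already nonzero.  */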
5177 /* Emit code to switch from the PSTATE.SM state in OLD_MODE to the PSTATE.SM
5178 state in NEW_MODE. This is known to involve either an SMSTART SM or
5179 an SMSTOP SM. */
5181 static void
5182 aarch64_switch_pstate_sm (aarch64_isa_mode old_mode, aarch64_isa_mode new_mode)
5184 old_mode &= AARCH64_ISA_MODE_SM_STATE;
5185 new_mode &= AARCH64_ISA_MODE_SM_STATE;
5186 gcc_assert (old_mode != new_mode);
5188 if ((new_mode & AARCH64_ISA_MODE_SM_ON)
5189 || (!new_mode && (old_mode & AARCH64_ISA_MODE_SM_OFF)))
5190 emit_insn (gen_aarch64_smstart_sm ());
5191 else
5192 emit_insn (gen_aarch64_smstop_sm ());
5195 /* As a side-effect, SMSTART SM and SMSTOP SM clobber the contents of all
5196 FP and predicate registers. This class emits code to preserve any
5197 necessary registers around the mode switch.
5199 The class uses four approaches to saving and restoring contents, enumerated
5200 by group_type:
5202 - GPR: save and restore the contents of FP registers using GPRs.
5203 This is used if the FP register contains no more than 64 significant
5204 bits. The registers used are FIRST_GPR onwards.
5206 - MEM_128: save and restore 128-bit SIMD registers using memory.
5208 - MEM_SVE_PRED: save and restore full SVE predicate registers using memory.
5210 - MEM_SVE_DATA: save and restore full SVE vector registers using memory.
5212 The save slots within each memory group are consecutive, with the
5213 MEM_SVE_PRED slots occupying a region below the MEM_SVE_DATA slots.
5215 There will only be two mode switches for each use of SME, so they should
5216 not be particularly performance-sensitive. It's also rare for SIMD, SVE
5217 or predicate registers to be live across mode switches. We therefore
5218 don't preallocate the save slots but instead allocate them locally on
5219 demand. This makes the code emitted by the class self-contained. */
5221 class aarch64_sme_mode_switch_regs
5223 public:
5224 static const unsigned int FIRST_GPR = R10_REGNUM;
5226 void add_reg (machine_mode, unsigned int);
5227 void add_call_args (rtx_call_insn *);
5228 void add_call_result (rtx_call_insn *);
5229 void add_call_preserved_reg (unsigned int);
5230 void add_call_preserved_regs (bitmap);
5232 void emit_prologue ();
5233 void emit_epilogue ();
5235 /* The number of GPRs needed to save FP registers, starting from
5236 FIRST_GPR. */
5237 unsigned int num_gprs () { return m_group_count[GPR]; }
5239 private:
5240 enum sequence { PROLOGUE, EPILOGUE };
5241 enum group_type { GPR, MEM_128, MEM_SVE_PRED, MEM_SVE_DATA, NUM_GROUPS };
5243 /* Information about the save location for one FP, SIMD, SVE data, or
5244 SVE predicate register. */
5245 struct save_location {
5246 /* The register to be saved. */
5247 rtx reg;
5249 /* Which group the save location belongs to. */
5250 group_type group;
5252 /* A zero-based index of the register within the group. */
5253 unsigned int index;
5256 unsigned int sve_data_headroom ();
5257 rtx get_slot_mem (machine_mode, poly_int64);
5258 void emit_stack_adjust (sequence, poly_int64);
5259 void emit_mem_move (sequence, const save_location &, poly_int64);
5261 void emit_gpr_moves (sequence);
5262 void emit_mem_128_moves (sequence);
5263 void emit_sve_sp_adjust (sequence);
5264 void emit_sve_pred_moves (sequence);
5265 void emit_sve_data_moves (sequence);
5267 /* All save locations, in no particular order. */
5268 auto_vec<save_location, 12> m_save_locations;
5270 /* The number of registers in each group. */
5271 unsigned int m_group_count[NUM_GROUPS] = {};
5274 /* Record that (reg:MODE REGNO) needs to be preserved around the mode
5275 switch. */
5277 void
5278 aarch64_sme_mode_switch_regs::add_reg (machine_mode mode, unsigned int regno)
5280 if (!FP_REGNUM_P (regno) && !PR_REGNUM_P (regno))
5281 return;
5283 unsigned int end_regno = end_hard_regno (mode, regno);
5284 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
5285 gcc_assert ((vec_flags & VEC_STRUCT) || end_regno == regno + 1);
5286 for (; regno < end_regno; regno++)
5288 /* Force the mode of SVE saves and restores even for single registers.
5289 This is necessary because big-endian targets only allow LDR Z and
5290 STR Z to be used with byte modes. */
5291 machine_mode submode = mode;
5292 if (vec_flags & VEC_SVE_PRED)
5293 submode = VNx16BImode;
5294 else if (vec_flags & VEC_SVE_DATA)
5295 submode = SVE_BYTE_MODE;
5296 else if (vec_flags & VEC_STRUCT)
5298 if (vec_flags & VEC_PARTIAL)
5299 submode = V8QImode;
5300 else
5301 submode = V16QImode;
5303 save_location loc;
5304 loc.reg = gen_rtx_REG (submode, regno);
5305 if (vec_flags & VEC_SVE_PRED)
5307 gcc_assert (PR_REGNUM_P (regno));
5308 loc.group = MEM_SVE_PRED;
5310 else
5312 gcc_assert (FP_REGNUM_P (regno));
5313 if (known_le (GET_MODE_SIZE (submode), 8))
5314 loc.group = GPR;
5315 else if (known_eq (GET_MODE_SIZE (submode), 16))
5316 loc.group = MEM_128;
5317 else
5318 loc.group = MEM_SVE_DATA;
5320 loc.index = m_group_count[loc.group]++;
5321 m_save_locations.quick_push (loc);
5325 /* Record that the arguments to CALL_INSN need to be preserved around
5326 the mode switch. */
5328 void
5329 aarch64_sme_mode_switch_regs::add_call_args (rtx_call_insn *call_insn)
5331 for (rtx node = CALL_INSN_FUNCTION_USAGE (call_insn);
5332 node; node = XEXP (node, 1))
5334 rtx item = XEXP (node, 0);
5335 if (GET_CODE (item) != USE)
5336 continue;
5337 item = XEXP (item, 0);
5338 if (!REG_P (item))
5339 continue;
5340 add_reg (GET_MODE (item), REGNO (item));
5344 /* Record that the return value from CALL_INSN (if any) needs to be
5345 preserved around the mode switch. */
5347 void
5348 aarch64_sme_mode_switch_regs::add_call_result (rtx_call_insn *call_insn)
5350 rtx pat = PATTERN (call_insn);
5351 gcc_assert (GET_CODE (pat) == PARALLEL);
5352 pat = XVECEXP (pat, 0, 0);
5353 if (GET_CODE (pat) == CALL)
5354 return;
5355 rtx dest = SET_DEST (pat);
5356 if (GET_CODE (dest) == PARALLEL)
5357 for (int i = 0; i < XVECLEN (dest, 0); ++i)
5359 rtx x = XVECEXP (dest, 0, i);
5360 gcc_assert (GET_CODE (x) == EXPR_LIST);
5361 rtx reg = XEXP (x, 0);
5362 add_reg (GET_MODE (reg), REGNO (reg));
5364 else
5365 add_reg (GET_MODE (dest), REGNO (dest));
5368 /* REGNO is a register that is call-preserved under the current function's ABI.
5369 Record that it must be preserved around the mode switch. */
5371 void
5372 aarch64_sme_mode_switch_regs::add_call_preserved_reg (unsigned int regno)
5374 if (FP_REGNUM_P (regno))
5375 switch (crtl->abi->id ())
5377 case ARM_PCS_SVE:
5378 add_reg (VNx16QImode, regno);
5379 break;
5380 case ARM_PCS_SIMD:
5381 add_reg (V16QImode, regno);
5382 break;
5383 case ARM_PCS_AAPCS64:
5384 add_reg (DImode, regno);
5385 break;
5386 default:
5387 gcc_unreachable ();
5389 else if (PR_REGNUM_P (regno))
5390 add_reg (VNx16BImode, regno);
5393 /* The hard registers in REGS are call-preserved under the current function's
5394 ABI. Record that they must be preserved around the mode switch. */
5396 void
5397 aarch64_sme_mode_switch_regs::add_call_preserved_regs (bitmap regs)
5399 bitmap_iterator bi;
5400 unsigned int regno;
5401 EXECUTE_IF_SET_IN_BITMAP (regs, 0, regno, bi)
5402 if (HARD_REGISTER_NUM_P (regno))
5403 add_call_preserved_reg (regno);
5404 else
5405 break;
5408 /* Emit code to save registers before the mode switch. */
5410 void
5411 aarch64_sme_mode_switch_regs::emit_prologue ()
5413 emit_sve_sp_adjust (PROLOGUE);
5414 emit_sve_pred_moves (PROLOGUE);
5415 emit_sve_data_moves (PROLOGUE);
5416 emit_mem_128_moves (PROLOGUE);
5417 emit_gpr_moves (PROLOGUE);
5420 /* Emit code to restore registers after the mode switch. */
5422 void
5423 aarch64_sme_mode_switch_regs::emit_epilogue ()
5425 emit_gpr_moves (EPILOGUE);
5426 emit_mem_128_moves (EPILOGUE);
5427 emit_sve_pred_moves (EPILOGUE);
5428 emit_sve_data_moves (EPILOGUE);
5429 emit_sve_sp_adjust (EPILOGUE);
5432 /* The SVE predicate registers are stored below the SVE data registers,
5433 with the predicate save area being padded to a data-register-sized
5434 boundary. Return the size of this padded area as a whole number
5435 of data register slots. */
5437 unsigned int
5438 aarch64_sme_mode_switch_regs::sve_data_headroom ()
5440 return CEIL (m_group_count[MEM_SVE_PRED], 8);
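/* For example, if three predicate registers need to be saved, the predicate
area is padded to one full data-register-sized slot (CEIL (3, 8) == 1).  */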
5443 /* Return a memory reference of mode MODE to OFFSET bytes from the
5444 stack pointer. */
5447 aarch64_sme_mode_switch_regs::get_slot_mem (machine_mode mode,
5448 poly_int64 offset)
5450 rtx addr = plus_constant (Pmode, stack_pointer_rtx, offset);
5451 return gen_rtx_MEM (mode, addr);
5454 /* Allocate or deallocate SIZE bytes of stack space: SEQ decides which. */
5456 void
5457 aarch64_sme_mode_switch_regs::emit_stack_adjust (sequence seq,
5458 poly_int64 size)
5460 if (seq == PROLOGUE)
5461 size = -size;
5462 emit_insn (gen_rtx_SET (stack_pointer_rtx,
5463 plus_constant (Pmode, stack_pointer_rtx, size)));
5466 /* Save or restore the register in LOC, whose slot is OFFSET bytes from
5467 the stack pointer. SEQ chooses between saving and restoring. */
5469 void
5470 aarch64_sme_mode_switch_regs::emit_mem_move (sequence seq,
5471 const save_location &loc,
5472 poly_int64 offset)
5474 rtx mem = get_slot_mem (GET_MODE (loc.reg), offset);
5475 if (seq == PROLOGUE)
5476 emit_move_insn (mem, loc.reg);
5477 else
5478 emit_move_insn (loc.reg, mem);
5481 /* Emit instructions to save or restore the GPR group. SEQ chooses between
5482 saving and restoring. */
5484 void
5485 aarch64_sme_mode_switch_regs::emit_gpr_moves (sequence seq)
5487 for (auto &loc : m_save_locations)
5488 if (loc.group == GPR)
5490 gcc_assert (loc.index < 8);
5491 rtx gpr = gen_rtx_REG (GET_MODE (loc.reg), FIRST_GPR + loc.index);
5492 if (seq == PROLOGUE)
5493 emit_move_insn (gpr, loc.reg);
5494 else
5495 emit_move_insn (loc.reg, gpr);
5499 /* Emit instructions to save or restore the MEM_128 group. SEQ chooses
5500 between saving and restoring. */
5502 void
5503 aarch64_sme_mode_switch_regs::emit_mem_128_moves (sequence seq)
5505 HOST_WIDE_INT count = m_group_count[MEM_128];
5506 if (count == 0)
5507 return;
5509 auto sp = stack_pointer_rtx;
5510 auto sp_adjust = (seq == PROLOGUE ? -count : count) * 16;
5512 /* Pick a common mode that supports LDR & STR with pre/post-modification
5513 and LDP & STP with pre/post-modification. */
5514 auto mode = TFmode;
5516 /* An instruction pattern that should be emitted at the end. */
5517 rtx last_pat = NULL_RTX;
5519 /* A previous MEM_128 location that hasn't been handled yet. */
5520 save_location *prev_loc = nullptr;
5522 /* Look for LDP/STPs and record any leftover LDR/STR in PREV_LOC. */
5523 for (auto &loc : m_save_locations)
5524 if (loc.group == MEM_128)
5526 if (!prev_loc)
5528 prev_loc = &loc;
5529 continue;
5531 gcc_assert (loc.index == prev_loc->index + 1);
5533 /* The offset of the base of the save area from the current
5534 stack pointer. */
5535 HOST_WIDE_INT bias = 0;
5536 if (prev_loc->index == 0 && seq == PROLOGUE)
5537 bias = sp_adjust;
5539 /* Get the two sets in the LDP/STP. */
5540 rtx ops[] = {
5541 gen_rtx_REG (mode, REGNO (prev_loc->reg)),
5542 get_slot_mem (mode, prev_loc->index * 16 + bias),
5543 gen_rtx_REG (mode, REGNO (loc.reg)),
5544 get_slot_mem (mode, loc.index * 16 + bias)
5546 unsigned int lhs = (seq == PROLOGUE);
5547 rtx set1 = gen_rtx_SET (ops[lhs], ops[1 - lhs]);
5548 rtx set2 = gen_rtx_SET (ops[lhs + 2], ops[3 - lhs]);
5550 /* Combine the sets with any stack allocation/deallocation. */
5551 rtx pat;
5552 if (prev_loc->index == 0)
5554 rtx plus_sp = plus_constant (Pmode, sp, sp_adjust);
5555 rtvec vec = gen_rtvec (3, gen_rtx_SET (sp, plus_sp), set1, set2);
5556 pat = gen_rtx_PARALLEL (VOIDmode, vec);
5558 else if (seq == PROLOGUE)
5559 pat = aarch64_gen_store_pair (ops[1], ops[0], ops[2]);
5560 else
5561 pat = aarch64_gen_load_pair (ops[0], ops[2], ops[1]);
5563 /* Queue a deallocation to the end, otherwise emit the
5564 instruction now. */
5565 if (seq == EPILOGUE && prev_loc->index == 0)
5566 last_pat = pat;
5567 else
5568 emit_insn (pat);
5569 prev_loc = nullptr;
5572 /* Handle any leftover LDR/STR. */
5573 if (prev_loc)
5575 rtx reg = gen_rtx_REG (mode, REGNO (prev_loc->reg));
5576 rtx addr;
5577 if (prev_loc->index != 0)
5578 addr = plus_constant (Pmode, sp, prev_loc->index * 16);
5579 else if (seq == PROLOGUE)
5581 rtx allocate = plus_constant (Pmode, sp, -count * 16);
5582 addr = gen_rtx_PRE_MODIFY (Pmode, sp, allocate);
5584 else
5586 rtx deallocate = plus_constant (Pmode, sp, count * 16);
5587 addr = gen_rtx_POST_MODIFY (Pmode, sp, deallocate);
5589 rtx mem = gen_rtx_MEM (mode, addr);
5590 if (seq == PROLOGUE)
5591 emit_move_insn (mem, reg);
5592 else
5593 emit_move_insn (reg, mem);
5596 if (last_pat)
5597 emit_insn (last_pat);
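/* For example (illustrative; the exact registers depend on what needs saving): with two 128-bit saves the prologue is typically a single pre-index store pair such as "stp q22, q23, [sp, #-32]!" and the epilogue the matching post-index load pair, while a single save uses a lone str/ldr with writeback instead. */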
5600 /* Allocate or deallocate the stack space needed by the SVE groups.
5601 SEQ chooses between allocating and deallocating. */
5603 void
5604 aarch64_sme_mode_switch_regs::emit_sve_sp_adjust (sequence seq)
5606 if (unsigned int count = m_group_count[MEM_SVE_DATA] + sve_data_headroom ())
5607 emit_stack_adjust (seq, count * BYTES_PER_SVE_VECTOR);
5610 /* Save or restore the MEM_SVE_DATA group. SEQ chooses between saving
5611 and restoring. */
5613 void
5614 aarch64_sme_mode_switch_regs::emit_sve_data_moves (sequence seq)
5616 for (auto &loc : m_save_locations)
5617 if (loc.group == MEM_SVE_DATA)
5619 auto index = loc.index + sve_data_headroom ();
5620 emit_mem_move (seq, loc, index * BYTES_PER_SVE_VECTOR);
5624 /* Save or restore the MEM_SVE_PRED group. SEQ chooses between saving
5625 and restoring. */
5627 void
5628 aarch64_sme_mode_switch_regs::emit_sve_pred_moves (sequence seq)
5630 for (auto &loc : m_save_locations)
5631 if (loc.group == MEM_SVE_PRED)
5632 emit_mem_move (seq, loc, loc.index * BYTES_PER_SVE_PRED);
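/* The predicate slots start at the stack pointer itself, while the data slots start sve_data_headroom () full vectors higher up, so the two groups never overlap. */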
5635 /* Set DEST to (vec_series BASE STEP). */
5637 static void
5638 aarch64_expand_vec_series (rtx dest, rtx base, rtx step)
5640 machine_mode mode = GET_MODE (dest);
5641 scalar_mode inner = GET_MODE_INNER (mode);
5643 /* Each operand can be a register or an immediate in the range [-16, 15]. */
5644 if (!aarch64_sve_index_immediate_p (base))
5645 base = force_reg (inner, base);
5646 if (!aarch64_sve_index_immediate_p (step))
5647 step = force_reg (inner, step);
5649 emit_set_insn (dest, gen_rtx_VEC_SERIES (mode, base, step));
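/* The VEC_SERIES rtx maps to the SVE INDEX instruction, e.g. (illustrative) "index z0.s, #0, #1" for the series { 0, 1, 2, ... }. */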
5652 /* Duplicate 128-bit Advanced SIMD vector SRC so that it fills an SVE
5653 register of mode MODE. Use TARGET for the result if it's nonnull
5654 and convenient.
5656 The two vector modes must have the same element mode. The behavior
5657 is to duplicate architectural lane N of SRC into architectural lanes
5658 N + I * STEP of the result. On big-endian targets, architectural
5659 lane 0 of an Advanced SIMD vector is the last element of the vector
5660 in memory layout, so for big-endian targets this operation has the
5661 effect of reversing SRC before duplicating it. Callers need to
5662 account for this. */
5664 rtx
5665 aarch64_expand_sve_dupq (rtx target, machine_mode mode, rtx src)
5667 machine_mode src_mode = GET_MODE (src);
5668 gcc_assert (GET_MODE_INNER (mode) == GET_MODE_INNER (src_mode));
5669 insn_code icode = (BYTES_BIG_ENDIAN
5670 ? code_for_aarch64_vec_duplicate_vq_be (mode)
5671 : code_for_aarch64_vec_duplicate_vq_le (mode));
5673 unsigned int i = 0;
5674 expand_operand ops[3];
5675 create_output_operand (&ops[i++], target, mode);
5676 create_input_operand (&ops[i++], src, src_mode);
5677 if (BYTES_BIG_ENDIAN)
5679 /* Create a PARALLEL describing the reversal of SRC. */
5680 unsigned int nelts_per_vq = 128 / GET_MODE_UNIT_BITSIZE (mode);
5681 rtx sel = aarch64_gen_stepped_int_parallel (nelts_per_vq,
5682 nelts_per_vq - 1, -1);
5683 create_fixed_operand (&ops[i++], sel);
5685 expand_insn (icode, i, ops);
5686 return ops[0].value;
5689 /* Try to force 128-bit vector value SRC into memory and use LD1RQ to fetch
5690 the memory image into DEST. Return true on success. */
5692 static bool
5693 aarch64_expand_sve_ld1rq (rtx dest, rtx src)
5695 src = force_const_mem (GET_MODE (src), src);
5696 if (!src)
5697 return false;
5699 /* Make sure that the address is legitimate. */
5700 if (!aarch64_sve_ld1rq_operand_p (src))
5702 rtx addr = force_reg (Pmode, XEXP (src, 0));
5703 src = replace_equiv_address (src, addr);
5706 machine_mode mode = GET_MODE (dest);
5707 machine_mode pred_mode = aarch64_sve_pred_mode (mode);
5708 rtx ptrue = aarch64_ptrue_reg (pred_mode);
5709 emit_insn (gen_aarch64_sve_ld1rq (mode, dest, src, ptrue));
5710 return true;
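/* LD1RQ loads a single 128-bit block and replicates it into every quadword of the destination, so the same sequence is correct for any SVE vector length. */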
5713 /* SRC is an SVE CONST_VECTOR that contains N "foreground" values followed
5714 by N "background" values. Try to move it into TARGET using:
5716 PTRUE PRED.<T>, VL<N>
5717 MOV TRUE.<T>, #<foreground>
5718 MOV FALSE.<T>, #<background>
5719 SEL TARGET.<T>, PRED.<T>, TRUE.<T>, FALSE.<T>
5721 The PTRUE is always a single instruction but the MOVs might need a
5722 longer sequence. If the background value is zero (as it often is),
5723 the sequence can sometimes collapse to a PTRUE followed by a
5724 zero-predicated move.
5726 Return the target on success, otherwise return null. */
5728 static rtx
5729 aarch64_expand_sve_const_vector_sel (rtx target, rtx src)
5731 gcc_assert (CONST_VECTOR_NELTS_PER_PATTERN (src) == 2);
5733 /* Make sure that the PTRUE is valid. */
5734 machine_mode mode = GET_MODE (src);
5735 machine_mode pred_mode = aarch64_sve_pred_mode (mode);
5736 unsigned int npatterns = CONST_VECTOR_NPATTERNS (src);
5737 if (aarch64_svpattern_for_vl (pred_mode, npatterns)
5738 == AARCH64_NUM_SVPATTERNS)
5739 return NULL_RTX;
5741 rtx_vector_builder pred_builder (pred_mode, npatterns, 2);
5742 rtx_vector_builder true_builder (mode, npatterns, 1);
5743 rtx_vector_builder false_builder (mode, npatterns, 1);
5744 for (unsigned int i = 0; i < npatterns; ++i)
5746 true_builder.quick_push (CONST_VECTOR_ENCODED_ELT (src, i));
5747 pred_builder.quick_push (CONST1_RTX (BImode));
5749 for (unsigned int i = 0; i < npatterns; ++i)
5751 false_builder.quick_push (CONST_VECTOR_ENCODED_ELT (src, i + npatterns));
5752 pred_builder.quick_push (CONST0_RTX (BImode));
5754 expand_operand ops[4];
5755 create_output_operand (&ops[0], target, mode);
5756 create_input_operand (&ops[1], true_builder.build (), mode);
5757 create_input_operand (&ops[2], false_builder.build (), mode);
5758 create_input_operand (&ops[3], pred_builder.build (), pred_mode);
5759 expand_insn (code_for_vcond_mask (mode, mode), 4, ops);
5760 return target;
5763 /* Return a register containing CONST_VECTOR SRC, given that SRC has an
5764 SVE data mode and isn't a legitimate constant. Use TARGET for the
5765 result if convenient.
5767 The returned register can have whatever mode seems most natural
5768 given the contents of SRC. */
5770 static rtx
5771 aarch64_expand_sve_const_vector (rtx target, rtx src)
5773 machine_mode mode = GET_MODE (src);
5774 unsigned int npatterns = CONST_VECTOR_NPATTERNS (src);
5775 unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (src);
5776 scalar_mode elt_mode = GET_MODE_INNER (mode);
5777 unsigned int elt_bits = GET_MODE_BITSIZE (elt_mode);
5778 unsigned int container_bits = aarch64_sve_container_bits (mode);
5779 unsigned int encoded_bits = npatterns * nelts_per_pattern * container_bits;
5781 if (nelts_per_pattern == 1
5782 && encoded_bits <= 128
5783 && container_bits != elt_bits)
5785 /* We have a partial vector mode and a constant whose full-vector
5786 equivalent would occupy a repeating 128-bit sequence. Build that
5787 full-vector equivalent instead, so that we have the option of
5788 using LD1RQ and Advanced SIMD operations. */
5789 unsigned int repeat = container_bits / elt_bits;
5790 machine_mode full_mode = aarch64_full_sve_mode (elt_mode).require ();
5791 rtx_vector_builder builder (full_mode, npatterns * repeat, 1);
5792 for (unsigned int i = 0; i < npatterns; ++i)
5793 for (unsigned int j = 0; j < repeat; ++j)
5794 builder.quick_push (CONST_VECTOR_ENCODED_ELT (src, i));
5795 target = aarch64_target_reg (target, full_mode);
5796 return aarch64_expand_sve_const_vector (target, builder.build ());
5799 if (nelts_per_pattern == 1 && encoded_bits == 128)
5801 /* The constant is a duplicated quadword but can't be narrowed
5802 beyond a quadword. Get the memory image of the first quadword
5803 as a 128-bit vector and try using LD1RQ to load it from memory.
5805 The effect for both endiannesses is to load memory lane N into
5806 architectural lanes N + I * STEP of the result. On big-endian
5807 targets, the layout of the 128-bit vector in an Advanced SIMD
5808 register would be different from its layout in an SVE register,
5809 but this 128-bit vector is a memory value only. */
5810 machine_mode vq_mode = aarch64_v128_mode (elt_mode).require ();
5811 rtx vq_value = simplify_gen_subreg (vq_mode, src, mode, 0);
5812 if (vq_value && aarch64_expand_sve_ld1rq (target, vq_value))
5813 return target;
5816 if (nelts_per_pattern == 1 && encoded_bits < 128)
5818 /* The vector is a repeating sequence of 64 bits or fewer.
5819 See if we can load them using an Advanced SIMD move and then
5820 duplicate it to fill a vector. This is better than using a GPR
5821 move because it keeps everything in the same register file. */
5822 machine_mode vq_mode = aarch64_v128_mode (elt_mode).require ();
5823 rtx_vector_builder builder (vq_mode, npatterns, 1);
5824 for (unsigned int i = 0; i < npatterns; ++i)
5826 /* We want memory lane N to go into architectural lane N,
5827 so reverse for big-endian targets. The DUP .Q pattern
5828 has a compensating reverse built-in. */
5829 unsigned int srci = BYTES_BIG_ENDIAN ? npatterns - i - 1 : i;
5830 builder.quick_push (CONST_VECTOR_ENCODED_ELT (src, srci));
5832 rtx vq_src = builder.build ();
5833 if (aarch64_simd_valid_mov_imm (vq_src))
5835 vq_src = force_reg (vq_mode, vq_src);
5836 return aarch64_expand_sve_dupq (target, mode, vq_src);
5839 /* Get an integer representation of the repeating part of Advanced
5840 SIMD vector VQ_SRC. This preserves the endianness of VQ_SRC,
5841 which for big-endian targets is lane-swapped wrt a normal
5842 Advanced SIMD vector. This means that for both endiannesses,
5843 memory lane N of SVE vector SRC corresponds to architectural
5844 lane N of a register holding VQ_SRC. This in turn means that
5845 memory lane 0 of SVE vector SRC is in the lsb of VQ_SRC (viewed
5846 as a single 128-bit value) and thus that memory lane 0 of SRC is
5847 in the lsb of the integer. Duplicating the integer therefore
5848 ensures that memory lane N of SRC goes into architectural lane
5849 N + I * STEP of the SVE register. */
5850 scalar_mode int_mode = int_mode_for_size (encoded_bits, 0).require ();
5851 rtx elt_value = simplify_gen_subreg (int_mode, vq_src, vq_mode, 0);
5852 if (elt_value)
5854 /* Pretend that we had a vector of INT_MODE to start with. */
5855 elt_mode = int_mode;
5856 mode = aarch64_full_sve_mode (int_mode).require ();
5858 /* If the integer can be moved into a general register by a
5859 single instruction, do that and duplicate the result. */
5860 if (CONST_INT_P (elt_value)
5861 && aarch64_move_imm (INTVAL (elt_value),
5862 encoded_bits <= 32 ? SImode : DImode))
5864 elt_value = force_reg (elt_mode, elt_value);
5865 return expand_vector_broadcast (mode, elt_value);
5868 else if (npatterns == 1)
5869 /* We're duplicating a single value, but can't do better than
5870 force it to memory and load from there. This handles things
5871 like symbolic constants. */
5872 elt_value = CONST_VECTOR_ENCODED_ELT (src, 0);
5874 if (elt_value)
5876 /* Load the element from memory if we can, otherwise move it into
5877 a register and use a DUP. */
5878 rtx op = force_const_mem (elt_mode, elt_value);
5879 if (!op)
5880 op = force_reg (elt_mode, elt_value);
5881 return expand_vector_broadcast (mode, op);
5885 /* Try using INDEX. */
5886 rtx base, step;
5887 if (const_vec_series_p (src, &base, &step))
5889 aarch64_expand_vec_series (target, base, step);
5890 return target;
5893 /* From here on, it's better to force the whole constant to memory
5894 if we can. */
5895 if (GET_MODE_NUNITS (mode).is_constant ())
5896 return NULL_RTX;
5898 if (nelts_per_pattern == 2)
5899 if (rtx res = aarch64_expand_sve_const_vector_sel (target, src))
5900 return res;
5902 /* Expand each pattern individually. */
5903 gcc_assert (npatterns > 1);
5904 rtx_vector_builder builder;
5905 auto_vec<rtx, 16> vectors (npatterns);
5906 for (unsigned int i = 0; i < npatterns; ++i)
5908 builder.new_vector (mode, 1, nelts_per_pattern);
5909 for (unsigned int j = 0; j < nelts_per_pattern; ++j)
5910 builder.quick_push (CONST_VECTOR_ELT (src, i + j * npatterns));
5911 vectors.quick_push (force_reg (mode, builder.build ()));
5914 /* Use permutes to interleave the separate vectors. */
5915 while (npatterns > 1)
5917 npatterns /= 2;
5918 for (unsigned int i = 0; i < npatterns; ++i)
5920 rtx tmp = (npatterns == 1 ? target : gen_reg_rtx (mode));
5921 rtvec v = gen_rtvec (2, vectors[i], vectors[i + npatterns]);
5922 emit_set_insn (tmp, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
5923 vectors[i] = tmp;
5926 gcc_assert (vectors[0] == target);
5927 return target;
5930 /* Use WHILE to set a predicate register of mode MODE in which the first
5931 VL bits are set and the rest are clear. Use TARGET for the register
5932 if it's nonnull and convenient. */
5934 static rtx
5935 aarch64_sve_move_pred_via_while (rtx target, machine_mode mode,
5936 unsigned int vl)
5938 rtx limit = force_reg (DImode, gen_int_mode (vl, DImode));
5939 target = aarch64_target_reg (target, mode);
5940 emit_insn (gen_while (UNSPEC_WHILELO, DImode, mode,
5941 target, const0_rtx, limit));
5942 return target;
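/* This expands to a WHILELO from zero, e.g. (illustrative) "whilelo p0.b, xzr, x1" with x1 holding VL; the element suffix follows MODE. */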
5945 static rtx
5946 aarch64_expand_sve_const_pred_1 (rtx, rtx_vector_builder &, bool);
5948 /* BUILDER is a constant predicate in which the index of every set bit
5949 is a multiple of ELT_SIZE (which is <= 8). Try to load the constant
5950 by inverting every element at a multiple of ELT_SIZE and EORing the
5951 result with an ELT_SIZE PTRUE.
5953 Return a register that contains the constant on success, otherwise
5954 return null. Use TARGET as the register if it is nonnull and
5955 convenient. */
5957 static rtx
5958 aarch64_expand_sve_const_pred_eor (rtx target, rtx_vector_builder &builder,
5959 unsigned int elt_size)
5961 /* Invert every element at a multiple of ELT_SIZE, keeping the
5962 other bits zero. */
5963 rtx_vector_builder inv_builder (VNx16BImode, builder.npatterns (),
5964 builder.nelts_per_pattern ());
5965 for (unsigned int i = 0; i < builder.encoded_nelts (); ++i)
5966 if ((i & (elt_size - 1)) == 0 && INTVAL (builder.elt (i)) == 0)
5967 inv_builder.quick_push (const1_rtx);
5968 else
5969 inv_builder.quick_push (const0_rtx);
5970 inv_builder.finalize ();
5972 /* See if we can load the constant cheaply. */
5973 rtx inv = aarch64_expand_sve_const_pred_1 (NULL_RTX, inv_builder, false);
5974 if (!inv)
5975 return NULL_RTX;
5977 /* EOR the result with an ELT_SIZE PTRUE. */
5978 rtx mask = aarch64_ptrue_all (elt_size);
5979 mask = force_reg (VNx16BImode, mask);
5980 inv = gen_lowpart (VNx16BImode, inv);
5981 target = aarch64_target_reg (target, VNx16BImode);
5982 emit_insn (gen_aarch64_pred_z (XOR, VNx16BImode, target, mask, inv, mask));
5983 return target;
5986 /* BUILDER is a constant predicate in which the index of every set bit
5987 is a multiple of ELT_SIZE (which is <= 8). Try to load the constant
5988 using a TRN1 of size PERMUTE_SIZE, which is >= ELT_SIZE. Return the
5989 register on success, otherwise return null. Use TARGET as the register
5990 if nonnull and convenient. */
5992 static rtx
5993 aarch64_expand_sve_const_pred_trn (rtx target, rtx_vector_builder &builder,
5994 unsigned int elt_size,
5995 unsigned int permute_size)
5997 /* We're going to split the constant into two new constants A and B,
5998 with element I of BUILDER going into A if (I & PERMUTE_SIZE) == 0
5999 and into B otherwise. E.g. for PERMUTE_SIZE == 4 && ELT_SIZE == 1:
6001 A: { 0, 1, 2, 3, _, _, _, _, 8, 9, 10, 11, _, _, _, _ }
6002 B: { 4, 5, 6, 7, _, _, _, _, 12, 13, 14, 15, _, _, _, _ }
6004 where _ indicates elements that will be discarded by the permute.
6006 First calculate the ELT_SIZEs for A and B. */
6007 unsigned int a_elt_size = GET_MODE_SIZE (DImode);
6008 unsigned int b_elt_size = GET_MODE_SIZE (DImode);
6009 for (unsigned int i = 0; i < builder.encoded_nelts (); i += elt_size)
6010 if (INTVAL (builder.elt (i)) != 0)
6012 if (i & permute_size)
6013 b_elt_size |= i - permute_size;
6014 else
6015 a_elt_size |= i;
6017 a_elt_size &= -a_elt_size;
6018 b_elt_size &= -b_elt_size;
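/* The x &= -x idiom isolates the lowest set bit, so A_ELT_SIZE and B_ELT_SIZE are now the largest power-of-two strides (capped at the size of DImode) at which A and B respectively can have significant elements. */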
6020 /* Now construct the vectors themselves. */
6021 rtx_vector_builder a_builder (VNx16BImode, builder.npatterns (),
6022 builder.nelts_per_pattern ());
6023 rtx_vector_builder b_builder (VNx16BImode, builder.npatterns (),
6024 builder.nelts_per_pattern ());
6025 unsigned int nelts = builder.encoded_nelts ();
6026 for (unsigned int i = 0; i < nelts; ++i)
6027 if (i & (elt_size - 1))
6029 a_builder.quick_push (const0_rtx);
6030 b_builder.quick_push (const0_rtx);
6032 else if ((i & permute_size) == 0)
6034 /* The A and B elements are significant. */
6035 a_builder.quick_push (builder.elt (i));
6036 b_builder.quick_push (builder.elt (i + permute_size));
6038 else
6040 /* The A and B elements are going to be discarded, so pick whatever
6041 is likely to give a nice constant. We are targeting element
6042 sizes A_ELT_SIZE and B_ELT_SIZE for A and B respectively,
6043 with the aim of each being a sequence of ones followed by
6044 a sequence of zeros. So:
6046 * if X_ELT_SIZE <= PERMUTE_SIZE, the best approach is to
6047 duplicate the last X_ELT_SIZE element, to extend the
6048 current sequence of ones or zeros.
6050 * if X_ELT_SIZE > PERMUTE_SIZE, the best approach is to add a
6051 zero, so that the constant really does have X_ELT_SIZE and
6052 not a smaller size. */
6053 if (a_elt_size > permute_size)
6054 a_builder.quick_push (const0_rtx);
6055 else
6056 a_builder.quick_push (a_builder.elt (i - a_elt_size));
6057 if (b_elt_size > permute_size)
6058 b_builder.quick_push (const0_rtx);
6059 else
6060 b_builder.quick_push (b_builder.elt (i - b_elt_size));
6062 a_builder.finalize ();
6063 b_builder.finalize ();
6065 /* Try loading A into a register. */
6066 rtx_insn *last = get_last_insn ();
6067 rtx a = aarch64_expand_sve_const_pred_1 (NULL_RTX, a_builder, false);
6068 if (!a)
6069 return NULL_RTX;
6071 /* Try loading B into a register. */
6072 rtx b = a;
6073 if (a_builder != b_builder)
6075 b = aarch64_expand_sve_const_pred_1 (NULL_RTX, b_builder, false);
6076 if (!b)
6078 delete_insns_since (last);
6079 return NULL_RTX;
6083 /* Emit the TRN1 itself. We emit a TRN that operates on VNx16BI
6084 operands but permutes them as though they had mode MODE. */
6085 machine_mode mode = aarch64_sve_pred_mode (permute_size).require ();
6086 target = aarch64_target_reg (target, GET_MODE (a));
6087 rtx type_reg = CONST0_RTX (mode);
6088 emit_insn (gen_aarch64_sve_trn1_conv (mode, target, a, b, type_reg));
6089 return target;
6092 /* Subroutine of aarch64_expand_sve_const_pred. Try to load the VNx16BI
6093 constant in BUILDER into an SVE predicate register. Return the register
6094 on success, otherwise return null. Use TARGET for the register if
6095 nonnull and convenient.
6097 ALLOW_RECURSE_P is true if we can use methods that would call this
6098 function recursively. */
6100 static rtx
6101 aarch64_expand_sve_const_pred_1 (rtx target, rtx_vector_builder &builder,
6102 bool allow_recurse_p)
6104 if (builder.encoded_nelts () == 1)
6105 /* A PFALSE or a PTRUE .B ALL. */
6106 return aarch64_emit_set_immediate (target, builder);
6108 unsigned int elt_size = aarch64_widest_sve_pred_elt_size (builder);
6109 if (int vl = aarch64_partial_ptrue_length (builder, elt_size))
6111 /* If we can load the constant using PTRUE, use it as-is. */
6112 machine_mode mode = aarch64_sve_pred_mode (elt_size).require ();
6113 if (aarch64_svpattern_for_vl (mode, vl) != AARCH64_NUM_SVPATTERNS)
6114 return aarch64_emit_set_immediate (target, builder);
6116 /* Otherwise use WHILE to set the first VL bits. */
6117 return aarch64_sve_move_pred_via_while (target, mode, vl);
6120 if (!allow_recurse_p)
6121 return NULL_RTX;
6123 /* Try inverting the vector in element size ELT_SIZE and then EORing
6124 the result with an ELT_SIZE PTRUE. */
6125 if (INTVAL (builder.elt (0)) == 0)
6126 if (rtx res = aarch64_expand_sve_const_pred_eor (target, builder,
6127 elt_size))
6128 return res;
6130 /* Try using TRN1 to permute two simpler constants. */
6131 for (unsigned int i = elt_size; i <= 8; i *= 2)
6132 if (rtx res = aarch64_expand_sve_const_pred_trn (target, builder,
6133 elt_size, i))
6134 return res;
6136 return NULL_RTX;
6139 /* Return an SVE predicate register that contains the VNx16BImode
6140 constant in BUILDER, without going through the move expanders.
6142 The returned register can have whatever mode seems most natural
6143 given the contents of BUILDER. Use TARGET for the result if
6144 convenient. */
6146 static rtx
6147 aarch64_expand_sve_const_pred (rtx target, rtx_vector_builder &builder)
6149 /* Try loading the constant using pure predicate operations. */
6150 if (rtx res = aarch64_expand_sve_const_pred_1 (target, builder, true))
6151 return res;
6153 /* Try forcing the constant to memory. */
6154 if (builder.full_nelts ().is_constant ())
6155 if (rtx mem = force_const_mem (VNx16BImode, builder.build ()))
6157 target = aarch64_target_reg (target, VNx16BImode);
6158 emit_move_insn (target, mem);
6159 return target;
6162 /* The last resort is to load the constant as an integer and then
6163 compare it against zero. Use -1 for set bits in order to increase
6164 the chances of using SVE DUPM or an Advanced SIMD byte mask. */
6165 rtx_vector_builder int_builder (VNx16QImode, builder.npatterns (),
6166 builder.nelts_per_pattern ());
6167 for (unsigned int i = 0; i < builder.encoded_nelts (); ++i)
6168 int_builder.quick_push (INTVAL (builder.elt (i))
6169 ? constm1_rtx : const0_rtx);
6170 return aarch64_convert_sve_data_to_pred (target, VNx16BImode,
6171 int_builder.build ());
6174 /* Set DEST to immediate IMM. */
6176 void
6177 aarch64_expand_mov_immediate (rtx dest, rtx imm)
6179 machine_mode mode = GET_MODE (dest);
6181 /* Check on what type of symbol it is. */
6182 scalar_int_mode int_mode;
6183 if ((SYMBOL_REF_P (imm)
6184 || LABEL_REF_P (imm)
6185 || GET_CODE (imm) == CONST
6186 || GET_CODE (imm) == CONST_POLY_INT)
6187 && is_a <scalar_int_mode> (mode, &int_mode))
6189 rtx mem;
6190 poly_int64 offset;
6191 HOST_WIDE_INT const_offset;
6192 enum aarch64_symbol_type sty;
6194 /* If we have (const (plus symbol offset)), separate out the offset
6195 before we start classifying the symbol. */
6196 rtx base = strip_offset (imm, &offset);
6198 /* We must always add an offset involving VL separately, rather than
6199 folding it into the relocation. */
6200 if (!offset.is_constant (&const_offset))
6202 if (!TARGET_SVE)
6204 aarch64_report_sve_required ();
6205 return;
6207 if (base == const0_rtx
6208 && (aarch64_sve_cnt_immediate_p (offset)
6209 || aarch64_sve_rdvl_immediate_p (offset)))
6210 emit_insn (gen_rtx_SET (dest, imm));
6211 else
6213 /* Do arithmetic on 32-bit values if the result is smaller
6214 than that. */
6215 if (partial_subreg_p (int_mode, SImode))
6217 /* It is invalid to do symbol calculations in modes
6218 narrower than SImode. */
6219 gcc_assert (base == const0_rtx);
6220 dest = gen_lowpart (SImode, dest);
6221 int_mode = SImode;
6223 if (base != const0_rtx)
6225 base = aarch64_force_temporary (int_mode, dest, base);
6226 aarch64_add_offset (int_mode, dest, base, offset,
6227 NULL_RTX, NULL_RTX, 0, false);
6229 else
6230 aarch64_add_offset (int_mode, dest, base, offset,
6231 dest, NULL_RTX, 0, false);
6233 return;
6236 if (aarch64_rdsvl_immediate_p (base))
6238 /* We could handle non-constant offsets if they are ever
6239 generated. */
6240 gcc_assert (const_offset == 0);
6241 emit_insn (gen_rtx_SET (dest, imm));
6242 return;
6245 sty = aarch64_classify_symbol (base, const_offset);
6246 switch (sty)
6248 case SYMBOL_FORCE_TO_MEM:
6249 if (int_mode != ptr_mode)
6250 imm = convert_memory_address (ptr_mode, imm);
6252 if (const_offset != 0
6253 && targetm.cannot_force_const_mem (ptr_mode, imm))
6255 gcc_assert (can_create_pseudo_p ());
6256 base = aarch64_force_temporary (int_mode, dest, base);
6257 aarch64_add_offset (int_mode, dest, base, const_offset,
6258 NULL_RTX, NULL_RTX, 0, false);
6259 return;
6262 mem = force_const_mem (ptr_mode, imm);
6263 gcc_assert (mem);
6265 /* If we aren't generating PC relative literals, then
6266 we need to expand the literal pool access carefully.
6267 This is something that needs to be done in a number
6268 of places, so could well live as a separate function. */
6269 if (!aarch64_pcrelative_literal_loads)
6271 gcc_assert (can_create_pseudo_p ());
6272 base = gen_reg_rtx (ptr_mode);
6273 aarch64_expand_mov_immediate (base, XEXP (mem, 0));
6274 if (ptr_mode != Pmode)
6275 base = convert_memory_address (Pmode, base);
6276 mem = gen_rtx_MEM (ptr_mode, base);
6279 if (int_mode != ptr_mode)
6280 mem = gen_rtx_ZERO_EXTEND (int_mode, mem);
6282 emit_insn (gen_rtx_SET (dest, mem));
6284 return;
6286 case SYMBOL_SMALL_TLSGD:
6287 case SYMBOL_SMALL_TLSDESC:
6288 case SYMBOL_SMALL_TLSIE:
6289 case SYMBOL_SMALL_GOT_28K:
6290 case SYMBOL_SMALL_GOT_4G:
6291 case SYMBOL_TINY_GOT:
6292 case SYMBOL_TINY_TLSIE:
6293 if (const_offset != 0)
6295 gcc_assert (can_create_pseudo_p ());
6296 base = aarch64_force_temporary (int_mode, dest, base);
6297 aarch64_add_offset (int_mode, dest, base, const_offset,
6298 NULL_RTX, NULL_RTX, 0, false);
6299 return;
6301 /* FALLTHRU */
6303 case SYMBOL_SMALL_ABSOLUTE:
6304 case SYMBOL_TINY_ABSOLUTE:
6305 case SYMBOL_TLSLE12:
6306 case SYMBOL_TLSLE24:
6307 case SYMBOL_TLSLE32:
6308 case SYMBOL_TLSLE48:
6309 aarch64_load_symref_appropriately (dest, imm, sty);
6310 return;
6312 default:
6313 gcc_unreachable ();
6317 if (!CONST_INT_P (imm))
6319 if (aarch64_sve_pred_mode_p (mode))
6321 /* Only the low bit of each .H, .S and .D element is defined,
6322 so we can set the upper bits to whatever we like. If the
6323 predicate is all-true in MODE, prefer to set all the undefined
6324 bits as well, so that we can share a single .B predicate for
6325 all modes. */
6326 if (imm == CONSTM1_RTX (mode))
6327 imm = CONSTM1_RTX (VNx16BImode);
6329 /* All methods for constructing predicate modes wider than VNx16BI
6330 will set the upper bits of each element to zero. Expose this
6331 by moving such constants as a VNx16BI, so that all bits are
6332 significant and so that constants for different modes can be
6333 shared. The wider constant will still be available as a
6334 REG_EQUAL note. */
6335 rtx_vector_builder builder;
6336 if (aarch64_get_sve_pred_bits (builder, imm))
6338 rtx res = aarch64_expand_sve_const_pred (dest, builder);
6339 if (dest != res)
6340 emit_move_insn (dest, gen_lowpart (mode, res));
6341 return;
6345 if (GET_CODE (imm) == HIGH || aarch64_simd_valid_mov_imm (imm))
6347 emit_insn (gen_rtx_SET (dest, imm));
6348 return;
6351 if (CONST_VECTOR_P (imm) && aarch64_sve_data_mode_p (mode))
6352 if (rtx res = aarch64_expand_sve_const_vector (dest, imm))
6354 if (dest != res)
6355 emit_insn (gen_aarch64_sve_reinterpret (mode, dest, res));
6356 return;
6359 rtx mem = force_const_mem (mode, imm);
6360 gcc_assert (mem);
6361 emit_move_insn (dest, mem);
6362 return;
6365 aarch64_internal_mov_immediate (dest, imm, true, mode);
6368 /* Return the MEM rtx that provides the canary value that should be used
6369 for stack-smashing protection. MODE is the mode of the memory.
6370 For SSP_GLOBAL, DECL_RTL is the MEM rtx for the canary variable
6371 (__stack_chk_guard), otherwise it has no useful value. SALT_TYPE
6372 indicates whether the caller is performing a SET or a TEST operation. */
6374 rtx
6375 aarch64_stack_protect_canary_mem (machine_mode mode, rtx decl_rtl,
6376 aarch64_salt_type salt_type)
6378 rtx addr;
6379 if (aarch64_stack_protector_guard == SSP_GLOBAL)
6381 gcc_assert (MEM_P (decl_rtl));
6382 addr = XEXP (decl_rtl, 0);
6383 poly_int64 offset;
6384 rtx base = strip_offset_and_salt (addr, &offset);
6385 if (!SYMBOL_REF_P (base))
6386 return decl_rtl;
6388 rtvec v = gen_rtvec (2, base, GEN_INT (salt_type));
6389 addr = gen_rtx_UNSPEC (Pmode, v, UNSPEC_SALT_ADDR);
6390 addr = gen_rtx_CONST (Pmode, addr);
6391 addr = plus_constant (Pmode, addr, offset);
6393 else
6395 /* Calculate the address from the system register. */
6396 rtx salt = GEN_INT (salt_type);
6397 addr = gen_reg_rtx (mode);
6398 if (mode == DImode)
6399 emit_insn (gen_reg_stack_protect_address_di (addr, salt));
6400 else
6402 emit_insn (gen_reg_stack_protect_address_si (addr, salt));
6403 addr = convert_memory_address (Pmode, addr);
6405 addr = plus_constant (Pmode, addr, aarch64_stack_protector_guard_offset);
6407 return gen_rtx_MEM (mode, force_reg (Pmode, addr));
6410 /* Emit an SVE predicated move from SRC to DEST. PRED is a predicate
6411 that is known to contain PTRUE. */
6413 void
6414 aarch64_emit_sve_pred_move (rtx dest, rtx pred, rtx src)
6416 expand_operand ops[3];
6417 machine_mode mode = GET_MODE (dest);
6418 create_output_operand (&ops[0], dest, mode);
6419 create_input_operand (&ops[1], pred, GET_MODE (pred));
6420 create_input_operand (&ops[2], src, mode);
6421 temporary_volatile_ok v (true);
6422 expand_insn (code_for_aarch64_pred_mov (mode), 3, ops);
6425 /* Expand a pre-RA SVE data move from SRC to DEST in which at least one
6426 operand is in memory. In this case we need to use the predicated LD1
6427 and ST1 instead of LDR and STR, both for correctness on big-endian
6428 targets and because LD1 and ST1 support a wider range of addressing modes.
6429 PRED_MODE is the mode of the predicate.
6431 See the comment at the head of aarch64-sve.md for details about the
6432 big-endian handling. */
6434 void
6435 aarch64_expand_sve_mem_move (rtx dest, rtx src, machine_mode pred_mode)
6437 machine_mode mode = GET_MODE (dest);
6438 rtx ptrue = aarch64_ptrue_reg (pred_mode);
6439 if (!register_operand (src, mode)
6440 && !register_operand (dest, mode))
6442 rtx tmp = gen_reg_rtx (mode);
6443 if (MEM_P (src))
6444 aarch64_emit_sve_pred_move (tmp, ptrue, src);
6445 else
6446 emit_move_insn (tmp, src);
6447 src = tmp;
6449 aarch64_emit_sve_pred_move (dest, ptrue, src);
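/* Neither LD1 nor ST1 can move data directly between two memory locations, which is why a memory-to-memory move is staged through the temporary register above. */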
6452 /* Called only on big-endian targets. See whether an SVE vector move
6453 from SRC to DEST is effectively a REV[BHW] instruction, because at
6454 least one operand is a subreg of an SVE vector that has wider or
6455 narrower elements. Return true and emit the instruction if so.
6457 For example:
6459 (set (reg:VNx8HI R1) (subreg:VNx8HI (reg:VNx16QI R2) 0))
6461 represents a VIEW_CONVERT between the following vectors, viewed
6462 in memory order:
6464 R2: { [0].high, [0].low, [1].high, [1].low, ... }
6465 R1: { [0], [1], [2], [3], ... }
6467 The high part of lane X in R2 should therefore correspond to lane X*2
6468 of R1, but the register representations are:
6470 msb lsb
6471 R2: ...... [1].high [1].low [0].high [0].low
6472 R1: ...... [3] [2] [1] [0]
6474 where the low part of lane X in R2 corresponds to lane X*2 in R1.
6475 We therefore need a reverse operation to swap the high and low values
6476 around.
6478 This is purely an optimization. Without it we would spill the
6479 subreg operand to the stack in one mode and reload it in the
6480 other mode, which has the same effect as the REV. */
6482 bool
6483 aarch64_maybe_expand_sve_subreg_move (rtx dest, rtx src)
6485 gcc_assert (BYTES_BIG_ENDIAN);
6487 /* Do not try to optimize subregs that LRA has created for matched
6488 reloads. These subregs only exist as a temporary measure to make
6489 the RTL well-formed, but they are exempt from the usual
6490 TARGET_CAN_CHANGE_MODE_CLASS rules.
6492 For example, if we have:
6494 (set (reg:VNx8HI R1) (foo:VNx8HI (reg:VNx4SI R2)))
6496 and the constraints require R1 and R2 to be in the same register,
6497 LRA may need to create RTL such as:
6499 (set (subreg:VNx4SI (reg:VNx8HI TMP) 0) (reg:VNx4SI R2))
6500 (set (reg:VNx8HI TMP) (foo:VNx8HI (subreg:VNx4SI (reg:VNx8HI TMP) 0)))
6501 (set (reg:VNx8HI R1) (reg:VNx8HI TMP))
6503 which forces both the input and output of the original instruction
6504 to use the same hard register. But for this to work, the normal
6505 rules have to be suppressed on the subreg input, otherwise LRA
6506 would need to reload that input too, meaning that the process
6507 would never terminate. To compensate for this, the normal rules
6508 are also suppressed for the subreg output of the first move.
6509 Ignoring the special case and handling the first move normally
6510 would therefore generate wrong code: we would reverse the elements
6511 for the first subreg but not reverse them back for the second subreg. */
6512 if (SUBREG_P (dest) && !LRA_SUBREG_P (dest))
6513 dest = SUBREG_REG (dest);
6514 if (SUBREG_P (src) && !LRA_SUBREG_P (src))
6515 src = SUBREG_REG (src);
6517 /* The optimization handles two single SVE REGs with different element
6518 sizes. */
6519 if (!REG_P (dest)
6520 || !REG_P (src)
6521 || aarch64_classify_vector_mode (GET_MODE (dest)) != VEC_SVE_DATA
6522 || aarch64_classify_vector_mode (GET_MODE (src)) != VEC_SVE_DATA
6523 || (GET_MODE_UNIT_SIZE (GET_MODE (dest))
6524 == GET_MODE_UNIT_SIZE (GET_MODE (src))))
6525 return false;
6527 /* Generate *aarch64_sve_mov<mode>_subreg_be. */
6528 rtx ptrue = aarch64_ptrue_reg (VNx16BImode);
6529 rtx unspec = gen_rtx_UNSPEC (GET_MODE (dest), gen_rtvec (2, ptrue, src),
6530 UNSPEC_REV_SUBREG);
6531 emit_insn (gen_rtx_SET (dest, unspec));
6532 return true;
6535 /* Return a copy of X with mode MODE, without changing its other
6536 attributes. Unlike gen_lowpart, this doesn't care whether the
6537 mode change is valid. */
6539 rtx
6540 aarch64_replace_reg_mode (rtx x, machine_mode mode)
6542 if (GET_MODE (x) == mode)
6543 return x;
6545 x = shallow_copy_rtx (x);
6546 set_mode_and_regno (x, mode, REGNO (x));
6547 return x;
6550 /* Return the SVE REV[BHW] unspec for reversing quantities of mode MODE
6551 stored in wider integer containers. */
6553 static unsigned int
6554 aarch64_sve_rev_unspec (machine_mode mode)
6556 switch (GET_MODE_UNIT_SIZE (mode))
6558 case 1: return UNSPEC_REVB;
6559 case 2: return UNSPEC_REVH;
6560 case 4: return UNSPEC_REVW;
6562 gcc_unreachable ();
6565 /* Split a *aarch64_sve_mov<mode>_subreg_be pattern with the given
6566 operands. */
6568 void
6569 aarch64_split_sve_subreg_move (rtx dest, rtx ptrue, rtx src)
6571 /* Decide which REV operation we need. The mode with wider elements
6572 determines the mode of the operands and the mode with the narrower
6573 elements determines the reverse width. */
6574 machine_mode mode_with_wider_elts = aarch64_sve_int_mode (GET_MODE (dest));
6575 machine_mode mode_with_narrower_elts = aarch64_sve_int_mode (GET_MODE (src));
6576 if (GET_MODE_UNIT_SIZE (mode_with_wider_elts)
6577 < GET_MODE_UNIT_SIZE (mode_with_narrower_elts))
6578 std::swap (mode_with_wider_elts, mode_with_narrower_elts);
6580 unsigned int unspec = aarch64_sve_rev_unspec (mode_with_narrower_elts);
6581 machine_mode pred_mode = aarch64_sve_pred_mode (mode_with_wider_elts);
6583 /* Get the operands in the appropriate modes and emit the instruction. */
6584 ptrue = gen_lowpart (pred_mode, ptrue);
6585 dest = aarch64_replace_reg_mode (dest, mode_with_wider_elts);
6586 src = aarch64_replace_reg_mode (src, mode_with_wider_elts);
6587 emit_insn (gen_aarch64_pred (unspec, mode_with_wider_elts,
6588 dest, ptrue, src));
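/* For example (illustrative): on big-endian targets, a move between a VNx8HI register and a VNx4SI subreg of it becomes a predicated "revh z0.s, p0/m, z1.s", reversing the 16-bit halves within each 32-bit container. */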
6591 static bool
6592 aarch64_function_ok_for_sibcall (tree, tree exp)
6594 auto from_abi = crtl->abi->id ();
6595 auto to_abi = expr_callee_abi (exp).id ();
6597 /* ARM_PCS_SVE preserves strictly more than ARM_PCS_SIMD, which in
6598 turn preserves strictly more than the base PCS. The callee must
6599 preserve everything that the caller is required to preserve. */
6600 if (from_abi != to_abi && to_abi == ARM_PCS_SVE)
6601 to_abi = ARM_PCS_SIMD;
6602 if (from_abi != to_abi && to_abi == ARM_PCS_SIMD)
6603 to_abi = ARM_PCS_AAPCS64;
6604 if (from_abi != to_abi)
6605 return false;
6607 tree fntype = TREE_TYPE (TREE_TYPE (CALL_EXPR_FN (exp)));
6608 if (aarch64_fntype_pstate_sm (fntype) & ~aarch64_cfun_incoming_pstate_sm ())
6609 return false;
6610 for (auto state : { "za", "zt0" })
6611 if (bool (aarch64_cfun_shared_flags (state))
6612 != bool (aarch64_fntype_shared_flags (fntype, state)))
6613 return false;
6615 /* BTI J is needed where indirect_return functions may return
6616 if bti is enabled there. */
6617 if (lookup_attribute ("indirect_return", TYPE_ATTRIBUTES (fntype))
6618 && !lookup_attribute ("indirect_return",
6619 TYPE_ATTRIBUTES (TREE_TYPE (cfun->decl))))
6620 return false;
6622 return true;
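/* At this point the callee preserves at least as much register state as the caller must, its streaming-mode and ZA/ZT0 requirements are compatible, and the indirect_return/BTI requirement is satisfied, so a sibling call is safe. */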
6625 /* Subroutine of aarch64_pass_by_reference for arguments that are not
6626 passed in SVE registers. */
6628 static bool
6629 aarch64_pass_by_reference_1 (CUMULATIVE_ARGS *pcum,
6630 const function_arg_info &arg)
6632 HOST_WIDE_INT size;
6633 machine_mode dummymode;
6634 int nregs;
6636 /* GET_MODE_SIZE (BLKmode) is useless since it is 0. */
6637 if (arg.mode == BLKmode && arg.type)
6638 size = int_size_in_bytes (arg.type);
6639 else
6640 /* No frontends can create types with variable-sized modes, so we
6641 shouldn't be asked to pass or return them. */
6642 size = GET_MODE_SIZE (arg.mode).to_constant ();
6644 /* Aggregates are passed by reference based on their size. */
6645 if (arg.aggregate_type_p ())
6646 size = int_size_in_bytes (arg.type);
6648 /* Variable sized arguments are always returned by reference. */
6649 if (size < 0)
6650 return true;
6652 /* Can this be a candidate to be passed in fp/simd register(s)? */
6653 if (aarch64_vfp_is_call_or_return_candidate (arg.mode, arg.type,
6654 &dummymode, &nregs, NULL,
6655 !pcum || pcum->silent_p))
6656 return false;
6658 /* Arguments which are variable sized or larger than 2 registers are
6659 passed by reference unless they are a homogeneous floating-point
6660 aggregate. */
6661 return size > 2 * UNITS_PER_WORD;
6664 /* Implement TARGET_PASS_BY_REFERENCE. */
6666 static bool
6667 aarch64_pass_by_reference (cumulative_args_t pcum_v,
6668 const function_arg_info &arg)
6670 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
6672 if (!arg.type)
6673 return aarch64_pass_by_reference_1 (pcum, arg);
6675 pure_scalable_type_info pst_info;
6676 switch (pst_info.analyze (arg.type))
6678 case pure_scalable_type_info::IS_PST:
6679 if (pcum && !pcum->silent_p && !TARGET_SVE)
6680 /* We can't gracefully recover at this point, so make this a
6681 fatal error. */
6682 fatal_error (input_location, "arguments of type %qT require"
6683 " the SVE ISA extension", arg.type);
6685 /* Variadic SVE types are passed by reference. Normal non-variadic
6686 arguments are too if we've run out of registers. */
6687 return (!arg.named
6688 || pcum->aapcs_nvrn + pst_info.num_zr () > NUM_FP_ARG_REGS
6689 || pcum->aapcs_nprn + pst_info.num_pr () > NUM_PR_ARG_REGS);
6691 case pure_scalable_type_info::DOESNT_MATTER:
6692 gcc_assert (aarch64_pass_by_reference_1 (pcum, arg));
6693 return true;
6695 case pure_scalable_type_info::NO_ABI_IDENTITY:
6696 case pure_scalable_type_info::ISNT_PST:
6697 return aarch64_pass_by_reference_1 (pcum, arg);
6699 gcc_unreachable ();
6702 /* Return TRUE if VALTYPE is padded to its least significant bits. */
6703 static bool
6704 aarch64_return_in_msb (const_tree valtype)
6706 machine_mode dummy_mode;
6707 int dummy_int;
6709 /* Never happens in little-endian mode. */
6710 if (!BYTES_BIG_ENDIAN)
6711 return false;
6713 /* Only composite types smaller than or equal to 16 bytes can
6714 be potentially returned in registers. */
6715 if (!aarch64_composite_type_p (valtype, TYPE_MODE (valtype))
6716 || int_size_in_bytes (valtype) <= 0
6717 || int_size_in_bytes (valtype) > 16)
6718 return false;
6720 /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
6721 or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
6722 is always passed/returned in the least significant bits of fp/simd
6723 register(s). */
6724 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype), valtype,
6725 &dummy_mode, &dummy_int, NULL,
6726 false))
6727 return false;
6729 /* Likewise pure scalable types for SVE vector and predicate registers. */
6730 pure_scalable_type_info pst_info;
6731 if (pst_info.analyze_registers (valtype))
6732 return false;
6734 return true;
6737 /* Implement TARGET_FUNCTION_VALUE.
6738 Define how to find the value returned by a function. */
6740 static rtx
6741 aarch64_function_value (const_tree type, const_tree func,
6742 bool outgoing ATTRIBUTE_UNUSED)
6744 machine_mode mode;
6745 int unsignedp;
6747 mode = TYPE_MODE (type);
6748 if (INTEGRAL_TYPE_P (type))
6749 mode = promote_function_mode (type, mode, &unsignedp, func, 1);
6751 pure_scalable_type_info pst_info;
6752 if (type && pst_info.analyze_registers (type))
6753 return pst_info.get_rtx (mode, V0_REGNUM, P0_REGNUM);
6755 /* Generic vectors that map to full SVE modes with -msve-vector-bits=N
6756 are returned in memory, not by value. */
6757 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
6758 bool sve_p = (vec_flags & VEC_ANY_SVE);
6760 if (aarch64_return_in_msb (type))
6762 HOST_WIDE_INT size = int_size_in_bytes (type);
6764 if (size % UNITS_PER_WORD != 0)
6766 size += UNITS_PER_WORD - size % UNITS_PER_WORD;
6767 mode = int_mode_for_size (size * BITS_PER_UNIT, 0).require ();
6771 int count;
6772 machine_mode ag_mode;
6773 if (aarch64_vfp_is_call_or_return_candidate (mode, type, &ag_mode, &count,
6774 NULL, false))
6776 gcc_assert (!sve_p);
6777 if (!aarch64_composite_type_p (type, mode))
6779 gcc_assert (count == 1 && mode == ag_mode);
6780 return gen_rtx_REG (mode, V0_REGNUM);
6782 else if (aarch64_advsimd_full_struct_mode_p (mode)
6783 && known_eq (GET_MODE_SIZE (ag_mode), 16))
6784 return gen_rtx_REG (mode, V0_REGNUM);
6785 else if (aarch64_advsimd_partial_struct_mode_p (mode)
6786 && known_eq (GET_MODE_SIZE (ag_mode), 8))
6787 return gen_rtx_REG (mode, V0_REGNUM);
6788 else
6790 int i;
6791 rtx par;
6793 par = gen_rtx_PARALLEL (mode, rtvec_alloc (count));
6794 for (i = 0; i < count; i++)
6796 rtx tmp = gen_rtx_REG (ag_mode, V0_REGNUM + i);
6797 rtx offset = gen_int_mode (i * GET_MODE_SIZE (ag_mode), Pmode);
6798 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
6799 XVECEXP (par, 0, i) = tmp;
6801 return par;
6804 else
6806 if (sve_p)
6808 /* Vector types can acquire a partial SVE mode using things like
6809 __attribute__((vector_size(N))), and this is potentially useful.
6810 However, the choice of mode doesn't affect the type's ABI
6811 identity, so we should treat the types as though they had
6812 the associated integer mode, just like they did before SVE
6813 was introduced.
6815 We know that the vector must be 128 bits or smaller,
6816 otherwise we'd have returned it in memory instead. */
6817 gcc_assert (type
6818 && (aarch64_some_values_include_pst_objects_p (type)
6819 || (vec_flags & VEC_PARTIAL)));
6821 scalar_int_mode int_mode = int_mode_for_mode (mode).require ();
6822 rtx reg = gen_rtx_REG (int_mode, R0_REGNUM);
6823 rtx pair = gen_rtx_EXPR_LIST (VOIDmode, reg, const0_rtx);
6824 return gen_rtx_PARALLEL (mode, gen_rtvec (1, pair));
6826 return gen_rtx_REG (mode, R0_REGNUM);
6830 /* Implements TARGET_FUNCTION_VALUE_REGNO_P.
6831 Return true if REGNO is the number of a hard register in which the values
6832 of called function may come back. */
6834 static bool
6835 aarch64_function_value_regno_p (const unsigned int regno)
6837 /* Maximum of 16 bytes can be returned in the general registers. Examples
6838 of 16-byte return values are: 128-bit integers and 16-byte small
6839 structures (excluding homogeneous floating-point aggregates). */
6840 if (regno == R0_REGNUM || regno == R1_REGNUM)
6841 return true;
6843 /* Up to four fp/simd registers can return a function value, e.g. a
6844 homogeneous floating-point aggregate having four members. */
6845 if (regno >= V0_REGNUM && regno < V0_REGNUM + HA_MAX_NUM_FLDS)
6846 return TARGET_FLOAT;
6848 if (regno >= P0_REGNUM && regno < P0_REGNUM + HA_MAX_NUM_FLDS)
6849 return TARGET_SVE;
6851 return false;
6854 /* Subroutine for aarch64_return_in_memory for types that are not returned
6855 in SVE registers. */
6857 static bool
6858 aarch64_return_in_memory_1 (const_tree type)
6860 HOST_WIDE_INT size;
6861 machine_mode ag_mode;
6862 int count;
6864 if (!AGGREGATE_TYPE_P (type)
6865 && TREE_CODE (type) != BITINT_TYPE
6866 && TREE_CODE (type) != COMPLEX_TYPE
6867 && TREE_CODE (type) != VECTOR_TYPE)
6868 /* Simple scalar types are always returned in registers. */
6869 return false;
6871 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type,
6872 &ag_mode, &count, NULL, false))
6873 return false;
6875 /* Types larger than 2 registers are returned in memory. */
6876 size = int_size_in_bytes (type);
6877 return (size < 0 || size > 2 * UNITS_PER_WORD);
6880 /* Implement TARGET_RETURN_IN_MEMORY.
6882 If the type T of the result of a function is such that
6883 void func (T arg)
6884 would require that arg be passed as a value in a register (or set of
6885 registers) according to the parameter passing rules, then the result
6886 is returned in the same registers as would be used for such an
6887 argument. */
6889 static bool
6890 aarch64_return_in_memory (const_tree type, const_tree fndecl ATTRIBUTE_UNUSED)
6892 pure_scalable_type_info pst_info;
6893 switch (pst_info.analyze (type))
6895 case pure_scalable_type_info::IS_PST:
6896 return (pst_info.num_zr () > NUM_FP_ARG_REGS
6897 || pst_info.num_pr () > NUM_PR_ARG_REGS);
6899 case pure_scalable_type_info::DOESNT_MATTER:
6900 gcc_assert (aarch64_return_in_memory_1 (type));
6901 return true;
6903 case pure_scalable_type_info::NO_ABI_IDENTITY:
6904 case pure_scalable_type_info::ISNT_PST:
6905 return aarch64_return_in_memory_1 (type);
6907 gcc_unreachable ();
6910 static bool
6911 aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v, machine_mode mode,
6912 const_tree type, int *nregs)
6914 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
6915 return aarch64_vfp_is_call_or_return_candidate (mode, type,
6916 &pcum->aapcs_vfp_rmode,
6917 nregs, NULL, pcum->silent_p);
6920 /* Given MODE and TYPE of a function argument, return the alignment in
6921 bits. The idea is to suppress any stronger alignment requested by
6922 the user and opt for the natural alignment (specified in AAPCS64 \S
6923 4.1). ABI_BREAK_GCC_9 is set to the old alignment if the alignment
6924 was incorrectly calculated in versions of GCC prior to GCC 9.
6925 ABI_BREAK_GCC_13 is set to the old alignment if it was incorrectly
6926 calculated in versions between GCC 9 and GCC 13. If the alignment
6927 might have changed between GCC 13 and GCC 14, ABI_BREAK_GCC_14
6928 is the old GCC 13 alignment, otherwise it is zero.
6930 This is a helper function for local use only. */
6932 static unsigned int
6933 aarch64_function_arg_alignment (machine_mode mode, const_tree type,
6934 unsigned int *abi_break_gcc_9,
6935 unsigned int *abi_break_gcc_13,
6936 unsigned int *abi_break_gcc_14)
6938 *abi_break_gcc_9 = 0;
6939 *abi_break_gcc_13 = 0;
6940 *abi_break_gcc_14 = 0;
6941 if (!type)
6942 return GET_MODE_ALIGNMENT (mode);
6944 if (integer_zerop (TYPE_SIZE (type)))
6945 return 0;
6947 gcc_assert (TYPE_MODE (type) == mode);
6949 if (!AGGREGATE_TYPE_P (type))
6951 /* The ABI alignment is the natural alignment of the type, without
6952 any attributes applied. Normally this is the alignment of the
6953 TYPE_MAIN_VARIANT, but not always; see PR108910 for a counterexample.
6954 For now we just handle the known exceptions explicitly. */
6955 type = TYPE_MAIN_VARIANT (type);
6956 if (POINTER_TYPE_P (type))
6958 gcc_assert (known_eq (POINTER_SIZE, GET_MODE_BITSIZE (mode)));
6959 return POINTER_SIZE;
6961 if (TREE_CODE (type) == ENUMERAL_TYPE && TREE_TYPE (type))
6963 *abi_break_gcc_14 = TYPE_ALIGN (type);
6964 type = TYPE_MAIN_VARIANT (TREE_TYPE (type));
6966 gcc_assert (!TYPE_USER_ALIGN (type));
6967 return TYPE_ALIGN (type);
6970 if (TREE_CODE (type) == ARRAY_TYPE)
6971 return TYPE_ALIGN (TREE_TYPE (type));
6973 unsigned int alignment = 0;
6974 unsigned int bitfield_alignment_with_packed = 0;
6975 unsigned int bitfield_alignment = 0;
6976 for (tree field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
6977 if (TREE_CODE (field) == FIELD_DECL)
6979 /* Note that we explicitly consider zero-sized fields here,
6980 even though they don't map to AAPCS64 machine types.
6981 For example, in:
6983 struct __attribute__((aligned(8))) empty {};
6985 struct s {
6986 [[no_unique_address]] empty e;
6987 int x;
6990 "s" contains only one Fundamental Data Type (the int field)
6991 but gains 8-byte alignment and size thanks to "e". */
6992 alignment = std::max (alignment, DECL_ALIGN (field));
6993 if (DECL_BIT_FIELD_TYPE (field))
6995 /* Take the bit-field type's alignment into account only
6996 if the user didn't reduce this field's alignment with
6997 the packed attribute. */
6998 if (!DECL_PACKED (field))
6999 bitfield_alignment
7000 = std::max (bitfield_alignment,
7001 TYPE_ALIGN (DECL_BIT_FIELD_TYPE (field)));
7003 /* Compute the alignment even if the bit-field is
7004 packed, so that we can emit a warning in case the
7005 alignment changed between GCC versions. */
7006 bitfield_alignment_with_packed
7007 = std::max (bitfield_alignment_with_packed,
7008 TYPE_ALIGN (DECL_BIT_FIELD_TYPE (field)));
7012 /* Emit a warning if the alignment is different when taking the
7013 'packed' attribute into account. */
7014 if (bitfield_alignment != bitfield_alignment_with_packed
7015 && bitfield_alignment_with_packed > alignment)
7016 *abi_break_gcc_13 = bitfield_alignment_with_packed;
7018 if (bitfield_alignment > alignment)
7020 *abi_break_gcc_9 = alignment;
7021 return bitfield_alignment;
7024 return alignment;
7027 /* Return true if TYPE describes a _BitInt(N) or an aggregate that uses the
7028 _BitInt(N) type. These include ARRAY_TYPE's with an element that is a
7029 _BitInt(N) or an aggregate that uses it, and a RECORD_TYPE or a UNION_TYPE
7030 with a field member that is a _BitInt(N) or an aggregate that uses it.
7031 Return false otherwise. */
7033 static bool
7034 bitint_or_aggr_of_bitint_p (tree type)
7036 if (!type)
7037 return false;
7039 if (TREE_CODE (type) == BITINT_TYPE)
7040 return true;
7042 /* If ARRAY_TYPE, check its element type. */
7043 if (TREE_CODE (type) == ARRAY_TYPE)
7044 return bitint_or_aggr_of_bitint_p (TREE_TYPE (type));
7046 /* If RECORD_TYPE or UNION_TYPE, check the fields' types. */
7047 if (RECORD_OR_UNION_TYPE_P (type))
7048 for (tree field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
7050 if (TREE_CODE (field) != FIELD_DECL)
7051 continue;
7052 if (bitint_or_aggr_of_bitint_p (TREE_TYPE (field)))
7053 return true;
7055 return false;
7058 /* Layout a function argument according to the AAPCS64 rules. The rule
7059 numbers refer to the rule numbers in the AAPCS64. ORIG_MODE is the
7060 mode that was originally given to us by the target hook, whereas the
7061 mode in ARG might be the result of replacing partial SVE modes with
7062 the equivalent integer mode. */
7064 static void
7065 aarch64_layout_arg (cumulative_args_t pcum_v, const function_arg_info &arg)
7067 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
7068 tree type = arg.type;
7069 machine_mode mode = arg.mode;
7070 int ncrn, nvrn, nregs;
7071 bool allocate_ncrn, allocate_nvrn;
7072 HOST_WIDE_INT size;
7073 unsigned int abi_break_gcc_9;
7074 unsigned int abi_break_gcc_13;
7075 unsigned int abi_break_gcc_14;
7077 /* We need to do this once per argument. */
7078 if (pcum->aapcs_arg_processed)
7079 return;
7081 bool warn_pcs_change
7082 = (warn_psabi
7083 && !pcum->silent_p
7084 && (currently_expanding_function_start
7085 || currently_expanding_gimple_stmt));
7087 /* HFAs and HVAs can have an alignment greater than 16 bytes. For example:
7089 typedef struct foo {
7090 __Int8x16_t foo[2] __attribute__((aligned(32)));
7091 } foo;
7093 is still an HVA despite its larger-than-normal alignment.
7094 However, such over-aligned HFAs and HVAs are guaranteed to have
7095 no padding.
7097 If we exclude HFAs and HVAs from the discussion below, then there
7098 are several things to note:
7100 - Both the C and AAPCS64 interpretations of a type's alignment should
7101 give a value that is no greater than the type's size.
7103 - Types bigger than 16 bytes are passed indirectly.
7105 - If an argument of type T is passed indirectly, TYPE and MODE describe
7106 a pointer to T rather than T itself.
7108 It follows that the AAPCS64 alignment of TYPE must be no greater
7109 than 16 bytes.
7111 Versions prior to GCC 9.1 ignored a bitfield's underlying type
7112 and so could calculate an alignment that was too small. If this
7113 happened for TYPE then ABI_BREAK_GCC_9 is this older, too-small alignment.
7115 Although GCC 9.1 fixed that bug, it introduced a different one:
7116 it would consider the alignment of a bitfield's underlying type even
7117 if the field was packed (which should have the effect of overriding
7118 the alignment of the underlying type). This was fixed in GCC 13.1.
7120 As a result of this bug, GCC 9 to GCC 12 could calculate an alignment
7121 that was too big. If this happened for TYPE, ABI_BREAK_GCC_13 is
7122 this older, too-big alignment.
7124 Also, the fact that GCC 9 to GCC 12 considered irrelevant
7125 alignments meant they could calculate type alignments that were
7126 bigger than the type's size, contrary to the assumption above.
7127 The handling of register arguments was nevertheless (and justifiably)
7128 written to follow the assumption that the alignment can never be
7129 greater than the size. The same was not true for stack arguments;
7130 their alignment was instead handled by MIN bounds in
7131 aarch64_function_arg_boundary.
7133 The net effect is that, if GCC 9 to GCC 12 incorrectly calculated
7134 an alignment of more than 16 bytes for TYPE then:
7136 - If the argument was passed in registers, these GCC versions
7137 would treat the alignment as though it was *less than* 16 bytes.
7139 - If the argument was passed on the stack, these GCC versions
7140 would treat the alignment as though it was *equal to* 16 bytes.
7142 Both behaviors were wrong, but in different cases. */
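/* As a rough illustration (the type below is purely illustrative):

     struct bf { __int128 x : 64; };

   has an AAPCS64 alignment of 16 bytes once the bit-field's underlying
   __int128 type is taken into account, whereas GCC versions before 9.1
   ignored that type and computed a smaller alignment (the
   ABI_BREAK_GCC_9 case).  Conversely, packing such a field should
   remove the underlying type's influence, but GCC 9 to GCC 12 still
   honoured it (the ABI_BREAK_GCC_13 case).  */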
7144 pcum->aapcs_arg_processed = true;
7146 pure_scalable_type_info pst_info;
7147 if (type && pst_info.analyze_registers (type))
7149 /* aarch64_function_arg_alignment has never had an effect on
7150 this case. */
7152 /* The PCS says that it is invalid to pass an SVE value to an
7153 unprototyped function. There is no ABI-defined location we
7154 can return in this case, so we have no real choice but to raise
7155 an error immediately, even though this is only a query function. */
7156 if (arg.named && pcum->pcs_variant != ARM_PCS_SVE)
7158 gcc_assert (!pcum->silent_p);
7159 error ("SVE type %qT cannot be passed to an unprototyped function",
7160 arg.type);
7161 /* Avoid repeating the message, and avoid tripping the assert
7162 below. */
7163 pcum->pcs_variant = ARM_PCS_SVE;
7166 /* We would have converted the argument into pass-by-reference
7167 form if it didn't fit in registers. */
7168 pcum->aapcs_nextnvrn = pcum->aapcs_nvrn + pst_info.num_zr ();
7169 pcum->aapcs_nextnprn = pcum->aapcs_nprn + pst_info.num_pr ();
7170 gcc_assert (arg.named
7171 && pcum->pcs_variant == ARM_PCS_SVE
7172 && pcum->aapcs_nextnvrn <= NUM_FP_ARG_REGS
7173 && pcum->aapcs_nextnprn <= NUM_PR_ARG_REGS);
7174 pcum->aapcs_reg = pst_info.get_rtx (mode, V0_REGNUM + pcum->aapcs_nvrn,
7175 P0_REGNUM + pcum->aapcs_nprn);
7176 return;
7179 /* Generic vectors that map to full SVE modes with -msve-vector-bits=N
7180 are passed by reference, not by value. */
7181 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
7182 bool sve_p = (vec_flags & VEC_ANY_SVE);
7183 if (sve_p)
7184 /* Vector types can acquire a partial SVE mode using things like
7185 __attribute__((vector_size(N))), and this is potentially useful.
7186 However, the choice of mode doesn't affect the type's ABI
7187 identity, so we should treat the types as though they had
7188 the associated integer mode, just like they did before SVE
7189 was introduced.
7191 We know that the vector must be 128 bits or smaller,
7192 otherwise we'd have passed it in memory instead. */
7193 gcc_assert (type
7194 && (aarch64_some_values_include_pst_objects_p (type)
7195 || (vec_flags & VEC_PARTIAL)));
7197 /* Size in bytes, rounded up to the nearest multiple of 8 bytes. */
7198 if (type)
7199 size = int_size_in_bytes (type);
7200 else
7201 /* No frontends can create types with variable-sized modes, so we
7202 shouldn't be asked to pass or return them. */
7203 size = GET_MODE_SIZE (mode).to_constant ();
7204 size = ROUND_UP (size, UNITS_PER_WORD);
7206 allocate_ncrn = (type) ? !(FLOAT_TYPE_P (type)) : !FLOAT_MODE_P (mode);
7207 allocate_nvrn = aarch64_vfp_is_call_candidate (pcum_v,
7208 mode,
7209 type,
7210 &nregs);
7211 gcc_assert (!sve_p || !allocate_nvrn);
7213 unsigned int alignment
7214 = aarch64_function_arg_alignment (mode, type, &abi_break_gcc_9,
7215 &abi_break_gcc_13, &abi_break_gcc_14);
7217 gcc_assert ((allocate_nvrn || alignment <= 16 * BITS_PER_UNIT)
7218 && (!alignment || abi_break_gcc_9 < alignment)
7219 && (!abi_break_gcc_13 || alignment < abi_break_gcc_13));
7221 /* _BitInt(N) was only added in GCC 14. */
7222 bool warn_pcs_change_le_gcc14
7223 = warn_pcs_change && !bitint_or_aggr_of_bitint_p (type);
7225 /* allocate_ncrn may be a false positive, but allocate_nvrn is quite reliable.
7226 The following code thus handles passing by SIMD/FP registers first. */
7228 nvrn = pcum->aapcs_nvrn;
7230 /* C1 - C5 for floating point, homogeneous floating-point aggregates (HFA)
7231 and homogeneous short-vector aggregates (HVA). */
7232 if (allocate_nvrn)
7234 /* aarch64_function_arg_alignment has never had an effect on
7235 this case. */
7236 if (!pcum->silent_p && !TARGET_FLOAT)
7237 aarch64_err_no_fpadvsimd (mode);
7239 if (nvrn + nregs <= NUM_FP_ARG_REGS)
7241 pcum->aapcs_nextnvrn = nvrn + nregs;
7242 if (!aarch64_composite_type_p (type, mode))
7244 gcc_assert (nregs == 1);
7245 pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
7247 else if (aarch64_advsimd_full_struct_mode_p (mode)
7248 && known_eq (GET_MODE_SIZE (pcum->aapcs_vfp_rmode), 16))
7249 pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
7250 else if (aarch64_advsimd_partial_struct_mode_p (mode)
7251 && known_eq (GET_MODE_SIZE (pcum->aapcs_vfp_rmode), 8))
7252 pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
7253 else
7255 rtx par;
7256 int i;
7257 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
7258 for (i = 0; i < nregs; i++)
7260 rtx tmp = gen_rtx_REG (pcum->aapcs_vfp_rmode,
7261 V0_REGNUM + nvrn + i);
7262 rtx offset = gen_int_mode
7263 (i * GET_MODE_SIZE (pcum->aapcs_vfp_rmode), Pmode);
7264 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
7265 XVECEXP (par, 0, i) = tmp;
7267 pcum->aapcs_reg = par;
7269 return;
7271 else
7273 /* C.3 NSRN is set to 8. */
7274 pcum->aapcs_nextnvrn = NUM_FP_ARG_REGS;
7275 goto on_stack;
7279 ncrn = pcum->aapcs_ncrn;
7280 nregs = size / UNITS_PER_WORD;
7282 /* C6 - C9, though the sign and zero extension semantics are
7283 handled elsewhere. This is the case where the argument fits
7284 entirely in general registers. */
7285 if (allocate_ncrn && (ncrn + nregs <= NUM_ARG_REGS))
7287 gcc_assert (nregs == 0 || nregs == 1 || nregs == 2);
7289 /* C.8 if the argument has an alignment of 16 then the NGRN is
7290 rounded up to the next even number. */
7291 if (nregs == 2
7292 && ncrn % 2)
7294 /* Emit a warning if the alignment changed when taking the
7295 'packed' attribute into account. */
7296 if (warn_pcs_change_le_gcc14
7297 && abi_break_gcc_13
7298 && ((abi_break_gcc_13 == 16 * BITS_PER_UNIT)
7299 != (alignment == 16 * BITS_PER_UNIT)))
7300 inform (input_location, "parameter passing for argument of type "
7301 "%qT changed in GCC 13.1", type);
7303 if (warn_pcs_change_le_gcc14
7304 && abi_break_gcc_14
7305 && ((abi_break_gcc_14 == 16 * BITS_PER_UNIT)
7306 != (alignment == 16 * BITS_PER_UNIT)))
7307 inform (input_location, "parameter passing for argument of type "
7308 "%qT changed in GCC 14.1", type);
7310 /* The == 16 * BITS_PER_UNIT instead of >= 16 * BITS_PER_UNIT
7311 comparison is there because for > 16 * BITS_PER_UNIT
7312 alignment nregs should be > 2 and therefore it should be
7313 passed by reference rather than value. */
7314 if (alignment == 16 * BITS_PER_UNIT)
7316 if (warn_pcs_change_le_gcc14
7317 && abi_break_gcc_9)
7318 inform (input_location, "parameter passing for argument of type "
7319 "%qT changed in GCC 9.1", type);
7320 ++ncrn;
7321 gcc_assert (ncrn + nregs <= NUM_ARG_REGS);
7325 /* If an argument with an SVE mode needs to be shifted up to the
7326 high part of the register, treat it as though it had an integer mode.
7327 Using the normal (parallel [...]) would suppress the shifting. */
7328 if (sve_p
7329 && BYTES_BIG_ENDIAN
7330 && maybe_ne (GET_MODE_SIZE (mode), nregs * UNITS_PER_WORD)
7331 && aarch64_pad_reg_upward (mode, type, false))
7333 mode = int_mode_for_mode (mode).require ();
7334 sve_p = false;
7337 /* NREGS can be 0 when e.g. an empty structure is to be passed.
7338 A reg is still generated for it, but the caller should be smart
7339 enough not to use it. */
7340 if (nregs == 0
7341 || (nregs == 1 && !sve_p)
7342 || GET_MODE_CLASS (mode) == MODE_INT)
7343 pcum->aapcs_reg = gen_rtx_REG (mode, R0_REGNUM + ncrn);
7344 else
7346 rtx par;
7347 int i;
7349 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
7350 for (i = 0; i < nregs; i++)
7352 scalar_int_mode reg_mode = word_mode;
7353 if (nregs == 1)
7354 reg_mode = int_mode_for_mode (mode).require ();
7355 rtx tmp = gen_rtx_REG (reg_mode, R0_REGNUM + ncrn + i);
7356 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
7357 GEN_INT (i * UNITS_PER_WORD));
7358 XVECEXP (par, 0, i) = tmp;
7360 pcum->aapcs_reg = par;
7363 pcum->aapcs_nextncrn = ncrn + nregs;
7364 return;
7367 /* C.11 */
7368 pcum->aapcs_nextncrn = NUM_ARG_REGS;
7370 /* The argument is passed on the stack; record the needed number of words
7371 for this argument and align the total size if necessary. */
7372 on_stack:
7373 pcum->aapcs_stack_words = size / UNITS_PER_WORD;
7375 if (warn_pcs_change_le_gcc14
7376 && abi_break_gcc_13
7377 && ((abi_break_gcc_13 >= 16 * BITS_PER_UNIT)
7378 != (alignment >= 16 * BITS_PER_UNIT)))
7379 inform (input_location, "parameter passing for argument of type "
7380 "%qT changed in GCC 13.1", type);
7382 if (warn_pcs_change_le_gcc14
7383 && abi_break_gcc_14
7384 && ((abi_break_gcc_14 >= 16 * BITS_PER_UNIT)
7385 != (alignment >= 16 * BITS_PER_UNIT)))
7386 inform (input_location, "parameter passing for argument of type "
7387 "%qT changed in GCC 14.1", type);
7389 if (alignment == 16 * BITS_PER_UNIT)
7391 int new_size = ROUND_UP (pcum->aapcs_stack_size, 16 / UNITS_PER_WORD);
7392 if (pcum->aapcs_stack_size != new_size)
7394 if (warn_pcs_change_le_gcc14
7395 && abi_break_gcc_9)
7396 inform (input_location, "parameter passing for argument of type "
7397 "%qT changed in GCC 9.1", type);
7398 pcum->aapcs_stack_size = new_size;
7401 return;
7404 /* Add the current argument register to the set of those that need
7405 to be saved and restored around a change to PSTATE.SM. */
7407 static void
7408 aarch64_record_sme_mode_switch_args (CUMULATIVE_ARGS *pcum)
7410 subrtx_var_iterator::array_type array;
7411 FOR_EACH_SUBRTX_VAR (iter, array, pcum->aapcs_reg, NONCONST)
7413 rtx x = *iter;
7414 if (REG_P (x) && (FP_REGNUM_P (REGNO (x)) || PR_REGNUM_P (REGNO (x))))
7416 unsigned int i = pcum->num_sme_mode_switch_args++;
7417 gcc_assert (i < ARRAY_SIZE (pcum->sme_mode_switch_args));
7418 pcum->sme_mode_switch_args[i] = x;
7423 /* Return a parallel that contains all the registers that need to be
7424 saved around a change to PSTATE.SM. Return const0_rtx if there is
7425 no such mode switch, or if no registers need to be saved. */
7427 static rtx
7428 aarch64_finish_sme_mode_switch_args (CUMULATIVE_ARGS *pcum)
7430 if (!pcum->num_sme_mode_switch_args)
7431 return const0_rtx;
7433 auto argvec = gen_rtvec_v (pcum->num_sme_mode_switch_args,
7434 pcum->sme_mode_switch_args);
7435 return gen_rtx_PARALLEL (VOIDmode, argvec);
7438 /* Implement TARGET_FUNCTION_ARG. */
7440 static rtx
7441 aarch64_function_arg (cumulative_args_t pcum_v, const function_arg_info &arg)
7443 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
7444 gcc_assert (pcum->pcs_variant == ARM_PCS_AAPCS64
7445 || pcum->pcs_variant == ARM_PCS_SIMD
7446 || pcum->pcs_variant == ARM_PCS_SVE);
7448 if (arg.end_marker_p ())
7450 rtx abi_cookie = aarch64_gen_callee_cookie (pcum->isa_mode,
7451 pcum->pcs_variant,
7452 pcum->indirect_return);
7453 rtx sme_mode_switch_args = aarch64_finish_sme_mode_switch_args (pcum);
7454 rtx shared_za_flags = gen_int_mode (pcum->shared_za_flags, SImode);
7455 rtx shared_zt0_flags = gen_int_mode (pcum->shared_zt0_flags, SImode);
7456 return gen_rtx_PARALLEL (VOIDmode, gen_rtvec (4, abi_cookie,
7457 sme_mode_switch_args,
7458 shared_za_flags,
7459 shared_zt0_flags));
7462 aarch64_layout_arg (pcum_v, arg);
7463 return pcum->aapcs_reg;
7466 void
7467 aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum,
7468 const_tree fntype,
7469 rtx libname ATTRIBUTE_UNUSED,
7470 const_tree fndecl,
7471 unsigned n_named ATTRIBUTE_UNUSED,
7472 bool silent_p)
7474 pcum->aapcs_ncrn = 0;
7475 pcum->aapcs_nvrn = 0;
7476 pcum->aapcs_nprn = 0;
7477 pcum->aapcs_nextncrn = 0;
7478 pcum->aapcs_nextnvrn = 0;
7479 pcum->aapcs_nextnprn = 0;
7480 if (fntype)
7482 pcum->pcs_variant = (arm_pcs) fntype_abi (fntype).id ();
7483 pcum->isa_mode = aarch64_fntype_isa_mode (fntype);
7484 pcum->indirect_return = lookup_attribute ("indirect_return",
7485 TYPE_ATTRIBUTES (fntype));
7487 else
7489 pcum->pcs_variant = ARM_PCS_AAPCS64;
7490 pcum->isa_mode = AARCH64_DEFAULT_ISA_MODE;
7491 pcum->indirect_return = false;
7493 pcum->aapcs_reg = NULL_RTX;
7494 pcum->aapcs_arg_processed = false;
7495 pcum->aapcs_stack_words = 0;
7496 pcum->aapcs_stack_size = 0;
7497 pcum->silent_p = silent_p;
7498 pcum->shared_za_flags
7499 = (fntype ? aarch64_fntype_shared_flags (fntype, "za") : 0U);
7500 pcum->shared_zt0_flags
7501 = (fntype ? aarch64_fntype_shared_flags (fntype, "zt0") : 0U);
7502 pcum->num_sme_mode_switch_args = 0;
7504 if (!silent_p
7505 && !TARGET_FLOAT
7506 && fntype && fntype != error_mark_node)
7508 const_tree type = TREE_TYPE (fntype);
7509 machine_mode mode ATTRIBUTE_UNUSED; /* To pass pointer as argument. */
7510 int nregs ATTRIBUTE_UNUSED; /* Likewise. */
7511 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type,
7512 &mode, &nregs, NULL, false))
7513 aarch64_err_no_fpadvsimd (TYPE_MODE (type));
7516 if (!silent_p
7517 && !TARGET_SVE
7518 && pcum->pcs_variant == ARM_PCS_SVE)
7520 /* We can't gracefully recover at this point, so make this a
7521 fatal error. */
7522 if (fndecl)
7523 fatal_error (input_location, "%qE requires the SVE ISA extension",
7524 fndecl);
7525 else
7526 fatal_error (input_location, "calls to functions of type %qT require"
7527 " the SVE ISA extension", fntype);
7531 static void
7532 aarch64_function_arg_advance (cumulative_args_t pcum_v,
7533 const function_arg_info &arg)
7535 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
7536 if (pcum->pcs_variant == ARM_PCS_AAPCS64
7537 || pcum->pcs_variant == ARM_PCS_SIMD
7538 || pcum->pcs_variant == ARM_PCS_SVE)
7540 aarch64_layout_arg (pcum_v, arg);
7541 gcc_assert ((pcum->aapcs_reg != NULL_RTX)
7542 != (pcum->aapcs_stack_words != 0));
7543 if (pcum->aapcs_reg
7544 && aarch64_call_switches_pstate_sm (pcum->isa_mode))
7545 aarch64_record_sme_mode_switch_args (pcum);
7547 pcum->aapcs_arg_processed = false;
7548 pcum->aapcs_ncrn = pcum->aapcs_nextncrn;
7549 pcum->aapcs_nvrn = pcum->aapcs_nextnvrn;
7550 pcum->aapcs_nprn = pcum->aapcs_nextnprn;
7551 pcum->aapcs_stack_size += pcum->aapcs_stack_words;
7552 pcum->aapcs_stack_words = 0;
7553 pcum->aapcs_reg = NULL_RTX;
7557 bool
7558 aarch64_function_arg_regno_p (unsigned regno)
7560 return ((GP_REGNUM_P (regno) && regno < R0_REGNUM + NUM_ARG_REGS)
7561 || (FP_REGNUM_P (regno) && regno < V0_REGNUM + NUM_FP_ARG_REGS)
7562 || (PR_REGNUM_P (regno) && regno < P0_REGNUM + NUM_PR_ARG_REGS));
7565 /* Implement FUNCTION_ARG_BOUNDARY. Every parameter gets at least
7566 PARM_BOUNDARY bits of alignment, but will be given anything up
7567 to STACK_BOUNDARY bits if the type requires it. This makes sure
7568 that both before and after the layout of each argument, the Next
7569 Stacked Argument Address (NSAA) will have a minimum alignment of
7570 8 bytes. */
7572 static unsigned int
7573 aarch64_function_arg_boundary (machine_mode mode, const_tree type)
7575 unsigned int abi_break_gcc_9;
7576 unsigned int abi_break_gcc_13;
7577 unsigned int abi_break_gcc_14;
7578 unsigned int alignment = aarch64_function_arg_alignment (mode, type,
7579 &abi_break_gcc_9,
7580 &abi_break_gcc_13,
7581 &abi_break_gcc_14);
7582 /* We rely on aarch64_layout_arg and aarch64_gimplify_va_arg_expr
7583 to emit warnings about ABI incompatibility. */
7584 alignment = MIN (MAX (alignment, PARM_BOUNDARY), STACK_BOUNDARY);
7585 return alignment;
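/* For example, a plain "char" argument still gets a full 64-bit
   (PARM_BOUNDARY) stack slot, while even a heavily over-aligned type is
   never given more than 128-bit (STACK_BOUNDARY) alignment here.  */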
7588 /* Implement TARGET_GET_RAW_RESULT_MODE and TARGET_GET_RAW_ARG_MODE. */
7590 static fixed_size_mode
7591 aarch64_get_reg_raw_mode (int regno)
7593 /* Don't use any non-GP registers for __builtin_apply and
7594 __builtin_return if general registers only mode is requested. */
7595 if (TARGET_GENERAL_REGS_ONLY && !GP_REGNUM_P (regno))
7596 return as_a <fixed_size_mode> (VOIDmode);
7597 if (TARGET_SVE && FP_REGNUM_P (regno))
7598 /* Don't use the SVE part of the register for __builtin_apply and
7599 __builtin_return. The SVE registers aren't used by the normal PCS,
7600 so using them there would be a waste of time. The PCS extensions
7601 for SVE types are fundamentally incompatible with the
7602 __builtin_return/__builtin_apply interface. */
7603 return as_a <fixed_size_mode> (V16QImode);
7604 if (PR_REGNUM_P (regno))
7605 /* For SVE PR regs, indicate that they should be ignored for
7606 __builtin_apply/__builtin_return. */
7607 return as_a <fixed_size_mode> (VOIDmode);
7608 return default_get_reg_raw_mode (regno);
7611 /* Implement TARGET_FUNCTION_ARG_PADDING.
7613 Small aggregate types are placed at the lowest memory address.
7615 The related parameter passing rules are B.4, C.3, C.5 and C.14. */
7617 static pad_direction
7618 aarch64_function_arg_padding (machine_mode mode, const_tree type)
7620 /* On little-endian targets, the least significant byte of every stack
7621 argument is passed at the lowest byte address of the stack slot. */
7622 if (!BYTES_BIG_ENDIAN)
7623 return PAD_UPWARD;
7625 /* Otherwise, integral, floating-point and pointer types are padded downward:
7626 the least significant byte of a stack argument is passed at the highest
7627 byte address of the stack slot. */
7628 if (type
7629 ? (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type)
7630 || POINTER_TYPE_P (type))
7631 : (SCALAR_INT_MODE_P (mode) || SCALAR_FLOAT_MODE_P (mode)))
7632 return PAD_DOWNWARD;
7634 /* Everything else is padded upward, i.e. data goes in the first byte of the stack slot. */
7635 return PAD_UPWARD;
7638 /* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
7640 It specifies padding for the last (possibly the only)
7641 element of a block move between registers and memory. Assuming
7642 the block is in memory, padding upward means that the last
7643 element is padded after its most significant byte, while with
7644 downward padding the last element is padded on its least
7645 significant byte side.
7647 Small aggregates and small complex types are always padded
7648 upwards.
7650 We don't need to worry about homogeneous floating-point or
7651 short-vector aggregates; their move is not affected by the
7652 padding direction determined here. Regardless of endianness,
7653 each element of such an aggregate is put in the least
7654 significant bits of a fp/simd register.
7656 Return !BYTES_BIG_ENDIAN if the least significant byte of the
7657 register has useful data, and return the opposite if the most
7658 significant byte does. */
7660 bool
7661 aarch64_pad_reg_upward (machine_mode mode, const_tree type,
7662 bool first ATTRIBUTE_UNUSED)
7665 /* Aside from pure scalable types, small composite types are always
7666 padded upward. */
7667 if (BYTES_BIG_ENDIAN && aarch64_composite_type_p (type, mode))
7669 HOST_WIDE_INT size;
7670 if (type)
7671 size = int_size_in_bytes (type);
7672 else
7673 /* No frontends can create types with variable-sized modes, so we
7674 shouldn't be asked to pass or return them. */
7675 size = GET_MODE_SIZE (mode).to_constant ();
7676 if (size < 2 * UNITS_PER_WORD)
7678 pure_scalable_type_info pst_info;
7679 if (pst_info.analyze_registers (type))
7680 return false;
7681 return true;
7685 /* Otherwise, use the default padding. */
7686 return !BYTES_BIG_ENDIAN;
7689 static scalar_int_mode
7690 aarch64_libgcc_cmp_return_mode (void)
7692 return SImode;
7695 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
7697 /* We use the 12-bit shifted immediate arithmetic instructions so values
7698 must be a multiple of (1 << 12), i.e. 4096. */
7699 #define ARITH_FACTOR 4096
7701 #if (PROBE_INTERVAL % ARITH_FACTOR) != 0
7702 #error Cannot use simple address calculation for stack probing
7703 #endif
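/* For instance, a step of 4096 bytes can be applied with a single
   "sub x9, x9, #4096" (the immediate encodes as 1, LSL #12; the
   register number is illustrative), whereas a step that is not a
   multiple of 4096 would need additional instructions.  */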
7705 /* Emit code to probe a range of stack addresses from FIRST to FIRST+POLY_SIZE,
7706 inclusive. These are offsets from the current stack pointer. */
7708 static void
7709 aarch64_emit_probe_stack_range (HOST_WIDE_INT first, poly_int64 poly_size)
7711 HOST_WIDE_INT size;
7712 if (!poly_size.is_constant (&size))
7714 sorry ("stack probes for SVE frames");
7715 return;
7718 rtx reg1 = gen_rtx_REG (Pmode, PROBE_STACK_FIRST_REGNUM);
7720 /* See the same assertion on PROBE_INTERVAL above. */
7721 gcc_assert ((first % ARITH_FACTOR) == 0);
7723 /* See if we have a constant small number of probes to generate. If so,
7724 that's the easy case. */
7725 if (size <= PROBE_INTERVAL)
7727 const HOST_WIDE_INT base = ROUND_UP (size, ARITH_FACTOR);
7729 emit_set_insn (reg1,
7730 plus_constant (Pmode,
7731 stack_pointer_rtx, -(first + base)));
7732 emit_stack_probe (plus_constant (Pmode, reg1, base - size));
7735 /* The run-time loop is made up of 8 insns in the generic case while the
7736 compile-time loop is made up of 4+2*(n-2) insns for n intervals. */
7737 else if (size <= 4 * PROBE_INTERVAL)
7739 HOST_WIDE_INT i, rem;
7741 emit_set_insn (reg1,
7742 plus_constant (Pmode,
7743 stack_pointer_rtx,
7744 -(first + PROBE_INTERVAL)));
7745 emit_stack_probe (reg1);
7747 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 2 until
7748 it exceeds SIZE. If only two probes are needed, this will not
7749 generate any code. Then probe at FIRST + SIZE. */
7750 for (i = 2 * PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
7752 emit_set_insn (reg1,
7753 plus_constant (Pmode, reg1, -PROBE_INTERVAL));
7754 emit_stack_probe (reg1);
7757 rem = size - (i - PROBE_INTERVAL);
7758 if (rem > 256)
7760 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
7762 emit_set_insn (reg1, plus_constant (Pmode, reg1, -base));
7763 emit_stack_probe (plus_constant (Pmode, reg1, base - rem));
7765 else
7766 emit_stack_probe (plus_constant (Pmode, reg1, -rem));
7769 /* Otherwise, do the same as above, but in a loop. Note that we must be
7770 extra careful with variables wrapping around because we might be at
7771 the very top (or the very bottom) of the address space and we have
7772 to be able to handle this case properly; in particular, we use an
7773 equality test for the loop condition. */
7774 else
7776 rtx reg2 = gen_rtx_REG (Pmode, PROBE_STACK_SECOND_REGNUM);
7778 /* Step 1: round SIZE to the previous multiple of the interval. */
7780 HOST_WIDE_INT rounded_size = size & -PROBE_INTERVAL;
7783 /* Step 2: compute initial and final value of the loop counter. */
7785 /* TEST_ADDR = SP + FIRST. */
7786 emit_set_insn (reg1,
7787 plus_constant (Pmode, stack_pointer_rtx, -first));
7789 /* LAST_ADDR = SP + FIRST + ROUNDED_SIZE. */
7790 HOST_WIDE_INT adjustment = - (first + rounded_size);
7791 if (! aarch64_uimm12_shift (adjustment))
7793 aarch64_internal_mov_immediate (reg2, GEN_INT (adjustment),
7794 true, Pmode);
7795 emit_set_insn (reg2, gen_rtx_PLUS (Pmode, stack_pointer_rtx, reg2));
7797 else
7798 emit_set_insn (reg2,
7799 plus_constant (Pmode, stack_pointer_rtx, adjustment));
7801 /* Step 3: the loop
7805 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
7806 probe at TEST_ADDR
7808 while (TEST_ADDR != LAST_ADDR)
7810 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
7811 until it is equal to ROUNDED_SIZE. */
7813 emit_insn (gen_probe_stack_range (reg1, reg1, reg2));
7816 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
7817 that SIZE is equal to ROUNDED_SIZE. */
7819 if (size != rounded_size)
7821 HOST_WIDE_INT rem = size - rounded_size;
7823 if (rem > 256)
7825 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
7827 emit_set_insn (reg2, plus_constant (Pmode, reg2, -base));
7828 emit_stack_probe (plus_constant (Pmode, reg2, base - rem));
7830 else
7831 emit_stack_probe (plus_constant (Pmode, reg2, -rem));
7835 /* Make sure nothing is scheduled before we are done. */
7836 emit_insn (gen_blockage ());
7839 /* Probe a range of stack addresses from REG1 to REG2 inclusive. These are
7840 absolute addresses. */
7842 const char *
7843 aarch64_output_probe_stack_range (rtx reg1, rtx reg2)
7845 static int labelno = 0;
7846 char loop_lab[32];
7847 rtx xops[2];
7849 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
7851 /* Loop. */
7852 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
7854 HOST_WIDE_INT stack_clash_probe_interval
7855 = 1 << param_stack_clash_protection_guard_size;
7857 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
7858 xops[0] = reg1;
7859 HOST_WIDE_INT interval;
7860 if (flag_stack_clash_protection)
7861 interval = stack_clash_probe_interval;
7862 else
7863 interval = PROBE_INTERVAL;
7865 gcc_assert (aarch64_uimm12_shift (interval));
7866 xops[1] = GEN_INT (interval);
7868 output_asm_insn ("sub\t%0, %0, %1", xops);
7870 /* If doing stack clash protection then we probe up by the ABI-specified
7871 amount. We do this because we're dropping full pages at a time in the
7872 loop. But if we're doing non-stack clash probing, probe at offset 0 from SP. */
7873 if (flag_stack_clash_protection)
7874 xops[1] = GEN_INT (STACK_CLASH_CALLER_GUARD);
7875 else
7876 xops[1] = CONST0_RTX (GET_MODE (xops[1]));
7878 /* Probe at TEST_ADDR. If we're inside the loop it is always safe to probe
7879 by this amount for each iteration. */
7880 output_asm_insn ("str\txzr, [%0, %1]", xops);
7882 /* Test if TEST_ADDR == LAST_ADDR. */
7883 xops[1] = reg2;
7884 output_asm_insn ("cmp\t%0, %1", xops);
7886 /* Branch. */
7887 fputs ("\tb.ne\t", asm_out_file);
7888 assemble_name_raw (asm_out_file, loop_lab);
7889 fputc ('\n', asm_out_file);
7891 return "";
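/* The sequence printed above therefore looks roughly like this
   (register numbers and the 4096-byte interval are illustrative):

	.LPSRL0:
	sub	x9, x9, 4096
	str	xzr, [x9, 0]
	cmp	x9, x10
	b.ne	.LPSRL0

   with the str offset being STACK_CLASH_CALLER_GUARD rather than 0
   when stack clash protection is enabled.  */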
7894 /* Emit the probe loop for doing stack clash probes and stack adjustments for
7895 SVE. This emits probes from BASE to BASE - ADJUSTMENT based on a guard size
7896 of GUARD_SIZE. When a probe is emitted it is done at most
7897 MIN_PROBE_THRESHOLD bytes from the current BASE at an interval of
7898 at most MIN_PROBE_THRESHOLD. By the end of this function
7899 BASE = BASE - ADJUSTMENT. */
7901 const char *
7902 aarch64_output_probe_sve_stack_clash (rtx base, rtx adjustment,
7903 rtx min_probe_threshold, rtx guard_size)
7905 /* This function is not allowed to use any instruction generation function
7906 like gen_ and friends. If you do, you'll likely ICE during CFG validation,
7907 so instead emit the code you want using output_asm_insn. */
7908 gcc_assert (flag_stack_clash_protection);
7909 gcc_assert (CONST_INT_P (min_probe_threshold) && CONST_INT_P (guard_size));
7910 gcc_assert (INTVAL (guard_size) > INTVAL (min_probe_threshold));
7912 /* The minimum required allocation before the residual requires probing. */
7913 HOST_WIDE_INT residual_probe_guard = INTVAL (min_probe_threshold);
7915 /* Clamp the value down to the nearest value that can be used with a cmp. */
7916 residual_probe_guard = aarch64_clamp_to_uimm12_shift (residual_probe_guard);
7917 rtx probe_offset_value_rtx = gen_int_mode (residual_probe_guard, Pmode);
7919 gcc_assert (INTVAL (min_probe_threshold) >= residual_probe_guard);
7920 gcc_assert (aarch64_uimm12_shift (residual_probe_guard));
7922 static int labelno = 0;
7923 char loop_start_lab[32];
7924 char loop_end_lab[32];
7925 rtx xops[2];
7927 ASM_GENERATE_INTERNAL_LABEL (loop_start_lab, "SVLPSPL", labelno);
7928 ASM_GENERATE_INTERNAL_LABEL (loop_end_lab, "SVLPEND", labelno++);
7930 /* Emit loop start label. */
7931 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_start_lab);
7933 /* ADJUSTMENT < RESIDUAL_PROBE_GUARD. */
7934 xops[0] = adjustment;
7935 xops[1] = probe_offset_value_rtx;
7936 output_asm_insn ("cmp\t%0, %1", xops);
7938 /* Branch to end if not enough adjustment to probe. */
7939 fputs ("\tb.lt\t", asm_out_file);
7940 assemble_name_raw (asm_out_file, loop_end_lab);
7941 fputc ('\n', asm_out_file);
7943 /* BASE = BASE - RESIDUAL_PROBE_GUARD. */
7944 xops[0] = base;
7945 xops[1] = probe_offset_value_rtx;
7946 output_asm_insn ("sub\t%0, %0, %1", xops);
7948 /* Probe at BASE. */
7949 xops[1] = const0_rtx;
7950 output_asm_insn ("str\txzr, [%0, %1]", xops);
7952 /* ADJUSTMENT = ADJUSTMENT - RESIDUAL_PROBE_GUARD. */
7953 xops[0] = adjustment;
7954 xops[1] = probe_offset_value_rtx;
7955 output_asm_insn ("sub\t%0, %0, %1", xops);
7957 /* Branch to start if still more bytes to allocate. */
7958 fputs ("\tb\t", asm_out_file);
7959 assemble_name_raw (asm_out_file, loop_start_lab);
7960 fputc ('\n', asm_out_file);
7962 /* Not enough left to need a probe; leave the loop. */
7963 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_end_lab);
7965 /* BASE = BASE - ADJUSTMENT. */
7966 xops[0] = base;
7967 xops[1] = adjustment;
7968 output_asm_insn ("sub\t%0, %0, %1", xops);
7969 return "";
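/* The sequence printed above is therefore roughly (register numbers
   and the guard value are illustrative):

	.SVLPSPL0:
	cmp	x11, 4096	// enough left to need a probe?
	b.lt	.SVLPEND0
	sub	x10, x10, 4096	// BASE -= RESIDUAL_PROBE_GUARD
	str	xzr, [x10, 0]	// probe at BASE
	sub	x11, x11, 4096	// ADJUSTMENT -= RESIDUAL_PROBE_GUARD
	b	.SVLPSPL0
	.SVLPEND0:
	sub	x10, x10, x11	// BASE -= remaining ADJUSTMENT  */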
7972 /* Determine whether a frame chain needs to be generated. */
7973 static bool
7974 aarch64_needs_frame_chain (void)
7976 if (frame_pointer_needed)
7977 return true;
7979 /* A leaf function cannot have calls or write LR. */
7980 bool is_leaf = crtl->is_leaf && !df_regs_ever_live_p (LR_REGNUM);
7982 /* Don't use a frame chain in leaf functions if leaf frame pointers
7983 are disabled. */
7984 if (flag_omit_leaf_frame_pointer && is_leaf)
7985 return false;
7987 return aarch64_use_frame_pointer;
7990 /* Return true if the current function should save registers above
7991 the locals area, rather than below it. */
7993 static bool
7994 aarch64_save_regs_above_locals_p ()
7996 /* When using stack smash protection, make sure that the canary slot
7997 comes between the locals and the saved registers. Otherwise,
7998 it would be possible for a carefully sized smash attack to change
7999 the saved registers (particularly LR and FP) without reaching the
8000 canary. */
8001 return crtl->stack_protect_guard;
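/* The intended layout with the guard active is therefore roughly:

	| callee-saved registers (LR, FP, ...) |  higher addresses
	| stack-protector canary               |
	| locals                               |
	| outgoing arguments                   |  lower addresses  */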
8004 /* Return true if the current function needs to record the incoming
8005 value of PSTATE.SM. */
8006 static bool
8007 aarch64_need_old_pstate_sm ()
8009 /* Exit early if the incoming value of PSTATE.SM is known at
8010 compile time. */
8011 if (aarch64_cfun_incoming_pstate_sm () != 0)
8012 return false;
8014 if (aarch64_cfun_enables_pstate_sm ())
8015 return true;
8017 /* Non-local goto receivers are entered with PSTATE.SM equal to 0,
8018 but the function needs to return with PSTATE.SM unchanged. */
8019 if (nonlocal_goto_handler_labels)
8020 return true;
8022 /* Likewise for exception handlers. */
8023 eh_landing_pad lp;
8024 for (unsigned int i = 1; vec_safe_iterate (cfun->eh->lp_array, i, &lp); ++i)
8025 if (lp && lp->post_landing_pad)
8026 return true;
8028 /* Non-local gotos need to set PSTATE.SM to zero. It's possible to call
8029 streaming-compatible functions without SME being available, so PSTATE.SM
8030 should only be changed if it is currently set to one. */
8031 if (crtl->has_nonlocal_goto)
8032 return true;
8034 if (cfun->machine->call_switches_pstate_sm)
8035 for (auto insn = get_insns (); insn; insn = NEXT_INSN (insn))
8036 if (auto *call = dyn_cast<rtx_call_insn *> (insn))
8037 if (!SIBLING_CALL_P (call))
8039 /* Return true if there is a call to a non-streaming-compatible
8040 function. */
8041 auto callee_isa_mode = aarch64_insn_callee_isa_mode (call);
8042 if (aarch64_call_switches_pstate_sm (callee_isa_mode))
8043 return true;
8045 return false;
8048 /* Mark the registers that need to be saved by the callee and calculate
8049 the size of the callee-saved registers area and frame record (both FP
8050 and LR may be omitted). */
8051 static void
8052 aarch64_layout_frame (void)
8054 unsigned regno, last_fp_reg = INVALID_REGNUM;
8055 machine_mode vector_save_mode = aarch64_reg_save_mode (V8_REGNUM);
8056 poly_int64 vector_save_size = GET_MODE_SIZE (vector_save_mode);
8057 bool frame_related_fp_reg_p = false;
8058 aarch64_frame &frame = cfun->machine->frame;
8059 poly_int64 top_of_locals = -1;
8060 bool enables_pstate_sm = aarch64_cfun_enables_pstate_sm ();
8062 vec_safe_truncate (frame.saved_gprs, 0);
8063 vec_safe_truncate (frame.saved_fprs, 0);
8064 vec_safe_truncate (frame.saved_prs, 0);
8066 frame.emit_frame_chain = aarch64_needs_frame_chain ();
8068 /* Adjust the outgoing arguments size if required. Keep it in sync with what
8069 the mid-end is doing. */
8070 crtl->outgoing_args_size = STACK_DYNAMIC_OFFSET (cfun);
8072 #define SLOT_NOT_REQUIRED (-2)
8073 #define SLOT_REQUIRED (-1)
8075 frame.wb_push_candidate1 = INVALID_REGNUM;
8076 frame.wb_push_candidate2 = INVALID_REGNUM;
8077 frame.spare_pred_reg = INVALID_REGNUM;
8079 /* First mark all the registers that really need to be saved... */
8080 for (regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
8081 frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
8082 frame.old_svcr_offset = SLOT_NOT_REQUIRED;
8084 /* ... that includes the eh data registers (if needed)... */
8085 if (crtl->calls_eh_return)
8086 for (regno = 0; EH_RETURN_DATA_REGNO (regno) != INVALID_REGNUM; regno++)
8087 frame.reg_offset[EH_RETURN_DATA_REGNO (regno)] = SLOT_REQUIRED;
8089 /* ... and any callee saved register that dataflow says is live. */
8090 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
8091 if (df_regs_ever_live_p (regno)
8092 && !fixed_regs[regno]
8093 && (regno == R30_REGNUM
8094 || !crtl->abi->clobbers_full_reg_p (regno)))
8095 frame.reg_offset[regno] = SLOT_REQUIRED;
8097 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
8098 if ((enables_pstate_sm || df_regs_ever_live_p (regno))
8099 && !fixed_regs[regno]
8100 && !crtl->abi->clobbers_full_reg_p (regno))
8102 frame.reg_offset[regno] = SLOT_REQUIRED;
8103 last_fp_reg = regno;
8104 if (aarch64_emit_cfi_for_reg_p (regno))
8105 frame_related_fp_reg_p = true;
8108 /* Big-endian SVE frames need a spare predicate register in order
8109 to save Z8-Z15. Decide which register they should use. Prefer
8110 an unused argument register if possible, so that we don't force P4
8111 to be saved unnecessarily. */
8112 if (frame_related_fp_reg_p
8113 && crtl->abi->id () == ARM_PCS_SVE
8114 && BYTES_BIG_ENDIAN)
8116 bitmap live1 = df_get_live_out (ENTRY_BLOCK_PTR_FOR_FN (cfun));
8117 bitmap live2 = df_get_live_in (EXIT_BLOCK_PTR_FOR_FN (cfun));
8118 for (regno = P0_REGNUM; regno <= P7_REGNUM; regno++)
8119 if (!bitmap_bit_p (live1, regno) && !bitmap_bit_p (live2, regno))
8120 break;
8121 gcc_assert (regno <= P7_REGNUM);
8122 frame.spare_pred_reg = regno;
8123 df_set_regs_ever_live (regno, true);
8126 for (regno = P0_REGNUM; regno <= P15_REGNUM; regno++)
8127 if ((enables_pstate_sm || df_regs_ever_live_p (regno))
8128 && !fixed_regs[regno]
8129 && !crtl->abi->clobbers_full_reg_p (regno))
8130 frame.reg_offset[regno] = SLOT_REQUIRED;
8132 bool regs_at_top_p = aarch64_save_regs_above_locals_p ();
8134 poly_int64 offset = crtl->outgoing_args_size;
8135 gcc_assert (multiple_p (offset, STACK_BOUNDARY / BITS_PER_UNIT));
8136 if (regs_at_top_p)
8138 offset += get_frame_size ();
8139 offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT);
8140 top_of_locals = offset;
8142 frame.bytes_below_saved_regs = offset;
8143 frame.sve_save_and_probe = INVALID_REGNUM;
8145 /* Now assign stack slots for the registers. Start with the predicate
8146 registers, since predicate LDR and STR have a relatively small
8147 offset range. These saves happen below the hard frame pointer. */
8148 for (regno = P0_REGNUM; regno <= P15_REGNUM; regno++)
8149 if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED))
8151 vec_safe_push (frame.saved_prs, regno);
8152 if (frame.sve_save_and_probe == INVALID_REGNUM)
8153 frame.sve_save_and_probe = regno;
8154 frame.reg_offset[regno] = offset;
8155 offset += BYTES_PER_SVE_PRED;
8158 poly_int64 saved_prs_size = offset - frame.bytes_below_saved_regs;
8159 if (maybe_ne (saved_prs_size, 0))
8161 /* If we have any vector registers to save above the predicate registers,
8162 the offset of the vector register save slots needs to be a multiple
8163 of the vector size. This lets us use the immediate forms of LDR/STR
8164 (or LD1/ST1 for big-endian).
8166 A vector register is 8 times the size of a predicate register,
8167 and we need to save a maximum of 12 predicate registers, so the
8168 first vector register will be at either #1, MUL VL or #2, MUL VL.
8170 If we don't have any vector registers to save, and we know how
8171 big the predicate save area is, we can just round it up to the
8172 next 16-byte boundary. */
8173 if (last_fp_reg == INVALID_REGNUM && offset.is_constant ())
8174 offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT);
8175 else
8177 if (known_le (saved_prs_size, vector_save_size))
8178 offset = frame.bytes_below_saved_regs + vector_save_size;
8179 else if (known_le (saved_prs_size, vector_save_size * 2))
8180 offset = frame.bytes_below_saved_regs + vector_save_size * 2;
8181 else
8182 gcc_unreachable ();
8186 /* If we need to save any SVE vector registers, add them next. */
8187 if (last_fp_reg != INVALID_REGNUM && crtl->abi->id () == ARM_PCS_SVE)
8188 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
8189 if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED))
8191 vec_safe_push (frame.saved_fprs, regno);
8192 if (frame.sve_save_and_probe == INVALID_REGNUM)
8193 frame.sve_save_and_probe = regno;
8194 frame.reg_offset[regno] = offset;
8195 offset += vector_save_size;
8198 /* OFFSET is now the offset of the hard frame pointer from the bottom
8199 of the callee save area. */
8200 auto below_hard_fp_saved_regs_size = offset - frame.bytes_below_saved_regs;
8201 bool saves_below_hard_fp_p = maybe_ne (below_hard_fp_saved_regs_size, 0);
8202 gcc_assert (!saves_below_hard_fp_p
8203 || (frame.sve_save_and_probe != INVALID_REGNUM
8204 && known_eq (frame.reg_offset[frame.sve_save_and_probe],
8205 frame.bytes_below_saved_regs)));
8207 frame.bytes_below_hard_fp = offset;
8208 frame.hard_fp_save_and_probe = INVALID_REGNUM;
8210 auto allocate_gpr_slot = [&](unsigned int regno)
8212 vec_safe_push (frame.saved_gprs, regno);
8213 frame.reg_offset[regno] = offset;
8214 offset += UNITS_PER_WORD;
8217 if (frame.emit_frame_chain)
8219 /* FP and LR are placed in the linkage record. */
8220 allocate_gpr_slot (R29_REGNUM);
8221 allocate_gpr_slot (R30_REGNUM);
8223 else if ((flag_stack_clash_protection || !frame.is_scs_enabled)
8224 && known_eq (frame.reg_offset[R30_REGNUM], SLOT_REQUIRED))
8225 /* Put the LR save slot first, since it makes a good choice of probe
8226 for stack clash purposes. The idea is that the link register usually
8227 has to be saved before a call anyway, and so we lose little by
8228 stopping it from being individually shrink-wrapped. */
8229 allocate_gpr_slot (R30_REGNUM);
8231 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
8232 if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED))
8233 allocate_gpr_slot (regno);
8235 if (aarch64_need_old_pstate_sm ())
8237 frame.old_svcr_offset = offset;
8238 offset += UNITS_PER_WORD;
8241 /* If the current function changes the SVE vector length, ensure that the
8242 old value of the DWARF VG register is saved and available in the CFI,
8243 so that outer frames with VL-sized offsets can be processed correctly. */
8244 if (cfun->machine->call_switches_pstate_sm
8245 || aarch64_cfun_enables_pstate_sm ())
8247 frame.reg_offset[VG_REGNUM] = offset;
8248 offset += UNITS_PER_WORD;
8251 poly_int64 max_int_offset = offset;
8252 offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT);
8253 bool has_align_gap = maybe_ne (offset, max_int_offset);
8255 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
8256 if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED))
8258 vec_safe_push (frame.saved_fprs, regno);
8259 /* If there is an alignment gap between integer and fp callee-saves,
8260 allocate the last fp register to it if possible. */
8261 if (regno == last_fp_reg
8262 && has_align_gap
8263 && known_eq (vector_save_size, 8)
8264 && multiple_p (offset, 16))
8266 frame.reg_offset[regno] = max_int_offset;
8267 break;
8270 frame.reg_offset[regno] = offset;
8271 offset += vector_save_size;
8274 offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT);
8275 auto saved_regs_size = offset - frame.bytes_below_saved_regs;
8277 array_slice<unsigned int> push_regs = (!vec_safe_is_empty (frame.saved_gprs)
8278 ? frame.saved_gprs
8279 : frame.saved_fprs);
8280 if (!push_regs.empty ()
8281 && known_eq (frame.reg_offset[push_regs[0]], frame.bytes_below_hard_fp))
8283 frame.hard_fp_save_and_probe = push_regs[0];
8284 frame.wb_push_candidate1 = push_regs[0];
8285 if (push_regs.size () > 1)
8286 frame.wb_push_candidate2 = push_regs[1];
8289 /* With stack-clash, a register must be saved in non-leaf functions.
8290 The saving of the bottommost register counts as an implicit probe,
8291 which allows us to maintain the invariant described in the comment
8292 at expand_prologue. */
8293 gcc_assert (crtl->is_leaf || maybe_ne (saved_regs_size, 0));
8295 if (!regs_at_top_p)
8297 offset += get_frame_size ();
8298 offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT);
8299 top_of_locals = offset;
8301 offset += frame.saved_varargs_size;
8302 gcc_assert (multiple_p (offset, STACK_BOUNDARY / BITS_PER_UNIT));
8303 frame.frame_size = offset;
8305 frame.bytes_above_hard_fp = frame.frame_size - frame.bytes_below_hard_fp;
8306 gcc_assert (known_ge (top_of_locals, 0));
8307 frame.bytes_above_locals = frame.frame_size - top_of_locals;
8309 frame.initial_adjust = 0;
8310 frame.final_adjust = 0;
8311 frame.callee_adjust = 0;
8312 frame.sve_callee_adjust = 0;
8314 frame.wb_pop_candidate1 = frame.wb_push_candidate1;
8315 frame.wb_pop_candidate2 = frame.wb_push_candidate2;
8317 /* Shadow call stack only deals with functions where the LR is pushed
8318 onto the stack and that do not specify the "no_sanitize" attribute
8319 with the argument "shadow-call-stack". */
8320 frame.is_scs_enabled
8321 = (!crtl->calls_eh_return
8322 && sanitize_flags_p (SANITIZE_SHADOW_CALL_STACK)
8323 && known_ge (frame.reg_offset[LR_REGNUM], 0));
8325 /* When shadow call stack is enabled, the scs_pop in the epilogue will
8326 restore x30, and we don't need to pop x30 again in the traditional
8327 way. Pop candidates record the registers that need to be popped
8328 eventually. */
8329 if (frame.is_scs_enabled)
8331 if (frame.wb_pop_candidate2 == R30_REGNUM)
8332 frame.wb_pop_candidate2 = INVALID_REGNUM;
8333 else if (frame.wb_pop_candidate1 == R30_REGNUM)
8334 frame.wb_pop_candidate1 = INVALID_REGNUM;
8337 /* If candidate2 is INVALID_REGNUM, we need to adjust max_push_offset to
8338 256 to ensure that the offset meets the requirements of emit_move_insn.
8339 Similarly, if candidate1 is INVALID_REGNUM, we need to set
8340 max_push_offset to 0, because no registers are popped at this time,
8341 so callee_adjust cannot be adjusted. */
8342 HOST_WIDE_INT max_push_offset = 0;
8343 if (frame.wb_pop_candidate1 != INVALID_REGNUM)
8345 if (frame.wb_pop_candidate2 != INVALID_REGNUM)
8346 max_push_offset = 512;
8347 else
8348 max_push_offset = 256;
8351 HOST_WIDE_INT const_size, const_below_saved_regs, const_above_fp;
8352 HOST_WIDE_INT const_saved_regs_size;
8353 if (known_eq (saved_regs_size, 0))
8354 frame.initial_adjust = frame.frame_size;
8355 else if (frame.frame_size.is_constant (&const_size)
8356 && const_size < max_push_offset
8357 && known_eq (frame.bytes_above_hard_fp, const_size))
8359 /* Simple, small frame with no data below the saved registers.
8361 stp reg1, reg2, [sp, -frame_size]!
8362 stp reg3, reg4, [sp, 16] */
8363 frame.callee_adjust = const_size;
8365 else if (frame.bytes_below_saved_regs.is_constant (&const_below_saved_regs)
8366 && saved_regs_size.is_constant (&const_saved_regs_size)
8367 && const_below_saved_regs + const_saved_regs_size < 512
8368 /* We could handle this case even with data below the saved
8369 registers, provided that that data left us with valid offsets
8370 for all predicate and vector save slots. It's such a rare
8371 case that it hardly seems worth the effort though. */
8372 && (!saves_below_hard_fp_p || const_below_saved_regs == 0)
8373 && !(cfun->calls_alloca
8374 && frame.bytes_above_hard_fp.is_constant (&const_above_fp)
8375 && const_above_fp < max_push_offset))
8377 /* Frame with small area below the saved registers:
8379 sub sp, sp, frame_size
8380 stp reg1, reg2, [sp, bytes_below_saved_regs]
8381 stp reg3, reg4, [sp, bytes_below_saved_regs + 16] */
8382 frame.initial_adjust = frame.frame_size;
8384 else if (saves_below_hard_fp_p
8385 && known_eq (saved_regs_size, below_hard_fp_saved_regs_size))
8387 /* Frame in which all saves are SVE saves:
8389 sub sp, sp, frame_size - bytes_below_saved_regs
8390 save SVE registers relative to SP
8391 sub sp, sp, bytes_below_saved_regs */
8392 frame.initial_adjust = frame.frame_size - frame.bytes_below_saved_regs;
8393 frame.final_adjust = frame.bytes_below_saved_regs;
8395 else if (frame.wb_push_candidate1 != INVALID_REGNUM
8396 && frame.bytes_above_hard_fp.is_constant (&const_above_fp)
8397 && const_above_fp < max_push_offset)
8399 /* Frame with large area below the saved registers, or with SVE saves,
8400 but with a small area above:
8402 stp reg1, reg2, [sp, -hard_fp_offset]!
8403 stp reg3, reg4, [sp, 16]
8404 [sub sp, sp, below_hard_fp_saved_regs_size]
8405 [save SVE registers relative to SP]
8406 sub sp, sp, bytes_below_saved_regs */
8407 frame.callee_adjust = const_above_fp;
8408 frame.sve_callee_adjust = below_hard_fp_saved_regs_size;
8409 frame.final_adjust = frame.bytes_below_saved_regs;
8411 else
8413 /* General case:
8415 sub sp, sp, hard_fp_offset
8416 stp x29, x30, [sp, 0]
8417 add x29, sp, 0
8418 stp reg3, reg4, [sp, 16]
8419 [sub sp, sp, below_hard_fp_saved_regs_size]
8420 [save SVE registers relative to SP]
8421 sub sp, sp, bytes_below_saved_regs */
8422 frame.initial_adjust = frame.bytes_above_hard_fp;
8423 frame.sve_callee_adjust = below_hard_fp_saved_regs_size;
8424 frame.final_adjust = frame.bytes_below_saved_regs;
8427 /* The frame is allocated in pieces, with each non-final piece
8428 including a register save at offset 0 that acts as a probe for
8429 the following piece. In addition, the save of the bottommost register
8430 acts as a probe for callees and allocas. Roll back any probes that
8431 aren't needed.
8433 A probe isn't needed if it is associated with the final allocation
8434 (including callees and allocas) that happens before the epilogue is
8435 executed. */
8436 if (crtl->is_leaf
8437 && !cfun->calls_alloca
8438 && known_eq (frame.final_adjust, 0))
8440 if (maybe_ne (frame.sve_callee_adjust, 0))
8441 frame.sve_save_and_probe = INVALID_REGNUM;
8442 else
8443 frame.hard_fp_save_and_probe = INVALID_REGNUM;
8446 /* Make sure the individual adjustments add up to the full frame size. */
8447 gcc_assert (known_eq (frame.initial_adjust
8448 + frame.callee_adjust
8449 + frame.sve_callee_adjust
8450 + frame.final_adjust, frame.frame_size));
8452 if (frame.callee_adjust == 0)
8454 /* We've decided not to do a "real" push and pop. However,
8455 setting up the frame chain is treated as being essentially
8456 a multi-instruction push. */
8457 frame.wb_pop_candidate1 = frame.wb_pop_candidate2 = INVALID_REGNUM;
8458 if (!frame.emit_frame_chain)
8459 frame.wb_push_candidate1 = frame.wb_push_candidate2 = INVALID_REGNUM;
8462 frame.laid_out = true;
8465 /* Return true if the register REGNO is saved on entry to
8466 the current function. */
8468 static bool
8469 aarch64_register_saved_on_entry (int regno)
8471 return known_ge (cfun->machine->frame.reg_offset[regno], 0);
8474 /* Push the register number REGNO of mode MODE to the stack with write-back
8475 adjusting the stack by ADJUSTMENT. */
8477 static void
8478 aarch64_pushwb_single_reg (machine_mode mode, unsigned regno,
8479 HOST_WIDE_INT adjustment)
8481 rtx base_rtx = stack_pointer_rtx;
8482 rtx insn, reg, mem;
8484 reg = gen_rtx_REG (mode, regno);
8485 mem = gen_rtx_PRE_MODIFY (Pmode, base_rtx,
8486 plus_constant (Pmode, base_rtx, -adjustment));
8487 mem = gen_frame_mem (mode, mem);
8489 insn = emit_move_insn (mem, reg);
8490 RTX_FRAME_RELATED_P (insn) = 1;
8493 /* Generate and return an instruction to store the pair of registers
8494 REG and REG2 of mode MODE to location BASE with write-back adjusting
8495 the stack location BASE by ADJUSTMENT. */
8497 static rtx
8498 aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
8499 HOST_WIDE_INT adjustment)
8501 rtx new_base = plus_constant (Pmode, base, -adjustment);
8502 rtx mem = gen_frame_mem (mode, new_base);
8503 rtx mem2 = adjust_address_nv (mem, mode, GET_MODE_SIZE (mode));
8505 return gen_rtx_PARALLEL (VOIDmode,
8506 gen_rtvec (3,
8507 gen_rtx_SET (base, new_base),
8508 gen_rtx_SET (mem, reg),
8509 gen_rtx_SET (mem2, reg2)));
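/* The parallel above corresponds to a single pre-indexed store pair,
   e.g. "stp x19, x20, [sp, #-32]!" (operands illustrative).  */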
8512 /* Push registers numbered REGNO1 and REGNO2 to the stack, adjusting the
8513 stack pointer by ADJUSTMENT. */
8515 static void
8516 aarch64_push_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment)
8518 rtx_insn *insn;
8519 machine_mode mode = aarch64_reg_save_mode (regno1);
8521 if (regno2 == INVALID_REGNUM)
8522 return aarch64_pushwb_single_reg (mode, regno1, adjustment);
8524 rtx reg1 = gen_rtx_REG (mode, regno1);
8525 rtx reg2 = gen_rtx_REG (mode, regno2);
8527 insn = emit_insn (aarch64_gen_storewb_pair (mode, stack_pointer_rtx, reg1,
8528 reg2, adjustment));
8529 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1;
8530 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
8531 RTX_FRAME_RELATED_P (insn) = 1;
8534 /* Load the pair of registers REG, REG2 of mode MODE from stack location BASE,
8535 adjusting it by ADJUSTMENT afterwards. */
8537 static rtx
8538 aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
8539 HOST_WIDE_INT adjustment)
8541 rtx mem = gen_frame_mem (mode, base);
8542 rtx mem2 = adjust_address_nv (mem, mode, GET_MODE_SIZE (mode));
8543 rtx new_base = plus_constant (Pmode, base, adjustment);
8545 return gen_rtx_PARALLEL (VOIDmode,
8546 gen_rtvec (3,
8547 gen_rtx_SET (base, new_base),
8548 gen_rtx_SET (reg, mem),
8549 gen_rtx_SET (reg2, mem2)));
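/* The parallel above corresponds to a single post-indexed load pair,
   e.g. "ldp x19, x20, [sp], #32" (operands illustrative).  */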
8552 /* Pop the two registers numbered REGNO1, REGNO2 from the stack, adjusting it
8553 afterwards by ADJUSTMENT and writing the appropriate REG_CFA_RESTORE notes
8554 into CFI_OPS. */
8556 static void
8557 aarch64_pop_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment,
8558 rtx *cfi_ops)
8560 machine_mode mode = aarch64_reg_save_mode (regno1);
8561 rtx reg1 = gen_rtx_REG (mode, regno1);
8563 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg1, *cfi_ops);
8565 if (regno2 == INVALID_REGNUM)
8567 rtx mem = plus_constant (Pmode, stack_pointer_rtx, adjustment);
8568 mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem);
8569 emit_move_insn (reg1, gen_frame_mem (mode, mem));
8571 else
8573 rtx reg2 = gen_rtx_REG (mode, regno2);
8574 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
8575 emit_insn (aarch64_gen_loadwb_pair (mode, stack_pointer_rtx, reg1,
8576 reg2, adjustment));
8580 /* Given an ldp/stp register operand mode MODE, return a suitable mode to use
8581 for a mem rtx representing the entire pair. */
8583 static machine_mode
8584 aarch64_pair_mode_for_mode (machine_mode mode)
8586 if (known_eq (GET_MODE_SIZE (mode), 4))
8587 return V2x4QImode;
8588 else if (known_eq (GET_MODE_SIZE (mode), 8))
8589 return V2x8QImode;
8590 else if (known_eq (GET_MODE_SIZE (mode), 16))
8591 return V2x16QImode;
8592 else
8593 gcc_unreachable ();
8596 /* Given a base mem MEM with mode and address suitable for a single ldp/stp
8597 operand, return an rtx like MEM which instead represents the entire pair. */
8599 static rtx
8600 aarch64_pair_mem_from_base (rtx mem)
8602 auto pair_mode = aarch64_pair_mode_for_mode (GET_MODE (mem));
8603 mem = adjust_bitfield_address_nv (mem, pair_mode, 0);
8604 gcc_assert (aarch64_mem_pair_lanes_operand (mem, pair_mode));
8605 return mem;
8608 /* Generate and return a store pair instruction to store REG1 and REG2
8609 into memory starting at BASE_MEM. All three rtxes should have modes of the
8610 same size. */
8613 aarch64_gen_store_pair (rtx base_mem, rtx reg1, rtx reg2)
8615 rtx pair_mem = aarch64_pair_mem_from_base (base_mem);
8617 return gen_rtx_SET (pair_mem,
8618 gen_rtx_UNSPEC (GET_MODE (pair_mem),
8619 gen_rtvec (2, reg1, reg2),
8620 UNSPEC_STP));
8623 /* Generate and return a load pair instruction to load a pair of
8624 registers starting at BASE_MEM into REG1 and REG2. If CODE is
8625 UNKNOWN, all three rtxes should have modes of the same size.
8626 Otherwise, CODE is {SIGN,ZERO}_EXTEND, base_mem should be in SImode,
8627 and REG{1,2} should be in DImode. */
8630 aarch64_gen_load_pair (rtx reg1, rtx reg2, rtx base_mem, enum rtx_code code)
8632 rtx pair_mem = aarch64_pair_mem_from_base (base_mem);
8634 const bool any_extend_p = (code == ZERO_EXTEND || code == SIGN_EXTEND);
8635 if (any_extend_p)
8636 gcc_checking_assert (GET_MODE (base_mem) == SImode
8637 && GET_MODE (reg1) == DImode
8638 && GET_MODE (reg2) == DImode);
8639 else
8640 gcc_assert (code == UNKNOWN);
8642 rtx unspecs[2] = {
8643 gen_rtx_UNSPEC (any_extend_p ? SImode : GET_MODE (reg1),
8644 gen_rtvec (1, pair_mem),
8645 UNSPEC_LDP_FST),
8646 gen_rtx_UNSPEC (any_extend_p ? SImode : GET_MODE (reg2),
8647 gen_rtvec (1, copy_rtx (pair_mem)),
8648 UNSPEC_LDP_SND)
8651 if (any_extend_p)
8652 for (int i = 0; i < 2; i++)
8653 unspecs[i] = gen_rtx_fmt_e (code, DImode, unspecs[i]);
8655 return gen_rtx_PARALLEL (VOIDmode,
8656 gen_rtvec (2,
8657 gen_rtx_SET (reg1, unspecs[0]),
8658 gen_rtx_SET (reg2, unspecs[1])));
8661 /* Return TRUE if return address signing should be enabled for the current
8662 function, otherwise return FALSE. */
8664 bool
8665 aarch64_return_address_signing_enabled (void)
8667 /* This function should only be called after the frame is laid out. */
8668 gcc_assert (cfun->machine->frame.laid_out);
8670 /* If signing scope is AARCH_FUNCTION_NON_LEAF, we only sign a leaf function
8671 if its LR is pushed onto the stack. */
8672 return (aarch_ra_sign_scope == AARCH_FUNCTION_ALL
8673 || (aarch_ra_sign_scope == AARCH_FUNCTION_NON_LEAF
8674 && known_ge (cfun->machine->frame.reg_offset[LR_REGNUM], 0)));
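/* For example, under -mbranch-protection=pac-ret only functions that
   save LR are signed, whereas -mbranch-protection=pac-ret+leaf signs
   every function.  */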
8677 /* Only used by the arm backend. */
8678 void aarch_bti_arch_check (void)
8681 /* Return TRUE if Branch Target Identification Mechanism is enabled. */
8682 bool
8683 aarch_bti_enabled (void)
8685 return (aarch_enable_bti == 1);
8688 /* Check if INSN is a BTI J insn. */
8689 bool
8690 aarch_bti_j_insn_p (rtx_insn *insn)
8692 if (!insn || !INSN_P (insn))
8693 return false;
8695 rtx pat = PATTERN (insn);
8696 return GET_CODE (pat) == UNSPEC_VOLATILE && XINT (pat, 1) == UNSPECV_BTI_J;
8699 /* Return TRUE if Guarded Control Stack is enabled. */
8700 bool
8701 aarch64_gcs_enabled (void)
8703 return (aarch64_enable_gcs == 1);
8706 /* Check if X (or any sub-rtx of X) is a PACIASP/PACIBSP instruction. */
8707 bool
8708 aarch_pac_insn_p (rtx x)
8710 if (!INSN_P (x))
8711 return false;
8713 subrtx_var_iterator::array_type array;
8714 FOR_EACH_SUBRTX_VAR (iter, array, PATTERN (x), ALL)
8716 rtx sub = *iter;
8717 if (sub && GET_CODE (sub) == UNSPEC)
8719 int unspec_val = XINT (sub, 1);
8720 switch (unspec_val)
8722 case UNSPEC_PACIASP:
8723 case UNSPEC_PACIBSP:
8724 return true;
8726 default:
8727 return false;
8729 iter.skip_subrtxes ();
8732 return false;
8735 rtx aarch_gen_bti_c (void)
8737 return gen_bti_c ();
8740 rtx aarch_gen_bti_j (void)
8742 return gen_bti_j ();
8745 /* The caller is going to use ST1D or LD1D to save or restore an SVE
8746 register in mode MODE at BASE_RTX + OFFSET, where OFFSET is in
8747 the range [1, 16] * GET_MODE_SIZE (MODE). Prepare for this by:
8749 (1) updating BASE_RTX + OFFSET so that it is a legitimate ST1D
8750 or LD1D address
8752 (2) setting PTRUE to a valid predicate register for the ST1D or LD1D,
8753 if the variable isn't already nonnull
8755 (1) is needed when OFFSET is in the range [8, 16] * GET_MODE_SIZE (MODE).
8756 Handle this case using a temporary base register that is suitable for
8757 all offsets in that range. Use ANCHOR_REG as this base register if it
8758 is nonnull, otherwise create a new register and store it in ANCHOR_REG. */
8760 static inline void
8761 aarch64_adjust_sve_callee_save_base (machine_mode mode, rtx &base_rtx,
8762 rtx &anchor_reg, poly_int64 &offset,
8763 rtx &ptrue)
8765 if (maybe_ge (offset, 8 * GET_MODE_SIZE (mode)))
8767 /* This is the maximum valid offset of the anchor from the base.
8768 Lower values would be valid too. */
8769 poly_int64 anchor_offset = 16 * GET_MODE_SIZE (mode);
8770 if (!anchor_reg)
8772 anchor_reg = gen_rtx_REG (Pmode, STACK_CLASH_SVE_CFA_REGNUM);
8773 emit_insn (gen_add3_insn (anchor_reg, base_rtx,
8774 gen_int_mode (anchor_offset, Pmode)));
8776 base_rtx = anchor_reg;
8777 offset -= anchor_offset;
8779 if (!ptrue)
8781 int pred_reg = cfun->machine->frame.spare_pred_reg;
8782 emit_move_insn (gen_rtx_REG (VNx16BImode, pred_reg),
8783 CONSTM1_RTX (VNx16BImode));
8784 ptrue = gen_rtx_REG (VNx2BImode, pred_reg);
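/* For instance, once ANCHOR_REG = BASE_RTX + 16 * GET_MODE_SIZE (MODE),
   offsets that were in [8, 16] * GET_MODE_SIZE (MODE) become multiples
   of the mode size in [-8, 0], which fits the signed immediate range
   of the ST1D/LD1D addressing mode.  */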
8788 /* Add a REG_CFA_EXPRESSION note to INSN to say that register REG
8789 is saved at BASE + OFFSET. */
8791 static void
8792 aarch64_add_cfa_expression (rtx_insn *insn, rtx reg,
8793 rtx base, poly_int64 offset)
8795 rtx mem = gen_frame_mem (GET_MODE (reg),
8796 plus_constant (Pmode, base, offset));
8797 add_reg_note (insn, REG_CFA_EXPRESSION, gen_rtx_SET (mem, reg));
8800 /* Emit code to save the callee-saved registers in REGS. Skip any
8801 write-back candidates if SKIP_WB is true, otherwise consider only
8802 write-back candidates.
8804 The stack pointer is currently BYTES_BELOW_SP bytes above the bottom
8805 of the static frame. HARD_FP_VALID_P is true if the hard frame pointer
8806 has been set up. */
8808 static void
8809 aarch64_save_callee_saves (poly_int64 bytes_below_sp,
8810 array_slice<unsigned int> regs, bool skip_wb,
8811 bool hard_fp_valid_p)
8813 aarch64_frame &frame = cfun->machine->frame;
8814 rtx_insn *insn;
8815 rtx anchor_reg = NULL_RTX, ptrue = NULL_RTX;
8817 auto skip_save_p = [&](unsigned int regno)
8819 if (cfun->machine->reg_is_wrapped_separately[regno])
8820 return true;
8822 if (skip_wb == (regno == frame.wb_push_candidate1
8823 || regno == frame.wb_push_candidate2))
8824 return true;
8826 return false;
8829 for (unsigned int i = 0; i < regs.size (); ++i)
8831 unsigned int regno = regs[i];
8832 poly_int64 offset;
8833 bool frame_related_p = aarch64_emit_cfi_for_reg_p (regno);
8835 if (skip_save_p (regno))
8836 continue;
8838 machine_mode mode = aarch64_reg_save_mode (regno);
8839 rtx reg = gen_rtx_REG (mode, regno);
8840 rtx move_src = reg;
8841 offset = frame.reg_offset[regno] - bytes_below_sp;
8842 if (regno == VG_REGNUM)
8844 move_src = gen_rtx_REG (DImode, IP0_REGNUM);
8845 emit_move_insn (move_src, gen_int_mode (aarch64_sve_vg, DImode));
8847 rtx base_rtx = stack_pointer_rtx;
8848 poly_int64 sp_offset = offset;
8850 HOST_WIDE_INT const_offset;
8851 if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
8852 aarch64_adjust_sve_callee_save_base (mode, base_rtx, anchor_reg,
8853 offset, ptrue);
8854 else if (GP_REGNUM_P (REGNO (reg))
8855 && (!offset.is_constant (&const_offset) || const_offset >= 512))
8857 poly_int64 fp_offset = frame.bytes_below_hard_fp - bytes_below_sp;
8858 if (hard_fp_valid_p)
8859 base_rtx = hard_frame_pointer_rtx;
8860 else
8862 if (!anchor_reg)
8864 anchor_reg = gen_rtx_REG (Pmode, STACK_CLASH_SVE_CFA_REGNUM);
8865 emit_insn (gen_add3_insn (anchor_reg, base_rtx,
8866 gen_int_mode (fp_offset, Pmode)));
8868 base_rtx = anchor_reg;
8870 offset -= fp_offset;
8872 rtx mem = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
8873 rtx cfi_mem = gen_frame_mem (mode, plus_constant (Pmode,
8874 stack_pointer_rtx,
8875 sp_offset));
8876 rtx cfi_set = gen_rtx_SET (cfi_mem, reg);
8877 bool need_cfi_note_p = (base_rtx != stack_pointer_rtx);
8879 unsigned int regno2;
8880 if (!aarch64_sve_mode_p (mode)
8881 && reg == move_src
8882 && i + 1 < regs.size ()
8883 && (regno2 = regs[i + 1], !skip_save_p (regno2))
8884 && known_eq (GET_MODE_SIZE (mode),
8885 frame.reg_offset[regno2] - frame.reg_offset[regno]))
8887 rtx reg2 = gen_rtx_REG (mode, regno2);
8889 offset += GET_MODE_SIZE (mode);
8890 insn = emit_insn (aarch64_gen_store_pair (mem, reg, reg2));
8892 rtx cfi_mem2
8893 = gen_frame_mem (mode,
8894 plus_constant (Pmode,
8895 stack_pointer_rtx,
8896 sp_offset + GET_MODE_SIZE (mode)));
8897 rtx cfi_set2 = gen_rtx_SET (cfi_mem2, reg2);
8899 /* The first part of a frame-related parallel insn is always
8900 assumed to be relevant to the frame calculations;
8901 subsequent parts are only frame-related if
8902 explicitly marked. */
8903 if (aarch64_emit_cfi_for_reg_p (regno2))
8904 RTX_FRAME_RELATED_P (cfi_set2) = 1;
8906 /* Add a REG_FRAME_RELATED_EXPR note since the unspec
8907 representation of stp cannot be understood directly by
8908 dwarf2cfi. */
8909 rtx par = gen_rtx_PARALLEL (VOIDmode,
8910 gen_rtvec (2, cfi_set, cfi_set2));
8911 add_reg_note (insn, REG_FRAME_RELATED_EXPR, par);
8913 regno = regno2;
8914 ++i;
8916 else
8918 if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
8920 insn = emit_insn (gen_aarch64_pred_mov (mode, mem,
8921 ptrue, move_src));
8922 need_cfi_note_p = true;
8924 else if (aarch64_sve_mode_p (mode))
8925 insn = emit_insn (gen_rtx_SET (mem, move_src));
8926 else
8927 insn = emit_move_insn (mem, move_src);
8929 if (frame_related_p && (need_cfi_note_p || move_src != reg))
8930 add_reg_note (insn, REG_FRAME_RELATED_EXPR, cfi_set);
8933 RTX_FRAME_RELATED_P (insn) = frame_related_p;
8935 /* Emit a fake instruction to indicate that the VG save slot has
8936 been initialized. */
8937 if (regno == VG_REGNUM)
8938 emit_insn (gen_aarch64_old_vg_saved (move_src, mem));
8942 /* Emit code to restore the callee registers in REGS, ignoring pop candidates
8943 and any other registers that are handled separately. Write the appropriate
8944 REG_CFA_RESTORE notes into CFI_OPS.
8946 The stack pointer is currently BYTES_BELOW_SP bytes above the bottom
8947 of the static frame. */
8949 static void
8950 aarch64_restore_callee_saves (poly_int64 bytes_below_sp,
8951 array_slice<unsigned int> regs, rtx *cfi_ops)
8953 aarch64_frame &frame = cfun->machine->frame;
8954 poly_int64 offset;
8955 rtx anchor_reg = NULL_RTX, ptrue = NULL_RTX;
8957 auto skip_restore_p = [&](unsigned int regno)
8959 if (cfun->machine->reg_is_wrapped_separately[regno])
8960 return true;
8962 if (regno == frame.wb_pop_candidate1
8963 || regno == frame.wb_pop_candidate2)
8964 return true;
8966 /* The shadow call stack code restores LR separately. */
8967 if (frame.is_scs_enabled && regno == LR_REGNUM)
8968 return true;
8970 return false;
8973 for (unsigned int i = 0; i < regs.size (); ++i)
8975 unsigned int regno = regs[i];
8976 bool frame_related_p = aarch64_emit_cfi_for_reg_p (regno);
8977 if (skip_restore_p (regno))
8978 continue;
8980 machine_mode mode = aarch64_reg_save_mode (regno);
8981 rtx reg = gen_rtx_REG (mode, regno);
8982 offset = frame.reg_offset[regno] - bytes_below_sp;
8983 rtx base_rtx = stack_pointer_rtx;
8984 if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
8985 aarch64_adjust_sve_callee_save_base (mode, base_rtx, anchor_reg,
8986 offset, ptrue);
8987 rtx mem = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
8989 unsigned int regno2;
8990 if (!aarch64_sve_mode_p (mode)
8991 && i + 1 < regs.size ()
8992 && (regno2 = regs[i + 1], !skip_restore_p (regno2))
8993 && known_eq (GET_MODE_SIZE (mode),
8994 frame.reg_offset[regno2] - frame.reg_offset[regno]))
8996 rtx reg2 = gen_rtx_REG (mode, regno2);
8998 offset += GET_MODE_SIZE (mode);
8999 emit_insn (aarch64_gen_load_pair (reg, reg2, mem));
9001 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
9002 regno = regno2;
9003 ++i;
9005 else if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
9006 emit_insn (gen_aarch64_pred_mov (mode, reg, ptrue, mem));
9007 else if (aarch64_sve_mode_p (mode))
9008 emit_insn (gen_rtx_SET (reg, mem));
9009 else
9010 emit_move_insn (reg, mem);
9011 if (frame_related_p)
9012 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg, *cfi_ops);
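/* As an illustration of the pairing logic above: if x19 and x20 are both
   being restored and their save slots are exactly 8 bytes apart, the two
   restores are folded into a single LDP; SVE registers, and registers
   whose slots are not exactly GET_MODE_SIZE (mode) apart, are restored
   one at a time instead.  */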
9016 /* Return true if OFFSET is a signed 4-bit value multiplied by the size
9017 of MODE. */
9019 static inline bool
9020 offset_4bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
9022 HOST_WIDE_INT multiple;
9023 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
9024 && IN_RANGE (multiple, -8, 7));
9027 /* Return true if OFFSET is a signed 6-bit value multiplied by the size
9028 of MODE. */
9030 static inline bool
9031 offset_6bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
9033 HOST_WIDE_INT multiple;
9034 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
9035 && IN_RANGE (multiple, -32, 31));
9038 /* Return true if OFFSET is an unsigned 6-bit value multiplied by the size
9039 of MODE. */
9041 static inline bool
9042 offset_6bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
9044 HOST_WIDE_INT multiple;
9045 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
9046 && IN_RANGE (multiple, 0, 63));
9049 /* Return true if OFFSET is a signed 7-bit value multiplied by the size
9050 of MODE. */
9052 bool
9053 aarch64_offset_7bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
9055 HOST_WIDE_INT multiple;
9056 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
9057 && IN_RANGE (multiple, -64, 63));
9060 /* Return true if OFFSET is a signed 9-bit value. */
9062 bool
9063 aarch64_offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
9064 poly_int64 offset)
9066 HOST_WIDE_INT const_offset;
9067 return (offset.is_constant (&const_offset)
9068 && IN_RANGE (const_offset, -256, 255));
9071 /* Return true if OFFSET is a signed 9-bit value multiplied by the size
9072 of MODE. */
9074 static inline bool
9075 offset_9bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
9077 HOST_WIDE_INT multiple;
9078 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
9079 && IN_RANGE (multiple, -256, 255));
9082 /* Return true if OFFSET is an unsigned 12-bit value multiplied by the size
9083 of MODE. */
9085 static inline bool
9086 offset_12bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
9088 HOST_WIDE_INT multiple;
9089 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
9090 && IN_RANGE (multiple, 0, 4095));
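/* The scaled-offset predicates above all follow the same pattern: the
   offset must be an exact multiple of the access size, and that multiple
   must fit in a signed or unsigned immediate field of the stated width.
   A minimal standalone sketch of the pattern, using plain 64-bit integers
   instead of poly_int64 (an illustrative simplification, not the actual
   implementation):

     #include <cstdint>

     // True if OFFSET is M * SIZE for some integer M with LO <= M <= HI.
     static bool
     scaled_offset_in_range (int64_t offset, int64_t size,
                             int64_t lo, int64_t hi)
     {
       if (size <= 0 || offset % size != 0)
         return false;
       int64_t multiple = offset / size;
       return multiple >= lo && multiple <= hi;
     }

   For instance, the 7-bit signed scaled check for 8-byte accesses accepts
   byte offsets in [-512, 504]:
     scaled_offset_in_range (504, 8, -64, 63) is true, while
     scaled_offset_in_range (512, 8, -64, 63) is false.  */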
9093 /* Implement TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS. */
9095 static sbitmap
9096 aarch64_get_separate_components (void)
9098 aarch64_frame &frame = cfun->machine->frame;
9099 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
9100 bitmap_clear (components);
9102 /* The registers we need saved to the frame. */
9103 bool enables_pstate_sm = aarch64_cfun_enables_pstate_sm ();
9104 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
9105 if (aarch64_register_saved_on_entry (regno))
9107 /* Disallow shrink wrapping for registers that will be clobbered
9108 by an SMSTART SM in the prologue. */
9109 if (enables_pstate_sm
9110 && (FP_REGNUM_P (regno) || PR_REGNUM_P (regno)))
9111 continue;
9113 /* Punt on saves and restores that use ST1D and LD1D. We could
9114 try to be smarter, but it would involve making sure that the
9115 spare predicate register itself is safe to use at the save
9116 and restore points. Also, when a frame pointer is being used,
9117 the slots are often out of reach of ST1D and LD1D anyway. */
9118 machine_mode mode = aarch64_reg_save_mode (regno);
9119 if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
9120 continue;
9122 poly_int64 offset = frame.reg_offset[regno];
9124 /* Get the offset relative to the register we'll use. */
9125 if (frame_pointer_needed)
9126 offset -= frame.bytes_below_hard_fp;
9128 /* Check that we can access the stack slot of the register with one
9129 direct load with no adjustments needed. */
9130 if (aarch64_sve_mode_p (mode)
9131 ? offset_9bit_signed_scaled_p (mode, offset)
9132 : offset_12bit_unsigned_scaled_p (mode, offset))
9133 bitmap_set_bit (components, regno);
9136 /* Don't mess with the hard frame pointer. */
9137 if (frame_pointer_needed)
9138 bitmap_clear_bit (components, HARD_FRAME_POINTER_REGNUM);
9140 /* If the spare predicate register used by big-endian SVE code
9141 is call-preserved, it must be saved in the main prologue
9142 before any saves that use it. */
9143 if (frame.spare_pred_reg != INVALID_REGNUM)
9144 bitmap_clear_bit (components, frame.spare_pred_reg);
9146 unsigned reg1 = frame.wb_push_candidate1;
9147 unsigned reg2 = frame.wb_push_candidate2;
9148 /* If registers have been chosen to be stored/restored with
9149 writeback, don't interfere with them, to avoid having to output explicit
9150 stack adjustment instructions. */
9151 if (reg2 != INVALID_REGNUM)
9152 bitmap_clear_bit (components, reg2);
9153 if (reg1 != INVALID_REGNUM)
9154 bitmap_clear_bit (components, reg1);
9156 bitmap_clear_bit (components, LR_REGNUM);
9157 bitmap_clear_bit (components, SP_REGNUM);
9158 if (flag_stack_clash_protection)
9160 if (frame.sve_save_and_probe != INVALID_REGNUM)
9161 bitmap_clear_bit (components, frame.sve_save_and_probe);
9162 if (frame.hard_fp_save_and_probe != INVALID_REGNUM)
9163 bitmap_clear_bit (components, frame.hard_fp_save_and_probe);
9166 /* The VG save sequence needs a temporary GPR. Punt for now on trying
9167 to find one. */
9168 bitmap_clear_bit (components, VG_REGNUM);
9170 return components;
9173 /* Implement TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB. */
9175 static sbitmap
9176 aarch64_components_for_bb (basic_block bb)
9178 bitmap in = DF_LIVE_IN (bb);
9179 bitmap gen = &DF_LIVE_BB_INFO (bb)->gen;
9180 bitmap kill = &DF_LIVE_BB_INFO (bb)->kill;
9182 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
9183 bitmap_clear (components);
9185 /* Clobbered registers don't generate values in any meaningful sense,
9186 since nothing after the clobber can rely on their value. And we can't
9187 say that partially-clobbered registers are unconditionally killed,
9188 because whether they're killed or not depends on the mode of the
9189 value they're holding. Thus partially call-clobbered registers
9190 appear in neither the kill set nor the gen set.
9192 Check manually for any calls that clobber more of a register than the
9193 current function can. */
9194 function_abi_aggregator callee_abis;
9195 rtx_insn *insn;
9196 FOR_BB_INSNS (bb, insn)
9197 if (CALL_P (insn))
9198 callee_abis.note_callee_abi (insn_callee_abi (insn));
9199 HARD_REG_SET extra_caller_saves = callee_abis.caller_save_regs (*crtl->abi);
9201 /* GPRs are used in a bb if they are in the IN, GEN, or KILL sets. */
9202 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
9203 if (!fixed_regs[regno]
9204 && !crtl->abi->clobbers_full_reg_p (regno)
9205 && (TEST_HARD_REG_BIT (extra_caller_saves, regno)
9206 || bitmap_bit_p (in, regno)
9207 || bitmap_bit_p (gen, regno)
9208 || bitmap_bit_p (kill, regno)))
9210 bitmap_set_bit (components, regno);
9212 /* If there is a callee-save at an adjacent offset, add it too
9213 to increase the use of LDP/STP. */
9214 poly_int64 offset = cfun->machine->frame.reg_offset[regno];
9215 unsigned regno2 = multiple_p (offset, 16) ? regno + 1 : regno - 1;
9217 if (regno2 <= LAST_SAVED_REGNUM)
9219 poly_int64 offset2 = cfun->machine->frame.reg_offset[regno2];
9220 if (regno < regno2
9221 ? known_eq (offset + 8, offset2)
9222 : multiple_p (offset2, 16) && known_eq (offset2 + 8, offset))
9223 bitmap_set_bit (components, regno2);
9227 return components;
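/* For instance, if x22 is live in the block, its save slot sits at a
   16-byte-aligned offset, and x23's slot is the 8 bytes directly above,
   then x23 is added to the component set as well so that the two saves
   and restores can share a single STP/LDP.  */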
9230 /* Implement TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS.
9231 Nothing to do for aarch64. */
9233 static void
9234 aarch64_disqualify_components (sbitmap, edge, sbitmap, bool)
9238 /* Return the next set bit in BMP from START onwards. Return the total number
9239 of bits in BMP if no set bit is found at or after START. */
9241 static unsigned int
9242 aarch64_get_next_set_bit (sbitmap bmp, unsigned int start)
9244 unsigned int nbits = SBITMAP_SIZE (bmp);
9245 if (start == nbits)
9246 return start;
9248 gcc_assert (start < nbits);
9249 for (unsigned int i = start; i < nbits; i++)
9250 if (bitmap_bit_p (bmp, i))
9251 return i;
9253 return nbits;
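/* A sketch of the iteration idiom this helper supports (the loop shape
   used by aarch64_process_components below):

     unsigned last = SBITMAP_SIZE (components);
     unsigned regno = aarch64_get_next_set_bit (components, R0_REGNUM);
     while (regno != last)
       {
         ...handle REGNO...;
         regno = aarch64_get_next_set_bit (components, regno + 1);
       }
*/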
9256 /* Do the work for aarch64_emit_prologue_components and
9257 aarch64_emit_epilogue_components. COMPONENTS is the bitmap of registers
9258 to save/restore, PROLOGUE_P indicates whether to emit the prologue sequence
9259 for these components or the epilogue sequence. That is, it determines
9260 whether we should emit stores or loads and what kind of CFA notes to attach
9261 to the insns. Otherwise the logic for the two sequences is very
9262 similar. */
9264 static void
9265 aarch64_process_components (sbitmap components, bool prologue_p)
9267 aarch64_frame &frame = cfun->machine->frame;
9268 rtx ptr_reg = gen_rtx_REG (Pmode, frame_pointer_needed
9269 ? HARD_FRAME_POINTER_REGNUM
9270 : STACK_POINTER_REGNUM);
9272 unsigned last_regno = SBITMAP_SIZE (components);
9273 unsigned regno = aarch64_get_next_set_bit (components, R0_REGNUM);
9274 rtx_insn *insn = NULL;
9276 while (regno != last_regno)
9278 bool frame_related_p = aarch64_emit_cfi_for_reg_p (regno);
9279 machine_mode mode = aarch64_reg_save_mode (regno);
9281 rtx reg = gen_rtx_REG (mode, regno);
9282 poly_int64 offset = frame.reg_offset[regno];
9283 if (frame_pointer_needed)
9284 offset -= frame.bytes_below_hard_fp;
9286 rtx addr = plus_constant (Pmode, ptr_reg, offset);
9287 rtx mem = gen_frame_mem (mode, addr);
9289 rtx set = prologue_p ? gen_rtx_SET (mem, reg) : gen_rtx_SET (reg, mem);
9290 unsigned regno2 = aarch64_get_next_set_bit (components, regno + 1);
9291 /* No more registers to handle after REGNO.
9292 Emit a single save/restore and exit. */
9293 if (regno2 == last_regno)
9295 insn = emit_insn (set);
9296 if (frame_related_p)
9298 RTX_FRAME_RELATED_P (insn) = 1;
9299 if (prologue_p)
9300 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
9301 else
9302 add_reg_note (insn, REG_CFA_RESTORE, reg);
9304 break;
9307 poly_int64 offset2 = frame.reg_offset[regno2];
9308 /* The next register is not of the same class or its offset is not
9309 mergeable with the current one into a pair. */
9310 if (aarch64_sve_mode_p (mode)
9311 || !satisfies_constraint_Ump (mem)
9312 || GP_REGNUM_P (regno) != GP_REGNUM_P (regno2)
9313 || (crtl->abi->id () == ARM_PCS_SIMD && FP_REGNUM_P (regno))
9314 || maybe_ne ((offset2 - frame.reg_offset[regno]),
9315 GET_MODE_SIZE (mode)))
9317 insn = emit_insn (set);
9318 if (frame_related_p)
9320 RTX_FRAME_RELATED_P (insn) = 1;
9321 if (prologue_p)
9322 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
9323 else
9324 add_reg_note (insn, REG_CFA_RESTORE, reg);
9327 regno = regno2;
9328 continue;
9331 bool frame_related2_p = aarch64_emit_cfi_for_reg_p (regno2);
9333 /* REGNO2 can be saved/restored in a pair with REGNO. */
9334 rtx reg2 = gen_rtx_REG (mode, regno2);
9335 if (frame_pointer_needed)
9336 offset2 -= frame.bytes_below_hard_fp;
9337 rtx addr2 = plus_constant (Pmode, ptr_reg, offset2);
9338 rtx mem2 = gen_frame_mem (mode, addr2);
9339 rtx set2 = prologue_p ? gen_rtx_SET (mem2, reg2)
9340 : gen_rtx_SET (reg2, mem2);
9342 if (prologue_p)
9343 insn = emit_insn (aarch64_gen_store_pair (mem, reg, reg2));
9344 else
9345 insn = emit_insn (aarch64_gen_load_pair (reg, reg2, mem));
9347 if (frame_related_p || frame_related2_p)
9349 RTX_FRAME_RELATED_P (insn) = 1;
9350 if (prologue_p)
9352 if (frame_related_p)
9353 add_reg_note (insn, REG_CFA_OFFSET, set);
9354 if (frame_related2_p)
9355 add_reg_note (insn, REG_CFA_OFFSET, set2);
9357 else
9359 if (frame_related_p)
9360 add_reg_note (insn, REG_CFA_RESTORE, reg);
9361 if (frame_related2_p)
9362 add_reg_note (insn, REG_CFA_RESTORE, reg2);
9366 regno = aarch64_get_next_set_bit (components, regno2 + 1);
9370 /* Implement TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS. */
9372 static void
9373 aarch64_emit_prologue_components (sbitmap components)
9375 aarch64_process_components (components, true);
9378 /* Implement TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS. */
9380 static void
9381 aarch64_emit_epilogue_components (sbitmap components)
9383 aarch64_process_components (components, false);
9386 /* Implement TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS. */
9388 static void
9389 aarch64_set_handled_components (sbitmap components)
9391 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
9392 if (bitmap_bit_p (components, regno))
9393 cfun->machine->reg_is_wrapped_separately[regno] = true;
9396 /* On AArch64 we have an ABI defined safe buffer. This constant is used to
9397 determine the probe offset for alloca.  */
9399 static HOST_WIDE_INT
9400 aarch64_stack_clash_protection_alloca_probe_range (void)
9402 return STACK_CLASH_CALLER_GUARD;
9405 /* Emit a stack tie that acts as a scheduling barrier for all previous and
9406 subsequent memory accesses and that requires the stack pointer and REG
9407 to have their current values. REG can be stack_pointer_rtx if no
9408 other register's value needs to be fixed. */
9410 static void
9411 aarch64_emit_stack_tie (rtx reg)
9413 emit_insn (gen_stack_tie (reg, gen_int_mode (REGNO (reg), DImode)));
9416 /* Allocate POLY_SIZE bytes of stack space using TEMP1 and TEMP2 as scratch
9417 registers. If POLY_SIZE is not large enough to require a probe this function
9418 will only adjust the stack. When allocating the stack space
9419 FRAME_RELATED_P is then used to indicate if the allocation is frame related.
9420 FINAL_ADJUSTMENT_P indicates whether we are allocating the area below
9421 the saved registers. If we are then we ensure that any allocation
9422 larger than the ABI defined buffer needs a probe so that the
9423 invariant of having a 1KB buffer is maintained.
9425 We emit barriers after each stack adjustment to prevent optimizations from
9426 breaking the invariant that we never drop the stack more than a page. This
9427 invariant is needed to make it easier to correctly handle asynchronous
9428 events: if we were to drop the stack by more than a page and only
9429 probe the pages afterwards, a signal taken somewhere in between would
9430 leave the signal handler unaware of the state of the stack and unable to make any
9431 assumptions about which pages have been probed.
9433 FORCE_ISA_MODE is AARCH64_ISA_MODE_SM_ON if any variable component of
9434 POLY_SIZE is measured relative to the SME vector length instead of the
9435 current prevailing vector length. It is 0 otherwise. */
9437 static void
9438 aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
9439 poly_int64 poly_size,
9440 aarch64_isa_mode force_isa_mode,
9441 bool frame_related_p,
9442 bool final_adjustment_p)
9444 aarch64_frame &frame = cfun->machine->frame;
9445 HOST_WIDE_INT guard_size
9446 = 1 << param_stack_clash_protection_guard_size;
9447 HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD;
9448 HOST_WIDE_INT byte_sp_alignment = STACK_BOUNDARY / BITS_PER_UNIT;
9449 gcc_assert (multiple_p (poly_size, byte_sp_alignment));
9450 HOST_WIDE_INT min_probe_threshold
9451 = (final_adjustment_p
9452 ? guard_used_by_caller + byte_sp_alignment
9453 : guard_size - guard_used_by_caller);
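/* With the default 64KB guard and the 1KB caller-reserved region, this
   works out to a threshold of 64KB - 1KB = 63KB for the main allocations
   and 1KB + 16 bytes for the final adjustment below the saved registers,
   matching the "up to 63KB without probing" guarantee described in the
   frame-layout comment further down.  */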
9454 poly_int64 frame_size = frame.frame_size;
9456 /* We should always have a positive probe threshold. */
9457 gcc_assert (min_probe_threshold > 0);
9459 if (flag_stack_clash_protection && !final_adjustment_p)
9461 poly_int64 initial_adjust = frame.initial_adjust;
9462 poly_int64 sve_callee_adjust = frame.sve_callee_adjust;
9463 poly_int64 final_adjust = frame.final_adjust;
9465 if (known_eq (frame_size, 0))
9467 dump_stack_clash_frame_info (NO_PROBE_NO_FRAME, false);
9469 else if (known_lt (initial_adjust + sve_callee_adjust,
9470 guard_size - guard_used_by_caller)
9471 && known_lt (final_adjust, guard_used_by_caller))
9473 dump_stack_clash_frame_info (NO_PROBE_SMALL_FRAME, true);
9477 /* If SIZE is not large enough to require probing, just adjust the stack and
9478 exit. */
9479 if (known_lt (poly_size, min_probe_threshold)
9480 || !flag_stack_clash_protection)
9482 aarch64_sub_sp (temp1, temp2, poly_size, force_isa_mode,
9483 frame_related_p);
9484 return;
9487 HOST_WIDE_INT size;
9488 /* Handle the SVE non-constant case first. */
9489 if (!poly_size.is_constant (&size))
9491 if (dump_file)
9493 fprintf (dump_file, "Stack clash SVE prologue: ");
9494 print_dec (poly_size, dump_file);
9495 fprintf (dump_file, " bytes, dynamic probing will be required.\n");
9498 /* First calculate the number of bytes we're actually spilling.  */
9499 aarch64_add_offset (Pmode, temp1, CONST0_RTX (Pmode),
9500 poly_size, temp1, temp2, force_isa_mode,
9501 false, true);
9503 rtx_insn *insn = get_last_insn ();
9505 if (frame_related_p)
9507 /* This is done to provide unwinding information for the stack
9508 adjustments we're about to do; however, to prevent the optimizers
9509 from removing the R11 move and leaving the CFA note (which would be
9510 very wrong) we tie the old and new stack pointer together.
9511 The tie will expand to nothing but the optimizers will not touch
9512 the instruction. */
9513 rtx stack_ptr_copy = gen_rtx_REG (Pmode, STACK_CLASH_SVE_CFA_REGNUM);
9514 emit_move_insn (stack_ptr_copy, stack_pointer_rtx);
9515 aarch64_emit_stack_tie (stack_ptr_copy);
9517 /* We want the CFA independent of the stack pointer for the
9518 duration of the loop. */
9519 add_reg_note (insn, REG_CFA_DEF_CFA, stack_ptr_copy);
9520 RTX_FRAME_RELATED_P (insn) = 1;
9523 rtx probe_const = gen_int_mode (min_probe_threshold, Pmode);
9524 rtx guard_const = gen_int_mode (guard_size, Pmode);
9526 insn = emit_insn (gen_probe_sve_stack_clash (Pmode, stack_pointer_rtx,
9527 stack_pointer_rtx, temp1,
9528 probe_const, guard_const));
9530 /* Now reset the CFA register if needed. */
9531 if (frame_related_p)
9533 add_reg_note (insn, REG_CFA_DEF_CFA,
9534 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
9535 gen_int_mode (poly_size, Pmode)));
9536 RTX_FRAME_RELATED_P (insn) = 1;
9539 return;
9542 if (dump_file)
9543 fprintf (dump_file,
9544 "Stack clash AArch64 prologue: " HOST_WIDE_INT_PRINT_DEC
9545 " bytes, probing will be required.\n", size);
9547 /* Round size to the nearest multiple of guard_size, and calculate the
9548 residual as the difference between the original size and the rounded
9549 size. */
9550 HOST_WIDE_INT rounded_size = ROUND_DOWN (size, guard_size);
9551 HOST_WIDE_INT residual = size - rounded_size;
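/* Worked example, assuming the default 64KB guard and the 1KB
   caller-reserved region: for size == 200000 bytes, rounded_size is
   196608 (3 * 65536) and residual is 3392.  For a page count like this,
   the inline path below allocates each 64KB page and probes 1KB above
   the new stack pointer (three probes), provided the count does not
   exceed STACK_CLASH_MAX_UNROLL_PAGES; larger counts fall through to the
   loop-based code further down.  The 3392-byte residual is then handled
   at the end.  */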
9553 /* We can handle a small number of allocations/probes inline. Otherwise
9554 punt to a loop. */
9555 if (rounded_size <= STACK_CLASH_MAX_UNROLL_PAGES * guard_size)
9557 for (HOST_WIDE_INT i = 0; i < rounded_size; i += guard_size)
9559 aarch64_sub_sp (NULL, temp2, guard_size, force_isa_mode, true);
9560 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
9561 guard_used_by_caller));
9562 emit_insn (gen_blockage ());
9564 dump_stack_clash_frame_info (PROBE_INLINE, size != rounded_size);
9566 else
9568 /* Compute the ending address. */
9569 aarch64_add_offset (Pmode, temp1, stack_pointer_rtx, -rounded_size,
9570 temp1, NULL, force_isa_mode, false, true);
9571 rtx_insn *insn = get_last_insn ();
9573 /* For the initial allocation, we don't have a frame pointer
9574 set up, so we always need CFI notes. If we're doing the
9575 final allocation, then we may have a frame pointer, in which
9576 case it is the CFA, otherwise we need CFI notes.
9578 We can determine which allocation we are doing by looking at
9579 the value of FRAME_RELATED_P since the final allocations are not
9580 frame related. */
9581 if (frame_related_p)
9583 /* We want the CFA independent of the stack pointer for the
9584 duration of the loop. */
9585 add_reg_note (insn, REG_CFA_DEF_CFA,
9586 plus_constant (Pmode, temp1, rounded_size));
9587 RTX_FRAME_RELATED_P (insn) = 1;
9590 /* This allocates and probes the stack. Note that this re-uses some of
9591 the existing Ada stack protection code. However we are guaranteed not
9592 to enter the non-loop or residual branches of that code.
9594 The non-loop part won't be entered because if our allocation amount
9595 doesn't require a loop, the case above would handle it.
9597 The residual branch won't be entered because TEMP1 is a multiple of
9598 the allocation size, so the residual will always be 0.  As such, the only
9599 part we are actually using from that code is the loop setup. The
9600 actual probing is done in aarch64_output_probe_stack_range. */
9601 insn = emit_insn (gen_probe_stack_range (stack_pointer_rtx,
9602 stack_pointer_rtx, temp1));
9604 /* Now reset the CFA register if needed. */
9605 if (frame_related_p)
9607 add_reg_note (insn, REG_CFA_DEF_CFA,
9608 plus_constant (Pmode, stack_pointer_rtx, rounded_size));
9609 RTX_FRAME_RELATED_P (insn) = 1;
9612 emit_insn (gen_blockage ());
9613 dump_stack_clash_frame_info (PROBE_LOOP, size != rounded_size);
9616 /* Handle any residuals. Residuals of at least MIN_PROBE_THRESHOLD have to
9617 be probed. This maintains the requirement that each page is probed at
9618 least once. For initial probing we probe only if the allocation is
9619 more than GUARD_SIZE - buffer, and below the saved registers we probe
9620 if the amount is larger than buffer. GUARD_SIZE - buffer + buffer ==
9621 GUARD_SIZE.  This means that any allocation large enough to trigger
9622 a probe here gets at least one, and any allocation too small for this
9623 code to emit anything will already have had its page probed by the
9624 saving of FP/LR, either by this function or by a callee.  If
9625 we don't have any callees then we won't have more stack adjustments and so
9626 are still safe. */
9627 if (residual)
9629 gcc_assert (guard_used_by_caller + byte_sp_alignment <= size);
9631 /* If we're doing final adjustments, and we've done any full page
9632 allocations then any residual needs to be probed. */
9633 if (final_adjustment_p && rounded_size != 0)
9634 min_probe_threshold = 0;
9636 aarch64_sub_sp (temp1, temp2, residual, force_isa_mode, frame_related_p);
9637 if (residual >= min_probe_threshold)
9639 if (dump_file)
9640 fprintf (dump_file,
9641 "Stack clash AArch64 prologue residuals: "
9642 HOST_WIDE_INT_PRINT_DEC " bytes, probing will be required."
9643 "\n", residual);
9645 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
9646 guard_used_by_caller));
9647 emit_insn (gen_blockage ());
9652 /* Implement TARGET_EXTRA_LIVE_ON_ENTRY. */
9654 void
9655 aarch64_extra_live_on_entry (bitmap regs)
9657 if (TARGET_ZA)
9659 bitmap_set_bit (regs, LOWERING_REGNUM);
9660 bitmap_set_bit (regs, SME_STATE_REGNUM);
9661 bitmap_set_bit (regs, TPIDR2_SETUP_REGNUM);
9662 bitmap_set_bit (regs, ZA_FREE_REGNUM);
9663 bitmap_set_bit (regs, ZA_SAVED_REGNUM);
9665 /* The only time ZA can't have live contents on entry is when
9666 the function explicitly treats it as a pure output. */
9667 auto za_flags = aarch64_cfun_shared_flags ("za");
9668 if (za_flags != (AARCH64_STATE_SHARED | AARCH64_STATE_OUT))
9669 bitmap_set_bit (regs, ZA_REGNUM);
9671 /* Since ZT0 is call-clobbered, it is only live on input if
9672 it is explicitly shared, and is not a pure output. */
9673 auto zt0_flags = aarch64_cfun_shared_flags ("zt0");
9674 if (zt0_flags != 0
9675 && zt0_flags != (AARCH64_STATE_SHARED | AARCH64_STATE_OUT))
9676 bitmap_set_bit (regs, ZT0_REGNUM);
9680 /* Return 1 if the register is used by the epilogue. We need to say the
9681 return register is used, but only after epilogue generation is complete.
9682 Note that in the case of sibcalls, the values "used by the epilogue" are
9683 considered live at the start of the called function. */
9685 int
9686 aarch64_epilogue_uses (int regno)
9688 if (epilogue_completed)
9690 if (regno == LR_REGNUM)
9691 return 1;
9693 if (regno == LOWERING_REGNUM && TARGET_ZA)
9694 return 1;
9695 if (regno == SME_STATE_REGNUM && TARGET_ZA)
9696 return 1;
9697 if (regno == TPIDR2_SETUP_REGNUM && TARGET_ZA)
9698 return 1;
9699 /* If the function shares SME state with its caller, ensure that that
9700 data is not in the lazy save buffer on exit. */
9701 if (regno == ZA_SAVED_REGNUM && aarch64_cfun_incoming_pstate_za () != 0)
9702 return 1;
9703 if (regno == ZA_REGNUM && aarch64_cfun_shared_flags ("za") != 0)
9704 return 1;
9705 if (regno == ZT0_REGNUM && aarch64_cfun_shared_flags ("zt0") != 0)
9706 return 1;
9707 return 0;
9710 /* Implement TARGET_USE_LATE_PROLOGUE_EPILOGUE. */
9712 static bool
9713 aarch64_use_late_prologue_epilogue ()
9715 return aarch64_cfun_enables_pstate_sm ();
9718 /* The current function's frame has a save slot for the incoming state
9719 of SVCR. Return a legitimate memory for the slot, based on the hard
9720 frame pointer. */
9722 static rtx
9723 aarch64_old_svcr_mem ()
9725 gcc_assert (frame_pointer_needed
9726 && known_ge (cfun->machine->frame.old_svcr_offset, 0));
9727 rtx base = hard_frame_pointer_rtx;
9728 poly_int64 offset = (0
9729 /* hard fp -> bottom of frame. */
9730 - cfun->machine->frame.bytes_below_hard_fp
9731 /* bottom of frame -> save slot. */
9732 + cfun->machine->frame.old_svcr_offset);
9733 return gen_frame_mem (DImode, plus_constant (Pmode, base, offset));
9736 /* The current function's frame has a save slot for the incoming state
9737 of SVCR. Load the slot into register REGNO and return the register. */
9739 static rtx
9740 aarch64_read_old_svcr (unsigned int regno)
9742 rtx svcr = gen_rtx_REG (DImode, regno);
9743 emit_move_insn (svcr, aarch64_old_svcr_mem ());
9744 return svcr;
9747 /* Like the rtx version of aarch64_guard_switch_pstate_sm, but first
9748 load the incoming value of SVCR from its save slot into temporary
9749 register REGNO. */
9751 static rtx_insn *
9752 aarch64_guard_switch_pstate_sm (unsigned int regno,
9753 aarch64_isa_mode local_mode)
9755 rtx old_svcr = aarch64_read_old_svcr (regno);
9756 return aarch64_guard_switch_pstate_sm (old_svcr, local_mode);
9759 /* AArch64 stack frames generated by this compiler look like:
9761 +-------------------------------+
9763 | incoming stack arguments |
9765 +-------------------------------+
9766 | | <-- incoming stack pointer (aligned)
9767 | callee-allocated save area |
9768 | for register varargs |
9770 +-------------------------------+
9771 | local variables (1) | <-- frame_pointer_rtx
9773 +-------------------------------+
9774 | padding (1) |
9775 +-------------------------------+
9776 | callee-saved registers |
9777 +-------------------------------+
9778 | LR' |
9779 +-------------------------------+
9780 | FP' |
9781 +-------------------------------+ <-- hard_frame_pointer_rtx (aligned)
9782 | SVE vector registers |
9783 +-------------------------------+
9784 | SVE predicate registers |
9785 +-------------------------------+
9786 | local variables (2) |
9787 +-------------------------------+
9788 | padding (2) |
9789 +-------------------------------+
9790 | dynamic allocation |
9791 +-------------------------------+
9792 | padding |
9793 +-------------------------------+
9794 | outgoing stack arguments | <-- arg_pointer
9796 +-------------------------------+
9797 | | <-- stack_pointer_rtx (aligned)
9799 The regions marked (1) and (2) are mutually exclusive. (2) is used
9800 when aarch64_save_regs_above_locals_p is true.
9802 Dynamic stack allocations via alloca() decrease stack_pointer_rtx
9803 but leave frame_pointer_rtx and hard_frame_pointer_rtx
9804 unchanged.
9806 By default for stack-clash we assume the guard is at least 64KB, but this
9807 value is configurable to either 4KB or 64KB. We also force the guard size to
9808 be the same as the probing interval and both values are kept in sync.
9810 With those assumptions the callee can allocate up to 63KB (or 3KB depending
9811 on the guard size) of stack space without probing.
9813 When probing is needed, we emit a probe at the start of the prologue
9814 and every PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE bytes thereafter.
9816 We can also use register saves as probes. These are stored in
9817 sve_save_and_probe and hard_fp_save_and_probe.
9819 For outgoing arguments we probe if the size is larger than 1KB, such that
9820 the ABI specified buffer is maintained for the next callee.
9822 The following registers are reserved during frame layout and should not be
9823 used for any other purpose:
9825 - r11: Used by stack clash protection when SVE is enabled, and also
9826 as an anchor register when saving and restoring registers
9827 - r12(EP0) and r13(EP1): Used as temporaries for stack adjustment.
9828 - r14 and r15: Used for speculation tracking.
9829 - r16(IP0), r17(IP1): Used by indirect tailcalls.
9830 - r30(LR), r29(FP): Used by standard frame layout.
9832 These registers must be avoided in frame layout related code unless the
9833 explicit intention is to interact with one of the features listed above. */
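/* As a rough example of the default layout above (region (1) in use, no
   SVE state, padding ignored): a function that needs a frame pointer,
   saves x19 and x20, and has 32 bytes of locals plus 16 bytes of outgoing
   arguments would place, from high to low addresses, the 32 bytes of
   locals, the x19/x20 save slots, then the frame record with LR' above
   FP' (the hard frame pointer pointing at the saved FP'), and finally the
   16 bytes of outgoing arguments immediately above the final stack
   pointer.  */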
9835 /* Generate the prologue instructions for entry into a function.
9836 Establish the stack frame by decreasing the stack pointer with a
9837 properly calculated size and, if necessary, create a frame record
9838 filled with the values of LR and previous frame pointer. The
9839 current FP is also set up if it is in use. */
9841 void
9842 aarch64_expand_prologue (void)
9844 aarch64_frame &frame = cfun->machine->frame;
9845 poly_int64 frame_size = frame.frame_size;
9846 poly_int64 initial_adjust = frame.initial_adjust;
9847 HOST_WIDE_INT callee_adjust = frame.callee_adjust;
9848 poly_int64 final_adjust = frame.final_adjust;
9849 poly_int64 sve_callee_adjust = frame.sve_callee_adjust;
9850 unsigned reg1 = frame.wb_push_candidate1;
9851 unsigned reg2 = frame.wb_push_candidate2;
9852 bool emit_frame_chain = frame.emit_frame_chain;
9853 rtx_insn *insn;
9854 aarch64_isa_mode force_isa_mode = 0;
9855 if (aarch64_cfun_enables_pstate_sm ())
9856 force_isa_mode = AARCH64_ISA_MODE_SM_ON;
9858 if (flag_stack_clash_protection
9859 && known_eq (callee_adjust, 0)
9860 && known_lt (frame.reg_offset[VG_REGNUM], 0))
9862 /* Fold the SVE allocation into the initial allocation.
9863 We don't do this in aarch64_layout_frame to avoid pessimizing
9864 the epilogue code. */
9865 initial_adjust += sve_callee_adjust;
9866 sve_callee_adjust = 0;
9869 /* Sign return address for functions. */
9870 if (aarch64_return_address_signing_enabled ())
9872 switch (aarch64_ra_sign_key)
9874 case AARCH64_KEY_A:
9875 insn = emit_insn (gen_paciasp ());
9876 break;
9877 case AARCH64_KEY_B:
9878 insn = emit_insn (gen_pacibsp ());
9879 break;
9880 default:
9881 gcc_unreachable ();
9883 add_reg_note (insn, REG_CFA_NEGATE_RA_STATE, const0_rtx);
9884 RTX_FRAME_RELATED_P (insn) = 1;
9887 /* Push return address to shadow call stack. */
9888 if (frame.is_scs_enabled)
9889 emit_insn (gen_scs_push ());
9891 if (flag_stack_usage_info)
9892 current_function_static_stack_size = constant_lower_bound (frame_size);
9894 if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
9896 if (crtl->is_leaf && !cfun->calls_alloca)
9898 if (maybe_gt (frame_size, PROBE_INTERVAL)
9899 && maybe_gt (frame_size, get_stack_check_protect ()))
9900 aarch64_emit_probe_stack_range (get_stack_check_protect (),
9901 (frame_size
9902 - get_stack_check_protect ()));
9904 else if (maybe_gt (frame_size, 0))
9905 aarch64_emit_probe_stack_range (get_stack_check_protect (), frame_size);
9908 rtx tmp0_rtx = gen_rtx_REG (Pmode, EP0_REGNUM);
9909 rtx tmp1_rtx = gen_rtx_REG (Pmode, EP1_REGNUM);
9911 /* In theory we should never have both an initial adjustment
9912 and a callee save adjustment.  Verify that this is the case since the
9913 code below does not handle it for -fstack-clash-protection. */
9914 gcc_assert (known_eq (initial_adjust, 0) || callee_adjust == 0);
9916 /* Will only probe if the initial adjustment is larger than the guard
9917 less the amount of the guard reserved for use by the caller's
9918 outgoing args. */
9919 aarch64_allocate_and_probe_stack_space (tmp0_rtx, tmp1_rtx, initial_adjust,
9920 force_isa_mode, true, false);
9922 if (callee_adjust != 0)
9923 aarch64_push_regs (reg1, reg2, callee_adjust);
9925 /* The offset of the current SP from the bottom of the static frame. */
9926 poly_int64 bytes_below_sp = frame_size - initial_adjust - callee_adjust;
9928 if (emit_frame_chain)
9930 /* The offset of the frame chain record (if any) from the current SP. */
9931 poly_int64 chain_offset = (initial_adjust + callee_adjust
9932 - frame.bytes_above_hard_fp);
9933 gcc_assert (known_ge (chain_offset, 0));
9935 gcc_assert (reg1 == R29_REGNUM && reg2 == R30_REGNUM);
9936 if (callee_adjust == 0)
9937 aarch64_save_callee_saves (bytes_below_sp, frame.saved_gprs,
9938 false, false);
9939 else
9940 gcc_assert (known_eq (chain_offset, 0));
9941 aarch64_add_offset (Pmode, hard_frame_pointer_rtx,
9942 stack_pointer_rtx, chain_offset,
9943 tmp1_rtx, tmp0_rtx, force_isa_mode,
9944 frame_pointer_needed);
9945 if (frame_pointer_needed && !frame_size.is_constant ())
9947 /* Variable-sized frames need to describe the save slot
9948 address using DW_CFA_expression rather than DW_CFA_offset.
9949 This means that, without taking further action, the
9950 locations of the registers that we've already saved would
9951 remain based on the stack pointer even after we redefine
9952 the CFA based on the frame pointer. We therefore need new
9953 DW_CFA_expressions to re-express the save slots with addresses
9954 based on the frame pointer. */
9955 rtx_insn *insn = get_last_insn ();
9956 gcc_assert (RTX_FRAME_RELATED_P (insn));
9958 /* Add an explicit CFA definition if this was previously
9959 implicit. */
9960 if (!find_reg_note (insn, REG_CFA_ADJUST_CFA, NULL_RTX))
9962 rtx src = plus_constant (Pmode, stack_pointer_rtx, chain_offset);
9963 add_reg_note (insn, REG_CFA_ADJUST_CFA,
9964 gen_rtx_SET (hard_frame_pointer_rtx, src));
9967 /* Change the save slot expressions for the registers that
9968 we've already saved. */
9969 aarch64_add_cfa_expression (insn, regno_reg_rtx[reg2],
9970 hard_frame_pointer_rtx, UNITS_PER_WORD);
9971 aarch64_add_cfa_expression (insn, regno_reg_rtx[reg1],
9972 hard_frame_pointer_rtx, 0);
9974 aarch64_emit_stack_tie (hard_frame_pointer_rtx);
9977 aarch64_save_callee_saves (bytes_below_sp, frame.saved_gprs, true,
9978 emit_frame_chain);
9979 if (maybe_ge (frame.reg_offset[VG_REGNUM], 0))
9981 unsigned int saved_regs[] = { VG_REGNUM };
9982 aarch64_save_callee_saves (bytes_below_sp, saved_regs, true,
9983 emit_frame_chain);
9985 if (maybe_ne (sve_callee_adjust, 0))
9987 gcc_assert (!flag_stack_clash_protection
9988 || known_eq (initial_adjust, 0)
9989 /* The VG save isn't shrink-wrapped and so serves as
9990 a probe of the initial allocation. */
9991 || known_eq (frame.reg_offset[VG_REGNUM], bytes_below_sp));
9992 aarch64_allocate_and_probe_stack_space (tmp1_rtx, tmp0_rtx,
9993 sve_callee_adjust,
9994 force_isa_mode,
9995 !frame_pointer_needed, false);
9996 bytes_below_sp -= sve_callee_adjust;
9998 aarch64_save_callee_saves (bytes_below_sp, frame.saved_prs, true,
9999 emit_frame_chain);
10000 aarch64_save_callee_saves (bytes_below_sp, frame.saved_fprs, true,
10001 emit_frame_chain);
10003 /* We may need to probe the final adjustment if it is larger than the guard
10004 that is assumed by the callee.  */
10005 gcc_assert (known_eq (bytes_below_sp, final_adjust));
10006 aarch64_allocate_and_probe_stack_space (tmp1_rtx, tmp0_rtx, final_adjust,
10007 force_isa_mode,
10008 !frame_pointer_needed, true);
10009 if (emit_frame_chain && maybe_ne (final_adjust, 0))
10010 aarch64_emit_stack_tie (hard_frame_pointer_rtx);
10012 /* Save the incoming value of PSTATE.SM, if required. Code further
10013 down does this for locally-streaming functions. */
10014 if (known_ge (frame.old_svcr_offset, 0)
10015 && !aarch64_cfun_enables_pstate_sm ())
10017 rtx mem = aarch64_old_svcr_mem ();
10018 MEM_VOLATILE_P (mem) = 1;
10019 if (TARGET_SME)
10021 rtx reg = gen_rtx_REG (DImode, IP0_REGNUM);
10022 emit_insn (gen_aarch64_read_svcr (reg));
10023 emit_move_insn (mem, reg);
10025 else
10027 rtx old_r0 = NULL_RTX, old_r1 = NULL_RTX;
10028 auto &args = crtl->args.info;
10029 if (args.aapcs_ncrn > 0)
10031 old_r0 = gen_rtx_REG (DImode, PROBE_STACK_FIRST_REGNUM);
10032 emit_move_insn (old_r0, gen_rtx_REG (DImode, R0_REGNUM));
10034 if (args.aapcs_ncrn > 1)
10036 old_r1 = gen_rtx_REG (DImode, PROBE_STACK_SECOND_REGNUM);
10037 emit_move_insn (old_r1, gen_rtx_REG (DImode, R1_REGNUM));
10039 emit_insn (gen_aarch64_get_sme_state ());
10040 emit_move_insn (mem, gen_rtx_REG (DImode, R0_REGNUM));
10041 if (old_r0)
10042 emit_move_insn (gen_rtx_REG (DImode, R0_REGNUM), old_r0);
10043 if (old_r1)
10044 emit_move_insn (gen_rtx_REG (DImode, R1_REGNUM), old_r1);
10048 /* Enable PSTATE.SM, if required. */
10049 if (aarch64_cfun_enables_pstate_sm ())
10051 rtx_insn *guard_label = nullptr;
10052 if (known_ge (cfun->machine->frame.old_svcr_offset, 0))
10054 /* The current function is streaming-compatible. Save the
10055 original state of PSTATE.SM. */
10056 rtx svcr = gen_rtx_REG (DImode, IP0_REGNUM);
10057 emit_insn (gen_aarch64_read_svcr (svcr));
10058 emit_move_insn (aarch64_old_svcr_mem (), svcr);
10059 guard_label = aarch64_guard_switch_pstate_sm (svcr,
10060 AARCH64_ISA_MODE);
10062 aarch64_sme_mode_switch_regs args_switch;
10063 auto &args = crtl->args.info;
10064 for (unsigned int i = 0; i < args.num_sme_mode_switch_args; ++i)
10066 rtx x = args.sme_mode_switch_args[i];
10067 args_switch.add_reg (GET_MODE (x), REGNO (x));
10069 args_switch.emit_prologue ();
10070 emit_insn (gen_aarch64_smstart_sm ());
10071 args_switch.emit_epilogue ();
10072 if (guard_label)
10073 emit_label (guard_label);
10077 /* Return TRUE if we can use a simple_return insn.
10079 This function checks whether the function's stack frame is empty, which
10080 means no restore actions are needed.  The pro_and_epilogue pass uses
10081 this to check whether the shrink-wrapping optimization is feasible.  */
10083 bool
10084 aarch64_use_return_insn_p (void)
10086 if (!reload_completed)
10087 return false;
10089 if (crtl->profile)
10090 return false;
10092 return known_eq (cfun->machine->frame.frame_size, 0);
10095 /* Generate the epilogue instructions for returning from a function.
10096 This is almost exactly the reverse of the prolog sequence, except
10097 that we need to insert barriers to avoid scheduling loads that read
10098 from a deallocated stack, and we optimize the unwind records by
10099 emitting them all together if possible. */
10100 void
10101 aarch64_expand_epilogue (rtx_call_insn *sibcall)
10103 aarch64_frame &frame = cfun->machine->frame;
10104 poly_int64 initial_adjust = frame.initial_adjust;
10105 HOST_WIDE_INT callee_adjust = frame.callee_adjust;
10106 poly_int64 final_adjust = frame.final_adjust;
10107 poly_int64 sve_callee_adjust = frame.sve_callee_adjust;
10108 poly_int64 bytes_below_hard_fp = frame.bytes_below_hard_fp;
10109 unsigned reg1 = frame.wb_pop_candidate1;
10110 unsigned reg2 = frame.wb_pop_candidate2;
10111 rtx cfi_ops = NULL;
10112 rtx_insn *insn;
10113 /* A stack clash protection prologue may not have left EP0_REGNUM or
10114 EP1_REGNUM in a usable state. The same is true for allocations
10115 with an SVE component, since we then need both temporary registers
10116 for each allocation. For stack clash we are in a usable state if
10117 the adjustment is less than GUARD_SIZE - GUARD_USED_BY_CALLER. */
10118 HOST_WIDE_INT guard_size
10119 = 1 << param_stack_clash_protection_guard_size;
10120 HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD;
10121 aarch64_isa_mode force_isa_mode = 0;
10122 if (aarch64_cfun_enables_pstate_sm ())
10123 force_isa_mode = AARCH64_ISA_MODE_SM_ON;
10125 /* We can re-use the registers when:
10127 (a) the deallocation amount is the same as the corresponding
10128 allocation amount (which is false if we combine the initial
10129 and SVE callee save allocations in the prologue); and
10131 (b) the allocation amount doesn't need a probe (which is false
10132 if the amount is guard_size - guard_used_by_caller or greater).
10134 In such situations the register should remain live with the correct
10135 value. */
10136 bool can_inherit_p = (initial_adjust.is_constant ()
10137 && final_adjust.is_constant ()
10138 && (!flag_stack_clash_protection
10139 || (known_lt (initial_adjust,
10140 guard_size - guard_used_by_caller)
10141 && known_eq (sve_callee_adjust, 0))));
10143 /* We need to add a memory barrier to prevent reads from the deallocated stack.  */
10144 bool need_barrier_p
10145 = maybe_ne (get_frame_size ()
10146 + frame.saved_varargs_size, 0);
10148 /* Reset PSTATE.SM, if required. */
10149 if (aarch64_cfun_enables_pstate_sm ())
10151 rtx_insn *guard_label = nullptr;
10152 if (known_ge (cfun->machine->frame.old_svcr_offset, 0))
10153 guard_label = aarch64_guard_switch_pstate_sm (IP0_REGNUM,
10154 AARCH64_ISA_MODE);
10155 aarch64_sme_mode_switch_regs return_switch;
10156 if (sibcall)
10157 return_switch.add_call_args (sibcall);
10158 else if (crtl->return_rtx && REG_P (crtl->return_rtx))
10159 return_switch.add_reg (GET_MODE (crtl->return_rtx),
10160 REGNO (crtl->return_rtx));
10161 return_switch.emit_prologue ();
10162 emit_insn (gen_aarch64_smstop_sm ());
10163 return_switch.emit_epilogue ();
10164 if (guard_label)
10165 emit_label (guard_label);
10168 /* Emit a barrier to prevent loads from a deallocated stack. */
10169 if (maybe_gt (final_adjust, crtl->outgoing_args_size)
10170 || cfun->calls_alloca
10171 || crtl->calls_eh_return)
10173 aarch64_emit_stack_tie (stack_pointer_rtx);
10174 need_barrier_p = false;
10177 /* Restore the stack pointer from the frame pointer if it may not
10178 be the same as the stack pointer. */
10179 rtx tmp0_rtx = gen_rtx_REG (Pmode, EP0_REGNUM);
10180 rtx tmp1_rtx = gen_rtx_REG (Pmode, EP1_REGNUM);
10181 if (frame_pointer_needed
10182 && (maybe_ne (final_adjust, 0) || cfun->calls_alloca))
10183 /* If writeback is used when restoring callee-saves, the CFA
10184 is restored on the instruction doing the writeback. */
10185 aarch64_add_offset (Pmode, stack_pointer_rtx,
10186 hard_frame_pointer_rtx,
10187 -bytes_below_hard_fp + final_adjust,
10188 tmp1_rtx, tmp0_rtx, force_isa_mode,
10189 callee_adjust == 0);
10190 else
10191 /* The case where we need to re-use the register here is very rare, so
10192 avoid the complicated condition and just always emit a move if the
10193 immediate doesn't fit. */
10194 aarch64_add_sp (tmp1_rtx, tmp0_rtx, final_adjust, force_isa_mode, true);
10196 /* Restore the vector registers before the predicate registers,
10197 so that we can use P4 as a temporary for big-endian SVE frames. */
10198 aarch64_restore_callee_saves (final_adjust, frame.saved_fprs, &cfi_ops);
10199 aarch64_restore_callee_saves (final_adjust, frame.saved_prs, &cfi_ops);
10200 if (maybe_ne (sve_callee_adjust, 0))
10201 aarch64_add_sp (NULL_RTX, NULL_RTX, sve_callee_adjust,
10202 force_isa_mode, true);
10204 /* When the shadow call stack is enabled, the scs_pop in the epilogue
10205 restores x30, so we don't need to restore x30 again in the
10206 traditional way.  */
10207 aarch64_restore_callee_saves (final_adjust + sve_callee_adjust,
10208 frame.saved_gprs, &cfi_ops);
10210 if (need_barrier_p)
10211 aarch64_emit_stack_tie (stack_pointer_rtx);
10213 if (callee_adjust != 0)
10214 aarch64_pop_regs (reg1, reg2, callee_adjust, &cfi_ops);
10216 /* If we have no register restore information, the CFA must have been
10217 defined in terms of the stack pointer since the end of the prologue. */
10218 gcc_assert (cfi_ops || !frame_pointer_needed);
10220 if (cfi_ops && (callee_adjust != 0 || maybe_gt (initial_adjust, 65536)))
10222 /* Emit delayed restores and set the CFA to be SP + initial_adjust. */
10223 insn = get_last_insn ();
10224 rtx new_cfa = plus_constant (Pmode, stack_pointer_rtx, initial_adjust);
10225 REG_NOTES (insn) = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops);
10226 RTX_FRAME_RELATED_P (insn) = 1;
10227 cfi_ops = NULL;
10230 /* The liveness of EP0_REGNUM cannot be trusted across function calls either,
10231 so restrict the emit_move optimization to leaf functions.  */
10232 aarch64_add_sp (tmp0_rtx, tmp1_rtx, initial_adjust, force_isa_mode,
10233 (!can_inherit_p || !crtl->is_leaf
10234 || df_regs_ever_live_p (EP0_REGNUM)));
10236 if (cfi_ops)
10238 /* Emit delayed restores and reset the CFA to be SP. */
10239 insn = get_last_insn ();
10240 cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, stack_pointer_rtx, cfi_ops);
10241 REG_NOTES (insn) = cfi_ops;
10242 RTX_FRAME_RELATED_P (insn) = 1;
10245 /* Pop return address from shadow call stack. */
10246 if (frame.is_scs_enabled)
10248 machine_mode mode = aarch64_reg_save_mode (R30_REGNUM);
10249 rtx reg = gen_rtx_REG (mode, R30_REGNUM);
10251 insn = emit_insn (gen_scs_pop ());
10252 add_reg_note (insn, REG_CFA_RESTORE, reg);
10253 RTX_FRAME_RELATED_P (insn) = 1;
10256 /* Stack adjustment for exception handler. */
10257 if (crtl->calls_eh_return && !sibcall)
10259 /* If the EH_RETURN_TAKEN_RTX flag is set then we need
10260 to unwind the stack and jump to the handler; otherwise we
10261 skip this eh_return logic and continue with normal
10262 return after the label. We have already reset the CFA
10263 to be SP; letting the CFA move during this adjustment
10264 is just as correct as retaining the CFA from the body
10265 of the function. Therefore, do nothing special. */
10266 rtx_code_label *label = gen_label_rtx ();
10267 rtx x = aarch64_gen_compare_zero_and_branch (EQ, EH_RETURN_TAKEN_RTX,
10268 label);
10269 rtx jump = emit_jump_insn (x);
10270 JUMP_LABEL (jump) = label;
10271 LABEL_NUSES (label)++;
10272 emit_insn (gen_add2_insn (stack_pointer_rtx,
10273 EH_RETURN_STACKADJ_RTX));
10274 emit_jump_insn (gen_indirect_jump (EH_RETURN_HANDLER_RTX));
10275 emit_barrier ();
10276 emit_label (label);
10279 /* We prefer to emit the combined return/authenticate instruction RETAA,
10280 however there are two cases in which we must instead emit an explicit
10281 authentication instruction.
10283 1) Sibcalls don't return in a normal way, so if we're about to call one
10284 we must authenticate.
10286 2) The RETAA instruction is not available without FEAT_PAuth, so if we
10287 are generating code for !TARGET_PAUTH we can't use it and must
10288 explicitly authenticate.  */
10290 if (aarch64_return_address_signing_enabled ()
10291 && (sibcall || !TARGET_PAUTH))
10293 switch (aarch64_ra_sign_key)
10295 case AARCH64_KEY_A:
10296 insn = emit_insn (gen_autiasp ());
10297 break;
10298 case AARCH64_KEY_B:
10299 insn = emit_insn (gen_autibsp ());
10300 break;
10301 default:
10302 gcc_unreachable ();
10304 add_reg_note (insn, REG_CFA_NEGATE_RA_STATE, const0_rtx);
10305 RTX_FRAME_RELATED_P (insn) = 1;
10308 emit_use (gen_rtx_REG (DImode, LR_REGNUM));
10309 if (!sibcall)
10310 emit_jump_insn (ret_rtx);
10313 /* Output code to add DELTA to the first argument, and then jump
10314 to FUNCTION. Used for C++ multiple inheritance. */
10315 static void
10316 aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
10317 HOST_WIDE_INT delta,
10318 HOST_WIDE_INT vcall_offset,
10319 tree function)
10321 /* The this pointer is always in x0. Note that this differs from
10322 Arm where the this pointer may be bumped to r1 if r0 is required
10323 to return a pointer to an aggregate. On AArch64 a result value
10324 pointer will be in x8. */
10325 int this_regno = R0_REGNUM;
10326 rtx this_rtx, temp0, temp1, addr, funexp;
10327 rtx_insn *insn;
10328 const char *fnname = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (thunk));
10330 if (aarch_bti_enabled ())
10331 emit_insn (gen_bti_c());
10333 reload_completed = 1;
10334 emit_note (NOTE_INSN_PROLOGUE_END);
10336 this_rtx = gen_rtx_REG (Pmode, this_regno);
10337 temp0 = gen_rtx_REG (Pmode, EP0_REGNUM);
10338 temp1 = gen_rtx_REG (Pmode, EP1_REGNUM);
10340 if (vcall_offset == 0)
10341 aarch64_add_offset (Pmode, this_rtx, this_rtx, delta, temp1, temp0,
10342 0, false);
10343 else
10345 gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);
10347 addr = this_rtx;
10348 if (delta != 0)
10350 if (delta >= -256 && delta < 256)
10351 addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx,
10352 plus_constant (Pmode, this_rtx, delta));
10353 else
10354 aarch64_add_offset (Pmode, this_rtx, this_rtx, delta,
10355 temp1, temp0, 0, false);
10358 if (Pmode == ptr_mode)
10359 aarch64_emit_move (temp0, gen_rtx_MEM (ptr_mode, addr));
10360 else
10361 aarch64_emit_move (temp0,
10362 gen_rtx_ZERO_EXTEND (Pmode,
10363 gen_rtx_MEM (ptr_mode, addr)));
10365 if (vcall_offset >= -256 && vcall_offset < 4096 * POINTER_BYTES)
10366 addr = plus_constant (Pmode, temp0, vcall_offset);
10367 else
10369 aarch64_internal_mov_immediate (temp1, GEN_INT (vcall_offset), true,
10370 Pmode);
10371 addr = gen_rtx_PLUS (Pmode, temp0, temp1);
10374 if (Pmode == ptr_mode)
10375 aarch64_emit_move (temp1, gen_rtx_MEM (ptr_mode,addr));
10376 else
10377 aarch64_emit_move (temp1,
10378 gen_rtx_SIGN_EXTEND (Pmode,
10379 gen_rtx_MEM (ptr_mode, addr)));
10381 emit_insn (gen_add2_insn (this_rtx, temp1));
10384 /* Generate a tail call to the target function. */
10385 if (!TREE_USED (function))
10387 assemble_external (function);
10388 TREE_USED (function) = 1;
10390 funexp = XEXP (DECL_RTL (function), 0);
10391 funexp = gen_rtx_MEM (FUNCTION_MODE, funexp);
10392 auto isa_mode = aarch64_fntype_isa_mode (TREE_TYPE (function));
10393 auto pcs_variant = arm_pcs (fndecl_abi (function).id ());
10394 bool ir = lookup_attribute ("indirect_return",
10395 TYPE_ATTRIBUTES (TREE_TYPE (function)));
10396 rtx callee_abi = aarch64_gen_callee_cookie (isa_mode, pcs_variant, ir);
10397 insn = emit_call_insn (gen_sibcall (funexp, const0_rtx, callee_abi));
10398 SIBLING_CALL_P (insn) = 1;
10400 insn = get_insns ();
10401 shorten_branches (insn);
10403 assemble_start_function (thunk, fnname);
10404 final_start_function (insn, file, 1);
10405 final (insn, file, 1);
10406 final_end_function ();
10407 assemble_end_function (thunk, fnname);
10409 /* Stop pretending to be a post-reload pass. */
10410 reload_completed = 0;
10413 static bool
10414 aarch64_tls_referenced_p (rtx x)
10416 if (!TARGET_HAVE_TLS)
10417 return false;
10418 subrtx_iterator::array_type array;
10419 FOR_EACH_SUBRTX (iter, array, x, ALL)
10421 const_rtx x = *iter;
10422 if (SYMBOL_REF_P (x) && SYMBOL_REF_TLS_MODEL (x) != 0)
10423 return true;
10424 /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
10425 TLS offsets, not real symbol references. */
10426 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
10427 iter.skip_subrtxes ();
10429 return false;
10433 static bool
10434 aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
10436 if (GET_CODE (x) == HIGH)
10437 return true;
10439 /* There's no way to calculate VL-based values using relocations. */
10440 subrtx_iterator::array_type array;
10441 HOST_WIDE_INT factor;
10442 FOR_EACH_SUBRTX (iter, array, x, ALL)
10443 if (GET_CODE (*iter) == CONST_POLY_INT
10444 || aarch64_sme_vq_unspec_p (x, &factor))
10445 return true;
10447 poly_int64 offset;
10448 rtx base = strip_offset_and_salt (x, &offset);
10449 if (SYMBOL_REF_P (base) || LABEL_REF_P (base))
10451 /* We checked for POLY_INT_CST offsets above. */
10452 if (aarch64_classify_symbol (base, offset.to_constant ())
10453 != SYMBOL_FORCE_TO_MEM)
10454 return true;
10455 else
10456 /* Avoid generating a 64-bit relocation in ILP32; leave it
10457 to aarch64_expand_mov_immediate to handle properly. */
10458 return mode != ptr_mode;
10461 return aarch64_tls_referenced_p (x);
10464 /* Implement TARGET_CASE_VALUES_THRESHOLD.
10465 The expansion for a table switch is quite expensive due to the number
10466 of instructions, the table lookup and the hard-to-predict indirect jump.
10467 When optimizing for speed with -O3, use the per-core tuning if it is
10468 set; otherwise use tables for >= 11 cases as a trade-off between size and
10469 performance. When optimizing for size, use 8 for the smallest code size. */
10471 static unsigned int
10472 aarch64_case_values_threshold (void)
10474 /* Use the specified limit for the number of cases before using jump
10475 tables at higher optimization levels. */
10476 if (optimize > 2
10477 && aarch64_tune_params.max_case_values != 0)
10478 return aarch64_tune_params.max_case_values;
10479 else
10480 return optimize_size ? 8 : 11;
10483 /* Return true if register REGNO is a valid index register.
10484 STRICT_P is true if REG_OK_STRICT is in effect. */
10486 bool
10487 aarch64_regno_ok_for_index_p (int regno, bool strict_p)
10489 if (!HARD_REGISTER_NUM_P (regno))
10491 if (!strict_p)
10492 return true;
10494 if (!reg_renumber)
10495 return false;
10497 regno = reg_renumber[regno];
10499 return GP_REGNUM_P (regno);
10502 /* Return true if register REGNO is a valid base register for mode MODE.
10503 STRICT_P is true if REG_OK_STRICT is in effect. */
10505 bool
10506 aarch64_regno_ok_for_base_p (int regno, bool strict_p)
10508 if (!HARD_REGISTER_NUM_P (regno))
10510 if (!strict_p)
10511 return true;
10513 if (!reg_renumber)
10514 return false;
10516 regno = reg_renumber[regno];
10519 /* The fake registers will be eliminated to either the stack or
10520 hard frame pointer, both of which are usually valid base registers.
10521 Reload deals with the cases where the eliminated form isn't valid. */
10522 return (GP_REGNUM_P (regno)
10523 || regno == SP_REGNUM
10524 || regno == FRAME_POINTER_REGNUM
10525 || regno == ARG_POINTER_REGNUM);
10528 /* Return true if X is a valid base register for mode MODE.
10529 STRICT_P is true if REG_OK_STRICT is in effect. */
10531 static bool
10532 aarch64_base_register_rtx_p (rtx x, bool strict_p)
10534 if (!strict_p
10535 && SUBREG_P (x)
10536 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (x))])
10537 x = SUBREG_REG (x);
10539 return (REG_P (x) && aarch64_regno_ok_for_base_p (REGNO (x), strict_p));
10542 /* Return true if address offset is a valid index. If it is, fill in INFO
10543 appropriately. STRICT_P is true if REG_OK_STRICT is in effect. */
10545 static bool
10546 aarch64_classify_index (struct aarch64_address_info *info, rtx x,
10547 machine_mode mode, bool strict_p)
10549 enum aarch64_address_type type;
10550 rtx index;
10551 int shift;
10553 /* (reg:P) */
10554 if ((REG_P (x) || SUBREG_P (x))
10555 && GET_MODE (x) == Pmode)
10557 type = ADDRESS_REG_REG;
10558 index = x;
10559 shift = 0;
10561 /* (sign_extend:DI (reg:SI)) */
10562 else if ((GET_CODE (x) == SIGN_EXTEND
10563 || GET_CODE (x) == ZERO_EXTEND)
10564 && GET_MODE (x) == DImode
10565 && GET_MODE (XEXP (x, 0)) == SImode)
10567 type = (GET_CODE (x) == SIGN_EXTEND)
10568 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
10569 index = XEXP (x, 0);
10570 shift = 0;
10572 /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
10573 else if (GET_CODE (x) == MULT
10574 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
10575 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
10576 && GET_MODE (XEXP (x, 0)) == DImode
10577 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
10578 && CONST_INT_P (XEXP (x, 1)))
10580 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
10581 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
10582 index = XEXP (XEXP (x, 0), 0);
10583 shift = exact_log2 (INTVAL (XEXP (x, 1)));
10585 /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
10586 else if (GET_CODE (x) == ASHIFT
10587 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
10588 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
10589 && GET_MODE (XEXP (x, 0)) == DImode
10590 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
10591 && CONST_INT_P (XEXP (x, 1)))
10593 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
10594 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
10595 index = XEXP (XEXP (x, 0), 0);
10596 shift = INTVAL (XEXP (x, 1));
10598 /* (and:DI (mult:DI (reg:DI) (const_int scale))
10599 (const_int 0xffffffff<<shift)) */
10600 else if (GET_CODE (x) == AND
10601 && GET_MODE (x) == DImode
10602 && GET_CODE (XEXP (x, 0)) == MULT
10603 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
10604 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
10605 && CONST_INT_P (XEXP (x, 1)))
10607 type = ADDRESS_REG_UXTW;
10608 index = XEXP (XEXP (x, 0), 0);
10609 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
10610 /* Avoid undefined code dealing with shift being -1. */
10611 if (shift != -1
10612 && INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
10613 shift = -1;
10615 /* (and:DI (ashift:DI (reg:DI) (const_int shift))
10616 (const_int 0xffffffff<<shift)) */
10617 else if (GET_CODE (x) == AND
10618 && GET_MODE (x) == DImode
10619 && GET_CODE (XEXP (x, 0)) == ASHIFT
10620 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
10621 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
10622 && CONST_INT_P (XEXP (x, 1)))
10624 type = ADDRESS_REG_UXTW;
10625 index = XEXP (XEXP (x, 0), 0);
10626 shift = INTVAL (XEXP (XEXP (x, 0), 1));
10627 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
10628 shift = -1;
10630 /* (mult:P (reg:P) (const_int scale)) */
10631 else if (GET_CODE (x) == MULT
10632 && GET_MODE (x) == Pmode
10633 && GET_MODE (XEXP (x, 0)) == Pmode
10634 && CONST_INT_P (XEXP (x, 1)))
10636 type = ADDRESS_REG_REG;
10637 index = XEXP (x, 0);
10638 shift = exact_log2 (INTVAL (XEXP (x, 1)));
10640 /* (ashift:P (reg:P) (const_int shift)) */
10641 else if (GET_CODE (x) == ASHIFT
10642 && GET_MODE (x) == Pmode
10643 && GET_MODE (XEXP (x, 0)) == Pmode
10644 && CONST_INT_P (XEXP (x, 1)))
10646 type = ADDRESS_REG_REG;
10647 index = XEXP (x, 0);
10648 shift = INTVAL (XEXP (x, 1));
10650 else
10651 return false;
10653 if (!strict_p
10654 && SUBREG_P (index)
10655 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (index))])
10656 index = SUBREG_REG (index);
10658 auto vec_flags = aarch64_classify_vector_memory_mode (mode);
10659 if (vec_flags & VEC_SVE_DATA)
10661 if (type != ADDRESS_REG_REG
10662 || (1 << shift) != GET_MODE_UNIT_SIZE (mode))
10663 return false;
10665 else
10667 if (shift != 0
10668 && !(IN_RANGE (shift, 1, 3)
10669 && known_eq (1 << shift, GET_MODE_SIZE (mode))))
10670 return false;
10673 if (REG_P (index)
10674 && aarch64_regno_ok_for_index_p (REGNO (index), strict_p))
10676 info->type = type;
10677 info->offset = index;
10678 info->shift = shift;
10679 return true;
10682 return false;
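/* For example, for an SImode access the index expression
	(mult:DI (sign_extend:DI (reg:SI <Rm>)) (const_int 4))
   is classified above as ADDRESS_REG_SXTW with shift 2, which corresponds
   to an address of the form [<Rn>, <Rm>, sxtw #2] once it is combined with
   a base register by aarch64_classify_address.  */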
10685 /* Return true if MODE is one of the modes for which we
10686 support LDP/STP operations. */
10688 static bool
10689 aarch64_mode_valid_for_sched_fusion_p (machine_mode mode)
10691 return mode == SImode || mode == DImode
10692 || mode == SFmode || mode == DFmode
10693 || mode == SDmode || mode == DDmode
10694 || (aarch64_vector_mode_supported_p (mode)
10695 && (known_eq (GET_MODE_SIZE (mode), 8)
10696 || known_eq (GET_MODE_SIZE (mode), 16)));
10699 /* Return true if REGNO is a virtual pointer register, or an eliminable
10700 "soft" frame register. Like REGNO_PTR_FRAME_P except that we don't
10701 include stack_pointer or hard_frame_pointer. */
10702 static bool
10703 virt_or_elim_regno_p (unsigned regno)
10705 return ((regno >= FIRST_VIRTUAL_REGISTER
10706 && regno <= LAST_VIRTUAL_POINTER_REGISTER)
10707 || regno == FRAME_POINTER_REGNUM
10708 || regno == ARG_POINTER_REGNUM);
10711 /* Return true if X is a valid address of type TYPE for machine mode MODE.
10712 If it is, fill in INFO appropriately. STRICT_P is true if
10713 REG_OK_STRICT is in effect. */
10715 bool
10716 aarch64_classify_address (struct aarch64_address_info *info,
10717 rtx x, machine_mode mode, bool strict_p,
10718 aarch64_addr_query_type type)
10720 enum rtx_code code = GET_CODE (x);
10721 rtx op0, op1;
10722 poly_int64 offset;
10724 HOST_WIDE_INT const_size;
10726 /* Whether a vector mode is partial doesn't affect address legitimacy.
10727 Partial vectors like VNx8QImode allow the same indexed addressing
10728 mode and MUL VL addressing mode as full vectors like VNx16QImode;
10729 in both cases, MUL VL counts multiples of GET_MODE_SIZE. */
10730 unsigned int vec_flags = aarch64_classify_vector_memory_mode (mode);
10731 vec_flags &= ~VEC_PARTIAL;
10733 /* We use load/store pair for all large int mode load/stores.
10734 TI/TF/TDmode may also use a load/store pair. */
10735 bool advsimd_struct_p = (vec_flags == (VEC_ADVSIMD | VEC_STRUCT));
10736 bool load_store_pair_p = (type == ADDR_QUERY_LDP_STP
10737 || type == ADDR_QUERY_LDP_STP_N
10738 || mode == TImode
10739 || mode == TFmode
10740 || mode == TDmode
10741 || advsimd_struct_p);
10742 /* For ADDR_QUERY_LDP_STP_N, the incoming mode corresponds to the actual
10743 size of the memory being loaded/stored, and the mode used for the
10744 addressing-mode check is half of that size. */
10745 if (type == ADDR_QUERY_LDP_STP_N)
10747 if (known_eq (GET_MODE_SIZE (mode), 32))
10748 mode = V16QImode;
10749 else if (known_eq (GET_MODE_SIZE (mode), 16))
10750 mode = DFmode;
10751 else if (known_eq (GET_MODE_SIZE (mode), 8))
10752 mode = SFmode;
10753 else
10754 return false;
10756 /* This isn't really an Advanced SIMD struct mode, but a mode
10757 used to represent the complete mem in a load/store pair. */
10758 advsimd_struct_p = false;
10761 bool allow_reg_index_p = (!load_store_pair_p
10762 && ((vec_flags == 0
10763 && known_lt (GET_MODE_SIZE (mode), 16))
10764 || vec_flags == VEC_ADVSIMD
10765 || vec_flags & VEC_SVE_DATA));
10767 /* For SVE, only accept [Rn], [Rn, #offset, MUL VL] and [Rn, Rm, LSL #shift].
10768 The latter is not valid for SVE predicates, and that's rejected through
10769 allow_reg_index_p above. */
10770 if ((vec_flags & (VEC_SVE_DATA | VEC_SVE_PRED)) != 0
10771 && (code != REG && code != PLUS))
10772 return false;
10774 gcc_checking_assert (GET_MODE (x) == VOIDmode
10775 || SCALAR_INT_MODE_P (GET_MODE (x)));
10777 switch (code)
10779 case REG:
10780 case SUBREG:
10781 info->type = ADDRESS_REG_IMM;
10782 info->base = x;
10783 info->offset = const0_rtx;
10784 info->const_offset = 0;
10785 return aarch64_base_register_rtx_p (x, strict_p);
10787 case PLUS:
10788 op0 = XEXP (x, 0);
10789 op1 = XEXP (x, 1);
10791 if (! strict_p
10792 && REG_P (op0)
10793 && virt_or_elim_regno_p (REGNO (op0))
10794 && poly_int_rtx_p (op1, &offset))
10796 info->type = ADDRESS_REG_IMM;
10797 info->base = op0;
10798 info->offset = op1;
10799 info->const_offset = offset;
10801 return true;
10804 if (maybe_ne (GET_MODE_SIZE (mode), 0)
10805 && aarch64_base_register_rtx_p (op0, strict_p)
10806 && poly_int_rtx_p (op1, &offset))
10808 info->type = ADDRESS_REG_IMM;
10809 info->base = op0;
10810 info->offset = op1;
10811 info->const_offset = offset;
10813 /* TImode, TFmode and TDmode values are allowed in both pairs of X
10814 registers and individual Q registers. The available
10815 address modes are:
10816 X,X: 7-bit signed scaled offset
10817 Q: 9-bit signed offset
10818 We conservatively require an offset representable in either mode.
10819 When performing the check for pairs of X registers i.e. LDP/STP
10820 pass down DImode since that is the natural size of the LDP/STP
10821 instruction memory accesses. */
10822 if (mode == TImode || mode == TFmode || mode == TDmode)
10823 return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
10824 && (aarch64_offset_9bit_signed_unscaled_p (mode, offset)
10825 || offset_12bit_unsigned_scaled_p (mode, offset)));
10827 if (mode == V8DImode)
10828 return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
10829 && aarch64_offset_7bit_signed_scaled_p (DImode, offset + 48));
10831 /* A 7-bit offset check because OImode will emit an ldp/stp
10832 instruction (only !TARGET_SIMD or big-endian will get here).
10833 For ldp/stp instructions, the offset is scaled by the size of a
10834 single element of the pair. */
10835 if (aarch64_advsimd_partial_struct_mode_p (mode)
10836 && known_eq (GET_MODE_SIZE (mode), 16))
10837 return aarch64_offset_7bit_signed_scaled_p (DImode, offset);
10838 if (aarch64_advsimd_full_struct_mode_p (mode)
10839 && known_eq (GET_MODE_SIZE (mode), 32))
10840 return aarch64_offset_7bit_signed_scaled_p (TImode, offset);
10842 /* Three 9/12-bit offset checks because CImode will emit three
10843 ldr/str instructions (only !TARGET_SIMD or big-endian will
10844 get here). */
10845 if (aarch64_advsimd_partial_struct_mode_p (mode)
10846 && known_eq (GET_MODE_SIZE (mode), 24))
10847 return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
10848 && (aarch64_offset_9bit_signed_unscaled_p (DImode,
10849 offset + 16)
10850 || offset_12bit_unsigned_scaled_p (DImode,
10851 offset + 16)));
10852 if (aarch64_advsimd_full_struct_mode_p (mode)
10853 && known_eq (GET_MODE_SIZE (mode), 48))
10854 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
10855 && (aarch64_offset_9bit_signed_unscaled_p (TImode,
10856 offset + 32)
10857 || offset_12bit_unsigned_scaled_p (TImode,
10858 offset + 32)));
10860 /* Two 7-bit offset checks because XImode will emit two ldp/stp
10861 instructions (only big-endian will get here). */
10862 if (aarch64_advsimd_partial_struct_mode_p (mode)
10863 && known_eq (GET_MODE_SIZE (mode), 32))
10864 return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
10865 && aarch64_offset_7bit_signed_scaled_p (DImode,
10866 offset + 16));
10867 if (aarch64_advsimd_full_struct_mode_p (mode)
10868 && known_eq (GET_MODE_SIZE (mode), 64))
10869 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
10870 && aarch64_offset_7bit_signed_scaled_p (TImode,
10871 offset + 32));
10873 /* Make "m" use the LD1 offset range for SVE data modes, so
10874 that pre-RTL optimizers like ivopts will work to that range
10875 instead of the wider LDR/STR range. */
10876 if (vec_flags == VEC_SVE_DATA)
10877 return (type == ADDR_QUERY_M
10878 ? offset_4bit_signed_scaled_p (mode, offset)
10879 : offset_9bit_signed_scaled_p (mode, offset));
10881 if (vec_flags == (VEC_SVE_DATA | VEC_STRUCT))
10883 poly_int64 end_offset = (offset
10884 + GET_MODE_SIZE (mode)
10885 - BYTES_PER_SVE_VECTOR);
10886 return (type == ADDR_QUERY_M
10887 ? offset_4bit_signed_scaled_p (mode, offset)
10888 : (offset_9bit_signed_scaled_p (SVE_BYTE_MODE, offset)
10889 && offset_9bit_signed_scaled_p (SVE_BYTE_MODE,
10890 end_offset)));
10893 if (vec_flags == VEC_SVE_PRED)
10894 return offset_9bit_signed_scaled_p (mode, offset);
10896 if (vec_flags == (VEC_SVE_PRED | VEC_STRUCT))
10898 poly_int64 end_offset = (offset
10899 + GET_MODE_SIZE (mode)
10900 - BYTES_PER_SVE_PRED);
10901 return (offset_9bit_signed_scaled_p (VNx16BImode, end_offset)
10902 && offset_9bit_signed_scaled_p (VNx16BImode, offset));
10905 if (load_store_pair_p)
10906 return ((known_eq (GET_MODE_SIZE (mode), 4)
10907 || known_eq (GET_MODE_SIZE (mode), 8)
10908 || known_eq (GET_MODE_SIZE (mode), 16))
10909 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
10910 else
10911 return (aarch64_offset_9bit_signed_unscaled_p (mode, offset)
10912 || offset_12bit_unsigned_scaled_p (mode, offset));
10915 if (allow_reg_index_p)
10917 /* Look for base + (scaled/extended) index register. */
10918 if (aarch64_base_register_rtx_p (op0, strict_p)
10919 && aarch64_classify_index (info, op1, mode, strict_p))
10921 info->base = op0;
10922 return true;
10924 if (aarch64_base_register_rtx_p (op1, strict_p)
10925 && aarch64_classify_index (info, op0, mode, strict_p))
10927 info->base = op1;
10928 return true;
10932 return false;
10934 case POST_INC:
10935 case POST_DEC:
10936 case PRE_INC:
10937 case PRE_DEC:
10938 info->type = ADDRESS_REG_WB;
10939 info->base = XEXP (x, 0);
10940 info->offset = NULL_RTX;
10941 return aarch64_base_register_rtx_p (info->base, strict_p);
10943 case POST_MODIFY:
10944 case PRE_MODIFY:
10945 info->type = ADDRESS_REG_WB;
10946 info->base = XEXP (x, 0);
10947 if (GET_CODE (XEXP (x, 1)) == PLUS
10948 && poly_int_rtx_p (XEXP (XEXP (x, 1), 1), &offset)
10949 && rtx_equal_p (XEXP (XEXP (x, 1), 0), info->base)
10950 && aarch64_base_register_rtx_p (info->base, strict_p))
10952 info->offset = XEXP (XEXP (x, 1), 1);
10953 info->const_offset = offset;
10955 /* TImode, TFmode and TDmode values are allowed in both pairs of X
10956 registers and individual Q registers. The available
10957 address modes are:
10958 X,X: 7-bit signed scaled offset
10959 Q: 9-bit signed offset
10960 We conservatively require an offset representable in either mode.
10962 if (mode == TImode || mode == TFmode || mode == TDmode)
10963 return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
10964 && aarch64_offset_9bit_signed_unscaled_p (mode, offset));
10966 if (load_store_pair_p)
10967 return ((known_eq (GET_MODE_SIZE (mode), 4)
10968 || known_eq (GET_MODE_SIZE (mode), 8)
10969 || known_eq (GET_MODE_SIZE (mode), 16))
10970 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
10971 else
10972 return aarch64_offset_9bit_signed_unscaled_p (mode, offset);
10974 return false;
10976 case CONST:
10977 case SYMBOL_REF:
10978 case LABEL_REF:
10979 /* load literal: pc-relative constant pool entry. Only supported
10980 for SI mode or larger. */
10981 info->type = ADDRESS_SYMBOLIC;
10983 if (!load_store_pair_p
10984 && GET_MODE_SIZE (mode).is_constant (&const_size)
10985 && const_size >= 4)
10987 poly_int64 offset;
10988 rtx sym = strip_offset_and_salt (x, &offset);
10989 return ((LABEL_REF_P (sym)
10990 || (SYMBOL_REF_P (sym)
10991 && CONSTANT_POOL_ADDRESS_P (sym)
10992 && aarch64_pcrelative_literal_loads)));
10994 return false;
10996 case LO_SUM:
10997 info->type = ADDRESS_LO_SUM;
10998 info->base = XEXP (x, 0);
10999 info->offset = XEXP (x, 1);
11000 if (allow_reg_index_p
11001 && aarch64_base_register_rtx_p (info->base, strict_p))
11003 poly_int64 offset;
11004 HOST_WIDE_INT const_offset;
11005 rtx sym = strip_offset_and_salt (info->offset, &offset);
11006 if (SYMBOL_REF_P (sym)
11007 && offset.is_constant (&const_offset)
11008 && (aarch64_classify_symbol (sym, const_offset)
11009 == SYMBOL_SMALL_ABSOLUTE))
11011 /* The symbol and offset must be aligned to the access size. */
11012 unsigned int align;
11014 if (CONSTANT_POOL_ADDRESS_P (sym))
11015 align = GET_MODE_ALIGNMENT (get_pool_mode (sym));
11016 else if (TREE_CONSTANT_POOL_ADDRESS_P (sym))
11018 tree exp = SYMBOL_REF_DECL (sym);
11019 align = TYPE_ALIGN (TREE_TYPE (exp));
11020 align = aarch64_constant_alignment (exp, align);
11022 else if (SYMBOL_REF_DECL (sym))
11023 align = DECL_ALIGN (SYMBOL_REF_DECL (sym));
11024 else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym)
11025 && SYMBOL_REF_BLOCK (sym) != NULL)
11026 align = SYMBOL_REF_BLOCK (sym)->alignment;
11027 else
11028 align = BITS_PER_UNIT;
11030 poly_int64 ref_size = GET_MODE_SIZE (mode);
11031 if (known_eq (ref_size, 0))
11032 ref_size = GET_MODE_SIZE (DImode);
11034 return (multiple_p (const_offset, ref_size)
11035 && multiple_p (align / BITS_PER_UNIT, ref_size));
11038 return false;
11040 default:
11041 return false;
11045 /* Return true if the address X is valid for a PRFM instruction.
11046 STRICT_P is true if we should do strict checking with
11047 aarch64_classify_address. */
11049 bool
11050 aarch64_address_valid_for_prefetch_p (rtx x, bool strict_p)
11052 struct aarch64_address_info addr;
11054 /* PRFM accepts the same addresses as DImode... */
11055 bool res = aarch64_classify_address (&addr, x, DImode, strict_p);
11056 if (!res)
11057 return false;
11059 /* ... except writeback forms. */
11060 return addr.type != ADDRESS_REG_WB;
11063 bool
11064 aarch64_symbolic_address_p (rtx x)
11066 poly_int64 offset;
11067 x = strip_offset_and_salt (x, &offset);
11068 return SYMBOL_REF_P (x) || LABEL_REF_P (x);
11071 /* Classify the base of symbolic expression X. */
11073 enum aarch64_symbol_type
11074 aarch64_classify_symbolic_expression (rtx x)
11076 rtx offset;
11078 split_const (x, &x, &offset);
11079 return aarch64_classify_symbol (x, INTVAL (offset));
11083 /* Return TRUE if X is a legitimate address for accessing memory in
11084 mode MODE. */
11085 static bool
11086 aarch64_legitimate_address_hook_p (machine_mode mode, rtx x, bool strict_p,
11087 code_helper = ERROR_MARK)
11089 struct aarch64_address_info addr;
11091 return aarch64_classify_address (&addr, x, mode, strict_p);
11094 /* Return TRUE if X is a legitimate address of type TYPE for accessing
11095 memory in mode MODE. STRICT_P is true if REG_OK_STRICT is in effect. */
11096 bool
11097 aarch64_legitimate_address_p (machine_mode mode, rtx x, bool strict_p,
11098 aarch64_addr_query_type type)
11100 struct aarch64_address_info addr;
11102 return aarch64_classify_address (&addr, x, mode, strict_p, type);
11105 /* Implement TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT. */
11107 static bool
11108 aarch64_legitimize_address_displacement (rtx *offset1, rtx *offset2,
11109 poly_int64 orig_offset,
11110 machine_mode mode)
11112 HOST_WIDE_INT size;
11113 if (GET_MODE_SIZE (mode).is_constant (&size))
11115 HOST_WIDE_INT const_offset, second_offset;
11117 /* A general SVE offset is A * VQ + B. Remove the A component from
11118 coefficient 0 in order to get the constant B. */
11119 const_offset = orig_offset.coeffs[0] - orig_offset.coeffs[1];
11121 /* Split an out-of-range address displacement into a base and an
11122 offset. Use a 4KB range for 1- and 2-byte accesses and a 16KB
11123 range otherwise, to increase opportunities for sharing the base
11124 address between accesses of different sizes. Unaligned accesses
11125 use the signed 9-bit range; TImode/TFmode/TDmode use the intersection
11126 of the signed scaled 7-bit and signed 9-bit offset ranges. */
11127 if (mode == TImode || mode == TFmode || mode == TDmode)
11128 second_offset = ((const_offset + 0x100) & 0x1f8) - 0x100;
11129 else if ((const_offset & (size - 1)) != 0)
11130 second_offset = ((const_offset + 0x100) & 0x1ff) - 0x100;
11131 else
11132 second_offset = const_offset & (size < 4 ? 0xfff : 0x3ffc);
11134 if (second_offset == 0 || known_eq (orig_offset, second_offset))
11135 return false;
11137 /* Split the offset into second_offset and the rest. */
11138 *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
11139 *offset2 = gen_int_mode (second_offset, Pmode);
11140 return true;
11142 else
11144 /* Get the mode we should use as the basis of the range. For structure
11145 modes this is the mode of one vector. */
11146 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
11147 machine_mode step_mode
11148 = (vec_flags & VEC_STRUCT) != 0 ? SVE_BYTE_MODE : mode;
11150 /* Get the "mul vl" multiplier we'd like to use. */
11151 HOST_WIDE_INT factor = GET_MODE_SIZE (step_mode).coeffs[1];
11152 HOST_WIDE_INT vnum = orig_offset.coeffs[1] / factor;
11153 if (vec_flags & VEC_SVE_DATA)
11154 /* LDR supports a 9-bit range, but the move patterns for
11155 structure modes require all vectors to be in range of the
11156 same base. The simplest way of accommodating that while still
11157 promoting reuse of anchor points between different modes is
11158 to use an 8-bit range unconditionally. */
11159 vnum = ((vnum + 128) & 255) - 128;
11160 else
11161 /* Predicates are only handled singly, so we might as well use
11162 the full range. */
11163 vnum = ((vnum + 256) & 511) - 256;
11164 if (vnum == 0)
11165 return false;
11167 /* Convert the "mul vl" multiplier into a byte offset. */
11168 poly_int64 second_offset = GET_MODE_SIZE (step_mode) * vnum;
11169 if (known_eq (second_offset, orig_offset))
11170 return false;
11172 /* Split the offset into second_offset and the rest. */
11173 *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
11174 *offset2 = gen_int_mode (second_offset, Pmode);
11175 return true;
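/* A worked example of the constant-offset split above: for an SImode access
   at a word-aligned displacement of 0x12344, second_offset is
   0x12344 & 0x3ffc = 0x2344, so the displacement is split into an anchor
   offset of 0x10000 (offset1) plus 0x2344 (offset2); the latter fits the
   unsigned scaled 12-bit immediate range and the anchor can be shared with
   neighbouring accesses.  */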
11179 /* Return the binary representation of floating point constant VALUE in INTVAL.
11180 If the value cannot be converted, return false without setting INTVAL.
11181 The conversion is done in the floating-point mode of VALUE. */
11182 bool
11183 aarch64_reinterpret_float_as_int (rtx value, unsigned HOST_WIDE_INT *intval)
11186 /* We make a general exception for 0. */
11187 if (aarch64_float_const_zero_rtx_p (value))
11189 *intval = 0;
11190 return true;
11193 scalar_float_mode mode;
11194 if (!CONST_DOUBLE_P (value)
11195 || !is_a <scalar_float_mode> (GET_MODE (value), &mode)
11196 || GET_MODE_BITSIZE (mode) > HOST_BITS_PER_WIDE_INT
11197 /* Only support up to DF mode. */
11198 || GET_MODE_BITSIZE (mode) > GET_MODE_BITSIZE (DFmode))
11199 return false;
11201 unsigned HOST_WIDE_INT ival = 0;
11203 long res[2];
11204 real_to_target (res,
11205 CONST_DOUBLE_REAL_VALUE (value),
11206 REAL_MODE_FORMAT (mode));
11208 if (mode == DFmode || mode == DDmode)
11210 int order = BYTES_BIG_ENDIAN ? 1 : 0;
11211 ival = zext_hwi (res[order], 32);
11212 ival |= (zext_hwi (res[1 - order], 32) << 32);
11214 else
11215 ival = zext_hwi (res[0], 32);
11217 *intval = ival;
11218 return true;
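/* For example, reinterpreting the DFmode constant 1.0 through this function
   yields the IEEE double bit pattern 0x3ff0000000000000, and the SFmode
   constant 1.0 yields 0x3f800000.  */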
11221 /* Return TRUE if rtx X is an immediate constant that can be moved using a
11222 single MOV(+MOVK) followed by an FMOV. */
11223 bool
11224 aarch64_float_const_rtx_p (rtx x)
11226 machine_mode mode = GET_MODE (x);
11227 if (mode == VOIDmode)
11228 return false;
11230 /* Determine whether it's cheaper to write float constants as
11231 mov/movk pairs rather than ldr/adrp pairs. */
11232 unsigned HOST_WIDE_INT ival;
11234 if (CONST_DOUBLE_P (x)
11235 && SCALAR_FLOAT_MODE_P (mode)
11236 && aarch64_reinterpret_float_as_int (x, &ival))
11238 machine_mode imode = known_eq (GET_MODE_SIZE (mode), 8) ? DImode : SImode;
11239 int num_instr = aarch64_internal_mov_immediate
11240 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
11241 return num_instr < 3;
11244 return false;
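/* For instance, the DFmode constant 1.5 has the bit pattern
   0x3ff8000000000000, which aarch64_internal_mov_immediate can build with a
   single MOVZ, so the constant can be materialised as a MOV followed by an
   FMOV instead of a literal load.  */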
11247 /* Return TRUE if rtx X is immediate constant 0.0 (but not in Decimal
11248 Floating Point). */
11249 bool
11250 aarch64_float_const_zero_rtx_p (rtx x)
11252 /* 0.0 in Decimal Floating Point cannot be represented by #0 or
11253 zr as our callers expect, so no need to check the actual
11254 value if X is of Decimal Floating Point type. */
11255 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_DECIMAL_FLOAT)
11256 return false;
11258 if (REAL_VALUE_MINUS_ZERO (*CONST_DOUBLE_REAL_VALUE (x)))
11259 return !HONOR_SIGNED_ZEROS (GET_MODE (x));
11260 return real_equal (CONST_DOUBLE_REAL_VALUE (x), &dconst0);
11263 /* Return true if X is any kind of constant zero rtx. */
11265 bool
11266 aarch64_const_zero_rtx_p (rtx x)
11268 return (x == CONST0_RTX (GET_MODE (x))
11269 || (CONST_DOUBLE_P (x) && aarch64_float_const_zero_rtx_p (x)));
11272 /* Return TRUE if rtx X is an immediate constant that fits in a single
11273 MOVI immediate operation. */
11274 bool
11275 aarch64_can_const_movi_rtx_p (rtx x, machine_mode mode)
11277 if (!TARGET_SIMD)
11278 return false;
11280 machine_mode vmode;
11281 scalar_int_mode imode;
11282 unsigned HOST_WIDE_INT ival;
11284 if (CONST_DOUBLE_P (x)
11285 && SCALAR_FLOAT_MODE_P (mode))
11287 if (!aarch64_reinterpret_float_as_int (x, &ival))
11288 return false;
11290 /* We make a general exception for 0. */
11291 if (aarch64_float_const_zero_rtx_p (x))
11292 return true;
11294 imode = int_mode_for_mode (mode).require ();
11296 else if (CONST_INT_P (x)
11297 && is_a <scalar_int_mode> (mode, &imode))
11298 ival = INTVAL (x);
11299 else
11300 return false;
11302 /* Use a 64-bit vector mode for everything except DI/DF/DD modes, where we
11303 use a 128-bit vector mode. */
11304 int width = GET_MODE_BITSIZE (imode) == 64 ? 128 : 64;
11306 vmode = aarch64_simd_container_mode (imode, width);
11307 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, ival);
11309 return aarch64_simd_valid_mov_imm (v_op);
11312 /* Return TRUE if DST and SRC with mode MODE form a valid FP move. */
11313 bool
11314 aarch64_valid_fp_move (rtx dst, rtx src, machine_mode mode)
11316 if (!TARGET_FLOAT)
11317 return false;
11319 if (aarch64_reg_or_fp_zero (src, mode))
11320 return true;
11322 if (!register_operand (dst, mode))
11323 return false;
11325 if (MEM_P (src))
11326 return true;
11328 if (!DECIMAL_FLOAT_MODE_P (mode))
11330 if (aarch64_can_const_movi_rtx_p (src, mode)
11331 || aarch64_float_const_representable_p (src)
11332 || aarch64_float_const_zero_rtx_p (src))
11333 return true;
11335 /* Block FP immediates which are split during expand. */
11336 if (aarch64_float_const_rtx_p (src))
11337 return false;
11340 return can_create_pseudo_p ();
11343 /* Return the fixed registers used for condition codes. */
11345 static bool
11346 aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
11348 *p1 = CC_REGNUM;
11349 *p2 = INVALID_REGNUM;
11350 return true;
11353 /* Return a fresh memory reference to the current function's TPIDR2 block,
11354 creating a block if necessary. */
11356 static rtx
11357 aarch64_get_tpidr2_block ()
11359 if (!cfun->machine->tpidr2_block)
11360 /* The TPIDR2 block is 16 bytes in size and must be aligned to a 128-bit
11361 boundary. */
11362 cfun->machine->tpidr2_block = assign_stack_local (V16QImode, 16, 128);
11363 return copy_rtx (cfun->machine->tpidr2_block);
11366 /* Return a fresh register that points to the current function's
11367 TPIDR2 block, creating a block if necessary. */
11369 static rtx
11370 aarch64_get_tpidr2_ptr ()
11372 rtx block = aarch64_get_tpidr2_block ();
11373 return force_reg (Pmode, XEXP (block, 0));
11376 /* Emit instructions to allocate a ZA lazy save buffer and initialize the
11377 current function's TPIDR2 block. */
11379 static void
11380 aarch64_init_tpidr2_block ()
11382 rtx block = aarch64_get_tpidr2_block ();
11384 /* The ZA save buffer is SVL.B*SVL.B bytes in size. */
11385 rtx svl_bytes = aarch64_sme_vq_immediate (Pmode, 16, AARCH64_ISA_MODE);
11386 rtx svl_bytes_reg = force_reg (DImode, svl_bytes);
11387 rtx za_size = expand_simple_binop (Pmode, MULT, svl_bytes_reg,
11388 svl_bytes_reg, NULL, 0, OPTAB_LIB_WIDEN);
11389 rtx za_save_buffer = allocate_dynamic_stack_space (za_size, 128,
11390 BITS_PER_UNIT, -1, true);
11391 za_save_buffer = force_reg (Pmode, za_save_buffer);
11392 cfun->machine->za_save_buffer = za_save_buffer;
11394 /* The first word of the block points to the save buffer and the second
11395 word is the number of ZA slices to save. */
11396 rtx block_0 = adjust_address (block, DImode, 0);
11397 emit_insn (aarch64_gen_store_pair (block_0, za_save_buffer, svl_bytes_reg));
11399 if (!memory_operand (block, V16QImode))
11400 block = replace_equiv_address (block, force_reg (Pmode, XEXP (block, 0)));
11401 emit_insn (gen_aarch64_setup_local_tpidr2 (block));
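/* For reference, the TPIDR2 block initialised above has the layout:
	byte offset 0:	pointer to the SVL.B x SVL.B ZA save buffer
	byte offset 8:	number of ZA slices to save (SVL.B)
   with the 16-byte block itself aligned to a 128-bit boundary.  */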
11404 /* Restore the contents of ZA from the lazy save buffer, given that
11405 register TPIDR2_BLOCK points to the current function's TPIDR2 block.
11406 PSTATE.ZA is known to be 0 and TPIDR2_EL0 is known to be null. */
11408 void
11409 aarch64_restore_za (rtx tpidr2_block)
11411 emit_insn (gen_aarch64_smstart_za ());
11412 if (REGNO (tpidr2_block) != R0_REGNUM)
11413 emit_move_insn (gen_rtx_REG (Pmode, R0_REGNUM), tpidr2_block);
11414 emit_insn (gen_aarch64_tpidr2_restore ());
11417 /* Return the ZT0 save buffer, creating one if necessary. */
11419 static rtx
11420 aarch64_get_zt0_save_buffer ()
11422 if (!cfun->machine->zt0_save_buffer)
11423 cfun->machine->zt0_save_buffer = assign_stack_local (V8DImode, 64, 128);
11424 return cfun->machine->zt0_save_buffer;
11427 /* Save ZT0 to the current function's save buffer. */
11429 static void
11430 aarch64_save_zt0 ()
11432 rtx mem = aarch64_get_zt0_save_buffer ();
11433 mem = replace_equiv_address (mem, force_reg (Pmode, XEXP (mem, 0)));
11434 emit_insn (gen_aarch64_sme_str_zt0 (mem));
11437 /* Restore ZT0 from the current function's save buffer. FROM_LAZY_SAVE_P
11438 is true if the load is happening after a call to a private-ZA function,
11439 false if it can be treated as a normal load. */
11441 static void
11442 aarch64_restore_zt0 (bool from_lazy_save_p)
11444 rtx mem = aarch64_get_zt0_save_buffer ();
11445 mem = replace_equiv_address (mem, force_reg (Pmode, XEXP (mem, 0)));
11446 emit_insn (from_lazy_save_p
11447 ? gen_aarch64_restore_zt0 (mem)
11448 : gen_aarch64_sme_ldr_zt0 (mem));
11451 /* Implement TARGET_START_CALL_ARGS. */
11453 static void
11454 aarch64_start_call_args (cumulative_args_t ca_v)
11456 CUMULATIVE_ARGS *ca = get_cumulative_args (ca_v);
11458 if (!TARGET_SME && (ca->isa_mode & AARCH64_ISA_MODE_SM_ON))
11460 error ("calling a streaming function requires the ISA extension %qs",
11461 "sme");
11462 inform (input_location, "you can enable %qs using the command-line"
11463 " option %<-march%>, or by using the %<target%>"
11464 " attribute or pragma", "sme");
11467 if ((ca->shared_za_flags & (AARCH64_STATE_IN | AARCH64_STATE_OUT))
11468 && !aarch64_cfun_has_state ("za"))
11469 error ("call to a function that shares %qs state from a function"
11470 " that has no %qs state", "za", "za");
11471 else if ((ca->shared_zt0_flags & (AARCH64_STATE_IN | AARCH64_STATE_OUT))
11472 && !aarch64_cfun_has_state ("zt0"))
11473 error ("call to a function that shares %qs state from a function"
11474 " that has no %qs state", "zt0", "zt0");
11475 else if (!TARGET_ZA && (ca->isa_mode & AARCH64_ISA_MODE_ZA_ON))
11476 error ("call to a function that shares SME state from a function"
11477 " that has no SME state");
11479 /* If this is a call to a private ZA function, emit a marker to
11480 indicate where any necessary set-up code could be inserted.
11481 The code itself is inserted by the mode-switching pass. */
11482 if (TARGET_ZA && !(ca->isa_mode & AARCH64_ISA_MODE_ZA_ON))
11483 emit_insn (gen_aarch64_start_private_za_call ());
11485 /* If this is a call to a shared-ZA function that doesn't share ZT0,
11486 save and restore ZT0 around the call. */
11487 if (aarch64_cfun_has_state ("zt0")
11488 && (ca->isa_mode & AARCH64_ISA_MODE_ZA_ON)
11489 && ca->shared_zt0_flags == 0)
11490 aarch64_save_zt0 ();
11493 /* This function is used by the call expanders of the machine description.
11494 RESULT is the register in which the result is returned. It's NULL for
11495 "call" and "sibcall".
11496 MEM is the location of the function call.
11497 COOKIE is either:
11498 - a const_int that gives the argument to the call's UNSPEC_CALLEE_ABI.
11499 - a PARALLEL that contains such a const_int as its first element.
11500 The second element is a PARALLEL that lists all the argument
11501 registers that need to be saved and restored around a change
11502 in PSTATE.SM, or const0_rtx if no such switch is needed.
11503 The third and fourth elements are const_ints that contain the
11504 sharing flags for ZA and ZT0 respectively.
11505 SIBCALL indicates whether this function call is a normal call or a sibling
11506 call; a different pattern is generated accordingly. */
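/* For example, a COOKIE for a call that needs an SME mode switch might have
   the shape:
	(parallel [(const_int <abi-id>)
		   (parallel [(reg ...) ...])	;; registers live across the
						;; PSTATE.SM switch
		   (const_int <za-flags>)
		   (const_int <zt0-flags>)])
   whereas a simple call passes just (const_int <abi-id>).  */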
11508 void
11509 aarch64_expand_call (rtx result, rtx mem, rtx cookie, bool sibcall)
11511 rtx call, callee, tmp;
11512 rtvec vec;
11513 machine_mode mode;
11515 rtx callee_abi = cookie;
11516 rtx sme_mode_switch_args = const0_rtx;
11517 unsigned int shared_za_flags = 0;
11518 unsigned int shared_zt0_flags = 0;
11519 if (GET_CODE (cookie) == PARALLEL)
11521 callee_abi = XVECEXP (cookie, 0, 0);
11522 sme_mode_switch_args = XVECEXP (cookie, 0, 1);
11523 shared_za_flags = INTVAL (XVECEXP (cookie, 0, 2));
11524 shared_zt0_flags = INTVAL (XVECEXP (cookie, 0, 3));
11527 gcc_assert (CONST_INT_P (callee_abi));
11528 auto callee_isa_mode = aarch64_callee_isa_mode (callee_abi);
11530 if (aarch64_cfun_has_state ("za")
11531 && (callee_isa_mode & AARCH64_ISA_MODE_ZA_ON)
11532 && !shared_za_flags)
11534 sorry ("call to a function that shares state other than %qs"
11535 " from a function that has %qs state", "za", "za");
11536 inform (input_location, "use %<__arm_preserves(\"za\")%> if the"
11537 " callee preserves ZA");
11540 gcc_assert (MEM_P (mem));
11541 callee = XEXP (mem, 0);
11543 #if TARGET_PECOFF
11544 tmp = legitimize_pe_coff_symbol (callee, false);
11545 if (tmp)
11546 callee = tmp;
11547 #endif
11549 mode = GET_MODE (callee);
11550 gcc_assert (mode == Pmode);
11552 /* Decide if we should generate indirect calls by loading the
11553 address of the callee into a register before performing
11554 the branch-and-link. */
11555 if (SYMBOL_REF_P (callee)
11556 ? (aarch64_is_long_call_p (callee)
11557 || aarch64_is_noplt_call_p (callee))
11558 : !REG_P (callee))
11559 XEXP (mem, 0) = force_reg (mode, callee);
11561 /* Accumulate the return values, including state that is shared via
11562 attributes. */
11563 auto_vec<rtx, 8> return_values;
11564 if (result)
11566 if (GET_CODE (result) == PARALLEL)
11567 for (int i = 0; i < XVECLEN (result, 0); ++i)
11568 return_values.safe_push (XVECEXP (result, 0, i));
11569 else
11570 return_values.safe_push (result);
11572 unsigned int orig_num_return_values = return_values.length ();
11573 if (shared_za_flags & AARCH64_STATE_OUT)
11574 return_values.safe_push (gen_rtx_REG (VNx16BImode, ZA_REGNUM));
11575 /* When calling private-ZA functions from functions with ZA state,
11576 we want to know whether the call committed a lazy save. */
11577 if (TARGET_ZA && !shared_za_flags)
11578 return_values.safe_push (gen_rtx_REG (VNx16BImode, ZA_SAVED_REGNUM));
11579 if (shared_zt0_flags & AARCH64_STATE_OUT)
11580 return_values.safe_push (gen_rtx_REG (V8DImode, ZT0_REGNUM));
11582 /* Create the new return value, if necessary. */
11583 if (orig_num_return_values != return_values.length ())
11585 if (return_values.length () == 1)
11586 result = return_values[0];
11587 else
11589 for (rtx &x : return_values)
11590 if (GET_CODE (x) != EXPR_LIST)
11591 x = gen_rtx_EXPR_LIST (VOIDmode, x, const0_rtx);
11592 rtvec v = gen_rtvec_v (return_values.length (),
11593 return_values.address ());
11594 result = gen_rtx_PARALLEL (VOIDmode, v);
11598 call = gen_rtx_CALL (VOIDmode, mem, const0_rtx);
11600 if (result != NULL_RTX)
11601 call = gen_rtx_SET (result, call);
11603 if (sibcall)
11604 tmp = ret_rtx;
11605 else
11606 tmp = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (Pmode, LR_REGNUM));
11608 callee_abi = gen_rtx_UNSPEC (DImode, gen_rtvec (1, callee_abi),
11609 UNSPEC_CALLEE_ABI);
11611 vec = gen_rtvec (3, call, callee_abi, tmp);
11612 call = gen_rtx_PARALLEL (VOIDmode, vec);
11614 auto call_insn = aarch64_emit_call_insn (call);
11616 /* Check whether the call requires a change to PSTATE.SM. We can't
11617 emit the instructions to change PSTATE.SM yet, since they involve
11618 a change in vector length and a change in instruction set, which
11619 cannot be represented in RTL.
11621 For now, just record which registers will be clobbered and used
11622 by the changes to PSTATE.SM. */
11623 if (!sibcall && aarch64_call_switches_pstate_sm (callee_isa_mode))
11625 aarch64_sme_mode_switch_regs args_switch;
11626 if (sme_mode_switch_args != const0_rtx)
11628 unsigned int num_args = XVECLEN (sme_mode_switch_args, 0);
11629 for (unsigned int i = 0; i < num_args; ++i)
11631 rtx x = XVECEXP (sme_mode_switch_args, 0, i);
11632 args_switch.add_reg (GET_MODE (x), REGNO (x));
11636 aarch64_sme_mode_switch_regs result_switch;
11637 if (result)
11638 result_switch.add_call_result (call_insn);
11640 unsigned int num_gprs = MAX (args_switch.num_gprs (),
11641 result_switch.num_gprs ());
11642 for (unsigned int i = 0; i < num_gprs; ++i)
11643 clobber_reg (&CALL_INSN_FUNCTION_USAGE (call_insn),
11644 gen_rtx_REG (DImode, args_switch.FIRST_GPR + i));
11646 for (int regno = V0_REGNUM; regno < V0_REGNUM + 32; regno += 4)
11647 clobber_reg (&CALL_INSN_FUNCTION_USAGE (call_insn),
11648 gen_rtx_REG (V4x16QImode, regno));
11650 for (int regno = P0_REGNUM; regno < P0_REGNUM + 16; regno += 1)
11651 clobber_reg (&CALL_INSN_FUNCTION_USAGE (call_insn),
11652 gen_rtx_REG (VNx16BImode, regno));
11654 /* Ensure that the VG save slot has been initialized. Also emit
11655 an instruction to model the effect of the temporary clobber
11656 of VG, so that the prologue/epilogue pass sees the need to
11657 save the old value. */
11658 use_reg (&CALL_INSN_FUNCTION_USAGE (call_insn),
11659 gen_rtx_REG (DImode, VG_REGNUM));
11660 emit_insn_before (gen_aarch64_update_vg (), call_insn);
11662 cfun->machine->call_switches_pstate_sm = true;
11665 /* Add any ZA-related information.
11667 ZA_REGNUM represents the current function's ZA state, rather than
11668 the contents of the ZA register itself. We ensure that the function's
11669 ZA state is preserved by private-ZA call sequences, so the call itself
11670 does not use or clobber ZA_REGNUM. The same thing applies to
11671 ZT0_REGNUM. */
11672 if (TARGET_ZA)
11674 /* The callee requires ZA to be active if the callee is shared-ZA,
11675 otherwise it requires ZA to be dormant or off. The state of ZA is
11676 captured by a combination of SME_STATE_REGNUM, TPIDR2_SETUP_REGNUM,
11677 and ZA_SAVED_REGNUM. */
11678 use_reg (&CALL_INSN_FUNCTION_USAGE (call_insn),
11679 gen_rtx_REG (DImode, SME_STATE_REGNUM));
11680 use_reg (&CALL_INSN_FUNCTION_USAGE (call_insn),
11681 gen_rtx_REG (DImode, TPIDR2_SETUP_REGNUM));
11682 use_reg (&CALL_INSN_FUNCTION_USAGE (call_insn),
11683 gen_rtx_REG (VNx16BImode, ZA_SAVED_REGNUM));
11685 /* Keep the aarch64_start/end_private_za_call markers live. */
11686 if (!(callee_isa_mode & AARCH64_ISA_MODE_ZA_ON))
11687 use_reg (&CALL_INSN_FUNCTION_USAGE (call_insn),
11688 gen_rtx_REG (VNx16BImode, LOWERING_REGNUM));
11690 /* If the callee is a shared-ZA function, record whether it uses the
11691 current value of ZA and ZT0. */
11692 if (shared_za_flags & AARCH64_STATE_IN)
11693 use_reg (&CALL_INSN_FUNCTION_USAGE (call_insn),
11694 gen_rtx_REG (VNx16BImode, ZA_REGNUM));
11696 if (shared_zt0_flags & AARCH64_STATE_IN)
11697 use_reg (&CALL_INSN_FUNCTION_USAGE (call_insn),
11698 gen_rtx_REG (V8DImode, ZT0_REGNUM));
11702 /* Implement TARGET_END_CALL_ARGS. */
11704 static void
11705 aarch64_end_call_args (cumulative_args_t ca_v)
11707 CUMULATIVE_ARGS *ca = get_cumulative_args (ca_v);
11709 /* If this is a call to a private ZA function, emit a marker to
11710 indicate where any necessary restoration code could be inserted.
11711 The code itself is inserted by the mode-switching pass. */
11712 if (TARGET_ZA && !(ca->isa_mode & AARCH64_ISA_MODE_ZA_ON))
11713 emit_insn (gen_aarch64_end_private_za_call ());
11715 /* If this is a call to a shared-ZA function that doesn't share ZT0,
11716 save and restore ZT0 around the call. */
11717 if (aarch64_cfun_has_state ("zt0")
11718 && (ca->isa_mode & AARCH64_ISA_MODE_ZA_ON)
11719 && ca->shared_zt0_flags == 0)
11720 aarch64_restore_zt0 (false);
11723 /* Emit call insn with PAT and do aarch64-specific handling. */
11725 rtx_call_insn *
11726 aarch64_emit_call_insn (rtx pat)
11728 auto insn = emit_call_insn (pat);
11730 rtx *fusage = &CALL_INSN_FUNCTION_USAGE (insn);
11731 clobber_reg (fusage, gen_rtx_REG (word_mode, IP0_REGNUM));
11732 clobber_reg (fusage, gen_rtx_REG (word_mode, IP1_REGNUM));
11733 return as_a<rtx_call_insn *> (insn);
11736 machine_mode
11737 aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
11739 machine_mode mode_x = GET_MODE (x);
11740 rtx_code code_x = GET_CODE (x);
11742 /* All floating point compares return CCFP if it is an equality
11743 comparison, and CCFPE otherwise. */
11744 if (GET_MODE_CLASS (mode_x) == MODE_FLOAT)
11746 switch (code)
11748 case EQ:
11749 case NE:
11750 case UNORDERED:
11751 case ORDERED:
11752 case UNLT:
11753 case UNLE:
11754 case UNGT:
11755 case UNGE:
11756 case UNEQ:
11757 return CCFPmode;
11759 case LT:
11760 case LE:
11761 case GT:
11762 case GE:
11763 case LTGT:
11764 return CCFPEmode;
11766 default:
11767 gcc_unreachable ();
11771 /* Equality comparisons of short modes against zero can be performed
11772 using the TST instruction with the appropriate bitmask. */
11773 if (y == const0_rtx && (REG_P (x) || SUBREG_P (x))
11774 && (code == EQ || code == NE)
11775 && (mode_x == HImode || mode_x == QImode))
11776 return CC_Zmode;
11778 /* Similarly, comparisons of zero_extends from shorter modes can
11779 be performed using an ANDS with an immediate mask. */
11780 if (y == const0_rtx && code_x == ZERO_EXTEND
11781 && (mode_x == SImode || mode_x == DImode)
11782 && (GET_MODE (XEXP (x, 0)) == HImode || GET_MODE (XEXP (x, 0)) == QImode)
11783 && (code == EQ || code == NE))
11784 return CC_Zmode;
11786 /* Zero extracts support equality comparisons. */
11787 if ((mode_x == SImode || mode_x == DImode)
11788 && y == const0_rtx
11789 && (code_x == ZERO_EXTRACT && CONST_INT_P (XEXP (x, 1))
11790 && CONST_INT_P (XEXP (x, 2)))
11791 && (code == EQ || code == NE))
11792 return CC_Zmode;
11794 /* ANDS/BICS/TST support equality and all signed comparisons. */
11795 if ((mode_x == SImode || mode_x == DImode)
11796 && y == const0_rtx
11797 && (code_x == AND)
11798 && (code == EQ || code == NE || code == LT || code == GE
11799 || code == GT || code == LE))
11800 return CC_NZVmode;
11802 /* ADDS/SUBS correctly set N and Z flags. */
11803 if ((mode_x == SImode || mode_x == DImode)
11804 && y == const0_rtx
11805 && (code == EQ || code == NE || code == LT || code == GE)
11806 && (code_x == PLUS || code_x == MINUS || code_x == NEG))
11807 return CC_NZmode;
11809 /* A compare with a shifted operand. Because of canonicalization,
11810 the comparison will have to be swapped when we emit the assembly
11811 code. */
11812 if ((mode_x == SImode || mode_x == DImode)
11813 && (REG_P (y) || SUBREG_P (y) || y == const0_rtx)
11814 && (code_x == ASHIFT || code_x == ASHIFTRT
11815 || code_x == LSHIFTRT
11816 || code_x == ZERO_EXTEND || code_x == SIGN_EXTEND))
11817 return CC_SWPmode;
11819 /* Similarly for a negated operand, but we can only do this for
11820 equalities. */
11821 if ((mode_x == SImode || mode_x == DImode)
11822 && (REG_P (y) || SUBREG_P (y))
11823 && (code == EQ || code == NE)
11824 && code_x == NEG)
11825 return CC_Zmode;
11827 /* A test for unsigned overflow from an addition. */
11828 if ((mode_x == DImode || mode_x == TImode)
11829 && (code == LTU || code == GEU)
11830 && code_x == PLUS
11831 && rtx_equal_p (XEXP (x, 0), y))
11832 return CC_Cmode;
11834 /* A test for unsigned overflow from an add with carry. */
11835 if ((mode_x == DImode || mode_x == TImode)
11836 && (code == LTU || code == GEU)
11837 && code_x == PLUS
11838 && CONST_SCALAR_INT_P (y)
11839 && (rtx_mode_t (y, mode_x)
11840 == (wi::shwi (1, mode_x)
11841 << (GET_MODE_BITSIZE (mode_x).to_constant () / 2))))
11842 return CC_ADCmode;
11844 /* A test for signed overflow. */
11845 if ((mode_x == DImode || mode_x == TImode)
11846 && code == NE
11847 && code_x == PLUS
11848 && GET_CODE (y) == SIGN_EXTEND)
11849 return CC_Vmode;
11851 /* For everything else, return CCmode. */
11852 return CCmode;
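/* As a concrete case of the selection above, an equality comparison of a
   QImode register against zero selects CC_Zmode, which is typically matched
   by a TST instruction with an immediate mask of 0xff rather than a full
   compare.  */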
11855 static int
11856 aarch64_get_condition_code_1 (machine_mode, enum rtx_code);
11859 aarch64_get_condition_code (rtx x)
11861 machine_mode mode = GET_MODE (XEXP (x, 0));
11862 enum rtx_code comp_code = GET_CODE (x);
11864 if (GET_MODE_CLASS (mode) != MODE_CC)
11865 mode = SELECT_CC_MODE (comp_code, XEXP (x, 0), XEXP (x, 1));
11866 return aarch64_get_condition_code_1 (mode, comp_code);
11869 static int
11870 aarch64_get_condition_code_1 (machine_mode mode, enum rtx_code comp_code)
11872 switch (mode)
11874 case E_CCFPmode:
11875 case E_CCFPEmode:
11876 switch (comp_code)
11878 case GE: return AARCH64_GE;
11879 case GT: return AARCH64_GT;
11880 case LE: return AARCH64_LS;
11881 case LT: return AARCH64_MI;
11882 case NE: return AARCH64_NE;
11883 case EQ: return AARCH64_EQ;
11884 case ORDERED: return AARCH64_VC;
11885 case UNORDERED: return AARCH64_VS;
11886 case UNLT: return AARCH64_LT;
11887 case UNLE: return AARCH64_LE;
11888 case UNGT: return AARCH64_HI;
11889 case UNGE: return AARCH64_PL;
11890 default: return -1;
11892 break;
11894 case E_CCmode:
11895 switch (comp_code)
11897 case NE: return AARCH64_NE;
11898 case EQ: return AARCH64_EQ;
11899 case GE: return AARCH64_GE;
11900 case GT: return AARCH64_GT;
11901 case LE: return AARCH64_LE;
11902 case LT: return AARCH64_LT;
11903 case GEU: return AARCH64_CS;
11904 case GTU: return AARCH64_HI;
11905 case LEU: return AARCH64_LS;
11906 case LTU: return AARCH64_CC;
11907 default: return -1;
11909 break;
11911 case E_CC_SWPmode:
11912 switch (comp_code)
11914 case NE: return AARCH64_NE;
11915 case EQ: return AARCH64_EQ;
11916 case GE: return AARCH64_LE;
11917 case GT: return AARCH64_LT;
11918 case LE: return AARCH64_GE;
11919 case LT: return AARCH64_GT;
11920 case GEU: return AARCH64_LS;
11921 case GTU: return AARCH64_CC;
11922 case LEU: return AARCH64_CS;
11923 case LTU: return AARCH64_HI;
11924 default: return -1;
11926 break;
11928 case E_CC_NZCmode:
11929 switch (comp_code)
11931 case NE: return AARCH64_NE; /* = any */
11932 case EQ: return AARCH64_EQ; /* = none */
11933 case GE: return AARCH64_PL; /* = nfrst */
11934 case LT: return AARCH64_MI; /* = first */
11935 case GEU: return AARCH64_CS; /* = nlast */
11936 case GTU: return AARCH64_HI; /* = pmore */
11937 case LEU: return AARCH64_LS; /* = plast */
11938 case LTU: return AARCH64_CC; /* = last */
11939 default: return -1;
11941 break;
11943 case E_CC_NZVmode:
11944 switch (comp_code)
11946 case NE: return AARCH64_NE;
11947 case EQ: return AARCH64_EQ;
11948 case GE: return AARCH64_PL;
11949 case LT: return AARCH64_MI;
11950 case GT: return AARCH64_GT;
11951 case LE: return AARCH64_LE;
11952 default: return -1;
11954 break;
11956 case E_CC_NZmode:
11957 switch (comp_code)
11959 case NE: return AARCH64_NE;
11960 case EQ: return AARCH64_EQ;
11961 case GE: return AARCH64_PL;
11962 case LT: return AARCH64_MI;
11963 default: return -1;
11965 break;
11967 case E_CC_Zmode:
11968 switch (comp_code)
11970 case NE: return AARCH64_NE;
11971 case EQ: return AARCH64_EQ;
11972 default: return -1;
11974 break;
11976 case E_CC_Cmode:
11977 switch (comp_code)
11979 case LTU: return AARCH64_CS;
11980 case GEU: return AARCH64_CC;
11981 default: return -1;
11983 break;
11985 case E_CC_ADCmode:
11986 switch (comp_code)
11988 case GEU: return AARCH64_CS;
11989 case LTU: return AARCH64_CC;
11990 default: return -1;
11992 break;
11994 case E_CC_Vmode:
11995 switch (comp_code)
11997 case NE: return AARCH64_VS;
11998 case EQ: return AARCH64_VC;
11999 default: return -1;
12001 break;
12003 default:
12004 return -1;
12007 return -1;
12010 /* Return true if X is a CONST_INT, CONST_WIDE_INT or a constant vector
12011 duplicate of such constants. If so, store in RET_WI the wide_int
12012 representation of the constant, using the inner mode of the vector mode
12013 for vector X or MODE for scalar X constants. If MODE is not provided
12014 then TImode is used. */
12016 static bool
12017 aarch64_extract_vec_duplicate_wide_int (rtx x, wide_int *ret_wi,
12018 scalar_mode mode = TImode)
12020 rtx elt = unwrap_const_vec_duplicate (x);
12021 if (!CONST_SCALAR_INT_P (elt))
12022 return false;
12023 scalar_mode smode
12024 = CONST_SCALAR_INT_P (x) ? mode : GET_MODE_INNER (GET_MODE (x));
12025 *ret_wi = rtx_mode_t (elt, smode);
12026 return true;
12029 /* Return true if X is a scalar or a constant vector of integer
12030 immediates that represent the rounding constant used in the fixed-point
12031 arithmetic instructions.
12032 The accepted form of the constant is (1 << (C - 1)) where C is in the range
12033 [1, MODE_WIDTH/2]. */
12035 bool
12036 aarch64_rnd_imm_p (rtx x)
12038 wide_int rnd_cst;
12039 if (!aarch64_extract_vec_duplicate_wide_int (x, &rnd_cst))
12040 return false;
12041 int log2 = wi::exact_log2 (rnd_cst);
12042 if (log2 < 0)
12043 return false;
12044 return IN_RANGE (log2, 0, rnd_cst.get_precision () / 2 - 1);
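/* For example, for a vector of 16-bit elements the accepted rounding
   constants are 1, 2, 4, ..., 0x80, i.e. (1 << (C - 1)) for C in [1, 8].  */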
12047 /* Return true if RND is a constant vector of integer rounding constants
12048 corresponding to a constant vector of shifts, SHIFT.
12049 The relationship should be RND == (1 << (SHIFT - 1)). */
12051 bool
12052 aarch64_const_vec_rnd_cst_p (rtx rnd, rtx shift)
12054 wide_int rnd_cst, shft_cst;
12055 if (!aarch64_extract_vec_duplicate_wide_int (rnd, &rnd_cst)
12056 || !aarch64_extract_vec_duplicate_wide_int (shift, &shft_cst))
12057 return false;
12059 return rnd_cst == (wi::shwi (1, rnd_cst.get_precision ()) << (shft_cst - 1));
12062 bool
12063 aarch64_const_vec_all_same_in_range_p (rtx x,
12064 HOST_WIDE_INT minval,
12065 HOST_WIDE_INT maxval)
12067 rtx elt;
12068 return (const_vec_duplicate_p (x, &elt)
12069 && CONST_INT_P (elt)
12070 && IN_RANGE (INTVAL (elt), minval, maxval));
12073 /* Some constants can't be made using normal mov instructions in Advanced SIMD
12074 but we can still create them in various ways. If the constant in VAL can be
12075 created using such alternate methods, return true and, if TARGET is not
12076 NULL, additionally set TARGET to the rtx for the sequence.
12077 Otherwise return false. */
12079 bool
12080 aarch64_maybe_generate_simd_constant (rtx target, rtx val, machine_mode mode)
12082 wide_int wval;
12083 auto smode = GET_MODE_INNER (mode);
12084 if (!aarch64_extract_vec_duplicate_wide_int (val, &wval, smode))
12085 return false;
12087 /* For Advanced SIMD we can create an integer with only the top bit set
12088 using fneg (0.0f). */
12089 if (TARGET_SIMD
12090 && !TARGET_SVE
12091 && smode == DImode
12092 && wi::only_sign_bit_p (wval))
12094 if (!target)
12095 return true;
12097 /* Use the same base type as aarch64_gen_shareable_zero. */
12098 rtx zero = CONST0_RTX (V4SImode);
12099 emit_move_insn (lowpart_subreg (V4SImode, target, mode), zero);
12100 rtx neg = lowpart_subreg (V2DImode, target, mode);
12101 emit_insn (gen_aarch64_fnegv2di2 (neg, copy_rtx (neg)));
12102 return true;
12105 return false;
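/* For example, a V2DImode vector in which every element is
   0x8000000000000000 (only the sign bit set) can be generated this way:
   the register is first zeroed and then negated as a V2DImode float vector,
   which is typically just a MOVI #0 followed by an FNEG.  */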
12108 /* Check if the value in VAL with mode MODE can be created using special
12109 instruction sequences. */
12111 bool aarch64_simd_special_constant_p (rtx val, machine_mode mode)
12113 return aarch64_maybe_generate_simd_constant (NULL_RTX, val, mode);
12116 bool
12117 aarch64_const_vec_all_same_int_p (rtx x, HOST_WIDE_INT val)
12119 return aarch64_const_vec_all_same_in_range_p (x, val, val);
12122 /* Return true if VEC is a constant in which every element is in the range
12123 [MINVAL, MAXVAL]. The elements do not need to have the same value. */
12125 static bool
12126 aarch64_const_vec_all_in_range_p (rtx vec,
12127 HOST_WIDE_INT minval,
12128 HOST_WIDE_INT maxval)
12130 if (!CONST_VECTOR_P (vec)
12131 || GET_MODE_CLASS (GET_MODE (vec)) != MODE_VECTOR_INT)
12132 return false;
12134 int nunits;
12135 if (!CONST_VECTOR_STEPPED_P (vec))
12136 nunits = const_vector_encoded_nelts (vec);
12137 else if (!CONST_VECTOR_NUNITS (vec).is_constant (&nunits))
12138 return false;
12140 for (int i = 0; i < nunits; i++)
12142 rtx vec_elem = CONST_VECTOR_ELT (vec, i);
12143 if (!CONST_INT_P (vec_elem)
12144 || !IN_RANGE (INTVAL (vec_elem), minval, maxval))
12145 return false;
12147 return true;
12150 /* N Z C V. */
12151 #define AARCH64_CC_V 1
12152 #define AARCH64_CC_C (1 << 1)
12153 #define AARCH64_CC_Z (1 << 2)
12154 #define AARCH64_CC_N (1 << 3)
12156 /* N Z C V flags for ccmp. Indexed by AARCH64_COND_CODE. */
12157 static const int aarch64_nzcv_codes[] =
12159 0, /* EQ, Z == 1. */
12160 AARCH64_CC_Z, /* NE, Z == 0. */
12161 0, /* CS, C == 1. */
12162 AARCH64_CC_C, /* CC, C == 0. */
12163 0, /* MI, N == 1. */
12164 AARCH64_CC_N, /* PL, N == 0. */
12165 0, /* VS, V == 1. */
12166 AARCH64_CC_V, /* VC, V == 0. */
12167 0, /* HI, C == 1 && Z == 0. */
12168 AARCH64_CC_C, /* LS, !(C == 1 && Z == 0). */
12169 AARCH64_CC_V, /* GE, N == V. */
12170 0, /* LT, N != V. */
12171 AARCH64_CC_Z, /* GT, Z == 0 && N == V. */
12172 0, /* LE, !(Z == 0 && N == V). */
12173 0, /* AL, Any. */
12174 0 /* NV, Any. */
12177 /* Print floating-point vector immediate operand X to F, negating it
12178 first if NEGATE is true. Return true on success, false if it isn't
12179 a constant we can handle. */
12181 static bool
12182 aarch64_print_vector_float_operand (FILE *f, rtx x, bool negate)
12184 rtx elt;
12186 if (!const_vec_duplicate_p (x, &elt))
12187 return false;
12189 REAL_VALUE_TYPE r = *CONST_DOUBLE_REAL_VALUE (elt);
12190 if (negate)
12191 r = real_value_negate (&r);
12193 /* Handle the SVE single-bit immediates specially, since they have a
12194 fixed form in the assembly syntax. */
12195 if (real_equal (&r, &dconst0))
12196 asm_fprintf (f, "0.0");
12197 else if (real_equal (&r, &dconst2))
12198 asm_fprintf (f, "2.0");
12199 else if (real_equal (&r, &dconst1))
12200 asm_fprintf (f, "1.0");
12201 else if (real_equal (&r, &dconsthalf))
12202 asm_fprintf (f, "0.5");
12203 else
12205 const int buf_size = 20;
12206 char float_buf[buf_size] = {'\0'};
12207 real_to_decimal_for_mode (float_buf, &r, buf_size, buf_size,
12208 1, GET_MODE (elt));
12209 asm_fprintf (f, "%s", float_buf);
12212 return true;
12215 /* Return the equivalent letter for size. */
12216 static char
12217 sizetochar (int size)
12219 switch (size)
12221 case 64: return 'd';
12222 case 32: return 's';
12223 case 16: return 'h';
12224 case 8: return 'b';
12225 default: gcc_unreachable ();
12229 /* Print operand X to file F in a target specific manner according to CODE.
12230 The acceptable formatting commands given by CODE are:
12231 'c': An integer or symbol address without a preceding #
12232 sign.
12233 'C': Take the duplicated element in a vector constant
12234 and print it in hex.
12235 'D': Take the duplicated element in a vector constant
12236 and print it as an unsigned integer, in decimal.
12237 'e': Print the sign/zero-extend size as a character 8->b,
12238 16->h, 32->w. Can also be used for masks:
12239 0xff->b, 0xffff->h, 0xffffffff->w.
12240 'I': If the operand is a duplicated vector constant,
12241 replace it with the duplicated scalar. If the
12242 operand is then a floating-point constant, replace
12243 it with the integer bit representation. Print the
12244 transformed constant as a signed decimal number.
12245 'p': Prints N such that 2^N == X (X must be power of 2 and
12246 const int).
12247 'P': Print the number of non-zero bits in X (a const_int).
12248 'H': Print the higher numbered register of a pair (TImode)
12249 of regs.
12250 'm': Print a condition (eq, ne, etc).
12251 'M': Same as 'm', but invert condition.
12252 'N': Take the duplicated element in a vector constant
12253 and print the negative of it in decimal.
12254 'b/h/s/d/q': Print a scalar FP/SIMD register name.
12255 'Z': Same for SVE registers. ('z' was already taken.)
12256 Note that it is not necessary to use %Z for operands
12257 that have SVE modes. The convention is to use %Z
12258 only for non-SVE (or potentially non-SVE) modes.
12259 'S/T/U/V': Print a FP/SIMD register name for a register list.
12260 The register printed is the FP/SIMD register name
12261 of X + 0/1/2/3 for S/T/U/V.
12262 'R': Print a scalar Integer/FP/SIMD register name + 1.
12263 'X': Print bottom 16 bits of integer constant in hex.
12264 'w/x': Print a general register name or the zero register
12265 (32-bit or 64-bit).
12266 '0': Print a normal operand, if it's a general register,
12267 then we assume DImode.
12268 'k': Print NZCV for conditional compare instructions.
12269 'K': Print a predicate register as pn<N> rather than p<N>.
12270 'A': Output address constant representing the first
12271 argument of X, specifying a relocation offset
12272 if appropriate.
12273 'L': Output constant address specified by X
12274 with a relocation offset if appropriate.
12275 'G': Prints address of X, specifying a PC relative
12276 relocation mode if appropriate.
12277 'y': Output address of LDP or STP - this is used for
12278 some LDP/STPs which don't use a PARALLEL in their
12279 pattern (so the mode needs to be adjusted).
12280 'z': Output address of a typical LDP or STP. */
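/* As an illustration (register numbers chosen arbitrarily): if operand 0
   is general register x3, operand 1 is const0_rtx and operand 2 is
   FP/SIMD register v5, then "%w0" prints "w3", "%x0" prints "x3",
   "%w1" prints "wzr", "%d2" prints "d5", "%q2" prints "q5" and
   "%Z2" prints "z5".  A bare "%0" prints a general register under its
   DImode name, here "x3". */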
12282 static void
12283 aarch64_print_operand (FILE *f, rtx x, int code)
12285 rtx elt;
12286 switch (code)
12288 case 'c':
12289 if (CONST_INT_P (x))
12290 fprintf (f, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
12291 else
12293 poly_int64 offset;
12294 rtx base = strip_offset_and_salt (x, &offset);
12295 if (SYMBOL_REF_P (base))
12296 output_addr_const (f, x);
12297 else
12298 output_operand_lossage ("unsupported operand for code '%c'", code);
12300 break;
12302 case 'e':
12304 x = unwrap_const_vec_duplicate (x);
12305 if (!CONST_INT_P (x))
12307 output_operand_lossage ("invalid operand for '%%%c'", code);
12308 return;
12311 HOST_WIDE_INT val = INTVAL (x);
12312 if ((val & ~7) == 8 || val == 0xff)
12313 fputc ('b', f);
12314 else if ((val & ~7) == 16 || val == 0xffff)
12315 fputc ('h', f);
12316 else if ((val & ~7) == 32 || val == 0xffffffff)
12317 fputc ('w', f);
12318 else
12320 output_operand_lossage ("invalid operand for '%%%c'", code);
12321 return;
12324 break;
12326 case 'p':
12328 int n;
12330 if (!CONST_INT_P (x) || (n = exact_log2 (INTVAL (x))) < 0)
12332 output_operand_lossage ("invalid operand for '%%%c'", code);
12333 return;
12336 asm_fprintf (f, "%d", n);
12338 break;
12340 case 'P':
12341 if (!CONST_INT_P (x))
12343 output_operand_lossage ("invalid operand for '%%%c'", code);
12344 return;
12347 asm_fprintf (f, "%u", popcount_hwi (INTVAL (x)));
12348 break;
12350 case 'H':
12351 if (x == const0_rtx)
12353 asm_fprintf (f, "xzr");
12354 break;
12357 if (!REG_P (x) || !GP_REGNUM_P (REGNO (x) + 1))
12359 output_operand_lossage ("invalid operand for '%%%c'", code);
12360 return;
12363 asm_fprintf (f, "%s", reg_names [REGNO (x) + 1]);
12364 break;
12366 case 'I':
12368 x = aarch64_bit_representation (unwrap_const_vec_duplicate (x));
12369 if (CONST_INT_P (x))
12370 asm_fprintf (f, "%wd", INTVAL (x));
12371 else
12373 output_operand_lossage ("invalid operand for '%%%c'", code);
12374 return;
12376 break;
12379 case 'M':
12380 case 'm':
12382 int cond_code;
12383 /* CONST_TRUE_RTX means al/nv (al is the default, don't print it). */
12384 if (x == const_true_rtx)
12386 if (code == 'M')
12387 fputs ("nv", f);
12388 return;
12391 if (!COMPARISON_P (x))
12393 output_operand_lossage ("invalid operand for '%%%c'", code);
12394 return;
12397 cond_code = aarch64_get_condition_code (x);
12398 gcc_assert (cond_code >= 0);
12399 if (code == 'M')
12400 cond_code = AARCH64_INVERSE_CONDITION_CODE (cond_code);
12401 if (GET_MODE (XEXP (x, 0)) == CC_NZCmode)
12402 fputs (aarch64_sve_condition_codes[cond_code], f);
12403 else
12404 fputs (aarch64_condition_codes[cond_code], f);
12406 break;
12408 case 'N':
12409 if (!const_vec_duplicate_p (x, &elt))
12411 output_operand_lossage ("invalid vector constant");
12412 return;
12415 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
12416 asm_fprintf (f, "%wd", (HOST_WIDE_INT) -UINTVAL (elt));
12417 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
12418 && aarch64_print_vector_float_operand (f, x, true))
12420 else
12422 output_operand_lossage ("invalid vector constant");
12423 return;
12425 break;
12427 case 'b':
12428 case 'h':
12429 case 's':
12430 case 'd':
12431 case 'q':
12432 case 'Z':
12433 code = TOLOWER (code);
12434 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
12436 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
12437 return;
12439 asm_fprintf (f, "%c%d", code, REGNO (x) - V0_REGNUM);
12440 break;
12442 case 'S':
12443 case 'T':
12444 case 'U':
12445 case 'V':
12446 if (!REG_P (x) || (!FP_REGNUM_P (REGNO (x)) && !PR_REGNUM_P (REGNO (x))))
12448 output_operand_lossage ("incompatible operand for '%%%c'", code);
12449 return;
12451 if (PR_REGNUM_P (REGNO (x)))
12452 asm_fprintf (f, "p%d", REGNO (x) - P0_REGNUM + (code - 'S'));
12453 else
12454 asm_fprintf (f, "%c%d",
12455 aarch64_sve_data_mode_p (GET_MODE (x)) ? 'z' : 'v',
12456 REGNO (x) - V0_REGNUM + (code - 'S'));
12457 break;
12459 case 'R':
12460 if (REG_P (x) && FP_REGNUM_P (REGNO (x))
12461 && (aarch64_advsimd_partial_struct_mode_p (GET_MODE (x))))
12462 asm_fprintf (f, "d%d", REGNO (x) - V0_REGNUM + 1);
12463 else if (REG_P (x) && FP_REGNUM_P (REGNO (x)))
12464 asm_fprintf (f, "q%d", REGNO (x) - V0_REGNUM + 1);
12465 else if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
12466 asm_fprintf (f, "x%d", REGNO (x) - R0_REGNUM + 1);
12467 else
12468 output_operand_lossage ("incompatible register operand for '%%%c'",
12469 code);
12470 break;
12472 case 'X':
12473 if (!CONST_INT_P (x))
12475 output_operand_lossage ("invalid operand for '%%%c'", code);
12476 return;
12478 asm_fprintf (f, "0x%wx", UINTVAL (x) & 0xffff);
12479 break;
12481 case 'C':
12483 /* Print a replicated constant in hex. */
12484 if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
12486 output_operand_lossage ("invalid operand for '%%%c'", code);
12487 return;
12489 scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
12490 asm_fprintf (f, "0x%wx", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
12492 break;
12494 case 'D':
12496 /* Print a replicated constant in decimal, treating it as
12497 unsigned. */
12498 if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
12500 output_operand_lossage ("invalid operand for '%%%c'", code);
12501 return;
12503 scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
12504 asm_fprintf (f, "%wd", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
12506 break;
12508 case 'w':
12509 case 'x':
12510 if (aarch64_const_zero_rtx_p (x))
12512 asm_fprintf (f, "%czr", code);
12513 break;
12516 if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
12518 asm_fprintf (f, "%c%d", code, REGNO (x) - R0_REGNUM);
12519 break;
12522 if (REG_P (x) && REGNO (x) == SP_REGNUM)
12524 asm_fprintf (f, "%ssp", code == 'w' ? "w" : "");
12525 break;
12528 /* Fall through */
12530 case 0:
12531 if (x == NULL)
12533 output_operand_lossage ("missing operand");
12534 return;
12537 switch (GET_CODE (x))
12539 case CONST_STRING:
12541 asm_fprintf (f, "%s", XSTR (x, 0));
12542 break;
12544 case REG:
12545 if (aarch64_sve_data_mode_p (GET_MODE (x)))
12547 if (REG_NREGS (x) == 1)
12548 asm_fprintf (f, "z%d", REGNO (x) - V0_REGNUM);
12549 else
12551 char suffix
12552 = sizetochar (GET_MODE_UNIT_BITSIZE (GET_MODE (x)));
12553 asm_fprintf (f, "{z%d.%c - z%d.%c}",
12554 REGNO (x) - V0_REGNUM, suffix,
12555 END_REGNO (x) - V0_REGNUM - 1, suffix);
12558 else
12559 asm_fprintf (f, "%s", reg_names [REGNO (x)]);
12560 break;
12562 case MEM:
12563 output_address (GET_MODE (x), XEXP (x, 0));
12564 break;
12566 case LABEL_REF:
12567 case SYMBOL_REF:
12568 output_addr_const (asm_out_file, x);
12569 break;
12571 case CONST_INT:
12572 asm_fprintf (f, "%wd", INTVAL (x));
12573 break;
12575 case CONST:
12576 if (!VECTOR_MODE_P (GET_MODE (x)))
12578 output_addr_const (asm_out_file, x);
12579 break;
12581 /* fall through */
12583 case CONST_VECTOR:
12584 if (!const_vec_duplicate_p (x, &elt))
12586 output_operand_lossage ("invalid vector constant");
12587 return;
12590 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
12591 asm_fprintf (f, "%wd", INTVAL (elt));
12592 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
12593 && aarch64_print_vector_float_operand (f, x, false))
12595 else
12597 output_operand_lossage ("invalid vector constant");
12598 return;
12600 break;
12602 case CONST_DOUBLE:
12603 /* Since we define TARGET_SUPPORTS_WIDE_INT we shouldn't ever
12604 be getting CONST_DOUBLEs holding integers. */
12605 gcc_assert (GET_MODE (x) != VOIDmode);
12606 if (aarch64_float_const_zero_rtx_p (x))
12608 fputc ('0', f);
12609 break;
12611 else if (aarch64_float_const_representable_p (x))
12613 #define buf_size 20
12614 char float_buf[buf_size] = {'\0'};
12615 real_to_decimal_for_mode (float_buf,
12616 CONST_DOUBLE_REAL_VALUE (x),
12617 buf_size, buf_size,
12618 1, GET_MODE (x));
12619 asm_fprintf (asm_out_file, "%s", float_buf);
12620 break;
12621 #undef buf_size
12623 output_operand_lossage ("invalid constant");
12624 return;
12625 default:
12626 output_operand_lossage ("invalid operand");
12627 return;
12629 break;
12631 case 'A':
12632 if (GET_CODE (x) == HIGH)
12633 x = XEXP (x, 0);
12635 switch (aarch64_classify_symbolic_expression (x))
12637 case SYMBOL_SMALL_GOT_4G:
12638 asm_fprintf (asm_out_file, ":got:");
12639 break;
12641 case SYMBOL_SMALL_TLSGD:
12642 asm_fprintf (asm_out_file, ":tlsgd:");
12643 break;
12645 case SYMBOL_SMALL_TLSDESC:
12646 asm_fprintf (asm_out_file, ":tlsdesc:");
12647 break;
12649 case SYMBOL_SMALL_TLSIE:
12650 asm_fprintf (asm_out_file, ":gottprel:");
12651 break;
12653 case SYMBOL_TLSLE24:
12654 asm_fprintf (asm_out_file, ":tprel:");
12655 break;
12657 case SYMBOL_TINY_GOT:
12658 gcc_unreachable ();
12659 break;
12661 default:
12662 break;
12664 output_addr_const (asm_out_file, x);
12665 break;
12667 case 'L':
12668 switch (aarch64_classify_symbolic_expression (x))
12670 case SYMBOL_SMALL_GOT_4G:
12671 asm_fprintf (asm_out_file, ":got_lo12:");
12672 break;
12674 case SYMBOL_SMALL_TLSGD:
12675 asm_fprintf (asm_out_file, ":tlsgd_lo12:");
12676 break;
12678 case SYMBOL_SMALL_TLSDESC:
12679 asm_fprintf (asm_out_file, ":tlsdesc_lo12:");
12680 break;
12682 case SYMBOL_SMALL_TLSIE:
12683 asm_fprintf (asm_out_file, ":gottprel_lo12:");
12684 break;
12686 case SYMBOL_TLSLE12:
12687 asm_fprintf (asm_out_file, ":tprel_lo12:");
12688 break;
12690 case SYMBOL_TLSLE24:
12691 asm_fprintf (asm_out_file, ":tprel_lo12_nc:");
12692 break;
12694 case SYMBOL_TINY_GOT:
12695 asm_fprintf (asm_out_file, ":got:");
12696 break;
12698 case SYMBOL_TINY_TLSIE:
12699 asm_fprintf (asm_out_file, ":gottprel:");
12700 break;
12702 default:
12703 break;
12705 output_addr_const (asm_out_file, x);
12706 break;
12708 case 'G':
12709 switch (aarch64_classify_symbolic_expression (x))
12711 case SYMBOL_TLSLE24:
12712 asm_fprintf (asm_out_file, ":tprel_hi12:");
12713 break;
12714 default:
12715 break;
12717 output_addr_const (asm_out_file, x);
12718 break;
12720 case 'k':
12722 HOST_WIDE_INT cond_code;
12724 if (!CONST_INT_P (x))
12726 output_operand_lossage ("invalid operand for '%%%c'", code);
12727 return;
12730 cond_code = INTVAL (x);
12731 gcc_assert (cond_code >= 0 && cond_code <= AARCH64_NV);
12732 asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code]);
12734 break;
12736 case 'K':
12737 if (!REG_P (x) || !PR_REGNUM_P (REGNO (x)))
12739 output_operand_lossage ("invalid operand for '%%%c'", code);
12740 return;
12742 asm_fprintf (f, "pn%d", REGNO (x) - P0_REGNUM);
12743 break;
12745 case 'y':
12746 case 'z':
12748 machine_mode mode = GET_MODE (x);
12750 if (!MEM_P (x)
12751 || (code == 'y'
12752 && maybe_ne (GET_MODE_SIZE (mode), 8)
12753 && maybe_ne (GET_MODE_SIZE (mode), 16)
12754 && maybe_ne (GET_MODE_SIZE (mode), 32)))
12756 output_operand_lossage ("invalid operand for '%%%c'", code);
12757 return;
12760 if (!aarch64_print_address_internal (f, mode, XEXP (x, 0),
12761 code == 'y'
12762 ? ADDR_QUERY_LDP_STP_N
12763 : ADDR_QUERY_LDP_STP))
12764 output_operand_lossage ("invalid operand prefix '%%%c'", code);
12766 break;
12768 default:
12769 output_operand_lossage ("invalid operand prefix '%%%c'", code);
12770 return;
12774 /* Print address 'x' of a memory access with mode 'mode'.
12775 TYPE is the aarch64_addr_query_type context required by aarch64_classify_address,
12776 e.g. ADDR_QUERY_ANY for a normal memory access or ADDR_QUERY_LDP_STP for LDP/STP. */
12777 static bool
12778 aarch64_print_address_internal (FILE *f, machine_mode mode, rtx x,
12779 aarch64_addr_query_type type)
12781 struct aarch64_address_info addr;
12782 unsigned int size, vec_flags;
12784 /* Check all addresses are Pmode - including ILP32. */
12785 if (GET_MODE (x) != Pmode
12786 && (!CONST_INT_P (x)
12787 || trunc_int_for_mode (INTVAL (x), Pmode) != INTVAL (x)))
12789 output_operand_lossage ("invalid address mode");
12790 return false;
12793 const bool load_store_pair_p = (type == ADDR_QUERY_LDP_STP
12794 || type == ADDR_QUERY_LDP_STP_N);
12796 if (aarch64_classify_address (&addr, x, mode, true, type))
12797 switch (addr.type)
12799 case ADDRESS_REG_IMM:
12800 if (known_eq (addr.const_offset, 0))
12802 asm_fprintf (f, "[%s]", reg_names[REGNO (addr.base)]);
12803 return true;
12806 vec_flags = aarch64_classify_vector_memory_mode (mode);
12807 if ((vec_flags & VEC_ANY_SVE) && !load_store_pair_p)
12809 HOST_WIDE_INT vnum
12810 = exact_div (addr.const_offset,
12811 aarch64_vl_bytes (mode, vec_flags)).to_constant ();
12812 asm_fprintf (f, "[%s, #%wd, mul vl]",
12813 reg_names[REGNO (addr.base)], vnum);
12814 return true;
12817 if (!CONST_INT_P (addr.offset))
12818 return false;
12820 asm_fprintf (f, "[%s, %wd]", reg_names[REGNO (addr.base)],
12821 INTVAL (addr.offset));
12822 return true;
12824 case ADDRESS_REG_REG:
12825 if (addr.shift == 0)
12826 asm_fprintf (f, "[%s, %s]", reg_names [REGNO (addr.base)],
12827 reg_names [REGNO (addr.offset)]);
12828 else
12829 asm_fprintf (f, "[%s, %s, lsl %u]", reg_names [REGNO (addr.base)],
12830 reg_names [REGNO (addr.offset)], addr.shift);
12831 return true;
12833 case ADDRESS_REG_UXTW:
12834 if (addr.shift == 0)
12835 asm_fprintf (f, "[%s, w%d, uxtw]", reg_names [REGNO (addr.base)],
12836 REGNO (addr.offset) - R0_REGNUM);
12837 else
12838 asm_fprintf (f, "[%s, w%d, uxtw %u]", reg_names [REGNO (addr.base)],
12839 REGNO (addr.offset) - R0_REGNUM, addr.shift);
12840 return true;
12842 case ADDRESS_REG_SXTW:
12843 if (addr.shift == 0)
12844 asm_fprintf (f, "[%s, w%d, sxtw]", reg_names [REGNO (addr.base)],
12845 REGNO (addr.offset) - R0_REGNUM);
12846 else
12847 asm_fprintf (f, "[%s, w%d, sxtw %u]", reg_names [REGNO (addr.base)],
12848 REGNO (addr.offset) - R0_REGNUM, addr.shift);
12849 return true;
12851 case ADDRESS_REG_WB:
12852 /* Writeback is only supported for fixed-width modes. */
12853 size = GET_MODE_SIZE (mode).to_constant ();
12854 switch (GET_CODE (x))
12856 case PRE_INC:
12857 asm_fprintf (f, "[%s, %d]!", reg_names [REGNO (addr.base)], size);
12858 return true;
12859 case POST_INC:
12860 asm_fprintf (f, "[%s], %d", reg_names [REGNO (addr.base)], size);
12861 return true;
12862 case PRE_DEC:
12863 asm_fprintf (f, "[%s, -%d]!", reg_names [REGNO (addr.base)], size);
12864 return true;
12865 case POST_DEC:
12866 asm_fprintf (f, "[%s], -%d", reg_names [REGNO (addr.base)], size);
12867 return true;
12868 case PRE_MODIFY:
12869 asm_fprintf (f, "[%s, %wd]!", reg_names[REGNO (addr.base)],
12870 INTVAL (addr.offset));
12871 return true;
12872 case POST_MODIFY:
12873 asm_fprintf (f, "[%s], %wd", reg_names[REGNO (addr.base)],
12874 INTVAL (addr.offset));
12875 return true;
12876 default:
12877 break;
12879 break;
12881 case ADDRESS_LO_SUM:
12882 asm_fprintf (f, "[%s, #:lo12:", reg_names [REGNO (addr.base)]);
12883 output_addr_const (f, addr.offset);
12884 asm_fprintf (f, "]");
12885 return true;
12887 case ADDRESS_SYMBOLIC:
12888 output_addr_const (f, x);
12889 return true;
12892 return false;
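/* Some example renderings of the cases above (x0/x1 illustrative):
   a register base with zero offset prints as "[x0]"; a base plus
   immediate as "[x0, 16]"; an SVE base plus two vector lengths as
   "[x0, #2, mul vl]"; a scaled register index as "[x0, x1, lsl 3]";
   and a pre-modify writeback as "[x0, 32]!". */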
12895 /* Print address 'x' of a memory access with mode 'mode'. */
12896 static void
12897 aarch64_print_operand_address (FILE *f, machine_mode mode, rtx x)
12899 if (!aarch64_print_address_internal (f, mode, x, ADDR_QUERY_ANY))
12900 output_addr_const (f, x);
12903 /* Implement TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA. */
12905 static bool
12906 aarch64_output_addr_const_extra (FILE *file, rtx x)
12908 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_SALT_ADDR)
12910 output_addr_const (file, XVECEXP (x, 0, 0));
12911 return true;
12913 return false;
12916 bool
12917 aarch64_label_mentioned_p (rtx x)
12919 const char *fmt;
12920 int i;
12922 if (LABEL_REF_P (x))
12923 return true;
12925 /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
12926 referencing instruction, but they are constant offsets, not
12927 symbols. */
12928 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
12929 return false;
12931 fmt = GET_RTX_FORMAT (GET_CODE (x));
12932 for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
12934 if (fmt[i] == 'E')
12936 int j;
12938 for (j = XVECLEN (x, i) - 1; j >= 0; j--)
12939 if (aarch64_label_mentioned_p (XVECEXP (x, i, j)))
12940 return 1;
12942 else if (fmt[i] == 'e' && aarch64_label_mentioned_p (XEXP (x, i)))
12943 return 1;
12946 return 0;
12949 /* Implement REGNO_REG_CLASS. */
12951 enum reg_class
12952 aarch64_regno_regclass (unsigned regno)
12954 if (W8_W11_REGNUM_P (regno))
12955 return W8_W11_REGS;
12957 if (W12_W15_REGNUM_P (regno))
12958 return W12_W15_REGS;
12960 if (STUB_REGNUM_P (regno))
12961 return STUB_REGS;
12963 if (GP_REGNUM_P (regno))
12964 return GENERAL_REGS;
12966 if (regno == SP_REGNUM)
12967 return STACK_REG;
12969 if (regno == FRAME_POINTER_REGNUM
12970 || regno == ARG_POINTER_REGNUM)
12971 return POINTER_REGS;
12973 if (FP_REGNUM_P (regno))
12974 return (FP_LO8_REGNUM_P (regno) ? FP_LO8_REGS
12975 : FP_LO_REGNUM_P (regno) ? FP_LO_REGS : FP_REGS);
12977 if (PR_REGNUM_P (regno))
12978 return PR_LO_REGNUM_P (regno) ? PR_LO_REGS : PR_HI_REGS;
12980 if (regno == FPM_REGNUM)
12981 return MOVEABLE_SYSREGS;
12983 if (regno == FFR_REGNUM || regno == FFRT_REGNUM)
12984 return FFR_REGS;
12986 if (FAKE_REGNUM_P (regno))
12987 return FAKE_REGS;
12989 return NO_REGS;
12992 /* OFFSET is an address offset for mode MODE, which has SIZE bytes.
12993 If OFFSET is out of range, return an offset of an anchor point
12994 that is in range. Return 0 otherwise. */
12996 static HOST_WIDE_INT
12997 aarch64_anchor_offset (HOST_WIDE_INT offset, HOST_WIDE_INT size,
12998 machine_mode mode)
13000 /* Does it look like we'll need a 16-byte load/store-pair operation? */
13001 if (size > 16)
13002 return (offset + 0x400) & ~0x7f0;
13004 /* For offsets that aren't a multiple of the access size, the limit is
13005 -256...255. */
13006 if (offset & (size - 1))
13008 /* BLKmode typically uses LDP of X-registers. */
13009 if (mode == BLKmode)
13010 return (offset + 512) & ~0x3ff;
13011 return (offset + 0x100) & ~0x1ff;
13014 /* Small negative offsets are supported. */
13015 if (IN_RANGE (offset, -256, 0))
13016 return 0;
13018 if (mode == TImode || mode == TFmode || mode == TDmode)
13019 return (offset + 0x100) & ~0x1ff;
13021 /* Use a 12-bit offset, scaled by the access size. */
13022 return offset & (~0xfff * size);
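/* A worked example: for a DImode access (SIZE == 8) at OFFSET 40000,
   the offset is a multiple of the access size, is not small and
   negative, and the mode is not a 16-byte one, so the result is
   40000 & ~0x7fff == 32768.  The caller then rebases the address on an
   anchor at +32768, leaving a residual offset of 7232 == 904 * 8,
   which fits the scaled unsigned 12-bit immediate range of a plain
   LDR/STR. */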
13025 static rtx
13026 aarch64_legitimize_address (rtx x, rtx /* orig_x */, machine_mode mode)
13028 #if TARGET_PECOFF
13029 rtx tmp = legitimize_pe_coff_symbol (x, true);
13030 if (tmp)
13031 return tmp;
13032 #endif
13034 /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
13035 where mask is selected by alignment and size of the offset.
13036 We try to pick as large a range for the offset as possible to
13037 maximize the chance of a CSE. However, for aligned addresses
13038 we limit the range to 4k so that structures with different sized
13039 elements are likely to use the same base. We need to be careful
13040 not to split a CONST for some forms of address expression, otherwise
13041 it will generate sub-optimal code. */
13043 /* First split X + CONST (base, offset) into (base + X) + offset. */
13044 if (GET_CODE (x) == PLUS && GET_CODE (XEXP (x, 1)) == CONST)
13046 poly_int64 offset;
13047 rtx base = strip_offset (XEXP (x, 1), &offset);
13049 base = expand_binop (Pmode, add_optab, base, XEXP (x, 0),
13050 NULL_RTX, true, OPTAB_DIRECT);
13051 x = plus_constant (Pmode, base, offset);
13054 if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1)))
13056 rtx base = XEXP (x, 0);
13057 rtx offset_rtx = XEXP (x, 1);
13058 HOST_WIDE_INT offset = INTVAL (offset_rtx);
13060 if (GET_CODE (base) == PLUS)
13062 rtx op0 = XEXP (base, 0);
13063 rtx op1 = XEXP (base, 1);
13065 /* Force any scaling into a temp for CSE. */
13066 op0 = force_reg (Pmode, op0);
13067 op1 = force_reg (Pmode, op1);
13069 /* Let the pointer register be in op0. */
13070 if (REG_POINTER (op1))
13071 std::swap (op0, op1);
13073 /* If the pointer is virtual or frame related, then we know that
13074 virtual register instantiation or register elimination is going
13075 to apply a second constant. We want the two constants folded
13076 together easily. Therefore, emit as (OP0 + CONST) + OP1. */
13077 if (virt_or_elim_regno_p (REGNO (op0)))
13079 base = expand_binop (Pmode, add_optab, op0, offset_rtx,
13080 NULL_RTX, true, OPTAB_DIRECT);
13081 return gen_rtx_PLUS (Pmode, base, op1);
13084 /* Otherwise, in order to encourage CSE (and thence loop strength
13085 reduce) scaled addresses, emit as (OP0 + OP1) + CONST. */
13086 base = expand_binop (Pmode, add_optab, op0, op1,
13087 NULL_RTX, true, OPTAB_DIRECT);
13088 x = gen_rtx_PLUS (Pmode, base, offset_rtx);
13091 HOST_WIDE_INT size;
13092 if (GET_MODE_SIZE (mode).is_constant (&size))
13094 HOST_WIDE_INT base_offset = aarch64_anchor_offset (offset, size,
13095 mode);
13096 if (base_offset != 0)
13098 base = plus_constant (Pmode, base, base_offset);
13099 base = force_operand (base, NULL_RTX);
13100 return plus_constant (Pmode, base, offset - base_offset);
13105 return x;
13108 static reg_class_t
13109 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x,
13110 reg_class_t rclass,
13111 machine_mode mode,
13112 secondary_reload_info *sri)
13114 /* Use aarch64_sve_reload_mem for SVE memory reloads that cannot use
13115 LDR and STR. See the comment at the head of aarch64-sve.md for
13116 more details about the big-endian handling. */
13117 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
13118 if (reg_class_subset_p (rclass, FP_REGS)
13119 && !((REG_P (x) && HARD_REGISTER_P (x))
13120 || aarch64_simd_valid_mov_imm (x))
13121 && mode != VNx16QImode
13122 && (vec_flags & VEC_SVE_DATA)
13123 && ((vec_flags & VEC_PARTIAL) || BYTES_BIG_ENDIAN))
13125 sri->icode = CODE_FOR_aarch64_sve_reload_mem;
13126 return NO_REGS;
13129 /* If we have to disable direct literal pool loads and stores because the
13130 function is too big, then we need a scratch register. */
13131 if (MEM_P (x) && SYMBOL_REF_P (x) && CONSTANT_POOL_ADDRESS_P (x)
13132 && (SCALAR_FLOAT_MODE_P (GET_MODE (x))
13133 || targetm.vector_mode_supported_p (GET_MODE (x)))
13134 && !aarch64_pcrelative_literal_loads)
13136 sri->icode = code_for_aarch64_reload_movcp (mode, DImode);
13137 return NO_REGS;
13140 /* Without the TARGET_SIMD or TARGET_SVE instructions we cannot move a
13141 Q register to a Q register directly. We need a scratch. */
13142 if (REG_P (x)
13143 && (mode == TFmode
13144 || mode == TImode
13145 || mode == TDmode
13146 || (vec_flags == VEC_ADVSIMD && known_eq (GET_MODE_SIZE (mode), 16)))
13147 && mode == GET_MODE (x)
13148 && !TARGET_SIMD
13149 && FP_REGNUM_P (REGNO (x))
13150 && reg_class_subset_p (rclass, FP_REGS))
13152 sri->icode = code_for_aarch64_reload_mov (mode);
13153 return NO_REGS;
13156 /* A TFmode, TImode or TDmode memory access should be handled via an FP_REGS
13157 because AArch64 has richer addressing modes for LDR/STR instructions
13158 than LDP/STP instructions. */
13159 if (TARGET_FLOAT && rclass == GENERAL_REGS
13160 && known_eq (GET_MODE_SIZE (mode), 16) && MEM_P (x))
13161 return FP_REGS;
13163 if (rclass == FP_REGS
13164 && (mode == TImode || mode == TFmode || mode == TDmode)
13165 && CONSTANT_P (x))
13166 return GENERAL_REGS;
13168 return NO_REGS;
13171 /* Implement TARGET_SECONDARY_MEMORY_NEEDED. */
13173 static bool
13174 aarch64_secondary_memory_needed (machine_mode mode, reg_class_t class1,
13175 reg_class_t class2)
13177 if (!TARGET_SIMD
13178 && reg_classes_intersect_p (class1, FP_REGS)
13179 && reg_classes_intersect_p (class2, FP_REGS))
13181 /* We can't do a 128-bit FPR-to-FPR move without TARGET_SIMD,
13182 so we can't easily split a move involving tuples of 128-bit
13183 vectors. Force the copy through memory instead.
13185 (Tuples of 64-bit vectors are fine.) */
13186 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
13187 if (vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
13188 return true;
13190 return false;
13193 /* Implement TARGET_FRAME_POINTER_REQUIRED. */
13195 static bool
13196 aarch64_frame_pointer_required ()
13198 /* If the function needs to record the incoming value of PSTATE.SM,
13199 make sure that the slot is accessible from the frame pointer. */
13200 return aarch64_need_old_pstate_sm ();
13203 static bool
13204 aarch64_can_eliminate (const int from ATTRIBUTE_UNUSED, const int to)
13206 gcc_assert (from == ARG_POINTER_REGNUM || from == FRAME_POINTER_REGNUM);
13208 /* If we need a frame pointer, ARG_POINTER_REGNUM and FRAME_POINTER_REGNUM
13209 can only eliminate to HARD_FRAME_POINTER_REGNUM. */
13210 if (frame_pointer_needed)
13211 return to == HARD_FRAME_POINTER_REGNUM;
13212 return true;
13215 poly_int64
13216 aarch64_initial_elimination_offset (unsigned from, unsigned to)
13218 aarch64_frame &frame = cfun->machine->frame;
13220 if (to == HARD_FRAME_POINTER_REGNUM)
13222 if (from == ARG_POINTER_REGNUM)
13223 return frame.bytes_above_hard_fp;
13225 if (from == FRAME_POINTER_REGNUM)
13226 return frame.bytes_above_hard_fp - frame.bytes_above_locals;
13229 if (to == STACK_POINTER_REGNUM)
13231 if (from == FRAME_POINTER_REGNUM)
13232 return frame.frame_size - frame.bytes_above_locals;
13235 return frame.frame_size;
13239 /* Get return address without mangling. */
13242 aarch64_return_addr_rtx (void)
13244 rtx val = get_hard_reg_initial_val (Pmode, LR_REGNUM);
13245 /* Note: aarch64_return_address_signing_enabled only
13246 works after cfun->machine->frame.laid_out is set,
13247 so here we don't know if the return address will
13248 be signed or not. */
13249 rtx lr = gen_rtx_REG (Pmode, LR_REGNUM);
13250 emit_move_insn (lr, val);
13251 emit_insn (GEN_FCN (CODE_FOR_xpaclri) ());
13252 return lr;
13256 /* Implement RETURN_ADDR_RTX. We do not support moving back to a
13257 previous frame. */
13260 aarch64_return_addr (int count, rtx frame ATTRIBUTE_UNUSED)
13262 if (count != 0)
13263 return const0_rtx;
13264 return aarch64_return_addr_rtx ();
13267 static void
13268 aarch64_asm_trampoline_template (FILE *f)
13270 /* Even if the current function doesn't have branch protection, some
13271 later function might, so since this template is only generated once
13272 we have to add a BTI just in case. */
13273 asm_fprintf (f, "\thint\t34 // bti c\n");
13275 if (TARGET_ILP32)
13277 asm_fprintf (f, "\tldr\tw%d, .+20\n", IP1_REGNUM - R0_REGNUM);
13278 asm_fprintf (f, "\tldr\tw%d, .+20\n", STATIC_CHAIN_REGNUM - R0_REGNUM);
13280 else
13282 asm_fprintf (f, "\tldr\t%s, .+20\n", reg_names [IP1_REGNUM]);
13283 asm_fprintf (f, "\tldr\t%s, .+24\n", reg_names [STATIC_CHAIN_REGNUM]);
13285 asm_fprintf (f, "\tbr\t%s\n", reg_names [IP1_REGNUM]);
13287 /* We always emit a speculation barrier.
13288 This is because the same trampoline template is used for every nested
13289 function. Since nested functions are not particularly common or
13290 performant we don't worry too much about the extra instructions to copy
13291 around.
13292 This is not yet a problem, since we have not yet implemented function
13293 specific attributes to choose between hardening against straight line
13294 speculation or not, but such function specific attributes are likely to
13295 happen in the future. */
13296 asm_fprintf (f, "\tdsb\tsy\n\tisb\n");
13298 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
13299 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
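/* With the usual register assignments (IP1 == x17, static chain == x18),
   the non-ILP32 template laid out above is roughly:

	0:  hint	34		// bti c
	4:  ldr	x17, .+20	// loads the data word at offset 24
	8:  ldr	x18, .+24	// loads the data word at offset 32
	12: br	x17
	16: dsb	sy
	20: isb
	24: <function address>	// patched by aarch64_trampoline_init
	32: <static chain value>

   which is why aarch64_trampoline_init below copies only the first
   24 bytes of code and writes the two pointer-sized data words
   itself. */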
13302 static void
13303 aarch64_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
13305 rtx fnaddr, mem, a_tramp;
13306 const int tramp_code_sz = 24;
13308 /* We don't need to copy the trailing D-words; we fill those in below. */
13309 /* We create our own memory address in Pmode so that `emit_block_move` can
13310 use parts of the backend which expect Pmode addresses. */
13311 rtx temp = convert_memory_address (Pmode, XEXP (m_tramp, 0));
13312 emit_block_move (gen_rtx_MEM (BLKmode, temp),
13313 assemble_trampoline_template (),
13314 GEN_INT (tramp_code_sz), BLOCK_OP_NORMAL);
13315 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz);
13316 fnaddr = XEXP (DECL_RTL (fndecl), 0);
13317 if (GET_MODE (fnaddr) != ptr_mode)
13318 fnaddr = convert_memory_address (ptr_mode, fnaddr);
13319 emit_move_insn (mem, fnaddr);
13321 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz + POINTER_BYTES);
13322 emit_move_insn (mem, chain_value);
13324 /* XXX We should really define a "clear_cache" pattern and use
13325 gen_clear_cache(). */
13326 a_tramp = XEXP (m_tramp, 0);
13327 maybe_emit_call_builtin___clear_cache (a_tramp,
13328 plus_constant (ptr_mode,
13329 a_tramp,
13330 TRAMPOLINE_SIZE));
13333 static unsigned char
13334 aarch64_class_max_nregs (reg_class_t regclass, machine_mode mode)
13336 /* ??? Logically we should only need to provide a value when
13337 HARD_REGNO_MODE_OK says that at least one register in REGCLASS
13338 can hold MODE, but at the moment we need to handle all modes.
13339 Just ignore any runtime parts for registers that can't store them. */
13340 HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
13341 unsigned int nregs, vec_flags;
13342 switch (regclass)
13344 case W8_W11_REGS:
13345 case W12_W15_REGS:
13346 case STUB_REGS:
13347 case TAILCALL_ADDR_REGS:
13348 case POINTER_REGS:
13349 case GENERAL_REGS:
13350 case ALL_REGS:
13351 case POINTER_AND_FP_REGS:
13352 case FP_REGS:
13353 case FP_LO_REGS:
13354 case FP_LO8_REGS:
13355 vec_flags = aarch64_classify_vector_mode (mode);
13356 if ((vec_flags & VEC_SVE_DATA)
13357 && constant_multiple_p (GET_MODE_SIZE (mode),
13358 aarch64_vl_bytes (mode, vec_flags), &nregs))
13359 return nregs;
13360 if (vec_flags == (VEC_ADVSIMD | VEC_STRUCT | VEC_PARTIAL))
13361 return GET_MODE_SIZE (mode).to_constant () / 8;
13362 return (vec_flags & VEC_ADVSIMD
13363 ? CEIL (lowest_size, UNITS_PER_VREG)
13364 : CEIL (lowest_size, UNITS_PER_WORD));
13366 case PR_REGS:
13367 case PR_LO_REGS:
13368 case PR_HI_REGS:
13369 return mode == VNx64BImode ? 4 : mode == VNx32BImode ? 2 : 1;
13371 case MOVEABLE_SYSREGS:
13372 case STACK_REG:
13373 case FFR_REGS:
13374 case PR_AND_FFR_REGS:
13375 case FAKE_REGS:
13376 return 1;
13378 case NO_REGS:
13379 return 0;
13381 default:
13382 break;
13384 gcc_unreachable ();
13387 static reg_class_t
13388 aarch64_preferred_reload_class (rtx x, reg_class_t regclass)
13390 if (regclass == POINTER_REGS)
13391 return GENERAL_REGS;
13393 if (regclass == STACK_REG)
13395 if (REG_P (x)
13396 && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x)), POINTER_REGS))
13397 return regclass;
13399 return NO_REGS;
13402 /* Register elimination can result in a request for
13403 SP+constant->FP_REGS. We cannot support such operations, which
13404 use SP as the source and an FP_REG as the destination, so reject
13405 them outright. */
13406 if (! reg_class_subset_p (regclass, GENERAL_REGS) && GET_CODE (x) == PLUS)
13408 rtx lhs = XEXP (x, 0);
13410 /* Look through a possible SUBREG introduced by ILP32. */
13411 if (SUBREG_P (lhs))
13412 lhs = SUBREG_REG (lhs);
13414 gcc_assert (REG_P (lhs));
13415 gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs)),
13416 POINTER_REGS));
13417 return NO_REGS;
13420 return regclass;
13423 void
13424 aarch64_asm_output_labelref (FILE* f, const char *name)
13426 asm_fprintf (f, "%U%s", name);
13429 static void
13430 aarch64_elf_asm_constructor (rtx symbol, int priority)
13432 if (priority == DEFAULT_INIT_PRIORITY)
13433 default_ctor_section_asm_out_constructor (symbol, priority);
13434 else
13436 section *s;
13437 /* Priority is known to be in the range [0, 65535], so 18 bytes
13438 would be enough, but the compiler might not know that. To avoid
13439 a -Wformat-truncation false positive, use a larger size. */
13440 char buf[23];
13441 snprintf (buf, sizeof (buf), ".init_array.%.5u", priority);
13442 s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
13443 switch_to_section (s);
13444 assemble_align (POINTER_SIZE);
13445 assemble_aligned_integer (POINTER_BYTES, symbol);
13449 static void
13450 aarch64_elf_asm_destructor (rtx symbol, int priority)
13452 if (priority == DEFAULT_INIT_PRIORITY)
13453 default_dtor_section_asm_out_destructor (symbol, priority);
13454 else
13456 section *s;
13457 /* Priority is known to be in the range [0, 65535], so 18 bytes
13458 would be enough, but the compiler might not know that. To avoid
13459 a -Wformat-truncation false positive, use a larger size. */
13460 char buf[23];
13461 snprintf (buf, sizeof (buf), ".fini_array.%.5u", priority);
13462 s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
13463 switch_to_section (s);
13464 assemble_align (POINTER_SIZE);
13465 assemble_aligned_integer (POINTER_BYTES, symbol);
13469 const char*
13470 aarch64_output_casesi (rtx *operands)
13472 char buf[100];
13473 char label[100];
13474 rtx diff_vec = PATTERN (NEXT_INSN (as_a <rtx_insn *> (operands[2])));
13475 int index;
13476 static const char *const patterns[4][2] =
13479 "ldrb\t%w3, [%0,%w1,uxtw]",
13480 "add\t%3, %4, %w3, sxtb #2"
13483 "ldrh\t%w3, [%0,%w1,uxtw #1]",
13484 "add\t%3, %4, %w3, sxth #2"
13487 "ldr\t%w3, [%0,%w1,uxtw #2]",
13488 "add\t%3, %4, %w3, sxtw #2"
13490 /* We assume that DImode is only generated when not optimizing and
13491 that we don't really need 64-bit address offsets. That would
13492 imply an object file with 8GB of code in a single function! */
13494 "ldr\t%w3, [%0,%w1,uxtw #2]",
13495 "add\t%3, %4, %w3, sxtw #2"
13499 gcc_assert (GET_CODE (diff_vec) == ADDR_DIFF_VEC);
13501 scalar_int_mode mode = as_a <scalar_int_mode> (GET_MODE (diff_vec));
13502 index = exact_log2 (GET_MODE_SIZE (mode));
13504 gcc_assert (index >= 0 && index <= 3);
13506 /* Need to implement table size reduction, by changing the code below. */
13507 output_asm_insn (patterns[index][0], operands);
13508 ASM_GENERATE_INTERNAL_LABEL (label, "Lrtx", CODE_LABEL_NUMBER (operands[2]));
13509 snprintf (buf, sizeof (buf),
13510 "adr\t%%4, %s", targetm.strip_name_encoding (label));
13511 output_asm_insn (buf, operands);
13512 output_asm_insn (patterns[index][1], operands);
13513 output_asm_insn ("br\t%3", operands);
13514 output_asm_insn (aarch64_sls_barrier (aarch64_harden_sls_retbr_p ()),
13515 operands);
13516 assemble_label (asm_out_file, label);
13517 return "";
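/* For a HImode dispatch table (index 1 above), and assuming the operands
   land in x0/w1 with x3/x4 as scratch registers, the emitted sequence is
   along the lines of:

	ldrh	w3, [x0, w1, uxtw #1]
	adr	x4, .Lrtx<N>
	add	x3, x4, w3, sxth #2
	br	x3
	<optional SLS speculation barrier>
   .Lrtx<N>:

   where the loaded table entry is scaled by 4 via the #2 shift before
   being added to the address of the table label. */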
13520 /* Return the asm string for an SME ZERO instruction whose 8-bit mask
13521 operand is MASK. */
13522 const char *
13523 aarch64_output_sme_zero_za (rtx mask)
13525 auto mask_val = UINTVAL (mask);
13526 if (mask_val == 0)
13527 return "zero\t{}";
13529 if (mask_val == 0xff)
13530 return "zero\t{ za }";
13532 static constexpr struct { unsigned char mask; char letter; } tiles[] = {
13533 { 0xff, 'b' },
13534 { 0x55, 'h' },
13535 { 0x11, 's' },
13536 { 0x01, 'd' }
13538 /* The last entry in the list has the form "za7.d }", but that's the
13539 same length as "za7.d, ". */
13540 static char buffer[sizeof("zero\t{ ") + sizeof ("za7.d, ") * 8 + 1];
13541 for (auto &tile : tiles)
13543 unsigned int tile_mask = tile.mask;
13544 unsigned int tile_index = 0;
13545 unsigned int i = snprintf (buffer, sizeof (buffer), "zero\t");
13546 const char *prefix = "{ ";
13547 auto remaining_mask = mask_val;
13548 while (tile_mask < 0x100)
13550 if ((remaining_mask & tile_mask) == tile_mask)
13552 i += snprintf (buffer + i, sizeof (buffer) - i, "%sza%d.%c",
13553 prefix, tile_index, tile.letter);
13554 prefix = ", ";
13555 remaining_mask &= ~tile_mask;
13557 tile_mask <<= 1;
13558 tile_index += 1;
13560 if (remaining_mask == 0)
13562 gcc_assert (i + 3 <= sizeof (buffer));
13563 snprintf (buffer + i, sizeof (buffer) - i, " }");
13564 return buffer;
13567 gcc_unreachable ();
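/* A couple of example expansions of the loop above: a MASK of 0x55
   matches the 'h' tile row exactly and gives "zero\t{ za0.h }", while
   0x05 falls through to the 'd' tiles and gives
   "zero\t{ za0.d, za2.d }". */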
13570 /* Return size in bits of an arithmetic operand which is shifted/scaled and
13571 masked such that it is suitable for a UXTB, UXTH, or UXTW extend
13572 operator. */
13575 aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
13577 if (shift >= 0 && shift <= 4)
13579 int size;
13580 for (size = 8; size <= 32; size *= 2)
13582 HOST_WIDE_INT bits = ((HOST_WIDE_INT)1U << size) - 1;
13583 if (mask == bits << shift)
13584 return size;
13587 return 0;
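/* For instance, aarch64_uxt_size (2, 0x3fc) returns 8, since
   0x3fc == 0xff << 2: the operand behaves as a UXTB extended register
   with an LSL of 2 in an ADD/SUB (extended register) instruction. */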
13590 /* Constant pools are per-function only when PC-relative
13591 literal loads are enabled or we are using the large memory
13592 model. */
13594 static inline bool
13595 aarch64_can_use_per_function_literal_pools_p (void)
13597 return (aarch64_pcrelative_literal_loads
13598 || aarch64_cmodel == AARCH64_CMODEL_LARGE);
13601 static bool
13602 aarch64_use_blocks_for_constant_p (machine_mode, const_rtx)
13604 /* We can't use blocks for constants when we're using a per-function
13605 constant pool. */
13606 return !aarch64_can_use_per_function_literal_pools_p ();
13609 /* Select appropriate section for constants depending
13610 on where we place literal pools. */
13612 static section *
13613 aarch64_select_rtx_section (machine_mode mode,
13614 rtx x,
13615 unsigned HOST_WIDE_INT align)
13617 if (aarch64_can_use_per_function_literal_pools_p ())
13618 return function_section (current_function_decl);
13620 return default_elf_select_rtx_section (mode, x, align);
13623 /* Implement ASM_OUTPUT_POOL_EPILOGUE. */
13624 void
13625 aarch64_asm_output_pool_epilogue (FILE *f, const char *, tree,
13626 HOST_WIDE_INT offset)
13628 /* When using per-function literal pools, we must ensure that any code
13629 section is aligned to the minimal instruction length, lest we get
13630 errors from the assembler re "unaligned instructions". */
13631 if ((offset & 3) && aarch64_can_use_per_function_literal_pools_p ())
13632 ASM_OUTPUT_ALIGN (f, 2);
13635 /* Costs. */
13637 /* Helper function for rtx cost calculation. Strip a shift expression
13638 from X. Returns the inner operand if successful, or the original
13639 expression on failure. */
13640 static rtx
13641 aarch64_strip_shift (rtx x)
13643 rtx op = x;
13645 /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
13646 we can convert both to ROR during final output. */
13647 if ((GET_CODE (op) == ASHIFT
13648 || GET_CODE (op) == ASHIFTRT
13649 || GET_CODE (op) == LSHIFTRT
13650 || GET_CODE (op) == ROTATERT
13651 || GET_CODE (op) == ROTATE)
13652 && CONST_INT_P (XEXP (op, 1)))
13653 return XEXP (op, 0);
13655 if (GET_CODE (op) == MULT
13656 && CONST_INT_P (XEXP (op, 1))
13657 && ((unsigned) exact_log2 (INTVAL (XEXP (op, 1)))) < 64)
13658 return XEXP (op, 0);
13660 return x;
13663 /* Helper function for rtx cost calculation. Strip an extend
13664 expression from X. Returns the inner operand if successful, or the
13665 original expression on failure. We deal with a number of possible
13666 canonicalization variations here. If STRIP_SHIFT is true, then
13667 we can strip off a shift also. */
13668 static rtx
13669 aarch64_strip_extend (rtx x, bool strip_shift)
13671 scalar_int_mode mode;
13672 rtx op = x;
13674 if (!is_a <scalar_int_mode> (GET_MODE (op), &mode))
13675 return op;
13677 if (GET_CODE (op) == AND
13678 && GET_CODE (XEXP (op, 0)) == MULT
13679 && CONST_INT_P (XEXP (XEXP (op, 0), 1))
13680 && CONST_INT_P (XEXP (op, 1))
13681 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op, 0), 1))),
13682 INTVAL (XEXP (op, 1))) != 0)
13683 return XEXP (XEXP (op, 0), 0);
13685 /* Now handle extended register, as this may also have an optional
13686 left shift by 1..4. */
13687 if (strip_shift
13688 && GET_CODE (op) == ASHIFT
13689 && CONST_INT_P (XEXP (op, 1))
13690 && ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) <= 4)
13691 op = XEXP (op, 0);
13693 if (GET_CODE (op) == ZERO_EXTEND
13694 || GET_CODE (op) == SIGN_EXTEND)
13695 op = XEXP (op, 0);
13697 if (op != x)
13698 return op;
13700 return x;
13703 /* Helper function for rtx cost calculation. Strip extension as well as any
13704 inner VEC_SELECT high-half from X. Returns the inner vector operand if
13705 successful, or the original expression on failure. */
13706 static rtx
13707 aarch64_strip_extend_vec_half (rtx x)
13709 if (GET_CODE (x) == ZERO_EXTEND || GET_CODE (x) == SIGN_EXTEND)
13711 x = XEXP (x, 0);
13712 if (GET_CODE (x) == VEC_SELECT
13713 && vec_series_highpart_p (GET_MODE (x), GET_MODE (XEXP (x, 0)),
13714 XEXP (x, 1)))
13715 x = XEXP (x, 0);
13717 return x;
13720 /* Helper function for rtx cost calculation. Strip VEC_DUPLICATE as well as
13721 any subsequent extend and VEC_SELECT from X. Returns the inner scalar
13722 operand if successful, or the original expression on failure. */
13723 static rtx
13724 aarch64_strip_duplicate_vec_elt (rtx x)
13726 if (GET_CODE (x) == VEC_DUPLICATE
13727 && is_a<scalar_mode> (GET_MODE (XEXP (x, 0))))
13729 x = XEXP (x, 0);
13730 if (GET_CODE (x) == VEC_SELECT)
13731 x = XEXP (x, 0);
13732 else if ((GET_CODE (x) == ZERO_EXTEND || GET_CODE (x) == SIGN_EXTEND)
13733 && GET_CODE (XEXP (x, 0)) == VEC_SELECT)
13734 x = XEXP (XEXP (x, 0), 0);
13736 return x;
13739 /* Return true iff CODE is a shift supported in combination
13740 with arithmetic instructions. */
13742 static bool
13743 aarch64_shift_p (enum rtx_code code)
13745 return code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT;
13749 /* Return true iff X is a cheap shift without a sign extend. */
13751 static bool
13752 aarch64_cheap_mult_shift_p (rtx x)
13754 rtx op0, op1;
13756 op0 = XEXP (x, 0);
13757 op1 = XEXP (x, 1);
13759 if (!(aarch64_tune_params.extra_tuning_flags
13760 & AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND))
13761 return false;
13763 if (GET_CODE (op0) == SIGN_EXTEND)
13764 return false;
13766 if (GET_CODE (x) == ASHIFT && CONST_INT_P (op1)
13767 && UINTVAL (op1) <= 4)
13768 return true;
13770 if (GET_CODE (x) != MULT || !CONST_INT_P (op1))
13771 return false;
13773 HOST_WIDE_INT l2 = exact_log2 (INTVAL (op1));
13775 if (l2 > 0 && l2 <= 4)
13776 return true;
13778 return false;
13781 /* Helper function for rtx cost calculation. Calculate the cost of
13782 a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
13783 Return the calculated cost of the expression, recursing manually into
13784 operands where needed. */
13786 static int
13787 aarch64_rtx_mult_cost (rtx x, enum rtx_code code, int outer, bool speed)
13789 rtx op0, op1;
13790 const struct cpu_cost_table *extra_cost
13791 = aarch64_tune_params.insn_extra_cost;
13792 int cost = 0;
13793 bool compound_p = (outer == PLUS || outer == MINUS);
13794 machine_mode mode = GET_MODE (x);
13796 gcc_checking_assert (code == MULT);
13798 op0 = XEXP (x, 0);
13799 op1 = XEXP (x, 1);
13801 if (VECTOR_MODE_P (mode))
13803 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
13804 if (TARGET_SIMD && (vec_flags & VEC_ADVSIMD))
13806 /* The select-operand-high-half versions of the instruction have the
13807 same cost as the three vector version - don't add the costs of the
13808 extension or selection into the costs of the multiply. */
13809 op0 = aarch64_strip_extend_vec_half (op0);
13810 op1 = aarch64_strip_extend_vec_half (op1);
13811 /* The by-element versions of the instruction have the same costs as
13812 the normal 3-vector version. We make an assumption that the input
13813 to the VEC_DUPLICATE is already on the FP & SIMD side. This means
13814 costing of a MUL by element pre RA is a bit optimistic. */
13815 op0 = aarch64_strip_duplicate_vec_elt (op0);
13816 op1 = aarch64_strip_duplicate_vec_elt (op1);
13818 cost += rtx_cost (op0, mode, MULT, 0, speed);
13819 cost += rtx_cost (op1, mode, MULT, 1, speed);
13820 if (speed)
13822 if (GET_CODE (x) == MULT)
13823 cost += extra_cost->vect.mult;
13824 /* This is to catch the SSRA costing currently flowing here. */
13825 else
13826 cost += extra_cost->vect.alu;
13828 return cost;
13831 /* Integer multiply/fma. */
13832 if (GET_MODE_CLASS (mode) == MODE_INT)
13834 /* The multiply will be canonicalized as a shift, so cost it as such. */
13835 if (aarch64_shift_p (GET_CODE (x))
13836 || (CONST_INT_P (op1)
13837 && exact_log2 (INTVAL (op1)) > 0))
13839 bool is_extend = GET_CODE (op0) == ZERO_EXTEND
13840 || GET_CODE (op0) == SIGN_EXTEND;
13841 if (speed)
13843 if (compound_p)
13845 /* If the shift is considered cheap,
13846 then don't add any cost. */
13847 if (aarch64_cheap_mult_shift_p (x))
13849 else if (REG_P (op1))
13850 /* ARITH + shift-by-register. */
13851 cost += extra_cost->alu.arith_shift_reg;
13852 else if (is_extend)
13853 /* ARITH + extended register. We don't have a cost field
13854 for ARITH+EXTEND+SHIFT, so use extend_arith here. */
13855 cost += extra_cost->alu.extend_arith;
13856 else
13857 /* ARITH + shift-by-immediate. */
13858 cost += extra_cost->alu.arith_shift;
13860 else
13861 /* LSL (immediate). */
13862 cost += extra_cost->alu.shift;
13865 /* Strip extends as we will have costed them in the case above. */
13866 if (is_extend)
13867 op0 = aarch64_strip_extend (op0, true);
13869 cost += rtx_cost (op0, VOIDmode, code, 0, speed);
13871 return cost;
13874 /* MNEG or [US]MNEGL. Extract the NEG operand and indicate that it's a
13875 compound and let the below cases handle it. After all, MNEG is a
13876 special-case alias of MSUB. */
13877 if (GET_CODE (op0) == NEG)
13879 op0 = XEXP (op0, 0);
13880 compound_p = true;
13883 /* Integer multiplies or FMAs have zero/sign extending variants. */
13884 if ((GET_CODE (op0) == ZERO_EXTEND
13885 && GET_CODE (op1) == ZERO_EXTEND)
13886 || (GET_CODE (op0) == SIGN_EXTEND
13887 && GET_CODE (op1) == SIGN_EXTEND))
13889 cost += rtx_cost (XEXP (op0, 0), VOIDmode, MULT, 0, speed);
13890 cost += rtx_cost (XEXP (op1, 0), VOIDmode, MULT, 1, speed);
13892 if (speed)
13894 if (compound_p)
13895 /* SMADDL/UMADDL/UMSUBL/SMSUBL. */
13896 cost += extra_cost->mult[0].extend_add;
13897 else
13898 /* MUL/SMULL/UMULL. */
13899 cost += extra_cost->mult[0].extend;
13902 return cost;
13905 /* This is either an integer multiply or a MADD. In both cases
13906 we want to recurse and cost the operands. */
13907 cost += rtx_cost (op0, mode, MULT, 0, speed);
13908 cost += rtx_cost (op1, mode, MULT, 1, speed);
13910 if (speed)
13912 if (compound_p)
13913 /* MADD/MSUB. */
13914 cost += extra_cost->mult[mode == DImode].add;
13915 else
13916 /* MUL. */
13917 cost += extra_cost->mult[mode == DImode].simple;
13920 return cost;
13922 else
13924 if (speed)
13926 /* Floating-point FMA/FMUL can also support negations of the
13927 operands, unless the rounding mode is upward or downward in
13928 which case FNMUL is different than FMUL with operand negation. */
13929 bool neg0 = GET_CODE (op0) == NEG;
13930 bool neg1 = GET_CODE (op1) == NEG;
13931 if (compound_p || !flag_rounding_math || (neg0 && neg1))
13933 if (neg0)
13934 op0 = XEXP (op0, 0);
13935 if (neg1)
13936 op1 = XEXP (op1, 0);
13939 if (compound_p)
13940 /* FMADD/FNMADD/FNMSUB/FMSUB. */
13941 cost += extra_cost->fp[mode == DFmode].fma;
13942 else
13943 /* FMUL/FNMUL. */
13944 cost += extra_cost->fp[mode == DFmode].mult;
13947 cost += rtx_cost (op0, mode, MULT, 0, speed);
13948 cost += rtx_cost (op1, mode, MULT, 1, speed);
13949 return cost;
13953 static int
13954 aarch64_address_cost (rtx x,
13955 machine_mode mode,
13956 addr_space_t as ATTRIBUTE_UNUSED,
13957 bool speed)
13959 enum rtx_code c = GET_CODE (x);
13960 const struct cpu_addrcost_table *addr_cost = aarch64_tune_params.addr_cost;
13961 struct aarch64_address_info info;
13962 int cost = 0;
13963 info.shift = 0;
13965 if (!aarch64_classify_address (&info, x, mode, false))
13967 if (GET_CODE (x) == CONST || SYMBOL_REF_P (x))
13969 /* This is a CONST or SYMBOL ref which will be split
13970 in a different way depending on the code model in use.
13971 Cost it through the generic infrastructure. */
13972 int cost_symbol_ref = rtx_cost (x, Pmode, MEM, 1, speed);
13973 /* Divide through by the cost of one instruction to
13974 bring it to the same units as the address costs. */
13975 cost_symbol_ref /= COSTS_N_INSNS (1);
13976 /* The cost is then the cost of preparing the address,
13977 followed by an immediate (possibly 0) offset. */
13978 return cost_symbol_ref + addr_cost->imm_offset;
13980 else
13982 /* This is most likely a jump table from a case
13983 statement. */
13984 return addr_cost->register_offset;
13988 switch (info.type)
13990 case ADDRESS_LO_SUM:
13991 case ADDRESS_SYMBOLIC:
13992 case ADDRESS_REG_IMM:
13993 cost += addr_cost->imm_offset;
13994 break;
13996 case ADDRESS_REG_WB:
13997 if (c == PRE_INC || c == PRE_DEC || c == PRE_MODIFY)
13998 cost += addr_cost->pre_modify;
13999 else if (c == POST_INC || c == POST_DEC || c == POST_MODIFY)
14001 unsigned int nvectors = aarch64_ldn_stn_vectors (mode);
14002 if (nvectors == 3)
14003 cost += addr_cost->post_modify_ld3_st3;
14004 else if (nvectors == 4)
14005 cost += addr_cost->post_modify_ld4_st4;
14006 else
14007 cost += addr_cost->post_modify;
14009 else
14010 gcc_unreachable ();
14012 break;
14014 case ADDRESS_REG_REG:
14015 cost += addr_cost->register_offset;
14016 break;
14018 case ADDRESS_REG_SXTW:
14019 cost += addr_cost->register_sextend;
14020 break;
14022 case ADDRESS_REG_UXTW:
14023 cost += addr_cost->register_zextend;
14024 break;
14026 default:
14027 gcc_unreachable ();
14031 if (info.shift > 0)
14033 /* For the sake of calculating the cost of the shifted register
14034 component, we can treat same sized modes in the same way. */
14035 if (known_eq (GET_MODE_BITSIZE (mode), 16))
14036 cost += addr_cost->addr_scale_costs.hi;
14037 else if (known_eq (GET_MODE_BITSIZE (mode), 32))
14038 cost += addr_cost->addr_scale_costs.si;
14039 else if (known_eq (GET_MODE_BITSIZE (mode), 64))
14040 cost += addr_cost->addr_scale_costs.di;
14041 else
14042 /* We can't tell, or this is a 128-bit vector. */
14043 cost += addr_cost->addr_scale_costs.ti;
14046 return cost;
14049 /* Return the cost of a branch. If SPEED_P is true then the compiler is
14050 optimizing for speed. If PREDICTABLE_P is true then the branch is predicted
14051 to be taken. */
14054 aarch64_branch_cost (bool speed_p, bool predictable_p)
14056 /* When optimizing for speed, use the cost of unpredictable branches. */
14057 const struct cpu_branch_cost *branch_costs =
14058 aarch64_tune_params.branch_costs;
14060 if (!speed_p || predictable_p)
14061 return branch_costs->predictable;
14062 else
14063 return branch_costs->unpredictable;
14066 /* Return true if X is a zero or sign extract
14067 usable in an ADD or SUB (extended register) instruction. */
14068 static bool
14069 aarch64_rtx_arith_op_extract_p (rtx x)
14071 /* The simple case <ARITH>, XD, XN, XM, [us]xt.
14072 No shift. */
14073 if (GET_CODE (x) == SIGN_EXTEND
14074 || GET_CODE (x) == ZERO_EXTEND)
14075 return REG_P (XEXP (x, 0));
14077 return false;
14080 static bool
14081 aarch64_frint_unspec_p (unsigned int u)
14083 switch (u)
14085 case UNSPEC_FRINTZ:
14086 case UNSPEC_FRINTP:
14087 case UNSPEC_FRINTM:
14088 case UNSPEC_FRINTA:
14089 case UNSPEC_FRINTN:
14090 case UNSPEC_FRINTX:
14091 case UNSPEC_FRINTI:
14092 return true;
14094 default:
14095 return false;
14099 /* Return true iff X is an rtx that will match an extr instruction
14100 i.e. as described in the *extr<mode>5_insn family of patterns.
14101 *RES_OP0 and *RES_OP1 will be set to the operands of the shifts involved
14102 on success and will be NULL_RTX otherwise. */
14104 static bool
14105 aarch64_extr_rtx_p (rtx x, rtx *res_op0, rtx *res_op1)
14107 rtx op0, op1;
14108 scalar_int_mode mode;
14109 if (!is_a <scalar_int_mode> (GET_MODE (x), &mode))
14110 return false;
14112 *res_op0 = NULL_RTX;
14113 *res_op1 = NULL_RTX;
14115 if (GET_CODE (x) != IOR)
14116 return false;
14118 op0 = XEXP (x, 0);
14119 op1 = XEXP (x, 1);
14121 if ((GET_CODE (op0) == ASHIFT && GET_CODE (op1) == LSHIFTRT)
14122 || (GET_CODE (op1) == ASHIFT && GET_CODE (op0) == LSHIFTRT))
14124 /* Canonicalise locally to ashift in op0, lshiftrt in op1. */
14125 if (GET_CODE (op1) == ASHIFT)
14126 std::swap (op0, op1);
14128 if (!CONST_INT_P (XEXP (op0, 1)) || !CONST_INT_P (XEXP (op1, 1)))
14129 return false;
14131 unsigned HOST_WIDE_INT shft_amnt_0 = UINTVAL (XEXP (op0, 1));
14132 unsigned HOST_WIDE_INT shft_amnt_1 = UINTVAL (XEXP (op1, 1));
14134 if (shft_amnt_0 < GET_MODE_BITSIZE (mode)
14135 && shft_amnt_0 + shft_amnt_1 == GET_MODE_BITSIZE (mode))
14137 *res_op0 = XEXP (op0, 0);
14138 *res_op1 = XEXP (op1, 0);
14139 return true;
14143 return false;
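/* As an example, in DImode
     (ior:DI (ashift:DI (reg:DI x1) (const_int 48))
	     (lshiftrt:DI (reg:DI x2) (const_int 16)))
   satisfies the check above (48 + 16 == 64), with *RES_OP0 set to x1
   and *RES_OP1 set to x2; the pattern it feeds would typically be
   output as something like "extr x0, x1, x2, 16" (destination register
   illustrative only). */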
14146 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
14147 storing it in *COST. Result is true if the total cost of the operation
14148 has now been calculated. */
14149 static bool
14150 aarch64_if_then_else_costs (rtx op0, rtx op1, rtx op2, int *cost, bool speed)
14152 rtx inner;
14153 rtx comparator;
14154 enum rtx_code cmpcode;
14155 const struct cpu_cost_table *extra_cost
14156 = aarch64_tune_params.insn_extra_cost;
14158 if (COMPARISON_P (op0))
14160 inner = XEXP (op0, 0);
14161 comparator = XEXP (op0, 1);
14162 cmpcode = GET_CODE (op0);
14164 else
14166 inner = op0;
14167 comparator = const0_rtx;
14168 cmpcode = NE;
14171 if (GET_CODE (op1) == PC || GET_CODE (op2) == PC)
14173 /* Conditional branch. */
14174 if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
14175 return true;
14176 else
14178 if (cmpcode == NE || cmpcode == EQ)
14180 if (comparator == const0_rtx)
14182 /* TBZ/TBNZ/CBZ/CBNZ. */
14183 if (GET_CODE (inner) == ZERO_EXTRACT)
14184 /* TBZ/TBNZ. */
14185 *cost += rtx_cost (XEXP (inner, 0), VOIDmode,
14186 ZERO_EXTRACT, 0, speed);
14187 else
14188 /* CBZ/CBNZ. */
14189 *cost += rtx_cost (inner, VOIDmode, cmpcode, 0, speed);
14191 return true;
14193 if (register_operand (inner, VOIDmode)
14194 && aarch64_imm24 (comparator, VOIDmode))
14196 /* SUB and SUBS. */
14197 *cost += COSTS_N_INSNS (2);
14198 if (speed)
14199 *cost += extra_cost->alu.arith * 2;
14200 return true;
14203 else if (cmpcode == LT || cmpcode == GE)
14205 /* TBZ/TBNZ. */
14206 if (comparator == const0_rtx)
14207 return true;
14211 else if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
14213 /* CCMP. */
14214 if (GET_CODE (op1) == COMPARE)
14216 /* Increase cost of CCMP reg, 0, imm, CC to prefer CMP reg, 0. */
14217 if (XEXP (op1, 1) == const0_rtx)
14218 *cost += 1;
14219 if (speed)
14221 machine_mode mode = GET_MODE (XEXP (op1, 0));
14223 if (GET_MODE_CLASS (mode) == MODE_INT)
14224 *cost += extra_cost->alu.arith;
14225 else
14226 *cost += extra_cost->fp[mode == DFmode].compare;
14228 return true;
14231 /* It's a conditional operation based on the status flags,
14232 so it must be some flavor of CSEL. */
14234 /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL. */
14235 if (GET_CODE (op1) == NEG
14236 || GET_CODE (op1) == NOT
14237 || (GET_CODE (op1) == PLUS && XEXP (op1, 1) == const1_rtx))
14238 op1 = XEXP (op1, 0);
14239 else if (GET_CODE (op1) == ZERO_EXTEND && GET_CODE (op2) == ZERO_EXTEND)
14241 /* CSEL with zero-extension (*cmovdi_insn_uxtw). */
14242 op1 = XEXP (op1, 0);
14243 op2 = XEXP (op2, 0);
14245 else if (GET_CODE (op1) == ZERO_EXTEND && op2 == const0_rtx)
14247 inner = XEXP (op1, 0);
14248 if (GET_CODE (inner) == NEG || GET_CODE (inner) == NOT)
14249 /* CSINV/NEG with zero extend + const 0 (*csinv3_uxtw_insn3). */
14250 op1 = XEXP (inner, 0);
14252 else if (op1 == constm1_rtx || op1 == const1_rtx)
14254 /* Use CSINV or CSINC. */
14255 *cost += rtx_cost (op2, VOIDmode, IF_THEN_ELSE, 2, speed);
14256 return true;
14258 else if (op2 == constm1_rtx || op2 == const1_rtx)
14260 /* Use CSINV or CSINC. */
14261 *cost += rtx_cost (op1, VOIDmode, IF_THEN_ELSE, 1, speed);
14262 return true;
14265 *cost += rtx_cost (op1, VOIDmode, IF_THEN_ELSE, 1, speed);
14266 *cost += rtx_cost (op2, VOIDmode, IF_THEN_ELSE, 2, speed);
14267 return true;
14270 /* We don't know what this is, cost all operands. */
14271 return false;
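/* For reference, a C-level illustration of the conditional-select family
   whose costing is handled above.  These idioms typically map to a single
   CSEL/CSNEG/CSINV/CSINC (exact operand placement and condition inversion
   are decided by the aarch64.md patterns, not shown here); the *_demo names
   are made up for illustration.  */
static long csel_demo  (bool c, long a, long b) { return c ? a : b; }     /* CSEL  */
static long csneg_demo (bool c, long a, long b) { return c ? -a : b; }    /* CSNEG */
static long csinv_demo (bool c, long a, long b) { return c ? ~a : b; }    /* CSINV */
static long csinc_demo (bool c, long a, long b) { return c ? a + 1 : b; } /* CSINC */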
14274 /* Check whether X is a bitfield operation of the form shift + extend that
14275 maps down to a UBFIZ/SBFIZ/UBFX/SBFX instruction. If so, return the
14276 operand to which the bitfield operation is applied. Otherwise return
14277 NULL_RTX. */
14279 static rtx
14280 aarch64_extend_bitfield_pattern_p (rtx x)
14282 rtx_code outer_code = GET_CODE (x);
14283 machine_mode outer_mode = GET_MODE (x);
14285 if (outer_code != ZERO_EXTEND && outer_code != SIGN_EXTEND
14286 && outer_mode != SImode && outer_mode != DImode)
14287 return NULL_RTX;
14289 rtx inner = XEXP (x, 0);
14290 rtx_code inner_code = GET_CODE (inner);
14291 machine_mode inner_mode = GET_MODE (inner);
14292 rtx op = NULL_RTX;
14294 switch (inner_code)
14296 case ASHIFT:
14297 if (CONST_INT_P (XEXP (inner, 1))
14298 && (inner_mode == QImode || inner_mode == HImode))
14299 op = XEXP (inner, 0);
14300 break;
14301 case LSHIFTRT:
14302 if (outer_code == ZERO_EXTEND && CONST_INT_P (XEXP (inner, 1))
14303 && (inner_mode == QImode || inner_mode == HImode))
14304 op = XEXP (inner, 0);
14305 break;
14306 case ASHIFTRT:
14307 if (outer_code == SIGN_EXTEND && CONST_INT_P (XEXP (inner, 1))
14308 && (inner_mode == QImode || inner_mode == HImode))
14309 op = XEXP (inner, 0);
14310 break;
14311 default:
14312 break;
14315 return op;
14318 /* Return true if the mask and a shift amount from an RTX of the form
14319 (x << SHFT_AMNT) & MASK are valid to combine into a UBFIZ instruction of
14320 mode MODE. See the *andim_ashift<mode>_bfiz pattern. */
14322 bool
14323 aarch64_mask_and_shift_for_ubfiz_p (scalar_int_mode mode, rtx mask,
14324 rtx shft_amnt)
14326 return CONST_INT_P (mask) && CONST_INT_P (shft_amnt)
14327 && INTVAL (mask) > 0
14328 && UINTVAL (shft_amnt) < GET_MODE_BITSIZE (mode)
14329 && exact_log2 ((UINTVAL (mask) >> UINTVAL (shft_amnt)) + 1) >= 0
14330 && (UINTVAL (mask)
14331 & ((HOST_WIDE_INT_1U << UINTVAL (shft_amnt)) - 1)) == 0;
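/* A standalone restatement of the predicate above on plain 64-bit values;
   ubfiz_mask_shift_ok is a hypothetical name used only for illustration.
   For example, with a 32-bit mode, SHFT_AMNT = 8 and MASK = 0xff00:
   0xff00 >> 8 == 0xff, 0xff + 1 is a power of two, and no mask bit lies
   below bit 8, so the pair is valid for UBFIZ.  */
static bool
ubfiz_mask_shift_ok (unsigned long long mask, unsigned shift, unsigned bits)
{
  if (mask == 0 || shift >= bits)
    return false;
  /* The shifted-down mask must be a contiguous run of ones starting at
     bit 0, i.e. adding one to it yields a power of two.  */
  unsigned long long down = mask >> shift;
  bool contiguous = ((down + 1) & down) == 0;
  /* No mask bit may sit below the shift amount.  */
  bool no_low_bits = (mask & ((1ULL << shift) - 1)) == 0;
  return contiguous && no_low_bits;
}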
14334 /* Return true if the masks and a shift amount from an RTX of the form
14335 ((x & MASK1) | ((y << SHIFT_AMNT) & MASK2)) are valid to combine into
14336 a BFI instruction of mode MODE. See *aarch64_bfi patterns. */
14338 bool
14339 aarch64_masks_and_shift_for_bfi_p (scalar_int_mode mode,
14340 unsigned HOST_WIDE_INT mask1,
14341 unsigned HOST_WIDE_INT shft_amnt,
14342 unsigned HOST_WIDE_INT mask2)
14344 unsigned HOST_WIDE_INT t;
14346 /* Verify that there is no overlap in what bits are set in the two masks. */
14347 if (mask1 != ~mask2)
14348 return false;
14350 /* Verify that mask2 is not all zeros or ones. */
14351 if (mask2 == 0 || mask2 == HOST_WIDE_INT_M1U)
14352 return false;
14354 /* The shift amount should always be less than the mode size. */
14355 gcc_assert (shft_amnt < GET_MODE_BITSIZE (mode));
14357 /* Verify that the mask being shifted is contiguous and would be in the
14358 least significant bits after shifting by shft_amnt. */
14359 t = mask2 + (HOST_WIDE_INT_1U << shft_amnt);
14360 return (t == (t & -t));
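/* The final check above relies on a small bit trick: adding
   (1 << SHFT_AMNT) to a mask that is a contiguous run of ones starting at
   bit SHFT_AMNT carries all the way up and leaves a single set bit, so the
   sum is a power of two (t == (t & -t)).  A hypothetical standalone
   version, with bfi_mask2_ok as an illustrative name:  */
static bool
bfi_mask2_ok (unsigned long long mask2, unsigned shift)
{
  unsigned long long t = mask2 + (1ULL << shift);
  return t == (t & -t);  /* True iff t is zero or a power of two.  */
}

/* e.g. shift = 8, mask2 = 0xff00 gives t = 0xff00 + 0x100 = 0x10000, a power
   of two, so the pair can form a BFI; shift = 8, mask2 = 0xf0f00 fails.  */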
14363 /* Return true if X is an RTX representing an operation in the ABD family
14364 of instructions. */
14366 static bool
14367 aarch64_abd_rtx_p (rtx x)
14369 if (GET_CODE (x) != MINUS)
14370 return false;
14371 rtx max_arm = XEXP (x, 0);
14372 rtx min_arm = XEXP (x, 1);
14373 if (GET_CODE (max_arm) != SMAX && GET_CODE (max_arm) != UMAX)
14374 return false;
14375 bool signed_p = GET_CODE (max_arm) == SMAX;
14376 if (signed_p && GET_CODE (min_arm) != SMIN)
14377 return false;
14378 else if (!signed_p && GET_CODE (min_arm) != UMIN)
14379 return false;
14381 rtx maxop0 = XEXP (max_arm, 0);
14382 rtx maxop1 = XEXP (max_arm, 1);
14383 rtx minop0 = XEXP (min_arm, 0);
14384 rtx minop1 = XEXP (min_arm, 1);
14385 return rtx_equal_p (maxop0, minop0) && rtx_equal_p (maxop1, minop1);
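/* What the test above recognises, written as a scalar C sketch (the real
   check is on vector RTL); scalar_uabd is an illustrative name only.  */
static unsigned
scalar_uabd (unsigned a, unsigned b)
{
  unsigned mx = a > b ? a : b;  /* UMAX (a, b)  */
  unsigned mn = a > b ? b : a;  /* UMIN (a, b)  */
  return mx - mn;               /* (minus (umax a b) (umin a b)) == |a - b|  */
}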
14388 /* Calculate the cost of calculating X, storing it in *COST. Result
14389 is true if the total cost of the operation has now been calculated. */
14390 static bool
14391 aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED,
14392 int param ATTRIBUTE_UNUSED, int *cost, bool speed)
14394 rtx op0, op1, op2;
14395 const struct cpu_cost_table *extra_cost
14396 = aarch64_tune_params.insn_extra_cost;
14397 rtx_code code = GET_CODE (x);
14398 scalar_int_mode int_mode;
14400 /* By default, assume that everything has equivalent cost to the
14401 cheapest instruction. Any additional costs are applied as a delta
14402 above this default. */
14403 *cost = COSTS_N_INSNS (1);
14405 switch (code)
14407 case SET:
14408 /* The cost depends entirely on the operands to SET. */
14409 *cost = 0;
14410 op0 = SET_DEST (x);
14411 op1 = SET_SRC (x);
14413 switch (GET_CODE (op0))
14415 case MEM:
14416 if (speed)
14418 rtx address = XEXP (op0, 0);
14419 if (VECTOR_MODE_P (mode))
14420 *cost += extra_cost->ldst.storev;
14421 else if (GET_MODE_CLASS (mode) == MODE_INT)
14422 *cost += extra_cost->ldst.store;
14423 else if (mode == SFmode || mode == SDmode)
14424 *cost += extra_cost->ldst.storef;
14425 else if (mode == DFmode || mode == DDmode)
14426 *cost += extra_cost->ldst.stored;
14428 *cost +=
14429 COSTS_N_INSNS (aarch64_address_cost (address, mode,
14430 0, speed));
14433 *cost += rtx_cost (op1, mode, SET, 1, speed);
14434 return true;
14436 case SUBREG:
14437 if (! REG_P (SUBREG_REG (op0)))
14438 *cost += rtx_cost (SUBREG_REG (op0), VOIDmode, SET, 0, speed);
14440 /* Fall through. */
14441 case REG:
14442 /* The cost is one per vector-register copied. */
14443 if (VECTOR_MODE_P (GET_MODE (op0)) && REG_P (op1))
14445 int nregs = aarch64_hard_regno_nregs (V0_REGNUM, GET_MODE (op0));
14446 *cost = COSTS_N_INSNS (nregs);
14448 /* const0_rtx is in general free, but we will use an
14449 instruction to set a register to 0. */
14450 else if (REG_P (op1) || op1 == const0_rtx)
14452 /* The cost is 1 per register copied. */
14453 int nregs = aarch64_hard_regno_nregs (R0_REGNUM, GET_MODE (op0));
14454 *cost = COSTS_N_INSNS (nregs);
14456 else
14457 /* Cost is just the cost of the RHS of the set. */
14458 *cost += rtx_cost (op1, mode, SET, 1, speed);
14459 return true;
14461 case ZERO_EXTRACT:
14462 case SIGN_EXTRACT:
14463 /* Bit-field insertion. Strip any redundant widening of
14464 the RHS to meet the width of the target. */
14465 if (SUBREG_P (op1))
14466 op1 = SUBREG_REG (op1);
14467 if ((GET_CODE (op1) == ZERO_EXTEND
14468 || GET_CODE (op1) == SIGN_EXTEND)
14469 && CONST_INT_P (XEXP (op0, 1))
14470 && is_a <scalar_int_mode> (GET_MODE (XEXP (op1, 0)), &int_mode)
14471 && GET_MODE_BITSIZE (int_mode) >= INTVAL (XEXP (op0, 1)))
14472 op1 = XEXP (op1, 0);
14474 if (CONST_INT_P (op1))
14476 /* MOV immediate is assumed to always be cheap. */
14477 *cost = COSTS_N_INSNS (1);
14479 else
14481 /* BFM. */
14482 if (speed)
14483 *cost += extra_cost->alu.bfi;
14484 *cost += rtx_cost (op1, VOIDmode, code, 1, speed);
14487 return true;
14489 default:
14490 /* We can't make sense of this, assume default cost. */
14491 *cost = COSTS_N_INSNS (1);
14492 return false;
14494 return false;
14496 case CONST_INT:
14497 /* If an instruction can incorporate a constant within the
14498 instruction, the instruction's expression avoids calling
14499 rtx_cost() on the constant. If rtx_cost() is called on a
14500 constant, then it is usually because the constant must be
14501 moved into a register by one or more instructions.
14503 The exception is constant 0, which can be expressed
14504 as XZR/WZR and is therefore free. The exception to this is
14505 if we have (set (reg) (const0_rtx)) in which case we must cost
14506 the move. However, we can catch that when we cost the SET, so
14507 we don't need to consider that here. */
14508 if (x == const0_rtx)
14509 *cost = 0;
14510 else
14512 /* To an approximation, building any other constant is
14513 proportionally expensive to the number of instructions
14514 required to build that constant. This is true whether we
14515 are compiling for SPEED or otherwise. */
14516 machine_mode imode = known_le (GET_MODE_SIZE (mode), 4)
14517 ? SImode : DImode;
14518 *cost = COSTS_N_INSNS (aarch64_internal_mov_immediate
14519 (NULL_RTX, x, false, imode));
14521 return true;
14523 case CONST_DOUBLE:
14525 /* First determine number of instructions to do the move
14526 as an integer constant. */
14527 if (!aarch64_float_const_representable_p (x)
14528 && !aarch64_can_const_movi_rtx_p (x, mode)
14529 && aarch64_float_const_rtx_p (x))
14531 unsigned HOST_WIDE_INT ival;
14532 bool succeed = aarch64_reinterpret_float_as_int (x, &ival);
14533 gcc_assert (succeed);
14535 machine_mode imode = known_eq (GET_MODE_SIZE (mode), 8)
14536 ? DImode : SImode;
14537 int ncost = aarch64_internal_mov_immediate
14538 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
14539 *cost += COSTS_N_INSNS (ncost);
14540 return true;
14543 if (speed)
14545 /* mov[df,sf]_aarch64. */
14546 if (aarch64_float_const_representable_p (x))
14547 /* FMOV (scalar immediate). */
14548 *cost += extra_cost->fp[mode == DFmode || mode == DDmode].fpconst;
14549 else if (!aarch64_float_const_zero_rtx_p (x))
14551 /* This will be a load from memory. */
14552 if (mode == DFmode || mode == DDmode)
14553 *cost += extra_cost->ldst.loadd;
14554 else
14555 *cost += extra_cost->ldst.loadf;
14557 else
14558 /* Otherwise this is +0.0. We get this using MOVI d0, #0
14559 or MOV v0.s[0], wzr - neither of which is modeled by the
14560 cost tables. Just use the default cost. */
14565 return true;
14567 case MEM:
14568 if (speed)
14570 /* For loads we want the base cost of a load, plus an
14571 approximation for the additional cost of the addressing
14572 mode. */
14573 rtx address = XEXP (x, 0);
14574 if (VECTOR_MODE_P (mode))
14575 *cost += extra_cost->ldst.loadv;
14576 else if (GET_MODE_CLASS (mode) == MODE_INT)
14577 *cost += extra_cost->ldst.load;
14578 else if (mode == SFmode || mode == SDmode)
14579 *cost += extra_cost->ldst.loadf;
14580 else if (mode == DFmode || mode == DDmode)
14581 *cost += extra_cost->ldst.loadd;
14583 *cost +=
14584 COSTS_N_INSNS (aarch64_address_cost (address, mode,
14585 0, speed));
14588 return true;
14590 case NEG:
14591 op0 = XEXP (x, 0);
14593 if (VECTOR_MODE_P (mode))
14595 /* Many vector comparison operations are represented as NEG
14596 of a comparison. */
14597 if (COMPARISON_P (op0))
14599 rtx op00 = XEXP (op0, 0);
14600 rtx op01 = XEXP (op0, 1);
14601 machine_mode inner_mode = GET_MODE (op00);
14602 /* FACGE/FACGT. */
14603 if (GET_MODE_CLASS (inner_mode) == MODE_VECTOR_FLOAT
14604 && GET_CODE (op00) == ABS
14605 && GET_CODE (op01) == ABS)
14607 op00 = XEXP (op00, 0);
14608 op01 = XEXP (op01, 0);
14610 *cost += rtx_cost (op00, inner_mode, GET_CODE (op0), 0, speed);
14611 *cost += rtx_cost (op01, inner_mode, GET_CODE (op0), 1, speed);
14612 if (speed)
14613 *cost += extra_cost->vect.alu;
14614 return true;
14616 if (speed)
14618 /* FNEG. */
14619 *cost += extra_cost->vect.alu;
14621 return false;
14624 if (GET_MODE_CLASS (mode) == MODE_INT)
14626 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
14627 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
14629 /* CSETM. */
14630 *cost += rtx_cost (XEXP (op0, 0), VOIDmode, NEG, 0, speed);
14631 return true;
14634 /* Cost this as SUB wzr, X. */
14635 op0 = CONST0_RTX (mode);
14636 op1 = XEXP (x, 0);
14637 goto cost_minus;
14640 if (GET_MODE_CLASS (mode) == MODE_FLOAT)
14642 /* Support (neg(fma...)) as a single instruction only if
14643 sign of zeros is unimportant. This matches the decision
14644 making in aarch64.md. */
14645 if (GET_CODE (op0) == FMA && !HONOR_SIGNED_ZEROS (GET_MODE (op0)))
14647 /* FNMADD. */
14648 *cost = rtx_cost (op0, mode, NEG, 0, speed);
14649 return true;
14651 if (GET_CODE (op0) == MULT)
14653 /* FNMUL. */
14654 *cost = rtx_cost (op0, mode, NEG, 0, speed);
14655 return true;
14657 if (speed)
14658 /* FNEG. */
14659 *cost += extra_cost->fp[mode == DFmode].neg;
14660 return false;
14663 return false;
14665 case CLRSB:
14666 case CLZ:
14667 if (speed)
14669 if (VECTOR_MODE_P (mode))
14670 *cost += extra_cost->vect.alu;
14671 else
14672 *cost += extra_cost->alu.clz;
14675 return false;
14677 case CTZ:
14678 if (VECTOR_MODE_P (mode))
14680 *cost = COSTS_N_INSNS (3);
14681 if (speed)
14682 *cost += extra_cost->vect.alu * 3;
14684 else if (TARGET_CSSC)
14686 *cost = COSTS_N_INSNS (1);
14687 if (speed)
14688 *cost += extra_cost->alu.clz;
14690 else
14692 *cost = COSTS_N_INSNS (2);
14693 if (speed)
14694 *cost += extra_cost->alu.clz + extra_cost->alu.rev;
14696 return false;
14698 case COMPARE:
14699 op0 = XEXP (x, 0);
14700 op1 = XEXP (x, 1);
14702 if (op1 == const0_rtx
14703 && GET_CODE (op0) == AND)
14705 x = op0;
14706 mode = GET_MODE (op0);
14707 goto cost_logic;
14710 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT)
14712 /* TODO: A write to the CC flags possibly costs extra, this
14713 needs encoding in the cost tables. */
14715 mode = GET_MODE (op0);
14716 /* ANDS. */
14717 if (GET_CODE (op0) == AND)
14719 x = op0;
14720 goto cost_logic;
14723 if (GET_CODE (op0) == PLUS)
14725 /* ADDS (and CMN alias). */
14726 x = op0;
14727 goto cost_plus;
14730 if (GET_CODE (op0) == MINUS)
14732 /* SUBS. */
14733 x = op0;
14734 goto cost_minus;
14737 if (GET_CODE (op0) == ZERO_EXTRACT && op1 == const0_rtx
14738 && GET_MODE (x) == CC_NZmode && CONST_INT_P (XEXP (op0, 1))
14739 && CONST_INT_P (XEXP (op0, 2)))
14741 /* COMPARE of ZERO_EXTRACT form of TST-immediate.
14742 Handle it here directly rather than going to cost_logic
14743 since we know the immediate generated for the TST is valid
14744 so we can avoid creating an intermediate rtx for it only
14745 for costing purposes. */
14746 if (speed)
14747 *cost += extra_cost->alu.logical;
14749 *cost += rtx_cost (XEXP (op0, 0), GET_MODE (op0),
14750 ZERO_EXTRACT, 0, speed);
14751 return true;
14754 if (GET_CODE (op1) == NEG)
14756 /* CMN. */
14757 if (speed)
14758 *cost += extra_cost->alu.arith;
14760 *cost += rtx_cost (op0, mode, COMPARE, 0, speed);
14761 *cost += rtx_cost (XEXP (op1, 0), mode, NEG, 1, speed);
14762 return true;
14765 /* CMP.
14767 Compare can freely swap the order of operands, and
14768 canonicalization puts the more complex operation first.
14769 But the integer MINUS logic expects the shift/extend
14770 operation in op1. */
14771 if (! (REG_P (op0)
14772 || (SUBREG_P (op0) && REG_P (SUBREG_REG (op0)))))
14774 op0 = XEXP (x, 1);
14775 op1 = XEXP (x, 0);
14777 goto cost_minus;
14780 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_FLOAT)
14782 /* FCMP. */
14783 if (speed)
14784 *cost += extra_cost->fp[mode == DFmode].compare;
14786 if (CONST_DOUBLE_P (op1) && aarch64_float_const_zero_rtx_p (op1))
14788 *cost += rtx_cost (op0, VOIDmode, COMPARE, 0, speed);
14789 /* FCMP supports constant 0.0 for no extra cost. */
14790 return true;
14792 return false;
14795 if (VECTOR_MODE_P (mode))
14797 /* Vector compare. */
14798 if (speed)
14799 *cost += extra_cost->vect.alu;
14801 if (aarch64_float_const_zero_rtx_p (op1))
14803 /* Vector cm (eq|ge|gt|lt|le) supports constant 0.0 for no extra
14804 cost. */
14805 return true;
14807 return false;
14809 return false;
14811 case MINUS:
14813 op0 = XEXP (x, 0);
14814 op1 = XEXP (x, 1);
14816 cost_minus:
14817 if (VECTOR_MODE_P (mode))
14819 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
14820 if (TARGET_SIMD && (vec_flags & VEC_ADVSIMD))
14822 /* Recognise the SABD and UABD operation here.
14823 Recursion from the PLUS case will catch the accumulating
14824 forms. */
14825 if (aarch64_abd_rtx_p (x))
14827 if (speed)
14828 *cost += extra_cost->vect.alu;
14829 return true;
14831 /* SUBL2 and SUBW2.
14832 The select-operand-high-half versions of the sub instruction
14833 have the same cost as the regular three-register vector version -
14834 don't add the costs of the select into the costs of the sub.
14836 op0 = aarch64_strip_extend_vec_half (op0);
14837 op1 = aarch64_strip_extend_vec_half (op1);
14841 *cost += rtx_cost (op0, mode, MINUS, 0, speed);
14843 /* Detect valid immediates. */
14844 if ((GET_MODE_CLASS (mode) == MODE_INT
14845 || (GET_MODE_CLASS (mode) == MODE_CC
14846 && GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT))
14847 && CONST_INT_P (op1)
14848 && aarch64_uimm12_shift (INTVAL (op1)))
14850 if (speed)
14851 /* SUB(S) (immediate). */
14852 *cost += extra_cost->alu.arith;
14853 return true;
14856 /* Look for SUB (extended register). */
14857 if (is_a <scalar_int_mode> (mode)
14858 && aarch64_rtx_arith_op_extract_p (op1))
14860 if (speed)
14861 *cost += extra_cost->alu.extend_arith;
14863 op1 = aarch64_strip_extend (op1, true);
14864 *cost += rtx_cost (op1, VOIDmode, GET_CODE (op1), 0, speed);
14865 return true;
14868 rtx new_op1 = aarch64_strip_extend (op1, false);
14870 /* Cost this as an FMA-alike operation. */
14871 if ((GET_CODE (new_op1) == MULT
14872 || aarch64_shift_p (GET_CODE (new_op1)))
14873 && code != COMPARE)
14875 *cost += aarch64_rtx_mult_cost (new_op1, MULT, code, speed);
14876 return true;
14879 *cost += rtx_cost (new_op1, VOIDmode, MINUS, 1, speed);
14881 if (speed)
14883 if (VECTOR_MODE_P (mode))
14885 /* Vector SUB. */
14886 *cost += extra_cost->vect.alu;
14888 else if (GET_MODE_CLASS (mode) == MODE_INT)
14890 /* SUB(S). */
14891 *cost += extra_cost->alu.arith;
14893 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
14895 /* FSUB. */
14896 *cost += extra_cost->fp[mode == DFmode].addsub;
14899 return true;
14902 case PLUS:
14904 rtx new_op0;
14906 op0 = XEXP (x, 0);
14907 op1 = XEXP (x, 1);
14909 cost_plus:
14910 if (VECTOR_MODE_P (mode))
14912 /* ADDL2 and ADDW2. */
14913 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
14914 if (TARGET_SIMD && (vec_flags & VEC_ADVSIMD))
14916 /* The select-operand-high-half versions of the add instruction
14917 have the same cost as the regular three-register vector version -
14918 don't add the costs of the select into the costs of the add.
14920 op0 = aarch64_strip_extend_vec_half (op0);
14921 op1 = aarch64_strip_extend_vec_half (op1);
14925 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
14926 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
14928 /* CSINC. */
14929 *cost += rtx_cost (XEXP (op0, 0), mode, PLUS, 0, speed);
14930 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
14931 return true;
14934 if (GET_MODE_CLASS (mode) == MODE_INT
14935 && (aarch64_plus_immediate (op1, mode)
14936 || aarch64_sve_addvl_addpl_immediate (op1, mode)))
14938 *cost += rtx_cost (op0, mode, PLUS, 0, speed);
14940 if (speed)
14942 /* ADD (immediate). */
14943 *cost += extra_cost->alu.arith;
14945 /* Some tunings prefer to not use the VL-based scalar ops.
14946 Increase the cost of the poly immediate to prevent their
14947 formation. */
14948 if (GET_CODE (op1) == CONST_POLY_INT
14949 && (aarch64_tune_params.extra_tuning_flags
14950 & AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS))
14951 *cost += COSTS_N_INSNS (1);
14953 return true;
14956 if (aarch64_pluslong_immediate (op1, mode))
14958 /* 24-bit add in 2 instructions or 12-bit shifted add. */
14959 if ((INTVAL (op1) & 0xfff) != 0)
14960 *cost += COSTS_N_INSNS (1);
14962 *cost += rtx_cost (op0, mode, PLUS, 0, speed);
14963 return true;
14966 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
14968 /* Look for ADD (extended register). */
14969 if (is_a <scalar_int_mode> (mode)
14970 && aarch64_rtx_arith_op_extract_p (op0))
14972 if (speed)
14973 *cost += extra_cost->alu.extend_arith;
14975 op0 = aarch64_strip_extend (op0, true);
14976 *cost += rtx_cost (op0, VOIDmode, GET_CODE (op0), 0, speed);
14977 return true;
14980 /* Strip any extend, leave shifts behind as we will
14981 cost them through mult_cost. */
14982 new_op0 = aarch64_strip_extend (op0, false);
14984 if (GET_CODE (new_op0) == MULT
14985 || aarch64_shift_p (GET_CODE (new_op0)))
14987 *cost += aarch64_rtx_mult_cost (new_op0, MULT, PLUS,
14988 speed);
14989 return true;
14992 *cost += rtx_cost (new_op0, VOIDmode, PLUS, 0, speed);
14994 if (speed)
14996 if (VECTOR_MODE_P (mode))
14998 /* Vector ADD. */
14999 *cost += extra_cost->vect.alu;
15001 else if (GET_MODE_CLASS (mode) == MODE_INT)
15003 /* ADD. */
15004 *cost += extra_cost->alu.arith;
15006 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
15008 /* FADD. */
15009 *cost += extra_cost->fp[mode == DFmode].addsub;
15012 return true;
15015 case BITREVERSE:
15016 case BSWAP:
15017 *cost = COSTS_N_INSNS (1);
15019 if (speed)
15021 if (VECTOR_MODE_P (mode))
15022 *cost += extra_cost->vect.alu;
15023 else
15024 *cost += extra_cost->alu.rev;
15026 return false;
15028 case IOR:
15029 if (aarch_rev16_p (x))
15031 *cost = COSTS_N_INSNS (1);
15033 if (speed)
15035 if (VECTOR_MODE_P (mode))
15036 *cost += extra_cost->vect.alu;
15037 else
15038 *cost += extra_cost->alu.rev;
15040 return true;
15043 if (aarch64_extr_rtx_p (x, &op0, &op1))
15045 *cost += rtx_cost (op0, mode, IOR, 0, speed);
15046 *cost += rtx_cost (op1, mode, IOR, 1, speed);
15047 if (speed)
15048 *cost += extra_cost->alu.shift;
15050 return true;
15052 /* Fall through. */
15053 case XOR:
15054 case AND:
15055 cost_logic:
15056 op0 = XEXP (x, 0);
15057 op1 = XEXP (x, 1);
15059 if (VECTOR_MODE_P (mode))
15061 if (speed)
15062 *cost += extra_cost->vect.alu;
15063 return true;
15066 if (code == AND
15067 && GET_CODE (op0) == MULT
15068 && CONST_INT_P (XEXP (op0, 1))
15069 && CONST_INT_P (op1)
15070 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0, 1))),
15071 INTVAL (op1)) != 0)
15073 /* This is a UBFM/SBFM. */
15074 *cost += rtx_cost (XEXP (op0, 0), mode, ZERO_EXTRACT, 0, speed);
15075 if (speed)
15076 *cost += extra_cost->alu.bfx;
15077 return true;
15080 if (is_int_mode (mode, &int_mode))
15082 if (CONST_INT_P (op1))
15084 /* We have a mask + shift version of a UBFIZ
15085 i.e. the *andim_ashift<mode>_bfiz pattern. */
15086 if (GET_CODE (op0) == ASHIFT
15087 && aarch64_mask_and_shift_for_ubfiz_p (int_mode, op1,
15088 XEXP (op0, 1)))
15090 *cost += rtx_cost (XEXP (op0, 0), int_mode, code, 0, speed);
15091 if (speed)
15092 *cost += extra_cost->alu.bfx;
15094 return true;
15096 else if (aarch64_bitmask_imm (INTVAL (op1), int_mode))
15098 /* We possibly get the immediate for free; this is not
15099 modelled. */
15100 *cost += rtx_cost (op0, int_mode, code, 0, speed);
15101 if (speed)
15102 *cost += extra_cost->alu.logical;
15104 return true;
15107 else
15109 rtx new_op0 = op0;
15111 /* Handle ORN, EON, or BIC. */
15112 if (GET_CODE (op0) == NOT)
15113 op0 = XEXP (op0, 0);
15115 new_op0 = aarch64_strip_shift (op0);
15117 /* If we had a shift on op0 then this is a logical-shift-
15118 by-register/immediate operation. Otherwise, this is just
15119 a logical operation. */
15120 if (speed)
15122 if (new_op0 != op0)
15124 /* Shift by immediate. */
15125 if (CONST_INT_P (XEXP (op0, 1)))
15126 *cost += extra_cost->alu.log_shift;
15127 else
15128 *cost += extra_cost->alu.log_shift_reg;
15130 else
15131 *cost += extra_cost->alu.logical;
15134 /* In both cases we want to cost both operands. */
15135 *cost += rtx_cost (new_op0, int_mode, code, 0, speed);
15136 *cost += rtx_cost (op1, int_mode, code, 1, speed);
15138 return true;
15141 return false;
15143 case NOT:
15144 x = XEXP (x, 0);
15145 op0 = aarch64_strip_shift (x);
15147 if (VECTOR_MODE_P (mode))
15149 /* Vector NOT. */
15150 *cost += extra_cost->vect.alu;
15151 return false;
15154 /* MVN-shifted-reg. */
15155 if (op0 != x)
15157 *cost += rtx_cost (op0, mode, code, 0, speed);
15159 if (speed)
15160 *cost += extra_cost->alu.log_shift;
15162 return true;
15164 /* EON can have two forms: (xor (not a) b) but also (not (xor a b)).
15165 Handle the second form here taking care that 'a' in the above can
15166 be a shift. */
15167 else if (GET_CODE (op0) == XOR)
15169 rtx newop0 = XEXP (op0, 0);
15170 rtx newop1 = XEXP (op0, 1);
15171 rtx op0_stripped = aarch64_strip_shift (newop0);
15173 *cost += rtx_cost (newop1, mode, code, 1, speed);
15174 *cost += rtx_cost (op0_stripped, mode, XOR, 0, speed);
15176 if (speed)
15178 if (op0_stripped != newop0)
15179 *cost += extra_cost->alu.log_shift;
15180 else
15181 *cost += extra_cost->alu.logical;
15184 return true;
15186 /* MVN. */
15187 if (speed)
15188 *cost += extra_cost->alu.logical;
15190 return false;
15192 case ZERO_EXTEND:
15194 op0 = XEXP (x, 0);
15195 /* If a value is written in SI mode, then zero extended to DI
15196 mode, the operation will in general be free as a write to
15197 a 'w' register implicitly zeroes the upper bits of an 'x'
15198 register. However, if this is
15200 (set (reg) (zero_extend (reg)))
15202 we must cost the explicit register move. */
15203 if (mode == DImode
15204 && GET_MODE (op0) == SImode)
15206 int op_cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, 0, speed);
15208 /* If OP_COST is non-zero, then the cost of the zero extend
15209 is effectively the cost of the inner operation. Otherwise
15210 we have a MOV instruction and we take the cost from the MOV
15211 itself. This is true independently of whether we are
15212 optimizing for space or time. */
15213 if (op_cost)
15214 *cost = op_cost;
15216 return true;
15218 else if (MEM_P (op0))
15220 /* All loads can zero extend to any size for free. */
15221 *cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, param, speed);
15222 return true;
15225 op0 = aarch64_extend_bitfield_pattern_p (x);
15226 if (op0)
15228 *cost += rtx_cost (op0, mode, ZERO_EXTEND, 0, speed);
15229 if (speed)
15230 *cost += extra_cost->alu.bfx;
15231 return true;
15234 if (speed)
15236 if (VECTOR_MODE_P (mode))
15238 /* UMOV. */
15239 *cost += extra_cost->vect.alu;
15241 else
15243 /* We generate an AND instead of UXTB/UXTH. */
15244 *cost += extra_cost->alu.logical;
15247 return false;
15249 case SIGN_EXTEND:
15250 if (MEM_P (XEXP (x, 0)))
15252 /* LDRSH. */
15253 if (speed)
15255 rtx address = XEXP (XEXP (x, 0), 0);
15256 *cost += extra_cost->ldst.load_sign_extend;
15258 *cost +=
15259 COSTS_N_INSNS (aarch64_address_cost (address, mode,
15260 0, speed));
15262 return true;
15265 op0 = aarch64_extend_bitfield_pattern_p (x);
15266 if (op0)
15268 *cost += rtx_cost (op0, mode, SIGN_EXTEND, 0, speed);
15269 if (speed)
15270 *cost += extra_cost->alu.bfx;
15271 return true;
15274 if (speed)
15276 if (VECTOR_MODE_P (mode))
15277 *cost += extra_cost->vect.alu;
15278 else
15279 *cost += extra_cost->alu.extend;
15281 return false;
15283 case ROTATE:
15284 case ROTATERT:
15285 case LSHIFTRT:
15286 case ASHIFTRT:
15287 case ASHIFT:
15288 op0 = XEXP (x, 0);
15289 op1 = XEXP (x, 1);
15291 if (CONST_INT_P (op1))
15293 if (speed)
15295 if (VECTOR_MODE_P (mode))
15297 /* Vector shift (immediate). */
15298 *cost += extra_cost->vect.alu;
15300 else
15302 /* LSL (immediate), ASR (immediate), UBFM, UBFIZ and friends.
15303 These are all aliases. */
15304 *cost += extra_cost->alu.shift;
15308 /* We can incorporate zero/sign extend for free. */
15309 if (GET_CODE (op0) == ZERO_EXTEND
15310 || GET_CODE (op0) == SIGN_EXTEND)
15311 op0 = XEXP (op0, 0);
15313 *cost += rtx_cost (op0, VOIDmode, ASHIFT, 0, speed);
15314 return true;
15316 else
15318 if (VECTOR_MODE_P (mode))
15320 if (speed)
15321 /* Vector shift (register). */
15322 *cost += extra_cost->vect.alu;
15324 else
15326 if (speed)
15327 /* LSLV, ASRV. */
15328 *cost += extra_cost->alu.shift_reg;
15330 /* The register shift amount may be in a shorter mode expressed
15331 as a lowpart SUBREG. For costing purposes just look inside. */
15332 if (SUBREG_P (op1) && subreg_lowpart_p (op1))
15333 op1 = SUBREG_REG (op1);
15334 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
15335 && CONST_INT_P (XEXP (op1, 1))
15336 && known_eq (INTVAL (XEXP (op1, 1)),
15337 GET_MODE_BITSIZE (mode) - 1))
15339 *cost += rtx_cost (op0, mode, code, 0, speed);
15340 /* We already demanded XEXP (op1, 0) to be REG_P, so
15341 don't recurse into it. */
15342 return true;
15345 return false; /* All arguments need to be in registers. */
15348 case SYMBOL_REF:
15350 if (aarch64_cmodel == AARCH64_CMODEL_LARGE
15351 || aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC)
15353 /* LDR. */
15354 if (speed)
15355 *cost += extra_cost->ldst.load;
15357 else if (aarch64_cmodel == AARCH64_CMODEL_SMALL
15358 || aarch64_cmodel == AARCH64_CMODEL_SMALL_PIC)
15360 /* ADRP, followed by ADD. */
15361 *cost += COSTS_N_INSNS (1);
15362 if (speed)
15363 *cost += 2 * extra_cost->alu.arith;
15365 else if (aarch64_cmodel == AARCH64_CMODEL_TINY
15366 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
15368 /* ADR. */
15369 if (speed)
15370 *cost += extra_cost->alu.arith;
15373 if (flag_pic)
15375 /* One extra load instruction, after accessing the GOT. */
15376 *cost += COSTS_N_INSNS (1);
15377 if (speed)
15378 *cost += extra_cost->ldst.load;
15380 return true;
15382 case HIGH:
15383 case LO_SUM:
15384 /* ADRP/ADD (immediate). */
15385 if (speed)
15386 *cost += extra_cost->alu.arith;
15387 return true;
15389 case ZERO_EXTRACT:
15390 case SIGN_EXTRACT:
15391 /* UBFX/SBFX. */
15392 if (speed)
15394 if (VECTOR_MODE_P (mode))
15395 *cost += extra_cost->vect.alu;
15396 else
15397 *cost += extra_cost->alu.bfx;
15400 /* We can trust that the immediates used will be correct (there
15401 are no by-register forms), so we need only cost op0. */
15402 *cost += rtx_cost (XEXP (x, 0), VOIDmode, code, 0, speed);
15403 return true;
15405 case MULT:
15406 *cost += aarch64_rtx_mult_cost (x, MULT, 0, speed);
15407 /* aarch64_rtx_mult_cost always handles recursion to its
15408 operands. */
15409 return true;
15411 case MOD:
15412 /* We can expand signed mod by power of 2 using a NEGS, two parallel
15413 ANDs and a CSNEG. Assume here that a CSNEG has the same cost as
15414 an unconditional negate. This case should only ever be reached through
15415 the set_smod_pow2_cheap check in expmed.cc. */
15416 if (CONST_INT_P (XEXP (x, 1))
15417 && exact_log2 (INTVAL (XEXP (x, 1))) > 0
15418 && (mode == SImode || mode == DImode))
15420 /* We expand to 4 instructions. Reset the baseline. */
15421 *cost = COSTS_N_INSNS (4);
15423 if (speed)
15424 *cost += 2 * extra_cost->alu.logical
15425 + 2 * extra_cost->alu.arith;
15427 return true;
15430 /* Fall-through. */
15431 case UMOD:
15432 if (speed)
15434 /* Slightly prefer UMOD over SMOD. */
15435 if (VECTOR_MODE_P (mode))
15436 *cost += extra_cost->vect.alu;
15437 else if (GET_MODE_CLASS (mode) == MODE_INT)
15438 *cost += (extra_cost->mult[mode == DImode].add
15439 + extra_cost->mult[mode == DImode].idiv
15440 + (code == MOD ? 1 : 0));
15442 return false; /* All arguments need to be in registers. */
15444 case DIV:
15445 case UDIV:
15446 case SQRT:
15447 if (speed)
15449 if (VECTOR_MODE_P (mode))
15450 *cost += extra_cost->vect.alu;
15451 else if (GET_MODE_CLASS (mode) == MODE_INT)
15452 /* There is no integer SQRT, so only DIV and UDIV can get
15453 here. */
15454 *cost += (extra_cost->mult[mode == DImode].idiv
15455 /* Slightly prefer UDIV over SDIV. */
15456 + (code == DIV ? 1 : 0));
15457 else
15458 *cost += extra_cost->fp[mode == DFmode].div;
15460 return false; /* All arguments need to be in registers. */
15462 case IF_THEN_ELSE:
15463 return aarch64_if_then_else_costs (XEXP (x, 0), XEXP (x, 1),
15464 XEXP (x, 2), cost, speed);
15466 case EQ:
15467 case NE:
15468 case GT:
15469 case GTU:
15470 case LT:
15471 case LTU:
15472 case GE:
15473 case GEU:
15474 case LE:
15475 case LEU:
15477 return false; /* All arguments must be in registers. */
15479 case FMA:
15480 op0 = XEXP (x, 0);
15481 op1 = XEXP (x, 1);
15482 op2 = XEXP (x, 2);
15484 if (speed)
15486 if (VECTOR_MODE_P (mode))
15487 *cost += extra_cost->vect.alu;
15488 else
15489 *cost += extra_cost->fp[mode == DFmode].fma;
15492 /* FMSUB, FNMADD, and FNMSUB are free. */
15493 if (GET_CODE (op0) == NEG)
15494 op0 = XEXP (op0, 0);
15496 if (GET_CODE (op2) == NEG)
15497 op2 = XEXP (op2, 0);
15499 /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
15500 and the by-element operand as operand 0. */
15501 if (GET_CODE (op1) == NEG)
15502 op1 = XEXP (op1, 0);
15504 /* Catch vector-by-element operations. The by-element operand can
15505 either be (vec_duplicate (vec_select (x))) or just
15506 (vec_select (x)), depending on whether we are multiplying by
15507 a vector or a scalar.
15509 Canonicalization is not very good in these cases, FMA4 will put the
15510 by-element operand as operand 0, FNMA4 will have it as operand 1. */
15511 if (GET_CODE (op0) == VEC_DUPLICATE)
15512 op0 = XEXP (op0, 0);
15513 else if (GET_CODE (op1) == VEC_DUPLICATE)
15514 op1 = XEXP (op1, 0);
15516 if (GET_CODE (op0) == VEC_SELECT)
15517 op0 = XEXP (op0, 0);
15518 else if (GET_CODE (op1) == VEC_SELECT)
15519 op1 = XEXP (op1, 0);
15521 /* If the remaining parameters are not registers,
15522 get the cost to put them into registers. */
15523 *cost += rtx_cost (op0, mode, FMA, 0, speed);
15524 *cost += rtx_cost (op1, mode, FMA, 1, speed);
15525 *cost += rtx_cost (op2, mode, FMA, 2, speed);
15526 return true;
15528 case FLOAT:
15529 case UNSIGNED_FLOAT:
15530 if (speed)
15531 *cost += extra_cost->fp[mode == DFmode].fromint;
15532 return false;
15534 case FLOAT_EXTEND:
15535 if (speed)
15537 if (VECTOR_MODE_P (mode))
15539 /* Vector widening conversion. */
15540 *cost += extra_cost->vect.alu;
15542 else
15543 *cost += extra_cost->fp[mode == DFmode].widen;
15545 return false;
15547 case FLOAT_TRUNCATE:
15548 if (speed)
15550 if (VECTOR_MODE_P (mode))
15552 /* Vector narrowing conversion. */
15553 *cost += extra_cost->vect.alu;
15555 else
15556 *cost += extra_cost->fp[mode == DFmode].narrow;
15558 return false;
15560 case FIX:
15561 case UNSIGNED_FIX:
15562 x = XEXP (x, 0);
15563 /* Strip the rounding part. They will all be implemented
15564 by the fcvt* family of instructions anyway. */
15565 if (GET_CODE (x) == UNSPEC)
15567 unsigned int uns_code = XINT (x, 1);
15569 if (uns_code == UNSPEC_FRINTA
15570 || uns_code == UNSPEC_FRINTM
15571 || uns_code == UNSPEC_FRINTN
15572 || uns_code == UNSPEC_FRINTP
15573 || uns_code == UNSPEC_FRINTZ)
15574 x = XVECEXP (x, 0, 0);
15577 if (speed)
15579 if (VECTOR_MODE_P (mode))
15580 *cost += extra_cost->vect.alu;
15581 else
15582 *cost += extra_cost->fp[GET_MODE (x) == DFmode].toint;
15585 /* We can combine fmul by a power of 2 followed by a fcvt into a single
15586 fixed-point fcvt. */
15587 if (GET_CODE (x) == MULT
15588 && ((VECTOR_MODE_P (mode)
15589 && aarch64_vec_fpconst_pow_of_2 (XEXP (x, 1)) > 0)
15590 || aarch64_fpconst_pow_of_2 (XEXP (x, 1)) > 0))
15592 *cost += rtx_cost (XEXP (x, 0), VOIDmode, code, 0, speed);
15593 return true;
15596 *cost += rtx_cost (x, VOIDmode, code, 0, speed);
15597 return true;
15599 case ABS:
15600 if (VECTOR_MODE_P (mode))
15602 /* ABS (vector). */
15603 if (speed)
15604 *cost += extra_cost->vect.alu;
15606 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
15608 op0 = XEXP (x, 0);
15610 /* FABD, which is analogous to FADD. */
15611 if (GET_CODE (op0) == MINUS)
15613 *cost += rtx_cost (XEXP (op0, 0), mode, MINUS, 0, speed);
15614 *cost += rtx_cost (XEXP (op0, 1), mode, MINUS, 1, speed);
15615 if (speed)
15616 *cost += extra_cost->fp[mode == DFmode].addsub;
15618 return true;
15620 /* Simple FABS is analogous to FNEG. */
15621 if (speed)
15622 *cost += extra_cost->fp[mode == DFmode].neg;
15624 else
15626 /* Integer ABS will either be split to
15627 two arithmetic instructions, or will be an ABS
15628 (scalar), which we don't model. */
15629 *cost = COSTS_N_INSNS (2);
15630 if (speed)
15631 *cost += 2 * extra_cost->alu.arith;
15633 return false;
15635 case SMAX:
15636 case SMIN:
15637 if (speed)
15639 if (VECTOR_MODE_P (mode))
15640 *cost += extra_cost->vect.alu;
15641 else
15643 /* FMAXNM/FMINNM/FMAX/FMIN.
15644 TODO: This may not be accurate for all implementations, but
15645 we do not model this in the cost tables. */
15646 *cost += extra_cost->fp[mode == DFmode].addsub;
15649 return false;
15651 case UNSPEC:
15652 /* The floating point round to integer frint* instructions. */
15653 if (aarch64_frint_unspec_p (XINT (x, 1)))
15655 if (speed)
15656 *cost += extra_cost->fp[mode == DFmode].roundint;
15658 return false;
15660 break;
15662 case TRUNCATE:
15664 /* Decompose <su>muldi3_highpart. */
15665 if (/* (truncate:DI */
15666 mode == DImode
15667 /* (lshiftrt:TI */
15668 && GET_MODE (XEXP (x, 0)) == TImode
15669 && GET_CODE (XEXP (x, 0)) == LSHIFTRT
15670 /* (mult:TI */
15671 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
15672 /* (ANY_EXTEND:TI (reg:DI))
15673 (ANY_EXTEND:TI (reg:DI))) */
15674 && ((GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND
15675 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == ZERO_EXTEND)
15676 || (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND
15677 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == SIGN_EXTEND))
15678 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0)) == DImode
15679 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0)) == DImode
15680 /* (const_int 64) */
15681 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
15682 && UINTVAL (XEXP (XEXP (x, 0), 1)) == 64)
15684 /* UMULH/SMULH. */
15685 if (speed)
15686 *cost += extra_cost->mult[mode == DImode].extend;
15687 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0),
15688 mode, MULT, 0, speed);
15689 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0),
15690 mode, MULT, 1, speed);
15691 return true;
15693 break;
15694 case CONST_VECTOR:
15696 /* Load using MOVI/MVNI. */
15697 if (aarch64_simd_valid_mov_imm (x))
15698 *cost = extra_cost->vect.movi;
15699 else /* Load using constant pool. */
15700 *cost = extra_cost->ldst.load;
15701 break;
15703 case VEC_CONCAT:
15704 /* Depending on the operation, either DUP or INS.
15705 For now, keep default costing. */
15706 break;
15707 case VEC_DUPLICATE:
15708 /* Load using a DUP. */
15709 *cost = extra_cost->vect.dup;
15710 return false;
15711 case VEC_SELECT:
15713 rtx op0 = XEXP (x, 0);
15714 *cost = rtx_cost (op0, GET_MODE (op0), VEC_SELECT, 0, speed);
15716 /* Cost subreg of 0 as free, otherwise as DUP. */
15717 rtx op1 = XEXP (x, 1);
15718 if (vec_series_lowpart_p (mode, GET_MODE (op1), op1))
15720 else if (vec_series_highpart_p (mode, GET_MODE (op1), op1))
15721 *cost = extra_cost->vect.dup;
15722 else
15723 *cost = extra_cost->vect.extract;
15724 return true;
15726 default:
15727 break;
15730 if (dump_file
15731 && flag_aarch64_verbose_cost)
15732 fprintf (dump_file,
15733 "\nFailed to cost RTX. Assuming default cost.\n");
15735 return true;
15738 /* Wrapper around aarch64_rtx_costs, dumps the partial, or total cost
15739 calculated for X. This cost is stored in *COST. Returns true
15740 if the total cost of X was calculated. */
15741 static bool
15742 aarch64_rtx_costs_wrapper (rtx x, machine_mode mode, int outer,
15743 int param, int *cost, bool speed)
15745 bool result = aarch64_rtx_costs (x, mode, outer, param, cost, speed);
15747 if (dump_file
15748 && flag_aarch64_verbose_cost)
15750 print_rtl_single (dump_file, x);
15751 fprintf (dump_file, "\n%s cost: %d (%s)\n",
15752 speed ? "Hot" : "Cold",
15753 *cost, result ? "final" : "partial");
15756 return result;
15759 static int
15760 aarch64_register_move_cost (machine_mode mode,
15761 reg_class_t from_i, reg_class_t to_i)
15763 enum reg_class from = (enum reg_class) from_i;
15764 enum reg_class to = (enum reg_class) to_i;
15765 const struct cpu_regmove_cost *regmove_cost
15766 = aarch64_tune_params.regmove_cost;
15768 /* Treat any subset of POINTER_REGS as though it were GENERAL_REGS. */
15769 if (reg_class_subset_p (to, POINTER_REGS))
15770 to = GENERAL_REGS;
15772 if (reg_class_subset_p (from, POINTER_REGS))
15773 from = GENERAL_REGS;
15775 /* Make RDFFR very expensive. In particular, if we know that the FFR
15776 contains a PTRUE (e.g. after a SETFFR), we must never use RDFFR
15777 as a way of obtaining a PTRUE. */
15778 if (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL
15779 && hard_reg_set_subset_p (reg_class_contents[from_i],
15780 reg_class_contents[FFR_REGS]))
15781 return 80;
15783 /* Moves to/from sysregs are expensive, and must go via GPR. */
15784 if (from == MOVEABLE_SYSREGS)
15785 return 80 + aarch64_register_move_cost (mode, GENERAL_REGS, to);
15786 if (to == MOVEABLE_SYSREGS)
15787 return 80 + aarch64_register_move_cost (mode, from, GENERAL_REGS);
15789 /* Moving between GPR and stack cost is the same as GP2GP. */
15790 if ((from == GENERAL_REGS && to == STACK_REG)
15791 || (to == GENERAL_REGS && from == STACK_REG))
15792 return regmove_cost->GP2GP;
15794 /* To/From the stack register, we move via the gprs. */
15795 if (to == STACK_REG || from == STACK_REG)
15796 return aarch64_register_move_cost (mode, from, GENERAL_REGS)
15797 + aarch64_register_move_cost (mode, GENERAL_REGS, to);
15799 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
15800 if (vec_flags != (VEC_ADVSIMD | VEC_STRUCT | VEC_PARTIAL)
15801 && known_eq (GET_MODE_SIZE (mode), 16))
15803 /* 128-bit operations on general registers require 2 instructions. */
15804 if (from == GENERAL_REGS && to == GENERAL_REGS)
15805 return regmove_cost->GP2GP * 2;
15806 else if (from == GENERAL_REGS)
15807 return regmove_cost->GP2FP * 2;
15808 else if (to == GENERAL_REGS)
15809 return regmove_cost->FP2GP * 2;
15811 /* When AdvSIMD instructions are disabled it is not possible to move
15812 a 128-bit value directly between Q registers. This is handled in
15813 secondary reload. A general register is used as a scratch to move
15814 the upper DI value and the lower DI value is moved directly,
15815 hence the cost is the sum of three moves. */
15816 if (!TARGET_SIMD && !TARGET_SVE)
15817 return regmove_cost->GP2FP + regmove_cost->FP2GP + regmove_cost->FP2FP;
15819 return regmove_cost->FP2FP;
15822 if (from == GENERAL_REGS && to == GENERAL_REGS)
15823 return regmove_cost->GP2GP;
15824 else if (from == GENERAL_REGS)
15825 return regmove_cost->GP2FP;
15826 else if (to == GENERAL_REGS)
15827 return regmove_cost->FP2GP;
15829 if (!TARGET_SIMD && vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
15831 /* Needs a round-trip through memory, which can use LDP/STP for pairs.
15832 The cost must be greater than 2 units to indicate that direct
15833 moves aren't possible. */
15834 auto per_vector = (aarch64_tune_params.memmov_cost.load_fp
15835 + aarch64_tune_params.memmov_cost.store_fp);
15836 return MIN (CEIL (per_vector, 2), 4);
15839 return regmove_cost->FP2FP;
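/* A worked instance of the clamp above, using hypothetical tuning numbers
   rather than values from any real cost table; q_struct_move_cost_demo is
   an illustrative name.  */
static int
q_struct_move_cost_demo (int load_fp, int store_fp)
{
  int per_vector = load_fp + store_fp;
  int ceil_half = (per_vector + 1) / 2;  /* CEIL (per_vector, 2)  */
  return ceil_half < 4 ? ceil_half : 4;  /* MIN (..., 4)  */
}

/* e.g. load_fp = 5 and store_fp = 3 give per_vector = 8 and a cost of 4:
   above the 2-unit threshold that signals direct moves are unavailable,
   without being punitive.  */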
15842 /* Implements TARGET_MEMORY_MOVE_COST. */
15843 static int
15844 aarch64_memory_move_cost (machine_mode mode, reg_class_t rclass_i, bool in)
15846 enum reg_class rclass = (enum reg_class) rclass_i;
15847 if (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL
15848 ? reg_classes_intersect_p (rclass, PR_REGS)
15849 : reg_class_subset_p (rclass, PR_REGS))
15850 return (in
15851 ? aarch64_tune_params.memmov_cost.load_pred
15852 : aarch64_tune_params.memmov_cost.store_pred);
15854 if (VECTOR_MODE_P (mode) || FLOAT_MODE_P (mode)
15855 ? reg_classes_intersect_p (rclass, FP_REGS)
15856 : reg_class_subset_p (rclass, FP_REGS))
15857 return (in
15858 ? aarch64_tune_params.memmov_cost.load_fp
15859 : aarch64_tune_params.memmov_cost.store_fp);
15861 /* If the move needs to go through GPRs, add the cost of doing that. */
15862 int base = 0;
15863 if (rclass_i == MOVEABLE_SYSREGS)
15864 base += (in
15865 ? aarch64_register_move_cost (DImode, GENERAL_REGS, rclass_i)
15866 : aarch64_register_move_cost (DImode, rclass_i, GENERAL_REGS));
15868 return (in
15869 ? base + aarch64_tune_params.memmov_cost.load_int
15870 : base + aarch64_tune_params.memmov_cost.store_int);
15873 /* Implement TARGET_INSN_COST. We have the opportunity to do something
15874 much more productive here, such as using insn attributes to cost things.
15875 But we don't, not yet.
15877 The main point of this current definition is to make calling insn_cost
15878 on one instruction equivalent to calling seq_cost on a sequence that
15879 contains only that instruction. The default definition would instead
15880 only look at SET_SRCs, ignoring SET_DESTs.
15882 This ensures that, for example, storing a 128-bit zero vector is more
15883 expensive than storing a 128-bit vector register. A move of zero
15884 into a 128-bit vector register followed by multiple stores of that
15885 register is then cheaper than multiple stores of zero (which would
15886 use STP of XZR). This in turn allows STP Qs to be formed. */
15887 static int
15888 aarch64_insn_cost (rtx_insn *insn, bool speed)
15890 if (rtx set = single_set (insn))
15891 return set_rtx_cost (set, speed);
15892 return pattern_cost (PATTERN (insn), speed);
15895 /* Implement TARGET_INIT_BUILTINS. */
15896 static void
15897 aarch64_init_builtins ()
15899 aarch64_general_init_builtins ();
15900 aarch64_sve::init_builtins ();
15901 #ifdef SUBTARGET_INIT_BUILTINS
15902 SUBTARGET_INIT_BUILTINS;
15903 #endif
15906 /* Implement TARGET_FOLD_BUILTIN. */
15907 static tree
15908 aarch64_fold_builtin (tree fndecl, int nargs, tree *args, bool)
15910 unsigned int code = DECL_MD_FUNCTION_CODE (fndecl);
15911 unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
15912 tree type = TREE_TYPE (TREE_TYPE (fndecl));
15913 switch (code & AARCH64_BUILTIN_CLASS)
15915 case AARCH64_BUILTIN_GENERAL:
15916 return aarch64_general_fold_builtin (subcode, type, nargs, args);
15918 case AARCH64_BUILTIN_SVE:
15919 return NULL_TREE;
15921 gcc_unreachable ();
15924 /* Implement TARGET_GIMPLE_FOLD_BUILTIN. */
15925 static bool
15926 aarch64_gimple_fold_builtin (gimple_stmt_iterator *gsi)
15928 gcall *stmt = as_a <gcall *> (gsi_stmt (*gsi));
15929 tree fndecl = gimple_call_fndecl (stmt);
15930 unsigned int code = DECL_MD_FUNCTION_CODE (fndecl);
15931 unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
15932 gimple *new_stmt = NULL;
15933 switch (code & AARCH64_BUILTIN_CLASS)
15935 case AARCH64_BUILTIN_GENERAL:
15936 new_stmt = aarch64_general_gimple_fold_builtin (subcode, stmt, gsi);
15937 break;
15939 case AARCH64_BUILTIN_SVE:
15940 new_stmt = aarch64_sve::gimple_fold_builtin (subcode, gsi, stmt);
15941 break;
15944 if (!new_stmt)
15945 return false;
15947 gsi_replace (gsi, new_stmt, false);
15948 return true;
15951 /* Implement TARGET_EXPAND_BUILTIN. */
15952 static rtx
15953 aarch64_expand_builtin (tree exp, rtx target, rtx, machine_mode, int ignore)
15955 tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
15956 unsigned int code = DECL_MD_FUNCTION_CODE (fndecl);
15957 unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
15958 switch (code & AARCH64_BUILTIN_CLASS)
15960 case AARCH64_BUILTIN_GENERAL:
15961 return aarch64_general_expand_builtin (subcode, exp, target, ignore);
15963 case AARCH64_BUILTIN_SVE:
15964 return aarch64_sve::expand_builtin (subcode, exp, target);
15966 gcc_unreachable ();
15969 /* Implement TARGET_BUILTIN_DECL. */
15970 static tree
15971 aarch64_builtin_decl (unsigned int code, bool initialize_p)
15973 unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
15974 switch (code & AARCH64_BUILTIN_CLASS)
15976 case AARCH64_BUILTIN_GENERAL:
15977 return aarch64_general_builtin_decl (subcode, initialize_p);
15979 case AARCH64_BUILTIN_SVE:
15980 return aarch64_sve::builtin_decl (subcode, initialize_p);
15982 gcc_unreachable ();
15985 /* Return true if it is safe and beneficial to use the approximate rsqrt optabs
15986 to optimize 1.0/sqrt. */
15988 static bool
15989 use_rsqrt_p (machine_mode mode)
15991 return (!flag_trapping_math
15992 && flag_unsafe_math_optimizations
15993 && ((aarch64_tune_params.approx_modes->recip_sqrt
15994 & AARCH64_APPROX_MODE (mode))
15995 || flag_mrecip_low_precision_sqrt));
15998 /* Function to decide when to use the approximate reciprocal square root
15999 builtin. */
16001 static tree
16002 aarch64_builtin_reciprocal (tree fndecl)
16004 machine_mode mode = TYPE_MODE (TREE_TYPE (fndecl));
16006 if (!use_rsqrt_p (mode))
16007 return NULL_TREE;
16008 unsigned int code = DECL_MD_FUNCTION_CODE (fndecl);
16009 unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
16010 switch (code & AARCH64_BUILTIN_CLASS)
16012 case AARCH64_BUILTIN_GENERAL:
16013 return aarch64_general_builtin_rsqrt (subcode);
16015 case AARCH64_BUILTIN_SVE:
16016 return NULL_TREE;
16018 gcc_unreachable ();
16021 /* Emit code to perform the floating-point operation:
16023 DST = SRC1 * SRC2
16025 where all three operands are already known to be registers.
16026 If the operation is an SVE one, PTRUE is a suitable all-true
16027 predicate. */
16029 static void
16030 aarch64_emit_mult (rtx dst, rtx ptrue, rtx src1, rtx src2)
16032 if (ptrue)
16033 emit_insn (gen_aarch64_pred (UNSPEC_COND_FMUL, GET_MODE (dst),
16034 dst, ptrue, src1, src2,
16035 gen_int_mode (SVE_RELAXED_GP, SImode)));
16036 else
16037 emit_set_insn (dst, gen_rtx_MULT (GET_MODE (dst), src1, src2));
16040 /* Emit instruction sequence to compute either the approximate square root
16041 or its approximate reciprocal, depending on the flag RECP, and return
16042 whether the sequence was emitted or not. */
16044 bool
16045 aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp)
16047 machine_mode mode = GET_MODE (dst);
16049 if (GET_MODE_INNER (mode) == HFmode)
16051 gcc_assert (!recp);
16052 return false;
16055 if (!recp)
16057 if (!(flag_mlow_precision_sqrt
16058 || (aarch64_tune_params.approx_modes->sqrt
16059 & AARCH64_APPROX_MODE (mode))))
16060 return false;
16062 if (!flag_finite_math_only
16063 || flag_trapping_math
16064 || !flag_unsafe_math_optimizations
16065 || optimize_function_for_size_p (cfun))
16066 return false;
16068 else
16069 /* Caller assumes we cannot fail. */
16070 gcc_assert (use_rsqrt_p (mode));
16072 rtx pg = NULL_RTX;
16073 if (aarch64_sve_mode_p (mode))
16074 pg = aarch64_ptrue_reg (aarch64_sve_pred_mode (mode));
16075 machine_mode mmsk = (VECTOR_MODE_P (mode)
16076 ? related_int_vector_mode (mode).require ()
16077 : int_mode_for_mode (mode).require ());
16078 rtx xmsk = NULL_RTX;
16079 if (!recp)
16081 /* When calculating the approximate square root, compare the
16082 argument with 0.0 and create a mask. */
16083 rtx zero = CONST0_RTX (mode);
16084 if (pg)
16086 xmsk = gen_reg_rtx (GET_MODE (pg));
16087 rtx hint = gen_int_mode (SVE_KNOWN_PTRUE, SImode);
16088 emit_insn (gen_aarch64_pred_fcm (UNSPEC_COND_FCMNE, mode,
16089 xmsk, pg, hint, src, zero));
16091 else
16093 xmsk = gen_reg_rtx (mmsk);
16094 emit_insn (gen_rtx_SET (xmsk,
16095 gen_rtx_NEG (mmsk,
16096 gen_rtx_EQ (mmsk, src, zero))));
16100 /* Estimate the approximate reciprocal square root. */
16101 rtx xdst = gen_reg_rtx (mode);
16102 emit_insn (gen_aarch64_rsqrte (mode, xdst, src));
16104 /* Iterate over the series twice for SF and thrice for DF. */
16105 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
16107 /* Optionally do one fewer iteration of the series, trading a little
16108 accuracy for faster performance. */
16109 if ((recp && flag_mrecip_low_precision_sqrt)
16110 || (!recp && flag_mlow_precision_sqrt))
16111 iterations--;
16113 /* Iterate over the series to calculate the approximate reciprocal square
16114 root. */
16115 rtx x1 = gen_reg_rtx (mode);
16116 while (iterations--)
16118 rtx x2 = gen_reg_rtx (mode);
16119 aarch64_emit_mult (x2, pg, xdst, xdst);
16121 emit_insn (gen_aarch64_rsqrts (mode, x1, src, x2));
16123 if (iterations > 0)
16124 aarch64_emit_mult (xdst, pg, xdst, x1);
16127 if (!recp)
16129 if (pg)
16130 /* Multiply nonzero source values by the corresponding intermediate
16131 result elements, so that the final calculation is the approximate
16132 square root rather than its reciprocal. Select a zero result for
16133 zero source values, to avoid the Inf * 0 -> NaN that we'd get
16134 otherwise. */
16135 emit_insn (gen_cond (UNSPEC_COND_FMUL, mode,
16136 xdst, xmsk, xdst, src, CONST0_RTX (mode)));
16137 else
16139 /* Qualify the approximate reciprocal square root when the
16140 argument is 0.0 by squashing the intermediary result to 0.0. */
16141 rtx xtmp = gen_reg_rtx (mmsk);
16142 emit_set_insn (xtmp, gen_rtx_AND (mmsk, gen_rtx_NOT (mmsk, xmsk),
16143 gen_rtx_SUBREG (mmsk, xdst, 0)));
16144 emit_move_insn (xdst, gen_rtx_SUBREG (mode, xtmp, 0));
16146 /* Calculate the approximate square root. */
16147 aarch64_emit_mult (xdst, pg, xdst, src);
16151 /* Finalize the approximation. */
16152 aarch64_emit_mult (dst, pg, xdst, x1);
16154 return true;
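/* A scalar sketch of one refinement step of the loop above, assuming the
   architectural definition of FRSQRTS: frsqrts (a, b) = (3 - a * b) / 2.
   The real code emits (optionally predicated) RTL rather than doing
   arithmetic; rsqrt_newton_step is an illustrative name only.  */
static double
rsqrt_newton_step (double src, double x)
{
  double x2 = x * x;                     /* aarch64_emit_mult (x2, pg, xdst, xdst)  */
  double step = (3.0 - src * x2) / 2.0;  /* gen_aarch64_rsqrts (mode, x1, src, x2)  */
  return x * step;                       /* aarch64_emit_mult (xdst, pg, xdst, x1)  */
}

/* Two such steps are used for single precision and three for double,
   matching the iteration counts chosen above.  */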
16157 /* Emit the instruction sequence to compute the approximation for the division
16158 of NUM by DEN in QUO and return whether the sequence was emitted or not. */
16160 bool
16161 aarch64_emit_approx_div (rtx quo, rtx num, rtx den)
16163 machine_mode mode = GET_MODE (quo);
16165 if (GET_MODE_INNER (mode) == HFmode)
16166 return false;
16168 bool use_approx_division_p = (flag_mlow_precision_div
16169 || (aarch64_tune_params.approx_modes->division
16170 & AARCH64_APPROX_MODE (mode)));
16172 if (!flag_finite_math_only
16173 || flag_trapping_math
16174 || !flag_unsafe_math_optimizations
16175 || optimize_function_for_size_p (cfun)
16176 || !use_approx_division_p)
16177 return false;
16179 if (!TARGET_SIMD && VECTOR_MODE_P (mode))
16180 return false;
16182 rtx pg = NULL_RTX;
16183 if (aarch64_sve_mode_p (mode))
16184 pg = aarch64_ptrue_reg (aarch64_sve_pred_mode (mode));
16186 /* Estimate the approximate reciprocal. */
16187 rtx xrcp = gen_reg_rtx (mode);
16188 emit_insn (gen_aarch64_frecpe (mode, xrcp, den));
16190 /* Iterate over the series twice for SF and thrice for DF. */
16191 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
16193 /* Optionally do fewer iterations of the series, trading accuracy for
16194 faster performance. The default is 2 for DF and 1 for SF. */
16195 if (flag_mlow_precision_div)
16196 iterations = (GET_MODE_INNER (mode) == DFmode
16197 ? aarch64_double_recp_precision
16198 : aarch64_float_recp_precision);
16200 /* Iterate over the series to calculate the approximate reciprocal. */
16201 rtx xtmp = gen_reg_rtx (mode);
16202 while (iterations--)
16204 emit_insn (gen_aarch64_frecps (mode, xtmp, xrcp, den));
16206 if (iterations > 0)
16207 aarch64_emit_mult (xrcp, pg, xrcp, xtmp);
16210 if (num != CONST1_RTX (mode))
16212 /* As the approximate reciprocal of DEN is already calculated, only
16213 calculate the approximate division when NUM is not 1.0. */
16214 rtx xnum = force_reg (mode, num);
16215 aarch64_emit_mult (xrcp, pg, xrcp, xnum);
16218 /* Finalize the approximation. */
16219 aarch64_emit_mult (quo, pg, xrcp, xtmp);
16220 return true;
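/* The analogous scalar sketch for the reciprocal refinement loop above,
   assuming the architectural definition of FRECPS: frecps (a, b) = 2 - a * b.
   recip_newton_step is an illustrative name only.  */
static double
recip_newton_step (double den, double x)
{
  double step = 2.0 - den * x;  /* gen_aarch64_frecps (mode, xtmp, xrcp, den)  */
  return x * step;              /* aarch64_emit_mult (xrcp, pg, xrcp, xtmp)  */
}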
16223 /* Emit an optimized sequence to perform a vector rotate
16224 of REG by the vector constant amount AMNT_VEC and place the result
16225 in DST. Return true iff successful. */
16227 bool
16228 aarch64_emit_opt_vec_rotate (rtx dst, rtx reg, rtx amnt_vec)
16230 rtx amnt = unwrap_const_vec_duplicate (amnt_vec);
16231 gcc_assert (CONST_INT_P (amnt));
16232 HOST_WIDE_INT rotamnt = UINTVAL (amnt);
16233 machine_mode mode = GET_MODE (reg);
16234 /* Don't end up here after reload. */
16235 gcc_assert (can_create_pseudo_p ());
16236 /* Rotates by half the element width map down to REV* instructions and should
16237 always be preferred when possible. */
16238 if (rotamnt == GET_MODE_UNIT_BITSIZE (mode) / 2
16239 && expand_rotate_as_vec_perm (mode, dst, reg, amnt))
16240 return true;
16241 /* 64 and 128-bit vector modes can use the XAR instruction
16242 when available. */
16243 else if ((TARGET_SHA3 && mode == V2DImode)
16244 || (TARGET_SVE2
16245 && (known_eq (GET_MODE_SIZE (mode), 8)
16246 || known_eq (GET_MODE_SIZE (mode), 16))))
16248 rtx zeroes = aarch64_gen_shareable_zero (mode);
16249 rtx xar_op
16250 = gen_rtx_ROTATE (mode, gen_rtx_XOR (mode, reg, zeroes),
16251 amnt_vec);
16252 emit_set_insn (dst, xar_op);
16253 return true;
16255 /* If none of the above, try to expand rotates by any byte amount as
16256 permutes. */
16257 else if (expand_rotate_as_vec_perm (mode, dst, reg, amnt))
16258 return true;
16259 return false;
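/* Illustrative, standalone sketch (not part of this file): the two scalar
   identities the function above relies on.  XAR rotates the XOR of its
   inputs, so XAR with a zero operand degenerates to a plain rotate; and
   rotating a lane by half its width is the same as swapping its two halves,
   which is what the REV* instructions do per element.  */

#include <cstdint>
#include <cstdio>

static uint16_t
rot16 (uint16_t x, unsigned amt)
{
  amt &= 15;
  return (uint16_t) ((x >> amt) | (x << (16 - amt)));
}

int
main ()
{
  uint16_t lane = 0x1234;
  /* XAR-style rotate with a zero second operand is just a rotate.  */
  std::printf ("%#x\n", (unsigned) rot16 (lane ^ 0, 8));
  /* Rotating by half the element width swaps the two bytes (cf. REV16).  */
  std::printf ("%#x\n", (unsigned) (uint16_t) ((lane >> 8) | (lane << 8)));
  return 0;
}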
16262 /* Return the number of instructions that can be issued per cycle. */
16263 static int
16264 aarch64_sched_issue_rate (void)
16266 return aarch64_tune_params.issue_rate;
16269 /* Implement TARGET_SCHED_VARIABLE_ISSUE. */
16270 static int
16271 aarch64_sched_variable_issue (FILE *, int, rtx_insn *insn, int more)
16273 if (DEBUG_INSN_P (insn))
16274 return more;
16276 rtx_code code = GET_CODE (PATTERN (insn));
16277 if (code == USE || code == CLOBBER)
16278 return more;
16280 if (get_attr_type (insn) == TYPE_NO_INSN)
16281 return more;
16283 return more - 1;
16286 static int
16287 aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
16289 int issue_rate = aarch64_sched_issue_rate ();
16291 return issue_rate > 1 && !sched_fusion ? issue_rate : 0;
16295 /* Implement TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD as
16296 autopref_multipass_dfa_lookahead_guard from haifa-sched.cc. It only
16297 has an effect if PARAM_SCHED_AUTOPREF_QUEUE_DEPTH > 0. */
16299 static int
16300 aarch64_first_cycle_multipass_dfa_lookahead_guard (rtx_insn *insn,
16301 int ready_index)
16303 return autopref_multipass_dfa_lookahead_guard (insn, ready_index);
16307 /* Vectorizer cost model target hooks. */
16309 /* If a vld1 from address ADDR should be recorded in vector_load_decls,
16310 return the decl that should be recorded. Return null otherwise. */
16311 tree
16312 aarch64_vector_load_decl (tree addr)
16314 if (TREE_CODE (addr) != ADDR_EXPR)
16315 return NULL_TREE;
16316 tree base = get_base_address (TREE_OPERAND (addr, 0));
16317 if (TREE_CODE (base) != VAR_DECL)
16318 return NULL_TREE;
16319 return base;
16322 /* Return true if STMT_INFO accesses a decl that is known to be the
16323 argument to a vld1 in the same function. */
16324 static bool
16325 aarch64_accesses_vector_load_decl_p (stmt_vec_info stmt_info)
16327 if (!cfun->machine->vector_load_decls)
16328 return false;
16329 auto dr = STMT_VINFO_DATA_REF (stmt_info);
16330 if (!dr)
16331 return false;
16332 tree decl = aarch64_vector_load_decl (DR_BASE_ADDRESS (dr));
16333 return decl && cfun->machine->vector_load_decls->contains (decl);
16336 /* Information about how the CPU would issue the scalar, Advanced SIMD
16337 or SVE version of a vector loop, using the scheme defined by the
16338 aarch64_base_vec_issue_info hierarchy of structures. */
16339 class aarch64_vec_op_count
16341 public:
16342 aarch64_vec_op_count () = default;
16343 aarch64_vec_op_count (const aarch64_vec_issue_info *, unsigned int,
16344 unsigned int = 1);
16346 unsigned int vec_flags () const { return m_vec_flags; }
16347 unsigned int vf_factor () const { return m_vf_factor; }
16349 const aarch64_base_vec_issue_info *base_issue_info () const;
16350 const aarch64_simd_vec_issue_info *simd_issue_info () const;
16351 const aarch64_sve_vec_issue_info *sve_issue_info () const;
16353 fractional_cost rename_cycles_per_iter () const;
16354 fractional_cost min_nonpred_cycles_per_iter () const;
16355 fractional_cost min_pred_cycles_per_iter () const;
16356 fractional_cost min_cycles_per_iter () const;
16358 void dump () const;
16360 /* The number of individual "general" operations. See the comments
16361 in aarch64_base_vec_issue_info for details. */
16362 unsigned int general_ops = 0;
16364 /* The number of load and store operations, under the same scheme
16365 as above. */
16366 unsigned int loads = 0;
16367 unsigned int stores = 0;
16369 /* The minimum number of cycles needed to execute all loop-carried
16370 operations, which in the vector code become associated with
16371 reductions. */
16372 unsigned int reduction_latency = 0;
16374 /* The number of individual predicate operations. See the comments
16375 in aarch64_sve_vec_issue_info for details. */
16376 unsigned int pred_ops = 0;
16378 private:
16379 /* The issue information for the core. */
16380 const aarch64_vec_issue_info *m_issue_info = nullptr;
16382 /* - If M_VEC_FLAGS is zero then this structure describes scalar code.
16383 - If M_VEC_FLAGS & VEC_ADVSIMD is nonzero then this structure describes
16384 Advanced SIMD code.
16385 - If M_VEC_FLAGS & VEC_ANY_SVE is nonzero then this structure describes
16386 SVE code. */
16387 unsigned int m_vec_flags = 0;
16389 /* Assume that, when the code is executing on the core described
16390 by M_ISSUE_INFO, one iteration of the loop will handle M_VF_FACTOR
16391 times more data than the vectorizer anticipates.
16393 This is only ever different from 1 for SVE. It allows us to consider
16394 what would happen on a 256-bit SVE target even when the -mtune
16395 parameters say that the “likely” SVE length is 128 bits. */
16396 unsigned int m_vf_factor = 1;
16399 aarch64_vec_op_count::
16400 aarch64_vec_op_count (const aarch64_vec_issue_info *issue_info,
16401 unsigned int vec_flags, unsigned int vf_factor)
16402 : m_issue_info (issue_info),
16403 m_vec_flags (vec_flags),
16404 m_vf_factor (vf_factor)
16408 /* Return the base issue information (i.e. the parts that make sense
16409 for both scalar and vector code). Return null if we have no issue
16410 information. */
16411 const aarch64_base_vec_issue_info *
16412 aarch64_vec_op_count::base_issue_info () const
16414 if (auto *ret = simd_issue_info ())
16415 return ret;
16416 return m_issue_info->scalar;
16419 /* If the structure describes vector code and we have associated issue
16420 information, return that issue information, otherwise return null. */
16421 const aarch64_simd_vec_issue_info *
16422 aarch64_vec_op_count::simd_issue_info () const
16424 if (auto *ret = sve_issue_info ())
16425 return ret;
16426 if (m_vec_flags)
16427 return m_issue_info->advsimd;
16428 return nullptr;
16431 /* If the structure describes SVE code and we have associated issue
16432 information, return that issue information, otherwise return null. */
16433 const aarch64_sve_vec_issue_info *
16434 aarch64_vec_op_count::sve_issue_info () const
16436 if (m_vec_flags & VEC_ANY_SVE)
16437 return m_issue_info->sve;
16438 return nullptr;
16441 /* Estimate the minimum number of cycles per iteration needed to rename
16442 the instructions.
16444 ??? For now this is done inline rather than via cost tables, since it
16445 isn't clear how it should be parameterized for the general case. */
16446 fractional_cost
16447 aarch64_vec_op_count::rename_cycles_per_iter () const
16449 if (sve_issue_info () == &neoverse512tvb_sve_issue_info
16450 || sve_issue_info () == &neoversen2_sve_issue_info
16451 || sve_issue_info () == &neoversev2_sve_issue_info)
16452 /* + 1 for an addition. We've already counted a general op for each
16453 store, so we don't need to account for stores separately. The branch
16454 reads no registers and so does not need to be counted either.
16456 ??? This value is very much on the pessimistic side, but seems to work
16457 pretty well in practice. */
16458 return { general_ops + loads + pred_ops + 1, 5 };
16460 return 0;
16463 /* Like min_cycles_per_iter, but excluding predicate operations. */
16464 fractional_cost
16465 aarch64_vec_op_count::min_nonpred_cycles_per_iter () const
16467 auto *issue_info = base_issue_info ();
16469 fractional_cost cycles = MAX (reduction_latency, 1);
16470 cycles = std::max (cycles, { stores, issue_info->stores_per_cycle });
16471 cycles = std::max (cycles, { loads + stores,
16472 issue_info->loads_stores_per_cycle });
16473 cycles = std::max (cycles, { general_ops,
16474 issue_info->general_ops_per_cycle });
16475 cycles = std::max (cycles, rename_cycles_per_iter ());
16476 return cycles;
16479 /* Like min_cycles_per_iter, but including only the predicate operations. */
16480 fractional_cost
16481 aarch64_vec_op_count::min_pred_cycles_per_iter () const
16483 if (auto *issue_info = sve_issue_info ())
16484 return { pred_ops, issue_info->pred_ops_per_cycle };
16485 return 0;
16488 /* Estimate the minimum number of cycles needed to issue the operations.
16489 This is a very simplistic model! */
16490 fractional_cost
16491 aarch64_vec_op_count::min_cycles_per_iter () const
16493 return std::max (min_nonpred_cycles_per_iter (),
16494 min_pred_cycles_per_iter ());
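/* Illustrative, standalone sketch (not part of this file): the throughput
   bound computed by the min_*_cycles_per_iter functions above, using plain
   doubles instead of fractional_cost.  The op counts and issue rates below
   are hypothetical, not any real core's tuning values; the real code also
   folds in the rename and predicate bounds.  */

#include <algorithm>
#include <cstdio>

int
main ()
{
  /* Hypothetical op counts for one loop iteration.  */
  double loads = 4, stores = 2, general_ops = 6, reduction_latency = 2;
  /* Hypothetical per-cycle issue rates.  */
  double stores_per_cycle = 1, loads_stores_per_cycle = 2,
	 general_ops_per_cycle = 3;

  double cycles = std::max ({ reduction_latency,
			      stores / stores_per_cycle,
			      (loads + stores) / loads_stores_per_cycle,
			      general_ops / general_ops_per_cycle });
  std::printf ("min cycles per iteration ~ %.2f\n", cycles);
  return 0;
}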
16497 /* Dump information about the structure. */
16498 void
16499 aarch64_vec_op_count::dump () const
16501 dump_printf_loc (MSG_NOTE, vect_location,
16502 " load operations = %d\n", loads);
16503 dump_printf_loc (MSG_NOTE, vect_location,
16504 " store operations = %d\n", stores);
16505 dump_printf_loc (MSG_NOTE, vect_location,
16506 " general operations = %d\n", general_ops);
16507 if (sve_issue_info ())
16508 dump_printf_loc (MSG_NOTE, vect_location,
16509 " predicate operations = %d\n", pred_ops);
16510 dump_printf_loc (MSG_NOTE, vect_location,
16511 " reduction latency = %d\n", reduction_latency);
16512 if (auto rcpi = rename_cycles_per_iter ())
16513 dump_printf_loc (MSG_NOTE, vect_location,
16514 " estimated cycles per iteration to rename = %f\n",
16515 rcpi.as_double ());
16516 if (auto pred_cpi = min_pred_cycles_per_iter ())
16518 dump_printf_loc (MSG_NOTE, vect_location,
16519 " estimated min cycles per iteration"
16520 " without predication = %f\n",
16521 min_nonpred_cycles_per_iter ().as_double ());
16522 dump_printf_loc (MSG_NOTE, vect_location,
16523 " estimated min cycles per iteration"
16524 " for predication = %f\n", pred_cpi.as_double ());
16526 if (auto cpi = min_cycles_per_iter ())
16527 dump_printf_loc (MSG_NOTE, vect_location,
16528 " estimated min cycles per iteration = %f\n",
16529 cpi.as_double ());
16532 /* Information about vector code that we're in the process of costing. */
16533 class aarch64_vector_costs : public vector_costs
16535 public:
16536 aarch64_vector_costs (vec_info *, bool);
16538 unsigned int add_stmt_cost (int count, vect_cost_for_stmt kind,
16539 stmt_vec_info stmt_info, slp_tree, tree vectype,
16540 int misalign,
16541 vect_cost_model_location where) override;
16542 void finish_cost (const vector_costs *) override;
16543 bool better_main_loop_than_p (const vector_costs *other) const override;
16545 private:
16546 void record_potential_advsimd_unrolling (loop_vec_info);
16547 void analyze_loop_vinfo (loop_vec_info);
16548 void count_ops (unsigned int, vect_cost_for_stmt, stmt_vec_info, slp_tree,
16549 aarch64_vec_op_count *);
16550 fractional_cost adjust_body_cost_sve (const aarch64_vec_op_count *,
16551 fractional_cost, unsigned int,
16552 unsigned int *, bool *);
16553 unsigned int adjust_body_cost (loop_vec_info, const aarch64_vector_costs *,
16554 unsigned int);
16555 bool prefer_unrolled_loop () const;
16556 unsigned int determine_suggested_unroll_factor ();
16558 /* True if we have performed one-time initialization based on the
16559 vec_info. */
16560 bool m_analyzed_vinfo = false;
16562 /* This loop uses an average operation that is not supported by SVE, but is
16563 supported by Advanced SIMD and SVE2. */
16564 bool m_has_avg = false;
16566 /* Additional initialization costs for using gather or scatter operations in
16567 the current loop. */
16568 unsigned int m_sve_gather_scatter_init_cost = 0;
16570 /* True if the vector body contains a store to a decl and if the
16571 function is known to have a vld1 from the same decl.
16573 In the Advanced SIMD ACLE, the recommended endian-agnostic way of
16574 initializing a vector is:
16576 float f[4] = { elts };
16577 float32x4_t x = vld1q_f32(f);
16579 We should strongly prefer vectorization of the initialization of f,
16580 so that the store to f and the load back can be optimized away,
16581 leaving a vectorization of { elts }. */
16582 bool m_stores_to_vector_load_decl = false;
16584 /* Non-zero if the last operation we costed is a vector promotion or demotion.
16585 In this case the value is the number of insns in the last operation.
16587 On AArch64 vector promotions and demotions require us to first widen or
16588 narrow the input and only after that emit conversion instructions. For
16589 costing this means we need to include the cost of the final conversions as
16590 well. */
16591 unsigned int m_num_last_promote_demote = 0;
16593 /* - If M_VEC_FLAGS is zero then we're costing the original scalar code.
16594 - If M_VEC_FLAGS & VEC_ADVSIMD is nonzero then we're costing Advanced
16595 SIMD code.
16596 - If M_VEC_FLAGS & VEC_ANY_SVE is nonzero then we're costing SVE code. */
16597 unsigned int m_vec_flags = 0;
16599 /* At the moment, we do not model LDP and STP in the vector and scalar costs.
16600 This means that code such as:
16602 a[0] = x;
16603 a[1] = x;
16605 will be costed as two scalar instructions and two vector instructions
16606 (a scalar_to_vec and an unaligned_store). For SLP, the vector form
16607 wins if the costs are equal, because of the fact that the vector costs
16608 include constant initializations whereas the scalar costs don't.
16609 We would therefore tend to vectorize the code above, even though
16610 the scalar version can use a single STP.
16612 We should eventually fix this and model LDP and STP in the main costs;
16613 see the comment in aarch64_sve_adjust_stmt_cost for some of the problems.
16614 Until then, we look specifically for code that does nothing more than
16615 STP-like operations. We cost them on that basis in addition to the
16616 normal latency-based costs.
16618 If the scalar or vector code could be a sequence of STPs +
16619 initialization, this variable counts the cost of the sequence,
16620 with 2 units per instruction. The variable is ~0U for other
16621 kinds of code. */
16622 unsigned int m_stp_sequence_cost = 0;
16624 /* On some CPUs, SVE and Advanced SIMD provide the same theoretical vector
16625 throughput, such as 4x128 Advanced SIMD vs. 2x256 SVE. In those
16626 situations, we try to predict whether an Advanced SIMD implementation
16627 of the loop could be completely unrolled and become straight-line code.
16628 If so, it is generally better to use the Advanced SIMD version rather
16629 than length-agnostic SVE, since the SVE loop would execute an unknown
16630 number of times and so could not be completely unrolled in the same way.
16632 If we're applying this heuristic, M_UNROLLED_ADVSIMD_NITERS is the
16633 number of Advanced SIMD loop iterations that would be unrolled and
16634 M_UNROLLED_ADVSIMD_STMTS estimates the total number of statements
16635 in the unrolled loop. Both values are zero if we're not applying
16636 the heuristic. */
16637 unsigned HOST_WIDE_INT m_unrolled_advsimd_niters = 0;
16638 unsigned HOST_WIDE_INT m_unrolled_advsimd_stmts = 0;
16640 /* If we're vectorizing a loop that executes a constant number of times,
16641 this variable gives the number of times that the vector loop would
16642 iterate, otherwise it is zero. */
16643 uint64_t m_num_vector_iterations = 0;
16645 /* Used only when vectorizing loops. Estimates the number and kind of
16646 operations that would be needed by one iteration of the scalar
16647 or vector loop. There is one entry for each tuning option of
16648 interest. */
16649 auto_vec<aarch64_vec_op_count, 2> m_ops;
16652 aarch64_vector_costs::aarch64_vector_costs (vec_info *vinfo,
16653 bool costing_for_scalar)
16654 : vector_costs (vinfo, costing_for_scalar),
16655 m_vec_flags (costing_for_scalar ? 0
16656 : aarch64_classify_vector_mode (vinfo->vector_mode))
16658 if (auto *issue_info = aarch64_tune_params.vec_costs->issue_info)
16660 m_ops.quick_push ({ issue_info, m_vec_flags });
16661 if (aarch64_tune_params.vec_costs == &neoverse512tvb_vector_cost)
16663 unsigned int vf_factor = (m_vec_flags & VEC_ANY_SVE) ? 2 : 1;
16664 m_ops.quick_push ({ &neoversev1_vec_issue_info, m_vec_flags,
16665 vf_factor });
16670 /* Implement TARGET_VECTORIZE_CREATE_COSTS. */
16671 vector_costs *
16672 aarch64_vectorize_create_costs (vec_info *vinfo, bool costing_for_scalar)
16674 return new aarch64_vector_costs (vinfo, costing_for_scalar);
16677 /* Return the appropriate SIMD costs for vectors of type VECTYPE. */
16678 static const simd_vec_cost *
16679 aarch64_simd_vec_costs (tree vectype)
16681 const cpu_vector_cost *costs = aarch64_tune_params.vec_costs;
16682 if (vectype != NULL
16683 && aarch64_sve_mode_p (TYPE_MODE (vectype))
16684 && costs->sve != NULL)
16685 return costs->sve;
16686 return costs->advsimd;
16689 /* Return the appropriate SIMD costs for vectors with VEC_* flags FLAGS. */
16690 static const simd_vec_cost *
16691 aarch64_simd_vec_costs_for_flags (unsigned int flags)
16693 const cpu_vector_cost *costs = aarch64_tune_params.vec_costs;
16694 if ((flags & VEC_ANY_SVE) && costs->sve)
16695 return costs->sve;
16696 return costs->advsimd;
16699 /* If STMT_INFO is a memory reference, return the scalar memory type,
16700 otherwise return null. */
16701 static tree
16702 aarch64_dr_type (stmt_vec_info stmt_info)
16704 if (auto dr = STMT_VINFO_DATA_REF (stmt_info))
16705 return TREE_TYPE (DR_REF (dr));
16706 return NULL_TREE;
16709 /* Decide whether to use the unrolling heuristic described above
16710 m_unrolled_advsimd_niters, updating that field if so. LOOP_VINFO
16711 describes the loop that we're vectorizing. */
16712 void
16713 aarch64_vector_costs::
16714 record_potential_advsimd_unrolling (loop_vec_info loop_vinfo)
16716 /* The heuristic only makes sense on targets that have the same
16717 vector throughput for SVE and Advanced SIMD. */
16718 if (!(aarch64_tune_params.extra_tuning_flags
16719 & AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT))
16720 return;
16722 /* We only want to apply the heuristic if LOOP_VINFO is being
16723 vectorized for SVE. */
16724 if (!(m_vec_flags & VEC_ANY_SVE))
16725 return;
16727 /* Check whether it is possible in principle to use Advanced SIMD
16728 instead. */
16729 if (aarch64_autovec_preference == AARCH64_AUTOVEC_SVE_ONLY)
16730 return;
16732 /* We don't want to apply the heuristic to outer loops, since it's
16733 harder to track two levels of unrolling. */
16734 if (LOOP_VINFO_LOOP (loop_vinfo)->inner)
16735 return;
16737 /* Only handle cases in which the number of Advanced SIMD iterations
16738 would be known at compile time but the number of SVE iterations
16739 would not. */
16740 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
16741 || aarch64_sve_vg.is_constant ())
16742 return;
16744 /* Guess how many times the Advanced SIMD loop would iterate and make
16745 sure that it is within the complete unrolling limit. Even if the
16746 number of iterations is small enough, the number of statements might
16747 not be, which is why we need to estimate the number of statements too. */
16748 unsigned int estimated_vq = aarch64_estimated_sve_vq ();
16749 unsigned int advsimd_vf = CEIL (vect_vf_for_cost (loop_vinfo), estimated_vq);
16750 unsigned HOST_WIDE_INT unrolled_advsimd_niters
16751 = LOOP_VINFO_INT_NITERS (loop_vinfo) / advsimd_vf;
16752 if (unrolled_advsimd_niters > (unsigned int) param_max_completely_peel_times)
16753 return;
16755 /* Record that we're applying the heuristic and should try to estimate
16756 the number of statements in the Advanced SIMD loop. */
16757 m_unrolled_advsimd_niters = unrolled_advsimd_niters;
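/* Illustrative, standalone sketch (not part of this file): the arithmetic
   behind the heuristic above, with hypothetical numbers - an SVE loop whose
   estimated VF is 8 elements on a target tuned for 256-bit vectors
   (estimated_vq == 2), a known trip count of 32, and a stand-in for
   param_max_completely_peel_times.  */

#include <cstdio>

int
main ()
{
  unsigned sve_vf_for_cost = 8;	/* estimated elements per SVE iteration */
  unsigned estimated_vq = 2;	/* 128-bit quadwords per SVE vector */
  unsigned niters = 32;		/* known scalar iteration count */
  unsigned peel_limit = 16;	/* hypothetical peeling limit */

  unsigned advsimd_vf = (sve_vf_for_cost + estimated_vq - 1) / estimated_vq;
  unsigned unrolled_advsimd_niters = niters / advsimd_vf;
  std::printf ("Advanced SIMD would need %u iterations: %s\n",
	       unrolled_advsimd_niters,
	       unrolled_advsimd_niters <= peel_limit
	       ? "apply the heuristic" : "too many to unroll");
  return 0;
}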
16760 /* Do one-time initialization of the aarch64_vector_costs given that we're
16761 costing the loop vectorization described by LOOP_VINFO. */
16762 void
16763 aarch64_vector_costs::analyze_loop_vinfo (loop_vec_info loop_vinfo)
16765 /* Record the number of times that the vector loop would execute,
16766 if known. */
16767 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
16768 auto scalar_niters = max_stmt_executions_int (loop);
16769 if (scalar_niters >= 0)
16771 unsigned int vf = vect_vf_for_cost (loop_vinfo);
16772 if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
16773 m_num_vector_iterations = scalar_niters / vf;
16774 else
16775 m_num_vector_iterations = CEIL (scalar_niters, vf);
16778 /* Detect whether we're vectorizing for SVE and should apply the unrolling
16779 heuristic described above m_unrolled_advsimd_niters. */
16780 record_potential_advsimd_unrolling (loop_vinfo);
16783 /* Implement targetm.vectorize.builtin_vectorization_cost. */
16784 static int
16785 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
16786 tree vectype,
16787 int misalign ATTRIBUTE_UNUSED)
16789 unsigned elements;
16790 const cpu_vector_cost *costs = aarch64_tune_params.vec_costs;
16791 bool fp = false;
16793 if (vectype != NULL)
16794 fp = FLOAT_TYPE_P (vectype);
16796 const simd_vec_cost *simd_costs = aarch64_simd_vec_costs (vectype);
16798 switch (type_of_cost)
16800 case scalar_stmt:
16801 return fp ? costs->scalar_fp_stmt_cost : costs->scalar_int_stmt_cost;
16803 case scalar_load:
16804 return costs->scalar_load_cost;
16806 case scalar_store:
16807 return costs->scalar_store_cost;
16809 case vector_stmt:
16810 return fp ? simd_costs->fp_stmt_cost
16811 : simd_costs->int_stmt_cost;
16813 case vector_load:
16814 return simd_costs->align_load_cost;
16816 case vector_store:
16817 return simd_costs->store_cost;
16819 case vec_to_scalar:
16820 return simd_costs->vec_to_scalar_cost;
16822 case scalar_to_vec:
16823 return simd_costs->scalar_to_vec_cost;
16825 case unaligned_load:
16826 case vector_gather_load:
16827 return simd_costs->unalign_load_cost;
16829 case unaligned_store:
16830 case vector_scatter_store:
16831 return simd_costs->unalign_store_cost;
16833 case cond_branch_taken:
16834 return costs->cond_taken_branch_cost;
16836 case cond_branch_not_taken:
16837 return costs->cond_not_taken_branch_cost;
16839 case vec_perm:
16840 return simd_costs->permute_cost;
16842 case vec_promote_demote:
16843 return fp ? simd_costs->fp_stmt_cost
16844 : simd_costs->int_stmt_cost;
16846 case vec_construct:
16847 elements = estimated_poly_value (TYPE_VECTOR_SUBPARTS (vectype));
16848 return elements / 2 + 1;
16850 default:
16851 gcc_unreachable ();
16855 /* Check whether an access of kind KIND for STMT_INFO (or NODE if SLP)
16856 represents one vector of an LD[234] or ST[234] operation. Return the total
16857 number of vectors (2, 3 or 4) if so, otherwise return a value outside that
16858 range. */
16859 static int
16860 aarch64_ld234_st234_vectors (vect_cost_for_stmt kind, stmt_vec_info stmt_info,
16861 slp_tree node)
16863 if ((kind == vector_load
16864 || kind == unaligned_load
16865 || kind == vector_store
16866 || kind == unaligned_store)
16867 && STMT_VINFO_DATA_REF (stmt_info))
16869 stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
16870 if (stmt_info
16871 && vect_mem_access_type (stmt_info, node) == VMAT_LOAD_STORE_LANES)
16872 return DR_GROUP_SIZE (stmt_info);
16874 return 0;
16877 /* Return true if creating multiple copies of STMT_INFO for Advanced SIMD
16878 vectors would produce a series of LDP or STP operations. KIND is the
16879 kind of statement that STMT_INFO represents. */
16880 static bool
16881 aarch64_advsimd_ldp_stp_p (enum vect_cost_for_stmt kind,
16882 stmt_vec_info stmt_info)
16884 switch (kind)
16886 case vector_load:
16887 case vector_store:
16888 case unaligned_load:
16889 case unaligned_store:
16890 break;
16892 default:
16893 return false;
16896 return is_gimple_assign (stmt_info->stmt);
16899 /* Return true if STMT_INFO is the second part of a two-statement multiply-add
16900 or multiply-subtract sequence that might be suitable for fusing into a
16901 single instruction. If VEC_FLAGS is zero, analyze the operation as
16902 a scalar one, otherwise analyze it as an operation on vectors with those
16903 VEC_* flags. */
16904 static bool
16905 aarch64_multiply_add_p (vec_info *vinfo, stmt_vec_info stmt_info,
16906 unsigned int vec_flags)
16908 gassign *assign = dyn_cast<gassign *> (stmt_info->stmt);
16909 if (!assign)
16910 return false;
16911 tree_code code = gimple_assign_rhs_code (assign);
16912 if (code != PLUS_EXPR && code != MINUS_EXPR)
16913 return false;
16915 auto is_mul_result = [&](int i)
16917 tree rhs = gimple_op (assign, i);
16918 /* ??? Should we try to check for a single use as well? */
16919 if (TREE_CODE (rhs) != SSA_NAME)
16920 return false;
16922 stmt_vec_info def_stmt_info = vinfo->lookup_def (rhs);
16923 if (!def_stmt_info
16924 || STMT_VINFO_DEF_TYPE (def_stmt_info) != vect_internal_def)
16925 return false;
16926 gassign *rhs_assign = dyn_cast<gassign *> (def_stmt_info->stmt);
16927 if (!rhs_assign || gimple_assign_rhs_code (rhs_assign) != MULT_EXPR)
16928 return false;
16930 if (vec_flags & VEC_ADVSIMD)
16932 /* Scalar and SVE code can tie the result to any FMLA input (or none,
16933 although that requires a MOVPRFX for SVE). However, Advanced SIMD
16934 only supports MLA forms, so will require a move if the result
16935 cannot be tied to the accumulator. The most important case in
16936 which this is true is when the accumulator input is invariant. */
16937 rhs = gimple_op (assign, 3 - i);
16938 if (TREE_CODE (rhs) != SSA_NAME)
16939 return false;
16940 def_stmt_info = vinfo->lookup_def (rhs);
16941 if (!def_stmt_info
16942 || STMT_VINFO_DEF_TYPE (def_stmt_info) == vect_external_def
16943 || STMT_VINFO_DEF_TYPE (def_stmt_info) == vect_constant_def)
16944 return false;
16947 return true;
16950 if (code == MINUS_EXPR && (vec_flags & VEC_ADVSIMD))
16951 /* Advanced SIMD doesn't have FNMADD/FNMSUB/FNMLA/FNMLS, so the
16952 multiplication must be on the second operand (to form an FMLS).
16953 But if both operands are multiplications and the second operand
16954 is used more than once, we'll instead negate the second operand
16955 and use it as an accumulator for the first operand. */
16956 return (is_mul_result (2)
16957 && (has_single_use (gimple_assign_rhs2 (assign))
16958 || !is_mul_result (1)));
16960 return is_mul_result (1) || is_mul_result (2);
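/* Illustrative, standalone sketch (not part of this file): the kind of
   two-statement source pattern the predicate above looks for.  The multiply
   feeding the add (or the second operand of the subtract) is expected to
   fuse into a single FMLA/FMLS (or MLA/MLS), which is why the costing later
   treats the pair as one instruction.  */

#include <cstddef>

void
fmla_pattern (float *a, const float *b, const float *c, size_t n)
{
  for (size_t i = 0; i < n; ++i)
    {
      float t = b[i] * c[i];	/* MULT_EXPR feeding ...  */
      a[i] = a[i] + t;		/* ... a PLUS_EXPR: candidate for FMLA.  */
    }
}

void
fmls_pattern (float *a, const float *b, const float *c, size_t n)
{
  for (size_t i = 0; i < n; ++i)
    /* Multiplication on the second operand of the MINUS_EXPR: FMLS.  */
    a[i] = a[i] - b[i] * c[i];
}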
16963 /* Return true if STMT_INFO is the second part of a two-statement boolean AND
16964 expression sequence that might be suitable for fusing into a
16965 single instruction. If VEC_FLAGS is zero, analyze the operation as
16966 a scalar one, otherwise analyze it as an operation on vectors with those
16967 VEC_* flags. */
16969 static bool
16970 aarch64_bool_compound_p (vec_info *vinfo, stmt_vec_info stmt_info,
16971 unsigned int vec_flags)
16973 gassign *assign = dyn_cast<gassign *> (stmt_info->stmt);
16974 if (!assign
16975 || gimple_assign_rhs_code (assign) != BIT_AND_EXPR
16976 || !STMT_VINFO_VECTYPE (stmt_info)
16977 || !VECTOR_BOOLEAN_TYPE_P (STMT_VINFO_VECTYPE (stmt_info)))
16978 return false;
16980 for (int i = 1; i < 3; ++i)
16982 tree rhs = gimple_op (assign, i);
16984 if (TREE_CODE (rhs) != SSA_NAME)
16985 continue;
16987 stmt_vec_info def_stmt_info = vinfo->lookup_def (rhs);
16988 if (!def_stmt_info
16989 || STMT_VINFO_DEF_TYPE (def_stmt_info) != vect_internal_def)
16990 continue;
16992 gassign *rhs_assign = dyn_cast<gassign *> (def_stmt_info->stmt);
16993 if (!rhs_assign
16994 || TREE_CODE_CLASS (gimple_assign_rhs_code (rhs_assign))
16995 != tcc_comparison)
16996 continue;
16998 if (vec_flags & VEC_ADVSIMD)
16999 return false;
17001 return true;
17003 return false;
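/* Illustrative, standalone sketch (not part of this file): the source shape
   the predicate above matches - two comparisons feeding a single
   BIT_AND_EXPR on booleans.  When vectorized for SVE the second compare can
   run under the predicate produced by the first, so the AND needs no
   separate instruction; Advanced SIMD has no such predicated compare, hence
   the early return above.  */

#include <cstddef>

int
count_in_band (const int *x, const int *lo, const int *hi, size_t n)
{
  int count = 0;
  for (size_t i = 0; i < n; ++i)
    /* Two tcc_comparison operands combined with a boolean AND.  */
    count += (x[i] > lo[i]) & (x[i] < hi[i]);
  return count;
}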
17006 /* We are considering implementing STMT_INFO using SVE. If STMT_INFO is an
17007 in-loop reduction that SVE supports directly, return its latency in cycles,
17008 otherwise return zero. SVE_COSTS specifies the latencies of the relevant
17009 instructions. */
17010 static unsigned int
17011 aarch64_sve_in_loop_reduction_latency (vec_info *vinfo,
17012 stmt_vec_info stmt_info,
17013 const sve_vec_cost *sve_costs)
17015 switch (vect_reduc_type (vinfo, stmt_info))
17017 case EXTRACT_LAST_REDUCTION:
17018 return sve_costs->clast_cost;
17020 case FOLD_LEFT_REDUCTION:
17021 switch (TYPE_MODE (TREE_TYPE (gimple_get_lhs (stmt_info->stmt))))
17023 case E_HFmode:
17024 case E_BFmode:
17025 return sve_costs->fadda_f16_cost;
17027 case E_SFmode:
17028 return sve_costs->fadda_f32_cost;
17030 case E_DFmode:
17031 return sve_costs->fadda_f64_cost;
17033 default:
17034 break;
17036 break;
17039 return 0;
17042 /* STMT_INFO describes a loop-carried operation in the original scalar code
17043 that we are considering implementing as a reduction. Return one of the
17044 following values, depending on VEC_FLAGS:
17046 - If VEC_FLAGS is zero, return the loop carry latency of the original
17047 scalar operation.
17049 - If VEC_FLAGS & VEC_ADVSIMD, return the loop carry latency of the
17050 Advanced SIMD implementation.
17052 - If VEC_FLAGS & VEC_ANY_SVE, return the loop carry latency of the
17053 SVE implementation. */
17054 static unsigned int
17055 aarch64_in_loop_reduction_latency (vec_info *vinfo, stmt_vec_info stmt_info,
17056 unsigned int vec_flags)
17058 const cpu_vector_cost *vec_costs = aarch64_tune_params.vec_costs;
17059 const sve_vec_cost *sve_costs = nullptr;
17060 if (vec_flags & VEC_ANY_SVE)
17061 sve_costs = aarch64_tune_params.vec_costs->sve;
17063 /* If the caller is asking for the SVE latency, check for forms of reduction
17064 that only SVE can handle directly. */
17065 if (sve_costs)
17067 unsigned int latency
17068 = aarch64_sve_in_loop_reduction_latency (vinfo, stmt_info, sve_costs);
17069 if (latency)
17070 return latency;
17073 /* Handle scalar costs. */
17074 bool is_float = FLOAT_TYPE_P (TREE_TYPE (gimple_get_lhs (stmt_info->stmt)));
17075 if (vec_flags == 0)
17077 if (is_float)
17078 return vec_costs->scalar_fp_stmt_cost;
17079 return vec_costs->scalar_int_stmt_cost;
17082 /* Otherwise, the loop body just contains normal integer or FP operations,
17083 with a vector reduction outside the loop. */
17084 const simd_vec_cost *simd_costs
17085 = aarch64_simd_vec_costs_for_flags (vec_flags);
17086 if (is_float)
17087 return simd_costs->fp_stmt_cost;
17088 return simd_costs->int_stmt_cost;
17091 /* STMT_COST is the cost calculated by aarch64_builtin_vectorization_cost
17092 for STMT_INFO, which has cost kind KIND. If this is a scalar operation,
17093 try to subdivide the target-independent categorization provided by KIND
17094 to get a more accurate cost. */
17095 static fractional_cost
17096 aarch64_detect_scalar_stmt_subtype (vec_info *vinfo, vect_cost_for_stmt kind,
17097 stmt_vec_info stmt_info,
17098 fractional_cost stmt_cost)
17100 /* Detect an extension of a loaded value. In general, we'll be able to fuse
17101 the extension with the load. */
17102 if (kind == scalar_stmt && vect_is_extending_load (vinfo, stmt_info))
17103 return 0;
17105 return stmt_cost;
17108 /* STMT_COST is the cost calculated by aarch64_builtin_vectorization_cost
17109 for the vectorized form of STMT_INFO possibly using SLP node NODE, which has
17110 cost kind KIND and which when vectorized would operate on vector type
17111 VECTYPE. Try to subdivide the target-independent categorization provided by
17112 KIND to get a more accurate cost. WHERE specifies where the cost associated
17113 with KIND occurs. */
17114 static fractional_cost
17115 aarch64_detect_vector_stmt_subtype (vec_info *vinfo, vect_cost_for_stmt kind,
17116 stmt_vec_info stmt_info, slp_tree node,
17117 tree vectype,
17118 enum vect_cost_model_location where,
17119 fractional_cost stmt_cost)
17121 const simd_vec_cost *simd_costs = aarch64_simd_vec_costs (vectype);
17122 const sve_vec_cost *sve_costs = nullptr;
17123 if (aarch64_sve_mode_p (TYPE_MODE (vectype)))
17124 sve_costs = aarch64_tune_params.vec_costs->sve;
17126 /* It's generally better to avoid costing inductions, since the induction
17127 will usually be hidden by other operations. This is particularly true
17128 for things like COND_REDUCTIONS. */
17129 if (is_a<gphi *> (stmt_info->stmt))
17130 return 0;
17132 /* Detect cases in which vec_to_scalar is describing the extraction of a
17133 vector element in preparation for a scalar store. The store itself is
17134 costed separately. */
17135 if (vect_is_store_elt_extraction (kind, stmt_info))
17136 return simd_costs->store_elt_extra_cost;
17138 /* Detect SVE gather loads, which are costed as a single scalar_load
17139 for each element. We therefore need to divide the full-instruction
17140 cost by the number of elements in the vector. */
17141 if (kind == scalar_load
17142 && sve_costs
17143 && vect_mem_access_type (stmt_info, node) == VMAT_GATHER_SCATTER)
17145 unsigned int nunits = vect_nunits_for_cost (vectype);
17146 /* Test for VNx2 modes, which have 64-bit containers. */
17147 if (known_eq (GET_MODE_NUNITS (TYPE_MODE (vectype)), aarch64_sve_vg))
17148 return { sve_costs->gather_load_x64_cost, nunits };
17149 return { sve_costs->gather_load_x32_cost, nunits };
17152 /* Detect cases in which a scalar_store is really storing one element
17153 in a scatter operation. */
17154 if (kind == scalar_store
17155 && sve_costs
17156 && vect_mem_access_type (stmt_info, node) == VMAT_GATHER_SCATTER)
17157 return sve_costs->scatter_store_elt_cost;
17159 /* Detect cases in which vec_to_scalar represents an in-loop reduction. */
17160 if (kind == vec_to_scalar
17161 && where == vect_body
17162 && sve_costs)
17164 unsigned int latency
17165 = aarch64_sve_in_loop_reduction_latency (vinfo, stmt_info, sve_costs);
17166 if (latency)
17167 return latency;
17170 /* Detect cases in which vec_to_scalar represents a single reduction
17171 instruction like FADDP or MAXV. */
17172 if (kind == vec_to_scalar
17173 && where == vect_epilogue
17174 && vect_is_reduction (stmt_info))
17175 switch (GET_MODE_INNER (TYPE_MODE (vectype)))
17177 case E_QImode:
17178 return simd_costs->reduc_i8_cost;
17180 case E_HImode:
17181 return simd_costs->reduc_i16_cost;
17183 case E_SImode:
17184 return simd_costs->reduc_i32_cost;
17186 case E_DImode:
17187 return simd_costs->reduc_i64_cost;
17189 case E_HFmode:
17190 case E_BFmode:
17191 return simd_costs->reduc_f16_cost;
17193 case E_SFmode:
17194 return simd_costs->reduc_f32_cost;
17196 case E_DFmode:
17197 return simd_costs->reduc_f64_cost;
17199 default:
17200 break;
17203 /* Otherwise stick with the original categorization. */
17204 return stmt_cost;
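/* Illustrative, standalone sketch (not part of this file): the per-element
   gather-load costing above.  The tuning tables give the cost of a whole
   gather instruction, but the vectorizer asks for one scalar_load per
   element, so the full-instruction cost is spread across the elements.
   The numbers below are hypothetical.  */

#include <cstdio>

int
main ()
{
  unsigned gather_load_x32_cost = 12;	/* hypothetical whole-gather cost */
  unsigned nunits = 4;			/* 32-bit elements being gathered */
  std::printf ("cost charged per element = %u/%u = %.2f\n",
	       gather_load_x32_cost, nunits,
	       (double) gather_load_x32_cost / nunits);
  return 0;
}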
17207 /* STMT_COST is the cost calculated by aarch64_builtin_vectorization_cost
17208 for STMT_INFO, which has cost kind KIND and which when vectorized would
17209 operate on vector type VECTYPE. Adjust the cost as necessary for SVE
17210 targets. */
17211 static fractional_cost
17212 aarch64_sve_adjust_stmt_cost (class vec_info *vinfo, vect_cost_for_stmt kind,
17213 stmt_vec_info stmt_info, tree vectype,
17214 fractional_cost stmt_cost)
17216 /* Unlike vec_promote_demote, vector_stmt conversions do not change the
17217 vector register size or number of units. Integer promotions of this
17218 type therefore map to SXT[BHW] or UXT[BHW].
17220 Most loads have extending forms that can do the sign or zero extension
17221 on the fly. Optimistically assume that a load followed by an extension
17222 will fold to this form during combine, and that the extension therefore
17223 comes for free. */
17224 if (kind == vector_stmt && vect_is_extending_load (vinfo, stmt_info))
17225 stmt_cost = 0;
17227 /* For similar reasons, vector_stmt integer truncations are a no-op,
17228 because we can just ignore the unused upper bits of the source. */
17229 if (kind == vector_stmt && vect_is_integer_truncation (stmt_info))
17230 stmt_cost = 0;
17232 /* Advanced SIMD can load and store pairs of registers using LDP and STP,
17233 but there are no equivalent instructions for SVE. This means that
17234 (all other things being equal) 128-bit SVE needs twice as many load
17235 and store instructions as Advanced SIMD in order to process vector pairs.
17237 Also, scalar code can often use LDP and STP to access pairs of values,
17238 so it is too simplistic to say that one SVE load or store replaces
17239 VF scalar loads and stores.
17241 Ideally we would account for this in the scalar and Advanced SIMD
17242 costs by making suitable load/store pairs as cheap as a single
17243 load/store. However, that would be a very invasive change and in
17244 practice it tends to stress other parts of the cost model too much.
17245 E.g. stores of scalar constants currently count just a store,
17246 whereas stores of vector constants count a store and a vec_init.
17247 This is an artificial distinction for AArch64, where stores of
17248 nonzero scalar constants need the same kind of register invariant
17249 as vector stores.
17251 An alternative would be to double the cost of any SVE loads and stores
17252 that could be paired in Advanced SIMD (and possibly also paired in
17253 scalar code). But this tends to stress other parts of the cost model
17254 in the same way. It also means that we can fall back to Advanced SIMD
17255 even if full-loop predication would have been useful.
17257 Here we go for a more conservative version: double the costs of SVE
17258 loads and stores if one iteration of the scalar loop processes enough
17259 elements for it to use a whole number of Advanced SIMD LDP or STP
17260 instructions. This makes it very likely that the VF would be 1 for
17261 Advanced SIMD, and so no epilogue should be needed. */
17262 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
17264 stmt_vec_info first = DR_GROUP_FIRST_ELEMENT (stmt_info);
17265 unsigned int count = DR_GROUP_SIZE (first) - DR_GROUP_GAP (first);
17266 unsigned int elt_bits = GET_MODE_UNIT_BITSIZE (TYPE_MODE (vectype));
17267 if (multiple_p (count * elt_bits, 256)
17268 && aarch64_advsimd_ldp_stp_p (kind, stmt_info))
17269 stmt_cost *= 2;
17272 return stmt_cost;
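/* Illustrative, standalone sketch (not part of this file): the "whole
   number of Advanced SIMD LDP/STP pairs" test above, with plain integers
   standing in for the poly-int values.  A grouped access is charged double
   on SVE when one scalar iteration touches a multiple of 256 bits, i.e.
   a whole number of 128-bit register pairs.  */

#include <cstdio>

static bool
doubles_sve_mem_cost (unsigned count, unsigned elt_bits)
{
  return count * elt_bits % 256 == 0;
}

int
main ()
{
  /* 8 x 32-bit elements = 256 bits: doubled.  4 x 32-bit = 128 bits: not.  */
  std::printf ("%d %d\n",
	       (int) doubles_sve_mem_cost (8, 32),
	       (int) doubles_sve_mem_cost (4, 32));
  return 0;
}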
17275 /* STMT_COST is the cost calculated for STMT_INFO, which has cost kind KIND
17276 and which when vectorized would operate on vector type VECTYPE. Add the
17277 cost of any embedded operations. */
17278 static fractional_cost
17279 aarch64_adjust_stmt_cost (vec_info *vinfo, vect_cost_for_stmt kind,
17280 stmt_vec_info stmt_info, slp_tree node, tree vectype,
17281 unsigned vec_flags, fractional_cost stmt_cost)
17283 if (vectype)
17285 const simd_vec_cost *simd_costs = aarch64_simd_vec_costs (vectype);
17287 /* Detect cases in which a vector load or store represents an
17288 LD[234] or ST[234] instruction. */
17289 switch (aarch64_ld234_st234_vectors (kind, stmt_info, node))
17291 case 2:
17292 stmt_cost += simd_costs->ld2_st2_permute_cost;
17293 break;
17295 case 3:
17296 stmt_cost += simd_costs->ld3_st3_permute_cost;
17297 break;
17299 case 4:
17300 stmt_cost += simd_costs->ld4_st4_permute_cost;
17301 break;
17304 gassign *assign = dyn_cast<gassign *> (STMT_VINFO_STMT (stmt_info));
17305 if ((kind == scalar_stmt || kind == vector_stmt) && assign)
17307 /* For MLA we need to reduce the cost since MLA is 1 instruction. */
17308 if (!vect_is_reduction (stmt_info)
17309 && aarch64_multiply_add_p (vinfo, stmt_info, vec_flags))
17310 return 0;
17312 /* For vector boolean ANDs with a compare operand we just need
17313 one insn. */
17314 if (aarch64_bool_compound_p (vinfo, stmt_info, vec_flags))
17315 return 0;
17318 if (kind == vector_stmt || kind == vec_to_scalar)
17319 if (tree cmp_type = vect_embedded_comparison_type (stmt_info))
17321 if (FLOAT_TYPE_P (cmp_type))
17322 stmt_cost += simd_costs->fp_stmt_cost;
17323 else
17324 stmt_cost += simd_costs->int_stmt_cost;
17328 if (kind == scalar_stmt)
17329 if (tree cmp_type = vect_embedded_comparison_type (stmt_info))
17331 if (FLOAT_TYPE_P (cmp_type))
17332 stmt_cost += aarch64_tune_params.vec_costs->scalar_fp_stmt_cost;
17333 else
17334 stmt_cost += aarch64_tune_params.vec_costs->scalar_int_stmt_cost;
17337 return stmt_cost;
17340 /* Return true if STMT_INFO is part of a reduction that has the form:
17342 r = r op ...;
17343 r = r op ...;
17345 with the single accumulator being read and written multiple times. */
17346 static bool
17347 aarch64_force_single_cycle (vec_info *vinfo, stmt_vec_info stmt_info)
17349 if (!STMT_VINFO_REDUC_DEF (stmt_info))
17350 return false;
17352 auto reduc_info = info_for_reduction (vinfo, stmt_info);
17353 return STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info);
17356 /* COUNT, KIND and STMT_INFO are the same as for vector_costs::add_stmt_cost
17357 and they describe an operation in the body of a vector loop. Record issue
17358 information relating to the vector operation in OPS. */
17359 void
17360 aarch64_vector_costs::count_ops (unsigned int count, vect_cost_for_stmt kind,
17361 stmt_vec_info stmt_info, slp_tree node,
17362 aarch64_vec_op_count *ops)
17364 const aarch64_base_vec_issue_info *base_issue = ops->base_issue_info ();
17365 if (!base_issue)
17366 return;
17367 const aarch64_simd_vec_issue_info *simd_issue = ops->simd_issue_info ();
17368 const aarch64_sve_vec_issue_info *sve_issue = ops->sve_issue_info ();
17370 /* Calculate the minimum cycles per iteration imposed by a reduction
17371 operation. */
17372 if ((kind == scalar_stmt || kind == vector_stmt || kind == vec_to_scalar)
17373 && vect_is_reduction (stmt_info))
17375 unsigned int base
17376 = aarch64_in_loop_reduction_latency (m_vinfo, stmt_info, m_vec_flags);
17377 if (aarch64_force_single_cycle (m_vinfo, stmt_info))
17378 /* ??? Ideally we'd use a tree to reduce the copies down to 1 vector,
17379 and then accumulate that, but at the moment the loop-carried
17380 dependency includes all copies. */
17381 ops->reduction_latency = MAX (ops->reduction_latency, base * count);
17382 else
17383 ops->reduction_latency = MAX (ops->reduction_latency, base);
17386 if (stmt_info && (kind == scalar_stmt || kind == vector_stmt))
17388 /* Assume that multiply-adds will become a single operation. */
17389 if (aarch64_multiply_add_p (m_vinfo, stmt_info, m_vec_flags))
17390 return;
17392 /* Assume that bool AND with compare operands will become a single
17393 operation. */
17394 if (aarch64_bool_compound_p (m_vinfo, stmt_info, m_vec_flags))
17395 return;
17398 /* Detect the case where we are using an emulated gather/scatter. When a
17399 target does not support gathers and scatters directly the vectorizer
17400 emulates these by constructing an index vector and then issuing an
17401 extraction for every lane in the vector. If the index vector is loaded
17402 from memory, the vector load and extractions are subsequently lowered by
17403 veclower into a series of scalar index loads. After the final loads are
17404 done it issues a vec_construct to recreate the vector from the scalar. For
17405 costing when we see a vec_to_scalar on a stmt with VMAT_GATHER_SCATTER we
17406 are dealing with an emulated instruction and should adjust costing
17407 properly. */
17408 if (kind == vec_to_scalar
17409 && (m_vec_flags & VEC_ADVSIMD)
17410 && vect_mem_access_type (stmt_info, node) == VMAT_GATHER_SCATTER)
17412 auto dr = STMT_VINFO_DATA_REF (stmt_info);
17413 tree dr_ref = DR_REF (dr);
17414 while (handled_component_p (dr_ref))
17416 if (TREE_CODE (dr_ref) == ARRAY_REF)
17418 tree offset = TREE_OPERAND (dr_ref, 1);
17419 if (SSA_VAR_P (offset))
17421 if (gimple_vuse (SSA_NAME_DEF_STMT (offset)))
17423 if (STMT_VINFO_TYPE (stmt_info) == load_vec_info_type)
17424 ops->loads += count - 1;
17425 else
17426 /* Stores need to count both the index into the array and the data
17427 being stored using vec_to_scalar. However we have index stores in
17428 in Adv.SIMD and so we only want to adjust the index
17429 loads. */
17430 ops->loads += count / 2;
17431 return;
17433 break;
17436 dr_ref = TREE_OPERAND (dr_ref, 0);
17440 /* Count the basic operation cost associated with KIND. */
17441 switch (kind)
17443 case cond_branch_taken:
17444 case cond_branch_not_taken:
17445 case vector_gather_load:
17446 case vector_scatter_store:
17447 /* We currently don't expect these to be used in a loop body. */
17448 break;
17450 case vec_perm:
17451 case vec_promote_demote:
17452 case vec_construct:
17453 case vec_to_scalar:
17454 case scalar_to_vec:
17455 case vector_stmt:
17456 case scalar_stmt:
17457 ops->general_ops += count;
17458 break;
17460 case scalar_load:
17461 case vector_load:
17462 case unaligned_load:
17463 ops->loads += count;
17464 if (m_vec_flags || FLOAT_TYPE_P (aarch64_dr_type (stmt_info)))
17465 ops->general_ops += base_issue->fp_simd_load_general_ops * count;
17466 break;
17468 case vector_store:
17469 case unaligned_store:
17470 case scalar_store:
17471 ops->stores += count;
17472 if (m_vec_flags || FLOAT_TYPE_P (aarch64_dr_type (stmt_info)))
17473 ops->general_ops += base_issue->fp_simd_store_general_ops * count;
17474 break;
17477 /* Add any embedded comparison operations. */
17478 if ((kind == scalar_stmt || kind == vector_stmt || kind == vec_to_scalar)
17479 && vect_embedded_comparison_type (stmt_info))
17480 ops->general_ops += count;
17482 /* COND_REDUCTIONS need two sets of VEC_COND_EXPRs, whereas so far we
17483 have only accounted for one. */
17484 if ((kind == vector_stmt || kind == vec_to_scalar)
17485 && vect_reduc_type (m_vinfo, stmt_info) == COND_REDUCTION)
17486 ops->general_ops += count;
17488 /* Count the predicate operations needed by an SVE comparison. */
17489 if (sve_issue && (kind == vector_stmt || kind == vec_to_scalar))
17490 if (tree type = vect_comparison_type (stmt_info))
17492 unsigned int base = (FLOAT_TYPE_P (type)
17493 ? sve_issue->fp_cmp_pred_ops
17494 : sve_issue->int_cmp_pred_ops);
17495 ops->pred_ops += base * count;
17498 /* Add any extra overhead associated with LD[234] and ST[234] operations. */
17499 if (simd_issue)
17500 switch (aarch64_ld234_st234_vectors (kind, stmt_info, node))
17502 case 2:
17503 ops->general_ops += simd_issue->ld2_st2_general_ops * count;
17504 break;
17506 case 3:
17507 ops->general_ops += simd_issue->ld3_st3_general_ops * count;
17508 break;
17510 case 4:
17511 ops->general_ops += simd_issue->ld4_st4_general_ops * count;
17512 break;
17515 /* Add any overhead associated with gather loads and scatter stores. */
17516 if (sve_issue
17517 && (kind == scalar_load || kind == scalar_store)
17518 && vect_mem_access_type (stmt_info, node) == VMAT_GATHER_SCATTER)
17520 unsigned int pairs = CEIL (count, 2);
17521 ops->pred_ops += sve_issue->gather_scatter_pair_pred_ops * pairs;
17522 ops->general_ops += sve_issue->gather_scatter_pair_general_ops * pairs;
17526 /* Return true if STMT_INFO contains a memory access and if the constant
17527 component of the memory address is aligned to SIZE bytes. */
17528 static bool
17529 aarch64_aligned_constant_offset_p (stmt_vec_info stmt_info,
17530 poly_uint64 size)
17532 if (!STMT_VINFO_DATA_REF (stmt_info))
17533 return false;
17535 if (auto first_stmt = DR_GROUP_FIRST_ELEMENT (stmt_info))
17536 stmt_info = first_stmt;
17537 tree constant_offset = DR_INIT (STMT_VINFO_DATA_REF (stmt_info));
17538 /* Needed for gathers & scatters, for example. */
17539 if (!constant_offset)
17540 return false;
17542 return multiple_p (wi::to_poly_offset (constant_offset), size);
17545 /* Check if a scalar or vector stmt could be part of a region of code
17546 that does nothing more than store values to memory, in the scalar
17547 case using STP. Return the cost of the stmt if so, counting 2 for
17548 one instruction. Return ~0U otherwise.
17550 The arguments are a subset of those passed to add_stmt_cost. */
17551 unsigned int
17552 aarch64_stp_sequence_cost (unsigned int count, vect_cost_for_stmt kind,
17553 stmt_vec_info stmt_info, tree vectype)
17555 /* Code that stores vector constants uses a vector_load to create
17556 the constant. We don't apply the heuristic to that case for two
17557 main reasons:
17559 - At the moment, STPs are only formed via peephole2, and the
17560 constant scalar moves would often come between STRs and so
17561 prevent STP formation.
17563 - The scalar code also has to load the constant somehow, and that
17564 isn't costed. */
17565 switch (kind)
17567 case scalar_to_vec:
17568 /* Count 2 insns for a GPR->SIMD dup and 1 insn for a FPR->SIMD dup. */
17569 return (FLOAT_TYPE_P (vectype) ? 2 : 4) * count;
17571 case vec_construct:
17572 if (FLOAT_TYPE_P (vectype))
17573 /* Count 1 insn for the maximum number of FP->SIMD INS
17574 instructions. */
17575 return (vect_nunits_for_cost (vectype) - 1) * 2 * count;
17577 /* Count 2 insns for a GPR->SIMD move and 2 insns for the
17578 maximum number of GPR->SIMD INS instructions. */
17579 return vect_nunits_for_cost (vectype) * 4 * count;
17581 case vector_store:
17582 case unaligned_store:
17583 /* Count 1 insn per vector if we can't form STP Q pairs. */
17584 if (aarch64_sve_mode_p (TYPE_MODE (vectype)))
17585 return count * 2;
17587 if (stmt_info)
17589 /* Assume we won't be able to use STP if the constant offset
17590 component of the address is misaligned. ??? This could be
17591 removed if we formed STP pairs earlier, rather than relying
17592 on peephole2. */
17593 auto size = GET_MODE_SIZE (TYPE_MODE (vectype));
17594 if (!aarch64_aligned_constant_offset_p (stmt_info, size))
17595 return count * 2;
17597 return CEIL (count, 2) * 2;
17599 case scalar_store:
17600 if (stmt_info && STMT_VINFO_DATA_REF (stmt_info))
17602 /* Check for a mode in which STP pairs can be formed. */
17603 auto size = GET_MODE_SIZE (TYPE_MODE (aarch64_dr_type (stmt_info)));
17604 if (maybe_ne (size, 4) && maybe_ne (size, 8))
17605 return ~0U;
17607 /* Assume we won't be able to use STP if the constant offset
17608 component of the address is misaligned. ??? This could be
17609 removed if we formed STP pairs earlier, rather than relying
17610 on peephole2. */
17611 if (!aarch64_aligned_constant_offset_p (stmt_info, size))
17612 return ~0U;
17614 return count;
17616 default:
17617 return ~0U;
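/* Illustrative, standalone sketch (not part of this file): the
   2-units-per-instruction bookkeeping above, applied to the
   "a[0] = x; a[1] = x" example from the comment above m_stp_sequence_cost,
   assuming a 64-bit GPR value and an aligned, non-SVE vector store.  The
   scalar total (two stores that pair into one STP) comes out lower, which
   is roughly what later penalizes the vector loop body.  */

#include <cstdio>

int
main ()
{
  /* Scalar version: two 64-bit scalar_store entries, 1 unit each.  */
  unsigned scalar_total = 1 + 1;
  /* Vector version: one GPR->SIMD scalar_to_vec (4 units) plus one
     paired vector store (2 units).  */
  unsigned vector_total = 4 + 2;
  std::printf ("scalar %u vs vector %u -> %s\n", scalar_total, vector_total,
	       scalar_total < vector_total ? "prefer the STP sequence"
					   : "no penalty");
  return 0;
}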
17621 unsigned
17622 aarch64_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
17623 stmt_vec_info stmt_info, slp_tree node,
17624 tree vectype, int misalign,
17625 vect_cost_model_location where)
17627 fractional_cost stmt_cost
17628 = aarch64_builtin_vectorization_cost (kind, vectype, misalign);
17630 bool in_inner_loop_p = (where == vect_body
17631 && stmt_info
17632 && stmt_in_inner_loop_p (m_vinfo, stmt_info));
17634 /* Do one-time initialization based on the vinfo. */
17635 loop_vec_info loop_vinfo = dyn_cast<loop_vec_info> (m_vinfo);
17636 if (!m_analyzed_vinfo)
17638 if (loop_vinfo)
17639 analyze_loop_vinfo (loop_vinfo);
17641 m_analyzed_vinfo = true;
17644 /* Apply the heuristic described above m_stp_sequence_cost. */
17645 if (m_stp_sequence_cost != ~0U)
17647 uint64_t cost = aarch64_stp_sequence_cost (count, kind,
17648 stmt_info, vectype);
17649 m_stp_sequence_cost = MIN (m_stp_sequence_cost + cost, ~0U);
17652 /* Try to get a more accurate cost by looking at STMT_INFO instead
17653 of just looking at KIND. */
17654 if (stmt_info)
17656 /* If we scalarize a strided store, the vectorizer costs one
17657 vec_to_scalar for each element. However, we can store the first
17658 element using an FP store without a separate extract step. */
17659 if (vect_is_store_elt_extraction (kind, stmt_info))
17660 count -= 1;
17662 stmt_cost = aarch64_detect_scalar_stmt_subtype (m_vinfo, kind,
17663 stmt_info, stmt_cost);
17665 if (vectype && m_vec_flags)
17666 stmt_cost = aarch64_detect_vector_stmt_subtype (m_vinfo, kind,
17667 stmt_info, node,
17668 vectype, where,
17669 stmt_cost);
17671 /* Check if we've seen an SVE gather/scatter operation and which size. */
17672 if (kind == scalar_load
17673 && aarch64_sve_mode_p (TYPE_MODE (vectype))
17674 && vect_mem_access_type (stmt_info, node) == VMAT_GATHER_SCATTER)
17676 const sve_vec_cost *sve_costs = aarch64_tune_params.vec_costs->sve;
17677 if (sve_costs)
17679 /* Test for VNx2 modes, which have 64-bit containers. */
17680 if (known_eq (GET_MODE_NUNITS (TYPE_MODE (vectype)),
17681 aarch64_sve_vg))
17682 m_sve_gather_scatter_init_cost
17683 += sve_costs->gather_load_x64_init_cost;
17684 else
17685 m_sve_gather_scatter_init_cost
17686 += sve_costs->gather_load_x32_init_cost;
17691 /* Do any SVE-specific adjustments to the cost. */
17692 if (stmt_info && vectype && aarch64_sve_mode_p (TYPE_MODE (vectype)))
17693 stmt_cost = aarch64_sve_adjust_stmt_cost (m_vinfo, kind, stmt_info,
17694 vectype, stmt_cost);
17696 /* Vector promotion and demotion requires us to widen the operation first
17697 and only after that perform the conversion. Unfortunately the mid-end
17698 expects this to be doable as a single operation and doesn't pass on
17699 enough context here for us to tell which operation is happening. To
17700 account for this we count every promote-demote operation twice and if
17701 the previously costed operation was also a promote-demote we reduce
17702 the cost of the currently being costed operation to simulate the final
17703 conversion cost. Note that for SVE we can do better here if the converted
17704 value comes from a load since the widening load would consume the widening
17705 operations. However since we're in stage 3 we can't change the helper
17706 vect_is_extending_load and duplicating the code seems not useful. */
17707 gassign *assign = NULL;
17708 if (kind == vec_promote_demote
17709 && (assign = dyn_cast <gassign *> (STMT_VINFO_STMT (stmt_info)))
17710 && gimple_assign_rhs_code (assign) == FLOAT_EXPR)
17712 auto new_count = count * 2 - m_num_last_promote_demote;
17713 m_num_last_promote_demote = count;
17714 count = new_count;
17716 else
17717 m_num_last_promote_demote = 0;
17719 if (stmt_info)
17721 /* Account for any extra "embedded" costs that apply additively
17722 to the base cost calculated above. */
17723 stmt_cost = aarch64_adjust_stmt_cost (m_vinfo, kind, stmt_info, node,
17724 vectype, m_vec_flags, stmt_cost);
17726 /* If we're recording a nonzero vector loop body cost for the
17727 innermost loop, also estimate the operations that would need
17728 to be issued by all relevant implementations of the loop. */
17729 if (loop_vinfo
17730 && (m_costing_for_scalar || where == vect_body)
17731 && (!LOOP_VINFO_LOOP (loop_vinfo)->inner || in_inner_loop_p)
17732 && stmt_cost != 0)
17733 for (auto &ops : m_ops)
17734 count_ops (count, kind, stmt_info, node, &ops);
17736 /* If we're applying the SVE vs. Advanced SIMD unrolling heuristic,
17737 estimate the number of statements in the unrolled Advanced SIMD
17738 loop. For simplicity, we assume that one iteration of the
17739 Advanced SIMD loop would need the same number of statements
17740 as one iteration of the SVE loop. */
17741 if (where == vect_body && m_unrolled_advsimd_niters)
17742 m_unrolled_advsimd_stmts += count * m_unrolled_advsimd_niters;
17744 /* Detect the use of an averaging operation. */
17745 gimple *stmt = stmt_info->stmt;
17746 if (is_gimple_call (stmt)
17747 && gimple_call_internal_p (stmt))
17749 switch (gimple_call_internal_fn (stmt))
17751 case IFN_AVG_FLOOR:
17752 case IFN_AVG_CEIL:
17753 m_has_avg = true;
17754 default:
17755 break;
17760 /* If the statement stores to a decl that is known to be the argument
17761 to a vld1 in the same function, ignore the store for costing purposes.
17762 See the comment above m_stores_to_vector_load_decl for more details. */
17763 if (stmt_info
17764 && (kind == vector_store || kind == unaligned_store)
17765 && aarch64_accesses_vector_load_decl_p (stmt_info))
17767 stmt_cost = 0;
17768 m_stores_to_vector_load_decl = true;
17771 return record_stmt_cost (stmt_info, where, (count * stmt_cost).ceil ());
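/* Illustrative, standalone sketch (not part of this file): the
   promote/demote double-counting above, applied to two back-to-back
   vec_promote_demote groups of two copies each.  The first group is
   charged 2 * 2 = 4 copies, the second 2 * 2 - 2 = 2, simulating the
   shared final conversion step.  */

#include <cstdio>

int
main ()
{
  unsigned counts[] = { 2, 2 };	/* copies costed for consecutive groups */
  unsigned last = 0;
  for (unsigned count : counts)
    {
      unsigned charged = count * 2 - last;
      last = count;
      std::printf ("count %u -> charged %u\n", count, charged);
    }
  return 0;
}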
17774 /* Return true if (a) we're applying the Advanced SIMD vs. SVE unrolling
17775 heuristic described above m_unrolled_advsimd_niters and (b) the heuristic
17776 says that we should prefer the Advanced SIMD loop. */
17777 bool
17778 aarch64_vector_costs::prefer_unrolled_loop () const
17780 if (!m_unrolled_advsimd_stmts)
17781 return false;
17783 if (dump_enabled_p ())
17784 dump_printf_loc (MSG_NOTE, vect_location, "Number of insns in"
17785 " unrolled Advanced SIMD loop = "
17786 HOST_WIDE_INT_PRINT_UNSIGNED "\n",
17787 m_unrolled_advsimd_stmts);
17789 /* The balance here is tricky. On the one hand, we can't be sure whether
17790 the code is vectorizable with Advanced SIMD or not. However, even if
17791 it isn't vectorizable with Advanced SIMD, there's a possibility that
17792 the scalar code could also be unrolled. Some of the code might then
17793 benefit from SLP, or from using LDP and STP. We therefore apply
17794 the heuristic regardless of can_use_advsimd_p. */
17795 return (m_unrolled_advsimd_stmts
17796 && (m_unrolled_advsimd_stmts
17797 <= (unsigned int) param_max_completely_peeled_insns));
17800 /* Subroutine of adjust_body_cost for handling SVE. Use the issue
17801 information in OPS to work out how fast the SVE code can be issued and
17802 compare it to the equivalent value for scalar code
17803 (SCALAR_CYCLES_PER_ITER).
17806 ORIG_BODY_COST is the cost originally passed to adjust_body_cost and
17807 *BODY_COST is the current value of the adjusted cost. *SHOULD_DISPARAGE
17808 is true if we think the loop body is too expensive. */
17810 fractional_cost
17811 aarch64_vector_costs::
17812 adjust_body_cost_sve (const aarch64_vec_op_count *ops,
17813 fractional_cost scalar_cycles_per_iter,
17814 unsigned int orig_body_cost, unsigned int *body_cost,
17815 bool *should_disparage)
17817 if (dump_enabled_p ())
17818 ops->dump ();
17820 fractional_cost sve_pred_cycles_per_iter = ops->min_pred_cycles_per_iter ();
17821 fractional_cost sve_cycles_per_iter = ops->min_cycles_per_iter ();
17823 /* If the scalar version of the loop could issue at least as
17824 quickly as the predicate parts of the SVE loop, make the SVE loop
17825 prohibitively expensive. In this case vectorization is adding an
17826 overhead that the original scalar code didn't have.
17828 This is mostly intended to detect cases in which WHILELOs dominate
17829 for very tight loops, which is something that normal latency-based
17830    costs would not model.  Adding this kind of cliff edge would be
17831 too drastic for scalar_cycles_per_iter vs. sve_cycles_per_iter;
17832 code in the caller handles that case in a more conservative way. */
17833 fractional_cost sve_estimate = sve_pred_cycles_per_iter + 1;
17834 if (scalar_cycles_per_iter < sve_estimate)
17836 unsigned int min_cost
17837 = orig_body_cost * estimated_poly_value (BYTES_PER_SVE_VECTOR);
17838 if (*body_cost < min_cost)
17840 if (dump_enabled_p ())
17841 dump_printf_loc (MSG_NOTE, vect_location,
17842 "Increasing body cost to %d because the"
17843 " scalar code could issue within the limit"
17844 " imposed by predicate operations\n",
17845 min_cost);
17846 *body_cost = min_cost;
17847 *should_disparage = true;
17851 return sve_cycles_per_iter;
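/* Worked example (hypothetical numbers, for illustration only): with
   sve_pred_cycles_per_iter == 2 the estimate above becomes 3, so a scalar
   loop that issues in 2 cycles per (VF-weighted) iteration triggers the
   penalty.  With orig_body_cost == 100 on a target whose estimated SVE
   vector length is 128 bits (16 bytes), the body cost is raised to at
   least 100 * 16 == 1600.  */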
17854 unsigned int
17855 aarch64_vector_costs::determine_suggested_unroll_factor ()
17857 bool sve = m_vec_flags & VEC_ANY_SVE;
17858 /* If we are trying to unroll an Advanced SIMD main loop that contains
17859 an averaging operation that we do not support with SVE and we might use a
17860 predicated epilogue, we need to be conservative and block unrolling as
17861 this might lead to a less optimal loop for the first and only epilogue
17862 using the original loop's vectorization factor.
17863 TODO: Remove this constraint when we add support for multiple epilogue
17864 vectorization. */
17865 if (!sve && !TARGET_SVE2 && m_has_avg)
17866 return 1;
17868 unsigned int max_unroll_factor = 1;
17869 for (auto vec_ops : m_ops)
17871 aarch64_simd_vec_issue_info const *vec_issue
17872 = vec_ops.simd_issue_info ();
17873 if (!vec_issue)
17874 return 1;
17875       /* Limit the unroll factor to a value adjustable by the user; the
17876	 default value is 4.  */
17877 unsigned int unroll_factor = aarch64_vect_unroll_limit;
17878 unsigned int factor
17879 = vec_ops.reduction_latency > 1 ? vec_ops.reduction_latency : 1;
17880 unsigned int temp;
17882 /* Sanity check, this should never happen. */
17883 if ((vec_ops.stores + vec_ops.loads + vec_ops.general_ops) == 0)
17884 return 1;
17886 /* Check stores. */
17887 if (vec_ops.stores > 0)
17889 temp = CEIL (factor * vec_issue->stores_per_cycle,
17890 vec_ops.stores);
17891 unroll_factor = MIN (unroll_factor, temp);
17894 /* Check loads + stores. */
17895 if (vec_ops.loads > 0)
17897 temp = CEIL (factor * vec_issue->loads_stores_per_cycle,
17898 vec_ops.loads + vec_ops.stores);
17899 unroll_factor = MIN (unroll_factor, temp);
17902 /* Check general ops. */
17903 if (vec_ops.general_ops > 0)
17905 temp = CEIL (factor * vec_issue->general_ops_per_cycle,
17906 vec_ops.general_ops);
17907 unroll_factor = MIN (unroll_factor, temp);
17909 max_unroll_factor = MAX (max_unroll_factor, unroll_factor);
17912 /* Make sure unroll factor is power of 2. */
17913 return 1 << ceil_log2 (max_unroll_factor);
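/* A minimal sketch of the calculation above, using hypothetical issue rates
   and operation counts; it is guarded out and only meant to illustrate the
   formulas, not to be part of the backend.  */
#if 0
static unsigned int
aarch64_unroll_factor_example (void)
{
  /* Hypothetical tuning: 2 stores, 3 loads/stores and 4 general ops per
     cycle.  Hypothetical loop body: 1 store, 3 loads, 4 general ops and
     no multi-cycle reduction (factor == 1).  */
  unsigned int unroll = 4;			/* aarch64_vect_unroll_limit.  */
  unroll = MIN (unroll, CEIL (1 * 2, 1));	/* Stores limit: 2.  */
  unroll = MIN (unroll, CEIL (1 * 3, 3 + 1));	/* Loads + stores limit: 1.  */
  unroll = MIN (unroll, CEIL (1 * 4, 4));	/* General ops limit: 1.  */
  return 1 << ceil_log2 (unroll);		/* Rounded to a power of 2: 1.  */
}
#endif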
17916 /* BODY_COST is the cost of a vector loop body. Adjust the cost as necessary
17917 and return the new cost. */
17918 unsigned int
17919 aarch64_vector_costs::
17920 adjust_body_cost (loop_vec_info loop_vinfo,
17921 const aarch64_vector_costs *scalar_costs,
17922 unsigned int body_cost)
17924 if (scalar_costs->m_ops.is_empty () || m_ops.is_empty ())
17925 return body_cost;
17927 const auto &scalar_ops = scalar_costs->m_ops[0];
17928 const auto &vector_ops = m_ops[0];
17929 unsigned int estimated_vf = vect_vf_for_cost (loop_vinfo);
17930 unsigned int orig_body_cost = body_cost;
17931 bool should_disparage = false;
17933 if (dump_enabled_p ())
17934 dump_printf_loc (MSG_NOTE, vect_location,
17935 "Original vector body cost = %d\n", body_cost);
17937 /* If we know we have a single partial vector iteration, cap the VF
17938 to the number of scalar iterations for costing purposes. */
17939 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
17941 auto niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
17942 if (niters < estimated_vf && dump_enabled_p ())
17943 dump_printf_loc (MSG_NOTE, vect_location,
17944 "Scalar loop iterates at most %wd times. Capping VF "
17945 " from %d to %wd\n", niters, estimated_vf, niters);
17947 estimated_vf = MIN (estimated_vf, niters);
17950 fractional_cost scalar_cycles_per_iter
17951 = scalar_ops.min_cycles_per_iter () * estimated_vf;
17953 fractional_cost vector_cycles_per_iter = vector_ops.min_cycles_per_iter ();
17955 if (dump_enabled_p ())
17957 if (IN_RANGE (m_num_vector_iterations, 0, 65536))
17958 dump_printf_loc (MSG_NOTE, vect_location,
17959 "Vector loop iterates at most %wd times\n",
17960 m_num_vector_iterations);
17961 dump_printf_loc (MSG_NOTE, vect_location, "Scalar issue estimate:\n");
17962 scalar_ops.dump ();
17963 dump_printf_loc (MSG_NOTE, vect_location,
17964 " estimated cycles per vector iteration"
17965 " (for VF %d) = %f\n",
17966 estimated_vf, scalar_cycles_per_iter.as_double ());
17969 if (vector_ops.sve_issue_info ())
17971 if (dump_enabled_p ())
17972 dump_printf_loc (MSG_NOTE, vect_location, "SVE issue estimate:\n");
17973 vector_cycles_per_iter
17974 = adjust_body_cost_sve (&vector_ops, scalar_cycles_per_iter,
17975 orig_body_cost, &body_cost, &should_disparage);
17977 if (aarch64_tune_params.vec_costs == &neoverse512tvb_vector_cost)
17979 /* Also take Neoverse V1 tuning into account, doubling the
17980 scalar and Advanced SIMD estimates to account for the
17981 doubling in SVE vector length. */
17982 if (dump_enabled_p ())
17983 dump_printf_loc (MSG_NOTE, vect_location,
17984 "Neoverse V1 estimate:\n");
17985 auto vf_factor = m_ops[1].vf_factor ();
17986 adjust_body_cost_sve (&m_ops[1], scalar_cycles_per_iter * vf_factor,
17987 orig_body_cost, &body_cost, &should_disparage);
17990 else
17992 if (dump_enabled_p ())
17994 dump_printf_loc (MSG_NOTE, vect_location,
17995 "Vector issue estimate:\n");
17996 vector_ops.dump ();
18000 /* Decide whether to stick to latency-based costs or whether to try to
18001 take issue rates into account. */
18002 unsigned int threshold = aarch64_loop_vect_issue_rate_niters;
18003 if (m_vec_flags & VEC_ANY_SVE)
18004 threshold = CEIL (threshold, aarch64_estimated_sve_vq ());
18006 if (m_num_vector_iterations >= 1
18007 && m_num_vector_iterations < threshold)
18009 if (dump_enabled_p ())
18010 dump_printf_loc (MSG_NOTE, vect_location,
18011 "Low iteration count, so using pure latency"
18012 " costs\n");
18014 /* Increase the cost of the vector code if it looks like the scalar code
18015 could issue more quickly. These values are only rough estimates,
18016 so minor differences should only result in minor changes. */
18017 else if (scalar_cycles_per_iter < vector_cycles_per_iter)
18019 body_cost = fractional_cost::scale (body_cost, vector_cycles_per_iter,
18020 scalar_cycles_per_iter);
18021 if (dump_enabled_p ())
18022 dump_printf_loc (MSG_NOTE, vect_location,
18023 "Increasing body cost to %d because scalar code"
18024 " would issue more quickly\n", body_cost);
18026 /* In general, it's expected that the proposed vector code would be able
18027 to issue more quickly than the original scalar code. This should
18028 already be reflected to some extent in the latency-based costs.
18030 However, the latency-based costs effectively assume that the scalar
18031 code and the vector code execute serially, which tends to underplay
18032 one important case: if the real (non-serialized) execution time of
18033 a scalar iteration is dominated by loop-carried dependencies,
18034 and if the vector code is able to reduce both the length of
18035 the loop-carried dependencies *and* the number of cycles needed
18036 to issue the code in general, we can be more confident that the
18037 vector code is an improvement, even if adding the other (non-loop-carried)
18038 latencies tends to hide this saving. We therefore reduce the cost of the
18039 vector loop body in proportion to the saving. */
18040 else if (scalar_ops.reduction_latency > vector_ops.reduction_latency
18041 && scalar_ops.reduction_latency == scalar_cycles_per_iter
18042 && scalar_cycles_per_iter > vector_cycles_per_iter
18043 && !should_disparage)
18045 body_cost = fractional_cost::scale (body_cost, vector_cycles_per_iter,
18046 scalar_cycles_per_iter);
18047 if (dump_enabled_p ())
18048 dump_printf_loc (MSG_NOTE, vect_location,
18049 "Decreasing body cost to %d account for smaller"
18050 " reduction latency\n", body_cost);
18053 return body_cost;
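/* A minimal sketch of the issue-rate scaling above, with made-up numbers;
   guarded out, purely illustrative.  */
#if 0
static unsigned int
aarch64_body_cost_scaling_example (void)
{
  fractional_cost scalar_cycles_per_iter = 4;	/* VF-weighted scalar rate.  */
  fractional_cost vector_cycles_per_iter = 6;	/* Vector rate.  */
  /* The scalar code issues more quickly, so a body cost of 100 is scaled
     up to 100 * 6 / 4 == 150.  */
  return fractional_cost::scale (100, vector_cycles_per_iter,
				 scalar_cycles_per_iter);
}
#endif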
18056 void
18057 aarch64_vector_costs::finish_cost (const vector_costs *uncast_scalar_costs)
18059 /* Record the issue information for any SVE WHILE instructions that the
18060 loop needs. */
18061 loop_vec_info loop_vinfo = dyn_cast<loop_vec_info> (m_vinfo);
18062 if (!m_ops.is_empty ()
18063 && loop_vinfo
18064 && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
18066 unsigned int num_masks = 0;
18067 rgroup_controls *rgm;
18068 unsigned int num_vectors_m1;
18069 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo).rgc_vec,
18070 num_vectors_m1, rgm)
18071 if (rgm->type)
18072 num_masks += num_vectors_m1 + 1;
18073 for (auto &ops : m_ops)
18074 if (auto *issue = ops.sve_issue_info ())
18075 ops.pred_ops += num_masks * issue->while_pred_ops;
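  /* Illustrative arithmetic, not from the original source: a fully-masked
     loop with two mask rgroups that need one and two vectors respectively
     gives num_masks == (0 + 1) + (1 + 1) == 3, so a subtuning whose
     (hypothetical) while_pred_ops is 1 sees its pred_ops grow by 3.  */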
18078 auto *scalar_costs
18079 = static_cast<const aarch64_vector_costs *> (uncast_scalar_costs);
18080 if (loop_vinfo && m_vec_flags)
18082 m_costs[vect_body] = adjust_body_cost (loop_vinfo, scalar_costs,
18083 m_costs[vect_body]);
18084 m_suggested_unroll_factor = determine_suggested_unroll_factor ();
18086       /* For gathers and scatters there's an additional overhead for the
18087	 first iteration.  For low-count loops they're not beneficial, so model
18088	 the overhead as loop prologue costs.  */
18089 m_costs[vect_prologue] += m_sve_gather_scatter_init_cost;
18092 /* Apply the heuristic described above m_stp_sequence_cost. Prefer
18093 the scalar code in the event of a tie, since there is more chance
18094 of scalar code being optimized with surrounding operations.
18096 In addition, if the vector body is a simple store to a decl that
18097 is elsewhere loaded using vld1, strongly prefer the vector form,
18098 to the extent of giving the prologue a zero cost. See the comment
18099 above m_stores_to_vector_load_decl for details. */
18100 if (!loop_vinfo
18101 && scalar_costs
18102 && m_stp_sequence_cost != ~0U)
18104 if (m_stores_to_vector_load_decl)
18105 m_costs[vect_prologue] = 0;
18106 else if (m_stp_sequence_cost >= scalar_costs->m_stp_sequence_cost)
18107 m_costs[vect_body] = 2 * scalar_costs->total_cost ();
18110 vector_costs::finish_cost (scalar_costs);
18113 bool
18114 aarch64_vector_costs::
18115 better_main_loop_than_p (const vector_costs *uncast_other) const
18117 auto other = static_cast<const aarch64_vector_costs *> (uncast_other);
18119 auto this_loop_vinfo = as_a<loop_vec_info> (this->m_vinfo);
18120 auto other_loop_vinfo = as_a<loop_vec_info> (other->m_vinfo);
18122 if (dump_enabled_p ())
18123 dump_printf_loc (MSG_NOTE, vect_location,
18124 "Comparing two main loops (%s at VF %d vs %s at VF %d)\n",
18125 GET_MODE_NAME (this_loop_vinfo->vector_mode),
18126 vect_vf_for_cost (this_loop_vinfo),
18127 GET_MODE_NAME (other_loop_vinfo->vector_mode),
18128 vect_vf_for_cost (other_loop_vinfo));
18130 /* Apply the unrolling heuristic described above
18131 m_unrolled_advsimd_niters. */
18132 if (bool (m_unrolled_advsimd_stmts)
18133 != bool (other->m_unrolled_advsimd_stmts))
18135 bool this_prefer_unrolled = this->prefer_unrolled_loop ();
18136 bool other_prefer_unrolled = other->prefer_unrolled_loop ();
18137 if (this_prefer_unrolled != other_prefer_unrolled)
18139 if (dump_enabled_p ())
18140 dump_printf_loc (MSG_NOTE, vect_location,
18141 "Preferring Advanced SIMD loop because"
18142 " it can be unrolled\n");
18143 return other_prefer_unrolled;
18147 for (unsigned int i = 0; i < m_ops.length (); ++i)
18149 if (dump_enabled_p ())
18151 if (i)
18152 dump_printf_loc (MSG_NOTE, vect_location,
18153 "Reconsidering with subtuning %d\n", i);
18154 dump_printf_loc (MSG_NOTE, vect_location,
18155 "Issue info for %s loop:\n",
18156 GET_MODE_NAME (this_loop_vinfo->vector_mode));
18157 this->m_ops[i].dump ();
18158 dump_printf_loc (MSG_NOTE, vect_location,
18159 "Issue info for %s loop:\n",
18160 GET_MODE_NAME (other_loop_vinfo->vector_mode));
18161 other->m_ops[i].dump ();
18164 auto this_estimated_vf = (vect_vf_for_cost (this_loop_vinfo)
18165 * this->m_ops[i].vf_factor ());
18166 auto other_estimated_vf = (vect_vf_for_cost (other_loop_vinfo)
18167 * other->m_ops[i].vf_factor ());
18169 /* If it appears that one loop could process the same amount of data
18170 in fewer cycles, prefer that loop over the other one. */
18171 fractional_cost this_cost
18172 = this->m_ops[i].min_cycles_per_iter () * other_estimated_vf;
18173 fractional_cost other_cost
18174 = other->m_ops[i].min_cycles_per_iter () * this_estimated_vf;
18175 if (dump_enabled_p ())
18177 dump_printf_loc (MSG_NOTE, vect_location,
18178 "Weighted cycles per iteration of %s loop ~= %f\n",
18179 GET_MODE_NAME (this_loop_vinfo->vector_mode),
18180 this_cost.as_double ());
18181 dump_printf_loc (MSG_NOTE, vect_location,
18182 "Weighted cycles per iteration of %s loop ~= %f\n",
18183 GET_MODE_NAME (other_loop_vinfo->vector_mode),
18184 other_cost.as_double ());
18186 if (this_cost != other_cost)
18188 if (dump_enabled_p ())
18189 dump_printf_loc (MSG_NOTE, vect_location,
18190 "Preferring loop with lower cycles"
18191 " per iteration\n");
18192 return this_cost < other_cost;
18195 /* If the issue rate of SVE code is limited by predicate operations
18196 (i.e. if sve_pred_cycles_per_iter > sve_nonpred_cycles_per_iter),
18197 and if Advanced SIMD code could issue within the limit imposed
18198 by the predicate operations, the predicate operations are adding an
18199 overhead that the original code didn't have and so we should prefer
18200 the Advanced SIMD version. */
18201 auto better_pred_limit_p = [](const aarch64_vec_op_count &a,
18202 const aarch64_vec_op_count &b) -> bool
18204 if (a.pred_ops == 0
18205 && (b.min_pred_cycles_per_iter ()
18206 > b.min_nonpred_cycles_per_iter ()))
18208 if (dump_enabled_p ())
18209 dump_printf_loc (MSG_NOTE, vect_location,
18210 "Preferring Advanced SIMD loop since"
18211 " SVE loop is predicate-limited\n");
18212 return true;
18214 return false;
18216 if (better_pred_limit_p (this->m_ops[i], other->m_ops[i]))
18217 return true;
18218 if (better_pred_limit_p (other->m_ops[i], this->m_ops[i]))
18219 return false;
18222 return vector_costs::better_main_loop_than_p (other);
18225 static void initialize_aarch64_code_model (struct gcc_options *);
18227 /* Parse TOKEN, which has length LENGTH, to see if it is an option
18228    described in FLAG.  If it is, return the index bit for that fusion type.
18229 If not, error (printing OPTION_NAME) and return zero. */
18231 static unsigned int
18232 aarch64_parse_one_option_token (const char *token,
18233 size_t length,
18234 const struct aarch64_flag_desc *flag,
18235 const char *option_name)
18237 for (; flag->name != NULL; flag++)
18239 if (length == strlen (flag->name)
18240 && !strncmp (flag->name, token, length))
18241 return flag->flag;
18244 error ("unknown flag passed in %<-moverride=%s%> (%s)", option_name, token);
18245 return 0;
18248 /* Parse OPTION, which is a '.'-separated list of flags to enable.
18249 FLAGS gives the list of flags we understand, INITIAL_STATE gives any
18250 default state we inherit from the CPU tuning structures. OPTION_NAME
18251 gives the top-level option we are parsing in the -moverride string,
18252 for use in error messages. */
18254 static unsigned int
18255 aarch64_parse_boolean_options (const char *option,
18256 const struct aarch64_flag_desc *flags,
18257 unsigned int initial_state,
18258 const char *option_name)
18260 const char separator = '.';
18261 const char* specs = option;
18262 const char* ntoken = option;
18263 unsigned int found_flags = initial_state;
18265 while ((ntoken = strchr (specs, separator)))
18267 size_t token_length = ntoken - specs;
18268 unsigned token_ops = aarch64_parse_one_option_token (specs,
18269 token_length,
18270 flags,
18271 option_name);
18272 /* If we find "none" (or, for simplicity's sake, an error) anywhere
18273 in the token stream, reset the supported operations. So:
18275 adrp+add.cmp+branch.none.adrp+add
18277 would have the result of turning on only adrp+add fusion. */
18278 if (!token_ops)
18279 found_flags = 0;
18281 found_flags |= token_ops;
18282 specs = ++ntoken;
18285   /* The string ended with a trailing separator; diagnose the ill-formed
	 option.  */
18286 if (!(*specs))
18288 error ("%qs string ill-formed", option_name);
18289 return 0;
18292 /* We still have one more token to parse. */
18293 size_t token_length = strlen (specs);
18294 unsigned token_ops = aarch64_parse_one_option_token (specs,
18295 token_length,
18296 flags,
18297 option_name);
18298 if (!token_ops)
18299 found_flags = 0;
18301 found_flags |= token_ops;
18302 return found_flags;
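/* Worked example (illustrative only): for -moverride=fuse=adrp+add.cmp+branch
   the while loop consumes "adrp+add", the trailing-token code consumes
   "cmp+branch", and the returned value is INITIAL_STATE with the flag bits
   for both fusion types ORed in.  */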
18305 /* Support for overriding instruction fusion. */
18307 static void
18308 aarch64_parse_fuse_string (const char *fuse_string,
18309 struct tune_params *tune)
18311 tune->fusible_ops = aarch64_parse_boolean_options (fuse_string,
18312 aarch64_fusible_pairs,
18313 tune->fusible_ops,
18314 "fuse=");
18317 /* Support for overriding other tuning flags. */
18319 static void
18320 aarch64_parse_tune_string (const char *tune_string,
18321 struct tune_params *tune)
18323 tune->extra_tuning_flags
18324 = aarch64_parse_boolean_options (tune_string,
18325 aarch64_tuning_flags,
18326 tune->extra_tuning_flags,
18327 "tune=");
18330 /* Parse the sve_width tuning moverride string in TUNE_STRING.
18331 Accept the valid SVE vector widths allowed by
18332 aarch64_sve_vector_bits_enum and use it to override sve_width
18333 in TUNE. */
18335 static void
18336 aarch64_parse_sve_width_string (const char *tune_string,
18337 struct tune_params *tune)
18339 int width = -1;
18341 int n = sscanf (tune_string, "%d", &width);
18342 if (n == EOF)
18344 error ("invalid format for %<sve_width%>");
18345 return;
18347 switch (width)
18349 case SVE_128:
18350 case SVE_256:
18351 case SVE_512:
18352 case SVE_1024:
18353 case SVE_2048:
18354 break;
18355 default:
18356 error ("invalid %<sve_width%> value: %d", width);
18358 tune->sve_width = (enum aarch64_sve_vector_bits_enum) width;
18361 /* Parse TOKEN, which has length LENGTH, to see if it is a tuning option
18362    we understand.  If it is, extract the option string and hand off to
18363 the appropriate function. */
18365 void
18366 aarch64_parse_one_override_token (const char* token,
18367 size_t length,
18368 struct tune_params *tune)
18370 const struct aarch64_tuning_override_function *fn
18371 = aarch64_tuning_override_functions;
18373 const char *option_part = strchr (token, '=');
18374 if (!option_part)
18376 error ("tuning string missing in option (%s)", token);
18377 return;
18380 /* Get the length of the option name. */
18381 length = option_part - token;
18382 /* Skip the '=' to get to the option string. */
18383 option_part++;
18385 for (; fn->name != NULL; fn++)
18387 if (!strncmp (fn->name, token, length))
18389 fn->parse_override (option_part, tune);
18390 return;
18394   error ("unknown tuning option (%s)", token);
18395 return;
18398 /* Validate and clamp the TLS size according to the code model in OPTS.  */
18400 static void
18401 initialize_aarch64_tls_size (struct gcc_options *opts)
18403 if (aarch64_tls_size == 0)
18404 aarch64_tls_size = 24;
18406 switch (opts->x_aarch64_cmodel_var)
18408 case AARCH64_CMODEL_TINY:
18409       /* Both the default and the maximum TLS size allowed under tiny are 1M,
18410	 which needs two instructions to address, so we clamp the size to 24.  */
18411 if (aarch64_tls_size > 24)
18412 aarch64_tls_size = 24;
18413 break;
18414 case AARCH64_CMODEL_SMALL:
18415 /* The maximum TLS size allowed under small is 4G. */
18416 if (aarch64_tls_size > 32)
18417 aarch64_tls_size = 32;
18418 break;
18419 case AARCH64_CMODEL_LARGE:
18420       /* The maximum TLS size allowed under large is 16E.
18421	 FIXME: 16E needs a 64-bit offset; we only support a 48-bit offset now.  */
18422 if (aarch64_tls_size > 48)
18423 aarch64_tls_size = 48;
18424 break;
18425 default:
18426 gcc_unreachable ();
18429 return;
18432 /* Return the CPU corresponding to the enum CPU. */
18434 static const struct processor *
18435 aarch64_get_tune_cpu (enum aarch64_cpu cpu)
18437 gcc_assert (cpu != aarch64_no_cpu);
18439 return &all_cores[cpu];
18442 /* Return the architecture corresponding to the enum ARCH. */
18444 static const struct processor *
18445 aarch64_get_arch (enum aarch64_arch arch)
18447 gcc_assert (arch != aarch64_no_arch);
18449 return &all_architectures[arch];
18452 /* Parse STRING looking for options in the format:
18453 string :: option:string
18454 option :: name=substring
18455 name :: {a-z}
18456 substring :: defined by option. */
18458 static void
18459 aarch64_parse_override_string (const char* input_string,
18460 struct tune_params* tune)
18462 const char separator = ':';
18463 size_t string_length = strlen (input_string) + 1;
18464 char *string_root = (char *) xmalloc (sizeof (*string_root) * string_length);
18465 char *string = string_root;
18466 strncpy (string, input_string, string_length);
18467 string[string_length - 1] = '\0';
18469 char* ntoken = string;
18471 while ((ntoken = strchr (string, separator)))
18473 size_t token_length = ntoken - string;
18474       /* NUL-terminate this substring so it can be treated as a string.  */
18475 *ntoken = '\0';
18476 aarch64_parse_one_override_token (string, token_length, tune);
18477 string = ++ntoken;
18480 /* One last option to parse. */
18481 aarch64_parse_one_override_token (string, strlen (string), tune);
18482 free (string_root);
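/* Worked example (illustrative only): -moverride=sve_width=256:fuse=adrp+add
   is split on ':' into "sve_width=256" and "fuse=adrp+add", and each token
   is handed to aarch64_parse_one_override_token in turn.  */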
18485 /* Adjust CURRENT_TUNE (a generic tuning struct) with settings that
18486 are best for a generic target with the currently-enabled architecture
18487 extensions. */
18488 static void
18489 aarch64_adjust_generic_arch_tuning (struct tune_params &current_tune)
18491 /* Neoverse V1 is the only core that is known to benefit from
18492 AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS. There is therefore no
18493 point enabling it for SVE2 and above. */
18494 if (TARGET_SVE2)
18495 current_tune.extra_tuning_flags
18496 &= ~AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS;
18499 static void
18500 aarch64_override_options_after_change_1 (struct gcc_options *opts)
18502 /* PR 70044: We have to be careful about being called multiple times for the
18503 same function. This means all changes should be repeatable. */
18505 /* Set aarch64_use_frame_pointer based on -fno-omit-frame-pointer.
18506 Disable the frame pointer flag so the mid-end will not use a frame
18507 pointer in leaf functions in order to support -fomit-leaf-frame-pointer.
18508 Set x_flag_omit_frame_pointer to the special value 2 to differentiate
18509 between -fomit-frame-pointer (1) and -fno-omit-frame-pointer (2). */
18510 aarch64_use_frame_pointer = opts->x_flag_omit_frame_pointer != 1;
18511 if (opts->x_flag_omit_frame_pointer == 0)
18512 opts->x_flag_omit_frame_pointer = 2;
18514 /* If not optimizing for size, set the default
18515 alignment to what the target wants. */
18516 if (!opts->x_optimize_size)
18518 if (opts->x_flag_align_loops && !opts->x_str_align_loops)
18519 opts->x_str_align_loops = aarch64_tune_params.loop_align;
18520 if (opts->x_flag_align_jumps && !opts->x_str_align_jumps)
18521 opts->x_str_align_jumps = aarch64_tune_params.jump_align;
18522 if (opts->x_flag_align_functions && !opts->x_str_align_functions)
18523 opts->x_str_align_functions = aarch64_tune_params.function_align;
18526 /* We default to no pc-relative literal loads. */
18528 aarch64_pcrelative_literal_loads = false;
18530 /* If -mpc-relative-literal-loads is set on the command line, this
18531 implies that the user asked for PC relative literal loads. */
18532 if (opts->x_pcrelative_literal_loads == 1)
18533 aarch64_pcrelative_literal_loads = true;
18535 /* In the tiny memory model it makes no sense to disallow PC relative
18536 literal pool loads. */
18537 if (aarch64_cmodel == AARCH64_CMODEL_TINY
18538 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
18539 aarch64_pcrelative_literal_loads = true;
18541 /* When enabling the lower precision Newton series for the square root, also
18542 enable it for the reciprocal square root, since the latter is an
18543 intermediary step for the former. */
18544 if (flag_mlow_precision_sqrt)
18545 flag_mrecip_low_precision_sqrt = true;
18548 /* 'Unpack' the internal tuning structs and update the options
18549    in OPTS.  The caller must have set up selected_tune and selected_arch,
18550    as all the other target-specific codegen decisions are
18551    derived from them.  */
18553 void
18554 aarch64_override_options_internal (struct gcc_options *opts)
18556 const struct processor *tune = aarch64_get_tune_cpu (opts->x_selected_tune);
18557 aarch64_tune = tune->sched_core;
18558 /* Make a copy of the tuning parameters attached to the core, which
18559 we may later overwrite. */
18560 aarch64_tune_params = *(tune->tune);
18561 if (tune->tune == &generic_tunings)
18562 aarch64_adjust_generic_arch_tuning (aarch64_tune_params);
18564 if (opts->x_aarch64_override_tune_string)
18565 aarch64_parse_override_string (opts->x_aarch64_override_tune_string,
18566 &aarch64_tune_params);
18568 if (opts->x_aarch64_ldp_policy_param)
18569 aarch64_tune_params.ldp_policy_model = opts->x_aarch64_ldp_policy_param;
18571 if (opts->x_aarch64_stp_policy_param)
18572 aarch64_tune_params.stp_policy_model = opts->x_aarch64_stp_policy_param;
18574 /* This target defaults to strict volatile bitfields. */
18575 if (opts->x_flag_strict_volatile_bitfields < 0 && abi_version_at_least (2))
18576 opts->x_flag_strict_volatile_bitfields = 1;
18578 if (aarch64_stack_protector_guard == SSP_GLOBAL
18579 && opts->x_aarch64_stack_protector_guard_offset_str)
18581 error ("incompatible options %<-mstack-protector-guard=global%> and "
18582 "%<-mstack-protector-guard-offset=%s%>",
18583 aarch64_stack_protector_guard_offset_str);
18586 if (aarch64_stack_protector_guard == SSP_SYSREG
18587 && !(opts->x_aarch64_stack_protector_guard_offset_str
18588 && opts->x_aarch64_stack_protector_guard_reg_str))
18590 error ("both %<-mstack-protector-guard-offset%> and "
18591 "%<-mstack-protector-guard-reg%> must be used "
18592 "with %<-mstack-protector-guard=sysreg%>");
18595 if (opts->x_aarch64_stack_protector_guard_reg_str)
18597 if (strlen (opts->x_aarch64_stack_protector_guard_reg_str) > 100)
18598 error ("specify a system register with a small string length");
18601 if (opts->x_aarch64_stack_protector_guard_offset_str)
18603 char *end;
18604 const char *str = aarch64_stack_protector_guard_offset_str;
18605 errno = 0;
18606 long offs = strtol (aarch64_stack_protector_guard_offset_str, &end, 0);
18607 if (!*str || *end || errno)
18608 error ("%qs is not a valid offset in %qs", str,
18609 "-mstack-protector-guard-offset=");
18610 aarch64_stack_protector_guard_offset = offs;
18613 if ((flag_sanitize & SANITIZE_SHADOW_CALL_STACK)
18614 && !fixed_regs[R18_REGNUM])
18615 error ("%<-fsanitize=shadow-call-stack%> requires %<-ffixed-x18%>");
18617 aarch64_feature_flags isa_flags = aarch64_get_isa_flags (opts);
18618 if ((isa_flags & (AARCH64_FL_SM_ON | AARCH64_FL_ZA_ON))
18619 && !(isa_flags & AARCH64_FL_SME))
18621 if (isa_flags & AARCH64_FL_SM_ON)
18622 error ("streaming functions require the ISA extension %qs", "sme");
18623 else
18624 error ("functions with SME state require the ISA extension %qs",
18625 "sme");
18626 inform (input_location, "you can enable %qs using the command-line"
18627 " option %<-march%>, or by using the %<target%>"
18628 " attribute or pragma", "sme");
18629 opts->x_target_flags &= ~MASK_GENERAL_REGS_ONLY;
18630 auto new_flags = isa_flags | feature_deps::SME ().enable;
18631 aarch64_set_asm_isa_flags (opts, new_flags);
18634 initialize_aarch64_code_model (opts);
18635 initialize_aarch64_tls_size (opts);
18636 aarch64_tpidr_register = opts->x_aarch64_tpidr_reg;
18638 int queue_depth = 0;
18639 switch (aarch64_tune_params.autoprefetcher_model)
18641 case tune_params::AUTOPREFETCHER_OFF:
18642 queue_depth = -1;
18643 break;
18644 case tune_params::AUTOPREFETCHER_WEAK:
18645 queue_depth = 0;
18646 break;
18647 case tune_params::AUTOPREFETCHER_STRONG:
18648 queue_depth = max_insn_queue_index + 1;
18649 break;
18650 default:
18651 gcc_unreachable ();
18654 /* We don't mind passing in global_options_set here as we don't use
18655 the *options_set structs anyway. */
18656 SET_OPTION_IF_UNSET (opts, &global_options_set,
18657 param_sched_autopref_queue_depth, queue_depth);
18659 /* Set up parameters to be used in prefetching algorithm. Do not
18660 override the defaults unless we are tuning for a core we have
18661 researched values for. */
18662 if (aarch64_tune_params.prefetch->num_slots > 0)
18663 SET_OPTION_IF_UNSET (opts, &global_options_set,
18664 param_simultaneous_prefetches,
18665 aarch64_tune_params.prefetch->num_slots);
18666 if (aarch64_tune_params.prefetch->l1_cache_size >= 0)
18667 SET_OPTION_IF_UNSET (opts, &global_options_set,
18668 param_l1_cache_size,
18669 aarch64_tune_params.prefetch->l1_cache_size);
18670 if (aarch64_tune_params.prefetch->l1_cache_line_size >= 0)
18671 SET_OPTION_IF_UNSET (opts, &global_options_set,
18672 param_l1_cache_line_size,
18673 aarch64_tune_params.prefetch->l1_cache_line_size);
18675 if (aarch64_tune_params.prefetch->l1_cache_line_size >= 0)
18677 SET_OPTION_IF_UNSET (opts, &global_options_set,
18678 param_destruct_interfere_size,
18679 aarch64_tune_params.prefetch->l1_cache_line_size);
18680 SET_OPTION_IF_UNSET (opts, &global_options_set,
18681 param_construct_interfere_size,
18682 aarch64_tune_params.prefetch->l1_cache_line_size);
18684 else
18686 /* For a generic AArch64 target, cover the current range of cache line
18687 sizes. */
18688 SET_OPTION_IF_UNSET (opts, &global_options_set,
18689 param_destruct_interfere_size,
18690 256);
18691 SET_OPTION_IF_UNSET (opts, &global_options_set,
18692 param_construct_interfere_size,
18693 64);
18696 if (aarch64_tune_params.prefetch->l2_cache_size >= 0)
18697 SET_OPTION_IF_UNSET (opts, &global_options_set,
18698 param_l2_cache_size,
18699 aarch64_tune_params.prefetch->l2_cache_size);
18700 if (!aarch64_tune_params.prefetch->prefetch_dynamic_strides)
18701 SET_OPTION_IF_UNSET (opts, &global_options_set,
18702 param_prefetch_dynamic_strides, 0);
18703 if (aarch64_tune_params.prefetch->minimum_stride >= 0)
18704 SET_OPTION_IF_UNSET (opts, &global_options_set,
18705 param_prefetch_minimum_stride,
18706 aarch64_tune_params.prefetch->minimum_stride);
18708 /* Use the alternative scheduling-pressure algorithm by default. */
18709 SET_OPTION_IF_UNSET (opts, &global_options_set,
18710 param_sched_pressure_algorithm,
18711 SCHED_PRESSURE_MODEL);
18713 /* Validate the guard size. */
18714 int guard_size = param_stack_clash_protection_guard_size;
18716 if (guard_size != 12 && guard_size != 16)
18717 error ("only values 12 (4 KB) and 16 (64 KB) are supported for guard "
18718 "size. Given value %d (%llu KB) is out of range",
18719 guard_size, (1ULL << guard_size) / 1024ULL);
18721   /* Enforce that the probing interval is the same as the guard size so
18722      the mid-end does the right thing.  */
18723 SET_OPTION_IF_UNSET (opts, &global_options_set,
18724 param_stack_clash_protection_probe_interval,
18725 guard_size);
18727   /* The maybe_set calls won't update the value if the user has explicitly
18728      set one, which means we need to validate that the probing interval and
18729      the guard size are equal.  */
18730 int probe_interval
18731 = param_stack_clash_protection_probe_interval;
18732 if (guard_size != probe_interval)
18733 error ("stack clash guard size %<%d%> must be equal to probing interval "
18734 "%<%d%>", guard_size, probe_interval);
18736   /* Enable software prefetching at the specified optimization level for
18737      CPUs that have prefetch tuning data.  Lower the optimization level
18738      threshold by 1 when profiling is enabled.  */
18739 if (opts->x_flag_prefetch_loop_arrays < 0
18740 && !opts->x_optimize_size
18741 && aarch64_tune_params.prefetch->default_opt_level >= 0
18742 && opts->x_optimize >= aarch64_tune_params.prefetch->default_opt_level)
18743 opts->x_flag_prefetch_loop_arrays = 1;
18745   /* Avoid loop-dependent FMA chains.  */
18746 if (aarch64_tune_params.extra_tuning_flags
18747 & AARCH64_EXTRA_TUNE_AVOID_CROSS_LOOP_FMA)
18748 SET_OPTION_IF_UNSET (opts, &global_options_set, param_avoid_fma_max_bits,
18749 512);
18751 /* Consider fully pipelined FMA in reassociation. */
18752 if (aarch64_tune_params.extra_tuning_flags
18753 & AARCH64_EXTRA_TUNE_FULLY_PIPELINED_FMA)
18754     SET_OPTION_IF_UNSET (opts, &global_options_set, param_fully_pipelined_fma, 1);
18757 aarch64_override_options_after_change_1 (opts);
18760 /* Straight line speculation indicators. */
18761 enum aarch64_sls_hardening_type
18763 SLS_NONE = 0,
18764 SLS_RETBR = 1,
18765 SLS_BLR = 2,
18766 SLS_ALL = 3,
18768 static enum aarch64_sls_hardening_type aarch64_sls_hardening;
18770 /* Return whether we should mitigate Straight Line Speculation for the RET
18771    and BR instructions.  */
18772 bool
18773 aarch64_harden_sls_retbr_p (void)
18775 return aarch64_sls_hardening & SLS_RETBR;
18778 /* Return whether we should mitigate Straight Line Speculation for the BLR
18779    instruction.  */
18780 bool
18781 aarch64_harden_sls_blr_p (void)
18783 return aarch64_sls_hardening & SLS_BLR;
18786 /* For now we only allow setting these options globally; in the future we may
18787    allow setting them per function.  */
18788 static void
18789 aarch64_validate_sls_mitigation (const char *const_str)
18791 char *token_save = NULL;
18792 char *str = NULL;
18794 if (strcmp (const_str, "none") == 0)
18796 aarch64_sls_hardening = SLS_NONE;
18797 return;
18799 if (strcmp (const_str, "all") == 0)
18801 aarch64_sls_hardening = SLS_ALL;
18802 return;
18805 char *str_root = xstrdup (const_str);
18806 str = strtok_r (str_root, ",", &token_save);
18807 if (!str)
18808 error ("invalid argument given to %<-mharden-sls=%>");
18810 int temp = SLS_NONE;
18811 while (str)
18813 if (strcmp (str, "blr") == 0)
18814 temp |= SLS_BLR;
18815 else if (strcmp (str, "retbr") == 0)
18816 temp |= SLS_RETBR;
18817 else if (strcmp (str, "none") == 0 || strcmp (str, "all") == 0)
18819 error ("%qs must be by itself for %<-mharden-sls=%>", str);
18820 break;
18822 else
18824 error ("invalid argument %qs for %<-mharden-sls=%>", str);
18825 break;
18827 str = strtok_r (NULL, ",", &token_save);
18829 aarch64_sls_hardening = (aarch64_sls_hardening_type) temp;
18830 free (str_root);
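/* Worked example (illustrative only): -mharden-sls=retbr,blr sets
   aarch64_sls_hardening to SLS_RETBR | SLS_BLR == SLS_ALL, so both
   aarch64_harden_sls_retbr_p and aarch64_harden_sls_blr_p return true,
   while -mharden-sls=retbr enables only the RET/BR mitigation.  */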
18833 /* Return the VG value associated with -msve-vector-bits= value VALUE. */
18835 static poly_uint16
18836 aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits_enum value)
18838 /* 128-bit SVE and Advanced SIMD modes use different register layouts
18839 on big-endian targets, so we would need to forbid subregs that convert
18840 from one to the other. By default a reinterpret sequence would then
18841 involve a store to memory in one mode and a load back in the other.
18842 Even if we optimize that sequence using reverse instructions,
18843 it would still be a significant potential overhead.
18845 For now, it seems better to generate length-agnostic code for that
18846 case instead. */
18847 if (value == SVE_SCALABLE
18848 || (value == SVE_128 && BYTES_BIG_ENDIAN))
18849 return poly_uint16 (2, 2);
18850 else
18851 return (int) value / 64;
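/* Worked example (illustrative only): -msve-vector-bits=256 gives
   256 / 64 == 4 64-bit granules, i.e. a constant VG of 4, whereas
   -msve-vector-bits=scalable (and 128 on big-endian) yields the
   length-agnostic poly_uint16 (2, 2).  */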
18854 /* Set the global aarch64_asm_isa_flags to FLAGS and update
18855 aarch64_isa_flags accordingly. */
18857 void
18858 aarch64_set_asm_isa_flags (aarch64_feature_flags flags)
18860 aarch64_set_asm_isa_flags (&global_options, flags);
18863 static void
18864 aarch64_handle_no_branch_protection (void)
18866 aarch_ra_sign_scope = AARCH_FUNCTION_NONE;
18867 aarch_enable_bti = 0;
18868 aarch64_enable_gcs = 0;
18871 static void
18872 aarch64_handle_standard_branch_protection (void)
18874 aarch_ra_sign_scope = AARCH_FUNCTION_NON_LEAF;
18875 aarch64_ra_sign_key = AARCH64_KEY_A;
18876 aarch_enable_bti = 1;
18877 aarch64_enable_gcs = 1;
18880 static void
18881 aarch64_handle_pac_ret_protection (void)
18883 aarch_ra_sign_scope = AARCH_FUNCTION_NON_LEAF;
18884 aarch64_ra_sign_key = AARCH64_KEY_A;
18887 static void
18888 aarch64_handle_pac_ret_leaf (void)
18890 aarch_ra_sign_scope = AARCH_FUNCTION_ALL;
18893 static void
18894 aarch64_handle_pac_ret_b_key (void)
18896 aarch64_ra_sign_key = AARCH64_KEY_B;
18899 static void
18900 aarch64_handle_bti_protection (void)
18902 aarch_enable_bti = 1;
18904 static void
18905 aarch64_handle_gcs_protection (void)
18907 aarch64_enable_gcs = 1;
18910 static const struct aarch_branch_protect_type aarch64_pac_ret_subtypes[] = {
18911 { "leaf", false, aarch64_handle_pac_ret_leaf, NULL, 0 },
18912 { "b-key", false, aarch64_handle_pac_ret_b_key, NULL, 0 },
18913 { NULL, false, NULL, NULL, 0 }
18916 static const struct aarch_branch_protect_type aarch64_branch_protect_types[] =
18918 { "none", true, aarch64_handle_no_branch_protection, NULL, 0 },
18919 { "standard", true, aarch64_handle_standard_branch_protection, NULL, 0 },
18920 { "pac-ret", false, aarch64_handle_pac_ret_protection,
18921 aarch64_pac_ret_subtypes, ARRAY_SIZE (aarch64_pac_ret_subtypes) },
18922 { "bti", false, aarch64_handle_bti_protection, NULL, 0 },
18923 { "gcs", false, aarch64_handle_gcs_protection, NULL, 0 },
18924 { NULL, false, NULL, NULL, 0 }
18927 /* Implement TARGET_OPTION_OVERRIDE. This is called once in the beginning
18928 and is used to parse the -m{cpu,tune,arch} strings and setup the initial
18929 tuning structs. In particular it must set selected_tune and
18930 aarch64_asm_isa_flags that define the available ISA features and tuning
18931 decisions. It must also set selected_arch as this will be used to
18932 output the .arch asm tags for each function. */
18934 static void
18935 aarch64_override_options (void)
18937 aarch64_feature_flags cpu_isa = 0;
18938 aarch64_feature_flags arch_isa = 0;
18939 aarch64_set_asm_isa_flags (0);
18941 aarch64_cpu cpu = aarch64_no_cpu;
18942 aarch64_arch arch = aarch64_no_arch;
18943 aarch64_cpu tune = aarch64_no_cpu;
18945 if (aarch64_harden_sls_string)
18946 aarch64_validate_sls_mitigation (aarch64_harden_sls_string);
18948 if (aarch64_branch_protection_string)
18949 aarch_validate_mbranch_protection (aarch64_branch_protect_types,
18950 aarch64_branch_protection_string,
18951 "-mbranch-protection=");
18953 /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
18954 If either of -march or -mtune is given, they override their
18955 respective component of -mcpu. */
18956 if (aarch64_cpu_string)
18957 aarch64_validate_mcpu (aarch64_cpu_string, &cpu, &cpu_isa);
18959 if (aarch64_arch_string)
18960 aarch64_validate_march (aarch64_arch_string, &arch, &arch_isa);
18962 if (aarch64_tune_string)
18963 aarch64_validate_mtune (aarch64_tune_string, &tune);
18965 #ifdef SUBTARGET_OVERRIDE_OPTIONS
18966 SUBTARGET_OVERRIDE_OPTIONS;
18967 #endif
18969 if (cpu != aarch64_no_cpu && arch != aarch64_no_arch)
18971       /* If both -mcpu and -march are specified, warn if they are not
18972	 feature compatible.  Feature compatible means that selecting the
18973	 CPU features would not end up disabling an architecture feature.
18974	 In other words, the CPU features need to be a strict superset of
18975	 the arch features; where they differ, the -march ISA flags are preferred.  */
18976 if (~cpu_isa & arch_isa)
18978 std::string ext_diff
18979 = aarch64_get_extension_string_for_isa_flags (arch_isa, cpu_isa);
18980 warning (0, "switch %<-mcpu=%s%> conflicts with %<-march=%s%> switch "
18981 "and resulted in options %qs being added",
18982 aarch64_cpu_string,
18983 aarch64_arch_string,
18984 ext_diff.c_str ());
18987 selected_arch = arch;
18988 aarch64_set_asm_isa_flags (arch_isa | AARCH64_FL_DEFAULT_ISA_MODE);
18990 else if (cpu != aarch64_no_cpu)
18992 selected_arch = aarch64_get_tune_cpu (cpu)->arch;
18993 aarch64_set_asm_isa_flags (cpu_isa | AARCH64_FL_DEFAULT_ISA_MODE);
18995 else if (arch != aarch64_no_arch)
18997 cpu = aarch64_get_arch (arch)->ident;
18998 selected_arch = arch;
18999 aarch64_set_asm_isa_flags (arch_isa | AARCH64_FL_DEFAULT_ISA_MODE);
19001 else
19003 /* No -mcpu or -march specified, so use the default CPU. */
19004 cpu = TARGET_CPU_DEFAULT;
19005 const processor *cpu_info = aarch64_get_tune_cpu (cpu);
19006 selected_arch = cpu_info->arch;
19007 aarch64_set_asm_isa_flags (cpu_info->flags
19008 | AARCH64_FL_DEFAULT_ISA_MODE);
19011 selected_tune = (tune != aarch64_no_cpu) ? tune : cpu;
19013 if (aarch_enable_bti == 2)
19015 #ifdef TARGET_ENABLE_BTI
19016 aarch_enable_bti = 1;
19017 #else
19018 aarch_enable_bti = 0;
19019 #endif
19022 if (aarch64_enable_gcs == 2)
19024 #ifdef TARGET_ENABLE_GCS
19025 aarch64_enable_gcs = 1;
19026 #else
19027 aarch64_enable_gcs = 0;
19028 #endif
19031 /* Return address signing is currently not supported for ILP32 targets. For
19032 LP64 targets use the configured option in the absence of a command-line
19033 option for -mbranch-protection. */
19034 if (!TARGET_ILP32 && aarch64_branch_protection_string == NULL)
19036 #ifdef TARGET_ENABLE_PAC_RET
19037 aarch_ra_sign_scope = AARCH_FUNCTION_NON_LEAF;
19038 #else
19039 aarch_ra_sign_scope = AARCH_FUNCTION_NONE;
19040 #endif
19043 #ifndef HAVE_AS_MABI_OPTION
19044 /* The compiler may have been configured with 2.23.* binutils, which does
19045 not have support for ILP32. */
19046 if (TARGET_ILP32)
19047 error ("assembler does not support %<-mabi=ilp32%>");
19048 #endif
19049 if (TARGET_ILP32)
19050 warning (OPT_Wdeprecated, "%<-mabi=ilp32%> is deprecated");
19052 /* Convert -msve-vector-bits to a VG count. */
19053 aarch64_sve_vg = aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits);
19055 if (aarch_ra_sign_scope != AARCH_FUNCTION_NONE && TARGET_ILP32)
19056 sorry ("return address signing is only supported for %<-mabi=lp64%>");
19058 /* The pass to insert speculation tracking runs before
19059 shrink-wrapping and the latter does not know how to update the
19060 tracking status. So disable it in this case. */
19061 if (aarch64_track_speculation)
19062 flag_shrink_wrap = 0;
19064 aarch64_override_options_internal (&global_options);
19066 /* Save these options as the default ones in case we push and pop them later
19067 while processing functions with potential target attributes. */
19068 target_option_default_node = target_option_current_node
19069 = build_target_option_node (&global_options, &global_options_set);
19072 /* Implement targetm.override_options_after_change. */
19074 static void
19075 aarch64_override_options_after_change (void)
19077 aarch64_override_options_after_change_1 (&global_options);
19080 /* Implement the TARGET_OFFLOAD_OPTIONS hook. */
19081 static char *
19082 aarch64_offload_options (void)
19084 if (TARGET_ILP32)
19085 return xstrdup ("-foffload-abi=ilp32 -foffload-abi-host-opts=-mabi=ilp32");
19086 else
19087 return xstrdup ("-foffload-abi=lp64 -foffload-abi-host-opts=-mabi=lp64");
19090 static struct machine_function *
19091 aarch64_init_machine_status (void)
19093 struct machine_function *machine;
19094 machine = ggc_cleared_alloc<machine_function> ();
19095 return machine;
19098 void
19099 aarch64_init_expanders (void)
19101 init_machine_status = aarch64_init_machine_status;
19104 /* Derive the final code model from OPTS, adjusting for PIC where needed.  */
19105 static void
19106 initialize_aarch64_code_model (struct gcc_options *opts)
19108 aarch64_cmodel = opts->x_aarch64_cmodel_var;
19109 switch (opts->x_aarch64_cmodel_var)
19111 case AARCH64_CMODEL_TINY:
19112 if (opts->x_flag_pic)
19113 aarch64_cmodel = AARCH64_CMODEL_TINY_PIC;
19114 break;
19115 case AARCH64_CMODEL_SMALL:
19116 if (opts->x_flag_pic)
19118 #ifdef HAVE_AS_SMALL_PIC_RELOCS
19119 aarch64_cmodel = (flag_pic == 2
19120 ? AARCH64_CMODEL_SMALL_PIC
19121 : AARCH64_CMODEL_SMALL_SPIC);
19122 #else
19123 aarch64_cmodel = AARCH64_CMODEL_SMALL_PIC;
19124 #endif
19126 break;
19127 case AARCH64_CMODEL_LARGE:
19128 if (opts->x_flag_pic)
19129 sorry ("code model %qs with %<-f%s%>", "large",
19130 opts->x_flag_pic > 1 ? "PIC" : "pic");
19131 if (opts->x_aarch64_abi == AARCH64_ABI_ILP32)
19132 sorry ("code model %qs not supported in ilp32 mode", "large");
19133 break;
19134 case AARCH64_CMODEL_TINY_PIC:
19135 case AARCH64_CMODEL_SMALL_PIC:
19136 case AARCH64_CMODEL_SMALL_SPIC:
19137 gcc_unreachable ();
19141 /* Implements TARGET_OPTION_RESTORE. Restore the backend codegen decisions
19142 using the information saved in PTR. */
19144 static void
19145 aarch64_option_restore (struct gcc_options *opts,
19146 struct gcc_options * /* opts_set */,
19147 struct cl_target_option * /* ptr */)
19149 aarch64_override_options_internal (opts);
19152 /* Implement TARGET_OPTION_PRINT. */
19154 static void
19155 aarch64_option_print (FILE *file, int indent, struct cl_target_option *ptr)
19157 const struct processor *cpu
19158 = aarch64_get_tune_cpu (ptr->x_selected_tune);
19159 const struct processor *arch = aarch64_get_arch (ptr->x_selected_arch);
19160 aarch64_feature_flags isa_flags = aarch64_get_asm_isa_flags(ptr);
19161 std::string extension
19162 = aarch64_get_extension_string_for_isa_flags (isa_flags, arch->flags);
19164 fprintf (file, "%*sselected tune = %s\n", indent, "", cpu->name);
19165 fprintf (file, "%*sselected arch = %s%s\n", indent, "",
19166 arch->name, extension.c_str ());
19169 static GTY(()) tree aarch64_previous_fndecl;
19171 void
19172 aarch64_reset_previous_fndecl (void)
19174 aarch64_previous_fndecl = NULL;
19177 /* Restore or save the TREE_TARGET_GLOBALS from or to NEW_TREE.
19178 Used by aarch64_set_current_function and aarch64_pragma_target_parse to
19179 make sure optab availability predicates are recomputed when necessary. */
19181 void
19182 aarch64_save_restore_target_globals (tree new_tree)
19184 if (TREE_TARGET_GLOBALS (new_tree))
19185 restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
19186 else if (new_tree == target_option_default_node)
19187 restore_target_globals (&default_target_globals);
19188 else
19189 TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
19192 /* Return the target_option_node for FNDECL, or the current options
19193 if FNDECL is null. */
19195 static tree
19196 aarch64_fndecl_options (tree fndecl)
19198 if (!fndecl)
19199 return target_option_current_node;
19201 if (tree options = DECL_FUNCTION_SPECIFIC_TARGET (fndecl))
19202 return options;
19204 return target_option_default_node;
19207 /* Implement TARGET_SET_CURRENT_FUNCTION. Unpack the codegen decisions
19208 like tuning and ISA features from the DECL_FUNCTION_SPECIFIC_TARGET
19209 of the function, if such exists. This function may be called multiple
19210 times on a single function so use aarch64_previous_fndecl to avoid
19211 setting up identical state. */
19213 static void
19214 aarch64_set_current_function (tree fndecl)
19216 tree old_tree = aarch64_fndecl_options (aarch64_previous_fndecl);
19217 tree new_tree = aarch64_fndecl_options (fndecl);
19219 auto new_isa_mode = (fndecl
19220 ? aarch64_fndecl_isa_mode (fndecl)
19221 : AARCH64_DEFAULT_ISA_MODE);
19222 auto isa_flags = aarch64_get_isa_flags (TREE_TARGET_OPTION (new_tree));
19224 static bool reported_zt0_p;
19225 if (!reported_zt0_p
19226 && !(isa_flags & AARCH64_FL_SME2)
19227 && fndecl
19228 && aarch64_fndecl_has_state (fndecl, "zt0"))
19230 error ("functions with %qs state require the ISA extension %qs",
19231 "zt0", "sme2");
19232 inform (input_location, "you can enable %qs using the command-line"
19233 " option %<-march%>, or by using the %<target%>"
19234 " attribute or pragma", "sme2");
19235 reported_zt0_p = true;
19238 /* If nothing to do, return. #pragma GCC reset or #pragma GCC pop to
19239 the default have been handled by aarch64_save_restore_target_globals from
19240 aarch64_pragma_target_parse. */
19241 if (old_tree == new_tree
19242 && (!fndecl || aarch64_previous_fndecl)
19243 && (isa_flags & AARCH64_FL_ISA_MODES).val[0] == new_isa_mode)
19245 gcc_assert (AARCH64_ISA_MODE == new_isa_mode);
19246 return;
19249 aarch64_previous_fndecl = fndecl;
19251 /* First set the target options. */
19252 cl_target_option_restore (&global_options, &global_options_set,
19253 TREE_TARGET_OPTION (new_tree));
19255 /* The ISA mode can vary based on function type attributes and
19256 function declaration attributes. Make sure that the target
19257 options correctly reflect these attributes. */
19258 if ((isa_flags & AARCH64_FL_ISA_MODES).val[0] != new_isa_mode)
19260 auto base_flags = (aarch64_asm_isa_flags & ~AARCH64_FL_ISA_MODES);
19261 aarch64_set_asm_isa_flags (base_flags
19262 | aarch64_feature_flags (new_isa_mode));
19264 aarch64_override_options_internal (&global_options);
19265 new_tree = build_target_option_node (&global_options,
19266 &global_options_set);
19267 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_tree;
19269 tree new_optimize = build_optimization_node (&global_options,
19270 &global_options_set);
19271 if (new_optimize != optimization_default_node)
19272 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
19275 aarch64_save_restore_target_globals (new_tree);
19277 gcc_assert (AARCH64_ISA_MODE == new_isa_mode);
19280 /* Enum describing the various ways we can handle attributes.
19281 In many cases we can reuse the generic option handling machinery. */
19283 enum aarch64_attr_opt_type
19285 aarch64_attr_mask, /* Attribute should set a bit in target_flags. */
19286 aarch64_attr_bool, /* Attribute sets or unsets a boolean variable. */
19287 aarch64_attr_enum, /* Attribute sets an enum variable. */
19288 aarch64_attr_custom /* Attribute requires a custom handling function. */
19291 /* All the information needed to handle a target attribute.
19292 NAME is the name of the attribute.
19293 ATTR_TYPE specifies the type of behavior of the attribute as described
19294 in the definition of enum aarch64_attr_opt_type.
19295 ALLOW_NEG is true if the attribute supports a "no-" form.
19296    HANDLER is the function that takes the attribute string as an argument.
19297 It is needed only when the ATTR_TYPE is aarch64_attr_custom.
19298 OPT_NUM is the enum specifying the option that the attribute modifies.
19299 This is needed for attributes that mirror the behavior of a command-line
19300    option, that is, one that has ATTR_TYPE aarch64_attr_mask, aarch64_attr_bool or
19301 aarch64_attr_enum. */
19303 struct aarch64_attribute_info
19305 const char *name;
19306 enum aarch64_attr_opt_type attr_type;
19307 bool allow_neg;
19308 bool (*handler) (const char *);
19309 enum opt_code opt_num;
19312 /* Handle the ARCH_STR argument to the arch= target attribute. */
19314 static bool
19315 aarch64_handle_attr_arch (const char *str)
19317 aarch64_arch tmp_arch = aarch64_no_arch;
19318 std::string invalid_extension;
19319 aarch64_feature_flags tmp_flags;
19320 enum aarch_parse_opt_result parse_res
19321 = aarch64_parse_arch (str, &tmp_arch, &tmp_flags, &invalid_extension);
19323 if (parse_res == AARCH_PARSE_OK)
19325 gcc_assert (tmp_arch != aarch64_no_arch);
19326 selected_arch = tmp_arch;
19327 aarch64_set_asm_isa_flags (tmp_flags | (aarch64_asm_isa_flags
19328 & AARCH64_FL_ISA_MODES));
19329 return true;
19332 switch (parse_res)
19334 case AARCH_PARSE_MISSING_ARG:
19335 error ("missing name in %<target(\"arch=\")%> pragma or attribute");
19336 break;
19337 case AARCH_PARSE_INVALID_ARG:
19338 error ("invalid name %qs in %<target(\"arch=\")%> pragma or attribute", str);
19339 aarch64_print_hint_for_arch (str);
19340 break;
19341 case AARCH_PARSE_INVALID_FEATURE:
19342 error ("invalid feature modifier %s of value %qs in "
19343 "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
19344 aarch64_print_hint_for_extensions (invalid_extension.c_str ());
19345 break;
19346 default:
19347 gcc_unreachable ();
19350 return false;
19353 /* Handle the argument CPU_STR to the cpu= target attribute. */
19355 static bool
19356 aarch64_handle_attr_cpu (const char *str)
19358 aarch64_cpu tmp_cpu = aarch64_no_cpu;
19359 std::string invalid_extension;
19360 aarch64_feature_flags tmp_flags;
19361 enum aarch_parse_opt_result parse_res
19362 = aarch64_parse_cpu (str, &tmp_cpu, &tmp_flags, &invalid_extension);
19364 if (parse_res == AARCH_PARSE_OK)
19366 gcc_assert (tmp_cpu != aarch64_no_cpu);
19367 selected_tune = tmp_cpu;
19368 selected_arch = aarch64_get_tune_cpu (tmp_cpu)->arch;
19369 aarch64_set_asm_isa_flags (tmp_flags | (aarch64_asm_isa_flags
19370 & AARCH64_FL_ISA_MODES));
19371 return true;
19374 switch (parse_res)
19376 case AARCH_PARSE_MISSING_ARG:
19377 error ("missing name in %<target(\"cpu=\")%> pragma or attribute");
19378 break;
19379 case AARCH_PARSE_INVALID_ARG:
19380 error ("invalid name %qs in %<target(\"cpu=\")%> pragma or attribute", str);
19381 aarch64_print_hint_for_core (str);
19382 break;
19383 case AARCH_PARSE_INVALID_FEATURE:
19384 error ("invalid feature modifier %qs of value %qs in "
19385 "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
19386 aarch64_print_hint_for_extensions (invalid_extension.c_str ());
19387 break;
19388 default:
19389 gcc_unreachable ();
19392 return false;
19395 /* Handle the argument STR to the branch-protection= attribute. */
19397 static bool
19398 aarch64_handle_attr_branch_protection (const char* str)
19400 return aarch_validate_mbranch_protection (aarch64_branch_protect_types, str,
19401 "target(\"branch-protection=\")");
19404 /* Handle the argument STR to the tune= target attribute. */
19406 static bool
19407 aarch64_handle_attr_tune (const char *str)
19409 aarch64_cpu tmp_tune = aarch64_no_cpu;
19410 enum aarch_parse_opt_result parse_res
19411 = aarch64_parse_tune (str, &tmp_tune);
19413 if (parse_res == AARCH_PARSE_OK)
19415 gcc_assert (tmp_tune != aarch64_no_cpu);
19416 selected_tune = tmp_tune;
19417 return true;
19420 switch (parse_res)
19422 case AARCH_PARSE_INVALID_ARG:
19423 error ("invalid name %qs in %<target(\"tune=\")%> pragma or attribute", str);
19424 aarch64_print_hint_for_core (str);
19425 break;
19426 default:
19427 gcc_unreachable ();
19430 return false;
19433 /* Parse an architecture extensions target attribute string specified in STR.
19434 For example "+fp+nosimd". Show any errors if needed. Return TRUE
19435 if successful. Update aarch64_isa_flags to reflect the ISA features
19436 modified. */
19438 static bool
19439 aarch64_handle_attr_isa_flags (char *str)
19441 enum aarch_parse_opt_result parse_res;
19442 auto isa_flags = aarch64_asm_isa_flags;
19444   /* We allow "+nothing" at the beginning to clear out all architectural
19445 features if the user wants to handpick specific features. */
19446 if (strncmp ("+nothing", str, 8) == 0)
19448 isa_flags &= AARCH64_FL_ISA_MODES;
19449 str += 8;
19452 std::string invalid_extension;
19453 parse_res = aarch64_parse_extension (str, &isa_flags, &invalid_extension);
19455 if (parse_res == AARCH_PARSE_OK)
19457 aarch64_set_asm_isa_flags (isa_flags);
19458 return true;
19461 switch (parse_res)
19463 case AARCH_PARSE_MISSING_ARG:
19464 error ("missing value in %<target()%> pragma or attribute");
19465 break;
19467 case AARCH_PARSE_INVALID_FEATURE:
19468 error ("invalid feature modifier %qs of value %qs in "
19469 "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
19470 break;
19472 default:
19473 gcc_unreachable ();
19476 return false;
19479 /* The target attributes that we support. On top of these we also support just
19480 ISA extensions, like __attribute__ ((target ("+crc"))), but that case is
19481 handled explicitly in aarch64_process_one_target_attr. */
19483 static const struct aarch64_attribute_info aarch64_attributes[] =
19485 { "general-regs-only", aarch64_attr_mask, false, NULL,
19486 OPT_mgeneral_regs_only },
19487 { "fix-cortex-a53-835769", aarch64_attr_bool, true, NULL,
19488 OPT_mfix_cortex_a53_835769 },
19489 { "fix-cortex-a53-843419", aarch64_attr_bool, true, NULL,
19490 OPT_mfix_cortex_a53_843419 },
19491 { "cmodel", aarch64_attr_enum, false, NULL, OPT_mcmodel_ },
19492 { "strict-align", aarch64_attr_mask, true, NULL, OPT_mstrict_align },
19493 { "omit-leaf-frame-pointer", aarch64_attr_bool, true, NULL,
19494 OPT_momit_leaf_frame_pointer },
19495 { "tls-dialect", aarch64_attr_enum, false, NULL, OPT_mtls_dialect_ },
19496 { "arch", aarch64_attr_custom, false, aarch64_handle_attr_arch,
19497 OPT_march_ },
19498 { "cpu", aarch64_attr_custom, false, aarch64_handle_attr_cpu, OPT_mcpu_ },
19499 { "tune", aarch64_attr_custom, false, aarch64_handle_attr_tune,
19500 OPT_mtune_ },
19501 { "branch-protection", aarch64_attr_custom, false,
19502 aarch64_handle_attr_branch_protection, OPT_mbranch_protection_ },
19503 { "sign-return-address", aarch64_attr_enum, false, NULL,
19504 OPT_msign_return_address_ },
19505 { "outline-atomics", aarch64_attr_bool, true, NULL,
19506 OPT_moutline_atomics},
19507 { NULL, aarch64_attr_custom, false, NULL, OPT____ }
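/* For illustration, a declaration such as

     __attribute__ ((target ("tune=cortex-a57,no-omit-leaf-frame-pointer")))
     int foo (int);

   exercises this table: the string is split on ',', "tune=cortex-a57" is
   routed to aarch64_handle_attr_tune, and the "no-" prefix on the boolean
   omit-leaf-frame-pointer entry is accepted via its allow_neg flag.  The
   CPU name here is only an example.  */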
19510 /* Parse ARG_STR which contains the definition of one target attribute.
19511 Show appropriate errors if any or return true if the attribute is valid. */
19513 static bool
19514 aarch64_process_one_target_attr (char *arg_str)
19516 bool invert = false;
19518 size_t len = strlen (arg_str);
19520 if (len == 0)
19522 error ("malformed %<target()%> pragma or attribute");
19523 return false;
19526 auto_vec<char, 32> buffer;
19527 buffer.safe_grow (len + 1);
19528 char *str_to_check = buffer.address ();
19529 memcpy (str_to_check, arg_str, len + 1);
19531 /* We have something like __attribute__ ((target ("+fp+nosimd"))).
19532 It is easier to detect and handle it explicitly here rather than going
19533 through the machinery for the rest of the target attributes in this
19534 function. */
19535 if (*str_to_check == '+')
19536 return aarch64_handle_attr_isa_flags (str_to_check);
19538 if (len > 3 && startswith (str_to_check, "no-"))
19540 invert = true;
19541 str_to_check += 3;
19543 char *arg = strchr (str_to_check, '=');
19545 /* If we found opt=foo then terminate STR_TO_CHECK at the '='
19546 and point ARG to "foo". */
19547 if (arg)
19549 *arg = '\0';
19550 arg++;
19552 const struct aarch64_attribute_info *p_attr;
19553 bool found = false;
19554 for (p_attr = aarch64_attributes; p_attr->name; p_attr++)
19556 /* If the names don't match up, or the user has given an argument
19557 to an attribute that doesn't accept one, or didn't give an argument
19558 to an attribute that expects one, fail to match. */
19559 if (strcmp (str_to_check, p_attr->name) != 0)
19560 continue;
19562 found = true;
19563 bool attr_need_arg_p = p_attr->attr_type == aarch64_attr_custom
19564 || p_attr->attr_type == aarch64_attr_enum;
19566 if (attr_need_arg_p ^ (arg != NULL))
19568 error ("pragma or attribute %<target(\"%s\")%> does not accept an argument", str_to_check);
19569 return false;
19572 /* If the name matches but the attribute does not allow "no-" versions
19573 then we can't match. */
19574 if (invert && !p_attr->allow_neg)
19576 error ("pragma or attribute %<target(\"%s\")%> does not allow a negated form", str_to_check);
19577 return false;
19580 switch (p_attr->attr_type)
19582 /* Has a custom handler registered.
19583 For example, cpu=, arch=, tune=. */
19584 case aarch64_attr_custom:
19585 gcc_assert (p_attr->handler);
19586 if (!p_attr->handler (arg))
19587 return false;
19588 break;
19590 /* Either set or unset a boolean option. */
19591 case aarch64_attr_bool:
19593 struct cl_decoded_option decoded;
19595 generate_option (p_attr->opt_num, NULL, !invert,
19596 CL_TARGET, &decoded);
19597 aarch64_handle_option (&global_options, &global_options_set,
19598 &decoded, input_location);
19599 break;
19601 /* Set or unset a bit in the target_flags. aarch64_handle_option
19602 should know what mask to apply given the option number. */
19603 case aarch64_attr_mask:
19605 struct cl_decoded_option decoded;
19606 /* We only need to specify the option number.
19607 aarch64_handle_option will know which mask to apply. */
19608 decoded.opt_index = p_attr->opt_num;
19609 decoded.value = !invert;
19610 aarch64_handle_option (&global_options, &global_options_set,
19611 &decoded, input_location);
19612 break;
19614 /* Use the option setting machinery to set an option to an enum. */
19615 case aarch64_attr_enum:
19617 gcc_assert (arg);
19618 bool valid;
19619 int value;
19620 valid = opt_enum_arg_to_value (p_attr->opt_num, arg,
19621 &value, CL_TARGET);
19622 if (valid)
19624 set_option (&global_options, NULL, p_attr->opt_num, value,
19625 NULL, DK_UNSPECIFIED, input_location,
19626 global_dc);
19628 else
19630 error ("pragma or attribute %<target(\"%s=%s\")%> is not valid", str_to_check, arg);
19632 break;
19634 default:
19635 gcc_unreachable ();
19639   /* If we reach here, we have either found an attribute and validated
19640      it, or found no match at all.  If we matched an attribute but its
19641      arguments were malformed, we will have returned false already.  */
19642 return found;
19645 /* Count how many times the character C appears in
19646 NULL-terminated string STR. */
19648 static unsigned int
19649 num_occurences_in_str (char c, char *str)
19651 unsigned int res = 0;
19652 while (*str != '\0')
19654 if (*str == c)
19655 res++;
19657 str++;
19660 return res;
19663 /* Parse the tree in ARGS that contains the target attribute information
19664 and update the global target options space. */
19666 bool
19667 aarch64_process_target_attr (tree args)
19669 if (TREE_CODE (args) == TREE_LIST)
19673 tree head = TREE_VALUE (args);
19674 if (head)
19676 if (!aarch64_process_target_attr (head))
19677 return false;
19679 args = TREE_CHAIN (args);
19680 } while (args);
19682 return true;
19685 if (TREE_CODE (args) != STRING_CST)
19687 error ("attribute %<target%> argument not a string");
19688 return false;
19691 size_t len = strlen (TREE_STRING_POINTER (args));
19692 auto_vec<char, 32> buffer;
19693 buffer.safe_grow (len + 1);
19694 char *str_to_check = buffer.address ();
19695 memcpy (str_to_check, TREE_STRING_POINTER (args), len + 1);
19697 if (len == 0)
19699 error ("malformed %<target()%> pragma or attribute");
19700 return false;
19703   /* Used to catch empty entries between commas, i.e.
19704 attribute ((target ("attr1,,attr2"))). */
19705 unsigned int num_commas = num_occurences_in_str (',', str_to_check);
19707 /* Handle multiple target attributes separated by ','. */
19708 char *token = strtok_r (str_to_check, ",", &str_to_check);
19710 unsigned int num_attrs = 0;
19711 while (token)
19713 num_attrs++;
19714 if (!aarch64_process_one_target_attr (token))
19716 /* Check if token is possibly an arch extension without
19717 leading '+'. */
19718 aarch64_feature_flags isa_temp = 0;
19719 auto with_plus = std::string ("+") + token;
19720 enum aarch_parse_opt_result ext_res
19721 = aarch64_parse_extension (with_plus.c_str (), &isa_temp, nullptr);
19723 if (ext_res == AARCH_PARSE_OK)
19724 error ("arch extension %qs should be prefixed by %<+%>",
19725 token);
19726 else
19727 error ("pragma or attribute %<target(\"%s\")%> is not valid", token);
19728 return false;
19731 token = strtok_r (NULL, ",", &str_to_check);
19734 if (num_attrs != num_commas + 1)
19736 error ("malformed %<target(\"%s\")%> pragma or attribute", TREE_STRING_POINTER (args));
19737 return false;
19740 return true;
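/* For illustration, the loop above is what turns
   target ("arch=armv8.2-a,+sve") into two calls to
   aarch64_process_one_target_attr, and what turns the common mistake
   target ("sve") into a hint along the lines of "arch extension 'sve'
   should be prefixed by '+'", since the token parses as an extension once
   a '+' is prepended.  The architecture and extension names are examples
   only.  */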
19743 static bool aarch64_process_target_version_attr (tree args);
19745 /* Implement TARGET_OPTION_VALID_ATTRIBUTE_P. This is used to
19746 process attribute ((target ("..."))). */
19748 static bool
19749 aarch64_option_valid_attribute_p (tree fndecl, tree, tree args, int)
19751 struct cl_target_option cur_target;
19752 bool ret;
19753 tree old_optimize;
19754 tree new_target, new_optimize;
19755 tree existing_target = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
19757 /* If what we're processing is the current pragma string then the
19758 target option node is already stored in target_option_current_node
19759 by aarch64_pragma_target_parse in aarch64-c.cc. Use that to avoid
19760 having to re-parse the string. This is especially useful to keep
19761 arm_neon.h compile times down since that header contains a lot
19762 of intrinsics enclosed in pragmas. */
19763 if (!existing_target && args == current_target_pragma)
19765 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = target_option_current_node;
19766 return true;
19768 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
19770 old_optimize
19771 = build_optimization_node (&global_options, &global_options_set);
19772 func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
19774 /* If the function changed the optimization levels as well as setting
19775 target options, start with the optimizations specified. */
19776 if (func_optimize && func_optimize != old_optimize)
19777 cl_optimization_restore (&global_options, &global_options_set,
19778 TREE_OPTIMIZATION (func_optimize));
19780 /* Save the current target options to restore at the end. */
19781 cl_target_option_save (&cur_target, &global_options, &global_options_set);
19783 /* If fndecl already has some target attributes applied to it, unpack
19784 them so that we add this attribute on top of them, rather than
19785 overwriting them. */
19786 if (existing_target)
19788 struct cl_target_option *existing_options
19789 = TREE_TARGET_OPTION (existing_target);
19791 if (existing_options)
19792 cl_target_option_restore (&global_options, &global_options_set,
19793 existing_options);
19795 else
19796 cl_target_option_restore (&global_options, &global_options_set,
19797 TREE_TARGET_OPTION (target_option_current_node));
19799 ret = aarch64_process_target_attr (args);
19800 if (ret)
19802 tree version_attr = lookup_attribute ("target_version",
19803 DECL_ATTRIBUTES (fndecl));
19804 if (version_attr != NULL_TREE)
19806 /* Reapply any target_version attribute after target attribute.
19807 This should be equivalent to applying the target_version once
19808 after processing all target attributes. */
19809 tree version_args = TREE_VALUE (version_attr);
19810 ret = aarch64_process_target_version_attr (version_args);
19814 /* Set up any additional state. */
19815 if (ret)
19817 aarch64_override_options_internal (&global_options);
19818 new_target = build_target_option_node (&global_options,
19819 &global_options_set);
19821 else
19822 new_target = NULL;
19824 new_optimize = build_optimization_node (&global_options,
19825 &global_options_set);
19827 if (fndecl && ret)
19829 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
19831 if (old_optimize != new_optimize)
19832 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
19835 cl_target_option_restore (&global_options, &global_options_set, &cur_target);
19837 if (old_optimize != new_optimize)
19838 cl_optimization_restore (&global_options, &global_options_set,
19839 TREE_OPTIMIZATION (old_optimize));
19840 return ret;
19843 typedef unsigned long long aarch64_fmv_feature_mask;
19845 typedef struct
19847 const char *name;
19848 aarch64_fmv_feature_mask feature_mask;
19849 aarch64_feature_flags opt_flags;
19850 } aarch64_fmv_feature_datum;
19852 #define AARCH64_FMV_FEATURE(NAME, FEAT_NAME, C) \
19853 {NAME, 1ULL << FEAT_##FEAT_NAME, ::feature_deps::fmv_deps_##FEAT_NAME},
19855 /* The "rdma" alias uses a different FEAT_NAME to avoid a duplicate
19856 feature_deps name. */
19857 #define FEAT_RDMA FEAT_RDM
19859 /* FMV features are listed in priority order, to make it easier to sort target
19860 strings. */
19861 static aarch64_fmv_feature_datum aarch64_fmv_feature_data[] = {
19862 #include "config/aarch64/aarch64-option-extensions.def"
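/* For illustration (assuming the .def file contains an entry for "sve"),
   each AARCH64_FMV_FEATURE line expands via the macro above into an
   initializer of the form

     { "sve", 1ULL << FEAT_SVE, ::feature_deps::fmv_deps_SVE },

   pairing the FMV feature name with its priority bit and the ISA flags it
   implies.  */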
19865 /* Parse a function multiversioning feature string STR, as found in a
19866 target_version or target_clones attribute.
19868 If ISA_FLAGS is nonnull, then update it with the specified architecture
19869 features turned on. If FEATURE_MASK is nonnull, then assign to it a bitmask
19870 representing the set of features explicitly specified in the feature string.
19871 Return an aarch_parse_opt_result describing the result.
19873 When the STR string contains an invalid or duplicate extension, a copy of
19874 the extension string is created and stored to INVALID_EXTENSION. */
19876 static enum aarch_parse_opt_result
19877 aarch64_parse_fmv_features (const char *str, aarch64_feature_flags *isa_flags,
19878 aarch64_fmv_feature_mask *feature_mask,
19879 std::string *invalid_extension)
19881 if (feature_mask)
19882 *feature_mask = 0ULL;
19884 if (strcmp (str, "default") == 0)
19885 return AARCH_PARSE_OK;
19887 while (str != NULL && *str != 0)
19889 const char *ext;
19890 size_t len;
19892 ext = strchr (str, '+');
19894 if (ext != NULL)
19895 len = ext - str;
19896 else
19897 len = strlen (str);
19899 if (len == 0)
19900 return AARCH_PARSE_MISSING_ARG;
19902 int num_features = ARRAY_SIZE (aarch64_fmv_feature_data);
19903 int i;
19904 for (i = 0; i < num_features; i++)
19906 if (strlen (aarch64_fmv_feature_data[i].name) == len
19907 && strncmp (aarch64_fmv_feature_data[i].name, str, len) == 0)
19909 if (isa_flags)
19910 *isa_flags |= aarch64_fmv_feature_data[i].opt_flags;
19911 if (feature_mask)
19913 auto old_feature_mask = *feature_mask;
19914 *feature_mask |= aarch64_fmv_feature_data[i].feature_mask;
19915 if (*feature_mask == old_feature_mask)
19917 /* Duplicate feature. */
19918 if (invalid_extension)
19919 *invalid_extension = std::string (str, len);
19920 return AARCH_PARSE_DUPLICATE_FEATURE;
19923 break;
19927 if (i == num_features)
19929 /* Feature not found in list. */
19930 if (invalid_extension)
19931 *invalid_extension = std::string (str, len);
19932 return AARCH_PARSE_INVALID_FEATURE;
19935 str = ext;
19936 if (str)
19937 /* Skip over the next '+'. */
19938 str++;
19941 return AARCH_PARSE_OK;
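/* For illustration, a string such as "sve+sve2" makes the loop above
   accumulate the opt_flags of both names into *ISA_FLAGS and their
   priority bits into *FEATURE_MASK, whereas "sve+sve" is rejected with
   AARCH_PARSE_DUPLICATE_FEATURE because the repeated name leaves the
   accumulated mask unchanged.  The feature names are examples only.  */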
19944 /* Parse the tree in ARGS that contains the target_version attribute
19945 information and update the global target options space. */
19947 static bool
19948 aarch64_process_target_version_attr (tree args)
19950 static bool issued_warning = false;
19951 if (!issued_warning)
19953 warning (OPT_Wexperimental_fmv_target,
19954 "Function Multi Versioning support is experimental, and the "
19955 "behavior is likely to change");
19956 issued_warning = true;
19959 if (TREE_CODE (args) == TREE_LIST)
19961 if (TREE_CHAIN (args))
19963 error ("attribute %<target_version%> has multiple values");
19964 return false;
19966 args = TREE_VALUE (args);
19969 if (!args || TREE_CODE (args) != STRING_CST)
19971 error ("attribute %<target_version%> argument not a string");
19972 return false;
19975 const char *str = TREE_STRING_POINTER (args);
19977 enum aarch_parse_opt_result parse_res;
19978 auto isa_flags = aarch64_asm_isa_flags;
19980 std::string invalid_extension;
19981 parse_res = aarch64_parse_fmv_features (str, &isa_flags, NULL,
19982 &invalid_extension);
19984 if (parse_res == AARCH_PARSE_OK)
19986 aarch64_set_asm_isa_flags (isa_flags);
19987 return true;
19990 switch (parse_res)
19992 case AARCH_PARSE_MISSING_ARG:
19993 error ("missing value in %<target_version%> attribute");
19994 break;
19996 case AARCH_PARSE_INVALID_FEATURE:
19997 error ("invalid feature modifier %qs of value %qs in "
19998 "%<target_version%> attribute", invalid_extension.c_str (),
19999 str);
20000 break;
20002 case AARCH_PARSE_DUPLICATE_FEATURE:
20003 error ("duplicate feature modifier %qs of value %qs in "
20004 "%<target_version%> attribute", invalid_extension.c_str (),
20005 str);
20006 break;
20008 default:
20009 gcc_unreachable ();
20012 return false;
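/* For illustration, this is the routine behind declarations such as

     __attribute__ ((target_version ("sve2"))) int foo (void);

   The string is handed to aarch64_parse_fmv_features above, and an
   unknown or repeated feature name is reported through the diagnostics
   just seen.  The feature name is an example only.  */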
20015 /* Implement TARGET_OPTION_VALID_VERSION_ATTRIBUTE_P. This is used to
20016 process attribute ((target_version ("..."))). */
20018 static bool
20019 aarch64_option_valid_version_attribute_p (tree fndecl, tree, tree args, int)
20021 struct cl_target_option cur_target;
20022 bool ret;
20023 tree new_target;
20024 tree existing_target = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
20026 /* Save the current target options to restore at the end. */
20027 cl_target_option_save (&cur_target, &global_options, &global_options_set);
20029 /* If fndecl already has some target attributes applied to it, unpack
20030 them so that we add this attribute on top of them, rather than
20031 overwriting them. */
20032 if (existing_target)
20034 struct cl_target_option *existing_options
20035 = TREE_TARGET_OPTION (existing_target);
20037 if (existing_options)
20038 cl_target_option_restore (&global_options, &global_options_set,
20039 existing_options);
20041 else
20042 cl_target_option_restore (&global_options, &global_options_set,
20043 TREE_TARGET_OPTION (target_option_current_node));
20045 ret = aarch64_process_target_version_attr (args);
20047 /* Set up any additional state. */
20048 if (ret)
20050 aarch64_override_options_internal (&global_options);
20051 new_target = build_target_option_node (&global_options,
20052 &global_options_set);
20054 else
20055 new_target = NULL;
20057 if (fndecl && ret)
20058 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
20060 cl_target_option_restore (&global_options, &global_options_set, &cur_target);
20062 return ret;
20065 /* This parses the attribute arguments to target_version in DECL and the
20066 feature mask required to select those targets. No adjustments are made to
20067 add or remove redundant feature requirements. */
20069 static aarch64_fmv_feature_mask
20070 get_feature_mask_for_version (tree decl)
20072 tree version_attr = lookup_attribute ("target_version",
20073 DECL_ATTRIBUTES (decl));
20074 if (version_attr == NULL)
20075 return 0;
20077 const char *version_string = TREE_STRING_POINTER (TREE_VALUE (TREE_VALUE
20078 (version_attr)));
20079 enum aarch_parse_opt_result parse_res;
20080 aarch64_fmv_feature_mask feature_mask;
20082 parse_res = aarch64_parse_fmv_features (version_string, NULL, &feature_mask,
20083 NULL);
20085 /* We should have detected any errors before getting here. */
20086 gcc_assert (parse_res == AARCH_PARSE_OK);
20088 return feature_mask;
20091 /* Compare priorities of two feature masks. Return:
20092 1: mask1 is higher priority
20093 -1: mask2 is higher priority
20094 0: masks are equal. */
20096 static int
20097 compare_feature_masks (aarch64_fmv_feature_mask mask1,
20098 aarch64_fmv_feature_mask mask2)
20100 int pop1 = popcount_hwi (mask1);
20101 int pop2 = popcount_hwi (mask2);
20102 if (pop1 > pop2)
20103 return 1;
20104 if (pop2 > pop1)
20105 return -1;
20107 auto diff_mask = mask1 ^ mask2;
20108 if (diff_mask == 0ULL)
20109 return 0;
20110 int num_features = ARRAY_SIZE (aarch64_fmv_feature_data);
20111 for (int i = num_features - 1; i >= 0; i--)
20113 auto bit_mask = aarch64_fmv_feature_data[i].feature_mask;
20114 if (diff_mask & bit_mask)
20115 return (mask1 & bit_mask) ? 1 : -1;
20117 gcc_unreachable();
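/* For illustration: masks are compared first by the number of features
   they require, so a version needing two features outranks one needing a
   single feature.  On a tie, the loop above scans the table from its last
   entry downwards and awards priority to whichever mask contains the
   first differing feature it finds.  */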
20120 /* Compare priorities of two version decls. */
20123 aarch64_compare_version_priority (tree decl1, tree decl2)
20125 auto mask1 = get_feature_mask_for_version (decl1);
20126 auto mask2 = get_feature_mask_for_version (decl2);
20128 return compare_feature_masks (mask1, mask2);
20131 /* Build the struct __ifunc_arg_t type:
20133 struct __ifunc_arg_t
20135 unsigned long _size; // Size of the struct, so it can grow.
20136 unsigned long _hwcap;
20137 unsigned long _hwcap2;
20141 static tree
20142 build_ifunc_arg_type ()
20144 tree ifunc_arg_type = lang_hooks.types.make_type (RECORD_TYPE);
20145 tree field1 = build_decl (UNKNOWN_LOCATION, FIELD_DECL,
20146 get_identifier ("_size"),
20147 long_unsigned_type_node);
20148 tree field2 = build_decl (UNKNOWN_LOCATION, FIELD_DECL,
20149 get_identifier ("_hwcap"),
20150 long_unsigned_type_node);
20151 tree field3 = build_decl (UNKNOWN_LOCATION, FIELD_DECL,
20152 get_identifier ("_hwcap2"),
20153 long_unsigned_type_node);
20155 DECL_FIELD_CONTEXT (field1) = ifunc_arg_type;
20156 DECL_FIELD_CONTEXT (field2) = ifunc_arg_type;
20157 DECL_FIELD_CONTEXT (field3) = ifunc_arg_type;
20159 TYPE_FIELDS (ifunc_arg_type) = field1;
20160 DECL_CHAIN (field1) = field2;
20161 DECL_CHAIN (field2) = field3;
20163 layout_type (ifunc_arg_type);
20165 tree const_type = build_qualified_type (ifunc_arg_type, TYPE_QUAL_CONST);
20166 tree pointer_type = build_pointer_type (const_type);
20168 return pointer_type;
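/* For illustration: the tree returned here is the pointer type
   "const struct __ifunc_arg_t *", so at the C level a resolver that takes
   this type would be declared roughly as

     void *foo_resolver (uint64_t hwcap, const struct __ifunc_arg_t *arg);

   where "foo_resolver" stands in for the name later chosen by
   make_resolver_func.  */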
20171 /* Implement TARGET_MANGLE_DECL_ASSEMBLER_NAME, to add function multiversioning
20172 suffixes. */
20174 tree
20175 aarch64_mangle_decl_assembler_name (tree decl, tree id)
20177 /* For function version, add the target suffix to the assembler name. */
20178 if (TREE_CODE (decl) == FUNCTION_DECL
20179 && DECL_FUNCTION_VERSIONED (decl))
20181 aarch64_fmv_feature_mask feature_mask = get_feature_mask_for_version (decl);
20183 std::string name = IDENTIFIER_POINTER (id);
20185 /* For the default version, append ".default". */
20186 if (feature_mask == 0ULL)
20188 name += ".default";
20189 return get_identifier (name.c_str());
20192 name += "._";
20194 int num_features = ARRAY_SIZE (aarch64_fmv_feature_data);
20195 for (int i = 0; i < num_features; i++)
20197 if (feature_mask & aarch64_fmv_feature_data[i].feature_mask)
20199 name += "M";
20200 name += aarch64_fmv_feature_data[i].name;
20204 if (DECL_ASSEMBLER_NAME_SET_P (decl))
20205 SET_DECL_RTL (decl, NULL);
20207 id = get_identifier (name.c_str());
20209 return id;
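/* For illustration, for a versioned function whose base assembler name is
   "foo", the default version becomes "foo.default" while a version whose
   mask includes, say, the "sve" feature becomes "foo._Msve"; additional
   features append further "M<name>" components in table order.  The
   feature name is an example only.  */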
20212 /* Return an identifier for the base assembler name of a versioned function.
20213 This is computed by taking the default version's assembler name, and
20214 stripping off the ".default" suffix if it's already been appended. */
20216 static tree
20217 get_suffixed_assembler_name (tree default_decl, const char *suffix)
20219 std::string name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (default_decl));
20221 auto size = name.size ();
20222 if (size >= 8 && name.compare (size - 8, 8, ".default") == 0)
20223 name.resize (size - 8);
20224 name += suffix;
20225 return get_identifier (name.c_str());
20228 /* Make the resolver function decl to dispatch the versions of
20229 a multi-versioned function, DEFAULT_DECL. IFUNC_ALIAS_DECL is
20230 ifunc alias that will point to the created resolver. Create an
20231 empty basic block in the resolver and store the pointer in
20232 EMPTY_BB. Return the decl of the resolver function. */
20234 static tree
20235 make_resolver_func (const tree default_decl,
20236 const tree ifunc_alias_decl,
20237 basic_block *empty_bb)
20239 tree decl, type, t;
20241 /* Create resolver function name based on default_decl. We need to remove an
20242 existing ".default" suffix if this has already been appended. */
20243 tree decl_name = get_suffixed_assembler_name (default_decl, ".resolver");
20244 const char *resolver_name = IDENTIFIER_POINTER (decl_name);
20246 /* The resolver function should have signature
20247 (void *) resolver (uint64_t, const __ifunc_arg_t *) */
20248 type = build_function_type_list (ptr_type_node,
20249 uint64_type_node,
20250 build_ifunc_arg_type (),
20251 NULL_TREE);
20253 decl = build_fn_decl (resolver_name, type);
20254 SET_DECL_ASSEMBLER_NAME (decl, decl_name);
20256 DECL_NAME (decl) = decl_name;
20257 TREE_USED (decl) = 1;
20258 DECL_ARTIFICIAL (decl) = 1;
20259 DECL_IGNORED_P (decl) = 1;
20260 TREE_PUBLIC (decl) = 0;
20261 DECL_UNINLINABLE (decl) = 1;
20263 /* Resolver is not external, body is generated. */
20264 DECL_EXTERNAL (decl) = 0;
20265 DECL_EXTERNAL (ifunc_alias_decl) = 0;
20267 DECL_CONTEXT (decl) = NULL_TREE;
20268 DECL_INITIAL (decl) = make_node (BLOCK);
20269 DECL_STATIC_CONSTRUCTOR (decl) = 0;
20271 if (DECL_COMDAT_GROUP (default_decl)
20272 || TREE_PUBLIC (default_decl))
20274 /* In this case, each translation unit with a call to this
20275 versioned function will put out a resolver. Ensure it
20276 is comdat to keep just one copy. */
20277 DECL_COMDAT (decl) = 1;
20278 make_decl_one_only (decl, DECL_ASSEMBLER_NAME (decl));
20280 else
20281 TREE_PUBLIC (ifunc_alias_decl) = 0;
20283 /* Build result decl and add to function_decl. */
20284 t = build_decl (UNKNOWN_LOCATION, RESULT_DECL, NULL_TREE, ptr_type_node);
20285 DECL_CONTEXT (t) = decl;
20286 DECL_ARTIFICIAL (t) = 1;
20287 DECL_IGNORED_P (t) = 1;
20288 DECL_RESULT (decl) = t;
20290 /* Build parameter decls and add to function_decl. */
20291 tree arg1 = build_decl (UNKNOWN_LOCATION, PARM_DECL,
20292 get_identifier ("hwcap"),
20293 uint64_type_node);
20294 tree arg2 = build_decl (UNKNOWN_LOCATION, PARM_DECL,
20295 get_identifier ("arg"),
20296 build_ifunc_arg_type());
20297 DECL_CONTEXT (arg1) = decl;
20298 DECL_CONTEXT (arg2) = decl;
20299 DECL_ARTIFICIAL (arg1) = 1;
20300 DECL_ARTIFICIAL (arg2) = 1;
20301 DECL_IGNORED_P (arg1) = 1;
20302 DECL_IGNORED_P (arg2) = 1;
20303 DECL_ARG_TYPE (arg1) = uint64_type_node;
20304 DECL_ARG_TYPE (arg2) = build_ifunc_arg_type ();
20305 DECL_ARGUMENTS (decl) = arg1;
20306 TREE_CHAIN (arg1) = arg2;
20308 gimplify_function_tree (decl);
20309 push_cfun (DECL_STRUCT_FUNCTION (decl));
20310 *empty_bb = init_lowered_empty_function (decl, false,
20311 profile_count::uninitialized ());
20313 cgraph_node::add_new_function (decl, true);
20314 symtab->call_cgraph_insertion_hooks (cgraph_node::get_create (decl));
20316 pop_cfun ();
20318 gcc_assert (ifunc_alias_decl != NULL);
20319 /* Mark ifunc_alias_decl as "ifunc" with resolver as resolver_name. */
20320 DECL_ATTRIBUTES (ifunc_alias_decl)
20321 = make_attribute ("ifunc", resolver_name,
20322 DECL_ATTRIBUTES (ifunc_alias_decl));
20324 /* Create the alias for dispatch to resolver here. */
20325 cgraph_node::create_same_body_alias (ifunc_alias_decl, decl);
20326 return decl;
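/* For illustration, for a versioned function "foo" this creates a
   "foo.resolver" function with the signature shown above (made comdat
   when the default version is public or comdat), and tags
   IFUNC_ALIAS_DECL with __attribute__ ((ifunc ("foo.resolver"))) so that
   the dynamic loader binds calls to "foo" to whichever version the
   resolver returns.  */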
20329 /* This adds a condition to the basic_block NEW_BB in function FUNCTION_DECL
20330 to return a pointer to VERSION_DECL if all feature bits specified in
20331 FEATURE_MASK are not set in MASK_VAR. This function will be called during
20332 version dispatch to decide which function version to execute. It returns
20333 the basic block at the end, to which more conditions can be added. */
20334 static basic_block
20335 add_condition_to_bb (tree function_decl, tree version_decl,
20336 aarch64_fmv_feature_mask feature_mask,
20337 tree mask_var, basic_block new_bb)
20339 gimple *return_stmt;
20340 tree convert_expr, result_var;
20341 gimple *convert_stmt;
20342 gimple *if_else_stmt;
20344 basic_block bb1, bb2, bb3;
20345 edge e12, e23;
20347 gimple_seq gseq;
20349 push_cfun (DECL_STRUCT_FUNCTION (function_decl));
20351 gcc_assert (new_bb != NULL);
20352 gseq = bb_seq (new_bb);
20354 convert_expr = build1 (CONVERT_EXPR, ptr_type_node,
20355 build_fold_addr_expr (version_decl));
20356 result_var = create_tmp_var (ptr_type_node);
20357 convert_stmt = gimple_build_assign (result_var, convert_expr);
20358 return_stmt = gimple_build_return (result_var);
20360 if (feature_mask == 0ULL)
20362 /* Default version. */
20363 gimple_seq_add_stmt (&gseq, convert_stmt);
20364 gimple_seq_add_stmt (&gseq, return_stmt);
20365 set_bb_seq (new_bb, gseq);
20366 gimple_set_bb (convert_stmt, new_bb);
20367 gimple_set_bb (return_stmt, new_bb);
20368 pop_cfun ();
20369 return new_bb;
20372 tree and_expr_var = create_tmp_var (long_long_unsigned_type_node);
20373 tree and_expr = build2 (BIT_AND_EXPR,
20374 long_long_unsigned_type_node,
20375 mask_var,
20376 build_int_cst (long_long_unsigned_type_node,
20377 feature_mask));
20378 gimple *and_stmt = gimple_build_assign (and_expr_var, and_expr);
20379 gimple_set_block (and_stmt, DECL_INITIAL (function_decl));
20380 gimple_set_bb (and_stmt, new_bb);
20381 gimple_seq_add_stmt (&gseq, and_stmt);
20383 tree zero_llu = build_int_cst (long_long_unsigned_type_node, 0);
20384 if_else_stmt = gimple_build_cond (EQ_EXPR, and_expr_var, zero_llu,
20385 NULL_TREE, NULL_TREE);
20386 gimple_set_block (if_else_stmt, DECL_INITIAL (function_decl));
20387 gimple_set_bb (if_else_stmt, new_bb);
20388 gimple_seq_add_stmt (&gseq, if_else_stmt);
20390 gimple_seq_add_stmt (&gseq, convert_stmt);
20391 gimple_seq_add_stmt (&gseq, return_stmt);
20392 set_bb_seq (new_bb, gseq);
20394 bb1 = new_bb;
20395 e12 = split_block (bb1, if_else_stmt);
20396 bb2 = e12->dest;
20397 e12->flags &= ~EDGE_FALLTHRU;
20398 e12->flags |= EDGE_TRUE_VALUE;
20400 e23 = split_block (bb2, return_stmt);
20402 gimple_set_bb (convert_stmt, bb2);
20403 gimple_set_bb (return_stmt, bb2);
20405 bb3 = e23->dest;
20406 make_edge (bb1, bb3, EDGE_FALSE_VALUE);
20408 remove_edge (e23);
20409 make_edge (bb2, EXIT_BLOCK_PTR_FOR_FN (cfun), 0);
20411 pop_cfun ();
20413 return bb3;
20416 /* This function generates the dispatch function for
20417 multi-versioned functions. DISPATCH_DECL is the function which will
20418 contain the dispatch logic. FNDECLS are the function choices for
20419 dispatch, and is a tree chain. EMPTY_BB is the basic block pointer
20420 in DISPATCH_DECL in which the dispatch code is generated. */
20422 static int
20423 dispatch_function_versions (tree dispatch_decl,
20424 void *fndecls_p,
20425 basic_block *empty_bb)
20427 gimple *ifunc_cpu_init_stmt;
20428 gimple_seq gseq;
20429 vec<tree> *fndecls;
20431 gcc_assert (dispatch_decl != NULL
20432 && fndecls_p != NULL
20433 && empty_bb != NULL);
20435 push_cfun (DECL_STRUCT_FUNCTION (dispatch_decl));
20437 gseq = bb_seq (*empty_bb);
20438 /* Function version dispatch is via IFUNC. IFUNC resolvers fire before
20439      constructors, so explicitly call __init_cpu_features_resolver here.  */
20440 tree init_fn_type = build_function_type_list (void_type_node,
20441 long_unsigned_type_node,
20442 build_ifunc_arg_type(),
20443 NULL);
20444 tree init_fn_id = get_identifier ("__init_cpu_features_resolver");
20445 tree init_fn_decl = build_decl (UNKNOWN_LOCATION, FUNCTION_DECL,
20446 init_fn_id, init_fn_type);
20447 DECL_EXTERNAL (init_fn_decl) = 1;
20448 TREE_PUBLIC (init_fn_decl) = 1;
20449 DECL_VISIBILITY (init_fn_decl) = VISIBILITY_HIDDEN;
20450 DECL_VISIBILITY_SPECIFIED (init_fn_decl) = 1;
20451 tree arg1 = DECL_ARGUMENTS (dispatch_decl);
20452 tree arg2 = TREE_CHAIN (arg1);
20453 ifunc_cpu_init_stmt = gimple_build_call (init_fn_decl, 2, arg1, arg2);
20454 gimple_seq_add_stmt (&gseq, ifunc_cpu_init_stmt);
20455 gimple_set_bb (ifunc_cpu_init_stmt, *empty_bb);
20457 /* Build the struct type for __aarch64_cpu_features. */
20458 tree global_type = lang_hooks.types.make_type (RECORD_TYPE);
20459 tree field1 = build_decl (UNKNOWN_LOCATION, FIELD_DECL,
20460 get_identifier ("features"),
20461 long_long_unsigned_type_node);
20462 DECL_FIELD_CONTEXT (field1) = global_type;
20463 TYPE_FIELDS (global_type) = field1;
20464 layout_type (global_type);
20466 tree global_var = build_decl (UNKNOWN_LOCATION, VAR_DECL,
20467 get_identifier ("__aarch64_cpu_features"),
20468 global_type);
20469 DECL_EXTERNAL (global_var) = 1;
20470 TREE_PUBLIC (global_var) = 1;
20471 DECL_VISIBILITY (global_var) = VISIBILITY_HIDDEN;
20472 DECL_VISIBILITY_SPECIFIED (global_var) = 1;
20473 tree mask_var = create_tmp_var (long_long_unsigned_type_node);
20475 tree component_expr = build3 (COMPONENT_REF, long_long_unsigned_type_node,
20476 global_var, field1, NULL_TREE);
20477 gimple *component_stmt = gimple_build_assign (mask_var, component_expr);
20478 gimple_set_block (component_stmt, DECL_INITIAL (dispatch_decl));
20479 gimple_set_bb (component_stmt, *empty_bb);
20480 gimple_seq_add_stmt (&gseq, component_stmt);
20482 tree not_expr = build1 (BIT_NOT_EXPR, long_long_unsigned_type_node, mask_var);
20483 gimple *not_stmt = gimple_build_assign (mask_var, not_expr);
20484 gimple_set_block (not_stmt, DECL_INITIAL (dispatch_decl));
20485 gimple_set_bb (not_stmt, *empty_bb);
20486 gimple_seq_add_stmt (&gseq, not_stmt);
20488 set_bb_seq (*empty_bb, gseq);
20490 pop_cfun ();
20492 /* fndecls_p is actually a vector. */
20493 fndecls = static_cast<vec<tree> *> (fndecls_p);
20495 /* At least one more version other than the default. */
20496 unsigned int num_versions = fndecls->length ();
20497 gcc_assert (num_versions >= 2);
20499 struct function_version_info
20501 tree version_decl;
20502 aarch64_fmv_feature_mask feature_mask;
20503 } *function_versions;
20505 function_versions = (struct function_version_info *)
20506 XNEWVEC (struct function_version_info, (num_versions));
20508 unsigned int actual_versions = 0;
20510 for (tree version_decl : *fndecls)
20512 aarch64_fmv_feature_mask feature_mask;
20513 /* Get attribute string, parse it and find the right features. */
20514 feature_mask = get_feature_mask_for_version (version_decl);
20515 function_versions [actual_versions].version_decl = version_decl;
20516 function_versions [actual_versions].feature_mask = feature_mask;
20517 actual_versions++;
20520 auto compare_feature_version_info = [](const void *p1, const void *p2) {
20521 const function_version_info v1 = *(const function_version_info *)p1;
20522 const function_version_info v2 = *(const function_version_info *)p2;
20523 return - compare_feature_masks (v1.feature_mask, v2.feature_mask);
20526 /* Sort the versions according to descending order of dispatch priority. */
20527 qsort (function_versions, actual_versions,
20528 sizeof (struct function_version_info), compare_feature_version_info);
20530 for (unsigned int i = 0; i < actual_versions; ++i)
20531 *empty_bb = add_condition_to_bb (dispatch_decl,
20532 function_versions[i].version_decl,
20533 function_versions[i].feature_mask,
20534 mask_var,
20535 *empty_bb);
20537 free (function_versions);
20538 return 0;
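/* For illustration, the resolver body built above behaves roughly like
   the following C, with one test per non-default version in descending
   priority order:

     void *foo_resolver (uint64_t hwcap, const __ifunc_arg_t *arg)
     {
       __init_cpu_features_resolver (hwcap, arg);
       unsigned long long mask = ~__aarch64_cpu_features.features;
       if ((mask & MASK_VERSION_1) == 0)
         return foo_version_1;
       return foo_default;
     }

   MASK_VERSION_1, foo_version_1 and foo_default stand in for the feature
   masks and decls collected from the target_version attributes.  */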
20541 /* Implement TARGET_GENERATE_VERSION_DISPATCHER_BODY. */
20543 tree
20544 aarch64_generate_version_dispatcher_body (void *node_p)
20546 tree resolver_decl;
20547 basic_block empty_bb;
20548 tree default_ver_decl;
20549 struct cgraph_node *versn;
20550 struct cgraph_node *node;
20552 struct cgraph_function_version_info *node_version_info = NULL;
20553 struct cgraph_function_version_info *versn_info = NULL;
20555 node = (cgraph_node *)node_p;
20557 node_version_info = node->function_version ();
20558 gcc_assert (node->dispatcher_function
20559 && node_version_info != NULL);
20561 if (node_version_info->dispatcher_resolver)
20562 return node_version_info->dispatcher_resolver;
20564 /* The first version in the chain corresponds to the default version. */
20565 default_ver_decl = node_version_info->next->this_node->decl;
20567 /* node is going to be an alias, so remove the finalized bit. */
20568 node->definition = false;
20570 resolver_decl = make_resolver_func (default_ver_decl,
20571 node->decl, &empty_bb);
20573 node_version_info->dispatcher_resolver = resolver_decl;
20575 push_cfun (DECL_STRUCT_FUNCTION (resolver_decl));
20577 auto_vec<tree, 2> fn_ver_vec;
20579 for (versn_info = node_version_info->next; versn_info;
20580 versn_info = versn_info->next)
20582 versn = versn_info->this_node;
20583 /* Check for virtual functions here again, as by this time it should
20584 have been determined if this function needs a vtable index or
20585 not. This happens for methods in derived classes that override
20586 virtual methods in base classes but are not explicitly marked as
20587 virtual. */
20588 if (DECL_VINDEX (versn->decl))
20589 sorry ("virtual function multiversioning not supported");
20591 fn_ver_vec.safe_push (versn->decl);
20594 dispatch_function_versions (resolver_decl, &fn_ver_vec, &empty_bb);
20595 cgraph_edge::rebuild_edges ();
20596 pop_cfun ();
20598 /* Fix up symbol names. First we need to obtain the base name, which may
20599 have already been mangled. */
20600 tree base_name = get_suffixed_assembler_name (default_ver_decl, "");
20602 /* We need to redo the version mangling on the non-default versions for the
20603 target_clones case. Redoing the mangling for the target_version case is
20604 redundant but does no harm. We need to skip the default version, because
20605 expand_clones will append ".default" later; fortunately that suffix is the
20606 one we want anyway. */
20607 for (versn_info = node_version_info->next->next; versn_info;
20608 versn_info = versn_info->next)
20610 tree version_decl = versn_info->this_node->decl;
20611 tree name = aarch64_mangle_decl_assembler_name (version_decl,
20612 base_name);
20613 symtab->change_decl_assembler_name (version_decl, name);
20616 /* We also need to use the base name for the ifunc declaration. */
20617 symtab->change_decl_assembler_name (node->decl, base_name);
20619 return resolver_decl;
20622 /* Make a dispatcher declaration for the multi-versioned function DECL.
20623 Calls to DECL function will be replaced with calls to the dispatcher
20624 by the front-end. Returns the decl of the dispatcher function. */
20626 tree
20627 aarch64_get_function_versions_dispatcher (void *decl)
20629 tree fn = (tree) decl;
20630 struct cgraph_node *node = NULL;
20631 struct cgraph_node *default_node = NULL;
20632 struct cgraph_function_version_info *node_v = NULL;
20633 struct cgraph_function_version_info *first_v = NULL;
20635 tree dispatch_decl = NULL;
20637 struct cgraph_function_version_info *default_version_info = NULL;
20639 gcc_assert (fn != NULL && DECL_FUNCTION_VERSIONED (fn));
20641 node = cgraph_node::get (fn);
20642 gcc_assert (node != NULL);
20644 node_v = node->function_version ();
20645 gcc_assert (node_v != NULL);
20647 if (node_v->dispatcher_resolver != NULL)
20648 return node_v->dispatcher_resolver;
20650 /* Find the default version and make it the first node. */
20651 first_v = node_v;
20652 /* Go to the beginning of the chain. */
20653 while (first_v->prev != NULL)
20654 first_v = first_v->prev;
20655 default_version_info = first_v;
20656 while (default_version_info != NULL)
20658 if (get_feature_mask_for_version
20659 (default_version_info->this_node->decl) == 0ULL)
20660 break;
20661 default_version_info = default_version_info->next;
20664 /* If there is no default node, just return NULL. */
20665 if (default_version_info == NULL)
20666 return NULL;
20668 /* Make default info the first node. */
20669 if (first_v != default_version_info)
20671 default_version_info->prev->next = default_version_info->next;
20672 if (default_version_info->next)
20673 default_version_info->next->prev = default_version_info->prev;
20674 first_v->prev = default_version_info;
20675 default_version_info->next = first_v;
20676 default_version_info->prev = NULL;
20679 default_node = default_version_info->this_node;
20681 if (targetm.has_ifunc_p ())
20683 struct cgraph_function_version_info *it_v = NULL;
20684 struct cgraph_node *dispatcher_node = NULL;
20685 struct cgraph_function_version_info *dispatcher_version_info = NULL;
20687 /* Right now, the dispatching is done via ifunc. */
20688 dispatch_decl = make_dispatcher_decl (default_node->decl);
20689 TREE_NOTHROW (dispatch_decl) = TREE_NOTHROW (fn);
20691 dispatcher_node = cgraph_node::get_create (dispatch_decl);
20692 gcc_assert (dispatcher_node != NULL);
20693 dispatcher_node->dispatcher_function = 1;
20694 dispatcher_version_info
20695 = dispatcher_node->insert_new_function_version ();
20696 dispatcher_version_info->next = default_version_info;
20697 dispatcher_node->definition = 1;
20699 /* Set the dispatcher for all the versions. */
20700 it_v = default_version_info;
20701 while (it_v != NULL)
20703 it_v->dispatcher_resolver = dispatch_decl;
20704 it_v = it_v->next;
20707 else
20709 error_at (DECL_SOURCE_LOCATION (default_node->decl),
20710 "multiversioning needs %<ifunc%> which is not supported "
20711 "on this target");
20714 return dispatch_decl;
20717 /* This function returns true if FN1 and FN2 are versions of the same function,
20718 that is, the target_version attributes of the function decls are different.
20719 This assumes that FN1 and FN2 have the same signature. */
20721 bool
20722 aarch64_common_function_versions (tree fn1, tree fn2)
20724 if (TREE_CODE (fn1) != FUNCTION_DECL
20725 || TREE_CODE (fn2) != FUNCTION_DECL)
20726 return false;
20728 return (aarch64_compare_version_priority (fn1, fn2) != 0);
20731 /* Implement TARGET_FUNCTION_ATTRIBUTE_INLINABLE_P. Use an opt-out
20732 rather than an opt-in list. */
20734 static bool
20735 aarch64_function_attribute_inlinable_p (const_tree fndecl)
20737 /* A function that has local SME state cannot be inlined into its caller,
20738 since we only support managing PSTATE.ZA switches at function scope. */
20739 return (!aarch64_fndecl_has_new_state (fndecl, "za")
20740 && !aarch64_fndecl_has_new_state (fndecl, "zt0"));
20743 /* Helper for aarch64_can_inline_p. In the case where CALLER and CALLEE are
20744 tri-bool options (yes, no, don't care) and the default value is
20745 DEF, determine whether to reject inlining. */
20747 static bool
20748 aarch64_tribools_ok_for_inlining_p (int caller, int callee,
20749 int dont_care, int def)
20751 /* If the callee doesn't care, always allow inlining. */
20752 if (callee == dont_care)
20753 return true;
20755 /* If the caller doesn't care, always allow inlining. */
20756 if (caller == dont_care)
20757 return true;
20759 /* Otherwise, allow inlining if either the callee and caller values
20760 agree, or if the callee is using the default value. */
20761 return (callee == caller || callee == def);
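/* For illustration, the callers below pass 2 as the "don't care" value:
   a callee built without an explicit setting (value 2) can be inlined
   anywhere, while a callee with an explicit 1 only matches a caller that
   also has 1, or the case where 1 is the configured default.  */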
20764 /* Bit allocations for ipa_fn_summary::target_info. */
20766 /* Set if the function contains a stmt that relies on the function's
20767 choice of PSTATE.SM setting (0 for non-streaming, 1 for streaming).
20768 Not meaningful for streaming-compatible functions. */
20769 constexpr auto AARCH64_IPA_SM_FIXED = 1U << 0;
20771 /* Set if the function clobbers ZA and ZT0. Not meaningful for functions that
20772 have ZA state. */
20773 constexpr auto AARCH64_IPA_CLOBBERS_ZA = 1U << 1;
20774 constexpr auto AARCH64_IPA_CLOBBERS_ZT0 = 1U << 2;
20776 /* Implement TARGET_NEED_IPA_FN_TARGET_INFO. */
20778 static bool
20779 aarch64_need_ipa_fn_target_info (const_tree, unsigned int &)
20781 /* We could in principle skip this for streaming-compatible functions
20782 that have ZA state, but that's a rare combination. */
20783 return true;
20786 /* Implement TARGET_UPDATE_IPA_FN_TARGET_INFO. */
20788 static bool
20789 aarch64_update_ipa_fn_target_info (unsigned int &info, const gimple *stmt)
20791 if (auto *ga = dyn_cast<const gasm *> (stmt))
20793 /* We don't know what the asm does, so conservatively assume that
20794 it requires the function's current SM mode. */
20795 info |= AARCH64_IPA_SM_FIXED;
20796 for (unsigned int i = 0; i < gimple_asm_nclobbers (ga); ++i)
20798 tree op = gimple_asm_clobber_op (ga, i);
20799 const char *clobber = TREE_STRING_POINTER (TREE_VALUE (op));
20800 if (strcmp (clobber, "za") == 0)
20801 info |= AARCH64_IPA_CLOBBERS_ZA;
20802 if (strcmp (clobber, "zt0") == 0)
20803 info |= AARCH64_IPA_CLOBBERS_ZT0;
20806 if (auto *call = dyn_cast<const gcall *> (stmt))
20808 if (gimple_call_builtin_p (call, BUILT_IN_MD))
20810 /* The attributes on AArch64 builtins are supposed to be accurate.
20811 If the function isn't marked streaming-compatible then it
20812 needs whichever SM mode it selects. */
20813 tree decl = gimple_call_fndecl (call);
20814 if (aarch64_fndecl_pstate_sm (decl) != 0)
20815 info |= AARCH64_IPA_SM_FIXED;
20818 return true;
20821 /* Implement TARGET_CAN_INLINE_P. Decide whether it is valid
20822 to inline CALLEE into CALLER based on target-specific info.
20823 Make sure that the caller and callee have compatible architectural
20824 features. Then go through the other possible target attributes
20825 and see if they can block inlining. Try not to reject always_inline
20826 callees unless they are incompatible architecturally. */
20828 static bool
20829 aarch64_can_inline_p (tree caller, tree callee)
20831 tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
20832 tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
20834 struct cl_target_option *caller_opts
20835 = TREE_TARGET_OPTION (caller_tree ? caller_tree
20836 : target_option_default_node);
20838 struct cl_target_option *callee_opts
20839 = TREE_TARGET_OPTION (callee_tree ? callee_tree
20840 : target_option_default_node);
20842 /* Callee's ISA flags should be a subset of the caller's. */
20843 auto caller_asm_isa = (aarch64_get_asm_isa_flags (caller_opts)
20844 & ~AARCH64_FL_ISA_MODES);
20845 auto callee_asm_isa = (aarch64_get_asm_isa_flags (callee_opts)
20846 & ~AARCH64_FL_ISA_MODES);
20847 if (callee_asm_isa & ~caller_asm_isa)
20848 return false;
20850 auto caller_isa = (aarch64_get_isa_flags (caller_opts)
20851 & ~AARCH64_FL_ISA_MODES);
20852 auto callee_isa = (aarch64_get_isa_flags (callee_opts)
20853 & ~AARCH64_FL_ISA_MODES);
20854 if (callee_isa & ~caller_isa)
20855 return false;
20857 /* Return true if the callee might have target_info property PROPERTY.
20858 The answer must be true unless we have positive proof to the contrary. */
20859 auto callee_has_property = [&](unsigned int property)
20861 if (ipa_fn_summaries)
20862 if (auto *summary = ipa_fn_summaries->get (cgraph_node::get (callee)))
20863 if (!(summary->target_info & property))
20864 return false;
20865 return true;
20868 /* Streaming-compatible code can be inlined into functions with any
20869 PSTATE.SM mode. Otherwise the caller and callee must agree on
20870 PSTATE.SM mode, unless we can prove that the callee is naturally
20871 streaming-compatible. */
20872 auto caller_sm = (aarch64_get_isa_flags (caller_opts) & AARCH64_FL_SM_STATE);
20873 auto callee_sm = (aarch64_get_isa_flags (callee_opts) & AARCH64_FL_SM_STATE);
20874 if (callee_sm
20875 && caller_sm != callee_sm
20876 && callee_has_property (AARCH64_IPA_SM_FIXED))
20877 return false;
20879 /* aarch64_function_attribute_inlinable_p prevents new-ZA and new-ZT0
20880 functions from being inlined into others. We also need to prevent
20881 inlining of shared-ZA functions into functions without ZA state,
20882 since this is an error condition.
20884 The only other problematic case for ZA is inlining a function that
20885 directly clobbers ZA or ZT0 into a function that has ZA or ZT0 state. */
20886 auto caller_za = (aarch64_get_isa_flags (caller_opts) & AARCH64_FL_ZA_ON);
20887 auto callee_za = (aarch64_get_isa_flags (callee_opts) & AARCH64_FL_ZA_ON);
20888 if (!caller_za && callee_za)
20889 return false;
20890 if (!callee_za
20891 && aarch64_fndecl_has_state (caller, "za")
20892 && callee_has_property (AARCH64_IPA_CLOBBERS_ZA))
20893 return false;
20894 if (!callee_za
20895 && aarch64_fndecl_has_state (caller, "zt0")
20896 && callee_has_property (AARCH64_IPA_CLOBBERS_ZT0))
20897 return false;
20899   /* Allow non-strict-aligned functions to be inlined into
20900      strict-aligned ones.  */
20901 if ((TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)
20902 != TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags))
20903 && !(!TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags)
20904 && TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)))
20905 return false;
20907 bool always_inline = lookup_attribute ("always_inline",
20908 DECL_ATTRIBUTES (callee));
20910 /* If the architectural features match up and the callee is always_inline
20911 then the other attributes don't matter. */
20912 if (always_inline)
20913 return true;
20915 if (caller_opts->x_aarch64_cmodel_var
20916 != callee_opts->x_aarch64_cmodel_var)
20917 return false;
20919 if (caller_opts->x_aarch64_tls_dialect
20920 != callee_opts->x_aarch64_tls_dialect)
20921 return false;
20923   /* Honour explicit requests to work around errata.  */
20924 if (!aarch64_tribools_ok_for_inlining_p (
20925 caller_opts->x_aarch64_fix_a53_err835769,
20926 callee_opts->x_aarch64_fix_a53_err835769,
20927 2, TARGET_FIX_ERR_A53_835769_DEFAULT))
20928 return false;
20930 if (!aarch64_tribools_ok_for_inlining_p (
20931 caller_opts->x_aarch64_fix_a53_err843419,
20932 callee_opts->x_aarch64_fix_a53_err843419,
20933 2, TARGET_FIX_ERR_A53_843419))
20934 return false;
20936 /* If the user explicitly specified -momit-leaf-frame-pointer for the
20937      caller and callee and they don't match up, reject inlining.  */
20938 if (!aarch64_tribools_ok_for_inlining_p (
20939 caller_opts->x_flag_omit_leaf_frame_pointer,
20940 callee_opts->x_flag_omit_leaf_frame_pointer,
20941 2, 1))
20942 return false;
20944 /* If the callee has specific tuning overrides, respect them. */
20945 if (callee_opts->x_aarch64_override_tune_string != NULL
20946 && caller_opts->x_aarch64_override_tune_string == NULL)
20947 return false;
20949 /* If the user specified tuning override strings for the
20950 caller and callee and they don't match up, reject inlining.
20951 We just do a string compare here, we don't analyze the meaning
20952 of the string, as it would be too costly for little gain. */
20953 if (callee_opts->x_aarch64_override_tune_string
20954 && caller_opts->x_aarch64_override_tune_string
20955 && (strcmp (callee_opts->x_aarch64_override_tune_string,
20956 caller_opts->x_aarch64_override_tune_string) != 0))
20957 return false;
20959 return true;
20962 /* Return the ID of the TLSDESC ABI, initializing the descriptor if it
20963    hasn't been initialized already.  */
20965 arm_pcs
20966 aarch64_tlsdesc_abi_id ()
20968 predefined_function_abi &tlsdesc_abi = function_abis[ARM_PCS_TLSDESC];
20969 if (!tlsdesc_abi.initialized_p ())
20971 HARD_REG_SET full_reg_clobbers;
20972 CLEAR_HARD_REG_SET (full_reg_clobbers);
20973 SET_HARD_REG_BIT (full_reg_clobbers, R0_REGNUM);
20974 SET_HARD_REG_BIT (full_reg_clobbers, CC_REGNUM);
20975 for (int regno = P0_REGNUM; regno <= P15_REGNUM; ++regno)
20976 SET_HARD_REG_BIT (full_reg_clobbers, regno);
20977 tlsdesc_abi.initialize (ARM_PCS_TLSDESC, full_reg_clobbers);
20979 return ARM_PCS_TLSDESC;
20982 /* Return true if SYMBOL_REF X binds locally. */
20984 static bool
20985 aarch64_symbol_binds_local_p (const_rtx x)
20987 return (SYMBOL_REF_DECL (x)
20988 ? targetm.binds_local_p (SYMBOL_REF_DECL (x))
20989 : SYMBOL_REF_LOCAL_P (x));
20992 /* Return true if SYMBOL_REF X is thread-local.  */
20993 static bool
20994 aarch64_tls_symbol_p (rtx x)
20996 if (! TARGET_HAVE_TLS)
20997 return false;
20999 x = strip_salt (x);
21000 if (!SYMBOL_REF_P (x))
21001 return false;
21003 return SYMBOL_REF_TLS_MODEL (x) != 0;
21006 /* Classify a TLS symbol into one of the TLS kinds. */
21007 enum aarch64_symbol_type
21008 aarch64_classify_tls_symbol (rtx x)
21010 enum tls_model tls_kind = tls_symbolic_operand_type (x);
21012 switch (tls_kind)
21014 case TLS_MODEL_GLOBAL_DYNAMIC:
21015 case TLS_MODEL_LOCAL_DYNAMIC:
21016 return TARGET_TLS_DESC ? SYMBOL_SMALL_TLSDESC : SYMBOL_SMALL_TLSGD;
21018 case TLS_MODEL_INITIAL_EXEC:
21019 switch (aarch64_cmodel)
21021 case AARCH64_CMODEL_TINY:
21022 case AARCH64_CMODEL_TINY_PIC:
21023 return SYMBOL_TINY_TLSIE;
21024 default:
21025 return SYMBOL_SMALL_TLSIE;
21028 case TLS_MODEL_LOCAL_EXEC:
21029 if (aarch64_tls_size == 12)
21030 return SYMBOL_TLSLE12;
21031 else if (aarch64_tls_size == 24)
21032 return SYMBOL_TLSLE24;
21033 else if (aarch64_tls_size == 32)
21034 return SYMBOL_TLSLE32;
21035 else if (aarch64_tls_size == 48)
21036 return SYMBOL_TLSLE48;
21037 else
21038 gcc_unreachable ();
21040 case TLS_MODEL_EMULATED:
21041 case TLS_MODEL_NONE:
21042 return SYMBOL_FORCE_TO_MEM;
21044 default:
21045 gcc_unreachable ();
21049 /* Return the correct method for accessing X + OFFSET, where X is either
21050 a SYMBOL_REF or LABEL_REF. */
21052 enum aarch64_symbol_type
21053 aarch64_classify_symbol (rtx x, HOST_WIDE_INT offset)
21055 x = strip_salt (x);
21057 if (LABEL_REF_P (x))
21059 switch (aarch64_cmodel)
21061 case AARCH64_CMODEL_LARGE:
21062 return SYMBOL_FORCE_TO_MEM;
21064 case AARCH64_CMODEL_TINY_PIC:
21065 case AARCH64_CMODEL_TINY:
21066 return SYMBOL_TINY_ABSOLUTE;
21068 case AARCH64_CMODEL_SMALL_SPIC:
21069 case AARCH64_CMODEL_SMALL_PIC:
21070 case AARCH64_CMODEL_SMALL:
21071 return SYMBOL_SMALL_ABSOLUTE;
21073 default:
21074 gcc_unreachable ();
21078 if (SYMBOL_REF_P (x))
21080 if (aarch64_tls_symbol_p (x))
21081 return aarch64_classify_tls_symbol (x);
21083 switch (aarch64_cmodel)
21085 case AARCH64_CMODEL_TINY_PIC:
21086 case AARCH64_CMODEL_TINY:
21087 /* With -fPIC non-local symbols use the GOT. For orthogonality
21088 always use the GOT for extern weak symbols. */
21089 if (!TARGET_PECOFF
21090 && (flag_pic || SYMBOL_REF_WEAK (x))
21091 && !aarch64_symbol_binds_local_p (x))
21092 return SYMBOL_TINY_GOT;
21094 /* When we retrieve symbol + offset address, we have to make sure
21095 the offset does not cause overflow of the final address. But
21096 	 we have no way of knowing the address of the symbol at compile time
21097 	 so we can't accurately say if the distance between the PC and
21098 	 symbol + offset is outside the addressable range of +/-1MB in the
21099 TINY code model. So we limit the maximum offset to +/-64KB and
21100 assume the offset to the symbol is not larger than +/-(1MB - 64KB).
21101 If offset_within_block_p is true we allow larger offsets. */
21102 if (!(IN_RANGE (offset, -0x10000, 0x10000)
21103 || offset_within_block_p (x, offset)))
21104 return SYMBOL_FORCE_TO_MEM;
21106 return SYMBOL_TINY_ABSOLUTE;
21109 case AARCH64_CMODEL_SMALL_SPIC:
21110 case AARCH64_CMODEL_SMALL_PIC:
21111 case AARCH64_CMODEL_SMALL:
21112 if (!TARGET_PECOFF
21113 && (flag_pic || SYMBOL_REF_WEAK (x))
21114 && !aarch64_symbol_binds_local_p (x))
21115 return aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC
21116 ? SYMBOL_SMALL_GOT_28K : SYMBOL_SMALL_GOT_4G;
21118 /* Same reasoning as the tiny code model, but the offset cap here is
21119 1MB, allowing +/-3.9GB for the offset to the symbol. */
21120 if (!(IN_RANGE (offset, -0x100000, 0x100000)
21121 || offset_within_block_p (x, offset)))
21122 return SYMBOL_FORCE_TO_MEM;
21124 return SYMBOL_SMALL_ABSOLUTE;
21126 case AARCH64_CMODEL_LARGE:
21127 /* This is alright even in PIC code as the constant
21128 pool reference is always PC relative and within
21129 the same translation unit. */
21130 if (!aarch64_pcrelative_literal_loads && CONSTANT_POOL_ADDRESS_P (x))
21131 return SYMBOL_SMALL_ABSOLUTE;
21132 else
21133 return SYMBOL_FORCE_TO_MEM;
21135 default:
21136 gcc_unreachable ();
21140 /* By default push everything into the constant pool. */
21141 return SYMBOL_FORCE_TO_MEM;
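/* For illustration, under -mcmodel=tiny an address such as &foo + 0x8000
   stays SYMBOL_TINY_ABSOLUTE (the offset is within the +/-64KB cap
   above), whereas &foo + 0x200000 is forced to the constant pool unless
   the offset provably stays within foo's own block; the small model
   applies the same scheme with a +/-1MB cap.  */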
21144 bool
21145 aarch64_constant_address_p (rtx x)
21147 return (CONSTANT_P (x) && memory_address_p (DImode, x));
21150 bool
21151 aarch64_legitimate_pic_operand_p (rtx x)
21153 poly_int64 offset;
21154 x = strip_offset_and_salt (x, &offset);
21155 if (SYMBOL_REF_P (x))
21156 return false;
21158 return true;
21161 /* Implement TARGET_LEGITIMATE_CONSTANT_P hook. Return true for constants
21162 that should be rematerialized rather than spilled. */
21164 static bool
21165 aarch64_legitimate_constant_p (machine_mode mode, rtx x)
21167 /* Support CSE and rematerialization of common constants. */
21168 if (CONST_INT_P (x)
21169 || CONST_DOUBLE_P (x))
21170 return true;
21172 /* Only accept variable-length vector constants if they can be
21173 handled directly.
21175 ??? It would be possible (but complex) to handle rematerialization
21176 of other constants via secondary reloads. */
21177 if (!GET_MODE_SIZE (mode).is_constant ())
21178 return aarch64_simd_valid_mov_imm (x);
21180 /* Otherwise, accept any CONST_VECTOR that, if all else fails, can at
21181 least be forced to memory and loaded from there. */
21182 if (CONST_VECTOR_P (x))
21183 return !targetm.cannot_force_const_mem (mode, x);
21185 /* Do not allow vector struct mode constants for Advanced SIMD.
21186 We could support 0 and -1 easily, but they need support in
21187 aarch64-simd.md. */
21188 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
21189 if (vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
21190 return false;
21192 if (GET_CODE (x) == HIGH)
21193 x = XEXP (x, 0);
21195 /* Accept polynomial constants that can be calculated by using the
21196 destination of a move as the sole temporary. Constants that
21197 require a second temporary cannot be rematerialized (they can't be
21198 forced to memory and also aren't legitimate constants). */
21199 poly_int64 offset;
21200 if (poly_int_rtx_p (x, &offset))
21201 return aarch64_offset_temporaries (false, offset) <= 1;
21203 /* If an offset is being added to something else, we need to allow the
21204 base to be moved into the destination register, meaning that there
21205 are no free temporaries for the offset. */
21206 x = strip_offset_and_salt (x, &offset);
21207 if (!offset.is_constant () && aarch64_offset_temporaries (true, offset) > 0)
21208 return false;
21210 /* Do not allow const (plus (anchor_symbol, const_int)). */
21211 if (maybe_ne (offset, 0) && SYMBOL_REF_P (x) && SYMBOL_REF_ANCHOR_P (x))
21212 return false;
21214 /* Treat symbols as constants. Avoid TLS symbols as they are complex,
21215 so spilling them is better than rematerialization. */
21216 if (SYMBOL_REF_P (x) && !SYMBOL_REF_TLS_MODEL (x))
21217 return true;
21219 /* Label references are always constant. */
21220 if (LABEL_REF_P (x))
21221 return true;
21223 return false;
21227 aarch64_load_tp (rtx target)
21229 if (!target
21230 || GET_MODE (target) != Pmode
21231 || !register_operand (target, Pmode))
21232 target = gen_reg_rtx (Pmode);
21234 /* Can return in any reg. */
21235 emit_insn (gen_aarch64_load_tp_hard (target));
21236 return target;
21239 /* On AAPCS systems, this is the "struct __va_list". */
21240 static GTY(()) tree va_list_type;
21242 /* Implement TARGET_BUILD_BUILTIN_VA_LIST.
21243 Return the type to use as __builtin_va_list.
21245 AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:
21247 struct __va_list
21249 void *__stack;
21250 void *__gr_top;
21251 void *__vr_top;
21252 int __gr_offs;
21253 int __vr_offs;
21254 }; */
21256 static tree
21257 aarch64_build_builtin_va_list (void)
21259 tree va_list_name;
21260 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
21262 /* Create the type. */
21263 va_list_type = lang_hooks.types.make_type (RECORD_TYPE);
21264 /* Give it the required name. */
21265 va_list_name = build_decl (BUILTINS_LOCATION,
21266 TYPE_DECL,
21267 get_identifier ("__va_list"),
21268 va_list_type);
21269 DECL_ARTIFICIAL (va_list_name) = 1;
21270 TREE_PUBLIC (va_list_name) = 1;
21271 TYPE_NAME (va_list_type) = va_list_name;
21272 TYPE_STUB_DECL (va_list_type) = va_list_name;
21274 /* Create the fields. */
21275 f_stack = build_decl (BUILTINS_LOCATION,
21276 FIELD_DECL, get_identifier ("__stack"),
21277 ptr_type_node);
21278 f_grtop = build_decl (BUILTINS_LOCATION,
21279 FIELD_DECL, get_identifier ("__gr_top"),
21280 ptr_type_node);
21281 f_vrtop = build_decl (BUILTINS_LOCATION,
21282 FIELD_DECL, get_identifier ("__vr_top"),
21283 ptr_type_node);
21284 f_groff = build_decl (BUILTINS_LOCATION,
21285 FIELD_DECL, get_identifier ("__gr_offs"),
21286 integer_type_node);
21287 f_vroff = build_decl (BUILTINS_LOCATION,
21288 FIELD_DECL, get_identifier ("__vr_offs"),
21289 integer_type_node);
21291 /* Tell tree-stdarg pass about our internal offset fields.
21292    NOTE: va_list_gpr/fpr_counter_field are only used for tree comparison
21293    purposes, to identify whether the code is updating the va_list internal
21294    offset fields in an irregular way.  */
21295 va_list_gpr_counter_field = f_groff;
21296 va_list_fpr_counter_field = f_vroff;
21298 DECL_ARTIFICIAL (f_stack) = 1;
21299 DECL_ARTIFICIAL (f_grtop) = 1;
21300 DECL_ARTIFICIAL (f_vrtop) = 1;
21301 DECL_ARTIFICIAL (f_groff) = 1;
21302 DECL_ARTIFICIAL (f_vroff) = 1;
21304 DECL_FIELD_CONTEXT (f_stack) = va_list_type;
21305 DECL_FIELD_CONTEXT (f_grtop) = va_list_type;
21306 DECL_FIELD_CONTEXT (f_vrtop) = va_list_type;
21307 DECL_FIELD_CONTEXT (f_groff) = va_list_type;
21308 DECL_FIELD_CONTEXT (f_vroff) = va_list_type;
21310 TYPE_FIELDS (va_list_type) = f_stack;
21311 DECL_CHAIN (f_stack) = f_grtop;
21312 DECL_CHAIN (f_grtop) = f_vrtop;
21313 DECL_CHAIN (f_vrtop) = f_groff;
21314 DECL_CHAIN (f_groff) = f_vroff;
21316 /* Compute its layout. */
21317 layout_type (va_list_type);
21319 return va_list_type;
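/* A rough sketch, assuming the usual LP64 AAPCS64 layout (8-byte pointers,
   4-byte ints), of how the record built above ends up laid out; the offsets
   are inferred from the field order rather than taken from a dump:

     offset 0   void *__stack;
     offset 8   void *__gr_top;
     offset 16  void *__vr_top;
     offset 24  int   __gr_offs;
     offset 28  int   __vr_offs;
     sizeof (__va_list) == 32, alignof (__va_list) == 8  */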
21322 /* Implement TARGET_EXPAND_BUILTIN_VA_START. */
21323 static void
21324 aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)
21326 const CUMULATIVE_ARGS *cum;
21327 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
21328 tree stack, grtop, vrtop, groff, vroff;
21329 tree t;
21330 int gr_save_area_size = cfun->va_list_gpr_size;
21331 int vr_save_area_size = cfun->va_list_fpr_size;
21332 int vr_offset;
21334 cum = &crtl->args.info;
21335 if (cfun->va_list_gpr_size)
21336 gr_save_area_size = MIN ((NUM_ARG_REGS - cum->aapcs_ncrn) * UNITS_PER_WORD,
21337 cfun->va_list_gpr_size);
21338 if (cfun->va_list_fpr_size)
21339 vr_save_area_size = MIN ((NUM_FP_ARG_REGS - cum->aapcs_nvrn)
21340 * UNITS_PER_VREG, cfun->va_list_fpr_size);
21342 if (!TARGET_FLOAT)
21344 gcc_assert (cum->aapcs_nvrn == 0);
21345 vr_save_area_size = 0;
21348 f_stack = TYPE_FIELDS (va_list_type_node);
21349 f_grtop = DECL_CHAIN (f_stack);
21350 f_vrtop = DECL_CHAIN (f_grtop);
21351 f_groff = DECL_CHAIN (f_vrtop);
21352 f_vroff = DECL_CHAIN (f_groff);
21354 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), valist, f_stack,
21355 NULL_TREE);
21356 grtop = build3 (COMPONENT_REF, TREE_TYPE (f_grtop), valist, f_grtop,
21357 NULL_TREE);
21358 vrtop = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop), valist, f_vrtop,
21359 NULL_TREE);
21360 groff = build3 (COMPONENT_REF, TREE_TYPE (f_groff), valist, f_groff,
21361 NULL_TREE);
21362 vroff = build3 (COMPONENT_REF, TREE_TYPE (f_vroff), valist, f_vroff,
21363 NULL_TREE);
21365 /* Emit code to initialize STACK, which points to the next varargs stack
21366 argument. CUM->AAPCS_STACK_SIZE gives the number of stack words used
21367 by named arguments. STACK is 8-byte aligned. */
21368 t = make_tree (TREE_TYPE (stack), virtual_incoming_args_rtx);
21369 if (cum->aapcs_stack_size > 0)
21370 t = fold_build_pointer_plus_hwi (t, cum->aapcs_stack_size * UNITS_PER_WORD);
21371 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), stack, t);
21372 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
21374 /* Emit code to initialize GRTOP, the top of the GR save area.
21375 virtual_incoming_args_rtx should have been 16 byte aligned. */
21376 t = make_tree (TREE_TYPE (grtop), virtual_incoming_args_rtx);
21377 t = build2 (MODIFY_EXPR, TREE_TYPE (grtop), grtop, t);
21378 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
21380 /* Emit code to initialize VRTOP, the top of the VR save area.
21381 This address is gr_save_area_bytes below GRTOP, rounded
21382 down to the next 16-byte boundary. */
21383 t = make_tree (TREE_TYPE (vrtop), virtual_incoming_args_rtx);
21384 vr_offset = ROUND_UP (gr_save_area_size,
21385 STACK_BOUNDARY / BITS_PER_UNIT);
21387 if (vr_offset)
21388 t = fold_build_pointer_plus_hwi (t, -vr_offset);
21389 t = build2 (MODIFY_EXPR, TREE_TYPE (vrtop), vrtop, t);
21390 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
21392 /* Emit code to initialize GROFF, the offset from GRTOP of the
21393 next GPR argument. */
21394 t = build2 (MODIFY_EXPR, TREE_TYPE (groff), groff,
21395 build_int_cst (TREE_TYPE (groff), -gr_save_area_size));
21396 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
21398   /* Likewise emit code to initialize VROFF, the offset from VRTOP
21399 of the next VR argument. */
21400 t = build2 (MODIFY_EXPR, TREE_TYPE (vroff), vroff,
21401 build_int_cst (TREE_TYPE (vroff), -vr_save_area_size));
21402 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
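/* A worked example (hand-derived, not from a compiler dump) for
   "int f (int n, ...)" with TARGET_FLOAT, assuming the usual AAPCS64
   parameters: 8 general argument registers saved as 8-byte slots and
   8 FP/SIMD argument registers saved as 16-byte slots.  The single named
   argument uses x0, so aapcs_ncrn == 1 and aapcs_nvrn == 0, and (ignoring
   any shrinking done by the tree-stdarg analysis):

     gr_save_area_size = (8 - 1) * 8  = 56
     vr_save_area_size = (8 - 0) * 16 = 128

     __stack   = incoming argument pointer (no named stack arguments)
     __gr_top  = incoming argument pointer
     __vr_top  = __gr_top - ROUND_UP (56, 16), i.e. __gr_top - 64
     __gr_offs = -56
     __vr_offs = -128  */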
21405 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR. */
21407 static tree
21408 aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,
21409 gimple_seq *post_p ATTRIBUTE_UNUSED)
21411 tree addr;
21412 bool indirect_p;
21413 bool is_ha; /* is HFA or HVA. */
21414 bool dw_align; /* double-word align. */
21415 machine_mode ag_mode = VOIDmode;
21416 int nregs;
21417 machine_mode mode;
21419 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
21420 tree stack, f_top, f_off, off, arg, roundup, on_stack;
21421 HOST_WIDE_INT size, rsize, adjust, align;
21422 tree t, u, cond1, cond2;
21424 indirect_p = pass_va_arg_by_reference (type);
21425 if (indirect_p)
21426 type = build_pointer_type (type);
21428 mode = TYPE_MODE (type);
21430 f_stack = TYPE_FIELDS (va_list_type_node);
21431 f_grtop = DECL_CHAIN (f_stack);
21432 f_vrtop = DECL_CHAIN (f_grtop);
21433 f_groff = DECL_CHAIN (f_vrtop);
21434 f_vroff = DECL_CHAIN (f_groff);
21436 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), unshare_expr (valist),
21437 f_stack, NULL_TREE);
21438 size = int_size_in_bytes (type);
21440 unsigned int abi_break_gcc_9;
21441 unsigned int abi_break_gcc_13;
21442 unsigned int abi_break_gcc_14;
21443 align
21444 = aarch64_function_arg_alignment (mode, type, &abi_break_gcc_9,
21445 &abi_break_gcc_13, &abi_break_gcc_14)
21446 / BITS_PER_UNIT;
21448 dw_align = false;
21449 adjust = 0;
21450 if (aarch64_vfp_is_call_or_return_candidate (mode, type, &ag_mode, &nregs,
21451 &is_ha, false))
21453 /* No frontends can create types with variable-sized modes, so we
21454 shouldn't be asked to pass or return them. */
21455 unsigned int ag_size = GET_MODE_SIZE (ag_mode).to_constant ();
21457 /* TYPE passed in fp/simd registers. */
21458 if (!TARGET_FLOAT)
21459 aarch64_err_no_fpadvsimd (mode);
21461 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop),
21462 unshare_expr (valist), f_vrtop, NULL_TREE);
21463 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_vroff),
21464 unshare_expr (valist), f_vroff, NULL_TREE);
21466 rsize = nregs * UNITS_PER_VREG;
21468 if (is_ha)
21470 if (BYTES_BIG_ENDIAN && ag_size < UNITS_PER_VREG)
21471 adjust = UNITS_PER_VREG - ag_size;
21473 else if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
21474 && size < UNITS_PER_VREG)
21476 adjust = UNITS_PER_VREG - size;
21479 else
21481 /* TYPE passed in general registers. */
21482 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_grtop),
21483 unshare_expr (valist), f_grtop, NULL_TREE);
21484 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_groff),
21485 unshare_expr (valist), f_groff, NULL_TREE);
21486 rsize = ROUND_UP (size, UNITS_PER_WORD);
21487 nregs = rsize / UNITS_PER_WORD;
21489 if (align <= 8
21490 && abi_break_gcc_13
21491 && warn_psabi
21492 && !bitint_or_aggr_of_bitint_p (type))
21493 inform (input_location, "parameter passing for argument of type "
21494 "%qT changed in GCC 13.1", type);
21496 if (warn_psabi
21497 && abi_break_gcc_14
21498 && (abi_break_gcc_14 > 8 * BITS_PER_UNIT) != (align > 8)
21499 && !bitint_or_aggr_of_bitint_p (type))
21500 inform (input_location, "parameter passing for argument of type "
21501 "%qT changed in GCC 14.1", type);
21503 if (align > 8)
21505 if (abi_break_gcc_9
21506 && warn_psabi
21507 && !bitint_or_aggr_of_bitint_p (type))
21508 inform (input_location, "parameter passing for argument of type "
21509 "%qT changed in GCC 9.1", type);
21510 dw_align = true;
21513 if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
21514 && size < UNITS_PER_WORD)
21516 adjust = UNITS_PER_WORD - size;
21520 /* Get a local temporary for the field value. */
21521 off = get_initialized_tmp_var (f_off, pre_p, NULL);
21523 /* Emit code to branch if off >= 0. */
21524 t = build2 (GE_EXPR, boolean_type_node, off,
21525 build_int_cst (TREE_TYPE (off), 0));
21526 cond1 = build3 (COND_EXPR, ptr_type_node, t, NULL_TREE, NULL_TREE);
21528 if (dw_align)
21530 /* Emit: offs = (offs + 15) & -16. */
21531 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
21532 build_int_cst (TREE_TYPE (off), 15));
21533 t = build2 (BIT_AND_EXPR, TREE_TYPE (off), t,
21534 build_int_cst (TREE_TYPE (off), -16));
21535 roundup = build2 (MODIFY_EXPR, TREE_TYPE (off), off, t);
21537 else
21538 roundup = NULL;
21540 /* Update ap.__[g|v]r_offs */
21541 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
21542 build_int_cst (TREE_TYPE (off), rsize));
21543 t = build2 (MODIFY_EXPR, TREE_TYPE (f_off), unshare_expr (f_off), t);
21545 /* String up. */
21546 if (roundup)
21547 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
21549 /* [cond2] if (ap.__[g|v]r_offs > 0) */
21550 u = build2 (GT_EXPR, boolean_type_node, unshare_expr (f_off),
21551 build_int_cst (TREE_TYPE (f_off), 0));
21552 cond2 = build3 (COND_EXPR, ptr_type_node, u, NULL_TREE, NULL_TREE);
21554 /* String up: make sure the assignment happens before the use. */
21555 t = build2 (COMPOUND_EXPR, TREE_TYPE (cond2), t, cond2);
21556 COND_EXPR_ELSE (cond1) = t;
21558 /* Prepare the trees handling the argument that is passed on the stack;
21559      the top-level node will be stored in ON_STACK.  */
21560 arg = get_initialized_tmp_var (stack, pre_p, NULL);
21561 if (align > 8)
21563 /* if (alignof(type) > 8) (arg = arg + 15) & -16; */
21564 t = fold_build_pointer_plus_hwi (arg, 15);
21565 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
21566 build_int_cst (TREE_TYPE (t), -16));
21567 roundup = build2 (MODIFY_EXPR, TREE_TYPE (arg), arg, t);
21569 else
21570 roundup = NULL;
21571 /* Advance ap.__stack */
21572 t = fold_build_pointer_plus_hwi (arg, size + 7);
21573 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
21574 build_int_cst (TREE_TYPE (t), -8));
21575 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), unshare_expr (stack), t);
21576 /* String up roundup and advance. */
21577 if (roundup)
21578 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
21579 /* String up with arg */
21580 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), t, arg);
21581 /* Big-endianness related address adjustment. */
21582 if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
21583 && size < UNITS_PER_WORD)
21585 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (arg), arg,
21586 size_int (UNITS_PER_WORD - size));
21587 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), on_stack, t);
21590 COND_EXPR_THEN (cond1) = unshare_expr (on_stack);
21591 COND_EXPR_THEN (cond2) = unshare_expr (on_stack);
21593 /* Adjustment to OFFSET in the case of BIG_ENDIAN. */
21594 t = off;
21595 if (adjust)
21596 t = build2 (PREINCREMENT_EXPR, TREE_TYPE (off), off,
21597 build_int_cst (TREE_TYPE (off), adjust));
21599 t = fold_convert (sizetype, t);
21600 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (f_top), f_top, t);
21602 if (is_ha)
21604 /* type ha; // treat as "struct {ftype field[n];}"
21605 ... [computing offs]
21606 	 for (i = 0; i < nregs; ++i, offs += 16)
21607 ha.field[i] = *((ftype *)(ap.__vr_top + offs));
21608 return ha; */
21609 int i;
21610 tree tmp_ha, field_t, field_ptr_t;
21612 /* Declare a local variable. */
21613 tmp_ha = create_tmp_var_raw (type, "ha");
21614 gimple_add_tmp_var (tmp_ha);
21616 /* Establish the base type. */
21617 switch (ag_mode)
21619 case E_SFmode:
21620 field_t = float_type_node;
21621 field_ptr_t = float_ptr_type_node;
21622 break;
21623 case E_DFmode:
21624 field_t = double_type_node;
21625 field_ptr_t = double_ptr_type_node;
21626 break;
21627 case E_TFmode:
21628 field_t = long_double_type_node;
21629 field_ptr_t = long_double_ptr_type_node;
21630 break;
21631 case E_SDmode:
21632 field_t = dfloat32_type_node;
21633 field_ptr_t = build_pointer_type (dfloat32_type_node);
21634 break;
21635 case E_DDmode:
21636 field_t = dfloat64_type_node;
21637 field_ptr_t = build_pointer_type (dfloat64_type_node);
21638 break;
21639 case E_TDmode:
21640 field_t = dfloat128_type_node;
21641 field_ptr_t = build_pointer_type (dfloat128_type_node);
21642 break;
21643 case E_HFmode:
21644 field_t = aarch64_fp16_type_node;
21645 field_ptr_t = aarch64_fp16_ptr_type_node;
21646 break;
21647 case E_BFmode:
21648 field_t = bfloat16_type_node;
21649 field_ptr_t = aarch64_bf16_ptr_type_node;
21650 break;
21651 case E_V2SImode:
21652 case E_V4SImode:
21654 tree innertype = make_signed_type (GET_MODE_PRECISION (SImode));
21655 field_t = build_vector_type_for_mode (innertype, ag_mode);
21656 field_ptr_t = build_pointer_type (field_t);
21658 break;
21659 default:
21660 gcc_assert (0);
21663       /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area)  */
21664 TREE_ADDRESSABLE (tmp_ha) = 1;
21665 tmp_ha = build1 (ADDR_EXPR, field_ptr_t, tmp_ha);
21666 addr = t;
21667 t = fold_convert (field_ptr_t, addr);
21668 t = build2 (MODIFY_EXPR, field_t,
21669 build1 (INDIRECT_REF, field_t, tmp_ha),
21670 build1 (INDIRECT_REF, field_t, t));
21672 /* ha.field[i] = *((field_ptr_t)vr_saved_area + i) */
21673 for (i = 1; i < nregs; ++i)
21675 addr = fold_build_pointer_plus_hwi (addr, UNITS_PER_VREG);
21676 u = fold_convert (field_ptr_t, addr);
21677 u = build2 (MODIFY_EXPR, field_t,
21678 build2 (MEM_REF, field_t, tmp_ha,
21679 build_int_cst (field_ptr_t,
21680 (i *
21681 int_size_in_bytes (field_t)))),
21682 build1 (INDIRECT_REF, field_t, u));
21683 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), t, u);
21686 u = fold_convert (TREE_TYPE (f_top), tmp_ha);
21687 t = build2 (COMPOUND_EXPR, TREE_TYPE (f_top), t, u);
21690 COND_EXPR_ELSE (cond2) = t;
21691 addr = fold_convert (build_pointer_type (type), cond1);
21692 addr = build_va_arg_indirect_ref (addr);
21694 if (indirect_p)
21695 addr = build_va_arg_indirect_ref (addr);
21697 return addr;
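/* A rough C-level paraphrase of the tree built above for a non-HFA argument
   passed in general registers (the FP/SIMD path is analogous, using the
   __vr_* fields and 16-byte slots); simplified and untested.  RSIZE is the
   argument size rounded up to a multiple of 8 bytes:

     off = ap.__gr_offs;
     if (off >= 0)
       goto on_stack;                    // register area already exhausted
     if (alignof (type) > 8)
       off = (off + 15) & -16;           // dw_align case
     ap.__gr_offs = off + rsize;
     if (ap.__gr_offs > 0)
       goto on_stack;                    // this argument spills to the stack
     addr = ap.__gr_top + off;           // plus a big-endian adjustment
     goto done;

   on_stack:
     arg = ap.__stack;
     if (alignof (type) > 8)
       arg = (arg + 15) & -16;
     ap.__stack = (arg + size + 7) & -8;
     addr = arg;                         // plus a big-endian adjustment
   done:
     result = *(type *) addr;            // or one more indirection if the
                                         // argument is passed by reference  */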
21700 /* Implement TARGET_SETUP_INCOMING_VARARGS. */
21702 static void
21703 aarch64_setup_incoming_varargs (cumulative_args_t cum_v,
21704 const function_arg_info &arg,
21705 int *pretend_size ATTRIBUTE_UNUSED, int no_rtl)
21707 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
21708 CUMULATIVE_ARGS local_cum;
21709 int gr_saved = cfun->va_list_gpr_size;
21710 int vr_saved = cfun->va_list_fpr_size;
21712 /* The caller has advanced CUM up to, but not beyond, the last named
21713 argument. Advance a local copy of CUM past the last "real" named
21714 argument, to find out how many registers are left over. */
21715 local_cum = *cum;
21716 if (!TYPE_NO_NAMED_ARGS_STDARG_P (TREE_TYPE (current_function_decl)))
21717     aarch64_function_arg_advance (pack_cumulative_args (&local_cum), arg);
21719   /* Find out how many registers we need to save.
21720      Honor the tree-stdarg analysis results.  */
21721 if (cfun->va_list_gpr_size)
21722 gr_saved = MIN (NUM_ARG_REGS - local_cum.aapcs_ncrn,
21723 cfun->va_list_gpr_size / UNITS_PER_WORD);
21724 if (cfun->va_list_fpr_size)
21725 vr_saved = MIN (NUM_FP_ARG_REGS - local_cum.aapcs_nvrn,
21726 cfun->va_list_fpr_size / UNITS_PER_VREG);
21728 if (!TARGET_FLOAT)
21730 gcc_assert (local_cum.aapcs_nvrn == 0);
21731 vr_saved = 0;
21734 if (!no_rtl)
21736 if (gr_saved > 0)
21738 rtx ptr, mem;
21740 /* virtual_incoming_args_rtx should have been 16-byte aligned. */
21741 ptr = plus_constant (Pmode, virtual_incoming_args_rtx,
21742 - gr_saved * UNITS_PER_WORD);
21743 mem = gen_frame_mem (BLKmode, ptr);
21744 set_mem_alias_set (mem, get_varargs_alias_set ());
21746 move_block_from_reg (local_cum.aapcs_ncrn + R0_REGNUM,
21747 mem, gr_saved);
21749 if (vr_saved > 0)
21751 /* We can't use move_block_from_reg, because it will use
21752 the wrong mode, storing D regs only. */
21753 machine_mode mode = TImode;
21754 int off, i, vr_start;
21756 /* Set OFF to the offset from virtual_incoming_args_rtx of
21757 the first vector register. The VR save area lies below
21758 the GR one, and is aligned to 16 bytes. */
21759 off = -ROUND_UP (gr_saved * UNITS_PER_WORD,
21760 STACK_BOUNDARY / BITS_PER_UNIT);
21761 off -= vr_saved * UNITS_PER_VREG;
21763 vr_start = V0_REGNUM + local_cum.aapcs_nvrn;
21764 for (i = 0; i < vr_saved; ++i)
21766 rtx ptr, mem;
21768 ptr = plus_constant (Pmode, virtual_incoming_args_rtx, off);
21769 mem = gen_frame_mem (mode, ptr);
21770 set_mem_alias_set (mem, get_varargs_alias_set ());
21771 aarch64_emit_move (mem, gen_rtx_REG (mode, vr_start + i));
21772 off += UNITS_PER_VREG;
21777 /* We don't save the size into *PRETEND_SIZE because we want to avoid
21778 any complication of having crtl->args.pretend_args_size changed. */
21779 cfun->machine->frame.saved_varargs_size
21780 = (ROUND_UP (gr_saved * UNITS_PER_WORD,
21781 STACK_BOUNDARY / BITS_PER_UNIT)
21782 + vr_saved * UNITS_PER_VREG);
21785 static void
21786 aarch64_conditional_register_usage (void)
21788 int i;
21789 if (!TARGET_FLOAT)
21791 for (i = V0_REGNUM; i <= V31_REGNUM; i++)
21793 fixed_regs[i] = 1;
21794 call_used_regs[i] = 1;
21795 CLEAR_HARD_REG_BIT (operand_reg_set, i);
21798 if (!TARGET_SVE)
21799 for (i = P0_REGNUM; i <= P15_REGNUM; i++)
21801 fixed_regs[i] = 1;
21802 call_used_regs[i] = 1;
21805 /* Only allow these registers to be accessed via special patterns. */
21806 CLEAR_HARD_REG_BIT (operand_reg_set, VG_REGNUM);
21807 CLEAR_HARD_REG_BIT (operand_reg_set, FFR_REGNUM);
21808 CLEAR_HARD_REG_BIT (operand_reg_set, FFRT_REGNUM);
21809 for (int i = FIRST_FAKE_REGNUM; i <= LAST_FAKE_REGNUM; ++i)
21810 CLEAR_HARD_REG_BIT (operand_reg_set, i);
21812 /* When tracking speculation, we need a couple of call-clobbered registers
21813 to track the speculation state. It would be nice to just use
21814 IP0 and IP1, but currently there are numerous places that just
21815 assume these registers are free for other uses (eg pointer
21816 authentication). */
21817 if (aarch64_track_speculation)
21819 fixed_regs[SPECULATION_TRACKER_REGNUM] = 1;
21820 call_used_regs[SPECULATION_TRACKER_REGNUM] = 1;
21821 fixed_regs[SPECULATION_SCRATCH_REGNUM] = 1;
21822 call_used_regs[SPECULATION_SCRATCH_REGNUM] = 1;
21826 /* Implement TARGET_MEMBER_TYPE_FORCES_BLK. */
21828 bool
21829 aarch64_member_type_forces_blk (const_tree field_or_array, machine_mode mode)
21831 /* For records we're passed a FIELD_DECL, for arrays we're passed
21832 an ARRAY_TYPE. In both cases we're interested in the TREE_TYPE. */
21833 const_tree type = TREE_TYPE (field_or_array);
21835 /* Assign BLKmode to anything that contains more than 2 SVE predicates.
21836 For structures, the "multiple" case is indicated by MODE being
21837 VOIDmode. */
21838 unsigned int num_zr, num_pr;
21839 if (aarch64_sve::builtin_type_p (type, &num_zr, &num_pr) && num_pr > 2)
21841 if (TREE_CODE (field_or_array) == ARRAY_TYPE)
21842 return !simple_cst_equal (TYPE_SIZE (field_or_array),
21843 TYPE_SIZE (type));
21844 return mode == VOIDmode;
21847 return default_member_type_forces_blk (field_or_array, mode);
21850 /* Bitmasks that indicate whether earlier versions of GCC would have
21851 taken a different path through the ABI logic. This should result in
21852 a -Wpsabi warning if the earlier path led to a different ABI decision.
21854 WARN_PSABI_EMPTY_CXX17_BASE
21855 Indicates that the type includes an artificial empty C++17 base field
21856 that, prior to GCC 10.1, would prevent the type from being treated as
21857 a HFA or HVA. See PR94383 for details.
21859 WARN_PSABI_NO_UNIQUE_ADDRESS
21860 Indicates that the type includes an empty [[no_unique_address]] field
21861 that, prior to GCC 10.1, would prevent the type from being treated as
21862 a HFA or HVA. */
21863 const unsigned int WARN_PSABI_EMPTY_CXX17_BASE = 1U << 0;
21864 const unsigned int WARN_PSABI_NO_UNIQUE_ADDRESS = 1U << 1;
21865 const unsigned int WARN_PSABI_ZERO_WIDTH_BITFIELD = 1U << 2;
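/* Hedged illustrations of the three cases, based on the descriptions in the
   surrounding comments rather than on testsuite output (the struct names are
   invented):

     struct empty {};
     struct s1 : empty { float x, y; };
       // artificial empty C++17 base: treated as an HFA again since GCC 10.1
       // (PR94383); WARN_PSABI_EMPTY_CXX17_BASE

     struct s2 { [[no_unique_address]] empty e; float x, y; };
       // empty [[no_unique_address]] member: likewise changed in GCC 10.1;
       // WARN_PSABI_NO_UNIQUE_ADDRESS

     struct s3 { float x; int : 0; float y; };
       // zero-width bit-field: WARN_PSABI_ZERO_WIDTH_BITFIELD, argument
       // passing changed in GCC 12.1 unless the C++ front end has already
       // stripped the field  */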
21867 /* Walk down the type tree of TYPE counting consecutive base elements.
21868 If *MODEP is VOIDmode, then set it to the first valid floating point
21869 type. If a non-floating point type is found, or if a floating point
21870 type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
21871 otherwise return the count in the sub-tree.
21873 The WARN_PSABI_FLAGS argument allows the caller to check whether this
21874 function has changed its behavior relative to earlier versions of GCC.
21875 Normally the argument should be nonnull and point to a zero-initialized
21876 variable. The function then records whether the ABI decision might
21877 be affected by a known fix to the ABI logic, setting the associated
21878 WARN_PSABI_* bits if so.
21880 When the argument is instead a null pointer, the function tries to
21881 simulate the behavior of GCC before all such ABI fixes were made.
21882 This is useful to check whether the function returns something
21883 different after the ABI fixes. */
21884 static int
21885 aapcs_vfp_sub_candidate (const_tree type, machine_mode *modep,
21886 unsigned int *warn_psabi_flags)
21888 machine_mode mode;
21889 HOST_WIDE_INT size;
21891 if (aarch64_sve::builtin_type_p (type))
21892 return -1;
21894 switch (TREE_CODE (type))
21896 case REAL_TYPE:
21897 mode = TYPE_MODE (type);
21898 if (mode != DFmode && mode != SFmode
21899 && mode != TFmode && mode != HFmode
21900 && mode != SDmode && mode != DDmode && mode != TDmode)
21901 return -1;
21903 if (*modep == VOIDmode)
21904 *modep = mode;
21906 if (*modep == mode)
21907 return 1;
21909 break;
21911 case COMPLEX_TYPE:
21912 mode = TYPE_MODE (TREE_TYPE (type));
21913 if (mode != DFmode && mode != SFmode
21914 && mode != TFmode && mode != HFmode)
21915 return -1;
21917 if (*modep == VOIDmode)
21918 *modep = mode;
21920 if (*modep == mode)
21921 return 2;
21923 break;
21925 case VECTOR_TYPE:
21926 /* Use V2SImode and V4SImode as representatives of all 64-bit
21927 and 128-bit vector types. */
21928 size = int_size_in_bytes (type);
21929 switch (size)
21931 case 8:
21932 mode = V2SImode;
21933 break;
21934 case 16:
21935 mode = V4SImode;
21936 break;
21937 default:
21938 return -1;
21941 if (*modep == VOIDmode)
21942 *modep = mode;
21944 /* Vector modes are considered to be opaque: two vectors are
21945 equivalent for the purposes of being homogeneous aggregates
21946 if they are the same size. */
21947 if (*modep == mode)
21948 return 1;
21950 break;
21952 case ARRAY_TYPE:
21954 int count;
21955 tree index = TYPE_DOMAIN (type);
21957 /* Can't handle incomplete types nor sizes that are not
21958 fixed. */
21959 if (!COMPLETE_TYPE_P (type)
21960 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
21961 return -1;
21963 count = aapcs_vfp_sub_candidate (TREE_TYPE (type), modep,
21964 warn_psabi_flags);
21965 if (count == -1
21966 || !index
21967 || !TYPE_MAX_VALUE (index)
21968 || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index))
21969 || !TYPE_MIN_VALUE (index)
21970 || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index))
21971 || count < 0)
21972 return -1;
21974 count *= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index))
21975 - tree_to_uhwi (TYPE_MIN_VALUE (index)));
21977 /* There must be no padding. */
21978 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
21979 count * GET_MODE_BITSIZE (*modep)))
21980 return -1;
21982 return count;
21985 case RECORD_TYPE:
21987 int count = 0;
21988 int sub_count;
21989 tree field;
21991 /* Can't handle incomplete types nor sizes that are not
21992 fixed. */
21993 if (!COMPLETE_TYPE_P (type)
21994 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
21995 return -1;
21997 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
21999 if (TREE_CODE (field) != FIELD_DECL)
22000 continue;
22002 if (DECL_FIELD_ABI_IGNORED (field))
22004 /* See whether this is something that earlier versions of
22005 GCC failed to ignore. */
22006 unsigned int flag;
22007 if (lookup_attribute ("no_unique_address",
22008 DECL_ATTRIBUTES (field)))
22009 flag = WARN_PSABI_NO_UNIQUE_ADDRESS;
22010 else if (cxx17_empty_base_field_p (field))
22011 flag = WARN_PSABI_EMPTY_CXX17_BASE;
22012 else
22013 /* No compatibility problem. */
22014 continue;
22016 /* Simulate the old behavior when WARN_PSABI_FLAGS is null. */
22017 if (warn_psabi_flags)
22019 *warn_psabi_flags |= flag;
22020 continue;
22023 /* A zero-width bitfield may affect layout in some
22024 circumstances, but adds no members. The determination
22025 of whether or not a type is an HFA is performed after
22026 layout is complete, so if the type still looks like an
22027 HFA afterwards, it is still classed as one. This is
22028 potentially an ABI break for the hard-float ABI. */
22029 else if (DECL_BIT_FIELD (field)
22030 && integer_zerop (DECL_SIZE (field)))
22032 		/* Prior to GCC 12 these fields were stripped early,
22033 hiding them from the back-end entirely and
22034 resulting in the correct behaviour for argument
22035 passing. Simulate that old behaviour without
22036 generating a warning. */
22037 if (DECL_FIELD_CXX_ZERO_WIDTH_BIT_FIELD (field))
22038 continue;
22039 if (warn_psabi_flags)
22041 *warn_psabi_flags |= WARN_PSABI_ZERO_WIDTH_BITFIELD;
22042 continue;
22046 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep,
22047 warn_psabi_flags);
22048 if (sub_count < 0)
22049 return -1;
22050 count += sub_count;
22053 /* There must be no padding. */
22054 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
22055 count * GET_MODE_BITSIZE (*modep)))
22056 return -1;
22058 return count;
22061 case UNION_TYPE:
22062 case QUAL_UNION_TYPE:
22064 /* These aren't very interesting except in a degenerate case. */
22065 int count = 0;
22066 int sub_count;
22067 tree field;
22069 /* Can't handle incomplete types nor sizes that are not
22070 fixed. */
22071 if (!COMPLETE_TYPE_P (type)
22072 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
22073 return -1;
22075 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
22077 if (TREE_CODE (field) != FIELD_DECL)
22078 continue;
22080 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep,
22081 warn_psabi_flags);
22082 if (sub_count < 0)
22083 return -1;
22084 count = count > sub_count ? count : sub_count;
22087 /* There must be no padding. */
22088 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
22089 count * GET_MODE_BITSIZE (*modep)))
22090 return -1;
22092 return count;
22095 default:
22096 break;
22099 return -1;
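/* A few hand-worked examples of the counting above (derived from the code,
   not verified against a compiler run):

     struct { float x, y, z; }      -> 3, *modep == SFmode   (HFA)
     struct { double d[4]; }        -> 4, *modep == DFmode   (HFA)
     _Complex double                -> 2, *modep == DFmode
     struct { float32x4_t a, b; }   -> 2, *modep == V4SImode (HVA)
     struct { float f; double d; }  -> -1 (mixed base types)
     struct { float f[5]; }         -> 5, but rejected by the caller because
                                       it exceeds HA_MAX_NUM_FLDS  */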
22102 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
22103 type as described in AAPCS64 \S 4.1.2.
22105 See the comment above aarch64_composite_type_p for the notes on MODE. */
22107 static bool
22108 aarch64_short_vector_p (const_tree type,
22109 machine_mode mode)
22111 poly_int64 size = -1;
22113 if (type && VECTOR_TYPE_P (type))
22115 if (aarch64_sve::builtin_type_p (type))
22116 return false;
22117 size = int_size_in_bytes (type);
22119 else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
22120 || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
22122 /* The containing "else if" is too loose: it means that we look at TYPE
22123 if the type is a vector type (good), but that we otherwise ignore TYPE
22124 and look only at the mode. This is wrong because the type describes
22125 the language-level information whereas the mode is purely an internal
22126 GCC concept. We can therefore reach here for types that are not
22127 vectors in the AAPCS64 sense.
22129 We can't "fix" that for the traditional Advanced SIMD vector modes
22130 without breaking backwards compatibility. However, there's no such
22131 baggage for the structure modes, which were introduced in GCC 12. */
22132 if (aarch64_advsimd_struct_mode_p (mode))
22133 return false;
22135 /* For similar reasons, rely only on the type, not the mode, when
22136 processing SVE types. */
22137 if (type && aarch64_some_values_include_pst_objects_p (type))
22138 /* Leave later code to report an error if SVE is disabled. */
22139 gcc_assert (!TARGET_SVE || aarch64_sve_mode_p (mode));
22140 else
22141 size = GET_MODE_SIZE (mode);
22143 if (known_eq (size, 8) || known_eq (size, 16))
22145 /* 64-bit and 128-bit vectors should only acquire an SVE mode if
22146 they are being treated as scalable AAPCS64 types. */
22147 gcc_assert (!aarch64_sve_mode_p (mode)
22148 && !aarch64_advsimd_struct_mode_p (mode));
22149 return true;
22151 return false;
22154 /* Return TRUE if the type, as described by TYPE and MODE, is a composite
22155 type as described in AAPCS64 \S 4.3. This includes aggregate, union and
22156 array types. The C99 floating-point complex types are also considered
22157 as composite types, according to AAPCS64 \S 7.1.1. The complex integer
22158 types, which are GCC extensions and out of the scope of AAPCS64, are
22159 treated as composite types here as well.
22161 Note that MODE itself is not sufficient in determining whether a type
22162 is such a composite type or not. This is because
22163 stor-layout.cc:compute_record_mode may have already changed the MODE
22164 (BLKmode) of a RECORD_TYPE TYPE to some other mode. For example, a
22165 structure with only one field may have its MODE set to the mode of the
22166 field. Also an integer mode whose size matches the size of the
22167 RECORD_TYPE type may be used to substitute the original mode
22168 (i.e. BLKmode) in certain circumstances. In other words, MODE cannot be
22169 solely relied on. */
22171 static bool
22172 aarch64_composite_type_p (const_tree type,
22173 machine_mode mode)
22175 if (aarch64_short_vector_p (type, mode))
22176 return false;
22178 if (type && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE))
22179 return true;
22181 if (type
22182 && TREE_CODE (type) == BITINT_TYPE
22183 && int_size_in_bytes (type) > 16)
22184 return true;
22186 if (mode == BLKmode
22187 || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT
22188 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
22189 return true;
22191 return false;
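/* Rough examples of the classification above (restated from the checks, not
   from a test run):

     struct { int x; }   -> composite (aggregate, even if compute_record_mode
                            gave it an integer mode such as SImode)
     int [4]             -> composite (array)
     _Complex float      -> composite (AAPCS64 \S 7.1.1)
     _Complex int        -> composite (GCC extension, treated the same)
     _BitInt(256)        -> composite (wider than 16 bytes)
     float32x4_t         -> not composite (short vector)
     double              -> not composite  */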
22194 /* Return TRUE if an argument, whose type is described by TYPE and MODE,
22195 shall be passed or returned in simd/fp register(s) (providing these
22196 parameter passing registers are available).
22198 Upon successful return, *COUNT returns the number of needed registers,
22199 *BASE_MODE returns the mode of the individual register and when IS_HA
22200 is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
22201 floating-point aggregate or a homogeneous short-vector aggregate.
22203 SILENT_P is true if the function should refrain from reporting any
22204 diagnostics. This should only be used if the caller is certain that
22205 any ABI decisions would eventually come through this function with
22206 SILENT_P set to false. */
22208 static bool
22209 aarch64_vfp_is_call_or_return_candidate (machine_mode mode,
22210 const_tree type,
22211 machine_mode *base_mode,
22212 int *count,
22213 bool *is_ha,
22214 bool silent_p)
22216 if (is_ha != NULL) *is_ha = false;
22218 machine_mode new_mode = VOIDmode;
22219 bool composite_p = aarch64_composite_type_p (type, mode);
22221 if ((!composite_p
22222 && (GET_MODE_CLASS (mode) == MODE_FLOAT
22223 || GET_MODE_CLASS (mode) == MODE_DECIMAL_FLOAT
22224 || (type && TYPE_MAIN_VARIANT (type) == aarch64_mfp8_type_node)))
22225 || aarch64_short_vector_p (type, mode))
22227 *count = 1;
22228 new_mode = mode;
22230 else if (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT)
22232 if (is_ha != NULL) *is_ha = true;
22233 *count = 2;
22234 new_mode = GET_MODE_INNER (mode);
22236 else if (type && composite_p)
22238 unsigned int warn_psabi_flags = 0;
22239 int ag_count = aapcs_vfp_sub_candidate (type, &new_mode,
22240 &warn_psabi_flags);
22241 if (ag_count > 0 && ag_count <= HA_MAX_NUM_FLDS)
22243 static unsigned last_reported_type_uid;
22244 unsigned uid = TYPE_UID (TYPE_MAIN_VARIANT (type));
22245 int alt;
22246 if (!silent_p
22247 && warn_psabi
22248 && warn_psabi_flags
22249 && uid != last_reported_type_uid
22250 && ((alt = aapcs_vfp_sub_candidate (type, &new_mode, NULL))
22251 != ag_count))
22253 const char *url10
22254 = CHANGES_ROOT_URL "gcc-10/changes.html#empty_base";
22255 const char *url12
22256 = CHANGES_ROOT_URL "gcc-12/changes.html#zero_width_bitfields";
22257 gcc_assert (alt == -1);
22258 last_reported_type_uid = uid;
22259 /* Use TYPE_MAIN_VARIANT to strip any redundant const
22260 qualification. */
22261 if (warn_psabi_flags & WARN_PSABI_NO_UNIQUE_ADDRESS)
22262 inform (input_location, "parameter passing for argument of "
22263 "type %qT with %<[[no_unique_address]]%> members "
22264 "changed %{in GCC 10.1%}",
22265 TYPE_MAIN_VARIANT (type), url10);
22266 else if (warn_psabi_flags & WARN_PSABI_EMPTY_CXX17_BASE)
22267 inform (input_location, "parameter passing for argument of "
22268 "type %qT when C++17 is enabled changed to match "
22269 "C++14 %{in GCC 10.1%}",
22270 TYPE_MAIN_VARIANT (type), url10);
22271 else if (warn_psabi_flags & WARN_PSABI_ZERO_WIDTH_BITFIELD)
22272 inform (input_location, "parameter passing for argument of "
22273 "type %qT changed %{in GCC 12.1%}",
22274 TYPE_MAIN_VARIANT (type), url12);
22277 if (is_ha != NULL) *is_ha = true;
22278 *count = ag_count;
22280 else
22281 return false;
22283 else
22284 return false;
22286 gcc_assert (!aarch64_sve_mode_p (new_mode));
22287 *base_mode = new_mode;
22288 return true;
22291 /* Implement TARGET_STRUCT_VALUE_RTX. */
22293 static rtx
22294 aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED,
22295 int incoming ATTRIBUTE_UNUSED)
22297 return gen_rtx_REG (Pmode, AARCH64_STRUCT_VALUE_REGNUM);
22300 /* Implements target hook vector_mode_supported_p. */
22301 static bool
22302 aarch64_vector_mode_supported_p (machine_mode mode)
22304 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
22305 return vec_flags != 0 && (vec_flags & VEC_STRUCT) == 0;
22308 /* Implements target hook vector_mode_supported_any_target_p. */
22309 static bool
22310 aarch64_vector_mode_supported_any_target_p (machine_mode mode)
22312 unsigned int vec_flags = aarch64_classify_vector_mode (mode, true);
22313 return vec_flags != 0 && (vec_flags & VEC_STRUCT) == 0;
22316 /* Return the full-width SVE vector mode for element mode MODE, if one
22317 exists. */
22318 opt_machine_mode
22319 aarch64_full_sve_mode (scalar_mode mode)
22321 switch (mode)
22323 case E_DFmode:
22324 return VNx2DFmode;
22325 case E_SFmode:
22326 return VNx4SFmode;
22327 case E_HFmode:
22328 return VNx8HFmode;
22329 case E_BFmode:
22330 return VNx8BFmode;
22331 case E_DImode:
22332 return VNx2DImode;
22333 case E_SImode:
22334 return VNx4SImode;
22335 case E_HImode:
22336 return VNx8HImode;
22337 case E_QImode:
22338 return VNx16QImode;
22339 default:
22340 return opt_machine_mode ();
22344 /* Return the 64-bit Advanced SIMD vector mode for element mode MODE,
22345 if it exists. */
22346 opt_machine_mode
22347 aarch64_v64_mode (scalar_mode mode)
22349 switch (mode)
22351 case E_SFmode:
22352 return V2SFmode;
22353 case E_HFmode:
22354 return V4HFmode;
22355 case E_BFmode:
22356 return V4BFmode;
22357 case E_SImode:
22358 return V2SImode;
22359 case E_HImode:
22360 return V4HImode;
22361 case E_QImode:
22362 return V8QImode;
22363 default:
22364 return {};
22368 /* Return the 128-bit Advanced SIMD vector mode for element mode MODE,
22369 if it exists. */
22370 opt_machine_mode
22371 aarch64_v128_mode (scalar_mode mode)
22373 switch (mode)
22375 case E_DFmode:
22376 return V2DFmode;
22377 case E_SFmode:
22378 return V4SFmode;
22379 case E_HFmode:
22380 return V8HFmode;
22381 case E_BFmode:
22382 return V8BFmode;
22383 case E_SImode:
22384 return V4SImode;
22385 case E_HImode:
22386 return V8HImode;
22387 case E_QImode:
22388 return V16QImode;
22389 case E_DImode:
22390 return V2DImode;
22391 default:
22392 return opt_machine_mode ();
22396 /* Return appropriate SIMD container
22397 for MODE within a vector of WIDTH bits. */
22398 static machine_mode
22399 aarch64_simd_container_mode (scalar_mode mode, poly_int64 width)
22401 if (TARGET_SVE
22402 && maybe_ne (width, 128)
22403 && known_eq (width, BITS_PER_SVE_VECTOR))
22404 return aarch64_full_sve_mode (mode).else_mode (word_mode);
22406 gcc_assert (known_eq (width, 64) || known_eq (width, 128));
22407 if (TARGET_BASE_SIMD)
22409 if (known_eq (width, 128))
22410 return aarch64_v128_mode (mode).else_mode (word_mode);
22411 else
22412 return aarch64_v64_mode (mode).else_mode (word_mode);
22414 return word_mode;
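/* Example results, hand-derived from the mode tables above:

     aarch64_simd_container_mode (SFmode, 128) -> V4SFmode
     aarch64_simd_container_mode (HFmode, 64)  -> V4HFmode
     aarch64_simd_container_mode (DImode, 64)  -> word_mode (aarch64_v64_mode
                                                  has no DImode entry)
     with TARGET_SVE and WIDTH == BITS_PER_SVE_VECTOR (and WIDTH != 128):
     aarch64_simd_container_mode (SImode, ...) -> VNx4SImode  */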
22417 /* Compare an SVE mode SVE_M and an Advanced SIMD mode ASIMD_M
22418 and return whether the SVE mode should be preferred over the
22419 Advanced SIMD one in aarch64_autovectorize_vector_modes. */
22420 static bool
22421 aarch64_cmp_autovec_modes (machine_mode sve_m, machine_mode asimd_m)
22423 /* Take into account the aarch64-autovec-preference param if non-zero. */
22424 bool only_asimd_p = aarch64_autovec_preference == AARCH64_AUTOVEC_ASIMD_ONLY;
22425 bool only_sve_p = aarch64_autovec_preference == AARCH64_AUTOVEC_SVE_ONLY;
22427 if (only_asimd_p)
22428 return false;
22429 if (only_sve_p)
22430 return true;
22432 /* The preference in case of a tie in costs. */
22433 bool prefer_asimd = aarch64_autovec_preference == AARCH64_AUTOVEC_PREFER_ASIMD;
22434 bool prefer_sve = aarch64_autovec_preference == AARCH64_AUTOVEC_PREFER_SVE;
22436 poly_int64 nunits_sve = GET_MODE_NUNITS (sve_m);
22437 poly_int64 nunits_asimd = GET_MODE_NUNITS (asimd_m);
22438 /* If the CPU information does not have an SVE width registered use the
22439 generic poly_int comparison that prefers SVE. If a preference is
22440 explicitly requested avoid this path. */
22441 if (aarch64_tune_params.sve_width == SVE_SCALABLE
22442 && !prefer_asimd
22443 && !prefer_sve)
22444 return maybe_gt (nunits_sve, nunits_asimd);
22446 /* Otherwise estimate the runtime width of the modes involved. */
22447 HOST_WIDE_INT est_sve = estimated_poly_value (nunits_sve);
22448 HOST_WIDE_INT est_asimd = estimated_poly_value (nunits_asimd);
22450 /* Preferring SVE means picking it first unless the Advanced SIMD mode
22451 is clearly wider. */
22452 if (prefer_sve)
22453 return est_sve >= est_asimd;
22454 /* Conversely, preferring Advanced SIMD means picking SVE only if SVE
22455 is clearly wider. */
22456 if (prefer_asimd)
22457 return est_sve > est_asimd;
22459 /* In the default case prefer Advanced SIMD over SVE in case of a tie. */
22460 return est_sve > est_asimd;
22463 /* Return 128-bit container as the preferred SIMD mode for MODE. */
22464 static machine_mode
22465 aarch64_preferred_simd_mode (scalar_mode mode)
22467 /* Take into account explicit auto-vectorization ISA preferences through
22468 aarch64_cmp_autovec_modes. */
22469 if (TARGET_SVE && aarch64_cmp_autovec_modes (VNx16QImode, V16QImode))
22470 return aarch64_full_sve_mode (mode).else_mode (word_mode);
22471 if (TARGET_SIMD)
22472 return aarch64_v128_mode (mode).else_mode (word_mode);
22473 return word_mode;
22476 /* Return a list of possible vector sizes for the vectorizer
22477 to iterate over. */
22478 static unsigned int
22479 aarch64_autovectorize_vector_modes (vector_modes *modes, bool)
22481 static const machine_mode sve_modes[] = {
22482 /* Try using full vectors for all element types. */
22483 VNx16QImode,
22485 /* Try using 16-bit containers for 8-bit elements and full vectors
22486 for wider elements. */
22487 VNx8QImode,
22489 /* Try using 32-bit containers for 8-bit and 16-bit elements and
22490 full vectors for wider elements. */
22491 VNx4QImode,
22493 /* Try using 64-bit containers for all element types. */
22494 VNx2QImode
22497 static const machine_mode advsimd_modes[] = {
22498 /* Try using 128-bit vectors for all element types. */
22499 V16QImode,
22501 /* Try using 64-bit vectors for 8-bit elements and 128-bit vectors
22502 for wider elements. */
22503 V8QImode,
22505 /* Try using 64-bit vectors for 16-bit elements and 128-bit vectors
22506 for wider elements.
22508 TODO: We could support a limited form of V4QImode too, so that
22509 we use 32-bit vectors for 8-bit elements. */
22510 V4HImode,
22512 /* Try using 64-bit vectors for 32-bit elements and 128-bit vectors
22513 for 64-bit elements.
22515 TODO: We could similarly support limited forms of V2QImode and V2HImode
22516 for this case. */
22517 V2SImode
22520 /* Try using N-byte SVE modes only after trying N-byte Advanced SIMD mode.
22521 This is because:
22523 - If we can't use N-byte Advanced SIMD vectors then the placement
22524 doesn't matter; we'll just continue as though the Advanced SIMD
22525 entry didn't exist.
22527 - If an SVE main loop with N bytes ends up being cheaper than an
22528 Advanced SIMD main loop with N bytes then by default we'll replace
22529 the Advanced SIMD version with the SVE one.
22531 - If an Advanced SIMD main loop with N bytes ends up being cheaper
22532 than an SVE main loop with N bytes then by default we'll try to
22533 use the SVE loop to vectorize the epilogue instead. */
22535 bool only_asimd_p = aarch64_autovec_preference == AARCH64_AUTOVEC_ASIMD_ONLY;
22536 bool only_sve_p = aarch64_autovec_preference == AARCH64_AUTOVEC_SVE_ONLY;
22538 unsigned int sve_i = (TARGET_SVE && !only_asimd_p) ? 0 : ARRAY_SIZE (sve_modes);
22539 unsigned int advsimd_i = 0;
22541 while (!only_sve_p && advsimd_i < ARRAY_SIZE (advsimd_modes))
22543 if (sve_i < ARRAY_SIZE (sve_modes)
22544 && aarch64_cmp_autovec_modes (sve_modes[sve_i],
22545 advsimd_modes[advsimd_i]))
22546 modes->safe_push (sve_modes[sve_i++]);
22547 else
22548 modes->safe_push (advsimd_modes[advsimd_i++]);
22550 while (sve_i < ARRAY_SIZE (sve_modes))
22551 modes->safe_push (sve_modes[sve_i++]);
22553 unsigned int flags = 0;
22554 if (aarch64_vect_compare_costs)
22555 flags |= VECT_COMPARE_COSTS;
22556 return flags;
22559 /* Implement TARGET_MANGLE_TYPE. */
22561 static const char *
22562 aarch64_mangle_type (const_tree type)
22564 /* The AArch64 ABI documents say that "__va_list" has to be
22565 mangled as if it is in the "std" namespace. */
22566 if (lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type))
22567 return "St9__va_list";
22569 /* Half-precision floating point types. */
22570 if (SCALAR_FLOAT_TYPE_P (type) && TYPE_PRECISION (type) == 16)
22572 if (TYPE_MAIN_VARIANT (type) == float16_type_node)
22573 return NULL;
22574 if (TYPE_MODE (type) == BFmode)
22575 return "u6__bf16";
22576 else
22577 return "Dh";
22580 /* Modal 8 bit floating point types. */
22581 if (TYPE_MAIN_VARIANT (type) == aarch64_mfp8_type_node)
22582 return "u6__mfp8";
22584 /* Mangle AArch64-specific internal types. TYPE_NAME is non-NULL_TREE for
22585 builtin types. */
22586 if (TYPE_NAME (type) != NULL)
22588 const char *res;
22589 if ((res = aarch64_general_mangle_builtin_type (type))
22590 || (res = aarch64_sve::mangle_builtin_type (type)))
22591 return res;
22594 /* Use the default mangling. */
22595 return NULL;
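/* Mangling examples implied by the checks above, written out with the usual
   Itanium C++ ABI prefix for "void f (T)"; not verified against cc1plus
   output:

     void f (__va_list);  ->  _Z1fSt9__va_list  (mangled as std::__va_list)
     void f (__fp16);     ->  _Z1fDh
     void f (_Float16);   ->  default mangling (NULL returned above)
     void f (__bf16);     ->  _Z1fu6__bf16
     void f (__mfp8);     ->  _Z1fu6__mfp8  */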
22598 /* Implement TARGET_INVALID_CONVERSION. */
22600 static const char *
22601 aarch64_invalid_conversion (const_tree fromtype, const_tree totype)
22603 /* Do not allow conversions to/from FP8. But do allow conversions between
22604 volatile and const variants of __mfp8. */
22605 bool fromtype_is_fp8
22606 = (TYPE_MAIN_VARIANT (fromtype) == aarch64_mfp8_type_node);
22607 bool totype_is_fp8 = (TYPE_MAIN_VARIANT (totype) == aarch64_mfp8_type_node);
22609 if (fromtype_is_fp8 && totype_is_fp8)
22610 return NULL;
22612 if (fromtype_is_fp8)
22613 return N_ ("invalid conversion from type %<mfloat8_t%>");
22614 if (totype_is_fp8)
22615 return N_ ("invalid conversion to type %<mfloat8_t%>");
22617 /* Conversion allowed. */
22618 return NULL;
22621 /* Implement TARGET_VERIFY_TYPE_CONTEXT. */
22623 static bool
22624 aarch64_verify_type_context (location_t loc, type_context_kind context,
22625 const_tree type, bool silent_p)
22627 return aarch64_sve::verify_type_context (loc, context, type, silent_p);
22630 /* Find the first rtx_insn before insn that will generate an assembly
22631 instruction. */
22633 static rtx_insn *
22634 aarch64_prev_real_insn (rtx_insn *insn)
22636 if (!insn)
22637 return NULL;
22639   do
22640     {
22641       insn = prev_real_insn (insn);
22642     }
22643 while (insn && recog_memoized (insn) < 0);
22645 return insn;
22648 static bool
22649 is_madd_op (enum attr_type t1)
22651 unsigned int i;
22652 /* A number of these may be AArch32 only. */
22653 enum attr_type mlatypes[] = {
22654 TYPE_MLA, TYPE_MLAS, TYPE_SMLAD, TYPE_SMLADX, TYPE_SMLAL, TYPE_SMLALD,
22655 TYPE_SMLALS, TYPE_SMLALXY, TYPE_SMLAWX, TYPE_SMLAWY, TYPE_SMLAXY,
22656     TYPE_SMMLA, TYPE_UMLAL, TYPE_UMLALS, TYPE_SMLSD, TYPE_SMLSDX, TYPE_SMLSLD
22659 for (i = 0; i < ARRAY_SIZE (mlatypes); i++)
22661 if (t1 == mlatypes[i])
22662 return true;
22665 return false;
22668 /* Check if there is a register dependency between a load and the insn
22669 for which we hold recog_data. */
22671 static bool
22672 dep_between_memop_and_curr (rtx memop)
22674 rtx load_reg;
22675 int opno;
22677 gcc_assert (GET_CODE (memop) == SET);
22679 if (!REG_P (SET_DEST (memop)))
22680 return false;
22682 load_reg = SET_DEST (memop);
22683 for (opno = 1; opno < recog_data.n_operands; opno++)
22685 rtx operand = recog_data.operand[opno];
22686 if (REG_P (operand)
22687 && reg_overlap_mentioned_p (load_reg, operand))
22688 return true;
22691 return false;
22695 /* When working around the Cortex-A53 erratum 835769,
22696 given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
22697 instruction and has a preceding memory instruction such that a NOP
22698 should be inserted between them. */
22700 bool
22701 aarch64_madd_needs_nop (rtx_insn* insn)
22703 enum attr_type attr_type;
22704 rtx_insn *prev;
22705 rtx body;
22707 if (!TARGET_FIX_ERR_A53_835769)
22708 return false;
22710 if (!INSN_P (insn) || recog_memoized (insn) < 0)
22711 return false;
22713 attr_type = get_attr_type (insn);
22714 if (!is_madd_op (attr_type))
22715 return false;
22717 prev = aarch64_prev_real_insn (insn);
22718 /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
22719 Restore recog state to INSN to avoid state corruption. */
22720 extract_constrain_insn_cached (insn);
22722 if (!prev || !contains_mem_rtx_p (PATTERN (prev)))
22723 return false;
22725 body = single_set (prev);
22727 /* If the previous insn is a memory op and there is no dependency between
22728 it and the DImode madd, emit a NOP between them. If body is NULL then we
22729 have a complex memory operation, probably a load/store pair.
22730 Be conservative for now and emit a NOP. */
22731 if (GET_MODE (recog_data.operand[0]) == DImode
22732 && (!body || !dep_between_memop_and_curr (body)))
22733 return true;
22735 return false;
22740 /* Implement FINAL_PRESCAN_INSN. */
22742 void
22743 aarch64_final_prescan_insn (rtx_insn *insn)
22745 if (aarch64_madd_needs_nop (insn))
22746 fprintf (asm_out_file, "\tnop // between mem op and mult-accumulate\n");
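/* For example (schematic, not taken from real compiler output), with
   -mfix-cortex-a53-835769 a sequence such as

       ldr     x2, [x0]
       madd    x3, x4, x5, x6

   is emitted as

       ldr     x2, [x0]
       nop // between mem op and mult-accumulate
       madd    x3, x4, x5, x6

   because the DImode multiply-accumulate directly follows a memory
   operation that it does not depend on.  */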
22750 /* Return true if BASE_OR_STEP is a valid immediate operand for an SVE INDEX
22751 instruction. */
22753 bool
22754 aarch64_sve_index_immediate_p (rtx base_or_step)
22756 return (CONST_INT_P (base_or_step)
22757 && IN_RANGE (INTVAL (base_or_step), -16, 15));
22760 /* Return true if X is a valid immediate for the SVE ADD and SUB instructions
22761 when applied to mode MODE. Negate X first if NEGATE_P is true. */
22763 bool
22764 aarch64_sve_arith_immediate_p (machine_mode mode, rtx x, bool negate_p)
22766 rtx elt = unwrap_const_vec_duplicate (x);
22767 if (!CONST_INT_P (elt))
22768 return false;
22770 HOST_WIDE_INT val = INTVAL (elt);
22771 if (negate_p)
22772 val = -val;
22773 val &= GET_MODE_MASK (GET_MODE_INNER (mode));
22775 if (val & 0xff)
22776 return IN_RANGE (val, 0, 0xff);
22777 return IN_RANGE (val, 0, 0xff00);
22780 /* Return true if X is a valid immediate for the SVE SQADD and SQSUB
22781 instructions when applied to mode MODE. Negate X first if NEGATE_P
22782 is true. */
22784 bool
22785 aarch64_sve_sqadd_sqsub_immediate_p (machine_mode mode, rtx x, bool negate_p)
22787 if (!aarch64_sve_arith_immediate_p (mode, x, negate_p))
22788 return false;
22790 /* After the optional negation, the immediate must be nonnegative.
22791 E.g. a saturating add of -127 must be done via SQSUB Zn.B, Zn.B, #127
22792 instead of SQADD Zn.B, Zn.B, #129. */
22793 rtx elt = unwrap_const_vec_duplicate (x);
22794 return negate_p == (INTVAL (elt) < 0);
22797 /* Return true if X is a valid immediate operand for an SVE logical
22798 instruction such as AND. */
22800 bool
22801 aarch64_sve_bitmask_immediate_p (rtx x)
22803 rtx elt;
22805 return (const_vec_duplicate_p (x, &elt)
22806 && CONST_INT_P (elt)
22807 && aarch64_bitmask_imm (INTVAL (elt),
22808 GET_MODE_INNER (GET_MODE (x))));
22811 /* Return true if X is a valid immediate for the SVE DUP and CPY
22812 instructions. */
22814 bool
22815 aarch64_sve_dup_immediate_p (rtx x)
22817 x = aarch64_bit_representation (unwrap_const_vec_duplicate (x));
22818 if (!CONST_INT_P (x))
22819 return false;
22821 HOST_WIDE_INT val = INTVAL (x);
22822 if (val & 0xff)
22823 return IN_RANGE (val, -0x80, 0x7f);
22824 return IN_RANGE (val, -0x8000, 0x7f00);
22827 /* Return true if X is a valid immediate operand for an SVE CMP instruction.
22828 SIGNED_P says whether the operand is signed rather than unsigned. */
22830 bool
22831 aarch64_sve_cmp_immediate_p (rtx x, bool signed_p)
22833 x = unwrap_const_vec_duplicate (x);
22834 return (CONST_INT_P (x)
22835 && (signed_p
22836 ? IN_RANGE (INTVAL (x), -16, 15)
22837 : IN_RANGE (INTVAL (x), 0, 127)));
22840 /* Return true if X is a valid immediate operand for an SVE FADD or FSUB
22841 instruction. Negate X first if NEGATE_P is true. */
22843 bool
22844 aarch64_sve_float_arith_immediate_p (rtx x, bool negate_p)
22846 rtx elt;
22847 REAL_VALUE_TYPE r;
22849 if (GET_MODE_INNER (GET_MODE (x)) == BFmode
22850 || !const_vec_duplicate_p (x, &elt)
22851 || !CONST_DOUBLE_P (elt))
22852 return false;
22854 r = *CONST_DOUBLE_REAL_VALUE (elt);
22856 if (negate_p)
22857 r = real_value_negate (&r);
22859 if (real_equal (&r, &dconst1))
22860 return true;
22861 if (real_equal (&r, &dconsthalf))
22862 return true;
22863 return false;
22866 /* Return true if X is a valid immediate operand for an SVE FMUL
22867 instruction. */
22869 bool
22870 aarch64_sve_float_mul_immediate_p (rtx x)
22872 rtx elt;
22874 return (GET_MODE_INNER (GET_MODE (x)) != BFmode
22875 && const_vec_duplicate_p (x, &elt)
22876 && CONST_DOUBLE_P (elt)
22877 && (real_equal (CONST_DOUBLE_REAL_VALUE (elt), &dconsthalf)
22878 || real_equal (CONST_DOUBLE_REAL_VALUE (elt), &dconst2)));
22881 /* Return true if replicating VAL32 is a valid 2-byte or 4-byte immediate
22882 for the Advanced SIMD operation described by WHICH and INSN. If INFO
22883 is nonnull, use it to describe valid immediates. */
22884 static bool
22885 aarch64_advsimd_valid_immediate_hs (unsigned int val32,
22886 simd_immediate_info *info,
22887 enum simd_immediate_check which,
22888 simd_immediate_info::insn_type insn)
22890 /* Try a 4-byte immediate with LSL. */
22891 for (unsigned int shift = 0; shift < 32; shift += 8)
22892 if ((val32 & (0xff << shift)) == val32)
22894 if (info)
22895 *info = simd_immediate_info (SImode, val32 >> shift, insn,
22896 simd_immediate_info::LSL, shift);
22897 return true;
22900 /* Try a 2-byte immediate with LSL. */
22901 unsigned int imm16 = val32 & 0xffff;
22902 if (imm16 == (val32 >> 16))
22903 for (unsigned int shift = 0; shift < 16; shift += 8)
22904 if ((imm16 & (0xff << shift)) == imm16)
22906 if (info)
22907 *info = simd_immediate_info (HImode, imm16 >> shift, insn,
22908 simd_immediate_info::LSL, shift);
22909 return true;
22912 /* Try a 4-byte immediate with MSL, except for cases that MVN
22913 can handle. */
22914 if (which == AARCH64_CHECK_MOV)
22915 for (unsigned int shift = 8; shift < 24; shift += 8)
22917 unsigned int low = (1 << shift) - 1;
22918 if (((val32 & (0xff << shift)) | low) == val32)
22920 if (info)
22921 *info = simd_immediate_info (SImode, val32 >> shift, insn,
22922 simd_immediate_info::MSL, shift);
22923 return true;
22927 return false;
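/* Examples of the 2-byte and 4-byte encodings accepted above (hand-derived
   from the loops, not from assembler output):

     val32 == 0x000000ab  ->  SImode, #0xab              (LSL #0)
     val32 == 0x00ab0000  ->  SImode, #0xab, LSL #16
     val32 == 0x00ab00ab  ->  HImode, #0xab              (replicated 16 bits)
     val32 == 0xab00ab00  ->  HImode, #0xab, LSL #8
     val32 == 0x0000abff  ->  SImode, #0xab, MSL #8      (AARCH64_CHECK_MOV only)
     val32 == 0x00abcdef  ->  rejected here  */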
22930 /* Return true if replicating VAL64 with mode MODE is a valid immediate for the
22931 Advanced SIMD operation described by WHICH. If INFO is nonnull,
22932 use it to describe valid immediates. */
22933 static bool
22934 aarch64_advsimd_valid_immediate (unsigned HOST_WIDE_INT val64,
22935 scalar_int_mode mode,
22936 simd_immediate_info *info,
22937 enum simd_immediate_check which)
22939 unsigned int val32 = val64 & 0xffffffff;
22940 unsigned int val8 = val64 & 0xff;
22942 if (mode != DImode)
22944 if ((which == AARCH64_CHECK_MOV || which == AARCH64_CHECK_ORR)
22945 && aarch64_advsimd_valid_immediate_hs (val32, info, which,
22946 simd_immediate_info::MOV))
22947 return true;
22949 if ((which == AARCH64_CHECK_MOV || which == AARCH64_CHECK_AND)
22950 && aarch64_advsimd_valid_immediate_hs (~val32, info, which,
22951 simd_immediate_info::MVN))
22952 return true;
22954 /* Try using a replicated byte. */
22955 if (which == AARCH64_CHECK_MOV && mode == QImode)
22957 if (info)
22958 *info = simd_immediate_info (QImode, val8);
22959 return true;
22963 /* Try using a bit-to-bytemask. */
22964 if (which == AARCH64_CHECK_MOV)
22966 unsigned int i;
22967 for (i = 0; i < 64; i += 8)
22969 unsigned char byte = (val64 >> i) & 0xff;
22970 if (byte != 0 && byte != 0xff)
22971 break;
22973 if (i == 64)
22975 if (info)
22976 *info = simd_immediate_info (DImode, val64);
22977 return true;
22980 return false;
22983 /* Return true if replicating IVAL with MODE gives a valid immediate for an SVE
22984 MOV instruction. If INFO is nonnull, use it to describe valid
22985 immediates. */
22987 static bool
22988 aarch64_sve_valid_immediate (unsigned HOST_WIDE_INT ival, scalar_int_mode mode,
22989 simd_immediate_info *info,
22990 enum simd_immediate_check which)
22992 HOST_WIDE_INT val = trunc_int_for_mode (ival, mode);
22994 if (which == AARCH64_CHECK_MOV)
22996 if (IN_RANGE (val, -0x80, 0x7f))
22998 /* DUP with no shift. */
22999 if (info)
23000 *info = simd_immediate_info (mode, val,
23001 simd_immediate_info::SVE_MOV);
23002 return true;
23004 if ((val & 0xff) == 0 && IN_RANGE (val, -0x8000, 0x7f00))
23006 /* DUP with LSL #8. */
23007 if (info)
23008 *info = simd_immediate_info (mode, val,
23009 simd_immediate_info::SVE_MOV);
23010 return true;
23013 if (aarch64_bitmask_imm (ival, mode))
23015 /* DUPM. */
23016 if (info)
23017 *info = simd_immediate_info (mode, val, simd_immediate_info::SVE_MOV);
23018 return true;
23020 return false;
23023 /* Return true if X is an UNSPEC_PTRUE constant of the form:
23025 (const (unspec [PATTERN ZERO] UNSPEC_PTRUE))
23027 where PATTERN is the svpattern as a CONST_INT and where ZERO
23028 is a zero constant of the required PTRUE mode (which can have
23029 fewer elements than X's mode, if zero bits are significant).
23031 If so, and if INFO is nonnull, describe the immediate in INFO. */
23032 bool
23033 aarch64_sve_ptrue_svpattern_p (rtx x, struct simd_immediate_info *info)
23035 if (GET_CODE (x) != CONST)
23036 return false;
23038 x = XEXP (x, 0);
23039 if (GET_CODE (x) != UNSPEC || XINT (x, 1) != UNSPEC_PTRUE)
23040 return false;
23042 if (info)
23044 aarch64_svpattern pattern
23045 = (aarch64_svpattern) INTVAL (XVECEXP (x, 0, 0));
23046 machine_mode pred_mode = GET_MODE (XVECEXP (x, 0, 1));
23047 scalar_int_mode int_mode = aarch64_sve_element_int_mode (pred_mode);
23048 *info = simd_immediate_info (int_mode, pattern);
23050 return true;
23053 /* Return true if X is a valid SVE predicate. If INFO is nonnull, use
23054 it to describe valid immediates. */
23056 static bool
23057 aarch64_sve_pred_valid_immediate (rtx x, simd_immediate_info *info)
23059 if (aarch64_sve_ptrue_svpattern_p (x, info))
23060 return true;
23062 if (x == CONST0_RTX (GET_MODE (x)))
23064 if (info)
23065 *info = simd_immediate_info (DImode, 0);
23066 return true;
23069 /* Analyze the value as a VNx16BImode. This should be relatively
23070 efficient, since rtx_vector_builder has enough built-in capacity
23071 to store all VLA predicate constants without needing the heap. */
23072 rtx_vector_builder builder;
23073 if (!aarch64_get_sve_pred_bits (builder, x))
23074 return false;
23076 unsigned int elt_size = aarch64_widest_sve_pred_elt_size (builder);
23077 if (int vl = aarch64_partial_ptrue_length (builder, elt_size))
23079 machine_mode mode = aarch64_sve_pred_mode (elt_size).require ();
23080 aarch64_svpattern pattern = aarch64_svpattern_for_vl (mode, vl);
23081 if (pattern != AARCH64_NUM_SVPATTERNS)
23083 if (info)
23085 scalar_int_mode int_mode = aarch64_sve_element_int_mode (mode);
23086 *info = simd_immediate_info (int_mode, pattern);
23088 return true;
23091 return false;
23094 /* We can only represent floating point constants which will fit in
23095 "quarter-precision" values. These values are characterised by
23096 a sign bit, a 4-bit mantissa and a 3-bit exponent, and are given by:
23099 (-1)^s * (n/16) * 2^r
23101 Where:
23102 's' is the sign bit.
23103 'n' is an integer in the range 16 <= n <= 31.
23104 'r' is an integer in the range -3 <= r <= 4.
23106 Return true iff R represents a value encodable into an AArch64 floating point
23107 move instruction as an immediate. Otherwise return false. */
23109 static bool
23110 aarch64_real_float_const_representable_p (REAL_VALUE_TYPE r)
23112 /* This represents our current view of how many bits
23113 make up the mantissa. */
23114 int point_pos = 2 * HOST_BITS_PER_WIDE_INT - 1;
23115 int exponent;
23116 unsigned HOST_WIDE_INT mantissa, mask;
23117 REAL_VALUE_TYPE m;
23118 bool fail = false;
23120 /* We cannot represent infinities, NaNs or +/-zero. We won't
23121 know if we have +zero until we analyse the mantissa, but we
23122 can reject the other invalid values. */
23123 if (REAL_VALUE_ISINF (r) || REAL_VALUE_ISNAN (r)
23124 || REAL_VALUE_MINUS_ZERO (r))
23125 return false;
23127 /* Extract exponent. */
23128 r = real_value_abs (&r);
23129 exponent = REAL_EXP (&r);
23131 /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
23132 highest (sign) bit, with a fixed binary point at bit point_pos.
23133 m1 holds the low part of the mantissa, m2 the high part.
23134 WARNING: If we ever have a representation using more than 2 * H_W_I - 1
23135 bits for the mantissa, this can fail (low bits will be lost). */
23136 real_ldexp (&m, &r, point_pos - exponent);
23137 wide_int w = real_to_integer (&m, &fail, HOST_BITS_PER_WIDE_INT * 2);
23139 /* If the low part of the mantissa has bits set we cannot represent
23140 the value. */
23141 if (fail || w.ulow () != 0)
23142 return false;
23144 /* We have rejected the lower HOST_WIDE_INT, so update our
23145 understanding of how many bits lie in the mantissa and
23146 look only at the high HOST_WIDE_INT. */
23147 mantissa = w.elt (1);
23148 point_pos -= HOST_BITS_PER_WIDE_INT;
23150 /* We can only represent values with a mantissa of the form 1.xxxx. */
23151 mask = ((unsigned HOST_WIDE_INT)1 << (point_pos - 5)) - 1;
23152 if ((mantissa & mask) != 0)
23153 return false;
23155 /* Having filtered unrepresentable values, we may now remove all
23156 but the highest 5 bits. */
23157 mantissa >>= point_pos - 5;
23159 /* We cannot represent the value 0.0, so reject it. This is handled
23160 elsewhere. */
23161 if (mantissa == 0)
23162 return false;
23164 /* Then, as bit 4 is always set, we can mask it off, leaving
23165 the mantissa in the range [0, 15]. */
23166 mantissa &= ~(1 << 4);
23167 gcc_assert (mantissa <= 15);
23169 /* GCC internally does not use IEEE754-like encoding (where normalized
23170 significands are in the range [1, 2). GCC uses [0.5, 1) (see real.cc).
23171 Our mantissa values are shifted 4 places to the left relative to
23172 normalized IEEE754 so we must modify the exponent returned by REAL_EXP
23173 by 5 places to correct for GCC's representation. */
23174 exponent = 5 - exponent;
23176 return (exponent >= 0 && exponent <= 7);
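/* Worked example: 2.5 == (-1)^0 * (20/16) * 2^1, so n == 20 and r == 1 and
   the value is representable (it can be materialised as an FMOV-style
   immediate).  Values such as 0.1 or 1/3 have no encoding of this form and
   are rejected, as are zeros, infinities and NaNs.  */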
23179 /* Return true if OP is a valid SIMD immediate for the operation
23180 described by WHICH. If INFO is nonnull, use it to describe valid
23181 immediates. */
23182 static bool
23183 aarch64_simd_valid_imm (rtx op, simd_immediate_info *info,
23184 enum simd_immediate_check which)
23186 machine_mode mode = GET_MODE (op);
23187 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
23188 if (vec_flags == 0 || vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
23189 return false;
23191 if ((vec_flags & VEC_ADVSIMD) && !TARGET_SIMD)
23192 return false;
23194 if (vec_flags == (VEC_SVE_PRED | VEC_STRUCT))
23195 return op == CONST0_RTX (mode) || op == CONSTM1_RTX (mode);
23197 if (vec_flags & VEC_SVE_PRED)
23198 return aarch64_sve_pred_valid_immediate (op, info);
23200 scalar_mode elt_mode = GET_MODE_INNER (mode);
23201 rtx base, step;
23202 unsigned int n_elts;
23203 if (CONST_VECTOR_P (op)
23204 && CONST_VECTOR_DUPLICATE_P (op))
23205 n_elts = CONST_VECTOR_NPATTERNS (op);
23206 else if (which == AARCH64_CHECK_MOV
23207 && TARGET_SVE
23208 && const_vec_series_p (op, &base, &step))
23210 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_INT);
23211 if (!aarch64_sve_index_immediate_p (base)
23212 || !aarch64_sve_index_immediate_p (step))
23213 return false;
23215 if (info)
23217 /* Get the corresponding container mode. E.g. an INDEX on V2SI
23218 should yield two integer values per 128-bit block, meaning
23219 that we need to treat it in the same way as V2DI and then
23220 ignore the upper 32 bits of each element. */
23221 elt_mode = aarch64_sve_container_int_mode (mode);
23222 *info = simd_immediate_info (elt_mode, base, step);
23224 return true;
23226 else if (CONST_VECTOR_P (op)
23227 && CONST_VECTOR_NUNITS (op).is_constant (&n_elts))
23228 /* N_ELTS set above. */;
23229 else
23230 return false;
23232 /* If all elements in an SVE vector have the same value, we have a free
23233 choice between using the element mode and using the container mode.
23234 Using the element mode means that unused parts of the vector are
23235 duplicates of the used elements, while using the container mode means
23236 that the unused parts are an extension of the used elements. Using the
23237 element mode is better for (say) VNx4HI 0x101, since 0x01010101 is valid
23238 for its container mode VNx4SI while 0x00000101 isn't.
23240 If not all elements in an SVE vector have the same value, we need the
23241 transition from one element to the next to occur at container boundaries.
23242 E.g. a fixed-length VNx4HI containing { 1, 2, 3, 4 } should be treated
23243 in the same way as a VNx4SI containing { 1, 2, 3, 4 }. */
23244 scalar_int_mode elt_int_mode;
23245 if ((vec_flags & VEC_SVE_DATA) && n_elts > 1)
23246 elt_int_mode = aarch64_sve_container_int_mode (mode);
23247 else
23248 elt_int_mode = int_mode_for_mode (elt_mode).require ();
23250 unsigned int elt_size = GET_MODE_SIZE (elt_int_mode);
23251 if (elt_size > 8)
23252 return false;
23254 /* Expand the vector constant out into a byte vector, with the least
23255 significant byte of the register first. */
23256 auto_vec<unsigned char, 16> bytes;
23257 bytes.reserve (n_elts * elt_size);
23258 for (unsigned int i = 0; i < n_elts; i++)
23260 /* The vector is provided in gcc endian-neutral fashion.
23261 For aarch64_be Advanced SIMD, it must be laid out in the vector
23262 register in reverse order. */
23263 bool swap_p = ((vec_flags & VEC_ADVSIMD) != 0 && BYTES_BIG_ENDIAN);
23264 rtx elt = CONST_VECTOR_ELT (op, swap_p ? (n_elts - 1 - i) : i);
23266 if (elt_mode != elt_int_mode)
23267 elt = gen_lowpart (elt_int_mode, elt);
23269 if (!CONST_INT_P (elt))
23270 return false;
23272 unsigned HOST_WIDE_INT elt_val = INTVAL (elt);
23273 for (unsigned int byte = 0; byte < elt_size; byte++)
23275 bytes.quick_push (elt_val & 0xff);
23276 elt_val >>= BITS_PER_UNIT;
23280 /* The immediate must repeat every eight bytes. */
23281 unsigned int nbytes = bytes.length ();
23282 for (unsigned i = 8; i < nbytes; ++i)
23283 if (bytes[i] != bytes[i - 8])
23284 return false;
23286 /* Get the repeating 8-byte value as an integer. No endian correction
23287 is needed here because bytes is already in lsb-first order. */
23288 unsigned HOST_WIDE_INT val64 = 0;
23289 for (unsigned int i = 0; i < 8; i++)
23290 val64 |= ((unsigned HOST_WIDE_INT) bytes[i % nbytes]
23291 << (i * BITS_PER_UNIT));
23293 /* Try encoding the integer immediate as a floating point value if it's an
23294 exact value. */
23295 scalar_float_mode fmode = DFmode;
23296 scalar_int_mode imode = DImode;
23297 unsigned HOST_WIDE_INT ival = val64;
23298 unsigned int val32 = val64 & 0xffffffff;
23299 if (val32 == (val64 >> 32))
23301 fmode = SFmode;
23302 imode = SImode;
23303 ival = val32;
23304 unsigned int val16 = val32 & 0xffff;
23305 if (val16 == (val32 >> 16))
23307 fmode = HFmode;
23308 imode = HImode;
23309 ival = val16;
23310 unsigned int val8 = val16 & 0xff;
23311 if (val8 == (val16 >> 8))
23313 imode = QImode;
23314 ival = val8;
23319 if (which == AARCH64_CHECK_MOV
23320 && imode != QImode
23321 && (imode != HImode || TARGET_FP_F16INST))
23323 long int as_long_ints[2];
23324 as_long_ints[0] = ival & 0xFFFFFFFF;
23325 as_long_ints[1] = (ival >> 32) & 0xFFFFFFFF;
23327 REAL_VALUE_TYPE r;
23328 real_from_target (&r, as_long_ints, fmode);
23329 if (aarch64_real_float_const_representable_p (r))
23331 if (info)
23333 rtx float_val = const_double_from_real_value (r, fmode);
23334 *info = simd_immediate_info (fmode, float_val);
23336 return true;
23340 if (vec_flags & VEC_SVE_DATA)
23341 return aarch64_sve_valid_immediate (ival, imode, info, which);
23343 if (aarch64_advsimd_valid_immediate (val64, imode, info, which))
23344 return true;
23346 if (TARGET_SVE)
23347 return aarch64_sve_valid_immediate (ival, imode, info, which);
23348 return false;
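/* For example, val64 == 0x2525252525252525 halves repeatedly, so the code
   above ends up with imode == QImode and ival == 0x25, and the value is
   tested purely as a replicated byte.  By contrast, val64 ==
   0x3ff0000000000000 (the IEEE double bit pattern of 1.0) does not repeat,
   so fmode stays DFmode and the floating-point path can describe it as an
   immediate 1.0.  */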
23351 /* Return true if OP is a valid SIMD move immediate for SVE or AdvSIMD. */
23352 bool
23353 aarch64_simd_valid_mov_imm (rtx op)
23355 return aarch64_simd_valid_imm (op, NULL, AARCH64_CHECK_MOV);
23358 /* Return true if OP is a valid SIMD orr immediate for SVE or AdvSIMD. */
23359 bool
23360 aarch64_simd_valid_orr_imm (rtx op)
23362 return aarch64_simd_valid_imm (op, NULL, AARCH64_CHECK_ORR);
23365 /* Return true if OP is a valid SIMD and immediate for SVE or AdvSIMD. */
23366 bool
23367 aarch64_simd_valid_and_imm (rtx op)
23369 return aarch64_simd_valid_imm (op, NULL, AARCH64_CHECK_AND);
23372 /* Return true if OP is a valid SIMD xor immediate for SVE. */
23373 bool
23374 aarch64_simd_valid_xor_imm (rtx op)
23376 return aarch64_simd_valid_imm (op, NULL, AARCH64_CHECK_XOR);
23379 /* Check whether X is a VEC_SERIES-like constant that starts at 0 and
23380 has a step in the range of INDEX. Return the index expression if so,
23381 otherwise return null. */
23383 aarch64_check_zero_based_sve_index_immediate (rtx x)
23385 rtx base, step;
23386 if (const_vec_series_p (x, &base, &step)
23387 && base == const0_rtx
23388 && aarch64_sve_index_immediate_p (step))
23389 return step;
23390 return NULL_RTX;
23393 /* Check whether immediate shift constants are within range. */
23394 bool
23395 aarch64_simd_shift_imm_p (rtx x, machine_mode mode, bool left)
23397 x = unwrap_const_vec_duplicate (x);
23398 if (!CONST_INT_P (x))
23399 return false;
23400 int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;
23401 if (left)
23402 return IN_RANGE (INTVAL (x), 0, bit_width - 1);
23403 else
23404 return IN_RANGE (INTVAL (x), 1, bit_width);
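/* For example, with V8HImode (16-bit elements) a left-shift immediate must
   be in [0, 15] while a right-shift immediate must be in [1, 16], matching
   the encodable ranges of the vector shift instructions.  */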
23407 /* Return the bitmask CONST_INT to select the bits required by a zero extract
23408 operation of width WIDTH at bit position POS. */
23411 aarch64_mask_from_zextract_ops (rtx width, rtx pos)
23413 gcc_assert (CONST_INT_P (width));
23414 gcc_assert (CONST_INT_P (pos));
23416 unsigned HOST_WIDE_INT mask
23417 = ((unsigned HOST_WIDE_INT) 1 << UINTVAL (width)) - 1;
23418 return GEN_INT (mask << UINTVAL (pos));
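/* For example, WIDTH == 8 and POS == 16 give ((1 << 8) - 1) << 16, i.e. the
   mask 0xff0000 selecting bits 16..23.  */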
23421 bool
23422 aarch64_mov_operand_p (rtx x, machine_mode mode)
23424 if (GET_CODE (x) == HIGH
23425 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
23426 return true;
23428 if (CONST_INT_P (x))
23429 return true;
23431 if (VECTOR_MODE_P (GET_MODE (x)))
23433 /* Require predicate constants to be VNx16BI before RA, so that we
23434 force everything to have a canonical form. */
23435 if (!lra_in_progress
23436 && !reload_completed
23437 && aarch64_sve_pred_mode_p (GET_MODE (x))
23438 && known_eq (GET_MODE_SIZE (GET_MODE (x)), BYTES_PER_SVE_PRED)
23439 && GET_MODE (x) != VNx16BImode)
23440 return false;
23442 return aarch64_simd_valid_mov_imm (x);
23445 /* Remove UNSPEC_SALT_ADDR before checking symbol reference. */
23446 x = strip_salt (x);
23448 /* GOT accesses are valid moves. */
23449 if (SYMBOL_REF_P (x)
23450 && aarch64_classify_symbolic_expression (x) == SYMBOL_SMALL_GOT_4G)
23451 return true;
23453 if (SYMBOL_REF_P (x) && mode == DImode && CONSTANT_ADDRESS_P (x))
23454 return true;
23456 if (TARGET_SVE
23457 && (aarch64_sve_cnt_immediate_p (x)
23458 || aarch64_sve_rdvl_immediate_p (x)))
23459 return true;
23461 if (aarch64_rdsvl_immediate_p (x))
23462 return true;
23464 return aarch64_classify_symbolic_expression (x)
23465 == SYMBOL_TINY_ABSOLUTE;
23468 /* Return a function-invariant register that contains VALUE. *CACHED_INSN
23469 caches instructions that set up such registers, so that they can be
23470 reused by future calls. */
23472 static rtx
23473 aarch64_get_shareable_reg (rtx_insn **cached_insn, rtx value)
23475 rtx_insn *insn = *cached_insn;
23476 if (insn && INSN_P (insn) && !insn->deleted ())
23478 rtx pat = PATTERN (insn);
23479 if (GET_CODE (pat) == SET)
23481 rtx dest = SET_DEST (pat);
23482 if (REG_P (dest)
23483 && !HARD_REGISTER_P (dest)
23484 && rtx_equal_p (SET_SRC (pat), value))
23485 return dest;
23488 rtx reg = gen_reg_rtx (GET_MODE (value));
23489 *cached_insn = emit_insn_before (gen_rtx_SET (reg, value),
23490 function_beg_insn);
23491 return reg;
23494 /* Create a 0 constant that is based on V4SI to allow CSE to optimally share
23495 the constant creation. */
23498 aarch64_gen_shareable_zero (machine_mode mode)
23500 rtx reg = aarch64_get_shareable_reg (&cfun->machine->advsimd_zero_insn,
23501 CONST0_RTX (V4SImode));
23502 return lowpart_subreg (mode, reg, GET_MODE (reg));
23505 /* INSN is some form of extension or shift that can be split into a
23506 permutation involving a shared zero. Return true if we should
23507 perform such a split.
23509 ??? For now, make sure that the split instruction executes more
23510 frequently than the zero that feeds it. In future it would be good
23511 to split without that restriction and instead recombine shared zeros
23512 if they turn out not to be worthwhile. This would allow splits in
23513 single-block functions and would also cope more naturally with
23514 rematerialization. The downside of not doing this is that we lose the
23515 optimizations for vector epilogues as well. */
23517 bool
23518 aarch64_split_simd_shift_p (rtx_insn *insn)
23520 return (can_create_pseudo_p ()
23521 && optimize_bb_for_speed_p (BLOCK_FOR_INSN (insn))
23522 && (ENTRY_BLOCK_PTR_FOR_FN (cfun)->count
23523 < BLOCK_FOR_INSN (insn)->count));
23526 /* Return a const_int vector of VAL. */
23528 aarch64_simd_gen_const_vector_dup (machine_mode mode, HOST_WIDE_INT val)
23530 rtx c = gen_int_mode (val, GET_MODE_INNER (mode));
23531 return gen_const_vec_duplicate (mode, c);
23534 /* Check OP is a legal scalar immediate for the MOVI instruction. */
23536 bool
23537 aarch64_simd_scalar_immediate_valid_for_move (rtx op, scalar_int_mode mode)
23539 machine_mode vmode;
23541 vmode = aarch64_simd_container_mode (mode, 64);
23542 rtx op_v = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (op));
23543 return aarch64_simd_valid_mov_imm (op_v);
23546 /* Construct and return a PARALLEL RTX vector with elements numbering the
23547 lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
23548 the vector - from the perspective of the architecture. This does not
23549 line up with GCC's perspective on lane numbers, so we end up with
23550 different masks depending on our target endian-ness. The diagram
23551 below may help. We must draw the distinction when building masks
23552 which select one half of the vector. An instruction selecting
23553 architectural low-lanes for a big-endian target, must be described using
23554 a mask selecting GCC high-lanes.
23556 Big-Endian Little-Endian
23558 GCC 0 1 2 3 3 2 1 0
23559 | x | x | x | x | | x | x | x | x |
23560 Architecture 3 2 1 0 3 2 1 0
23562 Low Mask: { 2, 3 } { 0, 1 }
23563 High Mask: { 0, 1 } { 2, 3 }
23565 MODE Is the mode of the vector and NUNITS is the number of units in it. */
23568 aarch64_simd_vect_par_cnst_half (machine_mode mode, int nunits, bool high)
23570 rtvec v = rtvec_alloc (nunits / 2);
23571 int high_base = nunits / 2;
23572 int low_base = 0;
23573 int base;
23574 rtx t1;
23575 int i;
23577 if (BYTES_BIG_ENDIAN)
23578 base = high ? low_base : high_base;
23579 else
23580 base = high ? high_base : low_base;
23582 for (i = 0; i < nunits / 2; i++)
23583 RTVEC_ELT (v, i) = GEN_INT (base + i);
23585 t1 = gen_rtx_PARALLEL (mode, v);
23586 return t1;
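/* For example, for V4SImode with HIGH == true this returns (parallel [2 3])
   on little-endian and (parallel [0 1]) on big-endian, as in the "High Mask"
   row of the diagram above.  */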
23589 /* Check OP for validity as a PARALLEL RTX vector with elements
23590 numbering the lanes of either the high (HIGH == TRUE) or low lanes,
23591 from the perspective of the architecture. See the diagram above
23592 aarch64_simd_vect_par_cnst_half for more details. */
23594 bool
23595 aarch64_simd_check_vect_par_cnst_half (rtx op, machine_mode mode,
23596 bool high)
23598 int nelts;
23599 if (!VECTOR_MODE_P (mode) || !GET_MODE_NUNITS (mode).is_constant (&nelts))
23600 return false;
23602 rtx ideal = aarch64_simd_vect_par_cnst_half (mode, nelts, high);
23603 HOST_WIDE_INT count_op = XVECLEN (op, 0);
23604 HOST_WIDE_INT count_ideal = XVECLEN (ideal, 0);
23605 int i = 0;
23607 if (count_op != count_ideal)
23608 return false;
23610 for (i = 0; i < count_ideal; i++)
23612 rtx elt_op = XVECEXP (op, 0, i);
23613 rtx elt_ideal = XVECEXP (ideal, 0, i);
23615 if (!CONST_INT_P (elt_op)
23616 || INTVAL (elt_ideal) != INTVAL (elt_op))
23617 return false;
23619 return true;
23622 /* Return a PARALLEL containing NELTS elements, with element I equal
23623 to BASE + I * STEP. */
23626 aarch64_gen_stepped_int_parallel (unsigned int nelts, int base, int step)
23628 rtvec vec = rtvec_alloc (nelts);
23629 for (unsigned int i = 0; i < nelts; ++i)
23630 RTVEC_ELT (vec, i) = gen_int_mode (base + i * step, DImode);
23631 return gen_rtx_PARALLEL (VOIDmode, vec);
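/* For example, NELTS == 4, BASE == 1 and STEP == 2 produce
   (parallel [1 3 5 7]), which aarch64_stepped_int_parallel_p below accepts
   for STEP == 2.  */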
23634 /* Return true if OP is a PARALLEL of CONST_INTs that form a linear
23635 series with step STEP. */
23637 bool
23638 aarch64_stepped_int_parallel_p (rtx op, int step)
23640 if (GET_CODE (op) != PARALLEL || !CONST_INT_P (XVECEXP (op, 0, 0)))
23641 return false;
23643 unsigned HOST_WIDE_INT base = UINTVAL (XVECEXP (op, 0, 0));
23644 for (int i = 1; i < XVECLEN (op, 0); ++i)
23645 if (!CONST_INT_P (XVECEXP (op, 0, i))
23646 || UINTVAL (XVECEXP (op, 0, i)) != base + i * step)
23647 return false;
23649 return true;
23652 /* Return true if OPERANDS[0] to OPERANDS[NUM_OPERANDS - 1] form a
23653 sequence of strided registers, with the stride being equal to STRIDE.
23654 The operands are already known to be FPRs. */
23655 bool
23656 aarch64_strided_registers_p (rtx *operands, unsigned int num_operands,
23657 unsigned int stride)
23659 for (unsigned int i = 1; i < num_operands; ++i)
23660 if (REGNO (operands[i]) != REGNO (operands[0]) + i * stride)
23661 return false;
23662 return true;
23665 /* Bounds-check lanes. Ensure OPERAND lies between LOW (inclusive) and
23666 HIGH (exclusive). */
23667 void
23668 aarch64_simd_lane_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high,
23669 const_tree exp)
23671 HOST_WIDE_INT lane;
23672 gcc_assert (CONST_INT_P (operand));
23673 lane = INTVAL (operand);
23675 if (lane < low || lane >= high)
23677 if (exp)
23678 error_at (EXPR_LOCATION (exp), "lane %wd out of range %wd - %wd",
23679 lane, low, high - 1);
23680 else
23681 error ("lane %wd out of range %wd - %wd", lane, low, high - 1);
23685 /* Perform endian correction on lane number N, which indexes a vector
23686 of mode MODE, and return the result as an SImode rtx. */
23689 aarch64_endian_lane_rtx (machine_mode mode, unsigned int n)
23691 return gen_int_mode (ENDIAN_LANE_N (GET_MODE_NUNITS (mode), n), SImode);
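/* For example, assuming the usual definition of ENDIAN_LANE_N (N for
   little-endian, NUNITS - 1 - N for big-endian), lane 0 of a V4SImode
   vector stays 0 on little-endian targets but becomes 3 on big-endian
   ones.  */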
23694 /* Return TRUE if OP is a valid vector addressing mode. */
23696 bool
23697 aarch64_simd_mem_operand_p (rtx op)
23699 return (MEM_P (op)
23700 && (GET_CODE (XEXP (op, 0)) == POST_INC || REG_P (XEXP (op, 0)))
23701 && memory_operand (op, VOIDmode));
23704 /* Return true if OP is a valid MEM operand for an SVE LD1R instruction. */
23706 bool
23707 aarch64_sve_ld1r_operand_p (rtx op)
23709 struct aarch64_address_info addr;
23710 scalar_mode mode;
23712 return (MEM_P (op)
23713 && is_a <scalar_mode> (GET_MODE (op), &mode)
23714 && aarch64_classify_address (&addr, XEXP (op, 0), mode, false)
23715 && addr.type == ADDRESS_REG_IMM
23716 && offset_6bit_unsigned_scaled_p (mode, addr.const_offset));
23719 /* Return true if OP is a valid MEM operand for an SVE LD1R{Q,O} instruction
23720 where the size of the read data is specified by `mode` and the size of the
23721 vector elements is specified by `elem_mode`. */
23722 bool
23723 aarch64_sve_ld1rq_ld1ro_operand_p (rtx op, machine_mode mode,
23724 scalar_mode elem_mode)
23726 struct aarch64_address_info addr;
23727 if (!MEM_P (op)
23728 || !aarch64_classify_address (&addr, XEXP (op, 0), elem_mode, false))
23729 return false;
23731 if (addr.type == ADDRESS_REG_IMM)
23732 return offset_4bit_signed_scaled_p (mode, addr.const_offset);
23734 if (addr.type == ADDRESS_REG_REG)
23735 return (1U << addr.shift) == GET_MODE_SIZE (elem_mode);
23737 return false;
23740 /* Return true if OP is a valid MEM operand for an SVE LD1RQ instruction. */
23741 bool
23742 aarch64_sve_ld1rq_operand_p (rtx op)
23744 return aarch64_sve_ld1rq_ld1ro_operand_p (op, TImode,
23745 GET_MODE_INNER (GET_MODE (op)));
23748 /* Return true if OP is a valid MEM operand for an SVE LD1RO instruction for
23749 accessing a vector where the element size is specified by `elem_mode`. */
23750 bool
23751 aarch64_sve_ld1ro_operand_p (rtx op, scalar_mode elem_mode)
23753 return aarch64_sve_ld1rq_ld1ro_operand_p (op, OImode, elem_mode);
23756 /* Return true if OP is a valid MEM operand for an SVE LDFF1 instruction. */
23757 bool
23758 aarch64_sve_ldff1_operand_p (rtx op)
23760 if (!MEM_P (op))
23761 return false;
23763 struct aarch64_address_info addr;
23764 if (!aarch64_classify_address (&addr, XEXP (op, 0), GET_MODE (op), false))
23765 return false;
23767 if (addr.type == ADDRESS_REG_IMM)
23768 return known_eq (addr.const_offset, 0);
23770 return addr.type == ADDRESS_REG_REG;
23773 /* Return true if OP is a valid MEM operand for an SVE LDNF1 instruction. */
23774 bool
23775 aarch64_sve_ldnf1_operand_p (rtx op)
23777 struct aarch64_address_info addr;
23779 return (MEM_P (op)
23780 && aarch64_classify_address (&addr, XEXP (op, 0),
23781 GET_MODE (op), false)
23782 && addr.type == ADDRESS_REG_IMM);
23785 /* Return true if OP is a valid MEM operand for an SVE LDR instruction.
23786 The conditions for STR are the same. */
23787 bool
23788 aarch64_sve_ldr_operand_p (rtx op)
23790 struct aarch64_address_info addr;
23792 return (MEM_P (op)
23793 && aarch64_classify_address (&addr, XEXP (op, 0), GET_MODE (op),
23794 false, ADDR_QUERY_ANY)
23795 && addr.type == ADDRESS_REG_IMM);
23798 /* Return true if OP is a valid address for an SVE PRF[BHWD] instruction,
23799 addressing memory of mode MODE. */
23800 bool
23801 aarch64_sve_prefetch_operand_p (rtx op, machine_mode mode)
23803 struct aarch64_address_info addr;
23804 if (!aarch64_classify_address (&addr, op, mode, false, ADDR_QUERY_ANY))
23805 return false;
23807 if (addr.type == ADDRESS_REG_IMM)
23808 return offset_6bit_signed_scaled_p (mode, addr.const_offset);
23810 return addr.type == ADDRESS_REG_REG;
23813 /* Return true if OP is a valid MEM operand for an SVE_STRUCT mode.
23814 We need to be able to access the individual pieces, so the range
23815 is different from LD[234] and ST[234]. */
23816 bool
23817 aarch64_sve_struct_memory_operand_p (rtx op)
23819 if (!MEM_P (op))
23820 return false;
23822 machine_mode mode = GET_MODE (op);
23823 struct aarch64_address_info addr;
23824 if (!aarch64_classify_address (&addr, XEXP (op, 0), SVE_BYTE_MODE, false,
23825 ADDR_QUERY_ANY)
23826 || addr.type != ADDRESS_REG_IMM)
23827 return false;
23829 poly_int64 first = addr.const_offset;
23830 poly_int64 last = first + GET_MODE_SIZE (mode) - BYTES_PER_SVE_VECTOR;
23831 return (offset_4bit_signed_scaled_p (SVE_BYTE_MODE, first)
23832 && offset_4bit_signed_scaled_p (SVE_BYTE_MODE, last));
23835 /* Return true if OFFSET is a constant integer and if VNUM is
23836 OFFSET * the number of bytes in an SVE vector. This is the requirement
23837 that exists in SME LDR and STR instructions, where the VL offset must
23838 equal the ZA slice offset. */
23839 bool
23840 aarch64_sme_ldr_vnum_offset_p (rtx offset, rtx vnum)
23842 if (!CONST_INT_P (offset) || !IN_RANGE (INTVAL (offset), 0, 15))
23843 return false;
23845 if (TARGET_STREAMING)
23847 poly_int64 const_vnum;
23848 return (poly_int_rtx_p (vnum, &const_vnum)
23849 && known_eq (const_vnum,
23850 INTVAL (offset) * BYTES_PER_SVE_VECTOR));
23852 else
23854 HOST_WIDE_INT factor;
23855 return (aarch64_sme_vq_unspec_p (vnum, &factor)
23856 && factor == INTVAL (offset) * 16);
23860 /* Emit a register copy from operand to operand, taking care not to
23861 early-clobber source registers in the process.
23863 COUNT is the number of components into which the copy needs to be
23864 decomposed. */
23865 void
23866 aarch64_simd_emit_reg_reg_move (rtx *operands, machine_mode mode,
23867 unsigned int count)
23869 unsigned int i;
23870 int rdest = REGNO (operands[0]);
23871 int rsrc = REGNO (operands[1]);
23873 if (!reg_overlap_mentioned_p (operands[0], operands[1])
23874 || rdest < rsrc)
23875 for (i = 0; i < count; i++)
23876 emit_move_insn (gen_rtx_REG (mode, rdest + i),
23877 gen_rtx_REG (mode, rsrc + i));
23878 else
23879 for (i = 0; i < count; i++)
23880 emit_move_insn (gen_rtx_REG (mode, rdest + count - i - 1),
23881 gen_rtx_REG (mode, rsrc + count - i - 1));
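/* For example, copying a two-register tuple whose source starts at V1 and
   whose destination starts at V2 overlaps with RDEST > RSRC, so the second
   loop copies the high register first (V3 <- V2, then V2 <- V1) and avoids
   clobbering a source register before it has been read.  */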
23884 /* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
23885 one of VSTRUCT modes: OI, CI, or XI. */
23887 aarch64_simd_attr_length_rglist (machine_mode mode)
23889 /* This is only used (and only meaningful) for Advanced SIMD, not SVE. */
23890 return (GET_MODE_SIZE (mode).to_constant () / UNITS_PER_VREG) * 4;
23893 /* Implement target hook TARGET_VECTOR_ALIGNMENT. The AAPCS64 sets the maximum
23894 alignment of a vector to 128 bits. SVE predicates have an alignment of
23895 16 bits. */
23896 static HOST_WIDE_INT
23897 aarch64_simd_vector_alignment (const_tree type)
23899 /* ??? Checking the mode isn't ideal, but VECTOR_BOOLEAN_TYPE_P can
23900 be set for non-predicate vectors of booleans. Modes are the most
23901 direct way we have of identifying real SVE predicate types. */
23902 if (GET_MODE_CLASS (TYPE_MODE (type)) == MODE_VECTOR_BOOL)
23903 return 16;
23904 widest_int min_size
23905 = constant_lower_bound (wi::to_poly_widest (TYPE_SIZE (type)));
23906 return wi::umin (min_size, 128).to_uhwi ();
23909 /* Implement target hook TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT. */
23910 static poly_uint64
23911 aarch64_vectorize_preferred_vector_alignment (const_tree type)
23913 if (aarch64_sve_data_mode_p (TYPE_MODE (type)))
23915 /* If the length of the vector is a fixed power of 2, try to align
23916 to that length, otherwise don't try to align at all. */
23917 HOST_WIDE_INT result;
23918 if (!GET_MODE_BITSIZE (TYPE_MODE (type)).is_constant (&result)
23919 || !pow2p_hwi (result))
23920 result = TYPE_ALIGN (TREE_TYPE (type));
23921 return result;
23923 return TYPE_ALIGN (type);
23926 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE. */
23927 static bool
23928 aarch64_simd_vector_alignment_reachable (const_tree type, bool is_packed)
23930 if (is_packed)
23931 return false;
23933 /* For fixed-length vectors, check that the vectorizer will aim for
23934 full-vector alignment. This isn't true for generic GCC vectors
23935 that are wider than the ABI maximum of 128 bits. */
23936 poly_uint64 preferred_alignment =
23937 aarch64_vectorize_preferred_vector_alignment (type);
23938 if (TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
23939 && maybe_ne (wi::to_widest (TYPE_SIZE (type)),
23940 preferred_alignment))
23941 return false;
23943 /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned. */
23944 return true;
23947 /* Return true if the vector misalignment factor is supported by the
23948 target. */
23949 static bool
23950 aarch64_builtin_support_vector_misalignment (machine_mode mode,
23951 const_tree type, int misalignment,
23952 bool is_packed)
23954 if (TARGET_SIMD && STRICT_ALIGNMENT)
23956 /* Return if movmisalign pattern is not supported for this mode. */
23957 if (optab_handler (movmisalign_optab, mode) == CODE_FOR_nothing)
23958 return false;
23960 /* Misalignment factor is unknown at compile time. */
23961 if (misalignment == -1)
23962 return false;
23964 return default_builtin_support_vector_misalignment (mode, type, misalignment,
23965 is_packed);
23968 /* If VALS is a vector constant that can be loaded into a register
23969 using DUP, generate instructions to do so and return an RTX to
23970 assign to the register. Otherwise return NULL_RTX. */
23971 static rtx
23972 aarch64_simd_dup_constant (rtx vals)
23974 machine_mode mode = GET_MODE (vals);
23975 machine_mode inner_mode = GET_MODE_INNER (mode);
23976 rtx x;
23978 if (!const_vec_duplicate_p (vals, &x))
23979 return NULL_RTX;
23981 /* We can load this constant by using DUP and a constant in a
23982 single ARM register. This will be cheaper than a vector
23983 load. */
23984 x = force_reg (inner_mode, x);
23985 return gen_vec_duplicate (mode, x);
23989 /* Generate code to load VALS, which is a PARALLEL containing only
23990 constants (for vec_init) or CONST_VECTOR, efficiently into a
23991 register. Returns an RTX to copy into the register, or NULL_RTX
23992 for a PARALLEL that cannot be converted into a CONST_VECTOR. */
23993 static rtx
23994 aarch64_simd_make_constant (rtx vals)
23996 machine_mode mode = GET_MODE (vals);
23997 rtx const_dup;
23998 rtx const_vec = NULL_RTX;
23999 int n_const = 0;
24000 int i;
24002 if (CONST_VECTOR_P (vals))
24003 const_vec = vals;
24004 else if (GET_CODE (vals) == PARALLEL)
24006 /* A CONST_VECTOR must contain only CONST_INTs and
24007 CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
24008 Only store valid constants in a CONST_VECTOR. */
24009 int n_elts = XVECLEN (vals, 0);
24010 for (i = 0; i < n_elts; ++i)
24012 rtx x = XVECEXP (vals, 0, i);
24013 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
24014 n_const++;
24016 if (n_const == n_elts)
24017 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
24019 else
24020 gcc_unreachable ();
24022 if (const_vec != NULL_RTX
24023 && aarch64_simd_valid_mov_imm (const_vec))
24024 /* Load using MOVI/MVNI. */
24025 return const_vec;
24026 else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX)
24027 /* Loaded using DUP. */
24028 return const_dup;
24029 else if (const_vec != NULL_RTX)
24030 /* Load from constant pool. We cannot take advantage of single-cycle
24031 LD1 because we need a PC-relative addressing mode. */
24032 return const_vec;
24033 else
24034 /* A PARALLEL containing something not valid inside CONST_VECTOR.
24035 We cannot construct an initializer. */
24036 return NULL_RTX;
24039 /* A subroutine of aarch64_expand_vector_init, with the same interface.
24040 The caller has already tried a divide-and-conquer approach, so do
24041 not consider that case here. */
24043 void
24044 aarch64_expand_vector_init_fallback (rtx target, rtx vals)
24046 machine_mode mode = GET_MODE (target);
24047 scalar_mode inner_mode = GET_MODE_INNER (mode);
24048 /* The number of vector elements. */
24049 int n_elts = XVECLEN (vals, 0);
24050 /* The number of vector elements which are not constant. */
24051 int n_var = 0;
24052 rtx any_const = NULL_RTX;
24053 /* The first element of vals. */
24054 rtx v0 = XVECEXP (vals, 0, 0);
24055 bool all_same = true;
24057 /* This is a special vec_init<M><N> where N is not an element mode but a
24058 vector mode with half the elements of M. We expect to find two entries
24059 of mode N in VALS and we must put their concatenation into TARGET. */
24060 if (XVECLEN (vals, 0) == 2 && VECTOR_MODE_P (GET_MODE (XVECEXP (vals, 0, 0))))
24062 machine_mode narrow_mode = GET_MODE (XVECEXP (vals, 0, 0));
24063 gcc_assert (GET_MODE_INNER (narrow_mode) == inner_mode
24064 && known_eq (GET_MODE_SIZE (mode),
24065 2 * GET_MODE_SIZE (narrow_mode)));
24066 emit_insn (gen_aarch64_vec_concat (narrow_mode, target,
24067 XVECEXP (vals, 0, 0),
24068 XVECEXP (vals, 0, 1)));
24069 return;
24072 /* Count the number of variable elements to initialise. */
24073 for (int i = 0; i < n_elts; ++i)
24075 rtx x = XVECEXP (vals, 0, i);
24076 if (!(CONST_INT_P (x) || CONST_DOUBLE_P (x)))
24077 ++n_var;
24078 else
24079 any_const = x;
24081 all_same &= rtx_equal_p (x, v0);
24084 /* No variable elements, hand off to aarch64_simd_make_constant which knows
24085 how best to handle this. */
24086 if (n_var == 0)
24088 rtx constant = aarch64_simd_make_constant (vals);
24089 if (constant != NULL_RTX)
24091 emit_move_insn (target, constant);
24092 return;
24096 /* Splat a single non-constant element if we can. */
24097 if (all_same)
24099 rtx x = force_reg (inner_mode, v0);
24100 aarch64_emit_move (target, gen_vec_duplicate (mode, x));
24101 return;
24104 enum insn_code icode = optab_handler (vec_set_optab, mode);
24105 gcc_assert (icode != CODE_FOR_nothing);
24107 /* If there are only variable elements, try to optimize
24108 the insertion using dup for the most common element
24109 followed by insertions. */
24111 /* The algorithm will fill matches[*][0] with the earliest matching element,
24112 and matches[X][1] with the count of duplicate elements (if X is the
24113 earliest element which has duplicates). */
24115 if (n_var >= n_elts - 1 && n_elts <= 16)
24117 int matches[16][2] = {0};
24118 for (int i = 0; i < n_elts; i++)
24120 for (int j = 0; j <= i; j++)
24122 if (rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, j)))
24124 matches[i][0] = j;
24125 matches[j][1]++;
24126 break;
24130 int maxelement = 0;
24131 int maxv = 0;
24132 rtx const_elem = NULL_RTX;
24133 int const_elem_pos = 0;
24135 for (int i = 0; i < n_elts; i++)
24137 if (matches[i][1] > maxv)
24139 maxelement = i;
24140 maxv = matches[i][1];
24142 if (CONST_INT_P (XVECEXP (vals, 0, i))
24143 || CONST_DOUBLE_P (XVECEXP (vals, 0, i)))
24145 const_elem_pos = i;
24146 const_elem = XVECEXP (vals, 0, i);
24150 /* Create a duplicate of the most common element, unless all elements
24151 are equally useless to us, in which case just immediately set the
24152 vector register using the first element. */
24154 if (maxv == 1)
24156 /* For vectors of two 64-bit elements, we can do even better. */
24157 if (n_elts == 2
24158 && (inner_mode == E_DImode
24159 || inner_mode == E_DFmode))
24162 rtx x0 = XVECEXP (vals, 0, 0);
24163 rtx x1 = XVECEXP (vals, 0, 1);
24164 /* Combine can pick up this case, but handling it directly
24165 here leaves clearer RTL.
24167 This is load_pair_lanes<mode>, and also gives us a clean-up
24168 for store_pair_lanes<mode>. */
24169 if (memory_operand (x0, inner_mode)
24170 && memory_operand (x1, inner_mode)
24171 && aarch64_mergeable_load_pair_p (mode, x0, x1))
24173 rtx t;
24174 if (inner_mode == DFmode)
24175 t = gen_load_pair_lanesdf (target, x0, x1);
24176 else
24177 t = gen_load_pair_lanesdi (target, x0, x1);
24178 emit_insn (t);
24179 return;
24182 /* The subreg-move sequence below will move into lane zero of the
24183 vector register. For big-endian we want that position to hold
24184 the last element of VALS. */
24185 maxelement = BYTES_BIG_ENDIAN ? n_elts - 1 : 0;
24187 /* If we have a single constant element, use that for duplicating
24188 instead. */
24189 if (const_elem)
24191 maxelement = const_elem_pos;
24192 aarch64_emit_move (target, gen_vec_duplicate (mode, const_elem));
24194 else
24196 rtx x = force_reg (inner_mode, XVECEXP (vals, 0, maxelement));
24197 aarch64_emit_move (target, lowpart_subreg (mode, x, inner_mode));
24200 else
24202 rtx x = force_reg (inner_mode, XVECEXP (vals, 0, maxelement));
24203 aarch64_emit_move (target, gen_vec_duplicate (mode, x));
24206 /* Insert the rest. */
24207 for (int i = 0; i < n_elts; i++)
24209 rtx x = XVECEXP (vals, 0, i);
24210 if (matches[i][0] == maxelement)
24211 continue;
24212 x = force_reg (inner_mode, x);
24213 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
24215 return;
24218 /* Initialise a vector which is part-variable. We want to first try
24219 to build those lanes which are constant in the most efficient way we
24220 can. */
24221 if (n_var != n_elts)
24223 rtx copy = copy_rtx (vals);
24225 /* Load constant part of vector. We really don't care what goes into the
24226 parts we will overwrite, but we're more likely to be able to load the
24227 constant efficiently if it has fewer, larger, repeating parts
24228 (see aarch64_simd_valid_imm). */
24229 for (int i = 0; i < n_elts; i++)
24231 rtx x = XVECEXP (vals, 0, i);
24232 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
24233 continue;
24234 rtx subst = any_const;
24235 for (int bit = n_elts / 2; bit > 0; bit /= 2)
24237 /* Look in the copied vector, as more elements are const. */
24238 rtx test = XVECEXP (copy, 0, i ^ bit);
24239 if (CONST_INT_P (test) || CONST_DOUBLE_P (test))
24241 subst = test;
24242 break;
24245 XVECEXP (copy, 0, i) = subst;
24247 aarch64_expand_vector_init_fallback (target, copy);
24250 /* Insert the variable lanes directly. */
24251 for (int i = 0; i < n_elts; i++)
24253 rtx x = XVECEXP (vals, 0, i);
24254 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
24255 continue;
24256 x = force_reg (inner_mode, x);
24257 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
24261 /* Return even or odd half of VALS depending on EVEN_P. */
24263 static rtx
24264 aarch64_unzip_vector_init (machine_mode mode, rtx vals, bool even_p)
24266 int n = XVECLEN (vals, 0);
24267 machine_mode new_mode
24268 = aarch64_simd_container_mode (GET_MODE_INNER (mode),
24269 GET_MODE_BITSIZE (mode).to_constant () / 2);
24270 rtvec vec = rtvec_alloc (n / 2);
24271 for (int i = 0; i < n / 2; i++)
24272 RTVEC_ELT (vec, i) = (even_p) ? XVECEXP (vals, 0, 2 * i)
24273 : XVECEXP (vals, 0, 2 * i + 1);
24274 return gen_rtx_PARALLEL (new_mode, vec);
24277 /* Return true if SET is a scalar move. */
24279 static bool
24280 scalar_move_insn_p (rtx set)
24282 rtx src = SET_SRC (set);
24283 rtx dest = SET_DEST (set);
24284 return (is_a<scalar_mode> (GET_MODE (dest))
24285 && aarch64_mov_operand (src, GET_MODE (dest)));
24288 /* Similar to seq_cost, but ignore cost for scalar moves. */
24290 static unsigned
24291 seq_cost_ignoring_scalar_moves (const rtx_insn *seq, bool speed)
24293 unsigned cost = 0;
24295 for (; seq; seq = NEXT_INSN (seq))
24296 if (NONDEBUG_INSN_P (seq))
24298 if (rtx set = single_set (seq))
24300 if (!scalar_move_insn_p (set))
24301 cost += set_rtx_cost (set, speed);
24303 else
24305 int this_cost = insn_cost (CONST_CAST_RTX_INSN (seq), speed);
24306 if (this_cost > 0)
24307 cost += this_cost;
24308 else
24309 cost++;
24313 return cost;
24316 /* Expand a vector initialization sequence, such that TARGET is
24317 initialized to contain VALS. */
24319 void
24320 aarch64_expand_vector_init (rtx target, rtx vals)
24322 /* Try decomposing the initializer into even and odd halves and
24323 then ZIP them together. Use the resulting sequence if it is
24324 strictly cheaper than loading VALS directly.
24326 Prefer the fallback sequence in the event of a tie, since it
24327 will tend to use fewer registers. */
24329 machine_mode mode = GET_MODE (target);
24330 int n_elts = XVECLEN (vals, 0);
24332 if (n_elts < 4
24333 || maybe_ne (GET_MODE_BITSIZE (mode), 128))
24335 aarch64_expand_vector_init_fallback (target, vals);
24336 return;
24339 start_sequence ();
24340 rtx halves[2];
24341 unsigned costs[2];
24342 for (int i = 0; i < 2; i++)
24344 start_sequence ();
24345 rtx new_vals = aarch64_unzip_vector_init (mode, vals, i == 0);
24346 rtx tmp_reg = gen_reg_rtx (GET_MODE (new_vals));
24347 aarch64_expand_vector_init (tmp_reg, new_vals);
24348 halves[i] = gen_rtx_SUBREG (mode, tmp_reg, 0);
24349 rtx_insn *rec_seq = get_insns ();
24350 end_sequence ();
24351 costs[i] = seq_cost_ignoring_scalar_moves (rec_seq, !optimize_size);
24352 emit_insn (rec_seq);
24355 rtvec v = gen_rtvec (2, halves[0], halves[1]);
24356 rtx_insn *zip1_insn
24357 = emit_set_insn (target, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
24358 unsigned seq_total_cost
24359 = (!optimize_size) ? std::max (costs[0], costs[1]) : costs[0] + costs[1];
24360 seq_total_cost += insn_cost (zip1_insn, !optimize_size);
24362 rtx_insn *seq = get_insns ();
24363 end_sequence ();
24365 start_sequence ();
24366 aarch64_expand_vector_init_fallback (target, vals);
24367 rtx_insn *fallback_seq = get_insns ();
24368 unsigned fallback_seq_cost
24369 = seq_cost_ignoring_scalar_moves (fallback_seq, !optimize_size);
24370 end_sequence ();
24372 emit_insn (seq_total_cost < fallback_seq_cost ? seq : fallback_seq);
24375 /* Emit RTL corresponding to:
24376 insr TARGET, ELEM. */
24378 static void
24379 emit_insr (rtx target, rtx elem)
24381 machine_mode mode = GET_MODE (target);
24382 scalar_mode elem_mode = GET_MODE_INNER (mode);
24383 elem = force_reg (elem_mode, elem);
24385 insn_code icode = optab_handler (vec_shl_insert_optab, mode);
24386 gcc_assert (icode != CODE_FOR_nothing);
24387 emit_insn (GEN_FCN (icode) (target, target, elem));
24390 /* Subroutine of aarch64_sve_expand_vector_init for handling
24391 trailing constants.
24392 This function works as follows:
24393 (a) Create a new vector consisting of trailing constants.
24394 (b) Initialize TARGET with the constant vector using emit_move_insn.
24395 (c) Insert remaining elements in TARGET using insr.
24396 NELTS is the total number of elements in the original vector, while
24397 NELTS_REQD is the number of elements that are actually
24398 significant.
24400 ??? The heuristic used is to do the above only if the number of constants
24401 is at least half the total number of elements. May need fine-tuning. */
24403 static bool
24404 aarch64_sve_expand_vector_init_handle_trailing_constants
24405 (rtx target, const rtx_vector_builder &builder, int nelts, int nelts_reqd)
24407 machine_mode mode = GET_MODE (target);
24408 scalar_mode elem_mode = GET_MODE_INNER (mode);
24409 int n_trailing_constants = 0;
24411 for (int i = nelts_reqd - 1;
24412 i >= 0 && valid_for_const_vector_p (elem_mode, builder.elt (i));
24413 i--)
24414 n_trailing_constants++;
24416 if (n_trailing_constants >= nelts_reqd / 2)
24418 /* Try to use the natural pattern of BUILDER to extend the trailing
24419 constant elements to a full vector. Replace any variables in the
24420 extra elements with zeros.
24422 ??? It would be better if the builders supported "don't care"
24423 elements, with the builder filling in whichever elements
24424 give the most compact encoding. */
24425 rtx_vector_builder v (mode, nelts, 1);
24426 for (int i = 0; i < nelts; i++)
24428 rtx x = builder.elt (i + nelts_reqd - n_trailing_constants);
24429 if (!valid_for_const_vector_p (elem_mode, x))
24430 x = CONST0_RTX (elem_mode);
24431 v.quick_push (x);
24433 rtx const_vec = v.build ();
24434 emit_move_insn (target, const_vec);
24436 for (int i = nelts_reqd - n_trailing_constants - 1; i >= 0; i--)
24437 emit_insr (target, builder.elt (i));
24439 return true;
24442 return false;
24445 /* Subroutine of aarch64_sve_expand_vector_init.
24446 Works as follows:
24447 (a) Initialize TARGET by broadcasting element NELTS_REQD - 1 of BUILDER.
24448 (b) Skip trailing elements from BUILDER, which are the same as
24449 element NELTS_REQD - 1.
24450 (c) Insert earlier elements in reverse order in TARGET using insr. */
24452 static void
24453 aarch64_sve_expand_vector_init_insert_elems (rtx target,
24454 const rtx_vector_builder &builder,
24455 int nelts_reqd)
24457 machine_mode mode = GET_MODE (target);
24458 scalar_mode elem_mode = GET_MODE_INNER (mode);
24460 struct expand_operand ops[2];
24461 enum insn_code icode = optab_handler (vec_duplicate_optab, mode);
24462 gcc_assert (icode != CODE_FOR_nothing);
24464 create_output_operand (&ops[0], target, mode);
24465 create_input_operand (&ops[1], builder.elt (nelts_reqd - 1), elem_mode);
24466 expand_insn (icode, 2, ops);
24468 int ndups = builder.count_dups (nelts_reqd - 1, -1, -1);
24469 for (int i = nelts_reqd - ndups - 1; i >= 0; i--)
24470 emit_insr (target, builder.elt (i));
24473 /* Subroutine of aarch64_sve_expand_vector_init to handle the case
24474 when all trailing elements of BUILDER are the same.
24475 This works as follows:
24476 (a) Use expand_insn interface to broadcast last vector element in TARGET.
24477 (b) Insert remaining elements in TARGET using insr.
24479 ??? The heuristic used is to do the above if the number of identical trailing
24480 elements is at least 3/4 of the total number of elements, loosely based on
24481 the heuristic from mostly_zeros_p. May need fine-tuning. */
24483 static bool
24484 aarch64_sve_expand_vector_init_handle_trailing_same_elem
24485 (rtx target, const rtx_vector_builder &builder, int nelts_reqd)
24487 int ndups = builder.count_dups (nelts_reqd - 1, -1, -1);
24488 if (ndups >= (3 * nelts_reqd) / 4)
24490 aarch64_sve_expand_vector_init_insert_elems (target, builder,
24491 nelts_reqd - ndups + 1);
24492 return true;
24495 return false;
24498 /* Initialize register TARGET from BUILDER. NELTS is the constant number
24499 of elements in BUILDER.
24501 The function tries to initialize TARGET from BUILDER if it fits one
24502 of the special cases outlined below.
24504 Failing that, the function divides BUILDER into two sub-vectors:
24505 v_even = even elements of BUILDER;
24506 v_odd = odd elements of BUILDER;
24508 and recursively calls itself with v_even and v_odd.
24510 if (recursive call succeeded for v_even or v_odd)
24511 TARGET = zip (v_even, v_odd)
24513 The function returns true if it managed to build TARGET from BUILDER
24514 with one of the special cases, false otherwise.
24516 Example: {a, 1, b, 2, c, 3, d, 4}
24518 The vector gets divided into:
24519 v_even = {a, b, c, d}
24520 v_odd = {1, 2, 3, 4}
24522 aarch64_sve_expand_vector_init(v_odd) hits case 1 and
24523 initializes tmp2 from the constant vector v_odd using emit_move_insn.
24525 aarch64_sve_expand_vector_init(v_even) fails since v_even contains
24526 4 elements, so we construct tmp1 from v_even using insr:
24527 tmp1 = dup(d)
24528 insr tmp1, c
24529 insr tmp1, b
24530 insr tmp1, a
24532 And finally:
24533 TARGET = zip (tmp1, tmp2)
24534 which sets TARGET to {a, 1, b, 2, c, 3, d, 4}. */
24536 static bool
24537 aarch64_sve_expand_vector_init (rtx target, const rtx_vector_builder &builder,
24538 int nelts, int nelts_reqd)
24540 machine_mode mode = GET_MODE (target);
24542 /* Case 1: Vector contains trailing constants. */
24544 if (aarch64_sve_expand_vector_init_handle_trailing_constants
24545 (target, builder, nelts, nelts_reqd))
24546 return true;
24548 /* Case 2: Vector contains leading constants. */
24550 rtx_vector_builder rev_builder (mode, nelts_reqd, 1);
24551 for (int i = 0; i < nelts_reqd; i++)
24552 rev_builder.quick_push (builder.elt (nelts_reqd - i - 1));
24553 rev_builder.finalize ();
24555 if (aarch64_sve_expand_vector_init_handle_trailing_constants
24556 (target, rev_builder, nelts, nelts_reqd))
24558 emit_insn (gen_aarch64_sve_rev (mode, target, target));
24559 return true;
24562 /* Case 3: Vector contains trailing same element. */
24564 if (aarch64_sve_expand_vector_init_handle_trailing_same_elem
24565 (target, builder, nelts_reqd))
24566 return true;
24568 /* Case 4: Vector contains leading same element. */
24570 if (aarch64_sve_expand_vector_init_handle_trailing_same_elem
24571 (target, rev_builder, nelts_reqd) && nelts_reqd == nelts)
24573 emit_insn (gen_aarch64_sve_rev (mode, target, target));
24574 return true;
24577 /* Avoid recursing below 4 elements.
24578 ??? The threshold 4 may need fine-tuning. */
24580 if (nelts_reqd <= 4)
24581 return false;
24583 rtx_vector_builder v_even (mode, nelts, 1);
24584 rtx_vector_builder v_odd (mode, nelts, 1);
24586 for (int i = 0; i < nelts * 2; i += 2)
24588 v_even.quick_push (builder.elt (i));
24589 v_odd.quick_push (builder.elt (i + 1));
24592 v_even.finalize ();
24593 v_odd.finalize ();
24595 rtx tmp1 = gen_reg_rtx (mode);
24596 bool did_even_p = aarch64_sve_expand_vector_init (tmp1, v_even,
24597 nelts, nelts_reqd / 2);
24599 rtx tmp2 = gen_reg_rtx (mode);
24600 bool did_odd_p = aarch64_sve_expand_vector_init (tmp2, v_odd,
24601 nelts, nelts_reqd / 2);
24603 if (!did_even_p && !did_odd_p)
24604 return false;
24606 /* Initialize v_even and v_odd using INSR if it didn't match any of the
24607 special cases and zip v_even, v_odd. */
24609 if (!did_even_p)
24610 aarch64_sve_expand_vector_init_insert_elems (tmp1, v_even, nelts_reqd / 2);
24612 if (!did_odd_p)
24613 aarch64_sve_expand_vector_init_insert_elems (tmp2, v_odd, nelts_reqd / 2);
24615 rtvec v = gen_rtvec (2, tmp1, tmp2);
24616 emit_set_insn (target, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
24617 return true;
24620 /* Initialize register TARGET from the elements in PARALLEL rtx VALS. */
24622 void
24623 aarch64_sve_expand_vector_init (rtx target, rtx vals)
24625 machine_mode mode = GET_MODE (target);
24626 int nelts = XVECLEN (vals, 0);
24628 rtx_vector_builder v (mode, nelts, 1);
24629 for (int i = 0; i < nelts; i++)
24630 v.quick_push (XVECEXP (vals, 0, i));
24631 v.finalize ();
24633 /* If neither sub-vectors of v could be initialized specially,
24634 then use INSR to insert all elements from v into TARGET.
24635 ??? This might not be optimal for vectors with large
24636 initializers like 16-element or above.
24637 For nelts < 4, it probably isn't useful to handle specially. */
24639 if (nelts < 4
24640 || !aarch64_sve_expand_vector_init (target, v, nelts, nelts))
24641 aarch64_sve_expand_vector_init_insert_elems (target, v, nelts);
24644 /* Initialize register TARGET from the two vector subelements in PARALLEL
24645 rtx VALS. */
24647 void
24648 aarch64_sve_expand_vector_init_subvector (rtx target, rtx vals)
24650 machine_mode mode = GET_MODE (target);
24651 int nelts = XVECLEN (vals, 0);
24653 gcc_assert (nelts % 2 == 0);
24655 /* We must be concatenating vectors. */
24656 machine_mode elem_mode = GET_MODE (XVECEXP (vals, 0, 0));
24657 gcc_assert (VECTOR_MODE_P (elem_mode));
24659 auto_vec<rtx> worklist;
24660 machine_mode wider_mode = elem_mode;
24662 for (int i = 0; i < nelts; i++)
24663 worklist.safe_push (force_reg (elem_mode, XVECEXP (vals, 0, i)));
24665 /* Keep widening pairwise to have maximum throughput. */
24666 while (nelts >= 2)
24668 wider_mode
24669 = related_vector_mode (wider_mode, GET_MODE_INNER (wider_mode),
24670 GET_MODE_NUNITS (wider_mode) * 2).require ();
24672 for (int i = 0; i < nelts; i += 2)
24674 rtx arg0 = worklist[i];
24675 rtx arg1 = worklist[i+1];
24676 gcc_assert (GET_MODE (arg0) == GET_MODE (arg1));
24678 rtx tmp = gen_reg_rtx (wider_mode);
24679 emit_insn (gen_aarch64_pack_partial (wider_mode, tmp, arg0, arg1));
24680 worklist[i / 2] = tmp;
24683 nelts /= 2;
24686 gcc_assert (wider_mode == mode);
24687 emit_move_insn (target, worklist[0]);
24689 return;
24692 /* Check whether VALUE is a vector constant in which every element
24693 is either a power of 2 or a negated power of 2. If so, return
24694 a constant vector of log2s, and flip CODE between PLUS and MINUS
24695 if VALUE contains negated powers of 2. Return NULL_RTX otherwise. */
24697 static rtx
24698 aarch64_convert_mult_to_shift (rtx value, rtx_code &code)
24700 if (!CONST_VECTOR_P (value))
24701 return NULL_RTX;
24703 rtx_vector_builder builder;
24704 if (!builder.new_unary_operation (GET_MODE (value), value, false))
24705 return NULL_RTX;
24707 scalar_mode int_mode = GET_MODE_INNER (GET_MODE (value));
24708 /* 1 if the result of the multiplication must be negated,
24709 0 if it mustn't, or -1 if we don't yet care. */
24710 int negate = -1;
24711 unsigned int encoded_nelts = const_vector_encoded_nelts (value);
24712 for (unsigned int i = 0; i < encoded_nelts; ++i)
24714 rtx elt = CONST_VECTOR_ENCODED_ELT (value, i);
24715 if (!CONST_SCALAR_INT_P (elt))
24716 return NULL_RTX;
24717 rtx_mode_t val (elt, int_mode);
24718 wide_int pow2 = wi::neg (val);
24719 if (val != pow2)
24721 /* It matters whether we negate or not. Make that choice,
24722 and make sure that it's consistent with previous elements. */
24723 if (negate == !wi::neg_p (val))
24724 return NULL_RTX;
24725 negate = wi::neg_p (val);
24726 if (!negate)
24727 pow2 = val;
24729 /* POW2 is now the value that we want to be a power of 2. */
24730 int shift = wi::exact_log2 (pow2);
24731 if (shift < 0)
24732 return NULL_RTX;
24733 builder.quick_push (gen_int_mode (shift, int_mode));
24735 if (negate == -1)
24736 /* PLUS and MINUS are equivalent; canonicalize on PLUS. */
24737 code = PLUS;
24738 else if (negate == 1)
24739 code = code == PLUS ? MINUS : PLUS;
24740 return builder.build ();
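/* For example, a CONST_VECTOR of all 4s becomes a vector of shift counts of
   2 with CODE left unchanged, so x * 4 + y can be emitted as (x << 2) + y;
   a CONST_VECTOR of all -8s becomes shift counts of 3 and CODE flips
   between PLUS and MINUS, so x * -8 + y becomes y - (x << 3).  */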
24743 /* Prepare for an integer SVE multiply-add or multiply-subtract pattern;
24744 CODE is PLUS for the former and MINUS for the latter. OPERANDS is the
24745 operands array, in the same order as for fma_optab. Return true if
24746 the function emitted all the necessary instructions, false if the caller
24747 should generate the pattern normally with the new OPERANDS array. */
24749 bool
24750 aarch64_prepare_sve_int_fma (rtx *operands, rtx_code code)
24752 machine_mode mode = GET_MODE (operands[0]);
24753 if (rtx shifts = aarch64_convert_mult_to_shift (operands[2], code))
24755 rtx product = expand_binop (mode, vashl_optab, operands[1], shifts,
24756 NULL_RTX, true, OPTAB_DIRECT);
24757 force_expand_binop (mode, code == PLUS ? add_optab : sub_optab,
24758 operands[3], product, operands[0], true,
24759 OPTAB_DIRECT);
24760 return true;
24762 operands[2] = force_reg (mode, operands[2]);
24763 return false;
24766 /* Likewise, but for a conditional pattern. */
24768 bool
24769 aarch64_prepare_sve_cond_int_fma (rtx *operands, rtx_code code)
24771 machine_mode mode = GET_MODE (operands[0]);
24772 if (rtx shifts = aarch64_convert_mult_to_shift (operands[3], code))
24774 rtx product = expand_binop (mode, vashl_optab, operands[2], shifts,
24775 NULL_RTX, true, OPTAB_DIRECT);
24776 emit_insn (gen_cond (code, mode, operands[0], operands[1],
24777 operands[4], product, operands[5]));
24778 return true;
24780 operands[3] = force_reg (mode, operands[3]);
24781 return false;
24784 static unsigned HOST_WIDE_INT
24785 aarch64_shift_truncation_mask (machine_mode mode)
24787 if (!SHIFT_COUNT_TRUNCATED || aarch64_vector_data_mode_p (mode))
24788 return 0;
24789 return GET_MODE_UNIT_BITSIZE (mode) - 1;
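/* For example, on a scalar mode such as DImode this returns 63 when
   SHIFT_COUNT_TRUNCATED holds, telling the midend that shift counts are
   truncated to the low 6 bits; vector data modes always get 0, so no such
   truncation is assumed for them.  */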
24792 /* Select a format to encode pointers in exception handling data. */
24794 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED, int global)
24796 int type;
24797 switch (aarch64_cmodel)
24799 case AARCH64_CMODEL_TINY:
24800 case AARCH64_CMODEL_TINY_PIC:
24801 case AARCH64_CMODEL_SMALL:
24802 case AARCH64_CMODEL_SMALL_PIC:
24803 case AARCH64_CMODEL_SMALL_SPIC:
24804 /* text+got+data < 4Gb. 4-byte signed relocs are sufficient
24805 for everything. */
24806 type = DW_EH_PE_sdata4;
24807 break;
24808 default:
24809 /* No assumptions here. 8-byte relocs required. */
24810 type = DW_EH_PE_sdata8;
24811 break;
24813 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
24816 /* Output .variant_pcs for aarch64_vector_pcs function symbols. */
24818 static void
24819 aarch64_asm_output_variant_pcs (FILE *stream, const tree decl, const char* name)
24821 if (TREE_CODE (decl) == FUNCTION_DECL)
24823 arm_pcs pcs = (arm_pcs) fndecl_abi (decl).id ();
24824 if (pcs == ARM_PCS_SIMD || pcs == ARM_PCS_SVE)
24826 fprintf (stream, "\t.variant_pcs\t");
24827 assemble_name (stream, name);
24828 fprintf (stream, "\n");
24833 /* The last .arch and .tune assembly strings that we printed. */
24834 static std::string aarch64_last_printed_arch_string;
24835 static std::string aarch64_last_printed_tune_string;
24837 /* Implement ASM_DECLARE_FUNCTION_NAME. Output the ISA features used
24838 by the function fndecl. */
24840 void
24841 aarch64_declare_function_name (FILE *stream, const char* name,
24842 tree fndecl)
24844 tree target_parts = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
24846 struct cl_target_option *targ_options;
24847 if (target_parts)
24848 targ_options = TREE_TARGET_OPTION (target_parts);
24849 else
24850 targ_options = TREE_TARGET_OPTION (target_option_current_node);
24851 gcc_assert (targ_options);
24853 auto isa_flags = aarch64_get_asm_isa_flags (targ_options);
24854 aarch64_arch arch = targ_options->x_selected_arch;
24855 std::string to_print
24856 = aarch64_get_arch_string_for_assembler (arch, isa_flags);
24857 /* Only update the assembler .arch string if it is distinct from the last
24858 such string we printed. */
24859 if (to_print != aarch64_last_printed_arch_string)
24861 asm_fprintf (asm_out_file, "\t.arch %s\n", to_print.c_str ());
24862 aarch64_last_printed_arch_string = to_print;
24865 /* Print the cpu name we're tuning for in the comments; it might be
24866 useful to readers of the generated asm. Do it only when it changes
24867 from function to function and verbose assembly is requested. */
24868 const struct processor *this_tune
24869 = aarch64_get_tune_cpu (targ_options->x_selected_tune);
24871 if (flag_debug_asm && aarch64_last_printed_tune_string != this_tune->name)
24873 asm_fprintf (asm_out_file, "\t" ASM_COMMENT_START ".tune %s\n",
24874 this_tune->name);
24875 aarch64_last_printed_tune_string = this_tune->name;
24878 aarch64_asm_output_variant_pcs (stream, fndecl, name);
24880 /* Don't forget the type directive for ELF. */
24881 ASM_OUTPUT_TYPE_DIRECTIVE (stream, name, "function");
24882 ASM_OUTPUT_FUNCTION_LABEL (stream, name, fndecl);
24884 cfun->machine->label_is_assembled = true;
24887 /* Implement PRINT_PATCHABLE_FUNCTION_ENTRY. */
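/* Illustrative note: when BTI is enabled and the function can be reached
   indirectly, the code below keeps the BTI C landing pad first and places
   the NOP patch area after it, e.g. with -fpatchable-function-entry=2:

   foo:
	bti	c
	nop
	nop

   Otherwise the patch area is emitted at the very start of the function
   (or before the entry label if it has not been assembled yet). */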
24889 void
24890 aarch64_print_patchable_function_entry (FILE *file,
24891 unsigned HOST_WIDE_INT patch_area_size,
24892 bool record_p)
24894 if (!cfun->machine->label_is_assembled)
24896 /* Emit the patching area before the entry label, if any. */
24897 default_print_patchable_function_entry (file, patch_area_size,
24898 record_p);
24899 return;
24902 rtx pa = gen_patchable_area (GEN_INT (patch_area_size),
24903 GEN_INT (record_p));
24904 basic_block bb = ENTRY_BLOCK_PTR_FOR_FN (cfun)->next_bb;
24906 if (!aarch_bti_enabled ()
24907 || cgraph_node::get (cfun->decl)->only_called_directly_p ())
24909 /* Emit the patchable_area at the beginning of the function. */
24910 rtx_insn *insn = emit_insn_before (pa, BB_HEAD (bb));
24911 INSN_ADDRESSES_NEW (insn, -1);
24912 return;
24915 rtx_insn *insn = next_real_nondebug_insn (get_insns ());
24916 if (!insn
24917 || !INSN_P (insn)
24918 || GET_CODE (PATTERN (insn)) != UNSPEC_VOLATILE
24919 || XINT (PATTERN (insn), 1) != UNSPECV_BTI_C)
24921 /* Emit a BTI_C. */
24922 insn = emit_insn_before (gen_bti_c (), BB_HEAD (bb));
24925 /* Emit the patchable_area after BTI_C. */
24926 insn = emit_insn_after (pa, insn);
24927 INSN_ADDRESSES_NEW (insn, -1);
24930 /* Output patchable area. */
24932 void
24933 aarch64_output_patchable_area (unsigned int patch_area_size, bool record_p)
24935 default_print_patchable_function_entry (asm_out_file, patch_area_size,
24936 record_p);
24939 /* Implement ASM_OUTPUT_DEF_FROM_DECLS. Output .variant_pcs for aliases. */
24941 void
24942 aarch64_asm_output_alias (FILE *stream, const tree decl, const tree target)
24944 const char *name = XSTR (XEXP (DECL_RTL (decl), 0), 0);
24945 const char *value = IDENTIFIER_POINTER (target);
24946 aarch64_asm_output_variant_pcs (stream, decl, name);
24947 ASM_OUTPUT_DEF (stream, name, value);
24950 /* Implement ASM_OUTPUT_EXTERNAL. Output .variant_pcs for undefined
24951 function symbol references. */
24953 void
24954 aarch64_asm_output_external (FILE *stream, tree decl, const char* name)
24956 default_elf_asm_output_external (stream, decl, name);
24957 aarch64_asm_output_variant_pcs (stream, decl, name);
24960 /* Triggered after a .cfi_startproc directive is emitted into the assembly file.
24961 Used to output the .cfi_b_key_frame directive when signing the current
24962 function with the B key. */
24964 void
24965 aarch64_post_cfi_startproc (FILE *f, tree ignored ATTRIBUTE_UNUSED)
24967 if (cfun->machine->frame.laid_out && aarch64_return_address_signing_enabled ()
24968 && aarch64_ra_sign_key == AARCH64_KEY_B)
24969 asm_fprintf (f, "\t.cfi_b_key_frame\n");
24972 /* Implement TARGET_ASM_FILE_START. Output the assembly header. */
24974 static void
24975 aarch64_start_file (void)
24977 struct cl_target_option *default_options
24978 = TREE_TARGET_OPTION (target_option_default_node);
24980 aarch64_arch default_arch = default_options->x_selected_arch;
24981 auto default_isa_flags = aarch64_get_asm_isa_flags (default_options);
24982 std::string arch_string
24983 = aarch64_get_arch_string_for_assembler (default_arch, default_isa_flags);
24984 aarch64_last_printed_arch_string = arch_string;
24985 aarch64_last_printed_tune_string = "";
24986 asm_fprintf (asm_out_file, "\t.arch %s\n",
24987 arch_string.c_str ());
24989 default_file_start ();
24992 /* Emit load exclusive. */
24994 static void
24995 aarch64_emit_load_exclusive (machine_mode mode, rtx rval,
24996 rtx mem, rtx model_rtx)
24998 if (mode == TImode)
24999 emit_insn (gen_aarch64_load_exclusive_pair (gen_lowpart (DImode, rval),
25000 gen_highpart (DImode, rval),
25001 mem, model_rtx));
25002 else
25003 emit_insn (gen_aarch64_load_exclusive (mode, rval, mem, model_rtx));
25006 /* Emit store exclusive. */
25008 static void
25009 aarch64_emit_store_exclusive (machine_mode mode, rtx bval,
25010 rtx mem, rtx rval, rtx model_rtx)
25012 if (mode == TImode)
25013 emit_insn (gen_aarch64_store_exclusive_pair
25014 (bval, mem, operand_subword (rval, 0, 0, TImode),
25015 operand_subword (rval, 1, 0, TImode), model_rtx));
25016 else
25017 emit_insn (gen_aarch64_store_exclusive (mode, bval, mem, rval, model_rtx));
25020 /* Mark the previous jump instruction as unlikely. */
25022 static void
25023 aarch64_emit_unlikely_jump (rtx insn)
25025 rtx_insn *jump = emit_jump_insn (insn);
25026 add_reg_br_prob_note (jump, profile_probability::very_unlikely ());
25029 /* We store the names of the various atomic helpers in a 5x5 array.
25030 Return the libcall function given MODE, MODEL and NAMES. */
25033 aarch64_atomic_ool_func(machine_mode mode, rtx model_rtx,
25034 const atomic_ool_names *names)
25036 memmodel model = memmodel_from_int (INTVAL (model_rtx));
25037 int mode_idx, model_idx;
25039 switch (mode)
25041 case E_QImode:
25042 mode_idx = 0;
25043 break;
25044 case E_HImode:
25045 mode_idx = 1;
25046 break;
25047 case E_SImode:
25048 mode_idx = 2;
25049 break;
25050 case E_DImode:
25051 mode_idx = 3;
25052 break;
25053 case E_TImode:
25054 mode_idx = 4;
25055 break;
25056 default:
25057 gcc_unreachable ();
25060 switch (model)
25062 case MEMMODEL_RELAXED:
25063 model_idx = 0;
25064 break;
25065 case MEMMODEL_CONSUME:
25066 case MEMMODEL_ACQUIRE:
25067 model_idx = 1;
25068 break;
25069 case MEMMODEL_RELEASE:
25070 model_idx = 2;
25071 break;
25072 case MEMMODEL_ACQ_REL:
25073 case MEMMODEL_SEQ_CST:
25074 model_idx = 3;
25075 break;
25076 case MEMMODEL_SYNC_ACQUIRE:
25077 case MEMMODEL_SYNC_RELEASE:
25078 case MEMMODEL_SYNC_SEQ_CST:
25079 model_idx = 4;
25080 break;
25081 default:
25082 gcc_unreachable ();
25085 return init_one_libfunc_visibility (names->str[mode_idx][model_idx],
25086 VISIBILITY_HIDDEN);
25089 #define DEF0(B, N) \
25090 { "__aarch64_" #B #N "_relax", \
25091 "__aarch64_" #B #N "_acq", \
25092 "__aarch64_" #B #N "_rel", \
25093 "__aarch64_" #B #N "_acq_rel", \
25094 "__aarch64_" #B #N "_sync" }
25096 #define DEF4(B) DEF0(B, 1), DEF0(B, 2), DEF0(B, 4), DEF0(B, 8), \
25097 { NULL, NULL, NULL, NULL }
25098 #define DEF5(B) DEF0(B, 1), DEF0(B, 2), DEF0(B, 4), DEF0(B, 8), DEF0(B, 16)
25100 static const atomic_ool_names aarch64_ool_cas_names = { { DEF5(cas) } };
25101 const atomic_ool_names aarch64_ool_swp_names = { { DEF4(swp) } };
25102 const atomic_ool_names aarch64_ool_ldadd_names = { { DEF4(ldadd) } };
25103 const atomic_ool_names aarch64_ool_ldset_names = { { DEF4(ldset) } };
25104 const atomic_ool_names aarch64_ool_ldclr_names = { { DEF4(ldclr) } };
25105 const atomic_ool_names aarch64_ool_ldeor_names = { { DEF4(ldeor) } };
25107 #undef DEF0
25108 #undef DEF4
25109 #undef DEF5
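/* For example, DEF0 (cas, 4) above expands to the row
   { "__aarch64_cas4_relax", "__aarch64_cas4_acq", "__aarch64_cas4_rel",
     "__aarch64_cas4_acq_rel", "__aarch64_cas4_sync" },
   and aarch64_atomic_ool_func indexes the table by access size and memory
   model, so e.g. a 4-byte compare-and-swap with MEMMODEL_ACQUIRE resolves
   to "__aarch64_cas4_acq". */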
25111 /* Expand a compare and swap pattern. */
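/* A rough sketch of the result for a strong 32-bit compare-and-swap with
   TARGET_LSE (illustrative only; the exact sequence depends on the memory
   model, operand overlap and mode):

	MOV	wR, wOLDVAL
	CAS[A|L|AL]	wR, wNEWVAL, [xMEM]
	CMP	wR, wOLDVAL
	CSET	wBVAL, eq

   Without LSE the outline-atomics helpers or an exclusive-load/store loop
   are used instead. */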
25113 void
25114 aarch64_expand_compare_and_swap (rtx operands[])
25116 rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x, cc_reg;
25117 machine_mode mode, r_mode;
25119 bval = operands[0];
25120 rval = operands[1];
25121 mem = operands[2];
25122 oldval = operands[3];
25123 newval = operands[4];
25124 is_weak = operands[5];
25125 mod_s = operands[6];
25126 mod_f = operands[7];
25127 mode = GET_MODE (mem);
25129 /* Normally the succ memory model must be stronger than fail, but in the
25130 unlikely event of fail being ACQUIRE and succ being RELEASE we need to
25131 promote succ to ACQ_REL so that we don't lose the acquire semantics. */
25132 if (is_mm_acquire (memmodel_from_int (INTVAL (mod_f)))
25133 && is_mm_release (memmodel_from_int (INTVAL (mod_s))))
25134 mod_s = GEN_INT (MEMMODEL_ACQ_REL);
25136 r_mode = mode;
25137 if (mode == QImode || mode == HImode)
25139 r_mode = SImode;
25140 rval = gen_reg_rtx (r_mode);
25143 if (TARGET_LSE)
25145 /* The CAS insn requires oldval and rval overlap, but we need to
25146 have a copy of oldval saved across the operation to tell if
25147 the operation is successful. */
25148 if (reg_overlap_mentioned_p (rval, oldval))
25149 rval = copy_to_mode_reg (r_mode, oldval);
25150 else
25151 emit_move_insn (rval, gen_lowpart (r_mode, oldval));
25152 if (mode == TImode)
25153 newval = force_reg (mode, newval);
25155 emit_insn (gen_aarch64_compare_and_swap_lse (mode, rval, mem,
25156 newval, mod_s));
25157 cc_reg = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
25159 else if (TARGET_OUTLINE_ATOMICS)
25161 /* Oldval must satisfy compare afterward. */
25162 if (!aarch64_plus_operand (oldval, mode))
25163 oldval = force_reg (mode, oldval);
25164 rtx func = aarch64_atomic_ool_func (mode, mod_s, &aarch64_ool_cas_names);
25165 rval = emit_library_call_value (func, NULL_RTX, LCT_NORMAL, r_mode,
25166 oldval, mode, newval, mode,
25167 XEXP (mem, 0), Pmode);
25168 cc_reg = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
25170 else
25172 /* The oldval predicate varies by mode. Test it and force to reg. */
25173 insn_code code = code_for_aarch64_compare_and_swap (mode);
25174 if (!insn_data[code].operand[2].predicate (oldval, mode))
25175 oldval = force_reg (mode, oldval);
25177 emit_insn (GEN_FCN (code) (rval, mem, oldval, newval,
25178 is_weak, mod_s, mod_f));
25179 cc_reg = gen_rtx_REG (CCmode, CC_REGNUM);
25182 if (r_mode != mode)
25183 rval = gen_lowpart (mode, rval);
25184 emit_move_insn (operands[1], rval);
25186 x = gen_rtx_EQ (SImode, cc_reg, const0_rtx);
25187 emit_insn (gen_rtx_SET (bval, x));
25190 /* Emit a barrier, that is appropriate for memory model MODEL, at the end of a
25191 sequence implementing an atomic operation. */
25193 static void
25194 aarch64_emit_post_barrier (enum memmodel model)
25196 const enum memmodel base_model = memmodel_base (model);
25198 if (is_mm_sync (model)
25199 && (base_model == MEMMODEL_ACQUIRE
25200 || base_model == MEMMODEL_ACQ_REL
25201 || base_model == MEMMODEL_SEQ_CST))
25203 emit_insn (gen_mem_thread_fence (GEN_INT (MEMMODEL_SEQ_CST)));
25207 /* Split a compare and swap pattern. */
25209 void
25210 aarch64_split_compare_and_swap (rtx operands[])
25212 /* Split after prolog/epilog to avoid interactions with shrinkwrapping. */
25213 gcc_assert (epilogue_completed);
25215 rtx rval, mem, oldval, newval, scratch, x, model_rtx;
25216 machine_mode mode;
25217 bool is_weak;
25218 rtx_code_label *label1, *label2;
25219 enum memmodel model;
25221 rval = operands[0];
25222 mem = operands[1];
25223 oldval = operands[2];
25224 newval = operands[3];
25225 model_rtx = operands[5];
25226 scratch = operands[7];
25227 mode = GET_MODE (mem);
25228 model = memmodel_from_int (INTVAL (model_rtx));
25229 is_weak = operands[4] != const0_rtx && mode != TImode;
25231 /* When OLDVAL is zero and we want the strong version we can emit a tighter
25232 loop:
25233 .label1:
25234 LD[A]XR rval, [mem]
25235 CBNZ rval, .label2
25236 ST[L]XR scratch, newval, [mem]
25237 CBNZ scratch, .label1
25238 .label2:
25239 CMP rval, 0. */
25240 bool strong_zero_p = (!is_weak && !aarch64_track_speculation &&
25241 oldval == const0_rtx && mode != TImode);
25243 label1 = NULL;
25244 if (!is_weak)
25246 label1 = gen_label_rtx ();
25247 emit_label (label1);
25249 label2 = gen_label_rtx ();
25251 /* The initial load can be relaxed for a __sync operation since a final
25252 barrier will be emitted to stop code hoisting. */
25253 if (is_mm_sync (model))
25254 aarch64_emit_load_exclusive (mode, rval, mem, GEN_INT (MEMMODEL_RELAXED));
25255 else
25256 aarch64_emit_load_exclusive (mode, rval, mem, model_rtx);
25258 if (strong_zero_p)
25259 x = gen_rtx_NE (VOIDmode, rval, const0_rtx);
25260 else
25262 rtx cc_reg = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
25263 x = gen_rtx_NE (VOIDmode, cc_reg, const0_rtx);
25265 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
25266 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
25267 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
25269 aarch64_emit_store_exclusive (mode, scratch, mem, newval, model_rtx);
25271 if (!is_weak)
25273 x = aarch64_gen_compare_zero_and_branch (NE, scratch, label1);
25274 aarch64_emit_unlikely_jump (x);
25276 else
25277 aarch64_gen_compare_reg (NE, scratch, const0_rtx);
25279 /* 128-bit LDAXP is not atomic unless STLXP succeeds. So for a mismatch,
25280 store the returned value and loop if the STLXP fails. */
25281 if (mode == TImode)
25283 rtx_code_label *label3 = gen_label_rtx ();
25284 emit_jump_insn (gen_rtx_SET (pc_rtx, gen_rtx_LABEL_REF (Pmode, label3)));
25285 emit_barrier ();
25287 emit_label (label2);
25288 aarch64_emit_store_exclusive (mode, scratch, mem, rval, model_rtx);
25290 x = aarch64_gen_compare_zero_and_branch (NE, scratch, label1);
25291 aarch64_emit_unlikely_jump (x);
25293 label2 = label3;
25296 emit_label (label2);
25298 /* If we used a CBNZ in the exchange loop emit an explicit compare with RVAL
25299 to set the condition flags. If this is not used it will be removed by
25300 later passes. */
25301 if (strong_zero_p)
25302 aarch64_gen_compare_reg (NE, rval, const0_rtx);
25304 /* Emit any final barrier needed for a __sync operation. */
25305 if (is_mm_sync (model))
25306 aarch64_emit_post_barrier (model);
25309 /* Split an atomic operation. */
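/* For example, splitting a relaxed 32-bit fetch-and-add produces a loop of
   the form (schematically):

	.Lretry:
	LD[A]XR	wOLD, [xMEM]
	ADD	wNEW, wOLD, wVALUE
	ST[L]XR	wCOND, wNEW, [xMEM]
	CBNZ	wCOND, .Lretry

   with an extra trailing barrier for __sync operations. */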
25311 void
25312 aarch64_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
25313 rtx value, rtx model_rtx, rtx cond)
25315 /* Split after prolog/epilog to avoid interactions with shrinkwrapping. */
25316 gcc_assert (epilogue_completed);
25318 machine_mode mode = GET_MODE (mem);
25319 machine_mode wmode = (mode == DImode ? DImode : SImode);
25320 const enum memmodel model = memmodel_from_int (INTVAL (model_rtx));
25321 const bool is_sync = is_mm_sync (model);
25322 rtx_code_label *label;
25323 rtx x;
25325 /* Split the atomic operation into a sequence. */
25326 label = gen_label_rtx ();
25327 emit_label (label);
25329 if (new_out)
25330 new_out = gen_lowpart (wmode, new_out);
25331 if (old_out)
25332 old_out = gen_lowpart (wmode, old_out);
25333 else
25334 old_out = new_out;
25335 value = simplify_gen_subreg (wmode, value, mode, 0);
25337 /* The initial load can be relaxed for a __sync operation since a final
25338 barrier will be emitted to stop code hoisting. */
25339 if (is_sync)
25340 aarch64_emit_load_exclusive (mode, old_out, mem,
25341 GEN_INT (MEMMODEL_RELAXED));
25342 else
25343 aarch64_emit_load_exclusive (mode, old_out, mem, model_rtx);
25345 switch (code)
25347 case SET:
25348 new_out = value;
25349 break;
25351 case NOT:
25352 x = gen_rtx_AND (wmode, old_out, value);
25353 emit_insn (gen_rtx_SET (new_out, x));
25354 x = gen_rtx_NOT (wmode, new_out);
25355 emit_insn (gen_rtx_SET (new_out, x));
25356 break;
25358 case MINUS:
25359 if (CONST_INT_P (value))
25361 value = GEN_INT (-UINTVAL (value));
25362 code = PLUS;
25364 /* Fall through. */
25366 default:
25367 x = gen_rtx_fmt_ee (code, wmode, old_out, value);
25368 emit_insn (gen_rtx_SET (new_out, x));
25369 break;
25372 aarch64_emit_store_exclusive (mode, cond, mem,
25373 gen_lowpart (mode, new_out), model_rtx);
25375 x = aarch64_gen_compare_zero_and_branch (NE, cond, label);
25376 aarch64_emit_unlikely_jump (x);
25378 /* Emit any final barrier needed for a __sync operation. */
25379 if (is_sync)
25380 aarch64_emit_post_barrier (model);
25383 static void
25384 aarch64_init_libfuncs (void)
25386 /* Half-precision float operations. The compiler handles all operations
25387 with NULL libfuncs by converting to SFmode. */
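/* For example, leaving add_optab NULL for HFmode below means an HFmode
   addition is performed by widening both operands to SFmode and truncating
   the result; the __gnu_h2f_ieee/__gnu_f2h_ieee routines registered below
   are only called when those conversions themselves need a libcall. */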
25389 /* Conversions. */
25390 set_conv_libfunc (trunc_optab, HFmode, SFmode, "__gnu_f2h_ieee");
25391 set_conv_libfunc (sext_optab, SFmode, HFmode, "__gnu_h2f_ieee");
25393 /* Arithmetic. */
25394 set_optab_libfunc (add_optab, HFmode, NULL);
25395 set_optab_libfunc (sdiv_optab, HFmode, NULL);
25396 set_optab_libfunc (smul_optab, HFmode, NULL);
25397 set_optab_libfunc (neg_optab, HFmode, NULL);
25398 set_optab_libfunc (sub_optab, HFmode, NULL);
25400 /* Comparisons. */
25401 set_optab_libfunc (eq_optab, HFmode, NULL);
25402 set_optab_libfunc (ne_optab, HFmode, NULL);
25403 set_optab_libfunc (lt_optab, HFmode, NULL);
25404 set_optab_libfunc (le_optab, HFmode, NULL);
25405 set_optab_libfunc (ge_optab, HFmode, NULL);
25406 set_optab_libfunc (gt_optab, HFmode, NULL);
25407 set_optab_libfunc (unord_optab, HFmode, NULL);
25410 /* Target hook for c_mode_for_suffix. */
25411 static machine_mode
25412 aarch64_c_mode_for_suffix (char suffix)
25414 if (suffix == 'q')
25415 return TFmode;
25417 return VOIDmode;
25420 /* Return true iff X can be represented by a quarter-precision
25421 floating-point immediate operand. Note, we cannot represent 0.0. */
25423 bool
25424 aarch64_float_const_representable_p (rtx x)
25426 x = unwrap_const_vec_duplicate (x);
25427 machine_mode mode = GET_MODE (x);
25428 if (!CONST_DOUBLE_P (x))
25429 return false;
25431 if ((mode == HFmode && !TARGET_FP_F16INST)
25432 || mode == BFmode)
25433 return false;
25435 REAL_VALUE_TYPE r = *CONST_DOUBLE_REAL_VALUE (x);
25437 return aarch64_real_float_const_representable_p (r);
25440 /* Returns the string with the instruction for the SIMD immediate
25441 CONST_VECTOR of the given WIDTH. WHICH selects a move, AND/BIC, ORR or EOR immediate. */
25442 char*
25443 aarch64_output_simd_imm (rtx const_vector, unsigned width,
25444 enum simd_immediate_check which)
25446 bool is_valid;
25447 static char templ[40];
25448 const char *mnemonic;
25449 const char *shift_op;
25450 unsigned int lane_count = 0;
25451 char element_char;
25453 struct simd_immediate_info info;
25455 is_valid = aarch64_simd_valid_imm (const_vector, &info, which);
25456 gcc_assert (is_valid);
25458 element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
25459 lane_count = width / GET_MODE_BITSIZE (info.elt_mode);
25461 if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
25463 gcc_assert (info.insn == simd_immediate_info::MOV
25464 && info.u.mov.shift == 0);
25465 /* For FP zero change it to a CONST_INT 0 and use the integer SIMD
25466 move immediate path. */
25467 if (aarch64_float_const_zero_rtx_p (info.u.mov.value))
25468 info.u.mov.value = GEN_INT (0);
25469 else
25471 const unsigned int buf_size = 20;
25472 char float_buf[buf_size] = {'\0'};
25473 real_to_decimal_for_mode (float_buf,
25474 CONST_DOUBLE_REAL_VALUE (info.u.mov.value),
25475 buf_size, buf_size, 1, info.elt_mode);
25477 if (lane_count == 1)
25478 snprintf (templ, sizeof (templ), "fmov\t%%d0, %s", float_buf);
25479 else
25480 snprintf (templ, sizeof (templ), "fmov\t%%0.%d%c, %s",
25481 lane_count, element_char, float_buf);
25482 return templ;
25486 gcc_assert (CONST_INT_P (info.u.mov.value));
25488 if (which == AARCH64_CHECK_MOV)
25490 if (info.insn == simd_immediate_info::INDEX)
25492 gcc_assert (TARGET_SVE);
25493 snprintf (templ, sizeof (templ), "index\t%%Z0.%c, #"
25494 HOST_WIDE_INT_PRINT_DEC ", #" HOST_WIDE_INT_PRINT_DEC,
25495 element_char, INTVAL (info.u.index.base),
25496 INTVAL (info.u.index.step));
25497 return templ;
25500 if (info.insn == simd_immediate_info::SVE_MOV)
25502 gcc_assert (TARGET_SVE);
25503 snprintf (templ, sizeof (templ), "mov\t%%Z0.%c, #" HOST_WIDE_INT_PRINT_DEC,
25504 element_char, INTVAL (info.u.mov.value));
25505 return templ;
25508 mnemonic = info.insn == simd_immediate_info::MVN ? "mvni" : "movi";
25509 shift_op = (info.u.mov.modifier == simd_immediate_info::MSL
25510 ? "msl" : "lsl");
25511 if (lane_count == 1)
25512 snprintf (templ, sizeof (templ), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX,
25513 mnemonic, UINTVAL (info.u.mov.value));
25514 else if (info.u.mov.shift)
25515 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
25516 HOST_WIDE_INT_PRINT_HEX ", %s %d", mnemonic, lane_count,
25517 element_char, UINTVAL (info.u.mov.value), shift_op,
25518 info.u.mov.shift);
25519 else
25520 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
25521 HOST_WIDE_INT_PRINT_HEX, mnemonic, lane_count,
25522 element_char, UINTVAL (info.u.mov.value));
25524 else
25526 /* AARCH64_CHECK_ORR, AARCH64_CHECK_AND or AARCH64_CHECK_XOR. */
25527 mnemonic = "orr";
25528 if (which == AARCH64_CHECK_AND)
25529 mnemonic = info.insn == simd_immediate_info::MVN ? "bic" : "and";
25530 else if (which == AARCH64_CHECK_XOR)
25531 mnemonic = "eor";
25533 if (info.insn == simd_immediate_info::SVE_MOV)
25535 gcc_assert (TARGET_SVE);
25536 snprintf (templ, sizeof (templ), "%s\t%%Z0.%c, %%Z0.%c, "
25537 HOST_WIDE_INT_PRINT_DEC, mnemonic, element_char,
25538 element_char, INTVAL (info.u.mov.value));
25540 else if (info.u.mov.shift)
25541 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
25542 HOST_WIDE_INT_PRINT_DEC ", %s #%d", mnemonic, lane_count,
25543 element_char, UINTVAL (info.u.mov.value), "lsl",
25544 info.u.mov.shift);
25545 else
25546 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
25547 HOST_WIDE_INT_PRINT_DEC, mnemonic, lane_count,
25548 element_char, UINTVAL (info.u.mov.value));
25550 return templ;
25553 /* Returns the string with the ORR instruction for the SIMD immediate
25554 CONST_VECTOR of WIDTH bits. */
25555 char*
25556 aarch64_output_simd_orr_imm (rtx const_vector, unsigned width)
25558 return aarch64_output_simd_imm (const_vector, width, AARCH64_CHECK_ORR);
25561 /* Returns the string with the AND/BIC instruction for the SIMD immediate
25562 CONST_VECTOR of WIDTH bits. */
25563 char*
25564 aarch64_output_simd_and_imm (rtx const_vector, unsigned width)
25566 return aarch64_output_simd_imm (const_vector, width, AARCH64_CHECK_AND);
25569 /* Returns the string with the EOR instruction for the SIMD immediate
25570 CONST_VECTOR of WIDTH bits. */
25571 char*
25572 aarch64_output_simd_xor_imm (rtx const_vector, unsigned width)
25574 return aarch64_output_simd_imm (const_vector, width, AARCH64_CHECK_XOR);
25577 /* Returns the string with the MOV instruction for the SIMD immediate
25578 CONST_VECTOR of WIDTH bits. */
25579 char*
25580 aarch64_output_simd_mov_imm (rtx const_vector, unsigned width)
25582 return aarch64_output_simd_imm (const_vector, width, AARCH64_CHECK_MOV);
25585 char*
25586 aarch64_output_scalar_simd_mov_immediate (rtx immediate, scalar_int_mode mode)
25589 /* If a floating-point number was passed and we want to use it in an
25590 integer mode, do the conversion to integer. */
25591 if (CONST_DOUBLE_P (immediate) && GET_MODE_CLASS (mode) == MODE_INT)
25593 unsigned HOST_WIDE_INT ival;
25594 if (!aarch64_reinterpret_float_as_int (immediate, &ival))
25595 gcc_unreachable ();
25596 immediate = gen_int_mode (ival, mode);
25599 machine_mode vmode;
25600 /* Use a 64-bit vector mode for everything except DI/DF/DD mode, where we
25601 use a 128-bit vector mode. */
25602 int width = GET_MODE_BITSIZE (mode) == 64 ? 128 : 64;
25604 vmode = aarch64_simd_container_mode (mode, width);
25605 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (immediate));
25606 return aarch64_output_simd_mov_imm (v_op, width);
25609 /* Return the output string to use for moving immediate CONST_VECTOR
25610 into an SVE register. */
25612 char *
25613 aarch64_output_sve_mov_immediate (rtx const_vector)
25615 static char templ[40];
25616 struct simd_immediate_info info;
25617 char element_char;
25618 bool is_valid;
25620 is_valid = aarch64_simd_valid_imm (const_vector, &info, AARCH64_CHECK_MOV);
25621 gcc_assert (is_valid);
25623 element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
25625 machine_mode vec_mode = GET_MODE (const_vector);
25626 if (aarch64_sve_pred_mode_p (vec_mode))
25628 static char buf[sizeof ("ptrue\t%0.N, vlNNNNN")];
25629 if (info.insn == simd_immediate_info::MOV)
25631 gcc_assert (info.u.mov.value == const0_rtx);
25632 snprintf (buf, sizeof (buf), "pfalse\t%%0.b");
25634 else
25636 gcc_assert (info.insn == simd_immediate_info::PTRUE);
25637 unsigned int total_bytes;
25638 if (info.u.pattern == AARCH64_SV_ALL
25639 && BYTES_PER_SVE_VECTOR.is_constant (&total_bytes))
25640 snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, vl%d", element_char,
25641 total_bytes / GET_MODE_SIZE (info.elt_mode));
25642 else
25643 snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, %s", element_char,
25644 svpattern_token (info.u.pattern));
25646 return buf;
25649 if (info.insn == simd_immediate_info::INDEX)
25651 snprintf (templ, sizeof (templ), "index\t%%0.%c, #"
25652 HOST_WIDE_INT_PRINT_DEC ", #" HOST_WIDE_INT_PRINT_DEC,
25653 element_char, INTVAL (info.u.index.base),
25654 INTVAL (info.u.index.step));
25655 return templ;
25658 if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
25660 if (aarch64_float_const_zero_rtx_p (info.u.mov.value))
25661 info.u.mov.value = GEN_INT (0);
25662 else
25664 const int buf_size = 20;
25665 char float_buf[buf_size] = {};
25666 real_to_decimal_for_mode (float_buf,
25667 CONST_DOUBLE_REAL_VALUE (info.u.mov.value),
25668 buf_size, buf_size, 1, info.elt_mode);
25670 snprintf (templ, sizeof (templ), "fmov\t%%0.%c, #%s",
25671 element_char, float_buf);
25672 return templ;
25676 if (info.u.mov.value == const0_rtx && TARGET_NON_STREAMING)
25677 snprintf (templ, sizeof (templ), "movi\t%%d0, #0");
25678 else
25679 snprintf (templ, sizeof (templ), "mov\t%%0.%c, #" HOST_WIDE_INT_PRINT_DEC,
25680 element_char, INTVAL (info.u.mov.value));
25681 return templ;
25684 /* Return the asm template for a PTRUES. CONST_UNSPEC is the
25685 aarch64_sve_ptrue_svpattern_immediate that describes the predicate
25686 pattern. */
25688 char *
25689 aarch64_output_sve_ptrues (rtx const_unspec)
25691 static char templ[40];
25692 struct simd_immediate_info info;
25693 bool is_valid;
25695 is_valid = aarch64_simd_valid_imm (const_unspec, &info, AARCH64_CHECK_MOV);
25696 gcc_assert (is_valid && info.insn == simd_immediate_info::PTRUE);
25698 char element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
25699 snprintf (templ, sizeof (templ), "ptrues\t%%0.%c, %s", element_char,
25700 svpattern_token (info.u.pattern));
25701 return templ;
25704 /* Split operands into moves from op[1] + op[2] into op[0]. */
25706 void
25707 aarch64_split_combinev16qi (rtx operands[3])
25709 machine_mode halfmode = GET_MODE (operands[1]);
25711 gcc_assert (halfmode == V16QImode);
25713 rtx destlo = simplify_gen_subreg (halfmode, operands[0],
25714 GET_MODE (operands[0]), 0);
25715 rtx desthi = simplify_gen_subreg (halfmode, operands[0],
25716 GET_MODE (operands[0]),
25717 GET_MODE_SIZE (halfmode));
25719 bool skiplo = rtx_equal_p (destlo, operands[1]);
25720 bool skiphi = rtx_equal_p (desthi, operands[2]);
25722 if (skiplo && skiphi)
25724 /* No-op move. Can't split to nothing; emit something. */
25725 emit_note (NOTE_INSN_DELETED);
25726 return;
25729 /* Special case of reversed high/low parts. */
25730 if (reg_overlap_mentioned_p (operands[2], destlo)
25731 && reg_overlap_mentioned_p (operands[1], desthi))
25733 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
25734 emit_insn (gen_xorv16qi3 (operands[2], operands[1], operands[2]));
25735 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
25737 else if (!reg_overlap_mentioned_p (operands[2], destlo))
25739 /* Try to avoid unnecessary moves if part of the result
25740 is in the right place already. */
25741 if (!skiplo)
25742 emit_move_insn (destlo, operands[1]);
25743 if (!skiphi)
25744 emit_move_insn (desthi, operands[2]);
25746 else
25748 if (!skiphi)
25749 emit_move_insn (desthi, operands[2]);
25750 if (!skiplo)
25751 emit_move_insn (destlo, operands[1]);
25755 /* vec_perm support. */
25757 struct expand_vec_perm_d
25759 rtx target, op0, op1;
25760 vec_perm_indices perm;
25761 machine_mode vmode;
25762 machine_mode op_mode;
25763 unsigned int vec_flags;
25764 unsigned int op_vec_flags;
25765 bool one_vector_p;
25766 bool zero_op0_p, zero_op1_p;
25767 bool testing_p;
25770 static bool aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d);
25772 /* Generate a variable permutation. */
25774 static void
25775 aarch64_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel)
25777 machine_mode vmode = GET_MODE (target);
25778 bool one_vector_p = rtx_equal_p (op0, op1);
25780 gcc_checking_assert (vmode == V8QImode || vmode == V16QImode);
25781 gcc_checking_assert (GET_MODE (op0) == vmode);
25782 gcc_checking_assert (GET_MODE (op1) == vmode);
25783 gcc_checking_assert (GET_MODE (sel) == vmode);
25784 gcc_checking_assert (TARGET_SIMD);
25786 if (one_vector_p)
25788 if (vmode == V8QImode)
25790 /* Expand the argument to a V16QI mode by duplicating it. */
25791 rtx pair = gen_reg_rtx (V16QImode);
25792 emit_insn (gen_aarch64_combinev8qi (pair, op0, op0));
25793 emit_insn (gen_aarch64_qtbl1v8qi (target, pair, sel));
25795 else
25797 emit_insn (gen_aarch64_qtbl1v16qi (target, op0, sel));
25800 else
25802 rtx pair;
25804 if (vmode == V8QImode)
25806 pair = gen_reg_rtx (V16QImode);
25807 emit_insn (gen_aarch64_combinev8qi (pair, op0, op1));
25808 emit_insn (gen_aarch64_qtbl1v8qi (target, pair, sel));
25810 else
25812 pair = gen_reg_rtx (V2x16QImode);
25813 emit_insn (gen_aarch64_combinev16qi (pair, op0, op1));
25814 emit_insn (gen_aarch64_qtbl2v16qi (target, pair, sel));
25819 /* Expand a vec_perm with the operands given by TARGET, OP0, OP1 and SEL.
25820 NELT is the number of elements in the vector. */
25822 void
25823 aarch64_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel,
25824 unsigned int nelt)
25826 machine_mode vmode = GET_MODE (target);
25827 bool one_vector_p = rtx_equal_p (op0, op1);
25828 rtx mask;
25830 /* The TBL instruction does not use a modulo index, so we must take care
25831 of that ourselves. */
25832 mask = aarch64_simd_gen_const_vector_dup (vmode,
25833 one_vector_p ? nelt - 1 : 2 * nelt - 1);
25834 sel = expand_simple_binop (vmode, AND, sel, mask, NULL, 0, OPTAB_LIB_WIDEN);
25836 /* For big-endian, we also need to reverse the index within the vector
25837 (but not which vector). */
25838 if (BYTES_BIG_ENDIAN)
25840 /* If one_vector_p, mask is a vector of (nelt - 1)'s already. */
25841 if (!one_vector_p)
25842 mask = aarch64_simd_gen_const_vector_dup (vmode, nelt - 1);
25843 sel = expand_simple_binop (vmode, XOR, sel, mask,
25844 NULL, 0, OPTAB_LIB_WIDEN);
25846 aarch64_expand_vec_perm_1 (target, op0, op1, sel);
25849 /* Generate (set TARGET (unspec [OP0 OP1] CODE)). */
25851 static void
25852 emit_unspec2 (rtx target, int code, rtx op0, rtx op1)
25854 emit_insn (gen_rtx_SET (target,
25855 gen_rtx_UNSPEC (GET_MODE (target),
25856 gen_rtvec (2, op0, op1), code)));
25859 /* Expand an SVE vec_perm with the given operands. */
25861 void
25862 aarch64_expand_sve_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
25864 machine_mode data_mode = GET_MODE (target);
25865 machine_mode sel_mode = GET_MODE (sel);
25866 /* Enforced by the pattern condition. */
25867 int nunits = GET_MODE_NUNITS (sel_mode).to_constant ();
25869 /* Note: vec_perm indices are supposed to wrap when they go beyond the
25870 size of the two value vectors, i.e. the upper bits of the indices
25871 are effectively ignored. SVE TBL instead produces 0 for any
25872 out-of-range indices, so we need to modulo all the vec_perm indices
25873 to ensure they are all in range. */
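/* For example, with two 4-element input vectors the general case below
   masks the selector with { 7, ..., 7 }, permutes OP0 with one TBL,
   permutes OP1 with a second TBL on (SEL - 4) so that indices 4-7 select
   from OP1 while the remaining indices go out of range and yield zero,
   and finally ORs the two partial results together. */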
25874 rtx sel_reg = force_reg (sel_mode, sel);
25876 /* Check if the sel only references the first values vector. */
25877 if (CONST_VECTOR_P (sel)
25878 && aarch64_const_vec_all_in_range_p (sel, 0, nunits - 1))
25880 emit_unspec2 (target, UNSPEC_TBL, op0, sel_reg);
25881 return;
25884 /* Check if the two values vectors are the same. */
25885 if (rtx_equal_p (op0, op1))
25887 rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode, nunits - 1);
25888 rtx sel_mod = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
25889 NULL, 0, OPTAB_DIRECT);
25890 emit_unspec2 (target, UNSPEC_TBL, op0, sel_mod);
25891 return;
25894 /* Run TBL for each value vector and combine the results. */
25896 rtx res0 = gen_reg_rtx (data_mode);
25897 rtx res1 = gen_reg_rtx (data_mode);
25898 rtx neg_num_elems = aarch64_simd_gen_const_vector_dup (sel_mode, -nunits);
25899 if (!CONST_VECTOR_P (sel)
25900 || !aarch64_const_vec_all_in_range_p (sel, 0, 2 * nunits - 1))
25902 rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode,
25903 2 * nunits - 1);
25904 sel_reg = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
25905 NULL, 0, OPTAB_DIRECT);
25907 emit_unspec2 (res0, UNSPEC_TBL, op0, sel_reg);
25908 rtx sel_sub = expand_simple_binop (sel_mode, PLUS, sel_reg, neg_num_elems,
25909 NULL, 0, OPTAB_DIRECT);
25910 emit_unspec2 (res1, UNSPEC_TBL, op1, sel_sub);
25911 if (GET_MODE_CLASS (data_mode) == MODE_VECTOR_INT)
25912 emit_insn (gen_rtx_SET (target, gen_rtx_IOR (data_mode, res0, res1)));
25913 else
25914 emit_unspec2 (target, UNSPEC_IORF, res0, res1);
25917 /* Recognize patterns suitable for the TRN instructions. */
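/* For example, with V4SI inputs { a0, a1, a2, a3 } and { b0, b1, b2, b3 },
   the index vector { 0, 4, 2, 6 } selects { a0, b0, a2, b2 } and maps to
   TRN1, while { 1, 5, 3, 7 } maps to TRN2 (little-endian view). */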
25918 static bool
25919 aarch64_evpc_trn (struct expand_vec_perm_d *d)
25921 HOST_WIDE_INT odd;
25922 poly_uint64 nelt = d->perm.length ();
25923 rtx out, in0, in1;
25924 machine_mode vmode = d->vmode;
25926 if (GET_MODE_UNIT_SIZE (vmode) > 8)
25927 return false;
25929 /* Note that these are little-endian tests.
25930 We correct for big-endian later. */
25931 if (!d->perm[0].is_constant (&odd)
25932 || (odd != 0 && odd != 1)
25933 || !d->perm.series_p (0, 2, odd, 2)
25934 || !d->perm.series_p (1, 2, nelt + odd, 2))
25935 return false;
25937 /* Success! */
25938 if (d->testing_p)
25939 return true;
25941 in0 = d->op0;
25942 in1 = d->op1;
25943 /* We don't need a big-endian lane correction for SVE; see the comment
25944 at the head of aarch64-sve.md for details. */
25945 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
25947 std::swap (in0, in1);
25948 odd = !odd;
25950 out = d->target;
25952 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
25953 odd ? UNSPEC_TRN2 : UNSPEC_TRN1));
25954 return true;
25957 /* Try to re-encode the PERM constant so it combines odd and even elements.
25958 This rewrites constants such as {0, 1, 4, 5}/V4SF to {0, 2}/V2DI.
25959 We retry with this new constant with the full suite of patterns. */
25960 static bool
25961 aarch64_evpc_reencode (struct expand_vec_perm_d *d)
25963 expand_vec_perm_d newd;
25965 /* The subregs that we'd create are not supported for big-endian SVE;
25966 see aarch64_modes_compatible_p for details. */
25967 if (BYTES_BIG_ENDIAN && (d->vec_flags & VEC_ANY_SVE))
25968 return false;
25970 /* Get the new mode. Always twice the size of the inner
25971 and half the elements. */
25972 machine_mode new_mode;
25973 if (!aarch64_coalesce_units (d->vmode, 2).exists (&new_mode))
25974 return false;
25976 vec_perm_indices newpermindices;
25977 if (!newpermindices.new_shrunk_vector (d->perm, 2))
25978 return false;
25980 newd.vmode = new_mode;
25981 newd.vec_flags = d->vec_flags;
25982 newd.op_mode = newd.vmode;
25983 newd.op_vec_flags = newd.vec_flags;
25984 newd.target = d->target ? gen_lowpart (new_mode, d->target) : NULL;
25985 newd.op0 = d->op0 ? gen_lowpart (new_mode, d->op0) : NULL;
25986 newd.op1 = d->op1 ? gen_lowpart (new_mode, d->op1) : NULL;
25987 newd.testing_p = d->testing_p;
25988 newd.one_vector_p = d->one_vector_p;
25990 newd.perm.new_vector (newpermindices.encoding (), newd.one_vector_p ? 1 : 2,
25991 newpermindices.nelts_per_input ());
25992 return aarch64_expand_vec_perm_const_1 (&newd);
25995 /* Recognize patterns suitable for the UZP instructions. */
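/* For example, with V4SI inputs { a0, a1, a2, a3 } and { b0, b1, b2, b3 },
   { 0, 2, 4, 6 } selects the even elements { a0, a2, b0, b2 } and maps to
   UZP1, while { 1, 3, 5, 7 } maps to UZP2 (little-endian view). */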
25996 static bool
25997 aarch64_evpc_uzp (struct expand_vec_perm_d *d)
25999 HOST_WIDE_INT odd;
26000 rtx out, in0, in1;
26001 machine_mode vmode = d->vmode;
26003 if (GET_MODE_UNIT_SIZE (vmode) > 8)
26004 return false;
26006 /* Note that these are little-endian tests.
26007 We correct for big-endian later. */
26008 if (!d->perm[0].is_constant (&odd)
26009 || (odd != 0 && odd != 1)
26010 || !d->perm.series_p (0, 1, odd, 2))
26011 return false;
26013 /* Success! */
26014 if (d->testing_p)
26015 return true;
26017 in0 = d->op0;
26018 in1 = d->op1;
26019 /* We don't need a big-endian lane correction for SVE; see the comment
26020 at the head of aarch64-sve.md for details. */
26021 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
26023 std::swap (in0, in1);
26024 odd = !odd;
26026 out = d->target;
26028 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
26029 odd ? UNSPEC_UZP2 : UNSPEC_UZP1));
26030 return true;
26033 /* Recognize patterns suitable for the ZIP instructions. */
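/* For example, with V4SI inputs { a0, a1, a2, a3 } and { b0, b1, b2, b3 },
   { 0, 4, 1, 5 } gives { a0, b0, a1, b1 } and maps to ZIP1, while
   { 2, 6, 3, 7 } gives { a2, b2, a3, b3 } and maps to ZIP2. */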
26034 static bool
26035 aarch64_evpc_zip (struct expand_vec_perm_d *d)
26037 unsigned int high;
26038 poly_uint64 nelt = d->perm.length ();
26039 rtx out, in0, in1;
26040 machine_mode vmode = d->vmode;
26042 if (GET_MODE_UNIT_SIZE (vmode) > 8)
26043 return false;
26045 /* Note that these are little-endian tests.
26046 We correct for big-endian later. */
26047 poly_uint64 first = d->perm[0];
26048 if ((maybe_ne (first, 0U) && maybe_ne (first * 2, nelt))
26049 || !d->perm.series_p (0, 2, first, 1)
26050 || !d->perm.series_p (1, 2, first + nelt, 1))
26051 return false;
26052 high = maybe_ne (first, 0U);
26054 /* Success! */
26055 if (d->testing_p)
26056 return true;
26058 in0 = d->op0;
26059 in1 = d->op1;
26060 /* We don't need a big-endian lane correction for SVE; see the comment
26061 at the head of aarch64-sve.md for details. */
26062 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
26064 std::swap (in0, in1);
26065 high = !high;
26067 out = d->target;
26069 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
26070 high ? UNSPEC_ZIP2 : UNSPEC_ZIP1));
26071 return true;
26074 /* Recognize patterns for the EXT insn. */
26076 static bool
26077 aarch64_evpc_ext (struct expand_vec_perm_d *d)
26079 HOST_WIDE_INT location;
26080 rtx offset;
26082 /* The first element always refers to the first vector.
26083 Check if the extracted indices are increasing by one. */
26084 if ((d->vec_flags & VEC_SVE_PRED)
26085 || !d->perm[0].is_constant (&location)
26086 || !d->perm.series_p (0, 1, location, 1))
26087 return false;
26089 /* Success! */
26090 if (d->testing_p)
26091 return true;
26093 /* The case where (location == 0) is a no-op for both big- and little-endian,
26094 and is removed by the mid-end at optimization levels -O1 and higher.
26096 We don't need a big-endian lane correction for SVE; see the comment
26097 at the head of aarch64-sve.md for details. */
26098 if (BYTES_BIG_ENDIAN && location != 0 && d->vec_flags == VEC_ADVSIMD)
26100 /* After setup, we want the high elements of the first vector (stored
26101 at the LSB end of the register), and the low elements of the second
26102 vector (stored at the MSB end of the register). So swap. */
26103 std::swap (d->op0, d->op1);
26104 /* location != 0 (above), so safe to assume (nelt - location) < nelt.
26105 to_constant () is safe since this is restricted to Advanced SIMD
26106 vectors. */
26107 location = d->perm.length ().to_constant () - location;
26110 offset = GEN_INT (location);
26111 emit_set_insn (d->target,
26112 gen_rtx_UNSPEC (d->vmode,
26113 gen_rtvec (3, d->op0, d->op1, offset),
26114 UNSPEC_EXT));
26115 return true;
26118 /* Recognize patterns for the REV{64,32,16} insns, which reverse elements
26119 within each 64-bit, 32-bit or 16-bit granule. */
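/* For example, on V8HI the index vector { 3, 2, 1, 0, 7, 6, 5, 4 } reverses
   the 16-bit elements within each 64-bit chunk and so maps to REV64. */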
26121 static bool
26122 aarch64_evpc_rev_local (struct expand_vec_perm_d *d)
26124 HOST_WIDE_INT diff;
26125 unsigned int i, size, unspec;
26126 machine_mode pred_mode;
26128 if ((d->vec_flags & VEC_SVE_PRED)
26129 || !d->one_vector_p
26130 || !d->perm[0].is_constant (&diff)
26131 || !diff)
26132 return false;
26134 if (d->vec_flags & VEC_SVE_DATA)
26135 size = (diff + 1) * aarch64_sve_container_bits (d->vmode);
26136 else
26137 size = (diff + 1) * GET_MODE_UNIT_BITSIZE (d->vmode);
26138 if (size == 64)
26140 unspec = UNSPEC_REV64;
26141 pred_mode = VNx2BImode;
26143 else if (size == 32)
26145 unspec = UNSPEC_REV32;
26146 pred_mode = VNx4BImode;
26148 else if (size == 16)
26150 unspec = UNSPEC_REV16;
26151 pred_mode = VNx8BImode;
26153 else
26154 return false;
26156 unsigned int step = diff + 1;
26157 for (i = 0; i < step; ++i)
26158 if (!d->perm.series_p (i, step, diff - i, step))
26159 return false;
26161 /* Success! */
26162 if (d->testing_p)
26163 return true;
26165 if (d->vec_flags & VEC_SVE_DATA)
26167 rtx pred = aarch64_ptrue_reg (pred_mode);
26168 emit_insn (gen_aarch64_sve_revbhw (d->vmode, pred_mode,
26169 d->target, pred, d->op0));
26170 return true;
26172 rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), unspec);
26173 emit_set_insn (d->target, src);
26174 return true;
26177 /* Recognize patterns for the REV insn, which reverses elements within
26178 a full vector. */
26180 static bool
26181 aarch64_evpc_rev_global (struct expand_vec_perm_d *d)
26183 poly_uint64 nelt = d->perm.length ();
26185 if (!d->one_vector_p || d->vec_flags == VEC_ADVSIMD)
26186 return false;
26188 if (!d->perm.series_p (0, 1, nelt - 1, -1))
26189 return false;
26191 /* Success! */
26192 if (d->testing_p)
26193 return true;
26195 rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), UNSPEC_REV);
26196 emit_set_insn (d->target, src);
26197 return true;
26200 static bool
26201 aarch64_evpc_dup (struct expand_vec_perm_d *d)
26203 rtx out = d->target;
26204 rtx in0;
26205 HOST_WIDE_INT elt;
26206 machine_mode vmode = d->vmode;
26207 rtx lane;
26209 if ((d->vec_flags & VEC_SVE_PRED)
26210 || d->perm.encoding ().encoded_nelts () != 1
26211 || !d->perm[0].is_constant (&elt))
26212 return false;
26214 if ((d->vec_flags & VEC_SVE_DATA)
26215 && elt * (aarch64_sve_container_bits (vmode) / 8) >= 64)
26216 return false;
26218 /* Success! */
26219 if (d->testing_p)
26220 return true;
26222 /* The generic preparation in aarch64_expand_vec_perm_const_1
26223 swaps the operand order and the permute indices if it finds
26224 d->perm[0] to be in the second operand. Thus, we can always
26225 use d->op0 and need not do any extra arithmetic to get the
26226 correct lane number. */
26227 in0 = d->op0;
26228 lane = GEN_INT (elt); /* The pattern corrects for big-endian. */
26230 rtx parallel = gen_rtx_PARALLEL (vmode, gen_rtvec (1, lane));
26231 rtx select = gen_rtx_VEC_SELECT (GET_MODE_INNER (vmode), in0, parallel);
26232 emit_set_insn (out, gen_rtx_VEC_DUPLICATE (vmode, select));
26233 return true;
26236 /* Recognize things that can be done using the SVE2p1 Hybrid-VLA
26237 permutations, which apply Advanced-SIMD-style permutations to each
26238 individual 128-bit block. */
26240 static bool
26241 aarch64_evpc_hvla (struct expand_vec_perm_d *d)
26243 machine_mode vmode = d->vmode;
26244 if (!TARGET_SVE2p1
26245 || !TARGET_NON_STREAMING
26246 || BYTES_BIG_ENDIAN
26247 || d->vec_flags != VEC_SVE_DATA
26248 || GET_MODE_UNIT_BITSIZE (vmode) > 64)
26249 return false;
26251 /* Set SUBELTS to the number of elements in an Advanced SIMD vector
26252 and make sure that adding SUBELTS to each block of SUBELTS indices
26253 gives the next block of SUBELTS indices. That is, it must be possible
26254 to interpret the index vector as SUBELTS interleaved linear series in
26255 which each series has step SUBELTS. */
26256 unsigned int subelts = 128U / GET_MODE_UNIT_BITSIZE (vmode);
26257 unsigned int pairs = subelts / 2;
26258 for (unsigned int i = 0; i < subelts; ++i)
26259 if (!d->perm.series_p (i, subelts, d->perm[i], subelts))
26260 return false;
26262 /* Used once we have verified that we can use UNSPEC to do the operation. */
26263 auto use_binary = [&](int unspec) -> bool
26265 if (!d->testing_p)
26267 rtvec vec = gen_rtvec (2, d->op0, d->op1);
26268 emit_set_insn (d->target, gen_rtx_UNSPEC (vmode, vec, unspec));
26270 return true;
26273 /* Now check whether the first SUBELTS elements match a supported
26274 Advanced-SIMD-style operation. */
26275 poly_int64 first = d->perm[0];
26276 poly_int64 nelt = d->perm.length ();
26277 auto try_zip = [&]() -> bool
26279 if (maybe_ne (first, 0) && maybe_ne (first, pairs))
26280 return false;
26281 for (unsigned int i = 0; i < pairs; ++i)
26282 if (maybe_ne (d->perm[i * 2], first + i)
26283 || maybe_ne (d->perm[i * 2 + 1], first + nelt + i))
26284 return false;
26285 return use_binary (maybe_ne (first, 0) ? UNSPEC_ZIPQ2 : UNSPEC_ZIPQ1);
26287 auto try_uzp = [&]() -> bool
26289 if (maybe_ne (first, 0) && maybe_ne (first, 1))
26290 return false;
26291 for (unsigned int i = 0; i < pairs; ++i)
26292 if (maybe_ne (d->perm[i], first + i * 2)
26293 || maybe_ne (d->perm[i + pairs], first + nelt + i * 2))
26294 return false;
26295 return use_binary (maybe_ne (first, 0) ? UNSPEC_UZPQ2 : UNSPEC_UZPQ1);
26297 auto try_extq = [&]() -> bool
26299 HOST_WIDE_INT start;
26300 if (!first.is_constant (&start) || !IN_RANGE (start, 0, subelts - 1))
26301 return false;
26302 for (unsigned int i = 0; i < subelts; ++i)
26304 poly_int64 next = (start + i >= subelts
26305 ? start + i - subelts + nelt
26306 : start + i);
26307 if (maybe_ne (d->perm[i], next))
26308 return false;
26310 if (!d->testing_p)
26312 rtx op2 = gen_int_mode (start, SImode);
26313 emit_insn (gen_aarch64_sve_extq (vmode, d->target,
26314 d->op0, d->op1, op2));
26316 return true;
26318 auto try_dupq = [&]() -> bool
26320 HOST_WIDE_INT start;
26321 if (!first.is_constant (&start) || !IN_RANGE (start, 0, subelts - 1))
26322 return false;
26323 for (unsigned int i = 0; i < subelts; ++i)
26324 if (maybe_ne (d->perm[i], start))
26325 return false;
26326 if (!d->testing_p)
26328 rtx op1 = gen_int_mode (start, SImode);
26329 emit_insn (gen_aarch64_sve_dupq (vmode, d->target, d->op0, op1));
26331 return true;
26334 return try_zip () || try_uzp () || try_extq () || try_dupq ();
26337 static bool
26338 aarch64_evpc_tbl (struct expand_vec_perm_d *d)
26340 rtx rperm[MAX_COMPILE_TIME_VEC_BYTES], sel;
26341 machine_mode vmode = d->vmode;
26343 /* Make sure that the indices are constant. */
26344 unsigned int encoded_nelts = d->perm.encoding ().encoded_nelts ();
26345 for (unsigned int i = 0; i < encoded_nelts; ++i)
26346 if (!d->perm[i].is_constant ())
26347 return false;
26349 if (d->testing_p)
26350 return true;
26352 /* Generic code will try constant permutation twice: once with the
26353 original mode and again with the elements lowered to QImode.
26354 So wait and don't do the selector expansion ourselves. */
26355 if (vmode != V8QImode && vmode != V16QImode)
26356 return false;
26358 /* to_constant is safe since this routine is specific to Advanced SIMD
26359 vectors. */
26360 unsigned int nelt = d->perm.length ().to_constant ();
26362 /* If one register is the constant zero vector then we only need
26363 a single-register TBL and we map any access to the zero vector to -1. We can't
26364 do this earlier since vec_perm_indices clamps elements to within range, so
26365 we can only do it during codegen. */
26366 if (d->zero_op0_p)
26367 d->op0 = d->op1;
26368 else if (d->zero_op1_p)
26369 d->op1 = d->op0;
26371 for (unsigned int i = 0; i < nelt; ++i)
26373 auto val = d->perm[i].to_constant ();
26375 /* If we're selecting from a 0 vector, we can just use an out of range
26376 index instead. */
26377 if ((d->zero_op0_p && val < nelt) || (d->zero_op1_p && val >= nelt))
26378 rperm[i] = constm1_rtx;
26379 else
26381 /* If we are remapping a zero register as the first parameter we need
26382 to adjust the indices of the non-zero register. */
26383 if (d->zero_op0_p)
26384 val = val % nelt;
26386 /* If big-endian and two vectors we end up with a weird mixed-endian
26387 mode on NEON. Reverse the index within each word but not the word
26388 itself. to_constant is safe because we checked is_constant
26389 above. */
26390 rperm[i] = GEN_INT (BYTES_BIG_ENDIAN ? val ^ (nelt - 1) : val);
26394 sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
26395 sel = force_reg (vmode, sel);
26397 aarch64_expand_vec_perm_1 (d->target, d->op0, d->op1, sel);
26398 return true;
26401 /* Try to implement D using an SVE TBL instruction. */
26403 static bool
26404 aarch64_evpc_sve_tbl (struct expand_vec_perm_d *d)
26406 unsigned HOST_WIDE_INT nelt;
26408 /* Permuting two variable-length vectors could overflow the
26409 index range. */
26410 if (!d->one_vector_p && !d->perm.length ().is_constant (&nelt))
26411 return false;
26413 if (d->testing_p)
26414 return true;
26416 machine_mode sel_mode = related_int_vector_mode (d->vmode).require ();
26417 rtx sel = vec_perm_indices_to_rtx (sel_mode, d->perm);
26418 if (d->one_vector_p)
26419 emit_unspec2 (d->target, UNSPEC_TBL, d->op0, force_reg (sel_mode, sel));
26420 else
26421 aarch64_expand_sve_vec_perm (d->target, d->op0, d->op1, sel);
26422 return true;
26425 /* Try to implement D using SVE dup instruction. */
26427 static bool
26428 aarch64_evpc_sve_dup (struct expand_vec_perm_d *d)
26430 if (BYTES_BIG_ENDIAN
26431 || !d->one_vector_p
26432 || d->vec_flags != VEC_SVE_DATA
26433 || d->op_vec_flags != VEC_ADVSIMD
26434 || d->perm.encoding ().nelts_per_pattern () != 1
26435 || !known_eq (d->perm.encoding ().npatterns (),
26436 GET_MODE_NUNITS (d->op_mode))
26437 || !known_eq (GET_MODE_BITSIZE (d->op_mode), 128))
26438 return false;
26440 int npatterns = d->perm.encoding ().npatterns ();
26441 for (int i = 0; i < npatterns; i++)
26442 if (!known_eq (d->perm[i], i))
26443 return false;
26445 if (d->testing_p)
26446 return true;
26448 aarch64_expand_sve_dupq (d->target, GET_MODE (d->target), d->op0);
26449 return true;
26452 /* Try to implement D using SVE SEL instruction. */
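/* For example, a permutation such as { 0, N+1, 2, N+3, ... } (where N is
   the number of elements) takes every lane from either OP0 or OP1 without
   moving it, so it can be implemented as a predicated SEL whose predicate
   is true exactly for the OP0 lanes. */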
26454 static bool
26455 aarch64_evpc_sel (struct expand_vec_perm_d *d)
26457 machine_mode vmode = d->vmode;
26458 int unit_size = GET_MODE_UNIT_SIZE (vmode);
26460 if (d->vec_flags != VEC_SVE_DATA
26461 || unit_size > 8)
26462 return false;
26464 int n_patterns = d->perm.encoding ().npatterns ();
26465 poly_int64 vec_len = d->perm.length ();
26467 for (int i = 0; i < n_patterns; ++i)
26468 if (!known_eq (d->perm[i], i)
26469 && !known_eq (d->perm[i], vec_len + i))
26470 return false;
26472 for (int i = n_patterns; i < n_patterns * 2; i++)
26473 if (!d->perm.series_p (i, n_patterns, i, n_patterns)
26474 && !d->perm.series_p (i, n_patterns, vec_len + i, n_patterns))
26475 return false;
26477 if (d->testing_p)
26478 return true;
26480 machine_mode pred_mode = aarch64_sve_pred_mode (vmode);
26482 /* Build a predicate that is true when op0 elements should be used. */
26483 rtx_vector_builder builder (pred_mode, n_patterns, 2);
26484 for (int i = 0; i < n_patterns * 2; i++)
26486 rtx elem = known_eq (d->perm[i], i) ? CONST1_RTX (BImode)
26487 : CONST0_RTX (BImode);
26488 builder.quick_push (elem);
26491 rtx const_vec = builder.build ();
26492 rtx pred = force_reg (pred_mode, const_vec);
26493 /* TARGET = PRED ? OP0 : OP1. */
26494 emit_insn (gen_vcond_mask (vmode, vmode, d->target, d->op0, d->op1, pred));
26495 return true;
26498 /* Recognize patterns suitable for the INS instructions. */
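/* For example, the V4SI permutation { 0, 1, 6, 3 } copies OP0 except for
   element 2, which is taken from element 2 of OP1, and therefore maps to a
   single INS (vec_copy_lane). */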
26499 static bool
26500 aarch64_evpc_ins (struct expand_vec_perm_d *d)
26502 machine_mode mode = d->vmode;
26503 unsigned HOST_WIDE_INT nelt;
26505 if (d->vec_flags != VEC_ADVSIMD)
26506 return false;
26508 /* to_constant is safe since this routine is specific to Advanced SIMD
26509 vectors. */
26510 nelt = d->perm.length ().to_constant ();
26511 rtx insv = d->op0;
26513 HOST_WIDE_INT idx = -1;
26515 for (unsigned HOST_WIDE_INT i = 0; i < nelt; i++)
26517 HOST_WIDE_INT elt;
26518 if (!d->perm[i].is_constant (&elt))
26519 return false;
26520 if (elt == (HOST_WIDE_INT) i)
26521 continue;
26522 if (idx != -1)
26524 idx = -1;
26525 break;
26527 idx = i;
26530 if (idx == -1)
26532 insv = d->op1;
26533 for (unsigned HOST_WIDE_INT i = 0; i < nelt; i++)
26535 if (d->perm[i].to_constant () == (HOST_WIDE_INT) (i + nelt))
26536 continue;
26537 if (idx != -1)
26538 return false;
26539 idx = i;
26542 if (idx == -1)
26543 return false;
26546 if (d->testing_p)
26547 return true;
26549 gcc_assert (idx != -1);
26551 unsigned extractindex = d->perm[idx].to_constant ();
26552 rtx extractv = d->op0;
26553 if (extractindex >= nelt)
26555 extractv = d->op1;
26556 extractindex -= nelt;
26558 gcc_assert (extractindex < nelt);
26560 insn_code icode = code_for_aarch64_simd_vec_copy_lane (mode);
26561 expand_operand ops[5];
26562 create_output_operand (&ops[0], d->target, mode);
26563 create_input_operand (&ops[1], insv, mode);
26564 create_integer_operand (&ops[2], 1 << idx);
26565 create_input_operand (&ops[3], extractv, mode);
26566 create_integer_operand (&ops[4], extractindex);
26567 expand_insn (icode, 5, ops);
26569 return true;
26572 static bool
26573 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
26575 gcc_assert (d->op_mode != E_VOIDmode);
26577 /* The pattern matching functions above are written to look for a small
26578 number to begin the sequence (0, 1, N/2). If we begin with an index
26579 from the second operand, we can swap the operands. */
26580 poly_int64 nelt = d->perm.length ();
26581 if (known_ge (d->perm[0], nelt))
26583 d->perm.rotate_inputs (1);
26584 std::swap (d->op0, d->op1);
26587 if (((d->vec_flags == VEC_ADVSIMD && TARGET_SIMD)
26588 || d->vec_flags == VEC_SVE_DATA
26589 || d->vec_flags == (VEC_SVE_DATA | VEC_PARTIAL)
26590 || d->vec_flags == VEC_SVE_PRED)
26591 && known_gt (nelt, 1))
26593 if (d->vmode == d->op_mode)
26595 if (aarch64_evpc_rev_local (d))
26596 return true;
26597 else if (aarch64_evpc_rev_global (d))
26598 return true;
26599 else if (aarch64_evpc_ext (d))
26600 return true;
26601 else if (aarch64_evpc_dup (d))
26602 return true;
26603 else if (aarch64_evpc_zip (d))
26604 return true;
26605 else if (aarch64_evpc_uzp (d))
26606 return true;
26607 else if (aarch64_evpc_trn (d))
26608 return true;
26609 else if (aarch64_evpc_sel (d))
26610 return true;
26611 else if (aarch64_evpc_ins (d))
26612 return true;
26613 else if (aarch64_evpc_hvla (d))
26614 return true;
26615 else if (aarch64_evpc_reencode (d))
26616 return true;
26618 if (d->vec_flags == VEC_SVE_DATA)
26619 return aarch64_evpc_sve_tbl (d);
26620 else if (d->vec_flags == VEC_ADVSIMD)
26621 return aarch64_evpc_tbl (d);
26623 else
26625 if (aarch64_evpc_sve_dup (d))
26626 return true;
26629 return false;
26632 /* Implement TARGET_VECTORIZE_VEC_PERM_CONST. */
26634 static bool
26635 aarch64_vectorize_vec_perm_const (machine_mode vmode, machine_mode op_mode,
26636 rtx target, rtx op0, rtx op1,
26637 const vec_perm_indices &sel)
26639 struct expand_vec_perm_d d;
26641 /* Check whether the mask can be applied to a single vector. */
26642 if (sel.ninputs () == 1
26643 || (op0 && rtx_equal_p (op0, op1)))
26644 d.one_vector_p = true;
26645 else if (sel.all_from_input_p (0))
26647 d.one_vector_p = true;
26648 op1 = op0;
26650 else if (sel.all_from_input_p (1))
26652 d.one_vector_p = true;
26653 op0 = op1;
26655 else
26656 d.one_vector_p = false;
26658 d.zero_op0_p = op0 == CONST0_RTX (op_mode);
26659 d.zero_op1_p = op1 == CONST0_RTX (op_mode);
26660 d.perm.new_vector (sel.encoding (), d.one_vector_p ? 1 : 2,
26661 sel.nelts_per_input ());
26662 d.vmode = vmode;
26663 d.vec_flags = aarch64_classify_vector_mode (d.vmode);
26664 d.op_mode = op_mode;
26665 d.op_vec_flags = aarch64_classify_vector_mode (d.op_mode);
26666 d.target = target;
26667 d.op0 = op0 ? force_reg (op_mode, op0) : NULL_RTX;
26668 if (op0 == op1)
26669 d.op1 = d.op0;
26670 else
26671 d.op1 = op1 ? force_reg (op_mode, op1) : NULL_RTX;
26672 d.testing_p = !target;
26674 if (!d.testing_p)
26675 return aarch64_expand_vec_perm_const_1 (&d);
26677 rtx_insn *last = get_last_insn ();
26678 bool ret = aarch64_expand_vec_perm_const_1 (&d);
26679 gcc_assert (last == get_last_insn ());
26681 return ret;
26683 /* Generate a byte permute mask for a register of mode MODE,
26684 which has NUNITS units. */
26687 aarch64_reverse_mask (machine_mode mode, unsigned int nunits)
26689 /* We have to reverse each vector because we don't have
26690 a permuted load that can reverse-load according to ABI rules. */
26691 rtx mask;
26692 rtvec v = rtvec_alloc (16);
26693 unsigned int i, j;
26694 unsigned int usize = GET_MODE_UNIT_SIZE (mode);
26696 gcc_assert (BYTES_BIG_ENDIAN);
26697 gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode));
26699 for (i = 0; i < nunits; i++)
26700 for (j = 0; j < usize; j++)
26701 RTVEC_ELT (v, i * usize + j) = GEN_INT ((i + 1) * usize - 1 - j);
26702 mask = gen_rtx_CONST_VECTOR (V16QImode, v);
26703 return force_reg (V16QImode, mask);
26706 /* Expand an SVE integer comparison using the SVE equivalent of:
26708 (set TARGET (CODE OP0 OP1)). */
26710 void
26711 aarch64_expand_sve_vec_cmp_int (rtx target, rtx_code code, rtx op0, rtx op1)
26713 machine_mode pred_mode = GET_MODE (target);
26714 machine_mode data_mode = GET_MODE (op0);
26715 rtx res = aarch64_sve_emit_int_cmp (target, pred_mode, code, data_mode,
26716 op0, op1);
26717 if (!rtx_equal_p (target, res))
26718 emit_move_insn (target, res);
26721 /* Return the UNSPEC_COND_* code for comparison CODE. */
26723 static unsigned int
26724 aarch64_unspec_cond_code (rtx_code code)
26726 switch (code)
26728 case NE:
26729 return UNSPEC_COND_FCMNE;
26730 case EQ:
26731 return UNSPEC_COND_FCMEQ;
26732 case LT:
26733 return UNSPEC_COND_FCMLT;
26734 case GT:
26735 return UNSPEC_COND_FCMGT;
26736 case LE:
26737 return UNSPEC_COND_FCMLE;
26738 case GE:
26739 return UNSPEC_COND_FCMGE;
26740 case UNORDERED:
26741 return UNSPEC_COND_FCMUO;
26742 default:
26743 gcc_unreachable ();
26747 /* Emit:
26749 (set TARGET (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X>))
26751 where <X> is the operation associated with comparison CODE.
26752 KNOWN_PTRUE_P is true if PRED is known to be a PTRUE. */
26754 static void
26755 aarch64_emit_sve_fp_cond (rtx target, rtx_code code, rtx pred,
26756 bool known_ptrue_p, rtx op0, rtx op1)
26758 rtx flag = gen_int_mode (known_ptrue_p, SImode);
26759 rtx unspec = gen_rtx_UNSPEC (GET_MODE (pred),
26760 gen_rtvec (4, pred, flag, op0, op1),
26761 aarch64_unspec_cond_code (code));
26762 emit_set_insn (target, unspec);
26765 /* Emit the SVE equivalent of:
26767 (set TMP1 (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X1>))
26768 (set TMP2 (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X2>))
26769 (set TARGET (ior:PRED_MODE TMP1 TMP2))
26771 where <Xi> is the operation associated with comparison CODEi.
26772 KNOWN_PTRUE_P is true if PRED is known to be a PTRUE. */
26774 static void
26775 aarch64_emit_sve_or_fp_conds (rtx target, rtx_code code1, rtx_code code2,
26776 rtx pred, bool known_ptrue_p, rtx op0, rtx op1)
26778 machine_mode pred_mode = GET_MODE (pred);
26779 rtx tmp1 = gen_reg_rtx (pred_mode);
26780 aarch64_emit_sve_fp_cond (tmp1, code1, pred, known_ptrue_p, op0, op1);
26781 rtx tmp2 = gen_reg_rtx (pred_mode);
26782 aarch64_emit_sve_fp_cond (tmp2, code2, pred, known_ptrue_p, op0, op1);
26783 aarch64_emit_binop (target, ior_optab, tmp1, tmp2);
26786 /* Emit the SVE equivalent of:
26788 (set TMP (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X>))
26789 (set TARGET (not TMP))
26791 where <X> is the operation associated with comparison CODE.
26792 KNOWN_PTRUE_P is true if PRED is known to be a PTRUE. */
26794 static void
26795 aarch64_emit_sve_invert_fp_cond (rtx target, rtx_code code, rtx pred,
26796 bool known_ptrue_p, rtx op0, rtx op1)
26798 machine_mode pred_mode = GET_MODE (pred);
26799 rtx tmp = gen_reg_rtx (pred_mode);
26800 aarch64_emit_sve_fp_cond (tmp, code, pred, known_ptrue_p, op0, op1);
26801 aarch64_emit_unop (target, one_cmpl_optab, tmp);
26804 /* Expand an SVE floating-point comparison using the SVE equivalent of:
26806 (set TARGET (CODE OP0 OP1))
26808 If CAN_INVERT_P is true, the caller can also handle inverted results;
26809 return true if the result is in fact inverted. */
26811 bool
26812 aarch64_expand_sve_vec_cmp_float (rtx target, rtx_code code,
26813 rtx op0, rtx op1, bool can_invert_p)
26815 machine_mode pred_mode = GET_MODE (target);
26816 machine_mode data_mode = GET_MODE (op0);
26818 rtx ptrue = aarch64_ptrue_reg (pred_mode);
26819 switch (code)
26821 case UNORDERED:
26822 /* UNORDERED has no immediate form. */
26823 op1 = force_reg (data_mode, op1);
26824 /* fall through */
26825 case LT:
26826 case LE:
26827 case GT:
26828 case GE:
26829 case EQ:
26830 case NE:
26832 /* There is native support for the comparison. */
26833 aarch64_emit_sve_fp_cond (target, code, ptrue, true, op0, op1);
26834 return false;
26837 case LTGT:
26838 /* This is a trapping operation (LT or GT). */
26839 aarch64_emit_sve_or_fp_conds (target, LT, GT, ptrue, true, op0, op1);
26840 return false;
26842 case UNEQ:
26843 if (!flag_trapping_math)
26845 /* This would trap for signaling NaNs. */
26846 op1 = force_reg (data_mode, op1);
26847 aarch64_emit_sve_or_fp_conds (target, UNORDERED, EQ,
26848 ptrue, true, op0, op1);
26849 return false;
26851 /* fall through */
26852 case UNLT:
26853 case UNLE:
26854 case UNGT:
26855 case UNGE:
26856 if (flag_trapping_math)
26858 /* Work out which elements are ordered. */
26859 rtx ordered = gen_reg_rtx (pred_mode);
26860 op1 = force_reg (data_mode, op1);
26861 aarch64_emit_sve_invert_fp_cond (ordered, UNORDERED,
26862 ptrue, true, op0, op1);
26864 /* Test the opposite condition for the ordered elements,
26865 then invert the result. */
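	  /* For example, UNLT is implemented as a GE comparison restricted
	     to the ordered elements, with the result then inverted (or
	     reported to the caller as inverted).  */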
26866 if (code == UNEQ)
26867 code = NE;
26868 else
26869 code = reverse_condition_maybe_unordered (code);
26870 if (can_invert_p)
26872 aarch64_emit_sve_fp_cond (target, code,
26873 ordered, false, op0, op1);
26874 return true;
26876 aarch64_emit_sve_invert_fp_cond (target, code,
26877 ordered, false, op0, op1);
26878 return false;
26880 break;
26882 case ORDERED:
26883 /* ORDERED has no immediate form. */
26884 op1 = force_reg (data_mode, op1);
26885 break;
26887 default:
26888 gcc_unreachable ();
26891 /* There is native support for the inverse comparison. */
26892 code = reverse_condition_maybe_unordered (code);
26893 if (can_invert_p)
26895 aarch64_emit_sve_fp_cond (target, code, ptrue, true, op0, op1);
26896 return true;
26898 aarch64_emit_sve_invert_fp_cond (target, code, ptrue, true, op0, op1);
26899 return false;
26902 /* Return true if:
26904 (a) MODE1 and MODE2 use the same layout for bytes that are common
26905 to both modes;
26907 (b) subregs involving the two modes behave as the target-independent
26908 subreg rules require; and
26910 (c) there is at least one register that can hold both modes.
26912 Return false otherwise. */
26914 static bool
26915 aarch64_modes_compatible_p (machine_mode mode1, machine_mode mode2)
26917 unsigned int flags1 = aarch64_classify_vector_mode (mode1);
26918 unsigned int flags2 = aarch64_classify_vector_mode (mode2);
26920 bool sve1_p = (flags1 & VEC_ANY_SVE);
26921 bool sve2_p = (flags2 & VEC_ANY_SVE);
26923 bool partial_sve1_p = sve1_p && (flags1 & VEC_PARTIAL);
26924 bool partial_sve2_p = sve2_p && (flags2 & VEC_PARTIAL);
26926 bool pred1_p = (flags1 & VEC_SVE_PRED);
26927 bool pred2_p = (flags2 & VEC_SVE_PRED);
26929 bool partial_advsimd_struct1_p = (flags1 == (VEC_ADVSIMD | VEC_STRUCT
26930 | VEC_PARTIAL));
26931 bool partial_advsimd_struct2_p = (flags2 == (VEC_ADVSIMD | VEC_STRUCT
26932 | VEC_PARTIAL));
26934 /* Don't allow changes between predicate modes and other modes.
26935 Only predicate registers can hold predicate modes and only
26936 non-predicate registers can hold non-predicate modes, so any
26937 attempt to mix them would require a round trip through memory. */
26938 if (pred1_p != pred2_p)
26939 return false;
26941 /* The contents of partial SVE modes are distributed evenly across
26942 the register, whereas GCC expects them to be clustered together.
26943 We therefore need to be careful about mode changes involving them. */
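  /* For example, a VNx2SImode value keeps each 32-bit element in its own
     64-bit container rather than packing the elements together at one end
     of the register.  */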
26944 if (partial_sve1_p && partial_sve2_p)
26946 /* Reject changes between partial SVE modes that have different
26947 patterns of significant and insignificant bits. */
26948 if ((aarch64_sve_container_bits (mode1)
26949 != aarch64_sve_container_bits (mode2))
26950 || GET_MODE_UNIT_SIZE (mode1) != GET_MODE_UNIT_SIZE (mode2))
26951 return false;
26953 else if (partial_sve1_p)
26955 /* The first lane of MODE1 is where GCC expects it, but anything
26956 bigger than that is not. */
26957 if (maybe_gt (GET_MODE_SIZE (mode2), GET_MODE_UNIT_SIZE (mode1)))
26958 return false;
26960 else if (partial_sve2_p)
26962 /* Similarly in reverse. */
26963 if (maybe_gt (GET_MODE_SIZE (mode1), GET_MODE_UNIT_SIZE (mode2)))
26964 return false;
26967 /* Don't allow changes between partial Advanced SIMD structure modes
26968 and other modes that are bigger than 8 bytes. E.g. V16QI and V2x8QI
26969 are the same size, but the former occupies one Q register while the
26970 latter occupies two D registers. */
26971 if (partial_advsimd_struct1_p != partial_advsimd_struct2_p
26972 && maybe_gt (GET_MODE_SIZE (mode1), 8)
26973 && maybe_gt (GET_MODE_SIZE (mode2), 8))
26974 return false;
26976 if (maybe_ne (BITS_PER_SVE_VECTOR, 128u))
26978 /* Don't allow changes between SVE modes and other modes that might
26979 be bigger than 128 bits. In particular, OImode, CImode and XImode
26980 divide into 128-bit quantities while SVE modes divide into
26981 BITS_PER_SVE_VECTOR quantities. */
26982 if (sve1_p && !sve2_p && maybe_gt (GET_MODE_BITSIZE (mode2), 128))
26983 return false;
26984 if (sve2_p && !sve1_p && maybe_gt (GET_MODE_BITSIZE (mode1), 128))
26985 return false;
26988 if (BYTES_BIG_ENDIAN)
26990 /* Don't allow changes between SVE data modes and non-SVE modes.
26991 See the comment at the head of aarch64-sve.md for details. */
26992 if (sve1_p != sve2_p)
26993 return false;
26995 /* Don't allow changes in element size: lane 0 of the new vector
26996 would not then be lane 0 of the old vector. See the comment
26997 above aarch64_maybe_expand_sve_subreg_move for a more detailed
26998 description.
27000 In the worst case, this forces a register to be spilled in
27001 one mode and reloaded in the other, which handles the
27002 endianness correctly. */
27003 if (sve1_p && GET_MODE_UNIT_SIZE (mode1) != GET_MODE_UNIT_SIZE (mode2))
27004 return false;
27006 return true;
27009 /* Implement TARGET_MODES_TIEABLE_P. In principle we should always defer
27010 to aarch64_modes_compatible_p. However due to issues with register
27011    allocation it is preferable to avoid tying integer scalar and FP
27012 scalar modes. Executing integer operations in general registers is
27013 better than treating them as scalar vector operations. This reduces
27014 latency and avoids redundant int<->FP moves. So tie modes if they
27015 are either the same class, or one of them is a vector mode. */
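/* For example, SImode and SFmode are not tied (different classes, neither
   is a vector), whereas SImode and DImode are, as are SFmode and V4SFmode.  */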
27017 static bool
27018 aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2)
27020 if (aarch64_modes_compatible_p (mode1, mode2))
27022 if (GET_MODE_CLASS (mode1) == GET_MODE_CLASS (mode2))
27023 return true;
27024 if (VECTOR_MODE_P (mode1) || VECTOR_MODE_P (mode2))
27025 return true;
27027 return false;
27030 /* Return a new RTX holding the result of moving POINTER forward by
27031 AMOUNT bytes. */
27033 static rtx
27034 aarch64_move_pointer (rtx pointer, poly_int64 amount)
27036 rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);
27038 return adjust_automodify_address (pointer, GET_MODE (pointer),
27039 next, amount);
27042 /* Expand a cpymem/movmem using the MOPS extension. OPERANDS are taken
27043 from the cpymem/movmem pattern. IS_MEMMOVE is true if this is a memmove
27044 rather than memcpy. Return true iff we succeeded. */
27045 bool
27046 aarch64_expand_cpymem_mops (rtx *operands, bool is_memmove)
27048 if (!TARGET_MOPS)
27049 return false;
27051 /* All three registers are changed by the instruction, so each one
27052 must be a fresh pseudo. */
27053 rtx dst_addr = copy_to_mode_reg (Pmode, XEXP (operands[0], 0));
27054 rtx src_addr = copy_to_mode_reg (Pmode, XEXP (operands[1], 0));
27055 rtx dst_mem = replace_equiv_address (operands[0], dst_addr);
27056 rtx src_mem = replace_equiv_address (operands[1], src_addr);
27057 rtx sz_reg = copy_to_mode_reg (DImode, operands[2]);
27058 if (is_memmove)
27059 emit_insn (gen_aarch64_movmemdi (dst_mem, src_mem, sz_reg));
27060 else
27061 emit_insn (gen_aarch64_cpymemdi (dst_mem, src_mem, sz_reg));
27062 return true;
27065 /* Expand cpymem/movmem, as if from a __builtin_memcpy/memmove.
27066 OPERANDS are taken from the cpymem/movmem pattern. IS_MEMMOVE is true
27067 if this is a memmove rather than memcpy. Return true if we succeed,
27068 otherwise return false, indicating that a libcall should be emitted. */
27069 bool
27070 aarch64_expand_cpymem (rtx *operands, bool is_memmove)
27072 int mode_bytes;
27073 rtx dst = operands[0];
27074 rtx src = operands[1];
27075 unsigned align = UINTVAL (operands[3]);
27076 rtx base;
27077 machine_mode mode = BLKmode, next_mode;
27079 /* Variable-sized or strict-align copies may use the MOPS expansion. */
27080 if (!CONST_INT_P (operands[2]) || (STRICT_ALIGNMENT && align < 16))
27081 return aarch64_expand_cpymem_mops (operands, is_memmove);
27083 unsigned HOST_WIDE_INT size = UINTVAL (operands[2]);
27085 /* Set inline limits for memmove/memcpy. MOPS has a separate threshold. */
27086 unsigned max_copy_size = TARGET_SIMD ? 256 : 128;
27087 unsigned mops_threshold = is_memmove ? aarch64_mops_memmove_size_threshold
27088 : aarch64_mops_memcpy_size_threshold;
27090 /* Reduce the maximum size with -Os. */
27091 if (optimize_function_for_size_p (cfun))
27092 max_copy_size /= 4;
27094 /* Large copies use MOPS when available or a library call. */
27095 if (size > max_copy_size || (TARGET_MOPS && size > mops_threshold))
27096 return aarch64_expand_cpymem_mops (operands, is_memmove);
27098   /* Default to 32-byte LDP/STP on large copies; small copies, or copies
27099      without SIMD support, fall back to 16-byte chunks.
27100 ??? Although it would be possible to use LDP/STP Qn in streaming mode
27101 (so using TARGET_BASE_SIMD instead of TARGET_SIMD), it isn't clear
27102 whether that would improve performance. */
27103 bool use_qregs = size > 24 && TARGET_SIMD;
27105 base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
27106 dst = adjust_automodify_address (dst, VOIDmode, base, 0);
27108 base = copy_to_mode_reg (Pmode, XEXP (src, 0));
27109 src = adjust_automodify_address (src, VOIDmode, base, 0);
27111 auto_vec<std::pair<rtx, rtx>, 16> ops;
27112 int offset = 0;
27114 while (size > 0)
27116       /* Find the largest mode in which to do the copy without over-reading
27117	  or over-writing.  */
27118 opt_scalar_int_mode mode_iter;
27119 FOR_EACH_MODE_IN_CLASS (mode_iter, MODE_INT)
27120 if (GET_MODE_SIZE (mode_iter.require ()) <= MIN (size, 16))
27121 mode = mode_iter.require ();
27123 gcc_assert (mode != BLKmode);
27125 mode_bytes = GET_MODE_SIZE (mode).to_constant ();
27127 /* Prefer Q-register accesses. */
27128 if (mode_bytes == 16 && use_qregs)
27129 mode = V4SImode;
27131 rtx reg = gen_reg_rtx (mode);
27132 rtx load = gen_move_insn (reg, adjust_address (src, mode, offset));
27133 rtx store = gen_move_insn (adjust_address (dst, mode, offset), reg);
27134 ops.safe_push ({ load, store });
27135 size -= mode_bytes;
27136 offset += mode_bytes;
27138 /* Emit trailing copies using overlapping unaligned accesses
27139 (when !STRICT_ALIGNMENT) - this is smaller and faster. */
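      /* For example, an 11-byte copy is done as an 8-byte access at offset 0
	 followed by a 4-byte access at offset 7, the two accesses overlapping
	 by one byte.  */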
27140 if (size > 0 && size < 16 && !STRICT_ALIGNMENT)
27142 next_mode = smallest_mode_for_size
27143 (size * BITS_PER_UNIT, MODE_INT).require ();
27144 int n_bytes = GET_MODE_SIZE (next_mode).to_constant ();
27145 gcc_assert (n_bytes <= mode_bytes);
27146 offset -= n_bytes - size;
27147 size = n_bytes;
27151 /* Memcpy interleaves loads with stores, memmove emits all loads first. */
27152   int nops = ops.length ();
27153 int inc = is_memmove || nops <= 8 ? nops : 6;
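  /* For memmove, or when there are at most 8 operations, all loads are
     emitted before all stores; larger memcpy expansions emit groups of
     6 loads followed by the corresponding 6 stores.  */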
27155 for (int i = 0; i < nops; i += inc)
27157 int m = MIN (nops, i + inc);
27158 /* Emit loads. */
27159 for (int j = i; j < m; j++)
27160 emit_insn (ops[j].first);
27161 /* Emit stores. */
27162 for (int j = i; j < m; j++)
27163 emit_insn (ops[j].second);
27165 return true;
27168 /* Expand a setmem using the MOPS instructions. OPERANDS are the same
27169 as for the setmem pattern. Return true iff we succeed. */
27170 static bool
27171 aarch64_expand_setmem_mops (rtx *operands)
27173 if (!TARGET_MOPS)
27174 return false;
27176 /* The first two registers are changed by the instruction, so both
27177      of them must be fresh pseudos.  */
27178 rtx dst_addr = copy_to_mode_reg (Pmode, XEXP (operands[0], 0));
27179 rtx dst_mem = replace_equiv_address (operands[0], dst_addr);
27180 rtx sz_reg = copy_to_mode_reg (DImode, operands[1]);
27181 rtx val = operands[2];
27182 if (val != CONST0_RTX (QImode))
27183 val = force_reg (QImode, val);
27184 emit_insn (gen_aarch64_setmemdi (dst_mem, val, sz_reg));
27185 return true;
27188 /* Expand setmem, as if from a __builtin_memset. Return true if
27189 we succeed, otherwise return false. */
27191 bool
27192 aarch64_expand_setmem (rtx *operands)
27194 int mode_bytes;
27195 unsigned HOST_WIDE_INT len;
27196 rtx dst = operands[0];
27197 rtx val = operands[2], src;
27198 unsigned align = UINTVAL (operands[3]);
27199 rtx base;
27200 machine_mode mode = BLKmode, next_mode;
27202 /* Variable-sized or strict-align memset may use the MOPS expansion. */
27203 if (!CONST_INT_P (operands[1]) || !TARGET_SIMD
27204 || (STRICT_ALIGNMENT && align < 16))
27205 return aarch64_expand_setmem_mops (operands);
27207 /* Set inline limits for memset. MOPS has a separate threshold. */
27208 unsigned max_set_size = MAX_SET_SIZE (optimize_function_for_speed_p (cfun));
27209 unsigned mops_threshold = aarch64_mops_memset_size_threshold;
27211 len = UINTVAL (operands[1]);
27213 /* Large memset uses MOPS when available or a library call. */
27214 if (len > max_set_size || (TARGET_MOPS && len > mops_threshold))
27215 return aarch64_expand_setmem_mops (operands);
27217 base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
27218 dst = adjust_automodify_address (dst, VOIDmode, base, 0);
27220 /* Prepare the val using a DUP/MOVI v0.16B, val. */
27221 val = expand_vector_broadcast (V16QImode, val);
27222 val = force_reg (V16QImode, val);
27224 int offset = 0;
27225 while (len > 0)
27227 /* Find the largest mode in which to do the copy without
27228	  overwriting.  */
27229 opt_scalar_int_mode mode_iter;
27230 FOR_EACH_MODE_IN_CLASS (mode_iter, MODE_INT)
27231 if (GET_MODE_SIZE (mode_iter.require ()) <= MIN (len, 16))
27232 mode = mode_iter.require ();
27234 gcc_assert (mode != BLKmode);
27236 mode_bytes = GET_MODE_SIZE (mode).to_constant ();
27238 src = val;
27240 /* Prefer Q-register accesses. */
27241 if (mode_bytes == 16)
27242 mode = V16QImode;
27243 else
27244 src = lowpart_subreg (mode, src, GET_MODE (val));
27246 emit_move_insn (adjust_address (dst, mode, offset), src);
27247 len -= mode_bytes;
27248 offset += mode_bytes;
27250 /* Emit trailing writes using overlapping unaligned accesses
27251 (when !STRICT_ALIGNMENT) - this is smaller and faster. */
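      /* For example, an 11-byte memset is done as an 8-byte store at offset 0
	 followed by a 4-byte store at offset 7.  */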
27252 if (len > 0 && len < 16 && !STRICT_ALIGNMENT)
27254 next_mode = smallest_mode_for_size
27255 (len * BITS_PER_UNIT, MODE_INT).require ();
27256 int n_bytes = GET_MODE_SIZE (next_mode).to_constant ();
27257 gcc_assert (n_bytes <= mode_bytes);
27258 offset -= n_bytes - len;
27259 len = n_bytes;
27263 return true;
27267 /* Split a DImode store of a CONST_INT SRC to MEM DST as two
27268 SImode stores. Handle the case when the constant has identical
27269 bottom and top halves. This is beneficial when the two stores can be
27270 merged into an STP and we avoid synthesising potentially expensive
27271 immediates twice. Return true if such a split is possible. */
27273 bool
27274 aarch64_split_dimode_const_store (rtx dst, rtx src)
27276 rtx lo = gen_lowpart (SImode, src);
27277 rtx hi = gen_highpart_mode (SImode, DImode, src);
27279 if (!rtx_equal_p (lo, hi))
27280 return false;
27282 unsigned int orig_cost
27283 = aarch64_internal_mov_immediate (NULL_RTX, src, false, DImode);
27284 unsigned int lo_cost
27285 = aarch64_internal_mov_immediate (NULL_RTX, lo, false, SImode);
27287 /* We want to transform:
27288 MOV x1, 49370
27289 MOVK x1, 0x140, lsl 16
27290 MOVK x1, 0xc0da, lsl 32
27291 MOVK x1, 0x140, lsl 48
27292 STR x1, [x0]
27293 into:
27294 MOV w1, 49370
27295 MOVK w1, 0x140, lsl 16
27296 STP w1, w1, [x0]
27297 So we want to perform this when we save at least one instruction. */
27298 if (orig_cost <= lo_cost)
27299 return false;
27301 rtx mem_lo = adjust_address (dst, SImode, 0);
27302 if (!aarch64_mem_pair_operand (mem_lo, SImode))
27303 return false;
27305 rtx tmp_reg = gen_reg_rtx (SImode);
27306 aarch64_expand_mov_immediate (tmp_reg, lo);
27307 rtx mem_hi = aarch64_move_pointer (mem_lo, GET_MODE_SIZE (SImode));
27308   /* Don't emit an explicit store pair as this may not always be profitable.
27309 Let the sched-fusion logic decide whether to merge them. */
27310 emit_move_insn (mem_lo, tmp_reg);
27311 emit_move_insn (mem_hi, tmp_reg);
27313 return true;
27316 /* Generate RTL for a conditional branch with rtx comparison CODE in
27317 mode CC_MODE. The destination of the unlikely conditional branch
27318 is LABEL_REF. */
27320 void
27321 aarch64_gen_unlikely_cbranch (enum rtx_code code, machine_mode cc_mode,
27322 rtx label_ref)
27324 rtx x;
27325 x = gen_rtx_fmt_ee (code, VOIDmode,
27326 gen_rtx_REG (cc_mode, CC_REGNUM),
27327 const0_rtx);
27329 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
27330 gen_rtx_LABEL_REF (VOIDmode, label_ref),
27331 pc_rtx);
27332 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
27335 /* Generate DImode scratch registers for 128-bit (TImode) addition.
27337    OP1 represents the TImode source operand 1
27338    OP2 represents the TImode source operand 2
27339 LOW_DEST represents the low half (DImode) of TImode operand 0
27340 LOW_IN1 represents the low half (DImode) of TImode operand 1
27341 LOW_IN2 represents the low half (DImode) of TImode operand 2
27342 HIGH_DEST represents the high half (DImode) of TImode operand 0
27343 HIGH_IN1 represents the high half (DImode) of TImode operand 1
27344 HIGH_IN2 represents the high half (DImode) of TImode operand 2. */
27346 void
27347 aarch64_addti_scratch_regs (rtx op1, rtx op2, rtx *low_dest,
27348 rtx *low_in1, rtx *low_in2,
27349 rtx *high_dest, rtx *high_in1,
27350 rtx *high_in2)
27352 *low_dest = gen_reg_rtx (DImode);
27353 *low_in1 = force_lowpart_subreg (DImode, op1, TImode);
27354 *low_in2 = force_lowpart_subreg (DImode, op2, TImode);
27355 *high_dest = gen_reg_rtx (DImode);
27356 *high_in1 = force_highpart_subreg (DImode, op1, TImode);
27357 *high_in2 = force_highpart_subreg (DImode, op2, TImode);
27360 /* Generate DImode scratch registers for 128-bit (TImode) subtraction.
27362    OP1 represents the TImode source operand 1
27363    OP2 represents the TImode source operand 2
27364 LOW_DEST represents the low half (DImode) of TImode operand 0
27365 LOW_IN1 represents the low half (DImode) of TImode operand 1
27366 LOW_IN2 represents the low half (DImode) of TImode operand 2
27367 HIGH_DEST represents the high half (DImode) of TImode operand 0
27368 HIGH_IN1 represents the high half (DImode) of TImode operand 1
27369 HIGH_IN2 represents the high half (DImode) of TImode operand 2. */
27372 void
27373 aarch64_subvti_scratch_regs (rtx op1, rtx op2, rtx *low_dest,
27374 rtx *low_in1, rtx *low_in2,
27375 rtx *high_dest, rtx *high_in1,
27376 rtx *high_in2)
27378 *low_dest = gen_reg_rtx (DImode);
27379 *low_in1 = force_lowpart_subreg (DImode, op1, TImode);
27380 *low_in2 = force_lowpart_subreg (DImode, op2, TImode);
27381 *high_dest = gen_reg_rtx (DImode);
27383 *high_in1 = force_highpart_subreg (DImode, op1, TImode);
27384 *high_in2 = force_highpart_subreg (DImode, op2, TImode);
27387 /* Generate RTL for 128-bit (TImode) subtraction with overflow.
27389 OP0 represents the TImode destination operand 0
27390 LOW_DEST represents the low half (DImode) of TImode operand 0
27391 LOW_IN1 represents the low half (DImode) of TImode operand 1
27392 LOW_IN2 represents the low half (DImode) of TImode operand 2
27393 HIGH_DEST represents the high half (DImode) of TImode operand 0
27394 HIGH_IN1 represents the high half (DImode) of TImode operand 1
27395 HIGH_IN2 represents the high half (DImode) of TImode operand 2
27396 UNSIGNED_P is true if the operation is being performed on unsigned
27397 values. */
27398 void
27399 aarch64_expand_subvti (rtx op0, rtx low_dest, rtx low_in1,
27400 rtx low_in2, rtx high_dest, rtx high_in1,
27401 rtx high_in2, bool unsigned_p)
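  /* If the low half of the second operand is zero, the low-part subtraction
     cannot generate a borrow, so the low half of the result is just LOW_IN1
     and only the high halves need a flag-setting subtraction.  */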
27403 if (low_in2 == const0_rtx)
27405 low_dest = low_in1;
27406 high_in2 = force_reg (DImode, high_in2);
27407 if (unsigned_p)
27408 emit_insn (gen_subdi3_compare1 (high_dest, high_in1, high_in2));
27409 else
27410 emit_insn (gen_subvdi_insn (high_dest, high_in1, high_in2));
27412 else
27414 if (aarch64_plus_immediate (low_in2, DImode))
27415 emit_insn (gen_subdi3_compare1_imm (low_dest, low_in1, low_in2,
27416 GEN_INT (-UINTVAL (low_in2))));
27417 else
27419 low_in2 = force_reg (DImode, low_in2);
27420 emit_insn (gen_subdi3_compare1 (low_dest, low_in1, low_in2));
27422 high_in2 = force_reg (DImode, high_in2);
27424 if (unsigned_p)
27425 emit_insn (gen_usubdi3_carryinC (high_dest, high_in1, high_in2));
27426 else
27427 emit_insn (gen_subdi3_carryinV (high_dest, high_in1, high_in2));
27430 emit_move_insn (gen_lowpart (DImode, op0), low_dest);
27431 emit_move_insn (gen_highpart (DImode, op0), high_dest);
27435 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
27437 static unsigned HOST_WIDE_INT
27438 aarch64_asan_shadow_offset (void)
27440 if (TARGET_ILP32)
27441 return (HOST_WIDE_INT_1 << 29);
27442 else
27443 return (HOST_WIDE_INT_1 << 36);
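/* Implement TARGET_GEN_CCMP_FIRST.  Expand the first comparison
   (CODE TREEOP0 TREEOP1) of a conditional-compare sequence.  Store the
   preparation insns in *PREP_SEQ and the comparison itself in *GEN_SEQ,
   and return a comparison of the CC register with zero, or NULL_RTX if
   the comparison cannot be handled.  */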
27446 static rtx
27447 aarch64_gen_ccmp_first (rtx_insn **prep_seq, rtx_insn **gen_seq,
27448 rtx_code code, tree treeop0, tree treeop1)
27450 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
27451 rtx op0, op1;
27452 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
27453 insn_code icode;
27454 struct expand_operand ops[4];
27456 start_sequence ();
27457 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
27459 op_mode = GET_MODE (op0);
27460 if (op_mode == VOIDmode)
27461 op_mode = GET_MODE (op1);
27463 if (CONST_SCALAR_INT_P (op1))
27464 canonicalize_comparison (op_mode, &code, &op1);
27466 switch (op_mode)
27468 case E_QImode:
27469 case E_HImode:
27470 case E_SImode:
27471 cmp_mode = SImode;
27472 icode = CODE_FOR_cmpsi;
27473 break;
27475 case E_DImode:
27476 cmp_mode = DImode;
27477 icode = CODE_FOR_cmpdi;
27478 break;
27480 case E_SFmode:
27481 cmp_mode = SFmode;
27482 cc_mode = aarch64_select_cc_mode (code, op0, op1);
27483 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpesf : CODE_FOR_fcmpsf;
27484 break;
27486 case E_DFmode:
27487 cmp_mode = DFmode;
27488 cc_mode = aarch64_select_cc_mode (code, op0, op1);
27489 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpedf : CODE_FOR_fcmpdf;
27490 break;
27492 default:
27493 end_sequence ();
27494 return NULL_RTX;
27497 op0 = prepare_operand (icode, op0, 0, op_mode, cmp_mode, unsignedp);
27498 op1 = prepare_operand (icode, op1, 1, op_mode, cmp_mode, unsignedp);
27499 if (!op0 || !op1)
27501 end_sequence ();
27502 return NULL_RTX;
27504 *prep_seq = get_insns ();
27505 end_sequence ();
27507 create_fixed_operand (&ops[0], op0);
27508 create_fixed_operand (&ops[1], op1);
27510 start_sequence ();
27511 if (!maybe_expand_insn (icode, 2, ops))
27513 end_sequence ();
27514 return NULL_RTX;
27516 *gen_seq = get_insns ();
27517 end_sequence ();
27519 return gen_rtx_fmt_ee (code, cc_mode,
27520 gen_rtx_REG (cc_mode, CC_REGNUM), const0_rtx);
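/* Implement TARGET_GEN_CCMP_NEXT.  Expand a conditional comparison that
   performs (CMP_CODE TREEOP0 TREEOP1) and combines it with the previous
   comparison PREV using BIT_CODE (AND or IOR).  Append the preparation
   insns to *PREP_SEQ and the CCMP itself to *GEN_SEQ, and return a
   comparison of the CC register with zero, or NULL_RTX on failure.  */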
27523 static rtx
27524 aarch64_gen_ccmp_next (rtx_insn **prep_seq, rtx_insn **gen_seq, rtx prev,
27525 rtx_code cmp_code, tree treeop0, tree treeop1,
27526 rtx_code bit_code)
27528 rtx op0, op1, target;
27529 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
27530 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
27531 insn_code icode;
27532 struct expand_operand ops[6];
27533 int aarch64_cond;
27535 push_to_sequence (*prep_seq);
27536 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
27538 op_mode = GET_MODE (op0);
27539 if (op_mode == VOIDmode)
27540 op_mode = GET_MODE (op1);
27542 if (CONST_SCALAR_INT_P (op1))
27543 canonicalize_comparison (op_mode, &cmp_code, &op1);
27545 switch (op_mode)
27547 case E_QImode:
27548 case E_HImode:
27549 case E_SImode:
27550 cmp_mode = SImode;
27551 break;
27553 case E_DImode:
27554 cmp_mode = DImode;
27555 break;
27557 case E_SFmode:
27558 cmp_mode = SFmode;
27559 cc_mode = aarch64_select_cc_mode (cmp_code, op0, op1);
27560 break;
27562 case E_DFmode:
27563 cmp_mode = DFmode;
27564 cc_mode = aarch64_select_cc_mode (cmp_code, op0, op1);
27565 break;
27567 default:
27568 end_sequence ();
27569 return NULL_RTX;
27572 icode = code_for_ccmp (cc_mode, cmp_mode);
27574 op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
27575 op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
27576 if (!op0 || !op1)
27578 end_sequence ();
27579 return NULL_RTX;
27581 *prep_seq = get_insns ();
27582 end_sequence ();
27584 target = gen_rtx_REG (cc_mode, CC_REGNUM);
27585 aarch64_cond = aarch64_get_condition_code_1 (cc_mode, cmp_code);
27587 if (bit_code != AND)
27589 /* Treat the ccmp patterns as canonical and use them where possible,
27590 but fall back to ccmp_rev patterns if there's no other option. */
27591 rtx_code prev_code = GET_CODE (prev);
27592 machine_mode prev_mode = GET_MODE (XEXP (prev, 0));
27593 if ((prev_mode == CCFPmode || prev_mode == CCFPEmode)
27594 && !(prev_code == EQ
27595 || prev_code == NE
27596 || prev_code == ORDERED
27597 || prev_code == UNORDERED))
27598 icode = code_for_ccmp_rev (cc_mode, cmp_mode);
27599 else
27601 rtx_code code = reverse_condition (prev_code);
27602 prev = gen_rtx_fmt_ee (code, VOIDmode, XEXP (prev, 0), const0_rtx);
27604 aarch64_cond = AARCH64_INVERSE_CONDITION_CODE (aarch64_cond);
27607 create_fixed_operand (&ops[0], XEXP (prev, 0));
27608 create_fixed_operand (&ops[1], target);
27609 create_fixed_operand (&ops[2], op0);
27610 create_fixed_operand (&ops[3], op1);
27611 create_fixed_operand (&ops[4], prev);
27612 create_fixed_operand (&ops[5], GEN_INT (aarch64_cond));
27614 push_to_sequence (*gen_seq);
27615 if (!maybe_expand_insn (icode, 6, ops))
27617 end_sequence ();
27618 return NULL_RTX;
27621 *gen_seq = get_insns ();
27622 end_sequence ();
27624 return gen_rtx_fmt_ee (cmp_code, VOIDmode, target, const0_rtx);
27627 #undef TARGET_GEN_CCMP_FIRST
27628 #define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first
27630 #undef TARGET_GEN_CCMP_NEXT
27631 #define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next
27633 /* Implement TARGET_SCHED_MACRO_FUSION_P. Return true if target supports
27634 instruction fusion of some sort. */
27636 static bool
27637 aarch64_macro_fusion_p (void)
27639 return aarch64_tune_params.fusible_ops != AARCH64_FUSE_NOTHING;
27643 /* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P. Return true if PREV and CURR
27644 should be kept together during scheduling. */
27646 static bool
27647 aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
27649 rtx set_dest;
27650 rtx prev_set = single_set (prev);
27651 rtx curr_set = single_set (curr);
27652 /* prev and curr are simple SET insns i.e. no flag setting or branching. */
27653 bool simple_sets_p = prev_set && curr_set && !any_condjump_p (curr);
27655 if (!aarch64_macro_fusion_p ())
27656 return false;
27658 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOV_MOVK))
27660 /* We are trying to match:
27661 prev (mov) == (set (reg r0) (const_int imm16))
27662 curr (movk) == (set (zero_extract (reg r0)
27663 (const_int 16)
27664 (const_int 16))
27665 (const_int imm16_1)) */
27667 set_dest = SET_DEST (curr_set);
27669 if (GET_CODE (set_dest) == ZERO_EXTRACT
27670 && CONST_INT_P (SET_SRC (curr_set))
27671 && CONST_INT_P (SET_SRC (prev_set))
27672 && CONST_INT_P (XEXP (set_dest, 2))
27673 && INTVAL (XEXP (set_dest, 2)) == 16
27674 && REG_P (XEXP (set_dest, 0))
27675 && REG_P (SET_DEST (prev_set))
27676 && REGNO (XEXP (set_dest, 0)) == REGNO (SET_DEST (prev_set)))
27678 return true;
27682 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_ADD))
27685 /* We're trying to match:
27686 prev (adrp) == (set (reg r1)
27687 (high (symbol_ref ("SYM"))))
27688 curr (add) == (set (reg r0)
27689 (lo_sum (reg r1)
27690 (symbol_ref ("SYM"))))
27691 Note that r0 need not necessarily be the same as r1, especially
27692 during pre-regalloc scheduling. */
27694 if (satisfies_constraint_Ush (SET_SRC (prev_set))
27695 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
27697 if (GET_CODE (SET_SRC (curr_set)) == LO_SUM
27698 && REG_P (XEXP (SET_SRC (curr_set), 0))
27699 && REGNO (XEXP (SET_SRC (curr_set), 0))
27700 == REGNO (SET_DEST (prev_set))
27701 && rtx_equal_p (XEXP (SET_SRC (prev_set), 0),
27702 XEXP (SET_SRC (curr_set), 1)))
27703 return true;
27707 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOVK_MOVK))
27710 /* We're trying to match:
27711 prev (movk) == (set (zero_extract (reg r0)
27712 (const_int 16)
27713 (const_int 32))
27714 (const_int imm16_1))
27715 curr (movk) == (set (zero_extract (reg r0)
27716 (const_int 16)
27717 (const_int 48))
27718 (const_int imm16_2)) */
27720 if (GET_CODE (SET_DEST (prev_set)) == ZERO_EXTRACT
27721 && GET_CODE (SET_DEST (curr_set)) == ZERO_EXTRACT
27722 && REG_P (XEXP (SET_DEST (prev_set), 0))
27723 && REG_P (XEXP (SET_DEST (curr_set), 0))
27724 && REGNO (XEXP (SET_DEST (prev_set), 0))
27725 == REGNO (XEXP (SET_DEST (curr_set), 0))
27726 && CONST_INT_P (XEXP (SET_DEST (prev_set), 2))
27727 && CONST_INT_P (XEXP (SET_DEST (curr_set), 2))
27728 && INTVAL (XEXP (SET_DEST (prev_set), 2)) == 32
27729 && INTVAL (XEXP (SET_DEST (curr_set), 2)) == 48
27730 && CONST_INT_P (SET_SRC (prev_set))
27731 && CONST_INT_P (SET_SRC (curr_set)))
27732 return true;
27735 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_LDR))
27737 /* We're trying to match:
27738 prev (adrp) == (set (reg r0)
27739 (high (symbol_ref ("SYM"))))
27740 curr (ldr) == (set (reg r1)
27741 (mem (lo_sum (reg r0)
27742 (symbol_ref ("SYM")))))
27744 curr (ldr) == (set (reg r1)
27745 (zero_extend (mem
27746 (lo_sum (reg r0)
27747 (symbol_ref ("SYM")))))) */
27748 if (satisfies_constraint_Ush (SET_SRC (prev_set))
27749 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
27751 rtx curr_src = SET_SRC (curr_set);
27753 if (GET_CODE (curr_src) == ZERO_EXTEND)
27754 curr_src = XEXP (curr_src, 0);
27756 if (MEM_P (curr_src) && GET_CODE (XEXP (curr_src, 0)) == LO_SUM
27757 && REG_P (XEXP (XEXP (curr_src, 0), 0))
27758 && REGNO (XEXP (XEXP (curr_src, 0), 0))
27759 == REGNO (SET_DEST (prev_set))
27760 && rtx_equal_p (XEXP (XEXP (curr_src, 0), 1),
27761 XEXP (SET_SRC (prev_set), 0)))
27762 return true;
27766 /* Fuse compare (CMP/CMN/TST/BICS) and conditional branch. */
27767 if (aarch64_fusion_enabled_p (AARCH64_FUSE_CMP_BRANCH)
27768 && prev_set && curr_set && any_condjump_p (curr)
27769 && GET_CODE (SET_SRC (prev_set)) == COMPARE
27770 && SCALAR_INT_MODE_P (GET_MODE (XEXP (SET_SRC (prev_set), 0)))
27771 && reg_referenced_p (SET_DEST (prev_set), PATTERN (curr)))
27772 return true;
27774 /* Fuse CMP and CSEL/CSET. */
27775 if (prev_set && curr_set
27776 && GET_CODE (SET_SRC (prev_set)) == COMPARE
27777 && SCALAR_INT_MODE_P (GET_MODE (XEXP (SET_SRC (prev_set), 0)))
27778 && reg_referenced_p (SET_DEST (prev_set), PATTERN (curr)))
27780 enum attr_type prev_type = get_attr_type (prev);
27781 if ((prev_type == TYPE_ALUS_SREG || prev_type == TYPE_ALUS_IMM)
27782 && ((aarch64_fusion_enabled_p (AARCH64_FUSE_CMP_CSEL)
27783 && GET_CODE (SET_SRC (curr_set)) == IF_THEN_ELSE
27784 && aarch64_reg_or_zero (XEXP (SET_SRC (curr_set), 1), VOIDmode)
27785 && aarch64_reg_or_zero (XEXP (SET_SRC (curr_set), 2), VOIDmode)
27786 && SCALAR_INT_MODE_P (GET_MODE (XEXP (SET_SRC (curr_set), 1))))
27787 || (aarch64_fusion_enabled_p (AARCH64_FUSE_CMP_CSET)
27788 && GET_RTX_CLASS (GET_CODE (SET_SRC (curr_set)))
27789 == RTX_COMPARE
27790 && REG_P (SET_DEST (curr_set)))))
27791 return true;
27794 /* Fuse flag-setting ALU instructions and conditional branch. */
27795 if (aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_BRANCH)
27796 && any_condjump_p (curr))
27798 unsigned int condreg1, condreg2;
27799 rtx cc_reg_1;
27800 aarch64_fixed_condition_code_regs (&condreg1, &condreg2);
27801 cc_reg_1 = gen_rtx_REG (CCmode, condreg1);
27803 if (reg_referenced_p (cc_reg_1, PATTERN (curr))
27804 && prev
27805 && modified_in_p (cc_reg_1, prev))
27807 enum attr_type prev_type = get_attr_type (prev);
27809	  /* FIXME: this misses some instructions that are considered simple
27810	     arithmetic instructions for ThunderX.  Simple shifts are missed here.  */
27811 if (prev_type == TYPE_ALUS_SREG
27812 || prev_type == TYPE_ALUS_IMM
27813 || prev_type == TYPE_LOGICS_REG
27814 || prev_type == TYPE_LOGICS_IMM)
27815 return true;
27819 /* Fuse ALU instructions and CBZ/CBNZ. */
27820 if (prev_set
27821 && curr_set
27822 && aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_CBZ)
27823 && any_condjump_p (curr))
27825 /* We're trying to match:
27826 prev (alu_insn) == (set (r0) plus ((r0) (r1/imm)))
27827 curr (cbz) == (set (pc) (if_then_else (eq/ne) (r0)
27828 (const_int 0))
27829 (label_ref ("SYM"))
27830 (pc)) */
27831 if (SET_DEST (curr_set) == (pc_rtx)
27832 && GET_CODE (SET_SRC (curr_set)) == IF_THEN_ELSE
27833 && REG_P (XEXP (XEXP (SET_SRC (curr_set), 0), 0))
27834 && REG_P (SET_DEST (prev_set))
27835 && REGNO (SET_DEST (prev_set))
27836 == REGNO (XEXP (XEXP (SET_SRC (curr_set), 0), 0)))
27838	  /* Fuse ALU operations followed by a conditional branch instruction.  */
27839 switch (get_attr_type (prev))
27841 case TYPE_ALU_IMM:
27842 case TYPE_ALU_SREG:
27843 case TYPE_ADC_REG:
27844 case TYPE_ADC_IMM:
27845 case TYPE_ADCS_REG:
27846 case TYPE_ADCS_IMM:
27847 case TYPE_LOGIC_REG:
27848 case TYPE_LOGIC_IMM:
27849 case TYPE_CSEL:
27850 case TYPE_ADR:
27851 case TYPE_MOV_IMM:
27852 case TYPE_SHIFT_REG:
27853 case TYPE_SHIFT_IMM:
27854 case TYPE_BFM:
27855 case TYPE_RBIT:
27856 case TYPE_REV:
27857 case TYPE_EXTEND:
27858 return true;
27860 default:;
27865   /* Fuse A+B+1 and A-B-1.  */
27866 if (simple_sets_p
27867 && aarch64_fusion_enabled_p (AARCH64_FUSE_ADDSUB_2REG_CONST1))
27869 /* We're trying to match:
27870 prev == (set (r0) (plus (r0) (r1)))
27871 curr == (set (r0) (plus (r0) (const_int 1)))
27873 prev == (set (r0) (minus (r0) (r1)))
27874 curr == (set (r0) (plus (r0) (const_int -1))) */
27876 rtx prev_src = SET_SRC (prev_set);
27877 rtx curr_src = SET_SRC (curr_set);
27879 int polarity = 1;
27880 if (GET_CODE (prev_src) == MINUS)
27881 polarity = -1;
27883 if (GET_CODE (curr_src) == PLUS
27884 && (GET_CODE (prev_src) == PLUS || GET_CODE (prev_src) == MINUS)
27885 && CONST_INT_P (XEXP (curr_src, 1))
27886 && INTVAL (XEXP (curr_src, 1)) == polarity
27887 && REG_P (XEXP (curr_src, 0))
27888 && REG_P (SET_DEST (prev_set))
27889 && REGNO (SET_DEST (prev_set)) == REGNO (XEXP (curr_src, 0)))
27890 return true;
27893 return false;
27896 /* Return true iff the instruction fusion described by OP is enabled. */
27898 bool
27899 aarch64_fusion_enabled_p (enum aarch64_fusion_pairs op)
27901 return (aarch64_tune_params.fusible_ops & op) != 0;
27904 /* If MEM is in the form of [base+offset], extract the two parts
27905    of the address and store them in BASE and OFFSET, otherwise return
27906    false after clearing BASE and OFFSET.  */
27908 bool
27909 extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
27911 rtx addr;
27913 gcc_assert (MEM_P (mem));
27915 addr = XEXP (mem, 0);
27917 if (REG_P (addr))
27919 *base = addr;
27920 *offset = const0_rtx;
27921 return true;
27924 if (GET_CODE (addr) == PLUS
27925 && REG_P (XEXP (addr, 0)) && CONST_INT_P (XEXP (addr, 1)))
27927 *base = XEXP (addr, 0);
27928 *offset = XEXP (addr, 1);
27929 return true;
27932 *base = NULL_RTX;
27933 *offset = NULL_RTX;
27935 return false;
27938 /* Types for scheduling fusion. */
27939 enum sched_fusion_type
27941 SCHED_FUSION_NONE = 0,
27942 SCHED_FUSION_LD_SIGN_EXTEND,
27943 SCHED_FUSION_LD_ZERO_EXTEND,
27944 SCHED_FUSION_LD,
27945 SCHED_FUSION_ST,
27946 SCHED_FUSION_NUM
27949 /* If INSN is a load or store with an address in the form of [base+offset],
27950    extract the two parts and store them in BASE and OFFSET.  Return the
27951    scheduling fusion type of this INSN.  */
27953 static enum sched_fusion_type
27954 fusion_load_store (rtx_insn *insn, rtx *base, rtx *offset)
27956 rtx x, dest, src;
27957 enum sched_fusion_type fusion = SCHED_FUSION_LD;
27959 gcc_assert (INSN_P (insn));
27960 x = PATTERN (insn);
27961 if (GET_CODE (x) != SET)
27962 return SCHED_FUSION_NONE;
27964 src = SET_SRC (x);
27965 dest = SET_DEST (x);
27967 machine_mode dest_mode = GET_MODE (dest);
27969 if (!aarch64_mode_valid_for_sched_fusion_p (dest_mode))
27970 return SCHED_FUSION_NONE;
27972 if (GET_CODE (src) == SIGN_EXTEND)
27974 fusion = SCHED_FUSION_LD_SIGN_EXTEND;
27975 src = XEXP (src, 0);
27976 if (!MEM_P (src) || GET_MODE (src) != SImode)
27977 return SCHED_FUSION_NONE;
27979 else if (GET_CODE (src) == ZERO_EXTEND)
27981 fusion = SCHED_FUSION_LD_ZERO_EXTEND;
27982 src = XEXP (src, 0);
27983 if (!MEM_P (src) || GET_MODE (src) != SImode)
27984 return SCHED_FUSION_NONE;
27987 if (MEM_P (src) && REG_P (dest))
27988 extract_base_offset_in_addr (src, base, offset);
27989 else if (MEM_P (dest) && (REG_P (src) || src == const0_rtx))
27991 fusion = SCHED_FUSION_ST;
27992 extract_base_offset_in_addr (dest, base, offset);
27994 else
27995 return SCHED_FUSION_NONE;
27997 if (*base == NULL_RTX || *offset == NULL_RTX)
27998 fusion = SCHED_FUSION_NONE;
28000 return fusion;
28003 /* Implement the TARGET_SCHED_FUSION_PRIORITY hook.
28005    Currently we only support fusing ldr and str instructions, so FUSION_PRI
28006    and PRI are only calculated for these instructions.  For other instructions,
28007    FUSION_PRI and PRI are simply set to MAX_PRI - 1.  In the future, other
28008    types of instruction fusion can be added by returning different priorities.
28010 It's important that irrelevant instructions get the largest FUSION_PRI. */
28012 static void
28013 aarch64_sched_fusion_priority (rtx_insn *insn, int max_pri,
28014 int *fusion_pri, int *pri)
28016 int tmp, off_val;
28017 rtx base, offset;
28018 enum sched_fusion_type fusion;
28020 gcc_assert (INSN_P (insn));
28022 tmp = max_pri - 1;
28023 fusion = fusion_load_store (insn, &base, &offset);
28024 if (fusion == SCHED_FUSION_NONE)
28026 *pri = tmp;
28027 *fusion_pri = tmp;
28028 return;
28031 /* Set FUSION_PRI according to fusion type and base register. */
28032 *fusion_pri = tmp - fusion * FIRST_PSEUDO_REGISTER - REGNO (base);
28034 /* Calculate PRI. */
28035 tmp /= 2;
28037 /* INSN with smaller offset goes first. */
28038 off_val = (int)(INTVAL (offset));
28039 if (off_val >= 0)
28040 tmp -= (off_val & 0xfffff);
28041 else
28042 tmp += ((- off_val) & 0xfffff);
28044 *pri = tmp;
28045 return;
28048 /* Implement the TARGET_SCHED_ADJUST_PRIORITY hook.
28049 Adjust priority of sha1h instructions so they are scheduled before
28050 other SHA1 instructions. */
28052 static int
28053 aarch64_sched_adjust_priority (rtx_insn *insn, int priority)
28055 rtx x = PATTERN (insn);
28057 if (GET_CODE (x) == SET)
28059 x = SET_SRC (x);
28061 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_SHA1H)
28062 return priority + 10;
28065 return priority;
28068 /* If REVERSED is null, return true if memory reference *MEM2 comes
28069 immediately after memory reference *MEM1. Do not change the references
28070 in this case.
28072 Otherwise, check if *MEM1 and *MEM2 are consecutive memory references and,
28073 if they are, try to make them use constant offsets from the same base
28074 register. Return true on success. When returning true, set *REVERSED
28075 to true if *MEM1 comes after *MEM2, false if *MEM1 comes before *MEM2. */
28076 static bool
28077 aarch64_check_consecutive_mems (rtx *mem1, rtx *mem2, bool *reversed)
28079 if (reversed)
28080 *reversed = false;
28082 if (GET_RTX_CLASS (GET_CODE (XEXP (*mem1, 0))) == RTX_AUTOINC
28083 || GET_RTX_CLASS (GET_CODE (XEXP (*mem2, 0))) == RTX_AUTOINC)
28084 return false;
28086 if (!MEM_SIZE_KNOWN_P (*mem1) || !MEM_SIZE_KNOWN_P (*mem2))
28087 return false;
28089 auto size1 = MEM_SIZE (*mem1);
28090 auto size2 = MEM_SIZE (*mem2);
28092 rtx base1, base2, offset1, offset2;
28093 extract_base_offset_in_addr (*mem1, &base1, &offset1);
28094 extract_base_offset_in_addr (*mem2, &base2, &offset2);
28096 /* Make sure at least one memory is in base+offset form. */
28097 if (!(base1 && offset1) && !(base2 && offset2))
28098 return false;
28100 /* If both mems already use the same base register, just check the
28101 offsets. */
28102 if (base1 && base2 && rtx_equal_p (base1, base2))
28104 if (!offset1 || !offset2)
28105 return false;
28107 if (known_eq (UINTVAL (offset1) + size1, UINTVAL (offset2)))
28108 return true;
28110 if (known_eq (UINTVAL (offset2) + size2, UINTVAL (offset1)) && reversed)
28112 *reversed = true;
28113 return true;
28116 return false;
28119 /* Otherwise, check whether the MEM_EXPRs and MEM_OFFSETs together
28120 guarantee that the values are consecutive. */
28121 if (MEM_EXPR (*mem1)
28122 && MEM_EXPR (*mem2)
28123 && MEM_OFFSET_KNOWN_P (*mem1)
28124 && MEM_OFFSET_KNOWN_P (*mem2))
28126 poly_int64 expr_offset1;
28127 poly_int64 expr_offset2;
28128 tree expr_base1 = get_addr_base_and_unit_offset (MEM_EXPR (*mem1),
28129 &expr_offset1);
28130 tree expr_base2 = get_addr_base_and_unit_offset (MEM_EXPR (*mem2),
28131 &expr_offset2);
28132 if (!expr_base1
28133 || !expr_base2
28134 || !DECL_P (expr_base1)
28135 || !operand_equal_p (expr_base1, expr_base2, OEP_ADDRESS_OF))
28136 return false;
28138 expr_offset1 += MEM_OFFSET (*mem1);
28139 expr_offset2 += MEM_OFFSET (*mem2);
28141 if (known_eq (expr_offset1 + size1, expr_offset2))
28143 else if (known_eq (expr_offset2 + size2, expr_offset1) && reversed)
28144 *reversed = true;
28145 else
28146 return false;
28148 if (reversed)
28150 if (base2)
28152 rtx addr1 = plus_constant (Pmode, XEXP (*mem2, 0),
28153 expr_offset1 - expr_offset2);
28154 *mem1 = replace_equiv_address_nv (*mem1, addr1);
28156 else
28158 rtx addr2 = plus_constant (Pmode, XEXP (*mem1, 0),
28159 expr_offset2 - expr_offset1);
28160 *mem2 = replace_equiv_address_nv (*mem2, addr2);
28163 return true;
28166 return false;
28169 /* Test if MODE is suitable for a single transfer register in an ldp or stp
28170 instruction. */
28172 bool
28173 aarch64_ldpstp_operand_mode_p (machine_mode mode)
28175 if (!targetm.hard_regno_mode_ok (V0_REGNUM, mode)
28176 || hard_regno_nregs (V0_REGNUM, mode) > 1)
28177 return false;
28179 const auto size = GET_MODE_SIZE (mode);
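  /* LDP/STP can transfer 4, 8 or 16 bytes per register (the W/S, X/D and Q
     register forms).  */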
28180 return known_eq (size, 4) || known_eq (size, 8) || known_eq (size, 16);
28183 /* Return true if MEM1 and MEM2 can be combined into a single access
28184 of mode MODE, with the combined access having the same address as MEM1. */
28186 bool
28187 aarch64_mergeable_load_pair_p (machine_mode mode, rtx mem1, rtx mem2)
28189 if (STRICT_ALIGNMENT && MEM_ALIGN (mem1) < GET_MODE_ALIGNMENT (mode))
28190 return false;
28191 return aarch64_check_consecutive_mems (&mem1, &mem2, nullptr);
28194 /* Return true if MEM agrees with the ldp-stp policy model.
28195 Otherwise, false. */
28197 bool
28198 aarch64_mem_ok_with_ldpstp_policy_model (rtx mem, bool load, machine_mode mode)
28200 auto policy = (load
28201 ? aarch64_tune_params.ldp_policy_model
28202 : aarch64_tune_params.stp_policy_model);
28204 /* If we have AARCH64_LDP_STP_POLICY_NEVER, reject the load pair. */
28205 if (policy == AARCH64_LDP_STP_POLICY_NEVER)
28206 return false;
28208 /* If we have AARCH64_LDP_STP_POLICY_ALIGNED,
28209 do not emit the load pair unless the alignment is checked to be
28210 at least double the alignment of the type. */
28211 if (policy == AARCH64_LDP_STP_POLICY_ALIGNED
28212 && !optimize_function_for_size_p (cfun)
28213 && MEM_ALIGN (mem) < 2 * GET_MODE_ALIGNMENT (mode))
28214 return false;
28216 return true;
28219 /* Given OPERANDS of consecutive load/store, check if we can merge
28220 them into ldp/stp. LOAD is true if they are load instructions. */
28222 bool
28223 aarch64_operands_ok_for_ldpstp (rtx *operands, bool load)
28225 enum reg_class rclass_1, rclass_2;
28226 rtx mem_1, mem_2, reg_1, reg_2;
28228 if (load)
28230 mem_1 = operands[1];
28231 mem_2 = operands[3];
28232 reg_1 = operands[0];
28233 reg_2 = operands[2];
28234 gcc_assert (REG_P (reg_1) && REG_P (reg_2));
28235 if (REGNO (reg_1) == REGNO (reg_2))
28236 return false;
28237 if (reg_overlap_mentioned_p (reg_1, mem_2))
28238 return false;
28240 else
28242 mem_1 = operands[0];
28243 mem_2 = operands[2];
28244 reg_1 = operands[1];
28245 reg_2 = operands[3];
28248 /* The mems cannot be volatile. */
28249 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2))
28250 return false;
28252 /* Check if the addresses are in the form of [base+offset]. */
28253 bool reversed = false;
28254 if (!aarch64_check_consecutive_mems (&mem_1, &mem_2, &reversed))
28255 return false;
28257 /* The operands must be of the same size. */
28258 gcc_assert (known_eq (GET_MODE_SIZE (GET_MODE (mem_1)),
28259 GET_MODE_SIZE (GET_MODE (mem_2))));
28261 /* The lower memory access must be a mem-pair operand. */
28262 rtx lower_mem = reversed ? mem_2 : mem_1;
28263 machine_mode lower_mem_mode = GET_MODE (lower_mem);
28264 if (!aarch64_mem_pair_operand (lower_mem, lower_mem_mode))
28265 return false;
28267 /* Check if lower_mem is ok with the ldp-stp policy model. */
28268 if (!aarch64_mem_ok_with_ldpstp_policy_model (lower_mem, load,
28269 lower_mem_mode))
28270 return false;
28272 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
28273 rclass_1 = FP_REGS;
28274 else
28275 rclass_1 = GENERAL_REGS;
28277 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
28278 rclass_2 = FP_REGS;
28279 else
28280 rclass_2 = GENERAL_REGS;
28282   /* Check if the registers are of the same class.  */
28283 if (rclass_1 != rclass_2)
28284 return false;
28286 return true;
28289 /* Given OPERANDS of consecutive load/store that can be merged,
28290 swap them if they are not in ascending order. */
28291 void
28292 aarch64_swap_ldrstr_operands (rtx* operands, bool load)
28294 int mem_op = load ? 1 : 0;
28295 bool reversed = false;
28296 if (!aarch64_check_consecutive_mems (operands + mem_op,
28297 operands + mem_op + 2, &reversed))
28298 gcc_unreachable ();
28300 if (reversed)
28302 /* Irrespective of whether this is a load or a store,
28303 we do the same swap. */
28304 std::swap (operands[0], operands[2]);
28305 std::swap (operands[1], operands[3]);
28309 /* Helper function used for generation of load/store pair instructions, called
28310 from peepholes in aarch64-ldpstp.md. OPERANDS is an array of
28311 operands as matched by the peepholes in that file. LOAD_P is true if we're
28312 generating a load pair, otherwise we're generating a store pair. CODE is
28313 either {ZERO,SIGN}_EXTEND for extending loads or UNKNOWN if we're generating a
28314 standard load/store pair. */
28316 void
28317 aarch64_finish_ldpstp_peephole (rtx *operands, bool load_p, enum rtx_code code)
28319 aarch64_swap_ldrstr_operands (operands, load_p);
28321 if (load_p)
28322 emit_insn (aarch64_gen_load_pair (operands[0], operands[2],
28323 operands[1], code));
28324 else
28326 gcc_assert (code == UNKNOWN);
28327 emit_insn (aarch64_gen_store_pair (operands[0], operands[1],
28328 operands[3]));
28332 /* Taking X and Y to be HOST_WIDE_INT pointers, return the result of a
28333 comparison between the two. */
28335 aarch64_host_wide_int_compare (const void *x, const void *y)
28337 return wi::cmps (* ((const HOST_WIDE_INT *) x),
28338 * ((const HOST_WIDE_INT *) y));
28341 /* Taking X and Y to be pairs of RTX, one pointing to a MEM rtx and the
28342 other pointing to a REG rtx containing an offset, compare the offsets
28343 of the two pairs.
28345 Return:
28347 1 iff offset (X) > offset (Y)
28348 0 iff offset (X) == offset (Y)
28349 -1 iff offset (X) < offset (Y) */
28351 aarch64_ldrstr_offset_compare (const void *x, const void *y)
28353 const rtx * operands_1 = (const rtx *) x;
28354 const rtx * operands_2 = (const rtx *) y;
28355 rtx mem_1, mem_2, base, offset_1, offset_2;
28357 if (MEM_P (operands_1[0]))
28358 mem_1 = operands_1[0];
28359 else
28360 mem_1 = operands_1[1];
28362 if (MEM_P (operands_2[0]))
28363 mem_2 = operands_2[0];
28364 else
28365 mem_2 = operands_2[1];
28367 /* Extract the offsets. */
28368 extract_base_offset_in_addr (mem_1, &base, &offset_1);
28369 extract_base_offset_in_addr (mem_2, &base, &offset_2);
28371 gcc_assert (offset_1 != NULL_RTX && offset_2 != NULL_RTX);
28373 return wi::cmps (INTVAL (offset_1), INTVAL (offset_2));
28376 /* Given OPERANDS of consecutive load/store, check if we can merge
28377 them into ldp/stp by adjusting the offset. LOAD is true if they
28378 are load instructions. MODE is the mode of memory operands.
28380 Given below consecutive stores:
28382 str w1, [xb, 0x100]
28383 str w1, [xb, 0x104]
28384 str w1, [xb, 0x108]
28385 str w1, [xb, 0x10c]
28387 Though the offsets are out of the range supported by stp, we can
28388 still pair them after adjusting the offset, like:
28390 add scratch, xb, 0x100
28391 stp w1, w1, [scratch]
28392 stp w1, w1, [scratch, 0x8]
28394 The peephole patterns detecting this opportunity should guarantee
28395    the scratch register is available.  */
28397 bool
28398 aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load,
28399 machine_mode mode)
28401 const int num_insns = 4;
28402 enum reg_class rclass;
28403 HOST_WIDE_INT offvals[num_insns], msize;
28404 rtx mem[num_insns], reg[num_insns], base[num_insns], offset[num_insns];
28406 if (load)
28408 for (int i = 0; i < num_insns; i++)
28410 reg[i] = operands[2 * i];
28411 mem[i] = operands[2 * i + 1];
28413 gcc_assert (REG_P (reg[i]));
28416 /* Do not attempt to merge the loads if the loads clobber each other. */
28417 for (int i = 0; i < 8; i += 2)
28418 for (int j = i + 2; j < 8; j += 2)
28419 if (reg_overlap_mentioned_p (operands[i], operands[j]))
28420 return false;
28422 else
28423 for (int i = 0; i < num_insns; i++)
28425 mem[i] = operands[2 * i];
28426 reg[i] = operands[2 * i + 1];
28429 /* Skip if memory operand is by itself valid for ldp/stp. */
28430 if (!MEM_P (mem[0]) || aarch64_mem_pair_operand (mem[0], mode))
28431 return false;
28433 for (int i = 0; i < num_insns; i++)
28435 /* The mems cannot be volatile. */
28436 if (MEM_VOLATILE_P (mem[i]))
28437 return false;
28439 /* Check if the addresses are in the form of [base+offset]. */
28440 extract_base_offset_in_addr (mem[i], base + i, offset + i);
28441 if (base[i] == NULL_RTX || offset[i] == NULL_RTX)
28442 return false;
28445   /* Check if the registers are of the same class.  */
28446 rclass = REG_P (reg[0]) && FP_REGNUM_P (REGNO (reg[0]))
28447 ? FP_REGS : GENERAL_REGS;
28449 for (int i = 1; i < num_insns; i++)
28450 if (REG_P (reg[i]) && FP_REGNUM_P (REGNO (reg[i])))
28452 if (rclass != FP_REGS)
28453 return false;
28455 else
28457 if (rclass != GENERAL_REGS)
28458 return false;
28461 /* Only the last register in the order in which they occur
28462 may be clobbered by the load. */
28463 if (rclass == GENERAL_REGS && load)
28464 for (int i = 0; i < num_insns - 1; i++)
28465 if (reg_mentioned_p (reg[i], mem[i]))
28466 return false;
28468   /* Check if the bases are the same.  */
28469 for (int i = 0; i < num_insns - 1; i++)
28470 if (!rtx_equal_p (base[i], base[i + 1]))
28471 return false;
28473 for (int i = 0; i < num_insns; i++)
28474 offvals[i] = INTVAL (offset[i]);
28476 msize = GET_MODE_SIZE (mode).to_constant ();
28478 /* Check if the offsets can be put in the right order to do a ldp/stp. */
28479 qsort (offvals, num_insns, sizeof (HOST_WIDE_INT),
28480 aarch64_host_wide_int_compare);
28482 if (!(offvals[1] == offvals[0] + msize
28483 && offvals[3] == offvals[2] + msize))
28484 return false;
28486 /* Check that offsets are within range of each other. The ldp/stp
28487 instructions have 7 bit immediate offsets, so use 0x80. */
28488 if (offvals[2] - offvals[0] >= msize * 0x80)
28489 return false;
28491 /* The offsets must be aligned with respect to each other. */
28492 if (offvals[0] % msize != offvals[2] % msize)
28493 return false;
28495 /* Check if mem[0] is ok with the ldp-stp policy model. */
28496 if (!aarch64_mem_ok_with_ldpstp_policy_model (mem[0], load, mode))
28497 return false;
28499 return true;
28502 /* Given OPERANDS of consecutive load/store, this function pairs them
28503 into LDP/STP after adjusting the offset. It depends on the fact
28504 that the operands can be sorted so the offsets are correct for STP.
28505    MODE is the mode of the memory operands.  CODE is the rtl operator
28506    which should be applied to all memory operands; it is SIGN_EXTEND,
28507    ZERO_EXTEND or UNKNOWN.  */
28509 bool
28510 aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
28511 machine_mode mode, RTX_CODE code)
28513 rtx base, offset_1, offset_2;
28514 rtx mem_1, mem_2;
28515 rtx temp_operands[8];
28516 HOST_WIDE_INT off_val_1, off_val_2, base_off, new_off_1, new_off_2,
28517 stp_off_upper_limit, stp_off_lower_limit, msize;
28519 /* We make changes on a copy as we may still bail out. */
28520 for (int i = 0; i < 8; i ++)
28521 temp_operands[i] = operands[i];
28523 /* Sort the operands. Note for cases as below:
28524 [base + 0x310] = A
28525 [base + 0x320] = B
28526 [base + 0x330] = C
28527 [base + 0x320] = D
28528      We need stable sorting, otherwise wrong data may be stored to offset 0x320.
28529      Also note the dead store in the above case should be optimized away, but
28530      no guarantees here.  */
28531   gcc_stablesort (temp_operands, 4, 2 * sizeof (rtx *),
28532 aarch64_ldrstr_offset_compare);
28534 /* Copy the memory operands so that if we have to bail for some
28535 reason the original addresses are unchanged. */
28536 if (load)
28538 mem_1 = copy_rtx (temp_operands[1]);
28539 mem_2 = copy_rtx (temp_operands[5]);
28541 else
28543 mem_1 = copy_rtx (temp_operands[0]);
28544 mem_2 = copy_rtx (temp_operands[4]);
28545 gcc_assert (code == UNKNOWN);
28548 extract_base_offset_in_addr (mem_1, &base, &offset_1);
28549 extract_base_offset_in_addr (mem_2, &base, &offset_2);
28550 gcc_assert (base != NULL_RTX && offset_1 != NULL_RTX
28551 && offset_2 != NULL_RTX);
28553 /* Adjust offset so it can fit in LDP/STP instruction. */
28554 msize = GET_MODE_SIZE (mode).to_constant ();
28555 stp_off_upper_limit = msize * (0x40 - 1);
28556 stp_off_lower_limit = - msize * 0x40;
28558 off_val_1 = INTVAL (offset_1);
28559 off_val_2 = INTVAL (offset_2);
28561 /* The base offset is optimally halfway between the two STP/LDP offsets. */
28562 if (msize <= 4)
28563 base_off = (off_val_1 + off_val_2) / 2;
28564 else
28565 /* However, due to issues with negative LDP/STP offset generation for
28566 larger modes (DF, DD, DI and vector modes), we must not use negative
28567 addresses smaller than what 9 signed unadjusted bits can store. This
28568 provides the most range in this case. */
28569 base_off = off_val_1;
28571 /* Adjust the base so that it is aligned with the addresses but still
28572 optimal. */
28573 if (base_off % msize != off_val_1 % msize)
28574 /* Fix the offset, bearing in mind we want to make it bigger not
28575 smaller. */
28576 base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
28577 else if (msize <= 4)
28578 /* The negative range of LDP/STP is one larger than the positive range. */
28579 base_off += msize;
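/* Illustrative example (made-up offsets) for SImode, msize == 4, with
   off_val_1 == 0x100 and off_val_2 == 0x108: the midpoint gives
   base_off == 0x104, which is already aligned with off_val_1, so the
   msize <= 4 path above biases it up to 0x108 to exploit the extra
   negative range. The resulting LDP/STP offsets are then -8 and 0,
   comfortably within [-0x100, 0xfc]. */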
28581 /* Check if base offset is too big or too small. We can attempt to resolve
28582 this issue by setting it to the maximum value and seeing if the offsets
28583 still fit. */
28584 if (base_off >= 0x1000)
28586 base_off = 0x1000 - 1;
28587 /* We must still make sure that the base offset is aligned with respect
28588 to the address. But it may not be made any bigger. */
28589 base_off -= (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
28592 /* Likewise for the case where the base is too small. */
28593 if (base_off <= -0x1000)
28595 base_off = -0x1000 + 1;
28596 base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
28599 /* Offset of the first STP/LDP. */
28600 new_off_1 = off_val_1 - base_off;
28602 /* Offset of the second STP/LDP. */
28603 new_off_2 = off_val_2 - base_off;
28605 /* The offsets must be within the range of the LDP/STP instructions. */
28606 if (new_off_1 > stp_off_upper_limit || new_off_1 < stp_off_lower_limit
28607 || new_off_2 > stp_off_upper_limit || new_off_2 < stp_off_lower_limit)
28608 return false;
28610 replace_equiv_address_nv (mem_1, plus_constant (Pmode, operands[8],
28611 new_off_1), true);
28612 replace_equiv_address_nv (mem_2, plus_constant (Pmode, operands[8],
28613 new_off_2), true);
28615 if (!aarch64_mem_pair_operand (mem_1, mode)
28616 || !aarch64_mem_pair_operand (mem_2, mode))
28617 return false;
28619 if (load)
28621 operands[0] = temp_operands[0];
28622 operands[1] = mem_1;
28623 operands[2] = temp_operands[2];
28624 operands[4] = temp_operands[4];
28625 operands[5] = mem_2;
28626 operands[6] = temp_operands[6];
28628 else
28630 operands[0] = mem_1;
28631 operands[1] = temp_operands[1];
28632 operands[3] = temp_operands[3];
28633 operands[4] = mem_2;
28634 operands[5] = temp_operands[5];
28635 operands[7] = temp_operands[7];
28638 /* Emit adjusting instruction. */
28639 emit_insn (gen_rtx_SET (operands[8], plus_constant (DImode, base, base_off)));
28640 /* Emit ldp/stp instructions. */
28641 if (load)
28643 emit_insn (aarch64_gen_load_pair (operands[0], operands[2],
28644 operands[1], code));
28645 emit_insn (aarch64_gen_load_pair (operands[4], operands[6],
28646 operands[5], code));
28648 else
28650 emit_insn (aarch64_gen_store_pair (operands[0], operands[1],
28651 operands[3]));
28652 emit_insn (aarch64_gen_store_pair (operands[4], operands[5],
28653 operands[7]));
28655 return true;
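/* Continuing the illustrative SImode example above, the overall effect is
   roughly (register numbers are arbitrary):

     ldr w1, [x0, 0x100]          add x9, x0, 0x108
     ldr w2, [x0, 0x104]   ==>    ldp w1, w2, [x9, -8]
     ldr w3, [x0, 0x108]          ldp w3, w4, [x9]
     ldr w4, [x0, 0x10c]

   where x9 stands for the scratch register passed in operands[8]. */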
28658 /* Implement TARGET_VECTORIZE_CONDITIONAL_OPERATION_IS_EXPENSIVE. Assume that
28659 predicated operations when available are beneficial. */
28661 static bool
28662 aarch64_conditional_operation_is_expensive (unsigned)
28664 return false;
28667 /* Implement TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE. Assume for now that
28668 it isn't worth branching around empty masked ops (including masked
28669 stores). */
28671 static bool
28672 aarch64_empty_mask_is_expensive (unsigned)
28674 return false;
28677 /* Return 1 if pseudo register should be created and used to hold
28678 GOT address for PIC code. */
28680 bool
28681 aarch64_use_pseudo_pic_reg (void)
28683 return aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC;
28686 /* Implement TARGET_UNSPEC_MAY_TRAP_P. */
28688 static int
28689 aarch64_unspec_may_trap_p (const_rtx x, unsigned flags)
28691 switch (XINT (x, 1))
28693 case UNSPEC_GOTSMALLPIC:
28694 case UNSPEC_GOTSMALLPIC28K:
28695 case UNSPEC_GOTTINYPIC:
28696 return 0;
28697 default:
28698 break;
28701 return default_unspec_may_trap_p (x, flags);
28705 /* If X is a positive CONST_DOUBLE with a value that is a power of 2
28706 return the log2 of that value. Otherwise return -1. */
28708 int
28709 aarch64_fpconst_pow_of_2 (rtx x)
28711 const REAL_VALUE_TYPE *r;
28713 if (!CONST_DOUBLE_P (x))
28714 return -1;
28716 r = CONST_DOUBLE_REAL_VALUE (x);
28718 if (REAL_VALUE_NEGATIVE (*r)
28719 || REAL_VALUE_ISNAN (*r)
28720 || REAL_VALUE_ISINF (*r)
28721 || !real_isinteger (r, DFmode))
28722 return -1;
28724 return exact_log2 (real_to_integer (r));
28727 /* If X is a positive CONST_DOUBLE with a value that is the reciprocal of a
28728 power of 2 (i.e. 1/2^n), return the number of float bits, e.g. for x == 1/2^n
28729 return n. Otherwise return -1. */
28731 int
28732 aarch64_fpconst_pow2_recip (rtx x)
28734 REAL_VALUE_TYPE r0;
28736 if (!CONST_DOUBLE_P (x))
28737 return -1;
28739 r0 = *CONST_DOUBLE_REAL_VALUE (x);
28740 if (exact_real_inverse (DFmode, &r0)
28741 && !REAL_VALUE_NEGATIVE (r0))
28743 int ret = exact_log2 (real_to_integer (&r0));
28744 if (ret >= 1 && ret <= 32)
28745 return ret;
28747 return -1;
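/* Some illustrative values for the two helpers above: 8.0 yields 3 and 1.0
   yields 0 from aarch64_fpconst_pow_of_2, while 6.0, 0.5 and -4.0 all yield
   -1 (not an integral power of 2, not an integer, negative). For
   aarch64_fpconst_pow2_recip, 0.25 yields 2 and 0.125 yields 3, whereas 1.0
   yields -1 (the result must be in [1, 32]) and 0.1 yields -1 because it has
   no exact reciprocal. */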
28750 /* If X is a vector of equal CONST_DOUBLE values and that value is
28751 Y, return the aarch64_fpconst_pow_of_2 of Y. Otherwise return -1. */
28753 int
28754 aarch64_vec_fpconst_pow_of_2 (rtx x)
28756 int nelts;
28757 if (!CONST_VECTOR_P (x)
28758 || !CONST_VECTOR_NUNITS (x).is_constant (&nelts))
28759 return -1;
28761 if (GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_FLOAT)
28762 return -1;
28764 int firstval = aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, 0));
28765 if (firstval <= 0)
28766 return -1;
28768 for (int i = 1; i < nelts; i++)
28769 if (aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, i)) != firstval)
28770 return -1;
28772 return firstval;
28775 /* Implement TARGET_PROMOTED_TYPE to promote 16-bit floating point types
28776 to float.
28778 __fp16 always promotes through this hook.
28779 _Float16 may promote if TARGET_FLT_EVAL_METHOD is 16, but we do that
28780 through the generic excess precision logic rather than here. */
28782 static tree
28783 aarch64_promoted_type (const_tree t)
28785 if (SCALAR_FLOAT_TYPE_P (t)
28786 && TYPE_MAIN_VARIANT (t) == aarch64_fp16_type_node)
28787 return float_type_node;
28789 return NULL_TREE;
28792 /* Implement the TARGET_OPTAB_SUPPORTED_P hook. */
28794 static bool
28795 aarch64_optab_supported_p (int op, machine_mode mode1, machine_mode,
28796 optimization_type opt_type)
28798 switch (op)
28800 case rsqrt_optab:
28801 return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p (mode1);
28803 default:
28804 return true;
28808 /* Implement the TARGET_DWARF_POLY_INDETERMINATE_VALUE hook. */
28810 static unsigned int
28811 aarch64_dwarf_poly_indeterminate_value (unsigned int i, unsigned int *factor,
28812 int *offset)
28814 /* Polynomial invariant 1 == (VG / 2) - 1. */
28815 gcc_assert (i == 1);
28816 *factor = 2;
28817 *offset = 1;
28818 return AARCH64_DWARF_VG;
28821 /* Implement TARGET_LIBGCC_FLOATING_POINT_MODE_SUPPORTED_P - return TRUE
28822 if MODE is [BH]Fmode, and punt to the generic implementation otherwise. */
28824 static bool
28825 aarch64_libgcc_floating_mode_supported_p (scalar_float_mode mode)
28827 return ((mode == HFmode || mode == BFmode)
28828 ? true
28829 : default_libgcc_floating_mode_supported_p (mode));
28832 /* Implement TARGET_SCALAR_MODE_SUPPORTED_P - return TRUE
28833 if MODE is [BH]Fmode, and punt to the generic implementation otherwise. */
28835 static bool
28836 aarch64_scalar_mode_supported_p (scalar_mode mode)
28838 if (DECIMAL_FLOAT_MODE_P (mode))
28839 return default_decimal_float_supported_p ();
28841 return ((mode == HFmode || mode == BFmode)
28842 ? true
28843 : default_scalar_mode_supported_p (mode));
28846 /* Set the value of FLT_EVAL_METHOD.
28847 ISO/IEC TS 18661-3 defines two values that we'd like to make use of:
28849 0: evaluate all operations and constants, whose semantic type has at
28850 most the range and precision of type float, to the range and
28851 precision of float; evaluate all other operations and constants to
28852 the range and precision of the semantic type;
28854 N, where _FloatN is a supported interchange floating type:
28855 evaluate all operations and constants, whose semantic type has at
28856 most the range and precision of _FloatN type, to the range and
28857 precision of the _FloatN type; evaluate all other operations and
28858 constants to the range and precision of the semantic type;
28860 If we have the ARMv8.2-A extensions then we support _Float16 in native
28861 precision, so we should set this to 16. Otherwise, we support the type,
28862 but want to evaluate expressions in float precision, so set this to
28863 0. */
28865 static enum flt_eval_method
28866 aarch64_excess_precision (enum excess_precision_type type)
28868 switch (type)
28870 case EXCESS_PRECISION_TYPE_FAST:
28871 case EXCESS_PRECISION_TYPE_STANDARD:
28872 /* We can calculate either in 16-bit range and precision or
28873 32-bit range and precision. Make that decision based on whether
28874 we have native support for the ARMv8.2-A 16-bit floating-point
28875 instructions or not. */
28876 return (TARGET_FP_F16INST
28877 ? FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16
28878 : FLT_EVAL_METHOD_PROMOTE_TO_FLOAT);
28879 case EXCESS_PRECISION_TYPE_IMPLICIT:
28880 case EXCESS_PRECISION_TYPE_FLOAT16:
28881 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16;
28882 default:
28883 gcc_unreachable ();
28885 return FLT_EVAL_METHOD_UNPREDICTABLE;
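/* For example, from the user's point of view, given

     _Float16 a, b, c;
     ...
     c = a * b + c;

   a target with the ARMv8.2-A FP16 instructions evaluates the product and
   sum directly in _Float16 (FLT_EVAL_METHOD == 16), whereas without them the
   operands are promoted to float, the arithmetic is done in single precision
   and the result is converted back (FLT_EVAL_METHOD == 0). */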
28888 /* Implement TARGET_C_BITINT_TYPE_INFO.
28889 Return true if _BitInt(N) is supported and fill its details into *INFO. */
28890 bool
28891 aarch64_bitint_type_info (int n, struct bitint_info *info)
28893 if (TARGET_BIG_END)
28894 return false;
28896 if (n <= 8)
28897 info->limb_mode = QImode;
28898 else if (n <= 16)
28899 info->limb_mode = HImode;
28900 else if (n <= 32)
28901 info->limb_mode = SImode;
28902 else if (n <= 64)
28903 info->limb_mode = DImode;
28904 else if (n <= 128)
28905 info->limb_mode = TImode;
28906 else
28907 /* The AAPCS for AArch64 defines _BitInt(N > 128) as an array with
28908 type {signed,unsigned} __int128[M] where M*128 >= N. However, to be
28909 able to use libgcc's implementation to support large _BitInt's we need
28910 to use a LIMB_MODE that is no larger than 'long long'. This is why we
28911 use DImode for our internal LIMB_MODE and we define the ABI_LIMB_MODE to
28912 be TImode to ensure we are ABI compliant. */
28913 info->limb_mode = DImode;
28915 if (n > 128)
28916 info->abi_limb_mode = TImode;
28917 else
28918 info->abi_limb_mode = info->limb_mode;
28919 info->big_endian = TARGET_BIG_END;
28920 info->extended = false;
28921 return true;
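/* For example, _BitInt(200) on little-endian AArch64 gets limb_mode ==
   DImode and abi_limb_mode == TImode: libgcc operates on 64-bit limbs
   internally, while the AAPCS64 layout is that of unsigned __int128[2]
   (the smallest M with M * 128 >= 200 is 2). */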
28924 /* Implement TARGET_C_MODE_FOR_FLOATING_TYPE. Return TFmode for
28925 TI_LONG_DOUBLE_TYPE which is for long double type, go with the default
28926 one for the others. */
28928 static machine_mode
28929 aarch64_c_mode_for_floating_type (enum tree_index ti)
28931 if (ti == TI_LONG_DOUBLE_TYPE)
28932 return TFmode;
28933 return default_mode_for_floating_type (ti);
28936 /* Implement TARGET_SCHED_CAN_SPECULATE_INSN. Return true if INSN can be
28937 scheduled for speculative execution. Reject the long-running division
28938 and square-root instructions. */
28940 static bool
28941 aarch64_sched_can_speculate_insn (rtx_insn *insn)
28943 switch (get_attr_type (insn))
28945 case TYPE_SDIV:
28946 case TYPE_UDIV:
28947 case TYPE_FDIVS:
28948 case TYPE_FDIVD:
28949 case TYPE_FSQRTS:
28950 case TYPE_FSQRTD:
28951 case TYPE_NEON_FP_SQRT_S:
28952 case TYPE_NEON_FP_SQRT_D:
28953 case TYPE_NEON_FP_SQRT_S_Q:
28954 case TYPE_NEON_FP_SQRT_D_Q:
28955 case TYPE_NEON_FP_DIV_S:
28956 case TYPE_NEON_FP_DIV_D:
28957 case TYPE_NEON_FP_DIV_S_Q:
28958 case TYPE_NEON_FP_DIV_D_Q:
28959 return false;
28960 default:
28961 return true;
28965 /* Implement TARGET_COMPUTE_PRESSURE_CLASSES. */
28967 static int
28968 aarch64_compute_pressure_classes (reg_class *classes)
28970 int i = 0;
28971 classes[i++] = GENERAL_REGS;
28972 classes[i++] = FP_REGS;
28973 /* PR_REGS isn't a useful pressure class because many predicate pseudo
28974 registers need to go in PR_LO_REGS at some point during their
28975 lifetime. Splitting it into two halves has the effect of making
28976 all predicates count against PR_LO_REGS, so that we try whenever
28977 possible to restrict the number of live predicates to 8. This
28978 greatly reduces the amount of spilling in certain loops. */
28979 classes[i++] = PR_LO_REGS;
28980 classes[i++] = PR_HI_REGS;
28981 return i;
28984 /* Implement TARGET_CAN_CHANGE_MODE_CLASS. */
28986 static bool
28987 aarch64_can_change_mode_class (machine_mode from,
28988 machine_mode to, reg_class_t)
28990 return aarch64_modes_compatible_p (from, to);
28993 /* Implement TARGET_EARLY_REMAT_MODES. */
28995 static void
28996 aarch64_select_early_remat_modes (sbitmap modes)
28998 /* SVE values are not normally live across a call, so it should be
28999 worth doing early rematerialization even in VL-specific mode. */
29000 for (int i = 0; i < NUM_MACHINE_MODES; ++i)
29001 if (aarch64_sve_mode_p ((machine_mode) i))
29002 bitmap_set_bit (modes, i);
29005 /* Override the default target speculation_safe_value. */
29006 static rtx
29007 aarch64_speculation_safe_value (machine_mode mode,
29008 rtx result, rtx val, rtx failval)
29010 /* Maybe we should warn if falling back to hard barriers. They are
29011 likely to be noticeably more expensive than the alternative below. */
29012 if (!aarch64_track_speculation)
29013 return default_speculation_safe_value (mode, result, val, failval);
29015 if (!REG_P (val))
29016 val = copy_to_mode_reg (mode, val);
29018 if (!aarch64_reg_or_zero (failval, mode))
29019 failval = copy_to_mode_reg (mode, failval);
29021 emit_insn (gen_despeculate_copy (mode, result, val, failval));
29022 return result;
29025 /* Implement TARGET_ESTIMATED_POLY_VALUE.
29026 Look into the tuning structure for an estimate.
29027 KIND specifies the type of requested estimate: min, max or likely.
29028 For cores with a known SVE width all three estimates are the same.
29029 For generic SVE tuning we want to distinguish the maximum estimate from
29030 the minimum and likely ones.
29031 The likely estimate is the same as the minimum in that case to give a
29032 conservative behavior of auto-vectorizing with SVE when it is a win
29033 even for 128-bit SVE.
29034 When SVE width information is available VAL.coeffs[1] is multiplied by
29035 the number of VQ chunks over the initial Advanced SIMD 128 bits. */
29037 static HOST_WIDE_INT
29038 aarch64_estimated_poly_value (poly_int64 val,
29039 poly_value_estimate_kind kind
29040 = POLY_VALUE_LIKELY)
29042 unsigned int width_source = aarch64_tune_params.sve_width;
29044 /* If there is no core-specific information then the minimum and likely
29045 values are based on 128-bit vectors and the maximum is based on
29046 the architectural maximum of 2048 bits. */
29047 if (width_source == SVE_SCALABLE)
29048 switch (kind)
29050 case POLY_VALUE_MIN:
29051 case POLY_VALUE_LIKELY:
29052 return val.coeffs[0];
29053 case POLY_VALUE_MAX:
29054 return val.coeffs[0] + val.coeffs[1] * 15;
29057 /* Allow sve_width to be a bitmask of different VL, treating the lowest
29058 as likely. This could be made more general if future -mtune options
29059 need it to be. */
29060 if (kind == POLY_VALUE_MAX)
29061 width_source = 1 << floor_log2 (width_source);
29062 else
29063 width_source = least_bit_hwi (width_source);
29065 /* If the core provides width information, use that. */
29066 HOST_WIDE_INT over_128 = width_source - 128;
29067 return val.coeffs[0] + val.coeffs[1] * over_128 / 128;
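/* Worked example with illustrative numbers: for val == 16 + 16x (the byte
   size of an SVE vector, where x is the number of 128-bit quadwords beyond
   the first), generic tuning (SVE_SCALABLE) estimates 16 for the minimum and
   likely values and 16 + 16 * 15 == 256 for the maximum, matching the
   architectural limit of 2048 bits. With a core-specific sve_width of 256,
   over_128 is 128 and all three estimates become 16 + 16 * 128 / 128 == 32
   bytes. */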
29071 /* Return true for types that could be supported as SIMD return or
29072 argument types. */
29074 static bool
29075 supported_simd_type (tree t)
29077 if (SCALAR_FLOAT_TYPE_P (t) || INTEGRAL_TYPE_P (t) || POINTER_TYPE_P (t))
29079 HOST_WIDE_INT s = tree_to_shwi (TYPE_SIZE_UNIT (t));
29080 return s == 1 || s == 2 || s == 4 || s == 8;
29082 return false;
29085 /* Determine the lane size for the clone argument/return type. This follows
29086 the LS(P) rule in the VFABIA64. */
29088 static unsigned
29089 lane_size (cgraph_simd_clone_arg_type clone_arg_type, tree type)
29091 gcc_assert (clone_arg_type != SIMD_CLONE_ARG_TYPE_MASK);
29093 /* For non map-to-vector types that are pointers we use the type they
29094 point to. */
29095 if (POINTER_TYPE_P (type))
29096 switch (clone_arg_type)
29098 default:
29099 break;
29100 case SIMD_CLONE_ARG_TYPE_UNIFORM:
29101 case SIMD_CLONE_ARG_TYPE_LINEAR_CONSTANT_STEP:
29102 case SIMD_CLONE_ARG_TYPE_LINEAR_VARIABLE_STEP:
29103 type = TREE_TYPE (type);
29104 break;
29107 /* For types (or, for pointers of non map-to-vector types, the types they
29108 point to) that are integers or floating point, we use their size if it is
29109 1, 2, 4 or 8 bytes. */
29110 if (INTEGRAL_TYPE_P (type)
29111 || SCALAR_FLOAT_TYPE_P (type))
29112 switch (TYPE_PRECISION (type) / BITS_PER_UNIT)
29114 default:
29115 break;
29116 case 1:
29117 case 2:
29118 case 4:
29119 case 8:
29120 return TYPE_PRECISION (type);
29122 /* For any other type we use the size of uintptr_t. For map-to-vector types
29123 that are pointers, using the size of uintptr_t is the same as using the
29124 size of their type, since all pointers are the same size as uintptr_t. */
29125 return POINTER_SIZE;
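/* A few illustrative applications of the rule: a uniform parameter of type
   int16_t * uses the pointee type and gets a lane size of 16 bits; a
   map-to-vector parameter of type double gets 64 bits; a map-to-vector
   parameter of type float * is not dereferenced and so, like a pointer to a
   struct, falls back to POINTER_SIZE (64 bits for LP64). */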
29129 /* Implement TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN. */
29131 static int
29132 aarch64_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node *node,
29133 struct cgraph_simd_clone *clonei,
29134 tree base_type ATTRIBUTE_UNUSED,
29135 int num, bool explicit_p)
29137 tree t, ret_type;
29138 unsigned int nds_elt_bits, wds_elt_bits;
29139 unsigned HOST_WIDE_INT const_simdlen;
29141 if (!TARGET_SIMD)
29142 return 0;
29144 /* For now, SVE simdclones won't produce an illegal simdlen, so only check
29145 constant simdlens here. */
29146 if (maybe_ne (clonei->simdlen, 0U)
29147 && clonei->simdlen.is_constant (&const_simdlen)
29148 && (const_simdlen < 2
29149 || const_simdlen > 1024
29150 || (const_simdlen & (const_simdlen - 1)) != 0))
29152 if (explicit_p)
29153 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
29154 "unsupported simdlen %wd", const_simdlen);
29155 return 0;
29158 ret_type = TREE_TYPE (TREE_TYPE (node->decl));
29159 /* According to AArch64's Vector ABI the type that determines the simdlen is
29160 the narrowest of types, so we ignore base_type for AArch64. */
29161 if (TREE_CODE (ret_type) != VOID_TYPE
29162 && !supported_simd_type (ret_type))
29164 if (!explicit_p)
29166 else if (COMPLEX_FLOAT_TYPE_P (ret_type))
29167 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
29168 "GCC does not currently support return type %qT "
29169 "for simd", ret_type);
29170 else
29171 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
29172 "unsupported return type %qT for simd",
29173 ret_type);
29174 return 0;
29177 auto_vec<std::pair <tree, unsigned int>> vec_elts (clonei->nargs + 1);
29179 /* We are looking for the NDS type here according to the VFABIA64. */
29180 if (TREE_CODE (ret_type) != VOID_TYPE)
29182 nds_elt_bits = lane_size (SIMD_CLONE_ARG_TYPE_VECTOR, ret_type);
29183 wds_elt_bits = nds_elt_bits;
29184 vec_elts.safe_push (std::make_pair (ret_type, nds_elt_bits));
29186 else
29188 nds_elt_bits = POINTER_SIZE;
29189 wds_elt_bits = 0;
29192 int i;
29193 tree type_arg_types = TYPE_ARG_TYPES (TREE_TYPE (node->decl));
29194 bool decl_arg_p = (node->definition || type_arg_types == NULL_TREE);
29195 for (t = (decl_arg_p ? DECL_ARGUMENTS (node->decl) : type_arg_types), i = 0;
29196 t && t != void_list_node; t = TREE_CHAIN (t), i++)
29198 tree type = decl_arg_p ? TREE_TYPE (t) : TREE_VALUE (t);
29199 if (clonei->args[i].arg_type != SIMD_CLONE_ARG_TYPE_UNIFORM
29200 && !supported_simd_type (type))
29202 if (!explicit_p)
29204 else if (COMPLEX_FLOAT_TYPE_P (type))
29205 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
29206 "GCC does not currently support argument type %qT "
29207 "for simd", type);
29208 else
29209 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
29210 "unsupported argument type %qT for simd",
29211 type);
29212 return 0;
29214 unsigned lane_bits = lane_size (clonei->args[i].arg_type, type);
29215 if (clonei->args[i].arg_type == SIMD_CLONE_ARG_TYPE_VECTOR)
29216 vec_elts.safe_push (std::make_pair (type, lane_bits));
29217 if (nds_elt_bits > lane_bits)
29218 nds_elt_bits = lane_bits;
29219 if (wds_elt_bits < lane_bits)
29220 wds_elt_bits = lane_bits;
29223 /* If we could not determine the WDS type from available parameters/return,
29224 then fall back to using uintptr_t. */
29225 if (wds_elt_bits == 0)
29226 wds_elt_bits = POINTER_SIZE;
29228 clonei->mask_mode = VOIDmode;
29229 poly_uint64 simdlen;
29230 typedef struct
29232 poly_uint64 len;
29233 char mangle;
29234 } aarch64_clone_info;
29235 auto_vec<aarch64_clone_info, 3> clones;
29237 /* Keep track of the possible simdlens the clones of this function can have,
29238 and check them later to see if we support them. */
29239 if (known_eq (clonei->simdlen, 0U))
29241 simdlen = exact_div (poly_uint64 (64), nds_elt_bits);
29242 if (maybe_ne (simdlen, 1U))
29243 clones.safe_push ({simdlen, 'n'});
29244 clones.safe_push ({simdlen * 2, 'n'});
29245 /* Only create an SVE simd clone if we aren't dealing with an unprototyped
29246 function.
29247 We have also disabled support for creating SVE simdclones for functions
29248 with function bodies and any simdclones when -msve-vector-bits is used.
29249 TODO: add support for these. */
29250 if (prototype_p (TREE_TYPE (node->decl))
29251 && !node->definition
29252 && !aarch64_sve_vg.is_constant ())
29253 clones.safe_push ({exact_div (BITS_PER_SVE_VECTOR, wds_elt_bits), 's'});
29255 else
29256 clones.safe_push ({clonei->simdlen, 'n'});
29258 clonei->vecsize_int = 0;
29259 clonei->vecsize_float = 0;
29261 /* We currently do not support generating simdclones where vector arguments
29262 do not fit into a single vector register, i.e. vector types that are more
29263 than 128 bits wide. This is because of how we currently represent such
29264 types in ACLE, where we use a struct to allow us to pass them as arguments
29265 and return values.
29266 Hence we have to check whether the simdlens available for this
29267 simdclone would cause a vector type to be larger than 128-bits, and reject
29268 such a clone. */
29269 unsigned j = 0;
29270 while (j < clones.length ())
29272 bool remove_simdlen = false;
29273 for (auto elt : vec_elts)
29274 if (clones[j].mangle == 'n'
29275 && known_gt (clones[j].len * elt.second, 128U))
29277 /* Don't issue a warning for every simdclone when there is no
29278 specific simdlen clause. */
29279 if (explicit_p && maybe_ne (clonei->simdlen, 0U))
29280 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
29281 "GCC does not currently support simdlen %wd for "
29282 "type %qT",
29283 constant_lower_bound (clones[j].len), elt.first);
29284 remove_simdlen = true;
29285 break;
29287 if (remove_simdlen)
29288 clones.ordered_remove (j);
29289 else
29290 j++;
29293 int count = clones.length ();
29294 if (count == 0)
29296 if (explicit_p && known_eq (clonei->simdlen, 0U))
29298 /* Warn the user if we can't generate any simdclone. */
29299 simdlen = exact_div (poly_uint64 (64), nds_elt_bits);
29300 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
29301 "GCC does not currently support a simdclone with simdlens"
29302 " %wd and %wd for these types.",
29303 constant_lower_bound (simdlen),
29304 constant_lower_bound (simdlen*2));
29306 return 0;
29309 gcc_assert (num < count);
29310 clonei->simdlen = clones[num].len;
29311 clonei->vecsize_mangle = clones[num].mangle;
29312 /* SVE simdclones always have a Mask, so set inbranch to 1. */
29313 if (clonei->vecsize_mangle == 's')
29314 clonei->inbranch = 1;
29315 return count;
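/* Rough example of the selection logic: for a prototyped declaration
   double dot (double, double) marked with #pragma omp declare simd and no
   simdlen clause, the NDS and WDS are both 64 bits, so the candidates are an
   Advanced SIMD clone with simdlen 2 (the simdlen-1 candidate is dropped)
   and, when the SVE vector length is not fixed with -msve-vector-bits and
   the clone is for a declaration rather than a definition, a
   vector-length-agnostic SVE clone whose simdlen is VL / 64. All vector
   arguments fit in 128 bits, so both candidates survive the filter above. */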
29318 /* Helper function to adjust an SVE vector type of an SVE simd clone. Returns
29319 an SVE vector type based on the element type of the vector TYPE, with SIMDLEN
29320 number of elements. If IS_MASK, returns an SVE mask type appropriate for use
29321 with the SVE type it would otherwise return. */
29323 static tree
29324 simd_clone_adjust_sve_vector_type (tree type, bool is_mask, poly_uint64 simdlen)
29326 unsigned int num_zr = 0;
29327 unsigned int num_pr = 0;
29328 machine_mode vector_mode;
29329 type = TREE_TYPE (type);
29330 scalar_mode scalar_m = SCALAR_TYPE_MODE (type);
29331 vector_mode = aarch64_sve_data_mode (scalar_m, simdlen).require ();
29332 type = build_vector_type_for_mode (type, vector_mode);
29333 if (is_mask)
29335 type = truth_type_for (type);
29336 num_pr = 1;
29338 else
29339 num_zr = 1;
29341 /* We create new types here with the SVE type attribute instead of using ACLE
29342 types as we need to support unpacked vectors which aren't available as
29343 ACLE SVE types. */
29345 /* ??? This creates anonymous "SVE type" attributes for all types,
29346 even those that correspond to <arm_sve.h> types. This affects type
29347 compatibility in C/C++, but not in gimple. (Gimple type equivalence
29348 is instead decided by TARGET_COMPATIBLE_VECTOR_TYPES_P.)
29350 Thus a C/C++ definition of the implementation function will have a
29351 different function type from the declaration that this code creates.
29352 However, it doesn't seem worth trying to fix that until we have a
29353 way of handling implementations that operate on unpacked types. */
29354 type = build_distinct_type_copy (type);
29355 aarch64_sve::add_sve_type_attribute (type, num_zr, num_pr, NULL, NULL);
29356 return type;
29359 /* Implement TARGET_SIMD_CLONE_ADJUST. */
29360 static void
29361 aarch64_simd_clone_adjust (struct cgraph_node *node)
29363 tree t = TREE_TYPE (node->decl);
29365 if (node->simdclone->vecsize_mangle == 's')
29367 /* This is additive and has no effect if SVE, or a superset thereof, is
29368 already enabled. */
29369 tree target = build_string (strlen ("+sve") + 1, "+sve");
29370 if (!aarch64_option_valid_attribute_p (node->decl, NULL_TREE, target, 0))
29371 gcc_unreachable ();
29372 push_function_decl (node->decl);
29374 else
29376 /* Add aarch64_vector_pcs target attribute to SIMD clones so they
29377 use the correct ABI. */
29378 TYPE_ATTRIBUTES (t) = make_attribute ("aarch64_vector_pcs", "default",
29379 TYPE_ATTRIBUTES (t));
29382 cgraph_simd_clone *sc = node->simdclone;
29384 for (unsigned i = 0; i < sc->nargs; ++i)
29386 bool is_mask = false;
29387 tree type;
29388 switch (sc->args[i].arg_type)
29390 case SIMD_CLONE_ARG_TYPE_MASK:
29391 is_mask = true;
29392 gcc_fallthrough ();
29393 case SIMD_CLONE_ARG_TYPE_VECTOR:
29394 case SIMD_CLONE_ARG_TYPE_LINEAR_VAL_CONSTANT_STEP:
29395 case SIMD_CLONE_ARG_TYPE_LINEAR_VAL_VARIABLE_STEP:
29396 type = sc->args[i].vector_type;
29397 gcc_assert (VECTOR_TYPE_P (type));
29398 if (node->simdclone->vecsize_mangle == 's')
29399 type = simd_clone_adjust_sve_vector_type (type, is_mask,
29400 sc->simdlen);
29401 sc->args[i].vector_type = type;
29402 break;
29403 default:
29404 continue;
29407 if (node->simdclone->vecsize_mangle == 's')
29409 tree ret_type = TREE_TYPE (t);
29410 if (VECTOR_TYPE_P (ret_type))
29411 TREE_TYPE (t)
29412 = simd_clone_adjust_sve_vector_type (ret_type, false,
29413 node->simdclone->simdlen);
29414 pop_function_decl ();
29418 /* Implement TARGET_SIMD_CLONE_USABLE. */
29420 static int
29421 aarch64_simd_clone_usable (struct cgraph_node *node, machine_mode vector_mode)
29423 switch (node->simdclone->vecsize_mangle)
29425 case 'n':
29426 if (!TARGET_SIMD || aarch64_sve_mode_p (vector_mode))
29427 return -1;
29428 return 0;
29429 case 's':
29430 if (!TARGET_SVE
29431 || !aarch64_sve_mode_p (vector_mode))
29432 return -1;
29433 return 0;
29434 default:
29435 gcc_unreachable ();
29439 /* Implement TARGET_COMP_TYPE_ATTRIBUTES */
29441 static int
29442 aarch64_comp_type_attributes (const_tree type1, const_tree type2)
29444 auto check_attr = [&](const char *ns, const char *name) {
29445 tree attr1 = lookup_attribute (ns, name, TYPE_ATTRIBUTES (type1));
29446 tree attr2 = lookup_attribute (ns, name, TYPE_ATTRIBUTES (type2));
29447 if (!attr1 && !attr2)
29448 return true;
29450 return attr1 && attr2 && attribute_value_equal (attr1, attr2);
29453 if (!check_attr ("gnu", "aarch64_vector_pcs"))
29454 return 0;
29455 if (!check_attr ("gnu", "indirect_return"))
29456 return 0;
29457 if (!check_attr ("gnu", "Advanced SIMD type"))
29458 return 0;
29459 if (!check_attr ("gnu", "SVE type"))
29460 return 0;
29461 if (!check_attr ("gnu", "SVE sizeless type"))
29462 return 0;
29463 if (!check_attr ("arm", "streaming"))
29464 return 0;
29465 if (!check_attr ("arm", "streaming_compatible"))
29466 return 0;
29467 if (aarch64_lookup_shared_state_flags (TYPE_ATTRIBUTES (type1), "za")
29468 != aarch64_lookup_shared_state_flags (TYPE_ATTRIBUTES (type2), "za"))
29469 return 0;
29470 if (aarch64_lookup_shared_state_flags (TYPE_ATTRIBUTES (type1), "zt0")
29471 != aarch64_lookup_shared_state_flags (TYPE_ATTRIBUTES (type2), "zt0"))
29472 return 0;
29473 return 1;
29476 /* Implement TARGET_MERGE_DECL_ATTRIBUTES. */
29478 static tree
29479 aarch64_merge_decl_attributes (tree olddecl, tree newdecl)
29481 tree old_attrs = DECL_ATTRIBUTES (olddecl);
29482 tree old_new = lookup_attribute ("arm", "new", old_attrs);
29484 tree new_attrs = DECL_ATTRIBUTES (newdecl);
29485 tree new_new = lookup_attribute ("arm", "new", new_attrs);
29487 if (DECL_INITIAL (olddecl) && new_new)
29489 error ("cannot apply attribute %qs to %q+D after the function"
29490 " has been defined", "new", newdecl);
29491 inform (DECL_SOURCE_LOCATION (olddecl), "%q+D defined here",
29492 newdecl);
29494 else
29496 if (old_new && new_new)
29498 old_attrs = remove_attribute ("arm", "new", old_attrs);
29499 TREE_VALUE (new_new) = chainon (TREE_VALUE (new_new),
29500 TREE_VALUE (old_new));
29502 if (new_new)
29503 aarch64_check_arm_new_against_type (TREE_VALUE (new_new), newdecl);
29506 return merge_attributes (old_attrs, new_attrs);
29509 /* Implement TARGET_GET_MULTILIB_ABI_NAME */
29511 static const char *
29512 aarch64_get_multilib_abi_name (void)
29514 if (TARGET_BIG_END)
29515 return TARGET_ILP32 ? "aarch64_be_ilp32" : "aarch64_be";
29516 return TARGET_ILP32 ? "aarch64_ilp32" : "aarch64";
29519 /* Implement TARGET_STACK_PROTECT_GUARD. In case of a
29520 global variable based guard use the default else
29521 return a null tree. */
29522 static tree
29523 aarch64_stack_protect_guard (void)
29525 if (aarch64_stack_protector_guard == SSP_GLOBAL)
29526 return default_stack_protect_guard ();
29528 return NULL_TREE;
29531 /* Implement TARGET_INVALID_UNARY_OP. */
29533 static const char *
29534 aarch64_invalid_unary_op (int op, const_tree type)
29536 /* Reject all single-operand operations on __mfp8 except for &. */
29537 if (TYPE_MAIN_VARIANT (type) == aarch64_mfp8_type_node && op != ADDR_EXPR)
29538 return N_ ("operation not permitted on type %<mfloat8_t%>");
29540 /* Operation allowed. */
29541 return NULL;
29544 /* Implement TARGET_INVALID_BINARY_OP. */
29546 static const char *
29547 aarch64_invalid_binary_op (int op ATTRIBUTE_UNUSED, const_tree type1,
29548 const_tree type2)
29550 if (VECTOR_TYPE_P (type1)
29551 && VECTOR_TYPE_P (type2)
29552 && !TYPE_INDIVISIBLE_P (type1)
29553 && !TYPE_INDIVISIBLE_P (type2)
29554 && (aarch64_sve::builtin_type_p (type1)
29555 != aarch64_sve::builtin_type_p (type2)))
29556 return N_("cannot combine GNU and SVE vectors in a binary operation");
29558 /* Reject all 2-operand operations on __mfp8. */
29559 if (TYPE_MAIN_VARIANT (type1) == aarch64_mfp8_type_node
29560 || TYPE_MAIN_VARIANT (type2) == aarch64_mfp8_type_node)
29561 return N_ ("operation not permitted on type %<mfloat8_t%>");
29563 /* Operation allowed. */
29564 return NULL;
29567 /* Implement TARGET_MEMTAG_CAN_TAG_ADDRESSES. Here we tell the rest of the
29568 compiler that we automatically ignore the top byte of our pointers, which
29569 allows using -fsanitize=hwaddress. */
29570 bool
29571 aarch64_can_tag_addresses ()
29573 return !TARGET_ILP32;
29576 /* Implement TARGET_ASM_FILE_END for AArch64. This adds the AArch64 GNU NOTE
29577 section at the end if needed. */
29578 #define GNU_PROPERTY_AARCH64_FEATURE_1_AND 0xc0000000
29579 #define GNU_PROPERTY_AARCH64_FEATURE_1_BTI (1U << 0)
29580 #define GNU_PROPERTY_AARCH64_FEATURE_1_PAC (1U << 1)
29581 #define GNU_PROPERTY_AARCH64_FEATURE_1_GCS (1U << 2)
29582 void
29583 aarch64_file_end_indicate_exec_stack ()
29585 file_end_indicate_exec_stack ();
29587 unsigned feature_1_and = 0;
29588 if (aarch_bti_enabled ())
29589 feature_1_and |= GNU_PROPERTY_AARCH64_FEATURE_1_BTI;
29591 if (aarch_ra_sign_scope != AARCH_FUNCTION_NONE)
29592 feature_1_and |= GNU_PROPERTY_AARCH64_FEATURE_1_PAC;
29594 if (aarch64_gcs_enabled ())
29595 feature_1_and |= GNU_PROPERTY_AARCH64_FEATURE_1_GCS;
29597 if (feature_1_and)
29599 /* Generate .note.gnu.property section. */
29600 switch_to_section (get_section (".note.gnu.property",
29601 SECTION_NOTYPE, NULL));
29603 /* PT_NOTE header: namesz, descsz, type.
29604 namesz = 4 ("GNU\0")
29605 descsz = 16 (Size of the program property array)
29606 [(12 + padding) * Number of array elements]
29607 type = 5 (NT_GNU_PROPERTY_TYPE_0). */
29608 assemble_align (POINTER_SIZE);
29609 assemble_integer (GEN_INT (4), 4, 32, 1);
29610 assemble_integer (GEN_INT (ROUND_UP (12, POINTER_BYTES)), 4, 32, 1);
29611 assemble_integer (GEN_INT (5), 4, 32, 1);
29613 /* PT_NOTE name. */
29614 assemble_string ("GNU", 4);
29616 /* PT_NOTE contents for NT_GNU_PROPERTY_TYPE_0:
29617 type = GNU_PROPERTY_AARCH64_FEATURE_1_AND
29618 datasz = 4
29619 data = feature_1_and. */
29620 assemble_integer (GEN_INT (GNU_PROPERTY_AARCH64_FEATURE_1_AND), 4, 32, 1);
29621 assemble_integer (GEN_INT (4), 4, 32, 1);
29622 assemble_integer (GEN_INT (feature_1_and), 4, 32, 1);
29624 /* Pad the size of the note to the required alignment. */
29625 assemble_align (POINTER_SIZE);
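/* With BTI and PAC-RET enabled on an LP64 target, the emitted section is
   roughly the following (directive spellings and constant formatting vary
   with the configured assembler):

       .section .note.gnu.property,"a"
       .align  3
       .word   4              // namesz: "GNU\0"
       .word   16             // descsz: ROUND_UP (12, 8)
       .word   5              // NT_GNU_PROPERTY_TYPE_0
       .string "GNU"
       .word   0xc0000000     // GNU_PROPERTY_AARCH64_FEATURE_1_AND
       .word   4              // datasz
       .word   3              // BTI | PAC
       .align  3              // pad to an 8-byte boundary

   which the linker combines into a single GNU property note. */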
29628 #undef GNU_PROPERTY_AARCH64_FEATURE_1_GCS
29629 #undef GNU_PROPERTY_AARCH64_FEATURE_1_PAC
29630 #undef GNU_PROPERTY_AARCH64_FEATURE_1_BTI
29631 #undef GNU_PROPERTY_AARCH64_FEATURE_1_AND
29633 /* Helper function for straight line speculation.
29634 Return what barrier should be emitted for straight line speculation
29635 mitigation.
29636 When not mitigating against straight line speculation this function returns
29637 an empty string.
29638 When mitigating against straight line speculation, use:
29639 * SB when the v8.5-A SB extension is enabled.
29640 * DSB+ISB otherwise. */
29641 const char *
29642 aarch64_sls_barrier (int mitigation_required)
29644 return mitigation_required
29645 ? (TARGET_SB ? "sb" : "dsb\tsy\n\tisb")
29646 : "";
29649 static GTY (()) tree aarch64_sls_shared_thunks[30];
29650 static GTY (()) bool aarch64_sls_shared_thunks_needed = false;
29651 const char *indirect_symbol_names[30] = {
29652 "__call_indirect_x0",
29653 "__call_indirect_x1",
29654 "__call_indirect_x2",
29655 "__call_indirect_x3",
29656 "__call_indirect_x4",
29657 "__call_indirect_x5",
29658 "__call_indirect_x6",
29659 "__call_indirect_x7",
29660 "__call_indirect_x8",
29661 "__call_indirect_x9",
29662 "__call_indirect_x10",
29663 "__call_indirect_x11",
29664 "__call_indirect_x12",
29665 "__call_indirect_x13",
29666 "__call_indirect_x14",
29667 "__call_indirect_x15",
29668 "", /* "__call_indirect_x16", */
29669 "", /* "__call_indirect_x17", */
29670 "__call_indirect_x18",
29671 "__call_indirect_x19",
29672 "__call_indirect_x20",
29673 "__call_indirect_x21",
29674 "__call_indirect_x22",
29675 "__call_indirect_x23",
29676 "__call_indirect_x24",
29677 "__call_indirect_x25",
29678 "__call_indirect_x26",
29679 "__call_indirect_x27",
29680 "__call_indirect_x28",
29681 "__call_indirect_x29",
29684 /* Function to create a BLR thunk. This thunk is used to mitigate straight
29685 line speculation. Instead of a simple BLR that can be speculated past,
29686 we emit a BL to this thunk, and this thunk contains a BR to the relevant
29687 register. These thunks have the relevant speculation barriers put after
29688 their indirect branch so that speculation is blocked.
29690 We use such a thunk so the speculation barriers are kept off the
29691 architecturally executed path in order to reduce the performance overhead.
29693 When optimizing for size we use stubs shared by the linked object.
29694 When optimizing for performance we emit stubs for each function in the hope
29695 that the branch predictor can better train on jumps specific for a given
29696 function. */
29697 rtx
29698 aarch64_sls_create_blr_label (int regnum)
29700 gcc_assert (STUB_REGNUM_P (regnum));
29701 if (optimize_function_for_size_p (cfun))
29703 /* For the thunks shared between different functions in this compilation
29704 unit we use a named symbol -- this is just for users to more easily
29705 understand the generated assembly. */
29706 aarch64_sls_shared_thunks_needed = true;
29707 const char *thunk_name = indirect_symbol_names[regnum];
29708 if (aarch64_sls_shared_thunks[regnum] == NULL)
29710 /* Build a decl representing this function stub and record it for
29711 later. We build a decl here so we can use the GCC machinery for
29712 handling sections automatically (through `get_named_section` and
29713 `make_decl_one_only`). That saves us a lot of trouble handling
29714 the specifics of different output file formats. */
29715 tree decl = build_decl (BUILTINS_LOCATION, FUNCTION_DECL,
29716 get_identifier (thunk_name),
29717 build_function_type_list (void_type_node,
29718 NULL_TREE));
29719 DECL_RESULT (decl) = build_decl (BUILTINS_LOCATION, RESULT_DECL,
29720 NULL_TREE, void_type_node);
29721 TREE_PUBLIC (decl) = 1;
29722 TREE_STATIC (decl) = 1;
29723 DECL_IGNORED_P (decl) = 1;
29724 DECL_ARTIFICIAL (decl) = 1;
29725 make_decl_one_only (decl, DECL_ASSEMBLER_NAME (decl));
29726 resolve_unique_section (decl, 0, false);
29727 aarch64_sls_shared_thunks[regnum] = decl;
29730 return gen_rtx_SYMBOL_REF (Pmode, thunk_name);
29733 if (cfun->machine->call_via[regnum] == NULL)
29734 cfun->machine->call_via[regnum]
29735 = gen_rtx_LABEL_REF (Pmode, gen_label_rtx ());
29736 return cfun->machine->call_via[regnum];
29739 /* Helper function for aarch64_sls_emit_blr_function_thunks and
29740 aarch64_sls_emit_shared_blr_thunks below. */
29741 static void
29742 aarch64_sls_emit_function_stub (FILE *out_file, int regnum)
29744 /* Save in x16 and branch to that function so this transformation does
29745 not prevent jumping to `BTI c` instructions. */
29746 asm_fprintf (out_file, "\tmov\tx16, x%d\n", regnum);
29747 asm_fprintf (out_file, "\tbr\tx16\n");
29750 /* Emit all BLR stubs for this particular function.
29751 Here we emit all the BLR stubs needed for the current function. Since we
29752 emit these stubs in a consecutive block we know there will be no speculation
29753 gadgets between each stub, and hence we only emit a speculation barrier at
29754 the end of the stub sequences.
29756 This is called in the TARGET_ASM_FUNCTION_EPILOGUE hook. */
29757 void
29758 aarch64_sls_emit_blr_function_thunks (FILE *out_file)
29760 if (! aarch64_harden_sls_blr_p ())
29761 return;
29763 bool any_functions_emitted = false;
29764 /* We must save and restore the current function section since this assembly
29765 is emitted at the end of the function. This means it can be emitted *just
29766 after* the cold section of a function. That cold part would be emitted in
29767 a different section. That switch would trigger a `.cfi_endproc` directive
29768 to be emitted in the original section and a `.cfi_startproc` directive to
29769 be emitted in the new section. Switching to the original section without
29770 restoring would mean that the `.cfi_endproc` emitted as a function ends
29771 would happen in a different section -- leaving an unmatched
29772 `.cfi_startproc` in the cold text section and an unmatched `.cfi_endproc`
29773 in the standard text section. */
29774 section *save_text_section = in_section;
29775 switch_to_section (function_section (current_function_decl));
29776 for (int regnum = 0; regnum < 30; ++regnum)
29778 rtx specu_label = cfun->machine->call_via[regnum];
29779 if (specu_label == NULL)
29780 continue;
29782 targetm.asm_out.print_operand (out_file, specu_label, 0);
29783 asm_fprintf (out_file, ":\n");
29784 aarch64_sls_emit_function_stub (out_file, regnum);
29785 any_functions_emitted = true;
29787 if (any_functions_emitted)
29788 /* Can use the SB if needs be here, since this stub will only be used
29789 by the current function, and hence for the current target. */
29790 asm_fprintf (out_file, "\t%s\n", aarch64_sls_barrier (true));
29791 switch_to_section (save_text_section);
29794 /* Emit shared BLR stubs for the current compilation unit.
29795 Over the course of compiling this unit we may have converted some BLR
29796 instructions to a BL to a shared stub function. This is where we emit those
29797 stub functions.
29798 This function is for the stubs shared between different functions in this
29799 compilation unit. We share when optimizing for size instead of speed.
29801 This function is called through the TARGET_ASM_FILE_END hook. */
29802 void
29803 aarch64_sls_emit_shared_blr_thunks (FILE *out_file)
29805 if (! aarch64_sls_shared_thunks_needed)
29806 return;
29808 for (int regnum = 0; regnum < 30; ++regnum)
29810 tree decl = aarch64_sls_shared_thunks[regnum];
29811 if (!decl)
29812 continue;
29814 const char *name = indirect_symbol_names[regnum];
29815 switch_to_section (get_named_section (decl, NULL, 0));
29816 ASM_OUTPUT_ALIGN (out_file, 2);
29817 targetm.asm_out.globalize_label (out_file, name);
29818 /* Only emits if the compiler is configured for an assembler that can
29819 handle visibility directives. */
29820 targetm.asm_out.assemble_visibility (decl, VISIBILITY_HIDDEN);
29821 ASM_OUTPUT_TYPE_DIRECTIVE (out_file, name, "function");
29822 ASM_OUTPUT_LABEL (out_file, name);
29823 aarch64_sls_emit_function_stub (out_file, regnum);
29824 /* Use the most conservative target to ensure it can always be used by any
29825 function in the translation unit. */
29826 asm_fprintf (out_file, "\tdsb\tsy\n\tisb\n");
29827 ASM_DECLARE_FUNCTION_SIZE (out_file, name, decl);
29831 /* Implement TARGET_ASM_FILE_END. */
29832 void
29833 aarch64_asm_file_end ()
29835 aarch64_sls_emit_shared_blr_thunks (asm_out_file);
29836 /* Since this function will be called for the ASM_FILE_END hook, we ensure
29837 that what would be called otherwise (e.g. `file_end_indicate_exec_stack`
29838 for FreeBSD) still gets called. */
29839 #ifdef TARGET_ASM_FILE_END
29840 TARGET_ASM_FILE_END ();
29841 #endif
29844 const char *
29845 aarch64_indirect_call_asm (rtx addr)
29847 gcc_assert (REG_P (addr));
29848 if (aarch64_harden_sls_blr_p ())
29850 rtx stub_label = aarch64_sls_create_blr_label (REGNO (addr));
29851 output_asm_insn ("bl\t%0", &stub_label);
29853 else
29854 output_asm_insn ("blr\t%0", &addr);
29855 return "";
29858 /* Emit the assembly instruction to load the thread pointer into DEST.
29859 Select between different tpidr_elN registers depending on -mtp= setting. */
29861 const char *
29862 aarch64_output_load_tp (rtx dest)
29864 const char *tpidrs[] = {"tpidr_el0", "tpidr_el1", "tpidr_el2",
29865 "tpidr_el3", "tpidrro_el0"};
29866 char buffer[64];
29867 snprintf (buffer, sizeof (buffer), "mrs\t%%0, %s",
29868 tpidrs[aarch64_tpidr_register]);
29869 output_asm_insn (buffer, &dest);
29870 return "";
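/* For example, with the default thread-pointer setting this emits
   "mrs x0, tpidr_el0" when DEST is x0; selecting the read-only register
   with -mtp= emits "mrs x0, tpidrro_el0" instead. */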
29873 /* Set up the value of REG_ALLOC_ORDER from scratch.
29875 It was previously good practice to put call-clobbered registers ahead
29876 of call-preserved registers, but that isn't necessary these days.
29877 IRA's model of register save/restore costs is much more sophisticated
29878 than the model that a simple ordering could provide. We leave
29879 HONOR_REG_ALLOC_ORDER undefined so that we can get the full benefit
29880 of IRA's model.
29882 However, it is still useful to list registers that are members of
29883 multiple classes after registers that are members of fewer classes.
29884 For example, we have:
29886 - FP_LO8_REGS: v0-v7
29887 - FP_LO_REGS: v0-v15
29888 - FP_REGS: v0-v31
29890 If, as a tie-breaker, we allocate FP_REGS in the order v0-v31,
29891 we run the risk of starving other (lower-priority) pseudos that
29892 require FP_LO8_REGS or FP_LO_REGS. Allocating FP_LO_REGS in the
29893 order v0-v15 could similarly starve pseudos that require FP_LO8_REGS.
29894 Allocating downwards rather than upwards avoids this problem, at least
29895 in code that has reasonable register pressure.
29897 The situation for predicate registers is similar. */
29899 void
29900 aarch64_adjust_reg_alloc_order ()
29902 for (int i = 0; i < FIRST_PSEUDO_REGISTER; ++i)
29903 if (IN_RANGE (i, V0_REGNUM, V31_REGNUM))
29904 reg_alloc_order[i] = V31_REGNUM - (i - V0_REGNUM);
29905 else if (IN_RANGE (i, P0_REGNUM, P15_REGNUM))
29906 reg_alloc_order[i] = P15_REGNUM - (i - P0_REGNUM);
29907 else
29908 reg_alloc_order[i] = i;
29911 /* Return true if the PARALLEL PAR can be used in a VEC_SELECT expression
29912 of vector mode MODE to select half the elements of that vector.
29913 Allow any combination of indices except duplicates (or out of range of
29914 the mode units). */
29916 bool
29917 aarch64_parallel_select_half_p (machine_mode mode, rtx par)
29919 int nunits = XVECLEN (par, 0);
29920 if (!known_eq (GET_MODE_NUNITS (mode), nunits * 2))
29921 return false;
29922 int mode_nunits = nunits * 2;
29923 /* Put all the elements of PAR into a hash_set and use its
29924 uniqueness guarantees to check that we don't try to insert the same
29925 element twice. */
29926 hash_set<rtx> parset;
29927 for (int i = 0; i < nunits; ++i)
29929 rtx elt = XVECEXP (par, 0, i);
29930 if (!CONST_INT_P (elt)
29931 || !IN_RANGE (INTVAL (elt), 0, mode_nunits - 1)
29932 || parset.add (elt))
29933 return false;
29935 return true;
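/* For instance, with MODE == V4SImode the PARALLEL must contain exactly two
   distinct CONST_INTs in [0, 3]: (0 2) and (1 3) are accepted, while (1 1)
   (duplicate) and (0 4) (out of range) are rejected. */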
29938 /* Return true if PAR1 and PAR2, two PARALLEL rtxes of CONST_INT values,
29939 contain any common elements. */
29941 bool
29942 aarch64_pars_overlap_p (rtx par1, rtx par2)
29944 int len1 = XVECLEN (par1, 0);
29945 int len2 = XVECLEN (par2, 0);
29946 hash_set<rtx> parset;
29947 for (int i = 0; i < len1; ++i)
29948 parset.add (XVECEXP (par1, 0, i));
29949 for (int i = 0; i < len2; ++i)
29950 if (parset.contains (XVECEXP (par2, 0, i)))
29951 return true;
29952 return false;
29955 /* Implement OPTIMIZE_MODE_SWITCHING. */
29957 bool
29958 aarch64_optimize_mode_switching (aarch64_mode_entity entity)
29960 bool have_sme_state = (aarch64_cfun_incoming_pstate_za () != 0
29961 || (aarch64_cfun_has_new_state ("za")
29962 && df_regs_ever_live_p (ZA_REGNUM))
29963 || (aarch64_cfun_has_new_state ("zt0")
29964 && df_regs_ever_live_p (ZT0_REGNUM)));
29966 if (have_sme_state && nonlocal_goto_handler_labels)
29968 static bool reported;
29969 if (!reported)
29971 sorry ("non-local gotos in functions with SME state");
29972 reported = true;
29976 switch (entity)
29978 case aarch64_mode_entity::HAVE_ZA_SAVE_BUFFER:
29979 case aarch64_mode_entity::LOCAL_SME_STATE:
29980 return have_sme_state && !nonlocal_goto_handler_labels;
29982 gcc_unreachable ();
29985 /* Implement TARGET_MODE_EMIT for ZA_SAVE_BUFFER. */
29987 static void
29988 aarch64_mode_emit_za_save_buffer (aarch64_tristate_mode mode,
29989 aarch64_tristate_mode prev_mode)
29991 if (mode == aarch64_tristate_mode::YES)
29993 gcc_assert (prev_mode == aarch64_tristate_mode::NO);
29994 aarch64_init_tpidr2_block ();
29996 else
29997 gcc_unreachable ();
30000 /* Implement TARGET_MODE_EMIT for LOCAL_SME_STATE. */
30002 static void
30003 aarch64_mode_emit_local_sme_state (aarch64_local_sme_state mode,
30004 aarch64_local_sme_state prev_mode)
30006 /* Back-propagation should ensure that we're always starting from
30007 a known mode. */
30008 gcc_assert (prev_mode != aarch64_local_sme_state::ANY);
30010 if (prev_mode == aarch64_local_sme_state::INACTIVE_CALLER)
30012 /* Commit any uncommitted lazy save. This leaves ZA either active
30013 and zero (lazy save case) or off (normal case).
30015 The sequence is:
30017 mrs <temp>, tpidr2_el0
30018 cbz <temp>, no_save
30019 bl __arm_tpidr2_save
30020 msr tpidr2_el0, xzr
30021 zero { za } // Only if ZA is live
30022 zero { zt0 } // Only if ZT0 is live
30023 no_save: */
30024 auto tmp_reg = gen_reg_rtx (DImode);
30025 emit_insn (gen_aarch64_read_tpidr2 (tmp_reg));
30026 auto label = gen_label_rtx ();
30027 rtx branch = aarch64_gen_compare_zero_and_branch (EQ, tmp_reg, label);
30028 auto jump = emit_jump_insn (branch);
30029 JUMP_LABEL (jump) = label;
30030 emit_insn (gen_aarch64_tpidr2_save ());
30031 emit_insn (gen_aarch64_clear_tpidr2 ());
30032 if (mode == aarch64_local_sme_state::ACTIVE_LIVE
30033 || mode == aarch64_local_sme_state::ACTIVE_DEAD)
30035 if (aarch64_cfun_has_state ("za"))
30036 emit_insn (gen_aarch64_initial_zero_za ());
30037 if (aarch64_cfun_has_state ("zt0"))
30038 emit_insn (gen_aarch64_sme_zero_zt0 ());
30040 emit_label (label);
30043 if (mode == aarch64_local_sme_state::ACTIVE_LIVE
30044 || mode == aarch64_local_sme_state::ACTIVE_DEAD)
30046 if (prev_mode == aarch64_local_sme_state::INACTIVE_LOCAL)
30048 /* Make ZA active after being inactive.
30050 First handle the case in which the lazy save we set up was
30051 committed by a callee. If the function's source-level ZA state
30052 is live then we must conditionally restore it from the lazy
30053 save buffer. Otherwise we can just force PSTATE.ZA to 1. */
30054 if (mode == aarch64_local_sme_state::ACTIVE_LIVE)
30055 emit_insn (gen_aarch64_restore_za (aarch64_get_tpidr2_ptr ()));
30056 else
30057 emit_insn (gen_aarch64_smstart_za ());
30059 /* Now handle the case in which the lazy save was not committed.
30060 In that case, ZA still contains the current function's ZA state,
30061 and we just need to cancel the lazy save. */
30062 emit_insn (gen_aarch64_clear_tpidr2 ());
30064 /* Restore the ZT0 state, if we have some. */
30065 if (aarch64_cfun_has_state ("zt0"))
30066 aarch64_restore_zt0 (true);
30068 return;
30071 if (prev_mode == aarch64_local_sme_state::SAVED_LOCAL)
30073 /* Retrieve the current function's ZA state from the lazy save
30074 buffer. */
30075 aarch64_restore_za (aarch64_get_tpidr2_ptr ());
30077 /* Restore the ZT0 state, if we have some. */
30078 if (aarch64_cfun_has_state ("zt0"))
30079 aarch64_restore_zt0 (true);
30080 return;
30083 if (prev_mode == aarch64_local_sme_state::INACTIVE_CALLER
30084 || prev_mode == aarch64_local_sme_state::OFF)
30086 /* INACTIVE_CALLER means that we are enabling ZA for the first
30087 time in this function. The code above means that ZA is either
30088 active and zero (if we committed a lazy save) or off. Handle
30089 the latter case by forcing ZA on.
30091 OFF means that PSTATE.ZA is guaranteed to be 0. We just need
30092 to force it to 1.
30094 Both cases leave ZA zeroed. */
30095 emit_insn (gen_aarch64_smstart_za ());
30097 /* Restore the ZT0 state, if we have some. */
30098 if (prev_mode == aarch64_local_sme_state::OFF
30099 && aarch64_cfun_has_state ("zt0"))
30100 aarch64_restore_zt0 (true);
30101 return;
30104 if (prev_mode == aarch64_local_sme_state::ACTIVE_DEAD
30105 || prev_mode == aarch64_local_sme_state::ACTIVE_LIVE)
30106 /* A simple change in liveness, such as in a CFG structure where
30107 ZA is only conditionally defined. No code is needed. */
30108 return;
30110 gcc_unreachable ();
30113 if (mode == aarch64_local_sme_state::INACTIVE_LOCAL)
30115 if (prev_mode == aarch64_local_sme_state::ACTIVE_LIVE
30116 || prev_mode == aarch64_local_sme_state::ACTIVE_DEAD
30117 || prev_mode == aarch64_local_sme_state::INACTIVE_CALLER)
30119 /* Save the ZT0 state, if we have some. */
30120 if (aarch64_cfun_has_state ("zt0"))
30121 aarch64_save_zt0 ();
30123 /* A transition from ACTIVE_LIVE to INACTIVE_LOCAL is the usual
30124 case of setting up a lazy save buffer before a call.
30125 A transition from INACTIVE_CALLER is similar, except that
30126 the contents of ZA are known to be zero.
30128 A transition from ACTIVE_DEAD means that ZA is live at the
30129 point of the transition, but is dead on at least one incoming
30130 edge. (That is, ZA is only conditionally initialized.)
30131 For efficiency, we want to set up a lazy save even for
30132 dead contents, since forcing ZA off would make later code
30133 restore ZA from the lazy save buffer. */
30134 emit_insn (gen_aarch64_write_tpidr2 (aarch64_get_tpidr2_ptr ()));
30135 return;
30138 if (prev_mode == aarch64_local_sme_state::SAVED_LOCAL
30139 || prev_mode == aarch64_local_sme_state::OFF)
30140 /* We're simply discarding the information about which inactive
30141 state applies. */
30142 return;
30144 gcc_unreachable ();
30147 if (mode == aarch64_local_sme_state::INACTIVE_CALLER
30148 || mode == aarch64_local_sme_state::OFF)
30150 /* Save the ZT0 state, if we have some. */
30151 if ((prev_mode == aarch64_local_sme_state::ACTIVE_LIVE
30152 || prev_mode == aarch64_local_sme_state::ACTIVE_DEAD)
30153 && mode == aarch64_local_sme_state::OFF
30154 && aarch64_cfun_has_state ("zt0"))
30155 aarch64_save_zt0 ();
30157 /* The transition to INACTIVE_CALLER is used before returning from
30158 new("za") functions. Any state in ZA belongs to the current
30159 function rather than a caller, but that state is no longer
30160 needed. Clear any pending lazy save and turn ZA off.
30162 The transition to OFF is used before calling a private-ZA function.
30163 We committed any incoming lazy save above, so at this point any
30164 contents in ZA belong to the current function. */
30165 if (prev_mode == aarch64_local_sme_state::INACTIVE_LOCAL)
30166 emit_insn (gen_aarch64_clear_tpidr2 ());
30168 if (prev_mode != aarch64_local_sme_state::OFF
30169 && prev_mode != aarch64_local_sme_state::SAVED_LOCAL)
30170 emit_insn (gen_aarch64_smstop_za ());
30172 return;
30175 if (mode == aarch64_local_sme_state::SAVED_LOCAL)
30177 /* This is a transition to an exception handler. */
30178 gcc_assert (prev_mode == aarch64_local_sme_state::OFF
30179 || prev_mode == aarch64_local_sme_state::INACTIVE_LOCAL);
30180 return;
30183 gcc_unreachable ();
30186 /* Implement TARGET_MODE_EMIT. */
30188 static void
30189 aarch64_mode_emit (int entity, int mode, int prev_mode, HARD_REG_SET live)
30191 if (mode == prev_mode)
30192 return;
30194 start_sequence ();
30195 switch (aarch64_mode_entity (entity))
30197 case aarch64_mode_entity::HAVE_ZA_SAVE_BUFFER:
30198 aarch64_mode_emit_za_save_buffer (aarch64_tristate_mode (mode),
30199 aarch64_tristate_mode (prev_mode));
30200 break;
30202 case aarch64_mode_entity::LOCAL_SME_STATE:
30203 aarch64_mode_emit_local_sme_state (aarch64_local_sme_state (mode),
30204 aarch64_local_sme_state (prev_mode));
30205 break;
30207 rtx_insn *seq = get_insns ();
30208 end_sequence ();
30210 /* Get the set of clobbered registers that are currently live. */
30211 HARD_REG_SET clobbers = {};
30212 for (rtx_insn *insn = seq; insn; insn = NEXT_INSN (insn))
30214 if (!NONDEBUG_INSN_P (insn))
30215 continue;
30216 vec_rtx_properties properties;
30217 properties.add_insn (insn, false);
30218 for (rtx_obj_reference ref : properties.refs ())
30219 if (ref.is_write () && HARD_REGISTER_NUM_P (ref.regno))
30220 SET_HARD_REG_BIT (clobbers, ref.regno);
30222 clobbers &= live;
30224 /* Emit instructions to save clobbered registers to pseudos. Queue
30225 instructions to restore the registers afterwards.
30227 This should only be needed in rare situations. */
30228 auto_vec<rtx, 33> after;
30229 for (unsigned int regno = R0_REGNUM; regno < R30_REGNUM; ++regno)
30230 if (TEST_HARD_REG_BIT (clobbers, regno))
30232 rtx hard_reg = gen_rtx_REG (DImode, regno);
30233 rtx pseudo_reg = gen_reg_rtx (DImode);
30234 emit_move_insn (pseudo_reg, hard_reg);
30235 after.quick_push (gen_move_insn (hard_reg, pseudo_reg));
30237 if (TEST_HARD_REG_BIT (clobbers, CC_REGNUM))
30239 rtx pseudo_reg = gen_reg_rtx (DImode);
30240 emit_insn (gen_aarch64_save_nzcv (pseudo_reg));
30241 after.quick_push (gen_aarch64_restore_nzcv (pseudo_reg));
30244 /* Emit the transition instructions themselves. */
30245 emit_insn (seq);
30247 /* Restore the clobbered registers. */
30248 for (auto *insn : after)
30249 emit_insn (insn);
30252 /* Return true if INSN references the SME state represented by hard register
30253 REGNO. */
30255 static bool
30256 aarch64_insn_references_sme_state_p (rtx_insn *insn, unsigned int regno)
30258 df_ref ref;
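/* A definition only counts as a reference if it is not purely a forced
   clobber; a bare clobber of the register does not use the state.  */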
30259 FOR_EACH_INSN_DEF (ref, insn)
30260 if (!DF_REF_FLAGS_IS_SET (ref, DF_REF_MUST_CLOBBER)
30261 && DF_REF_REGNO (ref) == regno)
30262 return true;
30263 FOR_EACH_INSN_USE (ref, insn)
30264 if (DF_REF_REGNO (ref) == regno)
30265 return true;
30266 return false;
30269 /* Implement TARGET_MODE_NEEDED for LOCAL_SME_STATE. */
30271 static aarch64_local_sme_state
30272 aarch64_mode_needed_local_sme_state (rtx_insn *insn, HARD_REG_SET live)
30274 if (!CALL_P (insn)
30275 && find_reg_note (insn, REG_EH_REGION, NULL_RTX))
30277 static bool reported;
30278 if (!reported)
30280 sorry ("catching non-call exceptions in functions with SME state");
30281 reported = true;
30283 /* Aim for graceful error recovery by picking the value that is
30284 least likely to generate an ICE. */
30285 return aarch64_local_sme_state::INACTIVE_LOCAL;
30288 /* A non-local goto is equivalent to a return. We disallow non-local
30289 receivers in functions with SME state, so we know that the target
30290 expects ZA to be dormant or off. */
30291 if (JUMP_P (insn)
30292 && find_reg_note (insn, REG_NON_LOCAL_GOTO, NULL_RTX))
30293 return aarch64_local_sme_state::INACTIVE_CALLER;
30295 /* start_private_za_call and end_private_za_call bracket a sequence
30296 that calls a private-ZA function. Force ZA to be turned off if the
30297 function doesn't have any live ZA state, otherwise require ZA to be
30298 inactive. */
30299 auto icode = recog_memoized (insn);
30300 if (icode == CODE_FOR_aarch64_start_private_za_call
30301 || icode == CODE_FOR_aarch64_end_private_za_call)
30302 return (TEST_HARD_REG_BIT (live, ZA_REGNUM)
30303 ? aarch64_local_sme_state::INACTIVE_LOCAL
30304 : aarch64_local_sme_state::OFF);
30306 /* Force ZA to contain the current function's ZA state if INSN wants
30307 to access it. Do the same for accesses to ZT0, since ZA and ZT0
30308 are both controlled by PSTATE.ZA. */
30309 if (aarch64_insn_references_sme_state_p (insn, ZA_REGNUM)
30310 || aarch64_insn_references_sme_state_p (insn, ZT0_REGNUM))
30311 return (TEST_HARD_REG_BIT (live, ZA_REGNUM)
30312 ? aarch64_local_sme_state::ACTIVE_LIVE
30313 : aarch64_local_sme_state::ACTIVE_DEAD);
30315 return aarch64_local_sme_state::ANY;
30318 /* Implement TARGET_MODE_NEEDED for ZA_SAVE_BUFFER. */
30320 static aarch64_tristate_mode
30321 aarch64_mode_needed_za_save_buffer (rtx_insn *insn, HARD_REG_SET live)
30323 /* We need to set up a lazy save buffer no later than the first
30324 transition to INACTIVE_LOCAL (which involves setting up a lazy save). */
30325 if (aarch64_mode_needed_local_sme_state (insn, live)
30326 == aarch64_local_sme_state::INACTIVE_LOCAL)
30327 return aarch64_tristate_mode::YES;
30329 /* Also make sure that the lazy save buffer is set up before the first
30330 insn that throws internally. The exception handler will sometimes
30331 load from it. */
30332 if (find_reg_note (insn, REG_EH_REGION, NULL_RTX))
30333 return aarch64_tristate_mode::YES;
30335 return aarch64_tristate_mode::MAYBE;
30338 /* Implement TARGET_MODE_NEEDED. */
30340 static int
30341 aarch64_mode_needed (int entity, rtx_insn *insn, HARD_REG_SET live)
30343 switch (aarch64_mode_entity (entity))
30345 case aarch64_mode_entity::HAVE_ZA_SAVE_BUFFER:
30346 return int (aarch64_mode_needed_za_save_buffer (insn, live));
30348 case aarch64_mode_entity::LOCAL_SME_STATE:
30349 return int (aarch64_mode_needed_local_sme_state (insn, live));
30351 gcc_unreachable ();
30354 /* Implement TARGET_MODE_AFTER for LOCAL_SME_STATE. */
30356 static aarch64_local_sme_state
30357 aarch64_mode_after_local_sme_state (aarch64_local_sme_state mode,
30358 HARD_REG_SET live)
30360 /* Note places where ZA dies, so that we can try to avoid saving and
30361 restoring state that isn't needed. */
30362 if (mode == aarch64_local_sme_state::ACTIVE_LIVE
30363 && !TEST_HARD_REG_BIT (live, ZA_REGNUM))
30364 return aarch64_local_sme_state::ACTIVE_DEAD;
30366 /* Note where ZA is born, e.g. when moving past an __arm_out("za")
30367 function. */
30368 if (mode == aarch64_local_sme_state::ACTIVE_DEAD
30369 && TEST_HARD_REG_BIT (live, ZA_REGNUM))
30370 return aarch64_local_sme_state::ACTIVE_LIVE;
30372 return mode;
30375 /* Implement TARGET_MODE_AFTER. */
30377 static int
30378 aarch64_mode_after (int entity, int mode, rtx_insn *, HARD_REG_SET live)
30380 switch (aarch64_mode_entity (entity))
30382 case aarch64_mode_entity::HAVE_ZA_SAVE_BUFFER:
30383 return mode;
30385 case aarch64_mode_entity::LOCAL_SME_STATE:
30386 return int (aarch64_mode_after_local_sme_state
30387 (aarch64_local_sme_state (mode), live));
30389 gcc_unreachable ();
30392 /* Implement TARGET_MODE_CONFLUENCE for LOCAL_SME_STATE. */
30394 static aarch64_local_sme_state
30395 aarch64_local_sme_confluence (aarch64_local_sme_state mode1,
30396 aarch64_local_sme_state mode2)
30398 /* Perform a symmetrical check for two values. */
30399 auto is_pair = [&](aarch64_local_sme_state val1,
30400 aarch64_local_sme_state val2)
30402 return ((mode1 == val1 && mode2 == val2)
30403 || (mode1 == val2 && mode2 == val1));
30406 /* INACTIVE_CALLER means ZA is off or it has dormant contents belonging
30407 to a caller. OFF is one of the options. */
30408 if (is_pair (aarch64_local_sme_state::INACTIVE_CALLER,
30409 aarch64_local_sme_state::OFF))
30410 return aarch64_local_sme_state::INACTIVE_CALLER;
30412 /* Similarly for dormant contents belonging to the current function. */
30413 if (is_pair (aarch64_local_sme_state::INACTIVE_LOCAL,
30414 aarch64_local_sme_state::OFF))
30415 return aarch64_local_sme_state::INACTIVE_LOCAL;
30417 /* Treat a conditionally-initialized value as a fully-initialized value. */
30418 if (is_pair (aarch64_local_sme_state::ACTIVE_LIVE,
30419 aarch64_local_sme_state::ACTIVE_DEAD))
30420 return aarch64_local_sme_state::ACTIVE_LIVE;
30422 return aarch64_local_sme_state::ANY;
30425 /* Implement TARGET_MODE_CONFLUENCE. */
30427 static int
30428 aarch64_mode_confluence (int entity, int mode1, int mode2)
30430 gcc_assert (mode1 != mode2);
30431 switch (aarch64_mode_entity (entity))
30433 case aarch64_mode_entity::HAVE_ZA_SAVE_BUFFER:
30434 return int (aarch64_tristate_mode::MAYBE);
30436 case aarch64_mode_entity::LOCAL_SME_STATE:
30437 return int (aarch64_local_sme_confluence
30438 (aarch64_local_sme_state (mode1),
30439 aarch64_local_sme_state (mode2)));
30441 gcc_unreachable ();
30444 /* Implement TARGET_MODE_BACKPROP for an entity that either stays
30445 NO throughout, or makes one transition from NO to YES. */
30447 static aarch64_tristate_mode
30448 aarch64_one_shot_backprop (aarch64_tristate_mode mode1,
30449 aarch64_tristate_mode mode2)
30451 /* Keep bringing the transition forward until it starts from NO. */
30452 if (mode1 == aarch64_tristate_mode::MAYBE
30453 && mode2 == aarch64_tristate_mode::YES)
30454 return mode2;
30456 return aarch64_tristate_mode::MAYBE;
30459 /* Implement TARGET_MODE_BACKPROP for LOCAL_SME_STATE. */
30461 static aarch64_local_sme_state
30462 aarch64_local_sme_backprop (aarch64_local_sme_state mode1,
30463 aarch64_local_sme_state mode2)
30465 /* We always need to know what the current state is when transitioning
30466 to a new state. Force any location with indeterminate starting state
30467 to be active. */
30468 if (mode1 == aarch64_local_sme_state::ANY)
30469 switch (mode2)
30471 case aarch64_local_sme_state::INACTIVE_CALLER:
30472 case aarch64_local_sme_state::OFF:
30473 case aarch64_local_sme_state::ACTIVE_DEAD:
30474 /* The current function's ZA state is not live. */
30475 return aarch64_local_sme_state::ACTIVE_DEAD;
30477 case aarch64_local_sme_state::INACTIVE_LOCAL:
30478 case aarch64_local_sme_state::ACTIVE_LIVE:
30479 /* The current function's ZA state is live. */
30480 return aarch64_local_sme_state::ACTIVE_LIVE;
30482 case aarch64_local_sme_state::SAVED_LOCAL:
30483 /* This is a transition to an exception handler. Since we don't
30484 support non-call exceptions for SME functions, the source of
30485 the transition must be known. We'll assert later if that's
30486 not the case. */
30487 return aarch64_local_sme_state::ANY;
30489 case aarch64_local_sme_state::ANY:
30490 return aarch64_local_sme_state::ANY;
30493 return aarch64_local_sme_state::ANY;
30496 /* Implement TARGET_MODE_BACKPROP. */
30498 static int
30499 aarch64_mode_backprop (int entity, int mode1, int mode2)
30501 switch (aarch64_mode_entity (entity))
30503 case aarch64_mode_entity::HAVE_ZA_SAVE_BUFFER:
30504 return int (aarch64_one_shot_backprop (aarch64_tristate_mode (mode1),
30505 aarch64_tristate_mode (mode2)));
30507 case aarch64_mode_entity::LOCAL_SME_STATE:
30508 return int (aarch64_local_sme_backprop
30509 (aarch64_local_sme_state (mode1),
30510 aarch64_local_sme_state (mode2)));
30512 gcc_unreachable ();
30515 /* Implement TARGET_MODE_ENTRY. */
30517 static int
30518 aarch64_mode_entry (int entity)
30520 switch (aarch64_mode_entity (entity))
30522 case aarch64_mode_entity::HAVE_ZA_SAVE_BUFFER:
30523 return int (aarch64_tristate_mode::NO);
30525 case aarch64_mode_entity::LOCAL_SME_STATE:
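/* Shared "za" state is live on entry.  Otherwise, if PSTATE.ZA is
   nevertheless 1 on entry, ZA is active but contains nothing useful.
   Otherwise, any incoming contents are dormant or ZA is off.  */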
30526 return int (aarch64_cfun_shared_flags ("za") != 0
30527 ? aarch64_local_sme_state::ACTIVE_LIVE
30528 : aarch64_cfun_incoming_pstate_za () != 0
30529 ? aarch64_local_sme_state::ACTIVE_DEAD
30530 : aarch64_local_sme_state::INACTIVE_CALLER);
30532 gcc_unreachable ();
30535 /* Implement TARGET_MODE_EXIT. */
30537 static int
30538 aarch64_mode_exit (int entity)
30540 switch (aarch64_mode_entity (entity))
30542 case aarch64_mode_entity::HAVE_ZA_SAVE_BUFFER:
30543 return int (aarch64_tristate_mode::MAYBE);
30545 case aarch64_mode_entity::LOCAL_SME_STATE:
30546 return int (aarch64_cfun_shared_flags ("za") != 0
30547 ? aarch64_local_sme_state::ACTIVE_LIVE
30548 : aarch64_cfun_incoming_pstate_za () != 0
30549 ? aarch64_local_sme_state::ACTIVE_DEAD
30550 : aarch64_local_sme_state::INACTIVE_CALLER);
30552 gcc_unreachable ();
30555 /* Implement TARGET_MODE_EH_HANDLER. */
30557 static int
30558 aarch64_mode_eh_handler (int entity)
30560 switch (aarch64_mode_entity (entity))
30562 case aarch64_mode_entity::HAVE_ZA_SAVE_BUFFER:
30563 /* Require a lazy save buffer to be allocated before the first
30564 insn that can throw. */
30565 return int (aarch64_tristate_mode::YES);
30567 case aarch64_mode_entity::LOCAL_SME_STATE:
30568 return int (aarch64_local_sme_state::SAVED_LOCAL);
30570 gcc_unreachable ();
30573 /* Implement TARGET_MODE_PRIORITY. */
30575 static int
30576 aarch64_mode_priority (int, int n)
30578 return n;
30581 /* Implement TARGET_MD_ASM_ADJUST. */
30583 static rtx_insn *
30584 aarch64_md_asm_adjust (vec<rtx> &outputs, vec<rtx> &inputs,
30585 vec<machine_mode> &input_modes,
30586 vec<const char *> &constraints,
30587 vec<rtx> &uses, vec<rtx> &clobbers,
30588 HARD_REG_SET &clobbered_regs, location_t loc)
30590 rtx_insn *seq = arm_md_asm_adjust (outputs, inputs, input_modes, constraints,
30591 uses, clobbers, clobbered_regs, loc);
30593 /* "za" in the clobber list of a function with ZA state is defined to
30594 mean that the asm can read from and write to ZA. We can model the
30595 read using a USE, but unfortunately, it's not possible to model the
30596 write directly. Use a separate insn to model the effect.
30598 We must ensure that ZA is active on entry, which is enforced by using
30599 SME_STATE_REGNUM. The asm must ensure that ZA is active on return.
30601 The same thing applies to ZT0. */
30602 if (TARGET_ZA)
30603 for (unsigned int i = clobbers.length (); i-- > 0; )
30605 rtx x = clobbers[i];
30606 if (REG_P (x)
30607 && (REGNO (x) == ZA_REGNUM || REGNO (x) == ZT0_REGNUM))
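/* The incrementing ID presumably serves to keep the update
   instructions for different asms distinct from one another.  */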
30609 auto id = cfun->machine->next_asm_update_za_id++;
30611 start_sequence ();
30612 if (seq)
30613 emit_insn (seq);
30614 rtx id_rtx = gen_int_mode (id, SImode);
30615 emit_insn (REGNO (x) == ZA_REGNUM
30616 ? gen_aarch64_asm_update_za (id_rtx)
30617 : gen_aarch64_asm_update_zt0 (id_rtx));
30618 seq = get_insns ();
30619 end_sequence ();
30621 auto mode = REGNO (x) == ZA_REGNUM ? VNx16QImode : V8DImode;
30622 uses.safe_push (gen_rtx_REG (mode, REGNO (x)));
30623 uses.safe_push (gen_rtx_REG (DImode, SME_STATE_REGNUM));
30625 clobbers.ordered_remove (i);
30626 CLEAR_HARD_REG_BIT (clobbered_regs, REGNO (x));
30629 return seq;
30632 /* BB is the target of an exception or nonlocal goto edge, which means
30633 that PSTATE.SM is known to be 0 on entry. Put it into the state that
30634 the current function requires. */
30636 static bool
30637 aarch64_switch_pstate_sm_for_landing_pad (basic_block bb)
30639 if (TARGET_NON_STREAMING)
30640 return false;
30642 start_sequence ();
30643 rtx_insn *guard_label = nullptr;
30644 if (TARGET_STREAMING_COMPATIBLE)
30645 guard_label = aarch64_guard_switch_pstate_sm (IP0_REGNUM,
30646 AARCH64_ISA_MODE_SM_OFF);
30647 aarch64_sme_mode_switch_regs args_switch;
30648 args_switch.add_call_preserved_regs (df_get_live_in (bb));
30649 args_switch.emit_prologue ();
30650 aarch64_switch_pstate_sm (AARCH64_ISA_MODE_SM_OFF, AARCH64_ISA_MODE_SM_ON);
30651 args_switch.emit_epilogue ();
30652 if (guard_label)
30653 emit_label (guard_label);
30654 auto seq = get_insns ();
30655 end_sequence ();
30657 emit_insn_after (seq, bb_note (bb));
30658 return true;
30661 /* JUMP is a nonlocal goto. Its target requires PSTATE.SM to be 0 on entry,
30662 so arrange to make it so. */
30664 static bool
30665 aarch64_switch_pstate_sm_for_jump (rtx_insn *jump)
30667 if (TARGET_NON_STREAMING)
30668 return false;
30670 start_sequence ();
30671 rtx_insn *guard_label = nullptr;
30672 if (TARGET_STREAMING_COMPATIBLE)
30673 guard_label = aarch64_guard_switch_pstate_sm (IP0_REGNUM,
30674 AARCH64_ISA_MODE_SM_OFF);
30675 aarch64_switch_pstate_sm (AARCH64_ISA_MODE_SM_ON, AARCH64_ISA_MODE_SM_OFF);
30676 if (guard_label)
30677 emit_label (guard_label);
30678 auto seq = get_insns ();
30679 end_sequence ();
30681 emit_insn_before (seq, jump);
30682 return true;
30685 /* If CALL involves a change in PSTATE.SM, emit the instructions needed
30686 to switch to the new mode and the instructions needed to restore the
30687 original mode. Return true if something changed. */
30688 static bool
30689 aarch64_switch_pstate_sm_for_call (rtx_call_insn *call)
30691 /* Mode switches for sibling calls are handled via the epilogue. */
30692 if (SIBLING_CALL_P (call))
30693 return false;
30695 auto callee_isa_mode = aarch64_insn_callee_isa_mode (call);
30696 if (!aarch64_call_switches_pstate_sm (callee_isa_mode))
30697 return false;
30699 /* Switch mode before the call, preserving any argument registers
30700 across the switch. */
30701 start_sequence ();
30702 rtx_insn *args_guard_label = nullptr;
30703 if (TARGET_STREAMING_COMPATIBLE)
30704 args_guard_label = aarch64_guard_switch_pstate_sm (IP0_REGNUM,
30705 callee_isa_mode);
30706 aarch64_sme_mode_switch_regs args_switch;
30707 args_switch.add_call_args (call);
30708 args_switch.emit_prologue ();
30709 aarch64_switch_pstate_sm (AARCH64_ISA_MODE, callee_isa_mode);
30710 args_switch.emit_epilogue ();
30711 if (args_guard_label)
30712 emit_label (args_guard_label);
30713 auto args_seq = get_insns ();
30714 end_sequence ();
30715 emit_insn_before (args_seq, call);
30717 if (find_reg_note (call, REG_NORETURN, NULL_RTX))
30718 return true;
30720 /* Switch mode after the call, preserving any return registers across
30721 the switch. */
30722 start_sequence ();
30723 rtx_insn *return_guard_label = nullptr;
30724 if (TARGET_STREAMING_COMPATIBLE)
30725 return_guard_label = aarch64_guard_switch_pstate_sm (IP0_REGNUM,
30726 callee_isa_mode);
30727 aarch64_sme_mode_switch_regs return_switch;
30728 return_switch.add_call_result (call);
30729 return_switch.emit_prologue ();
30730 aarch64_switch_pstate_sm (callee_isa_mode, AARCH64_ISA_MODE);
30731 return_switch.emit_epilogue ();
30732 if (return_guard_label)
30733 emit_label (return_guard_label);
30734 auto result_seq = get_insns ();
30735 end_sequence ();
30736 emit_insn_after (result_seq, call);
30737 return true;
30740 namespace {
30742 const pass_data pass_data_switch_pstate_sm =
30744 RTL_PASS, // type
30745 "smstarts", // name
30746 OPTGROUP_NONE, // optinfo_flags
30747 TV_NONE, // tv_id
30748 0, // properties_required
30749 0, // properties_provided
30750 0, // properties_destroyed
30751 0, // todo_flags_start
30752 TODO_df_finish, // todo_flags_finish
30755 class pass_switch_pstate_sm : public rtl_opt_pass
30757 public:
30758 pass_switch_pstate_sm (gcc::context *ctxt)
30759 : rtl_opt_pass (pass_data_switch_pstate_sm, ctxt)
30762 // opt_pass methods:
30763 bool gate (function *) override final;
30764 unsigned int execute (function *) override final;
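/* Run the pass only if the function's own streaming mode is something
   other than plain non-streaming, or if it contains calls that switch
   PSTATE.SM.  */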
30767 bool
30768 pass_switch_pstate_sm::gate (function *fn)
30770 return (aarch64_fndecl_pstate_sm (fn->decl) != AARCH64_ISA_MODE_SM_OFF
30771 || cfun->machine->call_switches_pstate_sm);
30774 /* Emit any instructions needed to switch PSTATE.SM. */
30775 unsigned int
30776 pass_switch_pstate_sm::execute (function *fn)
30778 basic_block bb;
30780 auto_sbitmap blocks (last_basic_block_for_fn (cfun));
30781 bitmap_clear (blocks);
30782 FOR_EACH_BB_FN (bb, fn)
30784 if (has_abnormal_call_or_eh_pred_edge_p (bb)
30785 && aarch64_switch_pstate_sm_for_landing_pad (bb))
30786 bitmap_set_bit (blocks, bb->index);
30788 if (cfun->machine->call_switches_pstate_sm)
30790 rtx_insn *insn;
30791 FOR_BB_INSNS (bb, insn)
30792 if (auto *call = dyn_cast<rtx_call_insn *> (insn))
30793 if (aarch64_switch_pstate_sm_for_call (call))
30794 bitmap_set_bit (blocks, bb->index);
30797 auto end = BB_END (bb);
30798 if (JUMP_P (end)
30799 && find_reg_note (end, REG_NON_LOCAL_GOTO, NULL_RTX)
30800 && aarch64_switch_pstate_sm_for_jump (end))
30801 bitmap_set_bit (blocks, bb->index);
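/* The sequences emitted above can introduce new labels and control flow,
   so rebuild the basic blocks that received them.  */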
30803 find_many_sub_basic_blocks (blocks);
30804 clear_aux_for_blocks ();
30805 return 0;
30810 rtl_opt_pass *
30811 make_pass_switch_pstate_sm (gcc::context *ctxt)
30813 return new pass_switch_pstate_sm (ctxt);
30816 /* Parse an implementation-defined system register name of
30817 the form S[0-3]_[0-7]_C[0-15]_C[0-15]_[0-7].
30818 Return true if the name matches the pattern above, false
30819 otherwise. */
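/* For example, "s3_0_c15_c2_0" matches this pattern, while
   "s3_0_c15_c16_0" does not, since each C-term must be at most 15.  */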
30820 bool
30821 aarch64_is_implem_def_reg (const char *regname)
30823 unsigned pos = 0;
30824 unsigned name_len = strlen (regname);
30825 if (name_len < 12 || name_len > 14)
30826 return false;
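/* Check one "c<n>" term: a literal 'c' followed by a decimal number with
   no redundant leading zero, terminated by '_', whose value is at most 15.  */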
30828 auto cterm_valid_p = [&]()
30830 bool leading_zero_p = false;
30831 unsigned i = 0;
30832 char n[3] = {0};
30834 if (regname[pos] != 'c')
30835 return false;
30836 pos++;
30837 while (regname[pos] != '_')
30839 if (leading_zero_p)
30840 return false;
30841 if (i == 0 && regname[pos] == '0')
30842 leading_zero_p = true;
30843 if (i > 2)
30844 return false;
30845 if (!ISDIGIT (regname[pos]))
30846 return false;
30847 n[i++] = regname[pos++];
30849 if (atoi (n) > 15)
30850 return false;
30851 return true;
30854 if (regname[pos] != 's')
30855 return false;
30856 pos++;
30857 if (regname[pos] < '0' || regname[pos] > '3')
30858 return false;
30859 pos++;
30860 if (regname[pos++] != '_')
30861 return false;
30862 if (regname[pos] < '0' || regname[pos] > '7')
30863 return false;
30864 pos++;
30865 if (regname[pos++] != '_')
30866 return false;
30867 if (!cterm_valid_p ())
30868 return false;
30869 if (regname[pos++] != '_')
30870 return false;
30871 if (!cterm_valid_p ())
30872 return false;
30873 if (regname[pos++] != '_')
30874 return false;
30875 if (regname[pos] < '0' || regname[pos] > '7')
30876 return false;
30877 return true;
30880 /* Return true if REGNAME matches either a known permitted system
30881 register name, or a generic sysreg specification. For use in
30882 back-end predicate `aarch64_sysreg_string'. */
30883 bool
30884 aarch64_valid_sysreg_name_p (const char *regname)
30886 const sysreg_t *sysreg = aarch64_lookup_sysreg_map (regname);
30887 if (sysreg == NULL)
30888 return aarch64_is_implem_def_reg (regname);
30889 if (sysreg->arch_reqs)
30890 return bool (aarch64_isa_flags & sysreg->arch_reqs);
30891 return true;
30894 /* Return the generic sysreg specification for a valid system register
30895 name, otherwise NULL. WRITE_P is true iff the register is being
30896 written to. IS128OP indicates the requested system register should
30897 be checked for a 128-bit implementation. */
30898 const char *
30899 aarch64_retrieve_sysreg (const char *regname, bool write_p, bool is128op)
30901 const sysreg_t *sysreg = aarch64_lookup_sysreg_map (regname);
30902 if (sysreg == NULL)
30904 if (aarch64_is_implem_def_reg (regname))
30905 return regname;
30906 else
30907 return NULL;
30909 if (is128op && !(sysreg->properties & F_REG_128))
30910 return NULL;
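/* Reject writes to read-only registers and reads of write-only registers.  */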
30911 if ((write_p && (sysreg->properties & F_REG_READ))
30912 || (!write_p && (sysreg->properties & F_REG_WRITE)))
30913 return NULL;
30914 if ((~aarch64_isa_flags & sysreg->arch_reqs) != 0)
30915 return NULL;
30916 return sysreg->encoding;
30919 /* Report that LOCATION has a call to FNDECL in which argument ARGNO
30920 was not an integer constant expression. ARGNO counts from zero. */
30921 void
30922 aarch64::report_non_ice (location_t location, tree fndecl, unsigned int argno)
30924 error_at (location, "argument %d of %qE must be an integer constant"
30925 " expression", argno + 1, fndecl);
30928 /* Report that LOCATION has a call to FNDECL in which argument ARGNO has
30929 the value ACTUAL, whereas the function requires a value in the range
30930 [MIN, MAX]. ARGNO counts from zero. */
30931 void
30932 aarch64::report_out_of_range (location_t location, tree fndecl,
30933 unsigned int argno, HOST_WIDE_INT actual,
30934 HOST_WIDE_INT min, HOST_WIDE_INT max)
30936 if (min == max)
30937 error_at (location, "passing %wd to argument %d of %qE, which expects"
30938 " the value %wd", actual, argno + 1, fndecl, min);
30939 else
30940 error_at (location, "passing %wd to argument %d of %qE, which expects"
30941 " a value in the range [%wd, %wd]", actual, argno + 1, fndecl,
30942 min, max);
30945 /* Report that LOCATION has a call to FNDECL in which argument ARGNO has
30946 the value ACTUAL, whereas the function requires either VALUE0 or
30947 VALUE1. ARGNO counts from zero. */
30948 void
30949 aarch64::report_neither_nor (location_t location, tree fndecl,
30950 unsigned int argno, HOST_WIDE_INT actual,
30951 HOST_WIDE_INT value0, HOST_WIDE_INT value1)
30953 error_at (location, "passing %wd to argument %d of %qE, which expects"
30954 " either %wd or %wd", actual, argno + 1, fndecl, value0, value1);
30957 /* Report that LOCATION has a call to FNDECL in which argument ARGNO has
30958 the value ACTUAL, whereas the function requires one of VALUE0..3.
30959 ARGNO counts from zero. */
30960 void
30961 aarch64::report_not_one_of (location_t location, tree fndecl,
30962 unsigned int argno, HOST_WIDE_INT actual,
30963 HOST_WIDE_INT value0, HOST_WIDE_INT value1,
30964 HOST_WIDE_INT value2,
30965 HOST_WIDE_INT value3)
30967 error_at (location, "passing %wd to argument %d of %qE, which expects"
30968 " %wd, %wd, %wd or %wd", actual, argno + 1, fndecl, value0, value1,
30969 value2, value3);
30972 /* Report that LOCATION has a call to FNDECL in which argument ARGNO has
30973 the value ACTUAL, whereas the function requires a valid value of
30974 enum type ENUMTYPE. ARGNO counts from zero. */
30975 void
30976 aarch64::report_not_enum (location_t location, tree fndecl, unsigned int argno,
30977 HOST_WIDE_INT actual, tree enumtype)
30979 error_at (location, "passing %wd to argument %d of %qE, which expects"
30980 " a valid %qT value", actual, argno + 1, fndecl, enumtype);
30983 /* Generate assembly to calculate CRC
30984 using the carry-less multiplication instruction.
30985 OPERANDS[1] is input CRC,
30986 OPERANDS[2] is data (message),
30987 OPERANDS[3] is the polynomial without the leading 1. */
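/* In outline: fold DATA into the (suitably shifted) CRC, carry-less
   multiply the result by the precomputed quotient, shift the product
   right by CRC_SIZE, carry-less multiply by the polynomial, and keep the
   low CRC_MODE bits, xoring in the untouched CRC bits when DATA is
   narrower than the CRC.  */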
30989 void
30990 aarch64_expand_crc_using_pmull (scalar_mode crc_mode,
30991 scalar_mode data_mode,
30992 rtx *operands)
30994 /* Check and keep arguments. */
30995 gcc_assert (!CONST_INT_P (operands[0]));
30996 gcc_assert (CONST_INT_P (operands[3]));
30997 rtx crc = operands[1];
30998 rtx data = operands[2];
30999 rtx polynomial = operands[3];
31001 unsigned HOST_WIDE_INT crc_size = GET_MODE_BITSIZE (crc_mode);
31002 unsigned HOST_WIDE_INT data_size = GET_MODE_BITSIZE (data_mode);
31003 gcc_assert (crc_size <= 32);
31004 gcc_assert (data_size <= crc_size);
31006 /* Calculate the quotient. */
31007 unsigned HOST_WIDE_INT
31008 q = gf2n_poly_long_div_quotient (UINTVAL (polynomial), crc_size);
31009 /* CRC calculation's main part. */
31010 if (crc_size > data_size)
31011 crc = expand_shift (RSHIFT_EXPR, DImode, crc, crc_size - data_size,
31012 NULL_RTX, 1);
31014 rtx t0 = force_reg (DImode, gen_int_mode (q, DImode));
31015 polynomial = simplify_gen_unary (ZERO_EXTEND, DImode, polynomial,
31016 GET_MODE (polynomial));
31017 rtx t1 = force_reg (DImode, polynomial);
31019 rtx a0 = expand_binop (DImode, xor_optab, crc, data, NULL_RTX, 1,
31020 OPTAB_WIDEN);
31022 rtx pmull_res = gen_reg_rtx (TImode);
31023 emit_insn (gen_aarch64_crypto_pmulldi (pmull_res, a0, t0));
31024 a0 = gen_lowpart (DImode, pmull_res);
31026 a0 = expand_shift (RSHIFT_EXPR, DImode, a0, crc_size, NULL_RTX, 1);
31028 emit_insn (gen_aarch64_crypto_pmulldi (pmull_res, a0, t1));
31029 a0 = gen_lowpart (DImode, pmull_res);
31031 if (crc_size > data_size)
31033 rtx crc_part = expand_shift (LSHIFT_EXPR, DImode, operands[1], data_size,
31034 NULL_RTX, 0);
31035 a0 = expand_binop (DImode, xor_optab, a0, crc_part, NULL_RTX, 1,
31036 OPTAB_DIRECT);
31039 aarch64_emit_move (operands[0], gen_lowpart (crc_mode, a0));
31042 /* Generate assembly to calculate reversed CRC
31043 using the carry-less multiplication instruction.
31044 OPERANDS[1] is input CRC,
31045 OPERANDS[2] is data,
31046 OPERANDS[3] is the polynomial without the leading 1. */
31048 void
31049 aarch64_expand_reversed_crc_using_pmull (scalar_mode crc_mode,
31050 scalar_mode data_mode,
31051 rtx *operands)
31053 /* Check and keep arguments. */
31054 gcc_assert (!CONST_INT_P (operands[0]));
31055 gcc_assert (CONST_INT_P (operands[3]));
31056 rtx crc = operands[1];
31057 rtx data = operands[2];
31058 rtx polynomial = operands[3];
31060 unsigned HOST_WIDE_INT crc_size = GET_MODE_BITSIZE (crc_mode);
31061 unsigned HOST_WIDE_INT data_size = GET_MODE_BITSIZE (data_mode);
31062 gcc_assert (crc_size <= 32);
31063 gcc_assert (data_size <= crc_size);
31065 /* Calculate the quotient. */
31066 unsigned HOST_WIDE_INT
31067 q = gf2n_poly_long_div_quotient (UINTVAL (polynomial), crc_size);
31068 /* Reflect the calculated quotient. */
31069 q = reflect_hwi (q, crc_size + 1);
31070 rtx t0 = force_reg (DImode, gen_int_mode (q, DImode));
31072 /* Reflect the polynomial. */
31073 unsigned HOST_WIDE_INT ref_polynomial = reflect_hwi (UINTVAL (polynomial),
31074 crc_size);
31075 /* An unshifted multiplier would require the final result to be extracted
31076 using a shift right by DATA_SIZE - 1 bits. Shift the multiplier left
31077 so that the shift right can be by CRC_SIZE bits instead. */
31078 ref_polynomial <<= crc_size - data_size + 1;
31079 rtx t1 = force_reg (DImode, gen_int_mode (ref_polynomial, DImode));
31081 /* CRC calculation's main part. */
31082 rtx a0 = expand_binop (DImode, xor_optab, crc, data, NULL_RTX, 1,
31083 OPTAB_WIDEN);
31085 /* Perform carry-less multiplication and get low part. */
31086 rtx pmull_res = gen_reg_rtx (TImode);
31087 emit_insn (gen_aarch64_crypto_pmulldi (pmull_res, a0, t0));
31088 a0 = gen_lowpart (DImode, pmull_res);
31090 a0 = expand_binop (DImode, and_optab, a0,
31091 gen_int_mode (GET_MODE_MASK (data_mode), DImode),
31092 NULL_RTX, 1, OPTAB_WIDEN);
31094 /* Perform carry-less multiplication. */
31095 emit_insn (gen_aarch64_crypto_pmulldi (pmull_res, a0, t1));
31097 /* Perform a shift right by CRC_SIZE as an extraction of lane 1. */
31098 machine_mode crc_vmode = aarch64_v128_mode (crc_mode).require ();
31099 a0 = (crc_size > data_size ? gen_reg_rtx (crc_mode) : operands[0]);
31100 emit_insn (gen_aarch64_get_lane (crc_vmode, a0,
31101 gen_lowpart (crc_vmode, pmull_res),
31102 aarch64_endian_lane_rtx (crc_vmode, 1)));
31104 if (crc_size > data_size)
31106 rtx crc_part = expand_shift (RSHIFT_EXPR, crc_mode, crc, data_size,
31107 NULL_RTX, 1);
31108 a0 = expand_binop (crc_mode, xor_optab, a0, crc_part, operands[0], 1,
31109 OPTAB_WIDEN);
31110 aarch64_emit_move (operands[0], a0);
31114 /* Target-specific selftests. */
31116 #if CHECKING_P
31118 namespace selftest {
31120 /* Selftest for the RTL loader.
31121 Verify that the RTL loader copes with a dump from
31122 print_rtx_function. This is essentially just a test that class
31123 function_reader can handle a real dump, but it also verifies
31124 that lookup_reg_by_dump_name correctly handles hard regs.
31125 The presence of hard reg names in the dump means that the test is
31126 target-specific, hence it is in this file. */
31128 static void
31129 aarch64_test_loading_full_dump ()
31131 rtl_dump_test t (SELFTEST_LOCATION, locate_file ("aarch64/times-two.rtl"));
31133 ASSERT_STREQ ("times_two", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));
31135 rtx_insn *insn_1 = get_insn_by_uid (1);
31136 ASSERT_EQ (NOTE, GET_CODE (insn_1));
31138 rtx_insn *insn_15 = get_insn_by_uid (15);
31139 ASSERT_EQ (INSN, GET_CODE (insn_15));
31140 ASSERT_EQ (USE, GET_CODE (PATTERN (insn_15)));
31142 /* Verify crtl->return_rtx. */
31143 ASSERT_EQ (REG, GET_CODE (crtl->return_rtx));
31144 ASSERT_EQ (0, REGNO (crtl->return_rtx));
31145 ASSERT_EQ (SImode, GET_MODE (crtl->return_rtx));
31148 /* Test the fractional_cost class. */
31150 static void
31151 aarch64_test_fractional_cost ()
31153 using cf = fractional_cost;
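/* cf (a, b) represents the fractional cost a/b; cf (a) is the integral
   cost a.  */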
31155 ASSERT_EQ (cf (0, 20), 0);
31157 ASSERT_EQ (cf (4, 2), 2);
31158 ASSERT_EQ (3, cf (9, 3));
31160 ASSERT_NE (cf (5, 2), 2);
31161 ASSERT_NE (3, cf (8, 3));
31163 ASSERT_EQ (cf (7, 11) + cf (15, 11), 2);
31164 ASSERT_EQ (cf (2, 3) + cf (3, 5), cf (19, 15));
31165 ASSERT_EQ (cf (2, 3) + cf (1, 6) + cf (1, 6), 1);
31167 ASSERT_EQ (cf (14, 15) - cf (4, 15), cf (2, 3));
31168 ASSERT_EQ (cf (1, 4) - cf (1, 2), 0);
31169 ASSERT_EQ (cf (3, 5) - cf (1, 10), cf (1, 2));
31170 ASSERT_EQ (cf (11, 3) - 3, cf (2, 3));
31171 ASSERT_EQ (3 - cf (7, 3), cf (2, 3));
31172 ASSERT_EQ (3 - cf (10, 3), 0);
31174 ASSERT_EQ (cf (2, 3) * 5, cf (10, 3));
31175 ASSERT_EQ (14 * cf (11, 21), cf (22, 3));
31177 ASSERT_TRUE (cf (4, 15) <= cf (5, 15));
31178 ASSERT_TRUE (cf (5, 15) <= cf (5, 15));
31179 ASSERT_FALSE (cf (6, 15) <= cf (5, 15));
31180 ASSERT_TRUE (cf (1, 3) <= cf (2, 5));
31181 ASSERT_TRUE (cf (1, 12) <= cf (1, 6));
31182 ASSERT_TRUE (cf (5, 3) <= cf (5, 3));
31183 ASSERT_TRUE (cf (239, 240) <= 1);
31184 ASSERT_TRUE (cf (240, 240) <= 1);
31185 ASSERT_FALSE (cf (241, 240) <= 1);
31186 ASSERT_FALSE (2 <= cf (207, 104));
31187 ASSERT_TRUE (2 <= cf (208, 104));
31188 ASSERT_TRUE (2 <= cf (209, 104));
31190 ASSERT_TRUE (cf (4, 15) < cf (5, 15));
31191 ASSERT_FALSE (cf (5, 15) < cf (5, 15));
31192 ASSERT_FALSE (cf (6, 15) < cf (5, 15));
31193 ASSERT_TRUE (cf (1, 3) < cf (2, 5));
31194 ASSERT_TRUE (cf (1, 12) < cf (1, 6));
31195 ASSERT_FALSE (cf (5, 3) < cf (5, 3));
31196 ASSERT_TRUE (cf (239, 240) < 1);
31197 ASSERT_FALSE (cf (240, 240) < 1);
31198 ASSERT_FALSE (cf (241, 240) < 1);
31199 ASSERT_FALSE (2 < cf (207, 104));
31200 ASSERT_FALSE (2 < cf (208, 104));
31201 ASSERT_TRUE (2 < cf (209, 104));
31203 ASSERT_FALSE (cf (4, 15) >= cf (5, 15));
31204 ASSERT_TRUE (cf (5, 15) >= cf (5, 15));
31205 ASSERT_TRUE (cf (6, 15) >= cf (5, 15));
31206 ASSERT_FALSE (cf (1, 3) >= cf (2, 5));
31207 ASSERT_FALSE (cf (1, 12) >= cf (1, 6));
31208 ASSERT_TRUE (cf (5, 3) >= cf (5, 3));
31209 ASSERT_FALSE (cf (239, 240) >= 1);
31210 ASSERT_TRUE (cf (240, 240) >= 1);
31211 ASSERT_TRUE (cf (241, 240) >= 1);
31212 ASSERT_TRUE (2 >= cf (207, 104));
31213 ASSERT_TRUE (2 >= cf (208, 104));
31214 ASSERT_FALSE (2 >= cf (209, 104));
31216 ASSERT_FALSE (cf (4, 15) > cf (5, 15));
31217 ASSERT_FALSE (cf (5, 15) > cf (5, 15));
31218 ASSERT_TRUE (cf (6, 15) > cf (5, 15));
31219 ASSERT_FALSE (cf (1, 3) > cf (2, 5));
31220 ASSERT_FALSE (cf (1, 12) > cf (1, 6));
31221 ASSERT_FALSE (cf (5, 3) > cf (5, 3));
31222 ASSERT_FALSE (cf (239, 240) > 1);
31223 ASSERT_FALSE (cf (240, 240) > 1);
31224 ASSERT_TRUE (cf (241, 240) > 1);
31225 ASSERT_TRUE (2 > cf (207, 104));
31226 ASSERT_FALSE (2 > cf (208, 104));
31227 ASSERT_FALSE (2 > cf (209, 104));
31229 ASSERT_EQ (cf (1, 2).ceil (), 1);
31230 ASSERT_EQ (cf (11, 7).ceil (), 2);
31231 ASSERT_EQ (cf (20, 1).ceil (), 20);
31232 ASSERT_EQ ((cf (0xfffffffd) + 1).ceil (), 0xfffffffe);
31233 ASSERT_EQ ((cf (0xfffffffd) + 2).ceil (), 0xffffffff);
31234 ASSERT_EQ ((cf (0xfffffffd) + 3).ceil (), 0xffffffff);
31235 ASSERT_EQ ((cf (0x7fffffff) * 2).ceil (), 0xfffffffe);
31236 ASSERT_EQ ((cf (0x80000000) * 2).ceil (), 0xffffffff);
31238 ASSERT_EQ (cf (1, 2).as_double (), 0.5);
31241 /* Check whether our system register data, as imported from
31242 `aarch64-sys-regs.def', has any duplicate entries. */
31243 static void
31244 aarch64_test_sysreg_encoding_clashes (void)
31246 using dup_instances_t = hash_map<nofree_string_hash,
31247 std::vector<const sysreg_t*>>;
31249 dup_instances_t duplicate_instances;
31251 /* Every time an encoding is established to come up more than once
31252 we add it to a "clash-analysis queue", which is then used to extract
31253 necessary information from our hash map when establishing whether
31254 repeated encodings are valid. */
31256 /* 1) Collect recurrence information. */
31257 for (unsigned i = 0; i < ARRAY_SIZE (aarch64_sysregs); i++)
31259 const sysreg_t *reg = aarch64_sysregs + i;
31261 std::vector<const sysreg_t*> *tmp
31262 = &duplicate_instances.get_or_insert (reg->encoding);
31264 tmp->push_back (reg);
31267 /* 2) Carry out analysis on collected data. */
31268 for (auto instance : duplicate_instances)
31270 unsigned nrep = instance.second.size ();
31271 if (nrep > 1)
31272 for (unsigned i = 0; i < nrep; i++)
31273 for (unsigned j = i + 1; j < nrep; j++)
31275 const sysreg_t *a = instance.second[i];
31276 const sysreg_t *b = instance.second[j];
31277 ASSERT_TRUE ((a->properties != b->properties)
31278 || (a->arch_reqs != b->arch_reqs));
31283 /* Run all target-specific selftests. */
31285 static void
31286 aarch64_run_selftests (void)
31288 aarch64_test_loading_full_dump ();
31289 aarch64_test_fractional_cost ();
31290 aarch64_test_sysreg_encoding_clashes ();
31293 } // namespace selftest
31295 #endif /* #if CHECKING_P */
31297 #undef TARGET_STACK_PROTECT_GUARD
31298 #define TARGET_STACK_PROTECT_GUARD aarch64_stack_protect_guard
31300 #undef TARGET_ADDRESS_COST
31301 #define TARGET_ADDRESS_COST aarch64_address_cost
31303 /* This hook determines whether unnamed bitfields affect the alignment
31304 of the containing structure. The hook returns true if the structure
31305 should inherit the alignment requirements of an unnamed bitfield's
31306 type. */
31307 #undef TARGET_ALIGN_ANON_BITFIELD
31308 #define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true
31310 #undef TARGET_ASM_ALIGNED_DI_OP
31311 #define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"
31313 #undef TARGET_ASM_ALIGNED_HI_OP
31314 #define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"
31316 #undef TARGET_ASM_ALIGNED_SI_OP
31317 #define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"
31319 #if TARGET_PECOFF
31320 #undef TARGET_ASM_UNALIGNED_HI_OP
31321 #define TARGET_ASM_UNALIGNED_HI_OP TARGET_ASM_ALIGNED_HI_OP
31322 #undef TARGET_ASM_UNALIGNED_SI_OP
31323 #define TARGET_ASM_UNALIGNED_SI_OP TARGET_ASM_ALIGNED_SI_OP
31324 #undef TARGET_ASM_UNALIGNED_DI_OP
31325 #define TARGET_ASM_UNALIGNED_DI_OP TARGET_ASM_ALIGNED_DI_OP
31326 #endif
31328 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
31329 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
31330 hook_bool_const_tree_hwi_hwi_const_tree_true
31332 #undef TARGET_ASM_FILE_START
31333 #define TARGET_ASM_FILE_START aarch64_start_file
31335 #undef TARGET_ASM_OUTPUT_MI_THUNK
31336 #define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk
31338 #undef TARGET_ASM_SELECT_RTX_SECTION
31339 #define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section
31341 #undef TARGET_ASM_TRAMPOLINE_TEMPLATE
31342 #define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template
31344 #undef TARGET_ASM_PRINT_PATCHABLE_FUNCTION_ENTRY
31345 #define TARGET_ASM_PRINT_PATCHABLE_FUNCTION_ENTRY aarch64_print_patchable_function_entry
31347 #undef TARGET_BUILD_BUILTIN_VA_LIST
31348 #define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list
31350 #undef TARGET_CALLEE_COPIES
31351 #define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_arg_info_false
31353 #undef TARGET_FRAME_POINTER_REQUIRED
31354 #define TARGET_FRAME_POINTER_REQUIRED aarch64_frame_pointer_required
31356 #undef TARGET_CAN_ELIMINATE
31357 #define TARGET_CAN_ELIMINATE aarch64_can_eliminate
31359 #undef TARGET_FUNCTION_ATTRIBUTE_INLINABLE_P
31360 #define TARGET_FUNCTION_ATTRIBUTE_INLINABLE_P \
31361 aarch64_function_attribute_inlinable_p
31363 #undef TARGET_NEED_IPA_FN_TARGET_INFO
31364 #define TARGET_NEED_IPA_FN_TARGET_INFO aarch64_need_ipa_fn_target_info
31366 #undef TARGET_UPDATE_IPA_FN_TARGET_INFO
31367 #define TARGET_UPDATE_IPA_FN_TARGET_INFO aarch64_update_ipa_fn_target_info
31369 #undef TARGET_CAN_INLINE_P
31370 #define TARGET_CAN_INLINE_P aarch64_can_inline_p
31372 #undef TARGET_CANNOT_FORCE_CONST_MEM
31373 #define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem
31375 #undef TARGET_CASE_VALUES_THRESHOLD
31376 #define TARGET_CASE_VALUES_THRESHOLD aarch64_case_values_threshold
31378 #undef TARGET_CONDITIONAL_REGISTER_USAGE
31379 #define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
31381 #undef TARGET_MEMBER_TYPE_FORCES_BLK
31382 #define TARGET_MEMBER_TYPE_FORCES_BLK aarch64_member_type_forces_blk
31384 /* Only the least significant bit is used for initialization guard
31385 variables. */
31386 #undef TARGET_CXX_GUARD_MASK_BIT
31387 #define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true
31389 #undef TARGET_C_MODE_FOR_SUFFIX
31390 #define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix
31392 #ifdef TARGET_BIG_ENDIAN_DEFAULT
31393 #undef TARGET_DEFAULT_TARGET_FLAGS
31394 #define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
31395 #endif
31397 #undef TARGET_CLASS_MAX_NREGS
31398 #define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs
31400 #undef TARGET_BUILTIN_DECL
31401 #define TARGET_BUILTIN_DECL aarch64_builtin_decl
31403 #undef TARGET_BUILTIN_RECIPROCAL
31404 #define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal
31406 #undef TARGET_C_EXCESS_PRECISION
31407 #define TARGET_C_EXCESS_PRECISION aarch64_excess_precision
31409 #undef TARGET_C_BITINT_TYPE_INFO
31410 #define TARGET_C_BITINT_TYPE_INFO aarch64_bitint_type_info
31412 #undef TARGET_C_MODE_FOR_FLOATING_TYPE
31413 #define TARGET_C_MODE_FOR_FLOATING_TYPE aarch64_c_mode_for_floating_type
31415 #undef TARGET_EXPAND_BUILTIN
31416 #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
31418 #undef TARGET_EXPAND_BUILTIN_VA_START
31419 #define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start
31421 #undef TARGET_FOLD_BUILTIN
31422 #define TARGET_FOLD_BUILTIN aarch64_fold_builtin
31424 #undef TARGET_FUNCTION_ARG
31425 #define TARGET_FUNCTION_ARG aarch64_function_arg
31427 #undef TARGET_FUNCTION_ARG_ADVANCE
31428 #define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance
31430 #undef TARGET_FUNCTION_ARG_BOUNDARY
31431 #define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary
31433 #undef TARGET_FUNCTION_ARG_PADDING
31434 #define TARGET_FUNCTION_ARG_PADDING aarch64_function_arg_padding
31436 #undef TARGET_GET_RAW_RESULT_MODE
31437 #define TARGET_GET_RAW_RESULT_MODE aarch64_get_reg_raw_mode
31438 #undef TARGET_GET_RAW_ARG_MODE
31439 #define TARGET_GET_RAW_ARG_MODE aarch64_get_reg_raw_mode
31441 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
31442 #define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall
31444 #undef TARGET_FUNCTION_VALUE
31445 #define TARGET_FUNCTION_VALUE aarch64_function_value
31447 #undef TARGET_FUNCTION_VALUE_REGNO_P
31448 #define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p
31450 #undef TARGET_START_CALL_ARGS
31451 #define TARGET_START_CALL_ARGS aarch64_start_call_args
31453 #undef TARGET_END_CALL_ARGS
31454 #define TARGET_END_CALL_ARGS aarch64_end_call_args
31456 #undef TARGET_GIMPLE_FOLD_BUILTIN
31457 #define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin
31459 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
31460 #define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr
31462 #undef TARGET_INIT_BUILTINS
31463 #define TARGET_INIT_BUILTINS aarch64_init_builtins
31465 #undef TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
31466 #define TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS \
31467 aarch64_ira_change_pseudo_allocno_class
31469 #undef TARGET_LEGITIMATE_ADDRESS_P
31470 #define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p
31472 #undef TARGET_LEGITIMATE_CONSTANT_P
31473 #define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p
31475 #undef TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT
31476 #define TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT \
31477 aarch64_legitimize_address_displacement
31479 #undef TARGET_LIBGCC_CMP_RETURN_MODE
31480 #define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode
31482 #undef TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P
31483 #define TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P \
31484 aarch64_libgcc_floating_mode_supported_p
31486 #undef TARGET_MANGLE_TYPE
31487 #define TARGET_MANGLE_TYPE aarch64_mangle_type
31489 #undef TARGET_INVALID_CONVERSION
31490 #define TARGET_INVALID_CONVERSION aarch64_invalid_conversion
31492 #undef TARGET_INVALID_UNARY_OP
31493 #define TARGET_INVALID_UNARY_OP aarch64_invalid_unary_op
31495 #undef TARGET_INVALID_BINARY_OP
31496 #define TARGET_INVALID_BINARY_OP aarch64_invalid_binary_op
31498 #undef TARGET_VERIFY_TYPE_CONTEXT
31499 #define TARGET_VERIFY_TYPE_CONTEXT aarch64_verify_type_context
31501 #undef TARGET_MEMORY_MOVE_COST
31502 #define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
31504 #undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
31505 #define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul
31507 #undef TARGET_MUST_PASS_IN_STACK
31508 #define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size
31510 /* This target hook should return true if accesses to volatile bitfields
31511 should use the narrowest mode possible. It should return false if these
31512 accesses should use the bitfield container type. */
31513 #undef TARGET_NARROW_VOLATILE_BITFIELD
31514 #define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false
31516 #undef TARGET_OPTION_OVERRIDE
31517 #define TARGET_OPTION_OVERRIDE aarch64_override_options
31519 #undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
31520 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
31521 aarch64_override_options_after_change
31523 #undef TARGET_OFFLOAD_OPTIONS
31524 #define TARGET_OFFLOAD_OPTIONS aarch64_offload_options
31526 #undef TARGET_OPTION_RESTORE
31527 #define TARGET_OPTION_RESTORE aarch64_option_restore
31529 #undef TARGET_OPTION_PRINT
31530 #define TARGET_OPTION_PRINT aarch64_option_print
31532 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
31533 #define TARGET_OPTION_VALID_ATTRIBUTE_P aarch64_option_valid_attribute_p
31535 #undef TARGET_OPTION_VALID_VERSION_ATTRIBUTE_P
31536 #define TARGET_OPTION_VALID_VERSION_ATTRIBUTE_P \
31537 aarch64_option_valid_version_attribute_p
31539 #undef TARGET_SET_CURRENT_FUNCTION
31540 #define TARGET_SET_CURRENT_FUNCTION aarch64_set_current_function
31542 #undef TARGET_PASS_BY_REFERENCE
31543 #define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference
31545 #undef TARGET_PREFERRED_RELOAD_CLASS
31546 #define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class
31548 #undef TARGET_SCHED_REASSOCIATION_WIDTH
31549 #define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width
31551 #undef TARGET_DWARF_FRAME_REG_MODE
31552 #define TARGET_DWARF_FRAME_REG_MODE aarch64_dwarf_frame_reg_mode
31554 #undef TARGET_OUTPUT_CFI_DIRECTIVE
31555 #define TARGET_OUTPUT_CFI_DIRECTIVE aarch64_output_cfi_directive
31557 #undef TARGET_DW_CFI_OPRND1_DESC
31558 #define TARGET_DW_CFI_OPRND1_DESC aarch64_dw_cfi_oprnd1_desc
31560 #undef TARGET_PROMOTED_TYPE
31561 #define TARGET_PROMOTED_TYPE aarch64_promoted_type
31563 #undef TARGET_SECONDARY_RELOAD
31564 #define TARGET_SECONDARY_RELOAD aarch64_secondary_reload
31566 #undef TARGET_SECONDARY_MEMORY_NEEDED
31567 #define TARGET_SECONDARY_MEMORY_NEEDED aarch64_secondary_memory_needed
31569 #undef TARGET_SHIFT_TRUNCATION_MASK
31570 #define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask
31572 #undef TARGET_SETUP_INCOMING_VARARGS
31573 #define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs
31575 #undef TARGET_STRUCT_VALUE_RTX
31576 #define TARGET_STRUCT_VALUE_RTX aarch64_struct_value_rtx
31578 #undef TARGET_REGISTER_MOVE_COST
31579 #define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost
31581 #undef TARGET_RETURN_IN_MEMORY
31582 #define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory
31584 #undef TARGET_RETURN_IN_MSB
31585 #define TARGET_RETURN_IN_MSB aarch64_return_in_msb
31587 #undef TARGET_RTX_COSTS
31588 #define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper
31590 #undef TARGET_INSN_COST
31591 #define TARGET_INSN_COST aarch64_insn_cost
31593 #undef TARGET_SCALAR_MODE_SUPPORTED_P
31594 #define TARGET_SCALAR_MODE_SUPPORTED_P aarch64_scalar_mode_supported_p
31596 #undef TARGET_SCHED_ISSUE_RATE
31597 #define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate
31599 #undef TARGET_SCHED_VARIABLE_ISSUE
31600 #define TARGET_SCHED_VARIABLE_ISSUE aarch64_sched_variable_issue
31602 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
31603 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
31604 aarch64_sched_first_cycle_multipass_dfa_lookahead
31606 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD
31607 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD \
31608 aarch64_first_cycle_multipass_dfa_lookahead_guard
31610 #undef TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS
31611 #define TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS \
31612 aarch64_get_separate_components
31614 #undef TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB
31615 #define TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB \
31616 aarch64_components_for_bb
31618 #undef TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS
31619 #define TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS \
31620 aarch64_disqualify_components
31622 #undef TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS
31623 #define TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS \
31624 aarch64_emit_prologue_components
31626 #undef TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS
31627 #define TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS \
31628 aarch64_emit_epilogue_components
31630 #undef TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS
31631 #define TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS \
31632 aarch64_set_handled_components
31634 #undef TARGET_TRAMPOLINE_INIT
31635 #define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init
31637 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
31638 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p
31640 #undef TARGET_VECTOR_MODE_SUPPORTED_P
31641 #define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p
31643 #undef TARGET_VECTOR_MODE_SUPPORTED_ANY_TARGET_P
31644 #define TARGET_VECTOR_MODE_SUPPORTED_ANY_TARGET_P aarch64_vector_mode_supported_any_target_p
31646 #undef TARGET_COMPATIBLE_VECTOR_TYPES_P
31647 #define TARGET_COMPATIBLE_VECTOR_TYPES_P aarch64_compatible_vector_types_p
31649 #undef TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT
31650 #define TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT \
31651 aarch64_builtin_support_vector_misalignment
31653 #undef TARGET_ARRAY_MODE
31654 #define TARGET_ARRAY_MODE aarch64_array_mode
31656 #undef TARGET_ARRAY_MODE_SUPPORTED_P
31657 #define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p
31659 #undef TARGET_VECTORIZE_CREATE_COSTS
31660 #define TARGET_VECTORIZE_CREATE_COSTS aarch64_vectorize_create_costs
31662 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
31663 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
31664 aarch64_builtin_vectorization_cost
31666 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
31667 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode
31669 #undef TARGET_VECTORIZE_BUILTINS
31670 #define TARGET_VECTORIZE_BUILTINS
31672 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_MODES
31673 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_MODES \
31674 aarch64_autovectorize_vector_modes
31676 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
31677 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
31678 aarch64_atomic_assign_expand_fenv
31680 /* Section anchor support. */
31682 #undef TARGET_MIN_ANCHOR_OFFSET
31683 #define TARGET_MIN_ANCHOR_OFFSET -256
31685 /* Limit the maximum anchor offset to 4k-1, since that's the limit for a
31686 byte offset; we can do much more for larger data types, but have no way
31687 to determine the size of the access. We assume accesses are aligned. */
31688 #undef TARGET_MAX_ANCHOR_OFFSET
31689 #define TARGET_MAX_ANCHOR_OFFSET 4095
31691 #undef TARGET_VECTORIZE_PREFERRED_DIV_AS_SHIFTS_OVER_MULT
31692 #define TARGET_VECTORIZE_PREFERRED_DIV_AS_SHIFTS_OVER_MULT \
31693 aarch64_vectorize_preferred_div_as_shifts_over_mult
31695 #undef TARGET_VECTOR_ALIGNMENT
31696 #define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment
31698 #undef TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT
31699 #define TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT \
31700 aarch64_vectorize_preferred_vector_alignment
31701 #undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
31702 #define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
31703 aarch64_simd_vector_alignment_reachable
31705 /* vec_perm support. */
31707 #undef TARGET_VECTORIZE_VEC_PERM_CONST
31708 #define TARGET_VECTORIZE_VEC_PERM_CONST \
31709 aarch64_vectorize_vec_perm_const
31711 #undef TARGET_VECTORIZE_RELATED_MODE
31712 #define TARGET_VECTORIZE_RELATED_MODE aarch64_vectorize_related_mode
31713 #undef TARGET_VECTORIZE_GET_MASK_MODE
31714 #define TARGET_VECTORIZE_GET_MASK_MODE aarch64_get_mask_mode
31715 #undef TARGET_VECTORIZE_CONDITIONAL_OPERATION_IS_EXPENSIVE
31716 #define TARGET_VECTORIZE_CONDITIONAL_OPERATION_IS_EXPENSIVE \
31717 aarch64_conditional_operation_is_expensive
31718 #undef TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE
31719 #define TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE \
31720 aarch64_empty_mask_is_expensive
31721 #undef TARGET_PREFERRED_ELSE_VALUE
31722 #define TARGET_PREFERRED_ELSE_VALUE \
31723 aarch64_preferred_else_value
31725 #undef TARGET_INIT_LIBFUNCS
31726 #define TARGET_INIT_LIBFUNCS aarch64_init_libfuncs
31728 #undef TARGET_FIXED_CONDITION_CODE_REGS
31729 #define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs
31731 #undef TARGET_FLAGS_REGNUM
31732 #define TARGET_FLAGS_REGNUM CC_REGNUM
31734 #undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
31735 #define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
31737 #undef TARGET_ASAN_SHADOW_OFFSET
31738 #define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset
31740 #undef TARGET_LEGITIMIZE_ADDRESS
31741 #define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address
31743 #undef TARGET_SCHED_CAN_SPECULATE_INSN
31744 #define TARGET_SCHED_CAN_SPECULATE_INSN aarch64_sched_can_speculate_insn
31746 #undef TARGET_CAN_USE_DOLOOP_P
31747 #define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost
31749 #undef TARGET_SCHED_ADJUST_PRIORITY
31750 #define TARGET_SCHED_ADJUST_PRIORITY aarch64_sched_adjust_priority
31752 #undef TARGET_SCHED_MACRO_FUSION_P
31753 #define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p
31755 #undef TARGET_SCHED_MACRO_FUSION_PAIR_P
31756 #define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p
31758 #undef TARGET_SCHED_FUSION_PRIORITY
31759 #define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority
31761 #undef TARGET_UNSPEC_MAY_TRAP_P
31762 #define TARGET_UNSPEC_MAY_TRAP_P aarch64_unspec_may_trap_p
31764 #undef TARGET_USE_PSEUDO_PIC_REG
31765 #define TARGET_USE_PSEUDO_PIC_REG aarch64_use_pseudo_pic_reg
31767 #undef TARGET_PRINT_OPERAND
31768 #define TARGET_PRINT_OPERAND aarch64_print_operand
31770 #undef TARGET_PRINT_OPERAND_ADDRESS
31771 #define TARGET_PRINT_OPERAND_ADDRESS aarch64_print_operand_address
31773 #undef TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA
31774 #define TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA aarch64_output_addr_const_extra
31776 #undef TARGET_OPTAB_SUPPORTED_P
31777 #define TARGET_OPTAB_SUPPORTED_P aarch64_optab_supported_p
31779 #undef TARGET_OMIT_STRUCT_RETURN_REG
31780 #define TARGET_OMIT_STRUCT_RETURN_REG true
31782 #undef TARGET_DWARF_POLY_INDETERMINATE_VALUE
31783 #define TARGET_DWARF_POLY_INDETERMINATE_VALUE \
31784 aarch64_dwarf_poly_indeterminate_value
31786 /* The architecture reserves bits 0 and 1 so use bit 2 for descriptors. */
31787 #undef TARGET_CUSTOM_FUNCTION_DESCRIPTORS
31788 #define TARGET_CUSTOM_FUNCTION_DESCRIPTORS 4
31790 #undef TARGET_HARD_REGNO_NREGS
31791 #define TARGET_HARD_REGNO_NREGS aarch64_hard_regno_nregs
31792 #undef TARGET_HARD_REGNO_MODE_OK
31793 #define TARGET_HARD_REGNO_MODE_OK aarch64_hard_regno_mode_ok
31795 #undef TARGET_MODES_TIEABLE_P
31796 #define TARGET_MODES_TIEABLE_P aarch64_modes_tieable_p
31798 #undef TARGET_HARD_REGNO_CALL_PART_CLOBBERED
31799 #define TARGET_HARD_REGNO_CALL_PART_CLOBBERED \
31800 aarch64_hard_regno_call_part_clobbered
31802 #undef TARGET_INSN_CALLEE_ABI
31803 #define TARGET_INSN_CALLEE_ABI aarch64_insn_callee_abi
31805 #undef TARGET_CONSTANT_ALIGNMENT
31806 #define TARGET_CONSTANT_ALIGNMENT aarch64_constant_alignment
31808 #undef TARGET_STACK_CLASH_PROTECTION_ALLOCA_PROBE_RANGE
31809 #define TARGET_STACK_CLASH_PROTECTION_ALLOCA_PROBE_RANGE \
31810 aarch64_stack_clash_protection_alloca_probe_range
31812 #undef TARGET_COMPUTE_PRESSURE_CLASSES
31813 #define TARGET_COMPUTE_PRESSURE_CLASSES aarch64_compute_pressure_classes
31815 #undef TARGET_CAN_CHANGE_MODE_CLASS
31816 #define TARGET_CAN_CHANGE_MODE_CLASS aarch64_can_change_mode_class
31818 #undef TARGET_SELECT_EARLY_REMAT_MODES
31819 #define TARGET_SELECT_EARLY_REMAT_MODES aarch64_select_early_remat_modes
31821 #undef TARGET_SPECULATION_SAFE_VALUE
31822 #define TARGET_SPECULATION_SAFE_VALUE aarch64_speculation_safe_value
31824 #undef TARGET_ESTIMATED_POLY_VALUE
31825 #define TARGET_ESTIMATED_POLY_VALUE aarch64_estimated_poly_value
31827 #undef TARGET_ATTRIBUTE_TABLE
31828 #define TARGET_ATTRIBUTE_TABLE aarch64_attribute_table
31830 #undef TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN
31831 #define TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN \
31832 aarch64_simd_clone_compute_vecsize_and_simdlen
31834 #undef TARGET_SIMD_CLONE_ADJUST
31835 #define TARGET_SIMD_CLONE_ADJUST aarch64_simd_clone_adjust
31837 #undef TARGET_SIMD_CLONE_USABLE
31838 #define TARGET_SIMD_CLONE_USABLE aarch64_simd_clone_usable
31840 #undef TARGET_COMP_TYPE_ATTRIBUTES
31841 #define TARGET_COMP_TYPE_ATTRIBUTES aarch64_comp_type_attributes
31843 #undef TARGET_MERGE_DECL_ATTRIBUTES
31844 #define TARGET_MERGE_DECL_ATTRIBUTES aarch64_merge_decl_attributes
31846 #undef TARGET_GET_MULTILIB_ABI_NAME
31847 #define TARGET_GET_MULTILIB_ABI_NAME aarch64_get_multilib_abi_name
31849 #undef TARGET_FNTYPE_ABI
31850 #define TARGET_FNTYPE_ABI aarch64_fntype_abi
31852 #undef TARGET_MEMTAG_CAN_TAG_ADDRESSES
31853 #define TARGET_MEMTAG_CAN_TAG_ADDRESSES aarch64_can_tag_addresses
31855 #if CHECKING_P
31856 #undef TARGET_RUN_TARGET_SELFTESTS
31857 #define TARGET_RUN_TARGET_SELFTESTS selftest::aarch64_run_selftests
31858 #endif /* #if CHECKING_P */
31860 #undef TARGET_ASM_POST_CFI_STARTPROC
31861 #define TARGET_ASM_POST_CFI_STARTPROC aarch64_post_cfi_startproc
31863 #undef TARGET_STRICT_ARGUMENT_NAMING
31864 #define TARGET_STRICT_ARGUMENT_NAMING hook_bool_CUMULATIVE_ARGS_true
31866 #undef TARGET_MODE_EMIT
31867 #define TARGET_MODE_EMIT aarch64_mode_emit
31869 #undef TARGET_MODE_NEEDED
31870 #define TARGET_MODE_NEEDED aarch64_mode_needed
31872 #undef TARGET_MODE_AFTER
31873 #define TARGET_MODE_AFTER aarch64_mode_after
31875 #undef TARGET_MODE_CONFLUENCE
31876 #define TARGET_MODE_CONFLUENCE aarch64_mode_confluence
31878 #undef TARGET_MODE_BACKPROP
31879 #define TARGET_MODE_BACKPROP aarch64_mode_backprop
31881 #undef TARGET_MODE_ENTRY
31882 #define TARGET_MODE_ENTRY aarch64_mode_entry
31884 #undef TARGET_MODE_EXIT
31885 #define TARGET_MODE_EXIT aarch64_mode_exit
31887 #undef TARGET_MODE_EH_HANDLER
31888 #define TARGET_MODE_EH_HANDLER aarch64_mode_eh_handler
31890 #undef TARGET_MODE_PRIORITY
31891 #define TARGET_MODE_PRIORITY aarch64_mode_priority
31893 #undef TARGET_MD_ASM_ADJUST
31894 #define TARGET_MD_ASM_ADJUST aarch64_md_asm_adjust
31896 #undef TARGET_ASM_FILE_END
31897 #define TARGET_ASM_FILE_END aarch64_asm_file_end
31899 #undef TARGET_ASM_FUNCTION_EPILOGUE
31900 #define TARGET_ASM_FUNCTION_EPILOGUE aarch64_sls_emit_blr_function_thunks
31902 #undef TARGET_HAVE_SHADOW_CALL_STACK
31903 #define TARGET_HAVE_SHADOW_CALL_STACK true
31905 #undef TARGET_CONST_ANCHOR
31906 #define TARGET_CONST_ANCHOR 0x1000000
31908 #undef TARGET_EXTRA_LIVE_ON_ENTRY
31909 #define TARGET_EXTRA_LIVE_ON_ENTRY aarch64_extra_live_on_entry
31911 #undef TARGET_USE_LATE_PROLOGUE_EPILOGUE
31912 #define TARGET_USE_LATE_PROLOGUE_EPILOGUE aarch64_use_late_prologue_epilogue
31914 #undef TARGET_EMIT_EPILOGUE_FOR_SIBCALL
31915 #define TARGET_EMIT_EPILOGUE_FOR_SIBCALL aarch64_expand_epilogue
31917 #undef TARGET_OPTION_FUNCTION_VERSIONS
31918 #define TARGET_OPTION_FUNCTION_VERSIONS aarch64_common_function_versions
31920 #undef TARGET_COMPARE_VERSION_PRIORITY
31921 #define TARGET_COMPARE_VERSION_PRIORITY aarch64_compare_version_priority
31923 #undef TARGET_GENERATE_VERSION_DISPATCHER_BODY
31924 #define TARGET_GENERATE_VERSION_DISPATCHER_BODY \
31925 aarch64_generate_version_dispatcher_body
31927 #undef TARGET_GET_FUNCTION_VERSIONS_DISPATCHER
31928 #define TARGET_GET_FUNCTION_VERSIONS_DISPATCHER \
31929 aarch64_get_function_versions_dispatcher
31931 #undef TARGET_MANGLE_DECL_ASSEMBLER_NAME
31932 #define TARGET_MANGLE_DECL_ASSEMBLER_NAME aarch64_mangle_decl_assembler_name
31934 #undef TARGET_DOCUMENTATION_NAME
31935 #define TARGET_DOCUMENTATION_NAME "AArch64"
31937 struct gcc_target targetm = TARGET_INITIALIZER;
31939 #include "gt-aarch64.h"